Commit baba9e5d  Author: 薛凌堃

Policy and regulation script maintenance

Parent f2ff6737
@@ -505,27 +505,36 @@ class BaseCore:
             for i in range(0, 3):
                 try:
                     response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                    file_size = int(response.headers.get('Content-Length'))
                     break
-                except:
+                except Exception as e:
                     time.sleep(3)
+                    if i ==2:
+                        return retData
                     continue
+            try:
+                if response.status_code == 200:
+                    file_size = int(response.headers.get('Content-Length'))
+                else:
+                    return retData
+            except:
+                file_size = ''
             for i in range(0, 3):
                 try:
                     name = str(self.getuuid()) + category
                     result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
                     break
                 except:
                     time.sleep(3)
                     continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
             retData['full_path'] = result['body']['objectUrl']
-            retData['file_size'] = self.convert_size(file_size)
+            try:
+                retData['file_size'] = self.convert_size(file_size)
+            except:
+                retData['file_size'] = ''
             retData['create_time'] = time_now
             return retData
         except Exception as e:
......
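The hunk above hardens the download step in BaseCore.uptoOBS: Content-Length is now read only after the response is checked, a failed third retry returns early, and the size conversion is wrapped so a missing header no longer aborts the upload. A minimal standalone sketch of the same pattern; the helper name fetch_with_size is an assumption for illustration and is not part of the repository:

import time
import requests

def fetch_with_size(url, headers, retries=3):
    # Illustrative sketch only: retry the download, then read Content-Length defensively.
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
            if i == retries - 1:
                return None, ''  # give up after the last retry, mirroring `return retData`
    try:
        if response.status_code == 200:
            file_size = int(response.headers.get('Content-Length'))
        else:
            return None, ''
    except (TypeError, ValueError):
        file_size = ''  # header missing or not numeric; the commit stores '' in that case
    return response, file_size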
@@ -34,8 +34,8 @@ def get_content3():
         doc_href = soup.find('div', class_='zsy_content')
         try:
             org_content = doc_href.select('.zsy_cotitle')[0]
-            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-        except:
+            org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
+        except Exception as e:
             org = ''
         try:
             contentWithTag = doc_href.find('div', class_='zsy_comain')
@@ -103,7 +103,7 @@ def get_content3():
             'id': '',  #
             'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
             # 关联标签id 关联标签名称 关联标签标识
-            'origin': '',  # 政策发布机关
+            'origin': org,  # 政策发布机关
             'organ': org,  # 政策发文机关
             'topicClassification': '',  # 政策文件分类
             'issuedNumber': pub_hao,  # 发文字号
@@ -168,10 +168,10 @@ def get_content3():
             href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
             # 判断是否已经爬取过
             is_href = baseTool.db_storage.find_one({'网址': href})
-            if is_href:
-                num += 1
-                log.info('已采集----------跳过')
-                continue
+            # if is_href:
+            #     num += 1
+            #     log.info('已采集----------跳过')
+            #     continue
             title = doc_item('a').attr('title')
             pub_time = doc_item('span').text().replace('[', '').replace(']', '')
         except:
@@ -184,9 +184,9 @@ def get_content3():
     end_time = time.time()
     log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
-# partOne()
+partOne()
 # 增量执行需要注释掉partTwo()
-partTwo()
+# partTwo()
 if __name__ == "__main__":
......
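The first hunk above fixes the source-organ extraction in get_content3: re.findall expects a string, but .select('.zsy_cotitle')[0] returns a BeautifulSoup Tag, so the old call raised TypeError and org always fell back to ''. A small self-contained illustration of the difference; the HTML snippet is invented for the example:

import re
from bs4 import BeautifulSoup

html = '<div class="zsy_cotitle"><p>文章来源:国务院国资委  发布时间:2023-09-01</p></div>'
org_content = BeautifulSoup(html, 'html.parser').select('.zsy_cotitle')[0]

# Old call: passing the Tag object raises "TypeError: expected string or bytes-like object".
# re.findall('文章来源:(.*?)发布时间:', org_content)

# New call: stringify the tag first, then extract the issuing organ.
org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
print(org)  # 国务院国资委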
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 吉林
def ji_lin():
start = time.time()
num = 0
count = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find(class_='list ej_list')
li_list = result.find_all('li')
for a in li_list:
id_list = []
a_text = str(a)
href = a.find('a')['href'] # 网站链接
if re.findall('http', href):
real_href = href
else:
real_href = url + a_text.split('href=".')[-1].split('" target="_blank')[0]
title = a.find('a').text.replace('\n', '')
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
i_html = href_text.text.encode("ISO-8859-1")
i_html = i_html.decode("utf-8")
i_soup = BeautifulSoup(i_html, 'html.parser')
# print(i_soup)
# 相对路径转化为绝对路径
soup = baseTool.paserUrl(i_soup, real_href)
soup.prettify()
try:
i_come = i_soup.find('span', class_='source')
i_time = i_soup.find('span', class_='time')
pub_come = i_come.text.split('.write(" ')[1].split('");')[0].strip()
pub_time = i_time.text.split('时间:')[1].strip()
except:
i_come = i_soup.find('div', class_='zsy_cotitle')
i_time = i_soup.find('div', class_='zsy_cotitle')
if (i_come):
# pub_come = i_come.find('p')
try:
pub_come = i_come.find('p').text.split('信息来源 > ')[1].split('发布时间:')[0].strip()
except:
pub_come = i_come.find('p').text.split('文章来源')[1].split('发布时间:')[0].strip()
# print(pub_time)
pub_time = i_time.find('p').text.split('发布时间:')[1].strip()
# print(pub_come)
else:
pub = i_soup.find(class_='share')
pub_time = pub.find(class_='left').find('span', class_='time').text
if '时间' in pub_time:
pub_time = pub_time.split('时间:')[1].strip()
pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
# print(pub_come)
i_content = soup.find(class_='zsy_comain')
if i_content:
# print(real_href)
# 去掉扫一扫
try:
soup.find('div', id='qr_container').decompose()
soup.find('div', id='div_div').decompose()
except:
i_content = soup
# 去掉style
# 去掉style标签
try:
for styleTag in soup.find_all('style'):
styleTag.extract()
except:
i_content = soup
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 发文字号
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
for j in find_hao:
if '号' in j.text:
pub_hao = j.text
else:
continue
fj = soup.find('div', style='width:920px; margin: 0 auto;')
if fj:
li_list = fj.find_all('li')
for li in li_list:
fu_jian_href = li.find('a')['href']
# 如果是附件
if '.pdf' in fu_jian_href or '.wps' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = li.find('a').text.strip()  # fu_jian_href is the URL string; take the display name from the anchor text
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num, pub_time)
id_list.append(att_id)
#
# # todo:将返回的地址更新到soup
li.find('a')['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
else:
continue
else:
i_content = soup.find(class_="content")
# 将文章中的附件字段删去
pattern = r'\d+\.'
# pattern = r"附件:\d+\.\s*(.*)"
for p in i_content.find_all('div')[-10:]:
p_text = p.text
matches = re.findall(pattern, p_text)
if matches:
for k in matches:
if k in p_text:
p.extract()
contentWithTag = i_content
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 找到附件上传至文件服务器
fj_soup = i_soup.find('div', class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
# for fu_jian_href in fj_list:
# fj_href = fu_jian_href['href']
# file_name = fu_jian_href.text.strip()
# # 如果是附件
# if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
# or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
# or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# # print(fj_href)
# category = os.path.splitext(fj_href)[1]
# if category not in file_name:
# file_name = file_name + category
# retData = baseCore.uptoOBS(fj_href, '1670', file_name)
# if retData['state']:
# pass
# else:
# continue
# att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num, pub_time)
# id_list.append(att_id)
# #
# # # todo:将返回的地址更新到soup
# fu_jian_href['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# else:
# continue
if '扫一扫在手机打开当前页' in content:
content = content.replace('扫一扫在手机打开当前页', '')  # str.replace returns a new string; assign it back
soup.find('div', id='div_div').decompose()
soup.find('div', id='qr_container').decompose()
else:
pass
log.info(title)
# print('............................................................')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1670", 'relationName': "吉林市国资委", 'labelMark': "policy"}],
'origin': pub_come,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
# 如果内容为空,则数据不传接口
if content == '' or content == 'None':
continue
else:
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except Exception as e:
log.info(e)
pass
except:
pass
end = time.time()
log.info(f'共{count}条...........共耗时 {end - start}秒')
if __name__ == "__main__":
ji_lin()
\ No newline at end of file
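Both the Jilin scraper above and the Tianjin scraper below repeat a long chain of '.pdf' in fu_jian_href or '.doc' in fu_jian_href ... checks before pushing a link through baseCore.uptoOBS. A compact equivalent is sketched here purely as an illustration; the helper name is_attachment and the exact extension set are assumptions, not part of the committed code:

import os

# Extensions treated as downloadable attachments in the scrapers above and below.
ATTACHMENT_EXTS = {'.pdf', '.wps', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt'}

def is_attachment(href):
    # Illustrative sketch: case-insensitive check of the link's file extension.
    return os.path.splitext(href)[1].lower() in ATTACHMENT_EXTS

# Usage mirroring the attachment loops in ji_lin() and tian_jin():
# if is_attachment(fu_jian_href):
#     retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)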
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 天津
def tian_jin():
def tian_jin1():
num = 0
count = 0
start_time = time.time()
for page in range(0, 3):
if page == 0:
url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/'
else:
url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/index_{page}.html'
try:
baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
req = requests.get(url=url, headers=baseTool.headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
i_href = str(li.find('a').get('href'))
if 'ZTZL' in i_href:
href = i_href.replace('../../../', 'https://sasac.tj.gov.cn/')
elif './' in i_href:
href = i_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/')
else:
href = i_href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
topicClassification = doc_href(
'div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
writtenDate_ = doc_href('div[class="top-container"]>div:nth-child(6)>:nth-child(2)').text()
publishDate_ = doc_href('div[class="top-container"]>div:nth-child(7)>:nth-child(2)').text()
date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
writtenDate = date_obj1.strftime("%Y-%m-%d")
date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj2.strftime("%Y-%m-%d")
doc_href('div[id="articlePlayer"]').remove()
contentWithTag = doc_href('div[id="xlrllt"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
"""
num = 0
count = 0
start_time = time.time()
for page in range(0, 5):
if page == 0:
url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html'
else:
url = f'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index_{page}.html'
try:
baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
req = requests.get(url=url, headers=baseTool.headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
href = str(li.find('a').get('href'))
if 'http:' in href:
continue
else:
href = url.split('index')[0] + href.replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
topicClassification = doc_href(
'div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
writtenDate_ = doc_href('div[id="content_cwrq"]').text()
publishDate_ = doc_href('div[id="content_fbrq"]').text()
date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
writtenDate = date_obj1.strftime("%Y-%m-%d")
date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj2.strftime("%Y-%m-%d")
contentWithTag = doc_href('div[id="xlrllt"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
if id_list:
pass
else:
doc_href("ul[class='qt-attachments-list']").remove()
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
url = 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index.html'
else:
# https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_1.html
url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_{page - 1}.html'
try:
req = requests.get(url, headers=baseTool.headers, verify=False)  # pass headers by keyword; the second positional argument of requests.get is params
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
href = str(li.find('a').get('href'))
try:
publishDate = li.find('div', attrs={'class': 'other'}).text
except:
publishDate = None
if 'http' not in href:
if '../../../' in href:
href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# res = requests.get(href, headers)
# page_text = res.text.encode("ISO-8859-1")
# page_text = page_text.decode("utf-8")
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('table[class="bd1"]>tbody>tr:nth-child(3)>td:nth-child(2)').text()
organ = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(2)').text()
issuedNumber = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(2)').text()
topicClassification = doc_href(
'table[class="bd1"]>tbody>tr:nth-child(1)>td:nth-child(4)').text()
writtenDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(4)').text()
publishDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(4)').text()
contentWithTag = doc_href('div[id="UCAP-CONTENT"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(title) < 1:
doc_href = doc_href('div[aria-label="内容文本区"]')
doc_soup = BeautifulSoup(str(doc_href), 'html.parser')
info_list = doc_soup.find('tbody').find('tbody').find('tr').find_all('table')
title_tag = info_list[0]
organ = info_list[2].find('span', id="laiyuan").text
publishDate = info_list[2].find_all('td', class_="hui12")[-1].text
contentWithTag = info_list[-1]
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
tian_jin3()
if __name__ == "__main__":
tian_jin()
\ No newline at end of file
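tian_jin1 and tian_jin2 normalise the Chinese-formatted dates read from the detail pages (for example 2023年09月01日) into the YYYY-MM-DD form used in the Kafka payload. A minimal sketch of that conversion with an empty-field fallback; the helper name and sample value are invented for the example:

import datetime

def normalize_cn_date(text):
    # Convert '2023年09月01日' to '2023-09-01'; return None when the field is empty.
    text = text.strip()
    if not text:
        return None
    return datetime.datetime.strptime(text, "%Y年%m月%d日").strftime("%Y-%m-%d")

print(normalize_cn_date('2023年09月01日'))  # 2023-09-01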