Commit 9ab6c127 by LiuLiYuan

政策法规 9/9 (Policies & Regulations, 9/9)

Parent eeb41ef7
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
 from kafka import KafkaProducer
 from pyquery import PyQuery as pq
 from requests.packages import urllib3
+from requests.adapters import HTTPAdapter
 from BaseCore import BaseCore
 baseCore = BaseCore()
@@ -110,7 +110,7 @@ def sendKafka(dic_news):
         # 传输成功,写入日志中
         state = 1
         takeTime = baseCore.getTimeCost(start_time, time.time())
-        # return True
+        return True
     except Exception as e:
@@ -124,6 +124,7 @@ def sendKafka(dic_news):
         e = 'Kafka操作失败'
         state = 0
         takeTime = baseCore.getTimeCost(start_time, time.time())
+        return False
 def redefid(idList):
     id_ = ','.join(map(str, idList))
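With this hunk, sendKafka now tells its callers whether the message actually reached Kafka. A minimal, self-contained sketch of that contract and of the caller pattern the rest of this commit rolls out everywhere (the broker address, topic name and the send_policy_record helper are illustrative assumptions, not code from this repository):

```python
from kafka import KafkaProducer
import json

# Illustrative only: the real project builds its producer inside sendKafka.
producer = KafkaProducer(
    bootstrap_servers=['127.0.0.1:9092'],  # assumed broker address
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))

def send_policy_record(dic_news: dict) -> bool:
    """Return True only when the broker acknowledged the record, False otherwise."""
    try:
        producer.send('policy', dic_news).get(timeout=10)  # assumed topic name
        return True
    except Exception:
        return False

# Caller pattern used throughout this commit: persist only after a successful send.
# flag = sendKafka(dic_news)
# if flag:
#     save_data(dic_news)
```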
@@ -132,7 +133,38 @@ def redefid(idList):
 def remove_dup():
     pass
+# 国务院文件
 def get_content1():
+    def getPageConunt(a_list, url, headers, s):
+        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
+                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+                "trackTotalHits": "true",
+                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
+                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
+                "pageSize": 20, "pageNo": 1}
+        data = json.dumps(data)
+        ip = baseCore.get_proxy()
+        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+        # 获得结果为json格式
+        res_text = json.loads(res.text)
+        pageCount = res_text['result']['data']['pager']['pageCount']
+        return pageCount
+    def getList(a_list, url, headers, pageNo, s):
+        # post请求所需参数
+        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
+                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
+                "trackTotalHits": "true",
+                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
+                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
+                "pageSize": 20, "pageNo": pageNo}
+        data = json.dumps(data)
+        ip = baseCore.get_proxy()
+        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
+        res_text = json.loads(res.text)
+        page_list = res_text['result']['data']['list']
+        return page_list
     start_time = time.time()
     num = 0
     # 过网站验证所需 athenaAppKey athenaAppName
@@ -163,86 +195,142 @@ def get_content1():
     result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                    ['国办函', "1103"],
                    ['国办发明电', "1102"], ['其他', "1101"]]
-    try:
     for a_list in result_list:
         s = requests.session()
+        s.mount('https://', HTTPAdapter(max_retries=3))
+        s.mount('http://', HTTPAdapter(max_retries=3))
         s.keep_alive = False
-        pageNo = 1
         pcodeJiguan = a_list[0]
-        # post请求所需参数
-        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
-                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
-                "trackTotalHits": "true",
-                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
-                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
-                "pageSize": 20, "pageNo": pageNo}
-        data = json.dumps(data)
-        res = s.post(url=url, headers=headers, data=data, verify=False)
-        # 获得结果为json格式
-        res_text = json.loads(res.text)
-        page_list = res_text['result']['data']['list']
+        try:
+            pageCount = getPageConunt(a_list, url, headers, s)
+            for pageNo in range(1, pageCount + 1):
+                try:
+                    try:
+                        page_list = getList(a_list, url, headers, pageNo, s)
+                    except:
                         s.close()
+                        page_list = getList(a_list, url, headers, pageNo, s)
                     for page in page_list:
+                        id_list = []
                         # 获取所需信息
-                        title = page['maintitle']
-                        pub_time1 = page['publish_time']
-                        pub_time2 = page['cwrq']
-                        pub_code = page['fwzh']
-                        href = page['pub_url']
+                        title = page['maintitle'] # 标题
+                        pub_time1 = page['publish_time'] # 发布时间
+                        pub_time2 = page['cwrq'] # 成文时间
+                        pub_code = page['fwzh'] # 发文字号
+                        href = page['pub_url'] # 网址
                         # 判断是否已经爬取过
                         is_href = db_storage.find_one({'网址': href})
                         if is_href:
+                            log.info('已采集----------跳过')
                             continue
                         try:
                             resp_href = requests.get(url=href, headers=headers_, verify=False)
                             resp_href.encoding = resp_href.apparent_encoding
                             i_html = resp_href.text
                             if '您访问的页面不存在或已删除' in i_html:
+                                # log.error(f'{title}...{href}...页面不存在或已删除')
                                 continue
                             i_soup = BeautifulSoup(i_html, 'html.parser')
+                            i_soup = paserUrl(i_soup, href)
                             source = str(i_soup.find_all('tbody')[0])
-                            pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]
-                            child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]
-                            content = str(i_soup.find('table', attrs={'class': 'pages_content'}))
-                            fu_jian_result = re.findall('href="(.*?)"', content)
-                            fu_jian_href_list = []
-                            if len(fu_jian_result) > 0:
-                                for fu_jian_re in fu_jian_result:
-                                    if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-                                            or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-                                            or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-                                        fu_jian_href = fu_jian_re
-                                        fu_jian_href_list.append(fu_jian_href)
-                            result_dict = {
-                                '标题': title,
-                                '来源': '',
-                                '发文机关': pub_org,
-                                '发文字号': pub_code,
-                                '内容-未去标签': content,
-                                '附件网址': fu_jian_href_list,
-                                '发布时间': pub_time1,
-                                '成文时间': pub_time2,
-                                '主题分类': child_type,
-                                '网址': href,
-                                '归属': pcodeJiguan,
-                                '信息来源': '国务院文件',
-                                'tid': 1766,
+                            pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
+                                0] # 发文机关
+                            child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # 主题分类
+                            contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
+                            # 去除扫一扫
+                            contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
+                            content = contentWithTag.text # 不带标签正文
+                            fu_jian_soup = contentWithTag.find_all('a')
+                            time.sleep(0.5)
+                            for file in fu_jian_soup:
+                                try:
+                                    file_href = file['href']
+                                except Exception as e:
+                                    log.info(f'---{href}--------{e}-------')
+                                    continue
+                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+                                    file_name = file.text.strip()
+                                    retData = baseCore.uploadToserver(file_href,'1766')
+                                    if retData['state']:
+                                        pass
+                                    else:
+                                        continue
+                                    att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
+                                    id_list.append(att_id)
+                                    #todo:将返回的地址更新到soup
+                                    file['href'] = 'http://114.115.215.96/' + full_path
+                        except:
+                            log.error(f'{title}...{href}...获取内容失败')
+                            continue
+                        #todo:替换完成之后,将附件上传至文件服务器
+                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        #todo:传kafka字段
+                        dic_news = {
+                            'attachmentIds': id_list, #附件id
+                            'author': '', #作者
+                            'content': content, #正文不带标签
+                            'contentWithTag': str(contentWithTag), #正文带标签
+                            'createDate': time_now, #创建时间
+                            'deleteFlag': 0, #是否删除(0为默认,1为删除)
+                            'id': '', #
+                            'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
+                            'origin': '', #政策发布机关
+                            'organ': pub_org, #政策发文机关
+                            'topicClassification': child_type, #政策文件分类
+                            'issuedNumber': pub_code, #发文字号
+                            'publishDate': pub_time1, #发布时间
+                            'writtenDate': pub_time2, #成文时间
+                            'sid': '1697458829758697473', #信息源id
+                            'sourceAddress': href[0], #原文链接
+                            'summary': '', #摘要
+                            'title': title #标题
                         }
-                        resp_href.close()
-                        print(title)
-                        # save_data(result_dict)
-                        # time.sleep(1)
+                        # print(dic_news)
+                        flag = sendKafka(dic_news)
+                        if flag:
+                            save_data(dic_news)
                         num += 1
                 except:
-                    pass
+                    log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
+                    continue
         except:
-            pass
+            log.error(f'{pcodeJiguan}...获取总数失败')
+            continue
     end_time = time.time()
     print(f'共抓取{num}条数据,共耗时{start_time - end_time}')
 # 国务院部门文件
 def get_content2():
+    def getTotalpage(bmfl,headers,session):
+        ip = baseCore.get_proxy()
+        pageNo = 1
+        time.sleep(2)
+        # 拼接url
+        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
+        resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
+        resp_text = resp.text
+        resp_json = json.loads(resp_text)
+        totalpage = resp_json['searchVO']['totalpage']
+        return totalpage
+    def getContentList(bmfl,pageNo,headers,session):
+        ip = baseCore.get_proxy()
+        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
+        # 请求结果为json格式
+        resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
+        resp_text = resp.text
+        resp_json = json.loads(resp_text)
+        content_list = resp_json['searchVO']['listVO']
+        return content_list
+    session = requests.session()
+    session.mount('https://', HTTPAdapter(max_retries=3))
+    session.mount('http://', HTTPAdapter(max_retries=3))
+    session.keep_alive = False
     start_time = time.time()
     num = 0
     result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
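Both rewritten functions now follow the same pagination shape: fetch the total page count once, then walk pages 1..N through a session mounted with HTTPAdapter(max_retries=3) and a per-request proxy, closing and retrying once when a page request fails. A condensed, hedged sketch of that shape (make_session, fetch_page, the query parameters and the totalpage field name are placeholders, not the project's real helpers):

```python
import json
import requests
from requests.adapters import HTTPAdapter

def make_session() -> requests.Session:
    # Retry transient connection failures, mirroring the commit's session setup.
    s = requests.Session()
    s.mount('https://', HTTPAdapter(max_retries=3))
    s.mount('http://', HTTPAdapter(max_retries=3))
    return s

def fetch_page(session, url, page_no):
    # Placeholder request; the real helpers POST a JSON body or GET a search URL per source.
    resp = session.get(url, params={'p': page_no, 'n': 20}, verify=False, timeout=30)
    return json.loads(resp.text)

def crawl(url):
    session = make_session()
    total_pages = fetch_page(session, url, 1).get('totalpage', 1)  # assumed field name
    for page_no in range(1, total_pages + 1):
        try:
            yield fetch_page(session, url, page_no)
        except requests.RequestException:
            session.close()           # drop the pooled connection, then retry the page once
            session = make_session()
            yield fetch_page(session, url, page_no)
```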
@@ -262,20 +350,16 @@ def get_content2():
     for bmfl in result_list:
         try:
-            pageNo = 0
-            time.sleep(2)
-            # 拼接url
-            url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
+            totalpage = getTotalpage(bmfl,headers,session)
+            for pageNo in range(1,totalpage+1):
                 try:
-                    # 请求结果为json格式
-                    resp = requests.get(url=url_, headers=headers, verify=False)
-                    resp_text = resp.text
-                    resp_json = json.loads(resp_text)
-                    content_list = resp_json['searchVO']['listVO']
-                    resp.close()
+                    try:
+                        content_list = getContentList(bmfl,pageNo,headers,session)
                     except:
-                        continue
+                        session.close()
+                        content_list = getContentList(bmfl,pageNo,headers,session)
                     for content_dict in content_list:
+                        id_list = []
                         href = content_dict['url'] # 详情页
                         title = content_dict['title'] # 标题
                         pub_code = content_dict['pcode'] # 发文字号
@@ -294,55 +378,198 @@ def get_content2():
                             child_type = content_dict['childtype'] # 主题分类
                         except:
                             child_type = ''
-                        # 判断是否已经爬取过
+                        # # 判断是否已经爬取过
                         is_href = db_storage.find_one({'网址': href})
                         if is_href:
+                            log.info('已采集----------跳过')
                             continue
                         try:
                             resp = requests.get(url=href, headers=headers, verify=False)
-                            resp.encoding = 'utf-8'
+                            resp.encoding = resp.apparent_encoding
                             resp_text = resp.text
                             soup = BeautifulSoup(resp_text, 'html.parser')
-                            time.sleep(1)
-                            content = str(soup.find('div', attrs={'class': 'pages_content mhide'}))
-                            fu_jian_result = re.findall('href="(.*?)"', content)
-                            fu_jian_href_list = []
-                            if len(fu_jian_result) > 0:
-                                for fu_jian_re in fu_jian_result:
-                                    if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-                                            or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-                                            or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-                                        fu_jian_href = href.split('content')[0] + fu_jian_re
-                                        fu_jian_href_list.append(fu_jian_href)
-                            resp.close()
-                            result_dict = {
-                                '标题': title,
-                                '来源': '',
-                                '发文机关': pub_org,
-                                '发文字号': pub_code,
-                                '内容-未去标签': content,
-                                '附件网址': fu_jian_href_list,
-                                '发布时间': pub_time1,
-                                '成文时间': pub_time2,
-                                '主题分类': child_type,
-                                '网址': href,
-                                '归属': bmfl,
-                                '信息来源': '国务院部门文件',
-                                'tid': 1699,
+                            soup = paserUrl(soup,href)
+                            time.sleep(0.5)
+                            contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
+                            content = contentWithTag.text
+                            fu_jian_soup = contentWithTag.find_all('a')
+                            for file in fu_jian_soup:
+                                try:
+                                    file_href = file['href']
+                                except Exception as e:
+                                    log.info(f'---{href}--------{e}-------')
+                                    continue
+                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+                                    file_name = file.text.strip()
+                                    retData = baseCore.uploadToserver(file_href,'1699')
+                                    if retData['state']:
+                                        pass
+                                    else:
+                                        continue
+                                    att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
+                                    id_list.append(att_id)
+                                    #todo:将返回的地址更新到soup
+                                    file['href'] = 'http://114.115.215.96/' + full_path
+                        except:
+                            print(f'{title}...{href}获取内容失败')
+                            continue
+                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                        #todo:传kafka字段
+                        dic_news = {
+                            'attachmentIds': id_list, #附件id
+                            'author': '', #作者
+                            'content': content, #正文不带标签
+                            'contentWithTag': str(contentWithTag), #正文带标签
+                            'createDate': time_now, #创建时间
+                            'deleteFlag': 0, #是否删除(0为默认,1为删除)
+                            'id': '', #
+                            'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
+                            'origin': '', #政策发布机关
+                            'organ': pub_org, #政策发文机关
+                            'topicClassification': child_type, #政策文件分类
+                            'issuedNumber': pub_code, #发文字号
+                            'publishDate': pub_time1, #发布时间
+                            'writtenDate': pub_time2, #成文时间
+                            'sid': '1697458829758697473', #信息源id
+                            'sourceAddress': href, #原文链接
+                            'summary': '', #摘要
+                            'title': title #标题
                         }
-                        print(title)
-                        save_data(result_dict)
+                        # print(dic_news)
+                        flag = sendKafka(dic_news)
+                        if flag:
+                            save_data(dic_news)
                         num += 1
                 except:
-                    pass
+                    print(f'{bmfl}...第{pageNo}页获取信息列表失败')
+                    continue
         except:
-            pass
+            print(f'{bmfl}...获取页数失败')
+            continue
     end_time = time.time()
     print(f'共抓取{num}条数据,耗时{end_time - start_time}')
 # 国务院国有资产监督管理委员会-政策发布
 def get_content3():
+    def getPage():
+        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
+        req = requests.get(url, headers=headers, verify=False)
+        req.encoding = req.apparent_encoding
+        soup = BeautifulSoup(req.text, 'html.parser')
+        totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
+        return int(totalpage)
+    def sendContent(href, headers,title,pub_time,num):
+        id_list = []
+        resp_href = requests.request("GET", href, headers=headers, verify=False)
+        resp_href.encoding = resp_href.apparent_encoding
+        soup = BeautifulSoup(resp_href.text, 'lxml')
+        soup = paserUrl(soup, href)
+        doc_href = soup.find('div', class_='zsy_content')
+        try:
+            org_content = doc_href.select('.zsy_cotitle')[0]
+            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
+        except:
+            org = ''
+        contentWithTag = doc_href.find('div', class_='zsy_comain')
+        contentWithTag.select('#qr_container')[0].decompose()
+        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
+        contentWithTag.find('div', class_='related').decompose()
+        contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
+        try:
+            p_list = contentWithTag.findAll('p')
+            pub_hao = ''
+            for p in p_list:
+                p = str(p.text)
+                if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
+                    try:
+                        pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
+                    except:
+                        pub_hao = p.strip().lstrip()
+                    break
+        except:
+            pub_hao = ''
+        if len(pub_hao) > 15:
+            pub_hao = ''
+        content = contentWithTag.text
+        fu_jian_soup = contentWithTag.find_all('a')
+        for file in fu_jian_soup:
+            try:
+                file_href = file['href']
+            except Exception as e:
+                log.info(f'---{href}--------{e}-------')
+                continue
+            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
+                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
+                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
+                file_name = file.text.strip()
+                retData = baseCore.uploadToserver(file_href,'1642')
+                if retData['state']:
+                    pass
+                else:
+                    continue
+                att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num)
+                id_list.append(att_id)
+                #todo:将返回的地址更新到soup
+                file['href'] = 'http://114.115.215.96/' + full_path
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        #todo:传kafka字段
+        dic_news = {
+            'attachmentIds': id_list, #附件id
+            'author': '', #作者
+            'content': content, #正文不带标签
+            'contentWithTag': str(contentWithTag), #正文带标签
+            'createDate': time_now, #创建时间
+            'deleteFlag': 0, #是否删除(0为默认,1为删除)
+            'id': '', #
+            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
+            'origin': '', #政策发布机关
+            'organ': org, #政策发文机关
+            'topicClassification': '', #政策文件分类
+            'issuedNumber': pub_hao, #发文字号
+            'publishDate': pub_time, #发布时间
+            'writtenDate': '', #成文时间
+            'sid': '1697458829758697473', #信息源id
+            'sourceAddress': href, #原文链接
+            'summary': '', #摘要
+            'title': title #标题
+        }
+        # print(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
+            save_data(dic_news)
+    def partTwo():
+        start_time = time.time()
+        num = 0
+        totalpage = getPage()
+        for page in range(1, totalpage):
+            url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
+            href_resp = requests.request("GET", url, headers=headers, verify=False)
+            resp_text = href_resp.content.decode('UTF-8')
+            li_list = resp_text.split('<li>')
+            del (li_list[0])
+            for li in li_list:
+                id_list = []
+                href_ = li.split('<a href="')[1].split('" target=')[0]
+                title = li.split('title="')[1].split('">')[0]
+                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
+                pub_time = li.split('<span>[')[1].split(']</span>')[0]
+                is_href = db_storage.find_one({'网址': href})
+                if is_href:
+                    log.info('已采集----------跳过')
+                    continue
+                sendContent(href, headers,title,pub_time,num)
+                num += 1
+        end_time = time.time()
+        print(f'共抓取{num}条数据,耗时{end_time - start_time}')
+    def partOne():
         start_time = time.time()
         num = 0
         url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
@@ -363,70 +590,22 @@ def get_content3():
                 # 判断是否已经爬取过
                 is_href = db_storage.find_one({'网址': href})
                 if is_href:
+                    log.info('已采集----------跳过')
                     continue
                 title = doc_item('a').attr('title')
                 pub_time = doc_item('span').text().replace('[', '').replace(']', '')
             except:
                 continue
-            try:
-                try:
-                    resp_href = requests.request("GET", href, headers=headers, verify=False)
-                    doc_href = pq(resp_href.content)
-                    time.sleep(1)
-                    content_html = str(doc_href('.zsy_comain').remove('style').remove('#qr_container'))
-                    content = pq(content_html).text()
-                except:
-                    continue
-                if content.strip() == '':
-                    continue
-                try:
-                    org_content = doc_href('.zsy_cotitle').text()
-                    org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-                except:
-                    org = ''
-                try:
-                    resp_href.encoding = 'utf-8'
-                    resp_text_ = BeautifulSoup(resp_href.text, 'html.parser')
-                    zsy_comain = resp_text_.find('div', attrs={'class': 'zsy_comain'})
-                    p_list = zsy_comain.findAll('p')
-                    pub_hao = ''
-                    for p in p_list:
-                        p = str(p.text)
-                        if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
-                            try:
-                                pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
-                            except:
-                                pub_hao = p.strip().lstrip()
-                            break
-                except:
-                    pub_hao = ''
-                if len(pub_hao) > 45:
-                    pub_hao = ''
-                result_dict = {
-                    '标题': title,
-                    '来源': org,
-                    '发文机关': '',
-                    '发文字号': pub_hao,
-                    '内容-未去标签': content_html,
-                    '附件网址': [],
-                    '发布时间': pub_time,
-                    '成文时间': '',
-                    '主题分类': '',
-                    '网址': href,
-                    '归属': '国务院国资委',
-                    '信息来源': '国务院国资委',
-                    'tid': 1642,
-                }
-                save_data(result_dict)
-                print(title)
+            sendContent(href, headers,title,pub_time,num)
             num += 1
         except:
             pass
-    except:
-        pass
     end_time = time.time()
     print(f'共抓取{num}条数据,耗时{end_time - start_time}')
+    partOne()
+    partTwo()
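Each rewritten crawler runs the parsed page through paserUrl(soup, href) before harvesting attachment links, so relative URLs become absolute, and then repoints every matched <a> at the uploaded copy on the file server. paserUrl itself is not shown in this commit; the sketch below is an assumed implementation based on the urljoin import that follows, and the file-server prefix is copied from the diff:

```python
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # Assumed implementation: resolve relative href/src attributes against the page URL.
    for tag in soup.find_all(['a', 'img']):
        for attr in ('href', 'src'):
            if tag.has_attr(attr):
                tag[attr] = urljoin(base_url, tag[attr])
    return soup

# After uploading an attachment, the crawlers repoint the link at the mirror:
# file['href'] = 'http://114.115.215.96/' + full_path
```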
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -569,7 +748,8 @@ def bei_jing():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         # print(id)
         # id_list.append(id)
@@ -687,8 +867,9 @@ def nei_meng_gu():
             'summary':'',
             'title':title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num = num + 1
@@ -872,7 +1053,8 @@ def ji_lin():
                 continue
             else:
                 # print(dic_news)
-                sendKafka(dic_news)
+                flag = sendKafka(dic_news)
+                if flag:
                     save_data(dic_news)
                 num = num + 1
         except Exception as e:
@@ -1006,7 +1188,8 @@ def shang_hai():
             'summary': '',
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num = num + 1
     except:
@@ -1123,7 +1306,8 @@ def zhe_jiang():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num = num + 1
@@ -1278,7 +1462,8 @@ def fu_jian():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         num += 1
@@ -1386,7 +1571,8 @@ def shan_dong():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         if content == '' or content == 'None':
             continue
@@ -1485,7 +1671,8 @@ def guang_dong():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         # save_data(result_dict)
@@ -1656,7 +1843,8 @@ def hai_nan():
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
@@ -1724,7 +1912,8 @@ def hai_nan():
             'summary': '',
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         href_text.close()
         # save_data(result_dict)
@@ -1826,7 +2015,8 @@ def hai_nan():
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         href_text.close()
         # save_data(result_dict)
@@ -1929,7 +2119,8 @@ def hai_nan():
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         href_text.close()
         # save_data(result_dict)
@@ -2012,7 +2203,8 @@ def hai_nan():
             'title': title
         }
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         href_text.close()
         # save_data(result_dict)
@@ -2182,7 +2374,8 @@ def si_chuan():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
@@ -2304,7 +2497,8 @@ def guang_xi():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         num = num + 1
@@ -2409,7 +2603,8 @@ def gui_zhou():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         # save_data(result_dict)
@@ -2518,7 +2713,8 @@ def yun_nan():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         num = num + 1
@@ -2627,8 +2823,9 @@ def yun_nan():
             'title': title
         }
         # print(dic_news)
-        # sendKafka(dic_news)
-        # save_data(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
+            save_data(dic_news)
         print(title)
         num = num + 1
@@ -2751,7 +2948,8 @@ def chong_qing():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         print(title)
         # save_data(result_dict)
@@ -2873,7 +3071,8 @@ def tian_jin():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -2992,7 +3191,8 @@ def tian_jin():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3115,7 +3315,8 @@ def tian_jin():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3221,7 +3422,8 @@ def xin_jiang():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3318,7 +3520,8 @@ def xin_jiang():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
         href_res.close()
@@ -3436,7 +3639,8 @@ def shan_xi():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3544,7 +3748,8 @@ def liao_ning():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3638,7 +3843,8 @@ def hei_long_jiang():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3751,7 +3957,8 @@ def jiang_su():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3841,7 +4048,8 @@ def an_hui():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -3935,7 +4143,8 @@ def an_hui():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
         href_res.close()
@@ -4062,7 +4271,8 @@ def jiang_xi():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -4154,7 +4364,8 @@ def he_nan():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
         href_res.close()
@@ -4251,7 +4462,8 @@ def hu_nan():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -4372,7 +4584,8 @@ def gan_su():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except Exception as e:
@@ -4506,7 +4719,8 @@ def gan_su():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except Exception as e:
@@ -4661,7 +4875,8 @@ def gan_su():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except Exception as e:
@@ -4759,7 +4974,8 @@ def ning_xia():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -4857,7 +5073,8 @@ def shanxi():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
         res_href.close()
@@ -4951,7 +5168,8 @@ def xi_zang():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -5047,7 +5265,8 @@ def qing_hai():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         # print(id)
         # id_list.append(id)
@@ -5164,7 +5383,8 @@ def qing_hai():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         # print(id)
         # id_list.append(id)
@@ -5262,7 +5482,8 @@ def he_bei():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except:
@@ -5370,7 +5591,8 @@ def hu_bei():
             'title': title
         }
         # print(dic_news)
-        sendKafka(dic_news)
+        flag = sendKafka(dic_news)
+        if flag:
             save_data(dic_news)
         num += 1
     except Exception as e:
...