Commit 9ab6c127 Author: LiuLiYuan

Policies and regulations 9/9

Parent eeb41ef7
......@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore
baseCore = BaseCore()
......@@ -110,7 +110,7 @@ def sendKafka(dic_news):
# Transfer succeeded; write it to the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
# return True
return True
except Exception as e:
......@@ -124,6 +124,7 @@ def sendKafka(dic_news):
e = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
return False
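# sendKafka returns True only when the message is delivered successfully and False otherwise;
# callers use this flag to decide whether to persist the record with save_data.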
def redefid(idList):
id_ = ','.join(map(str, idList))
......@@ -132,7 +133,38 @@ def redefid(idList):
def remove_dup():
pass
# State Council documents
def get_content1():
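# get_content1 crawls the "State Council documents" library: for every document-number
# category in result_list it queries the search API for the page count, walks each page of
# results, skips URLs already stored in db_storage, parses the detail page with BeautifulSoup,
# uploads attachments via baseCore.uploadToserver, and pushes the assembled record to Kafka.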
def getPageConunt(a_list, url, headers, s):
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": 1}
data = json.dumps(data)
ip = baseCore.get_proxy()
res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
# The response is JSON
res_text = json.loads(res.text)
pageCount = res_text['result']['data']['pager']['pageCount']
return pageCount
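# getList issues the same POST query for a specific pageNo and returns that page's list of
# result records (pageSize is 20).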
def getList(a_list, url, headers, pageNo, s):
# Parameters for the POST request
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": pageNo}
data = json.dumps(data)
ip = baseCore.get_proxy()
res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
res_text = json.loads(res.text)
page_list = res_text['result']['data']['list']
return page_list
start_time = time.time()
num = 0
# athenaAppKey and athenaAppName are required to pass the site's verification
......@@ -163,86 +195,142 @@ def get_content1():
result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
['国办函', "1103"],
['国办发明电', "1102"], ['其他', "1101"]]
try:
for a_list in result_list:
s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))
s.mount('http://', HTTPAdapter(max_retries=3))
s.keep_alive = False
pageNo = 1
pcodeJiguan = a_list[0]
# Parameters for the POST request
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": pageNo}
data = json.dumps(data)
res = s.post(url=url, headers=headers, data=data, verify=False)
# The response is JSON
res_text = json.loads(res.text)
page_list = res_text['result']['data']['list']
try:
pageCount = getPageConunt(a_list, url, headers, s)
for pageNo in range(1, pageCount + 1):
try:
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# Extract the fields we need
title = page['maintitle']
pub_time1 = page['publish_time']
pub_time2 = page['cwrq']
pub_code = page['fwzh']
href = page['pub_url']
title = page['maintitle'] # title
pub_time1 = page['publish_time'] # publish date
pub_time2 = page['cwrq'] # written date
pub_code = page['fwzh'] # issue number
href = page['pub_url'] # URL
# Skip records that have already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
try:
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
continue
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]
content = str(i_soup.find('table', attrs={'class': 'pages_content'}))
fu_jian_result = re.findall('href="(.*?)"', content)
fu_jian_href_list = []
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_href = fu_jian_re
fu_jian_href_list.append(fu_jian_href)
result_dict = {
'标题': title,
'来源': '',
'发文机关': pub_org,
'发文字号': pub_code,
'内容-未去标签': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time1,
'成文时间': pub_time2,
'主题分类': child_type,
'网址': href,
'归属': pcodeJiguan,
'信息来源': '国务院文件',
'tid': 1766,
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # issuing organ
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # topic classification
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# Remove the "scan QR code" block
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # body text without tags
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1766')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo: after the replacement is done, upload the attachments to the file server
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': pub_org, # issuing organ
'topicClassification': child_type, # policy topic classification
'issuedNumber': pub_code, # issue number
'publishDate': pub_time1, # publish date
'writtenDate': pub_time2, # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
resp_href.close()
print(title)
# save_data(result_dict)
# time.sleep(1)
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
pass
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
except:
pass
log.error(f'{pcodeJiguan}...获取总数失败')
continue
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# State Council department documents
def get_content2():
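# get_content2 crawls the "State Council department documents" library: for every department
# name (bmfl) in result_list it fetches the total page count, pages through the JSON search
# results, skips URLs already stored in db_storage, parses each detail page, uploads
# attachments, and sends the record to Kafka.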
def getTotalpage(bmfl,headers,session):
ip = baseCore.get_proxy()
pageNo = 1
time.sleep(2)
# Build the request URL
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
totalpage = resp_json['searchVO']['totalpage']
return totalpage
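# getContentList fetches one result page for the given department (bmfl) and returns the
# listVO array from the JSON response.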
def getContentList(bmfl,pageNo,headers,session):
ip = baseCore.get_proxy()
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
# The response is JSON
resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
return content_list
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
session.keep_alive = False
start_time = time.time()
num = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
......@@ -262,20 +350,16 @@ def get_content2():
for bmfl in result_list:
try:
pageNo = 0
time.sleep(2)
# Build the request URL
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
totalpage = getTotalpage(bmfl,headers,session)
for pageNo in range(1,totalpage+1):
try:
# The response is JSON
resp = requests.get(url=url_, headers=headers, verify=False)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
resp.close()
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
continue
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # detail page URL
title = content_dict['title'] # title
pub_code = content_dict['pcode'] # issue number
......@@ -294,55 +378,198 @@ def get_content2():
child_type = content_dict['childtype'] # topic classification
except:
child_type = ''
# Check whether the record has already been crawled
# # Check whether the record has already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = 'utf-8'
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
time.sleep(1)
content = str(soup.find('div', attrs={'class': 'pages_content mhide'}))
fu_jian_result = re.findall('href="(.*?)"', content)
fu_jian_href_list = []
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_href = href.split('content')[0] + fu_jian_re
fu_jian_href_list.append(fu_jian_href)
resp.close()
result_dict = {
'标题': title,
'来源': '',
'发文机关': pub_org,
'发文字号': pub_code,
'内容-未去标签': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time1,
'成文时间': pub_time2,
'主题分类': child_type,
'网址': href,
'归属': bmfl,
'信息来源': '国务院部门文件',
'tid': 1699,
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1699')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
except:
print(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': pub_org, # issuing organ
'topicClassification': child_type, # policy topic classification
'issuedNumber': pub_code, # issue number
'publishDate': pub_time1, # publish date
'writtenDate': pub_time2, # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
print(title)
save_data(result_dict)
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
pass
print(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
except:
pass
print(f'{bmfl}...获取页数失败')
continue
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
# SASAC (State-owned Assets Supervision and Administration Commission of the State Council) - policy releases
def get_content3():
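# get_content3 crawls the SASAC "policy releases" column: getPage reads maxPageNum from the
# index page, sendContent parses one detail page and pushes it to Kafka, partTwo walks the
# numbered index pages, and partOne handles the first (unnumbered) index page.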
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
return int(totalpage)
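# sendContent downloads a detail page, extracts the source organ, issue number, body text
# and attachments, uploads the attachments, and sends the assembled record to Kafka.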
def sendContent(href, headers,title,pub_time,num):
id_list = []
resp_href = requests.request("GET", href, headers=headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
soup = BeautifulSoup(resp_href.text, 'lxml')
soup = paserUrl(soup, href)
doc_href = soup.find('div', class_='zsy_content')
try:
org_content = doc_href.select('.zsy_cotitle')[0].text
org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
contentWithTag = doc_href.find('div', class_='zsy_comain')
contentWithTag.select('#qr_container')[0].decompose()
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
contentWithTag.find('div', class_='related').decompose()
contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
try:
p_list = contentWithTag.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1642')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': org, # issuing organ
'topicClassification': '', # policy topic classification
'issuedNumber': pub_hao, # issue number
'publishDate': pub_time, # publish date
'writtenDate': '', # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
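# partTwo iterates the numbered index pages (index_2603340_{page}.html), splits each page
# into <li> entries, skips URLs already stored in db_storage, and hands new links to sendContent.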
def partTwo():
start_time = time.time()
num = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
href_resp = requests.request("GET", url, headers=headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
li_list = resp_text.split('<li>')
del (li_list[0])
for li in li_list:
id_list = []
href_ = li.split('<a href="')[1].split('" target=')[0]
title = li.split('title="')[1].split('">')[0]
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
sendContent(href, headers,title,pub_time,num)
num += 1
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
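# partOne processes the first (unnumbered) index page with pyquery, applies the same
# already-crawled check, and calls sendContent for each new link.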
def partOne():
start_time = time.time()
num = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
......@@ -363,70 +590,22 @@ def get_content3():
# Check whether the record has already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
title = doc_item('a').attr('title')
pub_time = doc_item('span').text().replace('[', '').replace(']', '')
except:
continue
try:
try:
resp_href = requests.request("GET", href, headers=headers, verify=False)
doc_href = pq(resp_href.content)
time.sleep(1)
content_html = str(doc_href('.zsy_comain').remove('style').remove('#qr_container'))
content = pq(content_html).text()
except:
continue
if content.strip() == '':
continue
try:
org_content = doc_href('.zsy_cotitle').text()
org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
try:
resp_href.encoding = 'utf-8'
resp_text_ = BeautifulSoup(resp_href.text, 'html.parser')
zsy_comain = resp_text_.find('div', attrs={'class': 'zsy_comain'})
p_list = zsy_comain.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 45:
pub_hao = ''
result_dict = {
'标题': title,
'来源': org,
'发文机关': '',
'发文字号': pub_hao,
'内容-未去标签': content_html,
'附件网址': [],
'发布时间': pub_time,
'成文时间': '',
'主题分类': '',
'网址': href,
'归属': '国务院国资委',
'信息来源': '国务院国资委',
'tid': 1642,
}
save_data(result_dict)
print(title)
sendContent(href, headers,title,pub_time,num)
num += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
partOne()
partTwo()
from bs4 import BeautifulSoup
from urllib.parse import urljoin
......@@ -569,7 +748,8 @@ def bei_jing():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -687,8 +867,9 @@ def nei_meng_gu():
'summary':'',
'title':title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
......@@ -872,7 +1053,8 @@ def ji_lin():
continue
else:
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
except Exception as e:
......@@ -1006,7 +1188,8 @@ def shang_hai():
'summary': '',
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
except:
......@@ -1123,7 +1306,8 @@ def zhe_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
......@@ -1278,7 +1462,8 @@ def fu_jian():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num += 1
......@@ -1386,7 +1571,8 @@ def shan_dong():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
if content == '' or content == 'None':
continue
......@@ -1485,7 +1671,8 @@ def guang_dong():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -1656,7 +1843,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
......@@ -1724,7 +1912,8 @@ def hai_nan():
'summary': '',
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -1826,7 +2015,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -1929,7 +2119,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -2012,7 +2203,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -2182,7 +2374,8 @@ def si_chuan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
......@@ -2304,7 +2497,8 @@ def guang_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2409,7 +2603,8 @@ def gui_zhou():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -2518,7 +2713,8 @@ def yun_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2627,8 +2823,9 @@ def yun_nan():
'title': title
}
# print(dic_news)
# sendKafka(dic_news)
# save_data(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2751,7 +2948,8 @@ def chong_qing():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -2873,7 +3071,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -2992,7 +3191,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3115,7 +3315,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3221,7 +3422,8 @@ def xin_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3318,7 +3520,8 @@ def xin_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -3436,7 +3639,8 @@ def shan_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3544,7 +3748,8 @@ def liao_ning():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3638,7 +3843,8 @@ def hei_long_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3751,7 +3957,8 @@ def jiang_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3841,7 +4048,8 @@ def an_hui():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3935,7 +4143,8 @@ def an_hui():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -4062,7 +4271,8 @@ def jiang_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4154,7 +4364,8 @@ def he_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -4251,7 +4462,8 @@ def hu_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4372,7 +4584,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4506,7 +4719,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4661,7 +4875,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4759,7 +4974,8 @@ def ning_xia():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4857,7 +5073,8 @@ def shanxi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
res_href.close()
......@@ -4951,7 +5168,8 @@ def xi_zang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -5047,7 +5265,8 @@ def qing_hai():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -5164,7 +5383,8 @@ def qing_hai():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -5262,7 +5482,8 @@ def he_bei():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -5370,7 +5591,8 @@ def hu_bei():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......