Commit 8fb1c602 by 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore
baseCore = BaseCore()
@@ -116,7 +116,7 @@ def sendKafka(dic_news):
    # send succeeded, record it in the log
    state = 1
    takeTime = baseCore.getTimeCost(start_time, time.time())
    return True
except Exception as e:
@@ -130,6 +130,7 @@ def sendKafka(dic_news):
    e = 'Kafka操作失败'
    state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    return False
def redefid(idList):
@@ -140,8 +141,39 @@ def redefid(idList):
def remove_dup():
    pass
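The point of the new return value: the collectors below now persist a record only after the Kafka send has succeeded. A minimal sketch of that call pattern (push_record is a hypothetical wrapper, not part of this commit; dic_news stands for any record a collector assembles):

def push_record(dic_news):
    # hypothetical wrapper illustrating the pattern used by every collector below
    flag = sendKafka(dic_news)   # True only when the producer send succeeded
    if flag:
        save_data(dic_news)      # write to Mongo only after Kafka accepted the record
    return flag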
# State Council documents
def get_content1():
    def getPageConunt(a_list, url, headers, s):
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": 1}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        # the response body is JSON
        res_text = json.loads(res.text)
        pageCount = res_text['result']['data']['pager']['pageCount']
        return pageCount

    def getList(a_list, url, headers, pageNo, s):
        # parameters for the POST request
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": pageNo}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        res_text = json.loads(res.text)
        page_list = res_text['result']['data']['list']
        return page_list

    start_time = time.time()
    num = 0
    # athenaAppKey / athenaAppName, needed to get past the site's verification
@@ -172,86 +204,142 @@ def get_content1():
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"],
                   ['国办发明电', "1102"], ['其他', "1101"]]
    for a_list in result_list:
        s = requests.session()
        s.mount('https://', HTTPAdapter(max_retries=3))
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.keep_alive = False
        pcodeJiguan = a_list[0]
        try:
            pageCount = getPageConunt(a_list, url, headers, s)
            for pageNo in range(1, pageCount + 1):
                try:
                    try:
                        page_list = getList(a_list, url, headers, pageNo, s)
                    except:
                        s.close()
                        page_list = getList(a_list, url, headers, pageNo, s)
                    for page in page_list:
                        id_list = []
                        # fields we need
                        title = page['maintitle']  # title
                        pub_time1 = page['publish_time']  # publish date
                        pub_time2 = page['cwrq']  # written date
                        pub_code = page['fwzh']  # document number
                        href = page['pub_url']  # URL
                        # skip records that have already been crawled
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp_href = requests.get(url=href, headers=headers_, verify=False)
                            resp_href.encoding = resp_href.apparent_encoding
                            i_html = resp_href.text
                            if '您访问的页面不存在或已删除' in i_html:
                                # log.error(f'{title}...{href}...页面不存在或已删除')
                                continue
                            i_soup = BeautifulSoup(i_html, 'html.parser')
                            i_soup = paserUrl(i_soup, href)
                            source = str(i_soup.find_all('tbody')[0])
                            pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # issuing organ
                            child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # topic classification
                            contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                            # remove the "scan the QR code" block
                            contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                            content = contentWithTag.text  # body text without tags
                            fu_jian_soup = contentWithTag.find_all('a')
                            time.sleep(0.5)
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1766')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo: update the returned address into the soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            log.error(f'{title}...{href}...获取内容失败')
                            continue
                        # todo: after replacing the links, the attachments are on the file server
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo: fields passed to Kafka
                        dic_news = {
                            'attachmentIds': id_list,  # attachment ids
                            'author': '',  # author
                            'content': content,  # body text without tags
                            'contentWithTag': str(contentWithTag),  # body text with tags
                            'createDate': time_now,  # creation time
                            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
                            'id': '',
                            'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # related label id / name / mark
                            'origin': '',  # publishing organ
                            'organ': pub_org,  # issuing organ
                            'topicClassification': child_type,  # policy classification
                            'issuedNumber': pub_code,  # document number
                            'publishDate': pub_time1,  # publish date
                            'writtenDate': pub_time2,  # written date
                            'sid': '1697458829758697473',  # source id
                            'sourceAddress': href,  # original link
                            'summary': '',  # summary
                            'title': title  # title
                        }
                        # print(dic_news)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
                    continue
        except:
            log.error(f'{pcodeJiguan}...获取总数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
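get_content1 runs every detail page through paserUrl before collecting attachment links, so relative URLs resolve against the article URL. The helper itself is defined elsewhere in this repo; a minimal sketch of what it is assumed to do:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # assumed behaviour: rewrite relative href/src attributes against the page URL
    for tag in soup.find_all(True):
        if tag.get('href'):
            tag['href'] = urljoin(base_url, tag['href'])
        if tag.get('src'):
            tag['src'] = urljoin(base_url, tag['src'])
    return soup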
# State Council department documents
def get_content2():
    def getTotalpage(bmfl, headers, session):
        ip = baseCore.get_proxy()
        pageNo = 1
        time.sleep(2)
        # build the url
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        totalpage = resp_json['searchVO']['totalpage']
        return totalpage

    def getContentList(bmfl, pageNo, headers, session):
        ip = baseCore.get_proxy()
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        # the response body is JSON
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        content_list = resp_json['searchVO']['listVO']
        return content_list

    session = requests.session()
    session.mount('https://', HTTPAdapter(max_retries=3))
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.keep_alive = False
    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
@@ -271,171 +359,261 @@ def get_content2():
    for bmfl in result_list:
        try:
            totalpage = getTotalpage(bmfl, headers, session)
            for pageNo in range(1, totalpage + 1):
                try:
                    try:
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    except:
                        session.close()
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    for content_dict in content_list:
                        id_list = []
                        href = content_dict['url']  # detail page
                        title = content_dict['title']  # title
                        pub_code = content_dict['pcode']  # document number
                        try:
                            pub_time = int(content_dict['pubtime'] / 1000)  # publish date
                            pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
                        except:
                            pub_time1 = ''
                        try:
                            p_time = int(content_dict['ptime'] / 1000)  # written date
                            pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
                        except:
                            pub_time2 = ''
                        pub_org = content_dict['puborg']  # issuing organ
                        try:
                            child_type = content_dict['childtype']  # topic classification
                        except:
                            child_type = ''
                        # skip records that have already been crawled
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp = requests.get(url=href, headers=headers, verify=False)
                            resp.encoding = resp.apparent_encoding
                            resp_text = resp.text
                            soup = BeautifulSoup(resp_text, 'html.parser')
                            soup = paserUrl(soup, href)
                            time.sleep(0.5)
                            contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                            content = contentWithTag.text
                            fu_jian_soup = contentWithTag.find_all('a')
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1699')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo: update the returned address into the soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            print(f'{title}...{href}获取内容失败')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo: fields passed to Kafka
                        dic_news = {
                            'attachmentIds': id_list,  # attachment ids
                            'author': '',  # author
                            'content': content,  # body text without tags
                            'contentWithTag': str(contentWithTag),  # body text with tags
                            'createDate': time_now,  # creation time
                            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
                            'id': '',
                            'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # related label id / name / mark
                            'origin': '',  # publishing organ
                            'organ': pub_org,  # issuing organ
                            'topicClassification': child_type,  # policy classification
                            'issuedNumber': pub_code,  # document number
                            'publishDate': pub_time1,  # publish date
                            'writtenDate': pub_time2,  # written date
                            'sid': '1697458829758697473',  # source id
                            'sourceAddress': href,  # original link
                            'summary': '',  # summary
                            'title': title  # title
                        }
                        # print(dic_news)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    print(f'{bmfl}...第{pageNo}页获取信息列表失败')
                    continue
        except:
            print(f'{bmfl}...获取页数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,耗时{end_time - start_time}')
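Each collector repeats the same long suffix check before uploading an attachment. A hypothetical helper (not part of this commit) with the same substring semantics, shown only to make the condition easier to read:

ATTACH_MARKERS = ('.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt')

def is_attachment(file_href: str) -> bool:
    # same semantics as the inline checks above: case-insensitive substring match
    lowered = file_href.lower()
    return any(marker in lowered for marker in ATTACH_MARKERS)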
# SASAC (State-owned Assets Supervision and Administration Commission of the State Council) - policy releases
def get_content3():
    def getPage():
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        req = requests.get(url, headers=headers, verify=False)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
        totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
        return int(totalpage)

    def sendContent(href, headers, title, pub_time, num):
        id_list = []
        resp_href = requests.request("GET", href, headers=headers, verify=False)
        resp_href.encoding = resp_href.apparent_encoding
        soup = BeautifulSoup(resp_href.text, 'lxml')
        soup = paserUrl(soup, href)
        doc_href = soup.find('div', class_='zsy_content')
        try:
            org_content = doc_href.select('.zsy_cotitle')[0].text
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
        except:
            org = ''
        contentWithTag = doc_href.find('div', class_='zsy_comain')
        contentWithTag.select('#qr_container')[0].decompose()
        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
        contentWithTag.find('div', class_='related').decompose()
        contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
        try:
            p_list = contentWithTag.findAll('p')
            pub_hao = ''
            for p in p_list:
                p = str(p.text)
                if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
                    try:
                        pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
                    except:
                        pub_hao = p.strip().lstrip()
                    break
        except:
            pub_hao = ''
        if len(pub_hao) > 15:
            pub_hao = ''
        content = contentWithTag.text
        fu_jian_soup = contentWithTag.find_all('a')
        for file in fu_jian_soup:
            try:
                file_href = file['href']
            except Exception as e:
                log.info(f'---{href}--------{e}-------')
                continue
            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                retData = baseCore.uploadToserver(file_href, '1642')
                if retData['state']:
                    pass
                else:
                    continue
                att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num)
                id_list.append(att_id)
                # todo: update the returned address into the soup
                file['href'] = 'http://114.115.215.96/' + full_path
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # todo: fields passed to Kafka
        dic_news = {
            'attachmentIds': id_list,  # attachment ids
            'author': '',  # author
            'content': content,  # body text without tags
            'contentWithTag': str(contentWithTag),  # body text with tags
            'createDate': time_now,  # creation time
            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
            'id': '',
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],  # related label id / name / mark
            'origin': '',  # publishing organ
            'organ': org,  # issuing organ
            'topicClassification': '',  # policy classification
            'issuedNumber': pub_hao,  # document number
            'publishDate': pub_time,  # publish date
            'writtenDate': '',  # written date
            'sid': '1697458829758697473',  # source id
            'sourceAddress': href,  # original link
            'summary': '',  # summary
            'title': title  # title
        }
        # print(dic_news)
        flag = sendKafka(dic_news)
        if flag:
            save_data(dic_news)

    def partTwo():
        start_time = time.time()
        num = 0
        totalpage = getPage()
        for page in range(1, totalpage):
            url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            li_list = resp_text.split('<li>')
            del (li_list[0])
            for li in li_list:
                id_list = []
                href_ = li.split('<a href="')[1].split('" target=')[0]
                title = li.split('title="')[1].split('">')[0]
                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    def partOne():
        start_time = time.time()
        num = 0
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        try:
            # GET request; SSL verification must be disabled
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            doc_resp = pq(resp_text)
            doc_items = doc_resp('.zsy_conlist li').items()
            time.sleep(1)
            for doc_item in doc_items:
                # fields we need
                try:
                    href_ = doc_item('a').attr('href')
                    if href_ is None:
                        continue
                    href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                    # skip records that have already been crawled
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
                        log.info('已采集----------跳过')
                        continue
                    title = doc_item('a').attr('title')
                    pub_time = doc_item('span').text().replace('[', '').replace(']', '')
                except:
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        except:
            pass
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    partOne()
    partTwo()
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -580,8 +758,9 @@ def bei_jing():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -698,9 +877,10 @@ def nei_meng_gu():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
@@ -890,8 +1070,9 @@ def ji_lin():
    continue
else:
    # print(dic_news)
    flag = sendKafka(dic_news)
    if flag:
        save_data(dic_news)
    num = num + 1
except Exception as e:
    print(e)
@@ -1024,8 +1205,9 @@ def shang_hai():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
    pass
@@ -1143,8 +1325,9 @@ def zhe_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
@@ -1301,8 +1484,9 @@ def fu_jian():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num += 1
except:
@@ -1410,8 +1594,9 @@ def shan_dong():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
if content == '' or content == 'None':
    continue
else:
@@ -1512,8 +1697,9 @@ def guang_dong():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
@@ -1697,8 +1883,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -1768,8 +1955,9 @@ def hai_nan():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -1873,8 +2061,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -1979,8 +2168,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -2065,8 +2255,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -2238,8 +2429,9 @@ def si_chuan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -2363,8 +2555,9 @@ def guang_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
except:
@@ -2471,8 +2664,9 @@ def gui_zhou():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
@@ -2584,8 +2778,9 @@ def yun_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
except:
@@ -2696,8 +2891,9 @@ def yun_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -2826,8 +3022,9 @@ def chong_qing():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num += 1
@@ -2951,8 +3148,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3073,8 +3271,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3199,8 +3398,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3306,8 +3506,9 @@ def xin_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3403,8 +3604,9 @@ def xin_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
except:
@@ -3521,8 +3723,9 @@ def shan_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3630,8 +3833,9 @@ def liao_ning():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3723,8 +3927,9 @@ def hei_long_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3836,8 +4041,9 @@ def jiang_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3930,8 +4136,9 @@ def an_hui():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4025,8 +4232,9 @@ def an_hui():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
except:
@@ -4158,8 +4366,9 @@ def jiang_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4250,8 +4459,9 @@ def he_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
resp_text.close()
@@ -4351,8 +4561,9 @@ def hu_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4472,8 +4683,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4607,8 +4819,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4763,8 +4976,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4862,8 +5076,9 @@ def ning_xia():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4960,8 +5175,9 @@ def shanxi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
res_href.close()
except:
@@ -5053,8 +5269,9 @@ def xi_zang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -5148,8 +5365,9 @@ def qing_hai():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -5265,8 +5483,9 @@ def qing_hai():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -5363,8 +5582,9 @@ def he_bei():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -5471,8 +5691,9 @@ def hu_bei():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    pass
......