提交 cf9c7394 作者: 刘伟刚
...@@ -33,7 +33,7 @@ taskType = '政策法规' ...@@ -33,7 +33,7 @@ taskType = '政策法规'
各地方国资委 各地方国资委
""" """
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委'] db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
headers = { headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
...@@ -52,86 +52,17 @@ def paserUrl(html,listurl): ...@@ -52,86 +52,17 @@ def paserUrl(html,listurl):
link['src'] = urljoin(listurl, link['src']) link['src'] = urljoin(listurl, link['src'])
return html return html
def replaceUrl(hostUrl, src):
    """Build an absolute URL by appending a (possibly relative) path to a host.

    Leading ``./`` and ``../`` segments are removed from ``src`` and the
    remainder is joined to ``hostUrl`` with exactly one ``/``.  When ``src``
    carries no such prefix the two strings are concatenated unchanged.

    Args:
        hostUrl: Base URL, with or without a trailing slash.
        src: Link target taken from the page, e.g. ``'../a/b.pdf'``.

    Returns:
        The combined URL string.
    """
    path = src
    had_relative_prefix = False
    # Drop only *leading* relative segments.  str.strip('../') treats its
    # argument as a character set and trims both ends, which also removed
    # leading dots of real names ('.hidden') and trailing '/' or '.'.
    while True:
        if path.startswith('../'):
            path = path[3:]
        elif path.startswith('./'):
            path = path[2:]
        else:
            break
        had_relative_prefix = True

    if not had_relative_prefix:
        # Nothing to normalise: keep the plain concatenation.
        return hostUrl + src

    # Join with a single separator regardless of how hostUrl ends, so the
    # result has neither a missing nor a doubled slash.
    return hostUrl.rstrip('/') + '/' + path.lstrip('/')
def save_data(result_dict): def save_data(dic_news):
try: aaa_dic = {
aa = result_dict['信息来源']
a_dict = result_dict '附件id':dic_news['attachmentIds'],
except: '网址':dic_news['sourceAddress'],
try: 'tid':dic_news['labels'][0]['relationId'],
tid = result_dict['tid'] '来源':dic_news['labels'][0]['relationName'],
except: '创建时间':dic_news['createDate']
tid = '1666' }
pass db_storage.insert_one(aaa_dic)
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': result_dict['内容'],
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
# a_dict['内容-未去标签'] = a_dict['内容-未去标签'].split('扫一扫在手机打开')[0]
#
if a_dict['标题']:
pass
else:
return
try:
post_url = 'http://39.105.62.235:1820/ExtarctLawInfo'
headers_ = {
'Content-Type': 'application/json'
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
if resp.status_code == 500:
try:
tid = result_dict['tid']
except:
tid = '1666'
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': '--',
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
print('推送:', resp.status_code)
if resp.status_code != 200:
print('推送失败!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
except:
print('推送失败!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
db_storage.insert_one(a_dict)
def sendKafka(dic_news): def sendKafka(dic_news):
start_time = time.time() start_time = time.time()
...@@ -475,7 +406,7 @@ from urllib.parse import urljoin ...@@ -475,7 +406,7 @@ from urllib.parse import urljoin
# 北京 # 北京
def bei_jing(): def bei_jing():
id_list = []
num = 0 num = 0
start_time = time.time() start_time = time.time()
# 有反爬需要使用selenium # 有反爬需要使用selenium
...@@ -521,37 +452,56 @@ def bei_jing(): ...@@ -521,37 +452,56 @@ def bei_jing():
break break
updown.click() updown.click()
time.sleep(2) time.sleep(2)
for href in hrefs[4:6]: log.info(f'------{len(hrefs)}条数据-------------')
num = 0
for href in hrefs:
id_list = []
title = href[1] title = href[1]
#todo:测试需要 注释掉判重 #todo:测试需要 注释掉判重
# 判断是否已经爬取过 # 判断是否已经爬取过
# is_href = db_storage.find_one({'网址': href[0]}) is_href = db_storage.find_one({'网址': href[0]})
# if is_href: if is_href:
# continue log.info('已采集----------跳过')
continue
# 对获取信息页面发送请求 # 对获取信息页面发送请求
bro.get(href[0]) bro.get(href[0])
time.sleep(1) time.sleep(1)
# 获取所要信息 # 获取所要信息
pub = bro.find_element(By.CLASS_NAME, 'doc-info') pub = bro.find_element(By.CLASS_NAME, 'doc-info')
topic = str(pub.text).split('[主题分类] ')[1].split('\n')[0].strip()
#发文机构
organ = str(pub.text).split('[发文机构] ')[1].split('\n')[0].strip()
pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip() pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip()
pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip() writtenDate = str(pub.text).split('[成文日期] ')[1].split('\n')[0].strip()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip() pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip()
try:
pub_list = bro.find_elements(By.CLASS_NAME,'article-info')
for source in pub_list:
if '来源' in source.text:
pub_source = source.text.split('来源:')[1].split('\n')[0]
# print(pub_source)
except:
pub_source = ''
#.split('来源:')[1]
if '号' not in pub_hao: if '号' not in pub_hao:
pub_hao = '' pub_hao = ''
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML') cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont,'lxml') soup_cont = BeautifulSoup(cont,'lxml')
soup = paserUrl(soup_cont, href[0]) soup = paserUrl(soup_cont, href[0])
text = str(soup.prettify()) text = str(soup.prettify())
print(text) #todo:去掉扫一扫
soup.find('div',id='div_div').decompose()
# print(title) # print(title)
num = 0
fu_jian_soup = soup.find_all('a') fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup: for file in fu_jian_soup:
num+=1 try:
file_href = file['href'] file_href = file['href']
except Exception as e:
log.info(f'---{href[0]}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \ if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \ or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href: or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
...@@ -567,45 +517,46 @@ def bei_jing(): ...@@ -567,45 +517,46 @@ def bei_jing():
#todo:将返回的地址更新到soup #todo:将返回的地址更新到soup
file['href'] = 'http://114.115.215.96/' + full_path file['href'] = 'http://114.115.215.96/' + full_path
id_ = redefid(id_list) # id_ = redefid(id_list)
#todo:替换完成之后,将附件上传至文件服务器 #todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
'attachmentIds': id_, 'attachmentIds': id_list,
'author': '', 'author': '',
'content': str(soup_cont.text), 'content': str(soup.text),
'contentWithTag': str(soup_cont), 'contentWithTag': str(soup),
'createDate': time_now, 'createDate': time_now,
'deleteFlag': 0, 'deleteFlag': 0,
'id': '', 'id': '',
'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}], 'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
'origin': pub_source, 'origin': pub_source,
'organ': pub_hao, 'organ': organ,
'topicClassification': '', 'topicClassification': topic,
'issuedNumber': pub_hao, 'issuedNumber': pub_hao,
'publishDate': pub_time, 'publishDate': pub_time,
'writtenDate': pub_time, 'writtenDate': writtenDate,
'sid': '1697458829758697473', 'sid': '1697458829758697473',
'sourceAddress': '', 'sourceAddress': href[0],
'summary': '', 'summary': '',
'title': title 'title': title
} }
print(dic_news) # print(dic_news)
# sendKafka(dic_news) sendKafka(dic_news)
save_data(dic_news)
# print(id) # print(id)
# id_list.append(id) # id_list.append(id)
num += 1 num += 1
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
bro.quit() bro.quit()
except Exception as e: except Exception as e:
print(e) log.info(e)
pass pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 内蒙古 # 内蒙古
def nei_meng_gu(): def nei_meng_gu():
id_list = []
start = time.time() start = time.time()
num = 0 num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html' url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
...@@ -617,6 +568,7 @@ def nei_meng_gu(): ...@@ -617,6 +568,7 @@ def nei_meng_gu():
result = soup.find(class_='right_two') result = soup.find(class_='right_two')
li_list = result.find_all(class_='font14wr') li_list = result.find_all(class_='font14wr')
for a in li_list[:1]: for a in li_list[:1]:
id_list = []
a_text = str(a) a_text = str(a)
real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0] real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
# # 判断是否已经爬取过 # # 判断是否已经爬取过
...@@ -631,13 +583,19 @@ def nei_meng_gu(): ...@@ -631,13 +583,19 @@ def nei_meng_gu():
href_text.encoding = 'utf-8' href_text.encoding = 'utf-8'
i_html = href_text.text i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser') i_soup = BeautifulSoup(i_html, 'html.parser')
#todo:将html中的a标签相对路径改为绝对路径
i_soup = paserUrl(i_soup,real_href)
i_result = i_soup.find('div', id='d_laiyuan') i_result = i_soup.find('div', id='d_laiyuan')
time_ = i_result.find_all('span')[0] time_ = i_result.find_all('span')[0]
time_ = str(time_) time_ = str(time_)
pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '') pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '')
source = i_result.find_all('span')[1] #发布机关
source = str(source) origin = i_result.find_all('span')[1]
pub_source = source.split('<span>')[1].split('</span>')[0].replace('来源:', '') origin = str(origin)
pub_source = origin.split('<span>')[1].split('</span>')[0].replace('来源:', '')
#发文机关
organ = origin
fwzh = i_soup.find_all('td')[7] fwzh = i_soup.find_all('td')[7]
pub_hao_result = re.findall('〔(.*?)〕', str(fwzh)) pub_hao_result = re.findall('〔(.*?)〕', str(fwzh))
if len(pub_hao_result) == 0: if len(pub_hao_result) == 0:
...@@ -647,16 +605,19 @@ def nei_meng_gu(): ...@@ -647,16 +605,19 @@ def nei_meng_gu():
pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0] pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0]
else: else:
pub_hao = '' pub_hao = ''
#成文时间
writtenDate = i_soup.find_all('td')[9].text
topicClassification = i_soup.find_all('td')[3].text
i_content = str(i_soup.find(class_='d_show')) i_content = str(i_soup.find(class_='d_show'))
if i_content: if i_content:
content = i_content content = i_content
else: else:
i_content = str(i_soup.find(class_='view TRS_UEDITOR trs_paper_default')) i_content = str(i_soup.find(class_='view TRS_UEDITOR trs_paper_default'))
content = i_content content = i_content
#todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find_all(class_='ql_detailbro_right_qztp') fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian)) fu_jian_result = re.findall('href="(.*?)"', str(fujian))
fu_jian_href_list = [] # fu_jian_result = fujian.find('a')['href']
if len(fu_jian_result) > 0: if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result: for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \ if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
...@@ -664,58 +625,53 @@ def nei_meng_gu(): ...@@ -664,58 +625,53 @@ def nei_meng_gu():
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re: or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1] fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re fu_jian_href = fu_jian_re
fu_jian_href_list.append(fu_jian_href) #todo:附件上传至文件服务器
#todo:附件需要上传文件服务器 type_id:7 retData = baseCore.uploadToserver(fu_jian_href,'1669')
if retData['state']:
result_dict = { pass
'标题': title, else:
'来源': pub_source, continue
'号': pub_hao, att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
'内容': content, id_list.append(att_id)
'附件网址': fu_jian_href_list, # # todo:将返回的地址更新到soup
'发布时间': pub_time, # fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
'网址': real_href,
'归属': '内蒙古自治区国资委',
}
print(title) print(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
id = baseCore.getNextSeq() id = baseCore.getNextSeq()
# todo:传kafka字段 # todo:传kafka字段
dic_news = { dic_news = {
'attachmentIds': "14,15,16", 'attachmentIds': id_list,
'author': '', 'author': '',
'content': content, 'content': content,
'contentWithTag': content, 'contentWithTag': content,
'createDate': time_now, 'createDate': time_now,
'deleteFlag': 0, 'deleteFlag': 0,
'id': id, 'id': '',
'labels':[{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}], 'labels':[{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
'origin': pub_source, 'origin': origin,
'organ': pub_hao, 'organ': organ,
'topicClassification': '', 'topicClassification': topicClassification,
'issuedNumber': pub_hao, 'issuedNumber': pub_hao,
'publishDate': pub_time, 'publishDate': pub_time,
'writtenDate':pub_time, 'writtenDate':writtenDate,
'sid':'0987654321', 'sid':'1697458829758697473',
'sourceAddress':'', 'sourceAddress':real_href,
'summary':'', 'summary':'',
'title':title 'title':title
} }
sendKafka(dic_news) sendKafka(dic_news)
print(id)
id_list.append(id) save_data(dic_news)
# save_data(result_dict)
num = num + 1 num = num + 1
break break
except: except:
pass pass
except: except:
pass pass
print(id_list)
end = time.time() end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒') print('共', num, '条', '...........', '共耗时', end - start, '秒')
# 吉林 # 吉林
def ji_lin(): def ji_lin():
start = time.time() start = time.time()
...@@ -3950,9 +3906,9 @@ if __name__ == '__main__': ...@@ -3950,9 +3906,9 @@ if __name__ == '__main__':
# get_content1() # get_content1()
# get_content2() # get_content2()
# get_content3() # get_content3()
bei_jing() # bei_jing()
# nei_meng_gu() # nei_meng_gu()
# ji_lin() ji_lin()
# shang_hai() # shang_hai()
# zhe_jiang() # zhe_jiang()
# fu_jian() # fu_jian()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论