Commit cf9c7394 by 刘伟刚
@@ -33,7 +33,7 @@ taskType = '政策法规'
Local SASAC offices (各地方国资委)
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
@@ -52,86 +52,17 @@ def paserUrl(html,listurl):
link['src'] = urljoin(listurl, link['src'])
return html
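# Usage sketch for paserUrl (the URL and markup here are hypothetical, not from
# this repo): relative href/src attributes get resolved against the page URL.
#   demo = BeautifulSoup('<a href="../files/notice.pdf">附件</a>', 'lxml')
#   demo = paserUrl(demo, 'http://example.gov.cn/zcfg/index.html')
#   demo.a['href']  ->  'http://example.gov.cn/files/notice.pdf'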
def replaceUrl(hostUrl, src):
    # str.strip('../') removes any leading/trailing '.' or '/' characters rather
    # than the '../' prefix, so strip the prefixes explicitly instead.
    if src.startswith('../'):
        src = src[len('../'):]
    elif src.startswith('./'):
        src = src[len('./'):]
    finnal_href = hostUrl + src
    return finnal_href
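# Quick check of the fixed replaceUrl (hypothetical URLs):
#   replaceUrl('http://example.gov.cn/', '../files/a.pdf')  ->  'http://example.gov.cn/files/a.pdf'
# For new code, urljoin (as used in paserUrl above) is the more robust option.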
def save_data(result_dict):
try:
aa = result_dict['信息来源']
a_dict = result_dict
except:
try:
tid = result_dict['tid']
except:
tid = '1666'
pass
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': result_dict['内容'],
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
# a_dict['内容-未去标签'] = a_dict['内容-未去标签'].split('扫一扫在手机打开')[0]
#
# Skip records without a title
if not a_dict['标题']:
return
try:
post_url = 'http://39.105.62.235:1820/ExtarctLawInfo'
headers_ = {
'Content-Type': 'application/json'
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
if resp.status_code == 500:
try:
tid = result_dict['tid']
except:
tid = '1666'
a_dict = {
'标题': result_dict['标题'],
'来源': result_dict['来源'],
'发文机关': '',
'发文字号': result_dict['号'],
'内容-未去标签': '--',
'附件网址': result_dict['附件网址'],
'发布时间': result_dict['发布时间'],
'成文时间': '',
'主题分类': '',
'网址': result_dict['网址'],
'归属': result_dict['归属'],
'信息来源': '地方国资委',
'tid': tid,
}
resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
print('Push status:', resp.status_code)
if resp.status_code != 200:
print('Push failed!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
except:
print('Push failed!')
time.sleep(10)
a_dict['is_send'] = ''
db_storage.insert_one(a_dict)
return
db_storage.insert_one(a_dict)
def save_data(dic_news):
aaa_dic = {
'附件id':dic_news['attachmentIds'],
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate']
}
db_storage.insert_one(aaa_dic)
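# Hedged sketch, not part of this commit: a unique index on '网址' would make the
# find_one dedup checks below safe against concurrent duplicate inserts;
# insert_one would then raise pymongo.errors.DuplicateKeyError on a repeated URL.
#   db_storage.create_index('网址', unique=True)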
def sendKafka(dic_news):
start_time = time.time()
@@ -475,7 +406,7 @@ from urllib.parse import urljoin
# Beijing
def bei_jing():
id_list = []
num = 0
start_time = time.time()
# The site has anti-scraping measures, so Selenium is required
@@ -521,37 +452,56 @@ def bei_jing():
break
updown.click()
time.sleep(2)
for href in hrefs[4:6]:
log.info(f'------{len(hrefs)} records in total-------------')
num = 0
for href in hrefs:
id_list = []
title = href[1]
# TODO: comment out the dedup check below when testing
# Skip pages that have already been crawled
# is_href = db_storage.find_one({'网址': href[0]})
# if is_href:
# continue
is_href = db_storage.find_one({'网址': href[0]})
if is_href:
log.info('Already collected----------skipping')
continue
# Request the detail page
bro.get(href[0])
time.sleep(1)
# Extract the required fields
pub = bro.find_element(By.CLASS_NAME, 'doc-info')
topic = str(pub.text).split('[主题分类] ')[1].split('\n')[0].strip()
# Issuing authority
organ = str(pub.text).split('[发文机构] ')[1].split('\n')[0].strip()
pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip()
pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
writtenDate = str(pub.text).split('[成文日期] ')[1].split('\n')[0].strip()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip()
try:
pub_list = bro.find_elements(By.CLASS_NAME,'article-info')
for source in pub_list:
if '来源' in source.text:
pub_source = source.text.split('来源:')[1].split('\n')[0]
# print(pub_source)
except:
pub_source = ''
#.split('来源:')[1]
if '号' not in pub_hao:
pub_hao = ''
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont,'lxml')
soup = paserUrl(soup_cont, href[0])
text = str(soup.prettify())
# print(text)
# TODO: remove the "scan with your phone" QR widget
soup.find('div',id='div_div').decompose()
# print(title)
num = 0
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
num+=1
file_href = file['href']
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href[0]}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
......@@ -567,45 +517,46 @@ def bei_jing():
# TODO: write the returned file-server address back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
id_ = redefid(id_list)
# id_ = redefid(id_list)
# TODO: after the links are replaced, upload the attachments to the file server
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# TODO: fields pushed to Kafka
dic_news = {
'attachmentIds': id_,
'attachmentIds': id_list,
'author': '',
'content': str(soup_cont.text),
'contentWithTag': str(soup_cont),
'content': str(soup.text),
'contentWithTag': str(soup),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_hao,
'topicClassification': '',
'organ': organ,
'topicClassification': topic,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': '',
'sourceAddress': href[0],
'summary': '',
'title': title
}
print(dic_news)
# sendKafka(dic_news)
# print(dic_news)
sendKafka(dic_news)
save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
end_time = time.time()
print(f'Crawled {num} records in {end_time - start_time} seconds')
bro.quit()
except Exception as e:
print(e)
log.info(e)
pass
end_time = time.time()
print(f'Crawled {num} records in {end_time - start_time} seconds')
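# Hedged refactor sketch (helper name and extension list are assumptions, not in
# the original): the mixed-case extension checks in bei_jing()/nei_meng_gu() can
# collapse into one case-insensitive suffix test. Note the original uses substring
# checks, so this version is slightly stricter for URLs with trailing query strings.
ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt')

def is_attachment(file_href):
    # Lowercase once so '.PDF', '.DOC', '.XLS', etc. are matched as well
    return file_href.lower().endswith(ATTACHMENT_EXTS)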
# Inner Mongolia
def nei_meng_gu():
id_list = []
start = time.time()
num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
@@ -617,6 +568,7 @@ def nei_meng_gu():
result = soup.find(class_='right_two')
li_list = result.find_all(class_='font14wr')
for a in li_list[:1]:
id_list = []
a_text = str(a)
real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
# # Skip pages that have already been crawled
@@ -631,13 +583,19 @@ def nei_meng_gu():
href_text.encoding = 'utf-8'
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# TODO: convert relative <a>-tag paths in the html to absolute paths
i_soup = paserUrl(i_soup,real_href)
i_result = i_soup.find('div', id='d_laiyuan')
time_ = i_result.find_all('span')[0]
time_ = str(time_)
pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '')
source = i_result.find_all('span')[1]
source = str(source)
pub_source = source.split('<span>')[1].split('</span>')[0].replace('来源:', '')
# Publishing source
origin = i_result.find_all('span')[1]
origin = str(origin)
pub_source = origin.split('<span>')[1].split('</span>')[0].replace('来源:', '')
# Issuing authority: use the parsed source text rather than the raw <span> markup
organ = pub_source
fwzh = i_soup.find_all('td')[7]
pub_hao_result = re.findall('〔(.*?)〕', str(fwzh))
if len(pub_hao_result) == 0:
@@ -647,16 +605,19 @@ def nei_meng_gu():
pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0]
else:
pub_hao = ''
# Date of writing
writtenDate = i_soup.find_all('td')[9].text
topicClassification = i_soup.find_all('td')[3].text
i_content = str(i_soup.find(class_='d_show'))
if i_content:
content = i_content
else:
i_content = str(i_soup.find(class_='view TRS_UEDITOR trs_paper_default'))
content = i_content
fujian = i_soup.find_all(class_='ql_detailbro_right_qztp')
# TODO: Inner Mongolia attachments are not in the article body; they load asynchronously, so the tags cannot be rewritten; the attachments can still be uploaded to the att table
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
fu_jian_href_list = []
# fu_jian_result = fujian.find('a')['href']
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
@@ -664,58 +625,53 @@ def nei_meng_gu():
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
fu_jian_href_list.append(fu_jian_href)
# TODO: attachments must be uploaded to the file server, type_id: 7
result_dict = {
'标题': title,
'来源': pub_source,
'号': pub_hao,
'内容': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time,
'网址': real_href,
'归属': '内蒙古自治区国资委',
}
# TODO: upload the attachment to the file server
retData = baseCore.uploadToserver(fu_jian_href,'1669')
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
id_list.append(att_id)
# # TODO: write the returned address back into the soup
# fu_jian_link['href'] = 'http://114.115.215.96/' + full_path
print(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
id = baseCore.getNextSeq()
# TODO: fields pushed to Kafka
dic_news = {
'attachmentIds': "14,15,16",
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': content,
'createDate': time_now,
'deleteFlag': 0,
'id': id,
'id': '',
'labels':[{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_hao,
'topicClassification': '',
'origin': pub_source,  # parsed source text, not the raw span HTML
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate':pub_time,
'sid':'0987654321',
'sourceAddress':'',
'writtenDate':writtenDate,
'sid':'1697458829758697473',
'sourceAddress':real_href,
'summary':'',
'title':title
}
sendKafka(dic_news)
print(id)
id_list.append(id)
# save_data(result_dict)
save_data(dic_news)
num = num + 1
break
except:
pass
except:
pass
print(id_list)
end = time.time()
print('Total', num, 'records', '...........', 'elapsed', end - start, 'seconds')
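# Hedged alternative, not in this commit: urljoin is already imported above, so the
# './'-relative attachment paths could be resolved without manual string surgery:
#   fu_jian_href = urljoin(real_href, fu_jian_re)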
# Jilin
def ji_lin():
start = time.time()
@@ -3950,9 +3906,9 @@ if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
bei_jing()
# bei_jing()
# nei_meng_gu()
# ji_lin()
ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
......