提交 53ccb166 作者: 薛凌堃

政策法规脚本维护

上级 95d7e6b2
......@@ -36,7 +36,7 @@ taskType = '政策法规'
各地方国资委
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'国务院_国资委_copy1']
driver_path = r'D:\cmd100\chromedriver.exe'
......@@ -97,7 +97,8 @@ def save_data(dic_news):
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate']
}
db_storage.insert_one(aaa_dic)
......@@ -1009,6 +1010,8 @@ def ji_lin():
else:
pub = i_soup.find(class_='share')
pub_time = pub.find(class_='left').find('span', class_='time').text
if '时间' in pub_time:
pub_time = pub_time.split('时间:')[1].strip()
pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
# print(pub_come)
i_content = soup.find(class_='zsy_comain')
......@@ -2393,12 +2396,12 @@ def hai_nan():
i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/')
try:
try:
is_href1 = db_storage.find_one({'网址': href.split('?')[0]})
is_href2 = db_storage.find_one({'网址': href})
is_href1 = db_storage.find_one({'网址': i_href.split('?')[0]})
is_href2 = db_storage.find_one({'网址': i_href})
except:
is_href1 = False
is_href2 = db_storage.find_one({'网址': href})
is_href2 = db_storage.find_one({'网址': i_href})
if is_href1 or is_href2:
num += 1
log.info('已采集=====跳过')
......@@ -3966,7 +3969,14 @@ def shan_xi():
href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./',
'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/')
title = tr.xpath('./td[1]/a/span//text()')[0]
publishDate = tr.xpath('./td[2]/span/text()')[0]
publishDate_ = tr.xpath('./td[2]/span/text()')[0]
pattern = r'\d{4}/\d{2}/\d{2}'
matched = re.findall(pattern, publishDate_)
if matched:
date = pd.to_datetime(publishDate_, format='%Y/%m/%d')
publishDate = date.strftime('%Y-%m-%d')
else:
publishDate = publishDate_
is_href = db_storage.find_one({'网址': href})
if is_href:
num += 1
......@@ -6238,14 +6248,14 @@ def hu_bei():
if __name__ == '__main__':
# get_content1()
# get_content3()
# bei_jing()
# nei_meng_gu()
# ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
get_content1()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论