提交 53ccb166 作者: 薛凌堃

政策法规脚本维护

上级 95d7e6b2
...@@ -36,7 +36,7 @@ taskType = '政策法规' ...@@ -36,7 +36,7 @@ taskType = '政策法规'
各地方国资委 各地方国资委
""" """
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'国务院_国资委_copy1'] '国务院_国资委_copy1']
driver_path = r'D:\cmd100\chromedriver.exe' driver_path = r'D:\cmd100\chromedriver.exe'
...@@ -97,7 +97,8 @@ def save_data(dic_news): ...@@ -97,7 +97,8 @@ def save_data(dic_news):
'tid': dic_news['labels'][0]['relationId'], 'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'], '来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'], '创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100] '带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate']
} }
db_storage.insert_one(aaa_dic) db_storage.insert_one(aaa_dic)
...@@ -1009,6 +1010,8 @@ def ji_lin(): ...@@ -1009,6 +1010,8 @@ def ji_lin():
else: else:
pub = i_soup.find(class_='share') pub = i_soup.find(class_='share')
pub_time = pub.find(class_='left').find('span', class_='time').text pub_time = pub.find(class_='left').find('span', class_='time').text
if '时间' in pub_time:
pub_time = pub_time.split('时间:')[1].strip()
pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip() pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
# print(pub_come) # print(pub_come)
i_content = soup.find(class_='zsy_comain') i_content = soup.find(class_='zsy_comain')
...@@ -2393,12 +2396,12 @@ def hai_nan(): ...@@ -2393,12 +2396,12 @@ def hai_nan():
i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/') i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/')
try: try:
try: try:
is_href1 = db_storage.find_one({'网址': href.split('?')[0]}) is_href1 = db_storage.find_one({'网址': i_href.split('?')[0]})
is_href2 = db_storage.find_one({'网址': href}) is_href2 = db_storage.find_one({'网址': i_href})
except: except:
is_href1 = False is_href1 = False
is_href2 = db_storage.find_one({'网址': href}) is_href2 = db_storage.find_one({'网址': i_href})
if is_href1 or is_href2: if is_href1 or is_href2:
num += 1 num += 1
log.info('已采集=====跳过') log.info('已采集=====跳过')
...@@ -3966,7 +3969,14 @@ def shan_xi(): ...@@ -3966,7 +3969,14 @@ def shan_xi():
href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./', href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./',
'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/') 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/')
title = tr.xpath('./td[1]/a/span//text()')[0] title = tr.xpath('./td[1]/a/span//text()')[0]
publishDate = tr.xpath('./td[2]/span/text()')[0] publishDate_ = tr.xpath('./td[2]/span/text()')[0]
pattern = r'\d{4}/\d{2}/\d{2}'
matched = re.findall(pattern, publishDate_)
if matched:
date = pd.to_datetime(publishDate_, format='%Y/%m/%d')
publishDate = date.strftime('%Y-%m-%d')
else:
publishDate = publishDate_
is_href = db_storage.find_one({'网址': href}) is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num += 1 num += 1
...@@ -6238,14 +6248,14 @@ def hu_bei(): ...@@ -6238,14 +6248,14 @@ def hu_bei():
if __name__ == '__main__': if __name__ == '__main__':
# get_content1() get_content1()
# get_content3() get_content3()
# bei_jing() bei_jing()
# nei_meng_gu() nei_meng_gu()
# ji_lin() ji_lin()
# shang_hai() shang_hai()
# zhe_jiang() zhe_jiang()
# fu_jian() fu_jian()
shan_dong() shan_dong()
guang_dong() guang_dong()
hai_nan() hai_nan()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论