提交 2adbab73 作者: LiuLiYuan

政策法规 10/27

上级 c2749092
...@@ -397,7 +397,7 @@ def get_content2(): ...@@ -397,7 +397,7 @@ def get_content2():
if is_href: if is_href:
num+=1 num+=1
log.info('已采集----------跳过') log.info('已采集----------跳过')
time.sleep(0.5) time.sleep(1)
continue continue
try: try:
resp = requests.get(url=href, headers=headers, verify=False) resp = requests.get(url=href, headers=headers, verify=False)
...@@ -663,7 +663,8 @@ def bei_jing(): ...@@ -663,7 +663,8 @@ def bei_jing():
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe') # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe' chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe' chromedriver = r'D:\cmd100\chromedriver.exe'
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver) #bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
bro = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
with open('../../base/stealth.min.js') as f: with open('../../base/stealth.min.js') as f:
js = f.read() js = f.read()
...@@ -1830,7 +1831,10 @@ def hai_nan(): ...@@ -1830,7 +1831,10 @@ def hai_nan():
href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '') href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '')
elif './' in href: elif './' in href:
href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/') href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
is_href = db_storage.find_one({'网址': href}) try:
is_href = db_storage.find_one({'网址': href.split('?')[0]})
except:
is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1 num+=1
continue continue
...@@ -1906,7 +1910,7 @@ def hai_nan(): ...@@ -1906,7 +1910,7 @@ def hai_nan():
pub_time = tbody_text.split('发文日期:')[1].split('名  称:')[0].strip().lstrip().replace('年', pub_time = tbody_text.split('发文日期:')[1].split('名  称:')[0].strip().lstrip().replace('年',
'-').replace( '-').replace(
'月', '-').replace('日', '') '月', '-').replace('日', '')
writtenDate = '' writtenDate = None
topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip() topicClassification = tbody_text.split('分  类:')[1].split('发文机关:')[0].strip().lstrip()
contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'}) contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
content = contentWithTag.text content = contentWithTag.text
...@@ -1963,7 +1967,7 @@ def hai_nan(): ...@@ -1963,7 +1967,7 @@ def hai_nan():
0].strip().lstrip() 0].strip().lstrip()
pub_source = '' pub_source = ''
pub_hao = '' pub_hao = ''
writtenDate = '' writtenDate = None
topicClassification = '' topicClassification = ''
contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'}) contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
content = contentWithTag.text content = contentWithTag.text
...@@ -2018,7 +2022,10 @@ def hai_nan(): ...@@ -2018,7 +2022,10 @@ def hai_nan():
title = str(doc_item).split('target="_blank">')[1].split('</a>')[0] title = str(doc_item).split('target="_blank">')[1].split('</a>')[0]
href = 'https://www.hainan.gov.cn' + str(doc_item).split('href="')[1].split('" target')[0] href = 'https://www.hainan.gov.cn' + str(doc_item).split('href="')[1].split('" target')[0]
# print(title,href) # print(title,href)
is_href = db_storage.find_one({'网址': href}) try:
is_href = db_storage.find_one({'网址': href.split('?')[0]})
except:
is_href = db_storage.find_one({'网址': href})
if is_href: if is_href:
num+=1 num+=1
continue continue
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论