政策法规 10/27

2adbab73 · LiuLiYuan · c2749092 · 2adbab73
--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py
@@ -397,7 +397,7 @@ def get_content2():
                if is_href:
                    num+=1
                    log.info('已采集----------跳过')
-                    time.sleep(0.5)
+                    time.sleep(1)
                    continue
                try:
                    resp = requests.get(url=href, headers=headers, verify=False)
@@ -663,7 +663,8 @@ def bei_jing():
    # bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chromedriver = r'D:\cmd100\chromedriver.exe'
-    bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
+    #bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
+    bro = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
    with open('../../base/stealth.min.js') as f:
        js = f.read()
@@ -1830,7 +1831,10 @@ def hai_nan():
                        href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '')
                    elif './' in href:
                        href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
-                    is_href = db_storage.find_one({'网址': href})
+                    try:
+                        is_href = db_storage.find_one({'网址': href.split('?')[0]})
+                    except:
+                        is_href = db_storage.find_one({'网址': href})
                    if is_href:
                        num+=1
                        continue
@@ -1906,7 +1910,7 @@ def hai_nan():
                                pub_time = tbody_text.split('发文日期：')[1].split('名　　称：')[0].strip().lstrip().replace('年',
                                                                                                                   '-').replace(
                                    '月', '-').replace('日', '')
-                                writtenDate = ''
+                                writtenDate = None
                                topicClassification = tbody_text.split('分　　类：')[1].split('发文机关：')[0].strip().lstrip()
                                contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
                                content = contentWithTag.text
@@ -1963,7 +1967,7 @@ def hai_nan():
                                        0].strip().lstrip()
                                pub_source = ''
                                pub_hao = ''
-                                writtenDate = ''
+                                writtenDate = None
                                topicClassification = ''
                                contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
                                content = contentWithTag.text
@@ -2018,7 +2022,10 @@ def hai_nan():
                title = str(doc_item).split('target="_blank">')[1].split('</a>')[0]
                href = 'https://www.hainan.gov.cn' + str(doc_item).split('href="')[1].split('" target')[0]
                # print(title,href)
-                is_href = db_storage.find_one({'网址': href})
+                try:
+                    is_href = db_storage.find_one({'网址': href.split('?')[0]})
+                except:
+                    is_href = db_storage.find_one({'网址': href})
                if is_href:
                    num+=1
                    continue