政策法规脚本维护

53ccb166 · 薛凌堃 · 95d7e6b2 · 53ccb166
--- a/comData/policylaw/policy.py
+++ b/comData/policylaw/policy.py
@@ -36,7 +36,7 @@ taskType = '政策法规'
 各地方国资委
 """
-db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
+db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
    '国务院_国资委_copy1']
 driver_path = r'D:\cmd100\chromedriver.exe'
@@ -97,7 +97,8 @@ def save_data(dic_news):
        'tid': dic_news['labels'][0]['relationId'],
        '来源': dic_news['labels'][0]['relationName'],
        '创建时间': dic_news['createDate'],
-        '带标签内容': dic_news['contentWithTag'][:100]
+        '带标签内容': dic_news['contentWithTag'][:100],
+        '发布时间': dic_news['publishDate']
    }
    db_storage.insert_one(aaa_dic)
@@ -1009,6 +1010,8 @@ def ji_lin():
                    else:
                        pub = i_soup.find(class_='share')
                        pub_time = pub.find(class_='left').find('span', class_='time').text
+                        if '时间' in pub_time:
+                            pub_time = pub_time.split('时间：')[1].strip()
                        pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源：')[1].strip()
                        # print(pub_come)
                i_content = soup.find(class_='zsy_comain')
@@ -2393,12 +2396,12 @@ def hai_nan():
                    i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/')
                try:
                    try:
-                        is_href1 = db_storage.find_one({'网址': href.split('?')[0]})
+                        is_href1 = db_storage.find_one({'网址': i_href.split('?')[0]})
-                        is_href2 = db_storage.find_one({'网址': href})
+                        is_href2 = db_storage.find_one({'网址': i_href})
                    except:
                        is_href1 = False
-                        is_href2 = db_storage.find_one({'网址': href})
+                        is_href2 = db_storage.find_one({'网址': i_href})
                    if is_href1 or is_href2:
                        num += 1
                        log.info('已采集=====跳过')
@@ -3966,7 +3969,14 @@ def shan_xi():
                href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./',
                                                                                                        'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/')
                title = tr.xpath('./td[1]/a/span//text()')[0]
-                publishDate = tr.xpath('./td[2]/span/text()')[0]
+                publishDate_ = tr.xpath('./td[2]/span/text()')[0]
+                pattern = r'\d{4}/\d{2}/\d{2}'
+                matched = re.findall(pattern, publishDate_)
+                if matched:
+                    date = pd.to_datetime(publishDate_, format='%Y/%m/%d')
+                    publishDate = date.strftime('%Y-%m-%d')
+                else:
+                    publishDate = publishDate_
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    num += 1
@@ -6238,14 +6248,14 @@ def hu_bei():
 if __name__ == '__main__':
-    # get_content1()
+    get_content1()
-    # get_content3()
+    get_content3()
-    # bei_jing()
+    bei_jing()
-    # nei_meng_gu()
+    nei_meng_gu()
-    # ji_lin()
+    ji_lin()
-    # shang_hai()
+    shang_hai()
-    # zhe_jiang()
+    zhe_jiang()
-    # fu_jian()
+    fu_jian()
    shan_dong()
    guang_dong()
    hai_nan()