Commit 4d6ca3e2  Author: LiuLiYuan

政策法规采集 (policy & regulation collection) 10/21

Parent aa593218
# _*_ coding:utf-8 _*_
"""数据全量跑一遍,不做判重逻辑"""
import datetime
import json
import os
import re
import time
import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
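# --- Hedged sketch (not part of the original commit): requests.adapters.HTTPAdapter is
# imported above, but no retry-mounted Session is visible in this hunk. A minimal example
# of how such a session could be built; the helper name build_session and the Retry
# settings are illustrative assumptions only.
from urllib3.util.retry import Retry

def build_session(retries=3):
    """Return a requests.Session that retries transient connection/5xx failures."""
    s = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1,
                                            status_forcelist=(500, 502, 503, 504)))
    s.mount('http://', adapter)
    s.mount('https://', adapter)
    s.keep_alive = False  # mirrors the s.keep_alive = False usage later in this script
    return s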
......@@ -24,8 +22,8 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
log = baseCore.getLogger()
taskType = '政策法规'
......@@ -36,11 +34,10 @@ taskType = '政策法规'
各地方国资委
"""
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
driver_path = r'F:\spider\cmd100\chromedriver.exe'
chromr_bin = r'F:\spider\Google\Chrome\Application\chrome.exe'
driver_path = r'D:\cmd100\chromedriver.exe'
chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
......@@ -64,9 +61,10 @@ def paserUrl(html, listurl):
def getDriver():
service = Service(driver_path)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 屏蔽chrome自动化受控提示
chrome_options.add_argument("--disable-blink-features=AutomationControlled") # 禁用启用Blink运行时的功能去掉webdriver痕迹
......@@ -77,6 +75,12 @@ def getDriver():
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return bro
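# Hedged usage sketch for getDriver(); the example URL is illustrative only and is not one
# of the sites this script actually crawls.
# bro = getDriver()
# try:
#     bro.get('https://example.com')   # page rendered with the anti-automation flags above
#     html = bro.page_source           # hand the HTML to BeautifulSoup / pyquery as elsewhere
# finally:
#     bro.quit()                       # always release the Chrome process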
def save_data(dic_news):
......@@ -203,109 +207,111 @@ def get_content1():
s.keep_alive = False
pcodeJiguan = a_list[0]
try:
pageCount = getPageConunt(a_list, url, headers, s)
for pageNo in range(1, pageCount + 1):
#pageCount = getPageConunt(a_list, url, headers, s)
#for pageNo in range(1, pageCount + 1):
pageNo = 1
try:
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# 获取所需信息
title = page['maintitle'] # 标题
pub_time1 = page['publish_time'] # 发布时间
pub_time2 = page['cwrq'] # 成文时间
pub_code = page['fwzh'] # 发文字号
href = page['pub_url'] # 网址
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
time.sleep(0.5)
continue
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# 获取所需信息
title = page['maintitle'] # 标题
pub_time1 = page['publish_time'] # 发布时间
pub_time2 = page['cwrq'] # 成文时间
pub_code = page['fwzh'] # 发文字号
href = page['pub_url'] # 网址
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
continue
try:
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # 发文机关
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # 主题分类
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# 去除扫一扫
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # 不带标签正文
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # 发文机关
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # 主题分类
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# 去除扫一扫
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # 不带标签正文
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1766',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
except:
log.error(f'{pcodeJiguan}...获取总数失败')
continue
end_time = time.time()
log.info(f'共抓取国务院文件{num}条数据,共耗时{start_time - end_time}')
end_time = time.time()
log.info(f'共抓取国务院文件{num}条数据,共耗时{end_time-start_time}')
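# --- Hedged helper sketch (an addition, not part of the original commit): the attachment
# test above repeats a long "'.pdf' in file_href or '.doc' in ..." chain. The same idea as
# a case-insensitive suffix check; note that endswith() is slightly stricter than the
# substring test used in the original code.
ATTACHMENT_EXTS = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt')

def is_attachment(href):
    """Return True when href looks like a downloadable attachment."""
    return href.lower().endswith(ATTACHMENT_EXTS)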
# 国务院部门文件
def get_content2():
......@@ -355,114 +361,117 @@ def get_content2():
'国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
for bmfl in result_list:
#try:
#totalpage = getTotalpage(bmfl,headers,session)
#for pageNo in range(1,totalpage+1):
#for pageNo in range(1,6):
pageNo = 1
try:
totalpage = getTotalpage(bmfl,headers,session)
for pageNo in range(1,totalpage+1):
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = ''
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = ''
pub_org = content_dict['puborg'] # 发文机关
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = None
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = None
pub_org = content_dict['puborg'] # 发文机关
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
time.sleep(0.5)
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = db_storage.find_one({'网址': href})
if is_href:
num+=1
log.info('已采集----------跳过')
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href,'1699',file_name)
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num,pub_time1)
id_list.append(att_id)
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
#todo:将返回的地址更新到soup
file['href'] = full_path
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo:传kafka字段
dic_news = {
'attachmentIds': id_list, #附件id
'author': '', #作者
'content': content, #正文不带标签
'contentWithTag': str(contentWithTag), #正文带标签
'createDate': time_now, #创建时间
'deleteFlag': 0, #是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], #关联标签id 关联标签名称 关联标签标识
'origin': '', #政策发布机关
'organ': pub_org, #政策发文机关
'topicClassification': child_type, #政策文件分类
'issuedNumber': pub_code, #发文字号
'publishDate': pub_time1, #发布时间
'writtenDate': pub_time2, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
'title': title #标题
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...获取页数失败')
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
#except:
# log.error(f'{bmfl}...获取页数失败')
# continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
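# --- Hedged helper sketch (not in the original commit): get_content2() converts the
# millisecond epoch fields pubtime/ptime with int(x / 1000) + time.strftime and falls back
# to None when the field is missing. The same conversion factored out for clarity:
def ms_epoch_to_str(ms):
    """Convert a millisecond epoch value to 'YYYY-mm-dd HH:MM:SS', or None when missing/invalid."""
    try:
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(ms) / 1000))
    except (TypeError, ValueError):
        return None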
......@@ -553,7 +562,7 @@ def get_content3():
'topicClassification': '', #政策文件分类
'issuedNumber': pub_hao, #发文字号
'publishDate': pub_time, #发布时间
'writtenDate': '', #成文时间
'writtenDate': None, #成文时间
'sid': '1697458829758697473', #信息源id
'sourceAddress': href, #原文链接
'summary': '', #摘要
......@@ -744,7 +753,7 @@ def bei_jing():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667',pathType,file_name)
if retData['state']:
......@@ -870,7 +879,7 @@ def nei_meng_gu():
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
if category not in title:
if category not in title :
file_name = title + category
# print(fu_jian_href)
# todo:附件上传至文件服务器
......@@ -918,7 +927,7 @@ def nei_meng_gu():
pass
end = time.time()
print('共', num, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{num}条...........共耗时{end - start}秒')
# 吉林
def ji_lin():
......@@ -982,7 +991,7 @@ def ji_lin():
# print(pub_come)
i_content = soup.find(class_='zsy_comain')
if i_content:
print(real_href)
#print(real_href)
# 去掉扫一扫
try:
soup.find('div', id='qr_container').decompose()
......@@ -1020,7 +1029,7 @@ def ji_lin():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = fu_jian_href.text.strip()
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670',pathType,file_name)
......@@ -1065,7 +1074,7 @@ def ji_lin():
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# print(fj_href)
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(fj_href, '1670',pathType,file_name)
if retData['state']:
......@@ -1104,7 +1113,7 @@ def ji_lin():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
......@@ -1126,7 +1135,7 @@ def ji_lin():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 上海
def shang_hai():
......@@ -1219,7 +1228,7 @@ def shang_hai():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671',pathType,file_name)
if retData['state']:
......@@ -1252,7 +1261,7 @@ def shang_hai():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1268,7 +1277,7 @@ def shang_hai():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 浙江
def zhe_jiang():
......@@ -1376,7 +1385,7 @@ def zhe_jiang():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1393,7 +1402,7 @@ def zhe_jiang():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 福建
def fu_jian():
......@@ -1445,7 +1454,7 @@ def fu_jian():
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
print(real_href)
#print(real_href)
is_href = db_storage.find_one({'网址': real_href})
if is_href:
num+=1
......@@ -1460,7 +1469,7 @@ def fu_jian():
content = baseCore.pdf_content(resp_content)
contentwithtag = ''
category = os.path.splitext(real_href)[1]
if category not in title:
if category not in title :
file_name = title + category
# 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673',pathType,file_name)
......@@ -1471,7 +1480,7 @@ def fu_jian():
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,'')
id_list.append(att_id)
pub_hao = ''
pub_time = ''
pub_time = None
pub_source = ''
else:
......@@ -1508,7 +1517,7 @@ def fu_jian():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
print(fj_href)
# 找到附件后 上传至文件服务器
......@@ -1524,7 +1533,7 @@ def fu_jian():
except:
pub_source = ''
pub_time = ''
pub_time = None
contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')  # class_= is required; find('tabs ...') would look for a tag named "tabs ..."
content = contentwithtag.text.strip()
if content == '' or content == None:
......@@ -1548,7 +1557,7 @@ def fu_jian():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
......@@ -1566,7 +1575,7 @@ def fu_jian():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 山东
def shan_dong():
......@@ -1633,7 +1642,7 @@ def shan_dong():
for h1 in h1_list:
title = title + str(h1.text)
title = title.strip()  # strip() returns a new string, so the result must be re-assigned
pub_time = ''
pub_time = None
span_list = source.find_all('span')
i = 0
for span in span_list:
......@@ -1683,7 +1692,7 @@ def shan_dong():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 广东
def guang_dong():
......@@ -1745,7 +1754,7 @@ def guang_dong():
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676',pathType,file_name)
......@@ -1774,7 +1783,7 @@ def guang_dong():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -1792,7 +1801,7 @@ def guang_dong():
except:
pass
end = time.time()
print('共', count, '条', '...........', '共耗时', end - start, '秒')
log.info(f'共{count}条...........共耗时{end - start}秒')
# 海南
def hai_nan():
......@@ -1869,7 +1878,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -1916,7 +1925,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# print(f'----附件:{fu_jian_href}-----filename:{file_name}')
# 附件上传至文件服务器
......@@ -1995,7 +2004,7 @@ def hai_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def hai_nan2():
def hai_nan_sw(page_href):
......@@ -2126,7 +2135,7 @@ def hai_nan():
pub_source = ''
pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
pub_hao = ''
writtenDate = ''
writtenDate = None  # no trailing comma: ',' would turn this into a one-element tuple
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
......@@ -2143,7 +2152,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -2241,7 +2250,7 @@ def hai_nan():
pub_time = str(pub_result.text).split('来源:')[0].lstrip().strip()
pub_hao = ''
pub_source = ''
writtenDate = ''
writtenDate = None  # no trailing comma: ',' would turn this into a one-element tuple
contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
content = contentWithTag.text
if content == '' or content == None:
......@@ -2259,7 +2268,7 @@ def hai_nan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1677',pathType,file_name)
......@@ -2360,7 +2369,7 @@ def hai_nan():
0].strip()
except:
pub_source = ''
pub_time = ''
pub_time = None
pub_hao = ''
contentWithTag = doc_href.find(class_='pages_content')
content = contentWithTag.text
......@@ -2383,7 +2392,7 @@ def hai_nan():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': i_href,
'summary': '',
......@@ -2479,7 +2488,7 @@ def hai_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
start()
hai_nan1()
......@@ -2538,7 +2547,7 @@ def si_chuan():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678',pathType,file_name)
......@@ -2567,7 +2576,7 @@ def si_chuan():
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2585,7 +2594,7 @@ def si_chuan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 广西
def guang_xi():
......@@ -2671,7 +2680,7 @@ def guang_xi():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692',pathType,file_name)
......@@ -2701,7 +2710,7 @@ def guang_xi():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2718,7 +2727,7 @@ def guang_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 贵州
def gui_zhou():
......@@ -2788,7 +2797,7 @@ def gui_zhou():
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694',pathType,file_name)
......@@ -2818,7 +2827,7 @@ def gui_zhou():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2836,7 +2845,7 @@ def gui_zhou():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 云南
def yun_nan():
......@@ -2870,7 +2879,7 @@ def yun_nan():
continue
try:
fu_jian_href_list = []
print(href)
#print(href)
if '.shtml' in href:
href_resp = requests.get(url=href, headers=headers, verify=False)
href_resp.encoding = href_resp.apparent_encoding
......@@ -2901,7 +2910,7 @@ def yun_nan():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -2939,8 +2948,8 @@ def yun_nan():
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': '',
'writtenDate': '',
'publishDate': None,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -2959,7 +2968,7 @@ def yun_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def yun_nan2():
num = 0
......@@ -3022,7 +3031,7 @@ def yun_nan():
# print(fu_jian_href)
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1679',pathType,file_name)
......@@ -3060,7 +3069,7 @@ def yun_nan():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3079,7 +3088,7 @@ def yun_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
yun_nan1()
yun_nan2()
......@@ -3148,8 +3157,8 @@ def chong_qing():
except:
origin = ''
topicClassification = ''
pub_time = ''
writtenDate = ''
pub_time = None
writtenDate = None
pub_hao = ''
contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
......@@ -3169,7 +3178,7 @@ def chong_qing():
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693',pathType,file_name)
......@@ -3219,7 +3228,7 @@ def chong_qing():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 天津
def tian_jin():
......@@ -3282,7 +3291,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3298,7 +3307,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3351,7 +3360,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
......@@ -3413,7 +3422,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3429,7 +3438,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3482,7 +3491,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
......@@ -3507,7 +3516,7 @@ def tian_jin():
try:
publishDate = li.find('div', attrs={'class': 'other'}).text
except:
publishDate = ''
publishDate = None
if 'http' not in href:
if '../../../' in href:
href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
......@@ -3548,7 +3557,7 @@ def tian_jin():
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = ''
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = paserUrl(str(contentWithTag), href)
......@@ -3564,7 +3573,7 @@ def tian_jin():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683',pathType,file_name)
if retData['state']:
......@@ -3617,7 +3626,7 @@ def tian_jin():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
......@@ -3673,7 +3682,7 @@ def xin_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
......@@ -3717,7 +3726,7 @@ def xin_jiang():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3734,7 +3743,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def xin_jiang_jsbt():
num = 0
......@@ -3780,7 +3789,7 @@ def xin_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1682',pathType,file_name)
if retData['state']:
......@@ -3824,7 +3833,7 @@ def xin_jiang():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3843,7 +3852,7 @@ def xin_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
xin_jiang1()
xin_jiang_jsbt()
......@@ -3881,7 +3890,7 @@ def shan_xi():
try:
if ".pdf" in href:
content = ''
publishDate = ''
publishDate = None
origin = ''
fu_jian_soup = [href]
contentWithTag = ''
......@@ -3908,7 +3917,7 @@ def shan_xi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684',pathType,file_name)
if retData['state']:
......@@ -3952,7 +3961,7 @@ def shan_xi():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -3969,7 +3978,7 @@ def shan_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 辽宁
def liao_ning():
......@@ -4028,7 +4037,7 @@ def liao_ning():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685',pathType,file_name)
if retData['state']:
......@@ -4071,7 +4080,7 @@ def liao_ning():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4088,7 +4097,7 @@ def liao_ning():
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# 黑龙江
def hei_long_jiang():
......@@ -4141,7 +4150,7 @@ def hei_long_jiang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
......@@ -4174,7 +4183,7 @@ def hei_long_jiang():
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4193,7 +4202,7 @@ def hei_long_jiang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 江苏
def jiang_su():
......@@ -4257,7 +4266,7 @@ def jiang_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687',pathType,file_name)
if retData['state']:
......@@ -4314,7 +4323,7 @@ def jiang_su():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 安徽
def an_hui():
......@@ -4368,7 +4377,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
......@@ -4418,7 +4427,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def an_hui2():
num = 0
......@@ -4472,7 +4481,7 @@ def an_hui():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1688',pathType,file_name)
if retData['state']:
......@@ -4524,7 +4533,7 @@ def an_hui():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
an_hui1()
an_hui2()
......@@ -4607,7 +4616,7 @@ def jiang_xi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689',pathType,file_name)
if retData['state']:
......@@ -4647,7 +4656,7 @@ def jiang_xi():
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': '',
'publishDate': None,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
......@@ -4665,7 +4674,7 @@ def jiang_xi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 河南
def he_nan():
......@@ -4711,7 +4720,7 @@ def he_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690',pathType,file_name)
if retData['state']:
......@@ -4750,7 +4759,7 @@ def he_nan():
'topicClassification': '',
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': '',
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -4767,7 +4776,7 @@ def he_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖南
def hu_nan():
......@@ -4828,7 +4837,7 @@ def hu_nan():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691',pathType,file_name)
if retData['state']:
......@@ -4878,7 +4887,7 @@ def hu_nan():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 甘肃
def gan_su():
......@@ -4963,7 +4972,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
......@@ -5015,7 +5024,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
def gan_su2():
num = 0
......@@ -5097,7 +5106,7 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]')
print(title)
#print(title)
soup = paserUrl(str(contentWithTag), href)
try:
......@@ -5119,7 +5128,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
log.info(f'{file_name}---{href}--')
retData = baseCore.uptoOBS(file_href, '1696',file_name)
......@@ -5176,7 +5185,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def gan_su3():
num = 0
......@@ -5260,13 +5269,13 @@ def gan_su():
origin = doc('div[class="links_tab"]>table>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('div[class="links_tab"]>table>tbody>tr:nth-child(5)>td:nth-child(2)').text()
contentWithTag = doc('div[id="content"]')
print(title)
#print(title)
if len(title) == 0 or contentWithTag.text() == '':
title = doc('div[class="main"]>h1').text().lstrip().strip()
writtenDate = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('日期:')[0].split(' ')[0].lstrip().strip()
origin = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text().split('来源:')[0].lstrip().strip()
contentWithTag = doc('div[class="detailContent"]')
print(title)
#print(title)
soup = paserUrl(str(contentWithTag), href)
try:
......@@ -5288,7 +5297,7 @@ def gan_su():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1696',file_name)
if retData['state']:
......@@ -5304,7 +5313,7 @@ def gan_su():
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
print(bro.page_source)
#print(bro.page_source)
continue
if len(content) < 2:
continue
......@@ -5345,7 +5354,7 @@ def gan_su():
pass
bro.quit()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
gan_su1()
gan_su2()
......@@ -5401,7 +5410,7 @@ def ning_xia():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697',pathType,file_name)
if retData['state']:
......@@ -5453,7 +5462,7 @@ def ning_xia():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 陕西
def shanxi():
......@@ -5511,7 +5520,7 @@ def shanxi():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680',pathType,file_name)
if retData['state']:
......@@ -5544,7 +5553,7 @@ def shanxi():
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -5563,7 +5572,7 @@ def shanxi():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 西藏
def xi_zang():
......@@ -5617,7 +5626,7 @@ def xi_zang():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695',pathType,file_name)
if retData['state']:
......@@ -5647,7 +5656,7 @@ def xi_zang():
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -5664,7 +5673,7 @@ def xi_zang():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 青海
def qing_hai():
......@@ -5722,7 +5731,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
......@@ -5771,7 +5780,7 @@ def qing_hai():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def qing_hai2():
num = 0
......@@ -5849,7 +5858,7 @@ def qing_hai():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1681',pathType,file_name)
if retData['state']:
......@@ -5899,7 +5908,7 @@ def qing_hai():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
qing_hai1()
qing_hai2()
......@@ -5943,7 +5952,7 @@ def he_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668',pathType,file_name)
if retData['state']:
......@@ -5987,7 +5996,7 @@ def he_bei():
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': "",
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
......@@ -6002,7 +6011,7 @@ def he_bei():
except:
pass
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
# 湖北
def hu_bei():
......@@ -6068,7 +6077,7 @@ def hu_bei():
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
if category not in file_name :
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675',pathType,file_name)
if retData['state']:
......@@ -6120,44 +6129,45 @@ def hu_bei():
pass
driver.close()
end_time = time.time()
print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == '__main__':
# get_content1()
# get_content2()
# get_content3()
# bei_jing()
# nei_meng_gu()
# ji_lin()
# shang_hai()
# zhe_jiang()
# fu_jian()
# shan_dong()
# guang_dong()
# hai_nan()
# si_chuan()
# guang_xi()
# gui_zhou()
# yun_nan()
# chong_qing()
# tian_jin()
# xin_jiang()
# shan_xi()
# liao_ning()
# hei_long_jiang()
# jiang_su()
# an_hui()
# jiang_xi()
# he_nan()
# hu_nan()
get_content1()
get_content2()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
# ning_xia()
# xi_zang()
# shanxi()
# qing_hai()
# he_bei()
# qing_hai()
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
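# Hedged sketch of a daily re-run loop around the calls above; run_all() is a hypothetical
# wrapper for the province functions, not something defined in this commit.
# while True:
#     run_all()
#     now = datetime.datetime.now()
#     midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
#     time.sleep((midnight - now).total_seconds())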
import datetime
import json
import random
import time
from urllib.parse import urljoin
import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
......@@ -12,15 +11,31 @@ import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
from lxml import etree
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委_copy1']
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
'国务院_国资委_copy1']
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
def paserUrl(html,listurl):
def paserUrl(html, listurl):
# soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = html.find_all(['a', 'img'])
......@@ -36,18 +51,19 @@ def paserUrl(html,listurl):
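# --- Hedged sketch of the elided paserUrl body (hidden behind the fold above): resolve
# every relative <a href> / <img src> against listurl with urljoin. This reconstruction is
# an assumption for readability, not the verbatim original.
def paserUrl_sketch(html, listurl):
    for tag in html.find_all(['a', 'img']):
        for attr in ('href', 'src'):
            if tag.get(attr):
                tag[attr] = urljoin(listurl, tag[attr])
    return html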
def save_data(dic_news):
aaa_dic = {
'附件id':dic_news['attachmentIds'],
'网址':dic_news['sourceAddress'],
'tid':dic_news['labels'][0]['relationId'],
'来源':dic_news['labels'][0]['relationName'],
'创建时间':dic_news['createDate'],
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100]
}
db_storage.insert_one(aaa_dic)
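# save_data() keeps a trimmed audit record (attachment ids, URL, label id/name, create time,
# first 100 chars of the tagged content) in the 国务院_国资委_copy1 collection; this is the
# collection the "已采集----------跳过" dedup lookups query by 网址.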
def sendKafka(dic_news):
start_time = time.time()
try:#114.116.116.241
try: # 114.116.116.241
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("policy",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
......@@ -78,215 +94,233 @@ def sendKafka(dic_news):
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
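# work(href_type, ting_type, relationId): crawls one www.sasac.gov.cn column page, turns
# relative links into absolute ones, skips URLs already stored in MongoDB, strips the
# QR-code widget and <style> tags from the article body, and sends each article to the
# "policy" Kafka topic labelled with the given relationId / ting_type.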
def work(href_type,ting_type,relationId):
ip = baseCore.get_proxy()
log.info(f'\n================厅局类别==={ting_type}========================')
if 'http' in href_type:
url_type = href_type
else:
url_type = 'http://www.sasac.gov.cn/' + href_type.replace('../', '')
# print(url_type)
i_res = requests.get(url=url_type, headers=headers, proxies=ip)
i_soup = BeautifulSoup(i_res.content, 'html.parser')
time.sleep(2)
news_list = i_soup.find('div', class_='tjywBottom').find_all('li')
# 文章列表
# print('================新闻列表==================')
for news in tqdm(news_list):
try:
news_href = news.find('a')['href']
except:
continue
if 'http' in news_href:
news_url = news_href
else:
news_url = 'http://www.sasac.gov.cn/' + news_href.replace('../', '')
# 判断是否已经爬取过
is_href = db_storage.find_one({'网址': news_url})
if is_href:
log.info('已采集----------跳过')
continue
news_title = news.find('a').text.split('[')[0]
log.info(f'\n----正在采集: {news_title}-------')
pub_time = news.find('span').text.replace('[', '').replace(']', '')
# 文章信息
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': 'wdcid=30ffdae06d11dbde; __jsluid_h=e623973ba12a5f48b086f8c5cee6fffa; SF_cookie_1=67313298; Hm_lvt_fa835457efbc11dfb88752e70521d23b=1693808034; zh_choose=n; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1694078708; wdses=381c6ab86ce01570; wdlast=1694163647; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1694163647; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1694165617',
'Host': 'www.sasac.gov.cn',
'Pragma': 'no-cache',
'Proxy-Connection': 'keep-alive',
'Referer': 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28651762/content.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# news_url = 'http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/c28102228/content.html'
ii_res = requests.get(url=news_url, headers=header, proxies=ip)
ii_soup = BeautifulSoup(ii_res.content, 'html.parser')
# todo:相对路径转化为绝对路径
ii_soup = paserUrl(ii_soup, news_url)
# 去掉扫一扫
try:
ii_soup.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in ii_soup.find_all('style'):
styleTag.extract()
time.sleep(2)
try:
news_info = ii_soup.find('div', class_='zsy_cotitle')
except Exception as e:
log.error(e)
news_info = ''
if news_info:
try:
# origin
pub_source = news_info.find('p').text.split('文章来源:')[1].split('发布时间')[0].strip()
except:
pub_source = ''
try:
contentWithTag = ii_soup.find('div', 'zsy_comain')
content = contentWithTag.text.strip()
except:
content = ''
contentWithTag = ''
if len(content) > 100:
pass
else:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': '',
'sid': '1697458829758697473',
'sourceAddress': news_url,
'summary': '',
'title': news_title
}
sendKafka(dic_news)
save_data(dic_news)
log.info(f'{ting_type}-----{news_title}----发送成功')
else:
dic_error = {
'标题': news_title,
'原文链接': news_url,
'厅局类别': ting_type
}
log.error(dic_error)
# 国资委_内设机构
def gzw_nsjg():
# 获取页面数据
def get_page_nsjg(href, ting_type, relationId, page):
start_time = time.time()
num = 0
for pageNo in range(1, page + 1):
if pageNo != 1:
href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
if pageNo == page:
tag = href.split('/')[-1]
href = href.replace(tag, 'index.html')
try:
req = requests.get(url=href, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
li_list = soup.find('ul', attrs={'class': 'ld-tjywList'}).find_all('li')
except:
req = requests.get(url=href, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
soup = paserUrl(soup, href)
li_list = soup.find_all('li')
for li in li_list:
try:
real_href = li.find('a').get('href')
except:
continue
is_href = db_storage.find_one({'网址': real_href})
if is_href:
log.info('已采集----------跳过')
continue
try:
try:
try:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
except:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
pub_result = soup_.find('div', attrs={'class': 'zsy_cotitle'})
real_href = str(pub_result.text).split('location.href="')[1].split('";')[0].lstrip().strip()
req_.close()
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
req_.close()
except:
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
yaoqiu_list = soup_.find('div', attrs={'class': 'yaoqiu_list'})
li_list_ = yaoqiu_list.find_all('li')
for li_ in li_list_:
href_ = li_.find('a').get('href')
real_href = href_.replace('../../../', 'http://www.sasac.gov.cn/')
req_ = requests.get(url=real_href, headers=headers, verify=False)
req_.encoding = req_.apparent_encoding
soup_ = BeautifulSoup(req_.text, 'html.parser')
div_content = soup_.find('div', attrs={'class': 'zsy_content'})
pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
try:
title = str(pub_result.text).split('文章来源:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = str(pub_result.text).split('文章来源:')[1].split('发布时间:')[0].lstrip().strip()
except:
title = str(pub_result.text).split('发布时间:')[0].replace('\n', '').replace('\r',
'').lstrip().strip()
publishDate = str(pub_result.text).split('发布时间:')[1].strip().lstrip()
pub_source = ''
if 'location.href' in title:
continue
if '404 Ba' in str(div_content):
continue
contentWithTag = div_content.find('div',class_='zsy_comain')
try:
contentWithTag.find('div', id='qr_container').decompose()
except:
pass
# 去掉style标签
for styleTag in contentWithTag.find_all('style'):
styleTag.extract()
content = contentWithTag.text
if content == '':
log.error(f'{real_href}===获取正文失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
#print(content)
#print(contentWithTag)
sendKafka(dic_news)
save_data(dic_news)
log.info(f'{ting_type}-----{title}----发送成功', )
num += 1
except Exception as e:
log.error(f'{real_href}==={e}')
req.close()
end_time = time.time()
log.info(f'抓取{num}条数据,共耗时{end_time - start_time}')
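# --- A minimal sketch (assumption, not in the original commit) ---
# The blocks above recover title / 文章来源 / 发布时间 by repeatedly splitting the
# zsy_cotitle text. A single regex expresses the same parsing once; parse_cotitle is a
# hypothetical helper and assumes the fullwidth colon used on the site.
import re

def parse_cotitle(text):
    text = text.replace('\n', '').replace('\r', '').strip()
    m = re.search(r'^(?P<title>.*?)(?:文章来源:(?P<source>.*?))?发布时间:(?P<date>.*)$', text)
    if not m:
        return text, '', ''
    return m.group('title').strip(), (m.group('source') or '').strip(), m.group('date').strip()

# usage sketch: title, pub_source, publishDate = parse_cotitle(pub_result.text)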
# 获取页面列表
def get_page_nsjg_list(href, institution, tid):
href_list = {
'办公厅(党委办公厅)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 9],
'综合研究局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html', 5],
'政策法规局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html', 21],
'规划发展局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html', 9],
'财务监管与运行评价局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html', 9],
'产权管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html', 7],
'企业改革局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html', 15],
'考核分配局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html', 6],
'资本运营与收益管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html', 3],
'科技创新局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html', 14],
'社会责任局': ['http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html', 10],
'综合监督局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html', 1],
'监督追责局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html', 2],
'企业领导人员管理一局(董事会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html', 4],
'企业领导人员管理二局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html', 4],
'党建工作局(党委组织部、党委统战部)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html', 14],
'宣传工作局(党委宣传部)': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html',
21],
'国际合作局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html', 28],
'人事局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html', 7],
'行业协会商会党建工作局(行业协会商会工作局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html', 4],
'机关服务管理局(离退休干部管理局)': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html', 9],
'机关党委': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html', 33],
'党委巡视工作办公室、国资委巡视组': [
'http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html', 8],
'中央纪委国家监委驻国资委纪检监察组': ['http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html', 18]}
href_ = href_list[institution][0]
page = href_list[institution][1]
get_page_nsjg(href_, institution, tid, page)
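# --- A minimal sketch (assumption) ---
# How the per-department list pages are enumerated: the start URL in href_list ends in
# '_1.html', later pages swap the trailing page number, and the last page on the site is the
# bare 'index.html'. build_page_urls is a hypothetical helper mirroring the rewriting done
# inside get_page_nsjg above.
def build_page_urls(first_page_url, page_count):
    urls = []
    href = first_page_url
    for pageNo in range(1, page_count + 1):
        if pageNo != 1:
            href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
        if pageNo == page_count:
            href = href.replace(href.split('/')[-1], 'index.html')
        urls.append(href)
    return urls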
#中央纪委国家监委驻国资委纪检监察组
def job1(a_type):
href = a_type['href']
ting_type = a_type.text
return href,ting_type
# 开始
def gzw_nsjg_start():
url = 'http://www.sasac.gov.cn/n2588020/index.html'
req = requests.get(url=url, headers=headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
all_institution = []
tree = etree.HTML(req_text)
institution = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()')[0].replace('\n', '').replace('\r',
'')
institution_href = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href')[0].replace('../',
'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
dd_list = tree.xpath('/html/body/div[4]/div[2]/div/dl[2]/dd')
for dd in dd_list:
institution = dd.xpath('./a/text()')[0].replace('\n', '').replace('\r', '')
institution_href = dd.xpath('./a/@href')[0].replace('../', 'http://www.sasac.gov.cn/')
all_institution.append([institution, institution_href])
tids = {'办公厅(党委办公厅)': 1643, '综合研究局': 1644, '政策法规局': 1645, '规划发展局': 1646, '财务监管与运行评价局': 1647, '产权管理局': 1648,
'企业改革局': 1649, '考核分配局': 1650, '资本运营与收益管理局': 1651, '科技创新局': 1652, '社会责任局': 2064, '综合监督局': 1653,
'监督追责局': 1654,
'企业领导人员管理一局(董事会工作局)': 1655, '企业领导人员管理二局': 1656, '党建工作局(党委组织部、党委统战部)': 1657, '宣传工作局(党委宣传部)': 1658,
'国际合作局': 1659, '人事局': 1660, '行业协会商会党建工作局(行业协会商会工作局)': 1661, '机关服务管理局(离退休干部管理局)': 1662, '机关党委': 1663,
'党委巡视工作办公室、国资委巡视组': 1664, '中央纪委国家监委驻国资委纪检监察组': 1874}
for a in all_institution:
institution = a[0]
href = a[1]
tid = tids[institution]
log.info(f'\n================厅局类别==={institution}========================')
get_page_nsjg_list(href, institution, tid)
gzw_nsjg_start()
def job():
url = 'http://www.sasac.gov.cn/n2588020/index.html'
ip = baseCore.get_proxy()
res = requests.get(url=url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
time.sleep(2)
# 厅局列表
list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')[:22]
a_soup = soup.find('div', class_='l-jgkk-right column').find_all('dt')[0]
a_type = a_soup.text.strip()
a_href = a_soup.find('a')['href']
a_id = '1874'
list_error = []
num = 0
start_time = time.time()
work(a_href, a_type, a_id)
for type in tqdm(list_type):
list_news = []
href_type = type.find('a')['href']
ting_type = type.find('a').text
try:
relationId = mapId_dic[ting_type]
except:
continue
work(href_type, ting_type, relationId)
num += 1
end_time = time.time()
log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
time.sleep(1)
# writer.save()
# df_error = pd.DataFrame(list_error)
# df_error.to_excel('未采到文章.xlsx', index=False)
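# --- A minimal sketch (assumption) ---
# Several fetches above run req.text.encode("ISO-8859-1").decode("utf-8") to undo requests'
# wrong charset guess for www.sasac.gov.cn. Decoding the raw bytes directly, with
# apparent_encoding as a fallback, avoids that round trip; decode_html is a hypothetical helper.
def decode_html(resp):
    try:
        return resp.content.decode('utf-8')
    except UnicodeDecodeError:
        return resp.content.decode(resp.apparent_encoding or 'utf-8', errors='ignore')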
mapId_dic = {
'办公厅(党委办公厅)':'1643',
'综合研究局':'1644',
'政策法规局':'1645',
'规划发展局':'1646',
'财务监管与运行评价局':'1647',
'产权管理局':'1648',
'企业改革局':'1649',
'考核分配局':'1650',
'资本运营与收益管理局':'1651',
'科技创新局':'1652',
'综合监督局':'1653',
'监督追责局':'1654',
'企业领导人员管理一局(董事会工作局)':'1655',
'企业领导人员管理二局':'1656',
'党建工作局(党委组织部、党委统战部)':'1657',
'宣传工作局(党委宣传部)':'1658',
'国际合作局':'1659',
'人事局':'1660',
'机关服务管理局(离退休干部管理局)':'1662',
'机关党委':'1663',
'党委巡视工作办公室、国资委巡视组':'1664',
}
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
'Host':'www.sasac.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
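# --- A minimal sketch (assumption, not part of the original commit) ---
# The crawlers above call requests.get once, with a proxy from baseCore.get_proxy() and no
# retry. A small wrapper that retries with a fresh proxy keeps the per-page logic simpler.
# fetch_with_retry is a hypothetical helper; the proxy dict format ({'http': ..., 'https': ...})
# is assumed to be what baseCore.get_proxy() returns.
def fetch_with_retry(url, retries=3, timeout=30):
    last_err = None
    for _ in range(retries):
        try:
            ip = baseCore.get_proxy()
            resp = requests.get(url=url, headers=headers, proxies=ip, timeout=timeout, verify=False)
            resp.raise_for_status()
            return resp
        except Exception as e:
            last_err = e
            time.sleep(2)
    log.error(f'{url}===fetch failed after {retries} tries: {last_err}')
    return None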
if __name__ == '__main__':
try:
job()
gzw_nsjg()
except Exception as e:
log.error(e)
# current_time = datetime.datetime.now()
# midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
# sleep_seconds = (midnight_time - current_time).total_seconds()
# time.sleep(sleep_seconds)
# 创建一个ExcelWriter对象
# writer = pd.ExcelWriter('国务院厅局.xlsx')
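# --- A minimal sketch (assumption) ---
# The commented-out lines above hint at running the crawl once per day, sleeping until the
# next midnight. A loop like this reproduces that behaviour; run_daily is a hypothetical
# name and is not called anywhere in this file.
def run_daily():
    while True:
        try:
            job()
            gzw_nsjg()
        except Exception as e:
            log.error(e)
        now = datetime.datetime.now()
        midnight = now.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
        time.sleep((midnight - now).total_seconds())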