Commit af27b7ec  Author: 薛凌堃

Maintenance of the policy and regulation scraping scripts

Parent 687dbf5e
...
@@ -501,25 +501,26 @@ class BaseCore:
                    'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
                    'create_time': '', 'page_size': '', 'content': ''}
         headers['User-Agent'] = self.getRandomUserAgent()
-        for i in range(0, 3):
-            try:
-                response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                file_size = int(response.headers.get('Content-Length'))
-                break
-            except:
-                time.sleep(3)
-                continue
-        for i in range(0, 3):
-            try:
-                name = str(self.getuuid()) + category
-                result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
-                break
-            except:
-                time.sleep(3)
-                continue
         try:
+            for i in range(0, 3):
+                try:
+                    response = requests.get(file_href, headers=headers, verify=False, timeout=20)
+                    file_size = int(response.headers.get('Content-Length'))
+                    break
+                except:
+                    time.sleep(3)
+                    continue
+            for i in range(0, 3):
+                try:
+                    name = str(self.getuuid()) + category
+                    result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
+                    break
+                except:
+                    time.sleep(3)
+                    continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
...
#!/usr/bin/env python
# coding=utf-8
import json
import time
import pymongo
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from requests.packages import urllib3
from urllib.parse import urljoin
from BaseCore import BaseCore
baseCore = BaseCore()
urllib3.disable_warnings()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
log = baseCore.getLogger()
class ClassTool():
def __init__(self):
self.taskType = '政策法规'
        self.db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
            '国务院_国资委_copy1']
self.driver_path = r'D:\cmd100\chromedriver.exe'
self.chromr_bin = r'D:\Google\Chrome\Application\chrome.exe'
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}
    # Convert relative URLs in the HTML (href/src of <a> and <img> tags) to absolute URLs
    def paserUrl(self, html, listurl):
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')
        # Collect all <a> and <img> tags
        links = html.find_all(['a', 'img'])
        # Rewrite each relative address against the list page URL
        for link in links:
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html
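    # Usage sketch (illustrative values, not from the live sites):
    #   tool = ClassTool()
    #   soup = tool.paserUrl('<a href="./a.html">x</a>', 'http://gzw.beijing.gov.cn/xxfb/zcfg/')
    #   soup.find('a')['href']  # -> 'http://gzw.beijing.gov.cn/xxfb/zcfg/a.html'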
def getDriver(self):
service = Service(self.driver_path)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide Chrome's "controlled by automated test software" banner
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # disable the Blink flag that exposes the webdriver fingerprint
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.binary_location = self.chromr_bin
chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, service=service)
bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=self.driver_path)
# with open('stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return bro
def save_data(self, dic_news):
aaa_dic = {
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': dic_news['labels'][0]['relationId'],
'来源': dic_news['labels'][0]['relationName'],
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate']
}
self.db_storage.insert_one(aaa_dic)
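    # sendKafka: pushes one policy record (the dic_news dict) to the 'policy' topic and returns
    # True on success, False on failure; a new KafkaProducer is created for each call.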
def sendKafka(self, dic_news):
try: # 114.116.116.241
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("policy",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
                'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
# 传输成功,写入日志中
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
                'e': str(e)
}
log.error(dic_result)
return False
\ No newline at end of file
import os
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 北京
def bei_jing():
num = 0
start_time = time.time()
# 有反爬需要使用selenium
# service = Service(r'D:/chrome/113/chromedriver.exe')
# 配置selenium
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('log-level=3')
chrome_options.add_argument(
'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chromedriver = r'D:\cmd100\chromedriver.exe'
# bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
bro = webdriver.Chrome(options=chrome_options, executable_path=chromedriver)
# with open('../../base/stealth.min.js') as f:
# js = f.read()
#
# bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
url = 'http://gzw.beijing.gov.cn/xxfb/zcfg/index.html'
hrefs = []
try:
bro.get(url)
time.sleep(2)
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
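        # Walk the paginated list, collecting [href, title] pairs; stop when the pager's
        # last link is no longer titled '下一页' (next page).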
while True:
# 获取所有要爬取页面的url
ul = bro.find_element(By.CLASS_NAME, 'public_list_team')
li_list = ul.find_elements(By.TAG_NAME, 'li')
for li in li_list:
href_ = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
title_ = li.find_element(By.TAG_NAME, 'a').get_attribute('title')
hrefs.append([href_, title_])
updown = bro.find_element(By.CLASS_NAME, 'fanye').find_elements(By.TAG_NAME, 'a')[-1]
if updown.get_attribute('title') != '下一页':
break
updown.click()
time.sleep(2)
log.info(f'------{len(hrefs)}条数据-------------')
num = 0
count = 0
for href in hrefs:
id_list = []
title = href[1]
# todo:测试需要 注释掉判重
# 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href[0]})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
# 对获取信息页面发送请求
bro.get(href[0])
time.sleep(1)
# 获取所要信息
pub = bro.find_element(By.CLASS_NAME, 'doc-info')
topic = str(pub.text).split('[主题分类] ')[1].split('\n')[0].strip()
# 发文机构
organ = str(pub.text).split('[发文机构] ')[1].split('\n')[0].strip()
pub_time = str(pub.text).split('[发布日期] ')[1].split('[有效性] ')[0].strip().lstrip()
writtenDate = str(pub.text).split('[成文日期] ')[1].split('\n')[0].strip()
# pub_source = str(pub.text).split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip().lstrip()
pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').lstrip().strip()
            pub_source = ''
            try:
                pub_list = bro.find_elements(By.CLASS_NAME, 'article-info')
                for source in pub_list:
                    if '来源' in source.text:
                        pub_source = source.text.split('来源:')[1].split('\n')[0]
                        # print(pub_source)
            except:
                pub_source = ''
# .split('来源:')[1]
if '号' not in pub_hao:
pub_hao = ''
cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
soup_cont = BeautifulSoup(cont, 'lxml')
soup = baseTool.paserUrl(soup_cont, href[0])
soup.prettify()
if soup.text == '' or soup.text == 'None':
log.info(f'----{href[0]}----{title}----内容为空----')
continue
            # Remove the QR-code ("扫一扫") block if present
            try:
                soup.find('div', id='div_div').decompose()
            except:
                pass
# log.info(title)
fu_jian_soup = soup.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href[0]}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1667', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num, pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(soup.text),
'contentWithTag': str(soup),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': organ,
'topicClassification': topic,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href[0],
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
bro.quit()
except Exception as e:
log.info(e)
pass
if __name__ == "__main__":
bei_jing()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 重庆
def chong_qing():
"""
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/xzgfxwj/ 4
http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
"""
num = 0
count = 0
pathType = 'policy/chongqing/'
start_time = time.time()
for page in range(0, 4):
if page == 0:
url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index.html'
else:
url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_{}.html'.format(page)
# url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_3.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.zsj-fr-main').items()
for doc_item in doc_items:
id_list = []
titles = doc_item('a').items()
for title_item in titles:
title = title_item.text().strip()
href = title_item('a').attr('href')
if '../' in href:
href = url.split('zcwj/index')[0] + title_item('a').attr('href').replace('../', '')
else:
href = url.split('index')[0] + title_item('a').attr('href').replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
# href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).content
doc_href = pq(href_text)
try:
pub_result = doc_href('.zwxl-table').text().replace(' ', '')
pub_time = pub_result.split('[发布日期]')[1].strip() + ' 00:00:00'
pub_hao = pub_result.split('[发文字号]')[1].split('[主题分类]')[0].strip()
topicClassification = pub_result.split('[主题分类]')[1].split('[体裁分类]')[0].strip()
origin = pub_result.split('[发布机构]')[1].split('[成文日期]')[0].strip()
writtenDate = pub_result.split('[成文日期]')[1].split('[发布日期]')[0].strip()
doc_href = BeautifulSoup(str(doc_href), 'html.parser')
# 相对路径转化为绝对路径
doc_href = baseTool.paserUrl(doc_href, href)
# 去掉扫一扫
try:
doc_href.find('div', id='div_div').decompose()
# 去掉分享
doc_href.find('div', class_='bdsharebuttonbox').decompose()
except:
pass
contentWithTag = doc_href.find('div', class_='zwxl-article')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                        except:
                            origin = ''
                            topicClassification = ''
                            pub_time = None
                            writtenDate = None
                            pub_hao = ''
                            # The failure may occur before doc_href was converted above, so convert here as well
                            doc_href = baseTool.paserUrl(BeautifulSoup(str(doc_href), 'html.parser'), href)
                            contentWithTag = doc_href.find('div', class_='zwxl-content')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
# print(fu_jian_list)
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text
if '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
try:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1693', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '重庆市国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
chong_qing()
\ No newline at end of file
import os
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 福建
def fu_jian():
error_tag = str(404)
num = 0
count = 0
start_time = time.time()
url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
# print(soup)
result = soup.find_all(class_='borbot-line')
for li_list in result:
li = li_list.find_all('li')
for a in li:
id_list = []
# print(a)
a_text = str(a)
title = a_text.split('title="')[-1].split('">')[0].replace('\n', '')
href_ = str(a.find('a').get('href')) # 网站链接
href = href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
error_ = str(i_soup.find('strong').text)
except:
error_ = ''
if error_ == error_tag:
href = href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/zcfg/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
error_ = str(i_soup.find('strong').text)
except:
error_ = ''
if error_ == error_tag:
href = href_.replace('../../', 'http://gzw.fujian.gov.cn/')
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
real_href = href
# real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
# print(real_href)
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# 文章是远程pdf
# 直接下载文件至服务器,解析出正文内容
if '.pdf' in real_href:
# pass
resp_content = requests.get(real_href, headers=baseTool.headers, verify=False, timeout=20).content
# 解析出pdf内容
content = baseCore.pdf_content(resp_content)
contentwithtag = ''
                        category = os.path.splitext(real_href)[1]
                        file_name = title
                        if category not in title:
                            file_name = title + category
# 文件上传至服务器
retData = baseCore.uptoOBS(real_href, '1673', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num, '')
id_list.append(att_id)
pub_hao = ''
pub_time = None
pub_source = ''
else:
try:
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# 相对路径转化为绝对路径
i_soup = baseTool.paserUrl(i_soup, real_href)
source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
pub_source = source_.split('来源:')[1].split('发布时间:')[0].strip().lstrip()
pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pub_hao = ''
# print(real_href)
# todo:获取附件地址
                            try:
                                fu_jian_list = i_soup.find('ul', class_='clearflx myzj_xl_list').find_all('a')
                            except:
                                fu_jian_list = []
for fu_jian in fu_jian_list:
try:
fj_href = fu_jian['href']
except:
continue
file_name = fu_jian.text
if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
print(fj_href)
# 找到附件后 上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1673', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
pub_source = ''
pub_time = None
                            contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')
content = contentwithtag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                            pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentwithtag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1673", 'relationName': "福建省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
# log.info(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
fu_jian()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 广东
def guang_dong():
start = time.time()
num = 0
count = 0
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
try:
resp_href = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
doc_resp = BeautifulSoup(resp_href.text, 'html.parser')
page_items = str(doc_resp.find('div', attrs={'class': 'page'}).text)
total = page_items.split('共 ')[1].split(' 条')[0].strip().lstrip()
total = int(total)
if total % 23 != 0:
pagen = total / 23 + 1
else:
pagen = total / 23
for page in range(1, int(pagen + 1)):
if page == 1:
url = 'http://gzw.gd.gov.cn/zcfg/index.html'
else:
url = f'http://gzw.gd.gov.cn/zcfg/index_{page}.html'
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items = doc_resp('.list li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').text().replace('\n', '')
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
pub_result = doc_href('.title_info_sub').text()
pub_time = pub_result.split('文章来源:')[0].replace('发布时间:', '').strip() + ' 00:00:00'
pub_source = pub_result.split('文章来源:')[1].strip()
i_soup = BeautifulSoup(href_text, 'html.parser')
i_soup = baseTool.paserUrl(i_soup, href)
                    content = i_soup.find('div', attrs={'class': 'box_info'})
                    if content is None or content.text == '':
                        log.info(f'{href}-----{title}----内容为空----')
                        continue
                    contentwithTag = str(content)
fu_jian_list = content.find_all('a')
for fu_jian in fu_jian_list:
try:
file_name = fu_jian.text
fj_href = fu_jian['href']
except:
continue
if '.doc' in fj_href or '.docx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
or '.xlsx' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
category = os.path.splitext(fj_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fj_href, '1676', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广东省国资委', file_name, num, pub_time)
id_list.append(att_id)
# 将文件服务器的链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content.text,
'contentWithTag': str(contentwithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1676", 'relationName': "广东省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
guang_dong()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 广西
def guang_xi():
num = 0
count = 0
start_time = time.time()
url_all = """
http://gzw.gxzf.gov.cn/wjzx/2023nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2022nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2021nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2020nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2019nwj/ 1
http://gzw.gxzf.gov.cn/wjzx/2018nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2017nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2016nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2015nwj/ 3
http://gzw.gxzf.gov.cn/wjzx/2014nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2013nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2012nwj/ 2
http://gzw.gxzf.gov.cn/wjzx/2011nwj/ 5
http://gzw.gxzf.gov.cn/wjzx/wjhbdej2008n2010n/ 1
http://gzw.gxzf.gov.cn/wjzx/wjhbdyj2004n2007n/ 1
http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2013n/ 1
http://gzw.gxzf.gov.cn/wjzx/jshgfxwj2004n2015n/ 1
http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2015n/ 1
"""
url_list = url_all.split('\n')
for url_info in url_list[1:-1]:
url_info = url_info.strip()
url_1 = url_info.split(' ')[0].strip()
for page in range(0, 1):
if page == 0:
url = f'{url_1}index.shtml'
else:
url = f'{url_1}index_{page}.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('#morelist li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').attr('title').strip()
href = url.split('index')[0] + doc_item('a').attr('href').replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).content
doc_href = pq(href_text)
pub_result = doc_href('.article-inf-left').text()
pub_hao_result = doc_href('.article-h2').text()
if '﹝' in pub_hao_result and '﹞' in pub_hao_result:
pub_hao = pub_hao_result.replace('﹝', '〔').replace('﹞', '〕')
elif '〔' in pub_hao_result and '〕' in pub_hao_result:
pub_hao = pub_hao_result
else:
pub_hao = ''
pub_time = pub_result.split('来源:')[0].strip() + ':00'
try:
pub_source = pub_result.split('来源:')[1].split('作者:')[0].strip()
except:
pub_source = pub_result.split('来源:')[1].strip()
contentWithTag = doc_href('.article-con div:first-child')
# 相对路径转化为绝对路径
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = baseTool.paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1692', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '广西壮族自治区国资委', file_name, num,
pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1692", 'relationName': "广西壮族自治区国资委", 'labelMark': "policy"}],
'origin': '',
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
                        num = num + 1
                        count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
guang_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 贵州
def gui_zhou():
"""
http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ 11
http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ 1
"""
num = 0
count = 0
start_time = time.time()
for page in range(0, 11):
if page == 0:
url = 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist.html'
else:
url = f'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.c').items()
for doc_item in doc_items:
id_list = []
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
# href = 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/hyzcfg/202110/t20211026_71215292.html'
title = doc_item('a').text().strip()
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
if '404 Not Found' in href_text.text:
continue
doc_href = pq(href_text.content)
# 发文机构
organ = doc_href('#NewsArticleSource').text()
pub_result = doc_href('.xxgk_xl_top').text().replace('var str = ""; var str_1 = "', '').replace(
'"; if (str == "") { document.write(str_1); } else { document.write(str); }', '')
pub_time = pub_result.split('发文日期: ')[1].split('文号:')[0].strip().replace('年', '-').replace('月',
'-').replace(
'日', ' ') + ' 00:00:00'
# origin
pub_source = pub_result.split('发布机构:')[1].split('发文日期:')[0].strip()
pub_hao = pub_result.split('文号:')[1].split('是否有效:')[0].strip()
topicClassification = pub_result.split('信息分类:')[1].split('发布机构:')[0].strip()
if pub_source == '无':
pub_source = ''
if pub_hao == '无':
pub_hao = ''
contentWithTag = doc_href('#Zoom').children()
contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
contentWithTag = baseTool.paserUrl(contentWithTag, href)
content = contentWithTag.text.strip()
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = contentWithTag.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1694', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '贵州省国资委', file_name, num, pub_time)
id_list.append(att_id)
# 将附件链接替换
fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1694", 'relationName': "贵州省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num = num + 1
except Exception as e:
pass
except Exception as e:
pass
end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
gui_zhou()
\ No newline at end of file
import json
import os
import time
from random import choice
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 国务院部门文件
def get_content2():
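    # Both helpers below query the gov.cn policy-library search interface; the JSON response carries
    # the total page count in searchVO.totalpage and the result list in searchVO.listVO.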
def getTotalpage(bmfl, headers, session):
ip = baseCore.get_proxy()
pageNo = 1
time.sleep(2)
# 拼接url
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
totalpage = resp_json['searchVO']['totalpage']
return totalpage
def getContentList(bmfl, pageNo, headers, session):
ip = baseCore.get_proxy()
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
# 请求结果为json格式
resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
return content_list
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
session.keep_alive = False
start_time = time.time()
num = 0
count = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
'人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
'国家卫生健康委员会',
'退役军人事务部',
'应急管理部', '人民银行', '审计署', '国务院国有资产监督管理委员会', '海关总署', '国家税务总局', '国家市场监督管理总局', '国家金融监督管理总局',
'国家广播电视总局',
'国家体育总局',
'国家统计局', '国家国际发展合作署', '国家医疗保障局', '国家机关事务管理局', '国家标准化管理委员会', '国家新闻出版署', '国家版权局', '国家互联网信息办公室',
'中国科学院',
'中国社会科学院', '中国工程院', '中国气象局', '中国银行保险监督管理委员会', '中国证券监督管理委员会', '国家粮食和物资储备局', '国家能源局', '国家国防科技工业局',
'国家烟草专卖局',
'国家移民管理局', '国家林业和草原局', '国家铁路局', '中国民用航空局', '国家邮政局', '国家文物局', '国家中医药管理局', '国家矿山安全监察局', '国家外汇管理局',
'国家药品监督管理局',
'国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
for bmfl in result_list:
# try:
# totalpage = getTotalpage(bmfl,headers,session)
# for pageNo in range(1,totalpage+1):
# for pageNo in range(1,6):
pageNo = 1
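        # Only the first results page is fetched per department; the full-pagination loop above is commented out.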
try:
try:
content_list = getContentList(bmfl, pageNo, baseTool.headers, session)
except:
session.close()
content_list = getContentList(bmfl, pageNo, baseTool.headers, session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # 详情页
title = content_dict['title'] # 标题
pub_code = content_dict['pcode'] # 发文字号
try:
pub_time = int(content_dict['pubtime'] / 1000) # 发布时间
pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
except:
pub_time1 = None
try:
p_time = int(content_dict['ptime'] / 1000) # 成文时间
pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
except:
pub_time2 = None
pub_org = content_dict['puborg'] # 发文机关
try:
child_type = content_dict['childtype'] # 主题分类
except:
child_type = ''
# # 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
time.sleep(1)
continue
try:
                    resp = requests.get(url=href, headers=baseTool.headers, verify=False)
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
soup = baseTool.paserUrl(soup, href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}---{title}---内容为空---')
continue
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1699', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num, pub_time1)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
except:
log.error(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list, # 附件id
'author': '', # 作者
'content': content, # 正文不带标签
'contentWithTag': str(contentWithTag), # 正文带标签
'createDate': time_now, # 创建时间
'deleteFlag': 0, # 是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],
# 关联标签id 关联标签名称 关联标签标识
'origin': '', # 政策发布机关
'organ': pub_org, # 政策发文机关
'topicClassification': child_type, # 政策文件分类
'issuedNumber': pub_code, # 发文字号
'publishDate': pub_time1, # 发布时间
'writtenDate': pub_time2, # 成文时间
'sid': '1697458829758697473', # 信息源id
'sourceAddress': href, # 原文链接
'summary': '', # 摘要
'title': title # 标题
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
count += 1
num += 1
except:
log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
# except:
# log.error(f'{bmfl}...获取页数失败')
# continue
end_time = time.time()
log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')
if __name__ == "__main__":
get_content2()
\ No newline at end of file
import os
import re
import time
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 国务院国有资产监督管理委员会-政策发布
def get_content3():
pathType = 'policy/gyzc/'
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=baseTool.headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
# totalpage = re.findall("总页数:(.*)", soup.select('#pag_2603340')[0].text)[0]
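        # The page count is currently hard-coded; the dynamic lookup above is commented out.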
totalpage = '17'
return int(totalpage)
def sendContent(href, headers, title, pub_time, num):
id_list = []
resp_href = requests.request("GET", href, headers=headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
soup = BeautifulSoup(resp_href.text, 'lxml')
soup = baseTool.paserUrl(soup, href)
doc_href = soup.find('div', class_='zsy_content')
try:
            org_content = doc_href.select('.zsy_cotitle')[0].text
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
try:
contentWithTag = doc_href.find('div', class_='zsy_comain')
except:
return
contentWithTag.select('#qr_container')[0].decompose()
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
contentWithTag.find('div', class_='related').decompose()
contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
try:
p_list = contentWithTag.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
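                # Heuristic: treat a paragraph as the issued document number if it contains '号'
                # together with one of the bracket pairs 〔〕, [] or 【】.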
                if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
if content == '' or content == 'None':
log.info(f'----{href}----{title}----内容为空----')
return
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1642', file_name)
if retData['state']:
pass
else:
continue
try:
att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num, pub_time)
id_list.append(att_id)
except:
continue
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list, # 附件id
'author': '', # 作者
'content': content, # 正文不带标签
'contentWithTag': str(contentWithTag), # 正文带标签
'createDate': time_now, # 创建时间
'deleteFlag': 0, # 是否删除(0为默认,1为删除)
'id': '', #
'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
# 关联标签id 关联标签名称 关联标签标识
'origin': '', # 政策发布机关
'organ': org, # 政策发文机关
'topicClassification': '', # 政策文件分类
'issuedNumber': pub_hao, # 发文字号
'publishDate': pub_time, # 发布时间
'writtenDate': None, # 成文时间
'sid': '1697458829758697473', # 信息源id
'sourceAddress': href, # 原文链接
'summary': '', # 摘要
'title': title # 标题
}
# log.info(title)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
def partTwo():
start_time = time.time()
num = 0
count = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
href_resp = requests.request("GET", url, headers=baseTool.headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
li_list = resp_text.split('<li>')
del (li_list[0])
for li in li_list:
id_list = []
href_ = li.split('<a href="')[1].split('" target=')[0]
title = li.split('title="')[1].split('">')[0]
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
sendContent(href, baseTool.headers, title, pub_time, num)
num += 1
count += 1
end_time = time.time()
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
def partOne():
start_time = time.time()
num = 0
count = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
try:
# get请求,需要取消ssl验证
href_resp = requests.request("GET", url, headers=baseTool.headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
doc_resp = pq(resp_text)
doc_items = doc_resp('.zsy_conlist li').items()
time.sleep(1)
for doc_item in doc_items:
# 获取所需数据
try:
href_ = doc_item('a').attr('href')
if href_ is None:
continue
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
# 判断是否已经爬取过
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
log.info('已采集----------跳过')
continue
title = doc_item('a').attr('title')
pub_time = doc_item('span').text().replace('[', '').replace(']', '')
except:
continue
sendContent(href, baseTool.headers, title, pub_time, num)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
# partOne()
# 增量执行需要注释掉partTwo()
partTwo()
if __name__ == "__main__":
get_content3()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 河北
def he_bei():
num = 0
count = 0
start_time = time.time()
url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
try:
        res = requests.get(url, headers=baseTool.headers)
        # print(res)
        data = res.json()
        # print(data)
        for info in data:
title = info['title']
contentWithTag = info['content']
id = info['id']
href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
pub_time_ = info['updated']
m = round(pub_time_ / 1000) # 四舍五入取10位时间戳(秒级)
n = time.localtime(m) # 将时间戳转换成时间元祖tuple
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", n)[:10] # 格式化输出时间
origin = ''
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1668', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
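            # Extract an issued document number such as '冀国...号' or '国资...号' from the body text.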
pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1668", 'relationName': "河北省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
he_bei()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 河南
def he_nan():
num = 0
count = 0
pathType = 'policy/henan/'
start_time = time.time()
for page in range(0, 7):
if page == 0:
url = 'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index.html'
else:
url = f'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index_{page}.html'
try:
            resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
doc_resp = pq(resp_text.content)
doc_items = doc_resp('.mt15.list-box li').items()
for doc_item in doc_items:
title = doc_item('a').text().strip()
href = doc_item('a').attr('href')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
                href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
soup = BeautifulSoup(href_text, 'html.parser')
origin = soup.select('#source')[0].text
publishDate = soup.select('#pubDate')[0].text
contentWithTag = str(soup.select('#content')[0])
# contentWithTag =doc('div[class="information-zt-show"]')
# soup=BeautifulSoup(str(contentWithTag), 'html.parser')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1690', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '河南省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
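                # Extract an issued document number such as '豫国...号' or '国...号' from the body text.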
pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1690", 'relationName': "河南省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': '',
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
href_res.close()
resp_text.close()
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
he_nan()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 黑龙江
def hei_long_jiang():
pathType = 'policy/heilongjiang/'
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
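        # JSON search endpoint of gzw.hlj.gov.cn: _pageSize fixes 10 results per page and the
        # page index is passed via the 'page' query parameter.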
try:
web = requests.get(url=url, headers=baseTool.headers, verify=False)
text = web.json()
rows = text['data']['rows']
try:
for row in range(int(rows)):
result = text['data']['results'][row]
title = result['title']
href = 'http://gzw.hlj.gov.cn' + result['url']
publishDate = result['publishedTimeStr']
list_all = text['data']['results'][row]['domainMetaList'][1]['resultList'][0]
if list_all['name'] == '文号':
pub_hao = list_all['value']
else:
pub_hao = ''
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
contentWithTag = text['data']['results'][row]['contentHtml']
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = BeautifulSoup(href_text, 'html.parser')
origin = doc_href.find(class_='ly')
if origin:
origin = origin.find('b').text
else:
origin = ''
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1687", 'relationName': "江苏省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hei_long_jiang()
\ No newline at end of file
import os
import time
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 湖北
def hu_bei(chromr_bin=None):
num = 0
count = 0
start_time = time.time()
hrefs = []
url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
chrome_driver = baseTool.driver_path
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.binary_location = baseTool.chromr_bin
driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
driver.get(url)
time.sleep(2)
ul = driver.find_element(By.ID, 'ulList')
li_list = ul.find_elements(By.TAG_NAME, 'li')
time.sleep(1)
for li in li_list:
href = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
hrefs.append(href)
for href in hrefs:
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
driver.get(href)
time.sleep(2)
            dhtml = driver.page_source
            if len(dhtml) < 400:
                driver.get(href)
                time.sleep(2)
                dhtml = driver.page_source
            doc = pq(dhtml)
article = doc('div[class="article"]')
adoc = pq(article)
title = adoc('h2').text()
publishDate = adoc('div[class="info"]>span:nth-child(1)').text()
origin = adoc('div[class="info"]>span:nth-child(3)').text()
organ = ''
topicClassification = adoc('td[bfdi="93"]').text()
issuedNumber = adoc('td[bfdi="101"]').text()
writtenDate = adoc('td[bfdi="98"]').text()
rmtag = adoc('p:contains("附件:")')
rmtag2 = adoc('div[class="hbgov-qrcode-content"]')
rmtag.remove()
rmtag2.remove()
contentWithTag = adoc('div[class="article-box"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1675', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1675", 'relationName': "湖北省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
driver.close()
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hu_bei()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 湖南
def hu_nan():
num = 0
count = 0
pathType = 'policy/hunan/'
start_time = time.time()
for page in range(1, 7):
if page == 1:
# http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index.html
url = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index.html'
else:
url = f'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
doc_resp = pq(resp_text)
doc_items = doc_resp('.table tbody tr').items()
for doc_item in doc_items:
href = 'http://gzw.hunan.gov.cn' + doc_item('a').attr('href')
publishDate = doc_item('td:nth-child(3)').text()
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
# href = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/201109/t20110920_1942364.html'
try:
res = requests.get(url=href, headers=baseTool.headers, verify=False)
res.encoding = res.apparent_encoding
res_text = res.text
# soup = BeautifulSoup(res_text, 'html.parser')
soup = baseTool.paserUrl(res_text, href)
# pub_result = str(soup.find('div', attrs={'class': 'information-zt-list fn-clear'}).text)
# writtenDate = pub_result.split('发文日期:')[1].split('名称:')[0].strip() + ':00'
# title = pub_result.split('名称:')[1].split('主题分类:')[0].lstrip().strip()
# organ = pub_result.split('发布机构: ')[1].split('if(')[0].lstrip().strip()
doc = pq(str(soup))
organ = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(3)').text().replace('发布机构:',
'')
if 'document.write' in organ:
organ = ''
writtenDate = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(4)').text().replace(
'发文日期:', '')
title = doc('div[class="information-zt-list fn-clear"]>ul>li:nth-child(5)').text().replace('名称:',
'')
topicClassification = doc(
'div[class="information-zt-list fn-clear"]>ul>li:nth-child(6)').text().replace('主题分类:', '')
contentWithTag = doc('div[class="information-zt-show"]')
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1691', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1691", 'relationName': "湖南省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': '',
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
hu_nan()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 江苏
def jiang_su():
num = 0
count = 0
pathType = 'policy/jiangsu/'
start_time = time.time()
pagestart = 1
pageend = 45
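    # dataproxy.jsp pages by record index rather than page number: each POST returns records
    # startrecord..endrecord of column 85683 as XML, one CDATA-wrapped <record> element per
    # list item, which the regex below picks apart.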
for page in range(1, 3):
url = f"http://jsgzw.jiangsu.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={pagestart}&endrecord={pageend}&perpage=15"
pagestart = pageend + 1
pageend = pageend + 45
payload = "col=1&appid=1&webid=39&path=%2F&columnid=85683&sourceContentType=1&unitid=369983&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E5%9B%BD%E8%B5%84%E5%A7%94&permissiontype=0"
header = {
'Connection': 'keep-alive',
'Accept': 'application/xml, text/xml, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Origin': 'http://jsgzw.jiangsu.gov.cn',
'Referer': 'http://jsgzw.jiangsu.gov.cn/col/col61490/index.html?uid=247686&pageNum=4',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'JSESSIONID=ADB520E83E1FC10429D961634BAD303D; __jsluid_h=02c2c950abb71f547a79da79719246aa; _gscu_210493472=24936291qq5dvl18; _gscbrs_210493472=1; yunsuo_session_verify=60cc00825d4e2dd3dee278a301f60f1e; _gscs_210493472=24936291p77pyu18|pv:3'
}
try:
resp_text = requests.request("POST", url, headers=header, data=payload).text
li_list = re.findall('CDATA\[(.*?)\]\]></record>', str(resp_text))
for li in li_list:
a = BeautifulSoup(li, 'lxml').find('a')
href = 'https://jsgzw.jiangsu.gov.cn/' + a['href']
title = a.text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = BeautifulSoup(href_text, 'html.parser')
soup = baseTool.paserUrl(href_text, href)
doc = pq(str(soup))
publishDate = doc('div[class="cf tip"]>span:contains(发布日期:)').text().replace('发布日期:', '')
writtenDate = doc('table[class="xlt_table"]>tbody>tr:nth-child(1)>td:nth-child(4)').text()
organ = doc('table[class="xlt_table"]>tbody>tr:nth-child(2)>td:nth-child(2)').text()
pub_hao = doc('table[class="xlt_table"]>tbody>tr:nth-child(2)>td:nth-child(4)').text()
contentWithTag = doc('div[id="zoom"]')
if len(contentWithTag) < 1:
contentWithTag = doc('div[class="main-txt"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1687', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江苏省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                                file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
pub_hao = match_list[0][0]
else:
pub_hao = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1687", 'relationName': "江苏省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
jiang_su()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 江西
def jiang_xi():
"""
1-60
61-120
121-164
"""
num = 0
count = 0
pathType = 'policy/jiangxi/'
start_time = time.time()
startrecord = 1
endrecord = 60
for page in range(1, 3):
url = f"http://gzw.jiangxi.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={startrecord}&endrecord={endrecord}&perpage=20"
startrecord = endrecord + 1
endrecord = endrecord + 60
payload = "col=1&webid=175&path=http%3A%2F%2Fgzw.jiangxi.gov.cn%2F&columnid=22977&sourceContentType=1&unitid=402016&webname=%E6%B1%9F%E8%A5%BF%E7%9C%81%E5%9B%BD%E6%9C%89%E8%B5%84%E4%BA%A7%E7%9B%91%E7%9D%A3%E7%AE%A1%E7%90%86%E5%A7%94%E5%91%98%E4%BC%9A&permissiontype=0"
header = {
'Connection': 'keep-alive',
'Accept': 'application/xml, text/xml, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Origin': 'http://gzw.jiangxi.gov.cn',
'Referer': 'http://gzw.jiangxi.gov.cn/col/col22977/index.html?uid=402016&pageNum=9',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': 'JSESSIONID=F601A052571881210819664F5BD38015; JSESSIONID=6E54DB27D82E844B825DD675AE19E399'
}
try:
resp_text = requests.request("POST", url, headers=header, data=payload).text
href_list = re.findall("href='(.*?)'", resp_text)
for href in href_list:
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
soup = baseTool.paserUrl(href_text, href)
doc = pq(str(soup))
try:
# origin=soup.find(text='信息来源:').text.replace('信息来源:','')
origin = doc('td:contains("信息来源:")').text().replace('信息来源:', '')
except Exception as e:
origin = ''
title = doc('tr[class="biaoti"]>td:nth-child(1)').text().replace('标题:', '')
organ = doc('div[class="xxgk-quote"]>table>tbody>tr:nth-child(1)>td:nth-child(2)').text().replace(
'发文机关:', '')
pub_hao = doc('div[class="xxgk-quote"]>table>tbody>tr:nth-child(1)>td:nth-child(3)').text().replace(
'文号:', '')
topicClassification = doc(
'div[class="xxgk-quote"]>table>tbody>tr:nth-child(2)>td:nth-child(1)').text().replace('主题分类:',
'')
writtenDate = doc(
'div[class="xxgk-quote"]>table>tbody>tr:nth-child(2)>td:nth-child(3)').text().replace('成文日期:',
'')
# pub_result = str(soup.find('div', attrs={'class': 'xxgk-quote'}).text)
# title = pub_result.split('标??????题: ')[1].split('有??效??性: ')[0].lstrip().strip()
# organ = pub_result.split('发文机关:')[1].split('文??????号:')[0].lstrip().strip()
# pub_hao = pub_result.split('文??????号:')[1].split('主题分类: ')[0].lstrip().strip()
# writtenDate = pub_result.split('成文日期:')[1].split('标??????题: ')[0].lstrip().strip()
contentWithTag = doc('div[id="zoom"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1689', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num, writtenDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                                file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
if len(pub_hao) < 1:
pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
pub_hao = match_list[0][0]
else:
pub_hao = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1689", 'relationName': "江西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': None,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
jiang_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 辽宁
def liao_ning():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = resp_text.apparent_encoding
resp_text = resp_text.text
doc_resp = BeautifulSoup(resp_text, 'html.parser')
doc_items = doc_resp.select(
'#aa25154996104f57858a48e0b1aecca9 > div:nth-of-type(2) > div.tablist-show > div.tab-list-page')[0]
li_list = doc_items.select('li')
for li in li_list:
# print(li)
href = str(li.select('a')[0].get('href'))
if 'http' not in href:
if 'https' not in href:
href = 'https://gzw.ln.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
href_text = href_text.text
doc_href = baseTool.paserUrl(href_text, href)
doc = pq(str(doc_href))
title = doc('p[class="govxlTText"]').text().strip()
origintag = doc('p[class="govxlTText2"]').text().strip()
origin = origintag.split('文章来源:')[1].split('发布时间:')[0].strip()
publishDate = origintag.split('发布时间:')[1].strip().replace('年', '-').replace('月', '-').replace('日',
'') + ' 00:00:00'
contentWithTag = doc('div[class="TRS_Editor"]')
if len(title) < 1:
                        title = doc('h1[class="title"]').text()
                        issuedNumber = doc('p[class="wjh"]').text()
if len(contentWithTag) < 1:
contentWithTag = doc('div[class="content"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1685', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '辽宁省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1685", 'relationName': "辽宁省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
liao_ning()
\ No newline at end of file
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 内蒙古
def nei_meng_gu():
start = time.time()
num = 0
url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find(class_='right_two')
li_list = result.find_all(class_='font14wr')
for a in li_list:
id_list = []
a_text = str(a)
real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
# # 判断是否已经爬取过
# todo:测试用 注释掉判重
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# 获取所需信息
title = a_text.split('target="_blank">')[-1].split('</a>')[0]
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
href_text.encoding = 'utf-8'
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# todo:将html中的a标签相对路径改为绝对路径
i_soup = baseTool.paserUrl(i_soup, real_href)
i_result = i_soup.find('div', id='d_laiyuan')
time_ = i_result.find_all('span')[0]
time_ = str(time_)
pub_time = time_.split('<span>')[1].split('</span>')[0].replace('发布时间:', '')
                # 来源
                origin = i_result.find_all('span')[1]
                origin = str(origin)
                pub_source = origin.split('<span>')[1].split('</span>')[0].replace('来源:', '')
                origin = pub_source
                # 发文机关
                organ = pub_source
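                # 发文字号 sits in the 8th <td> of the info table; only values that contain 〔〕
                # and the character '内' are kept as the document number.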
fwzh = i_soup.find_all('td')[7]
pub_hao_result = re.findall('〔(.*?)〕', str(fwzh))
if len(pub_hao_result) == 0:
pub_hao = ''
else:
if '内' in str(fwzh):
pub_hao = str(fwzh).split('<td>')[1].split('</td>')[0]
else:
pub_hao = ''
# 成文时间
writtenDate = i_soup.find_all('td')[9].text
topicClassification = i_soup.find_all('td')[3].text
i_content = i_soup.find(class_='d_show')
if i_content:
content = str(i_content)
else:
i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
content = str(i_content)
if i_content.text == '' or i_content.text == 'None':
log.info(f'{real_href}------{title}----内容为空-----')
continue
# todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
fu_jian_result = re.findall('href="(.*?)"', str(fujian))
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_re = str(real_href).split('/t')[0] + '/' + str(fu_jian_re).split('./')[1]
fu_jian_href = fu_jian_re
category = os.path.splitext(fu_jian_href)[1]
                            if category not in title:
                                file_name = title + category
                            else:
                                file_name = title
# print(fu_jian_href)
# todo:附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1669', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', file_name, num, pub_time)
id_list.append(att_id)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': i_content.text,
'contentWithTag': content,
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{num}条数据,共耗时{end - start}')
if __name__ == "__main__":
nei_meng_gu()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 宁夏
def ning_xia():
num = 0
count = 0
pathType = 'policy/ningxia/'
start_time = time.time()
for page in range(0, 3):
if page == 0:
url = 'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index.html'
else:
url = f'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index_{page}.html'
try:
res = requests.get(url=url, headers=baseTool.headers, verify=False)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
li_list = soup.find('div', attrs={'class': 'stdnewslist'}).find_all('li')
for li in li_list:
title = li.find('a').get('title').replace('</p>', '').replace('<p>', '')
href = url.split('index')[0] + li.find('a').get('href').replace('./', '')
publishDate = li.find('span', attrs={'class': 'stdnewslistspan'}).text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_res = requests.get(url=href, headers=baseTool.headers, verify=False)
href_res.encoding = href_res.apparent_encoding
href_text = href_res.text
# soup_ = BeautifulSoup(href_text, 'html.parser')
soup_ = baseTool.paserUrl(href_text, href)
pub_result = soup_.find('table', attrs={'class': 'gk-xl-table'}).text.replace(' ', '')
writtenDate = pub_result.split('生成日期')[1].split('发文字号')[0].strip() + ' 00:00:00'
pub_hao = pub_result.split('发文字号')[1].split('公开形式')[0].strip()
organ = pub_result.split('所属机构')[1].split('有效性')[0].strip()
contentWithTag = soup_.find('div', attrs={'class': 'content'}).find('div',
attrs={'class': 'TRS_UEDITOR'})
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1697', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '宁夏回族自治区国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1697", 'relationName': "宁夏回族自治区国资委", 'labelMark': "policy"}],
'origin': '',
'organ': organ,
'topicClassification': "",
'issuedNumber': pub_hao,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
ning_xia()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 山东
def shan_dong():
headers = {
'Cookie': 'COLLCK=2502513302; COLLCK=2493627587',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183'
}
start = time.time()
num = 0
count = 0
url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
for url in url_list:
try:
resp_text = requests.get(url=url, headers=headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find_all(class_='pagedContent')
for li in result:
href = li.find('a')['href']
                is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
try:
source = i_soup.find_all('tbody')[0]
title = str(source).split('标  题:</strong>')[1].split('</td>')[0].replace('\r', '').replace('\n',
'')
pub_time = re.findall('<strong>发布日期:</strong>(.*?)</td>', str(source))
pub_time = ''.join(pub_time)
pub_hao = re.findall('<strong>发文字号:</strong>(.*?)</td>', str(source))
pub_hao = ''.join(pub_hao)
pub_source = re.findall('<strong>发文机关:</strong>(.*?)</td>', str(source))
pub_source = ''.join(pub_source)
writtenDate = re.findall('<strong>成文日期:</strong>(.*?)</td>', str(source))
writtenDate = ''.join(writtenDate)
# print(pub_time,pub_source,pub_hao)
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
                        if pub_hao == '无':
                            p_list = contentwithtag.find_all('p')
                            for p in p_list:
                                p_text = p.text
                                if '〔' in p_text and '〕' in p_text:
pub_hao = p_text
break
else:
continue
                    except:
                        pub_source = ''
                        pub_hao = ''
                        writtenDate = None
                        try:
                            title = str(i_soup.find('div', attrs={'class': 'wz_title'}).text).strip().lstrip()
                        except:
                            title = ''
source = i_soup.find('div', attrs={'id': 'nr'})
h1_list = source.find_all('h1')
for h1 in h1_list:
title = title + str(h1.text)
title.strip().lstrip()
pub_time = None
span_list = source.find_all('span')
i = 0
for span in span_list:
span_text = span.text
                            if ('〔' in span_text and '〕' in span_text) or '鲁国' in span_text or '国办发' in span_text:
pub_hao = str(span_text)
if '号' not in pub_hao:
pub_hao = pub_hao + str(span_list[i + 1].text)
break
i = i + 1
content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentwithtag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1674", 'relationName': "山东省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
log.info(title)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
shan_dong()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from lxml import etree
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 山西
def shan_xi():
num = 0
count = 0
start_time = time.time()
for page in range(1, 7):
if page == 1:
url = 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/'
else:
url = f'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/index_{page - 1}.shtml'
try:
            res = requests.get(url, headers=baseTool.headers)
page_text = res.text.encode("ISO-8859-1")
page_text = page_text.decode("utf-8")
tree = etree.HTML(page_text)
tr_list = tree.xpath(
'/html/body/table[3]/tbody/tr/td[2]/table/tbody/tr[3]/td/table[2]/tbody/tr[3]/td/form/table/tbody/tr')
for tr in tr_list:
href = tr.xpath('./td[1]/a/@href')
if href == []:
continue
href = href[0].replace('../../', 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/').replace('./',
'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/')
title = tr.xpath('./td[1]/a/span//text()')[0]
publishDate_ = str(tr.xpath('./td[2]/span/text()')[0]).strip()
time_obj = datetime.datetime.strptime(publishDate_, "%Y/%m/%d")
# 将datetime对象格式化为年月日的字符串
publishDate = time_obj.strftime("%Y-%m-%d")
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
if ".pdf" in href:
content = ''
publishDate = None
origin = ''
fu_jian_soup = [href]
contentWithTag = ''
else:
                        res = requests.get(href, headers=baseTool.headers)
page_text = res.text.encode("ISO-8859-1")
page_text = page_text.decode("utf-8")
page = baseTool.paserUrl(page_text, href)
doc = pq(str(page))
title = doc('title').text()
origin = ''
contentWithTag = doc('div[id="vsb_content"]')
soup = baseTool.paserUrl(str(contentWithTag), href)
                        fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1684', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
issuedNumber = match_list[0][0]
if len(issuedNumber) > 20:
issuedNumber = ''
else:
issuedNumber = ''
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1684", 'relationName': "山西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except Exception as e:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
shan_xi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 上海
def shang_hai():
start = time.time()
num = 0
count = 0
for page in range(1, 7):
if page == 1:
url = 'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index.html'
else:
url = f'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index_{page}.html'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items = doc_resp('.gqzc_list_right ul li').items()
for doc_item in doc_items:
id_list = []
title = doc_item('a').attr('title').strip()
pub_time = doc_item('span').text() + ' 00:00:00'
href = doc_item('a').attr('href')
if 'https:/' in href:
pass
else:
href = 'https://www.gzw.sh.gov.cn' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
doc_href_ = BeautifulSoup(href_text, 'html.parser')
# 相对路径转化为绝对路径
doc_href_ = baseTool.paserUrl(doc_href_, href)
info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
pub_source = info_list[1].find('b').text.split('信息来源:')[1]
content = doc_href_.find('div', attrs={'class': 'detail_03'})
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
# 将文章中的附件字段删去
pattern = r'\d+\.'
for p in content.find_all('p')[-22:]:
p_text = p.text
if len(p_text) > 50:
continue
matches = re.findall(pattern, p_text)
for k in matches:
if k in p_text:
p.extract()
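                    # 发文字号: try the <meta> tags inside .detail_03 first, then its <ul>, then the
                    # first <p>, extracting the '沪…号' fragment where present; variant brackets are
                    # normalised and validated below.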
try:
pub_result = doc_href('.detail_03')
pub_result('meta')
pub_result = '沪' + str(pub_result('meta')).split('沪')[1].split('号')[0].strip() + '号'
except:
try:
pub_result = str(
'沪' + doc_href('.detail_03 ul').text().split('沪')[1].split('号')[0].strip() + '号')
except:
pub_result = str(doc_href('.detail_03 p').text().split('号')[0].strip() + '号')
if '﹝' in pub_result and '﹞' in pub_result:
pub_hao = pub_result.replace('﹝', '〔').replace('﹞', '〕')
elif '〔' in pub_result and '〕' in pub_result:
pub_hao = pub_result
elif '【' in pub_result and '】' in pub_result:
pub_hao = pub_result
elif '[' in pub_result and ']' in pub_result:
pub_hao = pub_result
else:
pub_hao = ''
if len(pub_hao) > 20:
pub_hao = ''
# todo:找到附件标签,正文内容带有附件
fu_jian_soup = content.find('ul')
if fu_jian_soup:
li_list = fu_jian_soup.find_all('a')
else:
li_list = []
for a in li_list:
fu_jian_href = a['href']
file_name = a.text
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(fu_jian_href, '1671', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num, pub_time)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            a['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
else:
continue
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content.text,
'contentWithTag': str(content),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1671", 'relationName': "上海市国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
shang_hai()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 陕西
def shanxi():
num = 0
count = 0
start_time = time.time()
url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
# url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
try:
res = requests.get(url=url, headers=baseTool.headers)
res.encoding = res.apparent_encoding
res_text = res.text
soup = BeautifulSoup(res_text, 'html.parser')
# soup = paserUrl(res_text, 'https://sxgz.shaanxi.gov.cn')
# print(soup)
result = soup.find(class_='scroll_cont')
li_list = result.find_all('li')
for li in li_list:
href = li.find('a')['href']
if 'http' in str(href):
href = href
else:
href = 'https://sxgz.shaanxi.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
res_href = requests.get(url=href, headers=baseTool.headers)
res_href.encoding = res_href.apparent_encoding
res_text = res_href.text
# i_soup = BeautifulSoup(res_text, 'html.parser')
i_soup = baseTool.paserUrl(res_text, href)
title = i_soup.find(class_='m-gk-title').text
i_result = i_soup.find(class_='ftitle')
span_list = i_result.find_all('span')
origin = str(span_list[0]).split('<span>')[1].split('</span>')[0]
publishDate = str(span_list[2]).split('<span>')[1].split('</span>')[0]
t = time.strptime(publishDate, "%Y/%m/%d %H:%M:%S")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
contentWithTag = i_soup.find(class_='scroll_cont')
soup = BeautifulSoup(str(contentWithTag), 'html.parser')
div_tag = soup.find(id='ztl')
div_tag.extract()
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1680', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                        file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1680", 'relationName': "陕西省国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
res_href.close()
except Exception as e:
pass
res.close()
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
shanxi()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 四川
def si_chuan():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
url = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/cu_xxgk_xzgfxwj.shtml'
else:
url = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/cu_xxgk_xzgfxwj_2.shtml'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).text
doc_resp = pq(resp_text)
doc_items_ = doc_resp('.biaobody')
doc_items = doc_items_('li').items()
for doc_item in doc_items:
id_list = []
# print(doc_item)
pub_time = doc_item('.lie4').text().strip() + ' 00:00:00'
pub_hao = doc_item('.lie3').text().strip()
title = doc_item('a').attr('title')
href = doc_item('a').attr('href')
if 'http:' not in href:
href = 'http://gzw.sc.gov.cn' + doc_item('a').attr('href')
# href = 'http://gzw.sc.gov.cn/scsgzw/CU2304010701/2018/1/22/9c5db691e09f4efdafce41763a0d7e03.shtml'
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# print(href)
href_text = requests.get(url=href, headers=baseTool.headers, verify=False).text
doc_href = pq(href_text)
# title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
# content = str(doc_href('#scrollBox').children())
# 将doc_href转化为BeautifulSoup
doc_href = BeautifulSoup(str(doc_href), 'html.parser')
# 相对路径转化为绝对路径
doc_href = baseTool.paserUrl(doc_href, href)
contentWithTag = doc_href.find('div', id='scrollBox')
content = contentWithTag.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_list = doc_href.find_all('a')
for fu_jian in fu_jian_list:
try:
fu_jian_href = fu_jian['href']
except:
continue
file_name = fu_jian.text.strip()
if '.doc' in fu_jian_href or '.docx' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.xlsx' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# 对附件上传至文件服务器
retData = baseCore.uptoOBS(fu_jian_href, '1678', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '四川省国资委', file_name, num, pub_time)
id_list.append(att_id)
                            fu_jian['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# fu_jian_href_list.append(fu_jian_href)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1678", 'relationName': "四川省国资委", 'labelMark': "policy"}],
'origin': '',
'organ': '',
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
log.info(title)
count += 1
num = num + 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
si_chuan()
\ No newline at end of file
import datetime
import time
from comData.policylaw.an_hui import an_hui
from comData.policylaw.bei_jing import bei_jing
from comData.policylaw.chong_qing import chong_qing
from comData.policylaw.fu_jian import fu_jian
from comData.policylaw.guang_dong import guang_dong
from comData.policylaw.guang_xi import guang_xi
from comData.policylaw.gui_zhou import gui_zhou
from comData.policylaw.gwyfile import get_content1
from comData.policylaw.gwyparts import get_content2
from comData.policylaw.gwysasac import get_content3
from comData.policylaw.hai_nan import hai_nan
from comData.policylaw.he_nan import he_nan
from comData.policylaw.hei_long_jiang import hei_long_jiang
from comData.policylaw.ji_lin import ji_lin
from comData.policylaw.jiang_su import jiang_su
from comData.policylaw.jiang_xi import jiang_xi
from comData.policylaw.liao_ning import liao_ning
from comData.policylaw.nei_meng_gu import nei_meng_gu
from comData.policylaw.shan_dong import shan_dong
from comData.policylaw.shan_xi import shan_xi
from comData.policylaw.shang_hai import shang_hai
from comData.policylaw.si_chuan import si_chuan
from comData.policylaw.tian_jin import tian_jin
from comData.policylaw.xin_jiang import xin_jiang
from comData.policylaw.yun_nan import yun_nan
from comData.policylaw.zhe_jiang import zhe_jiang
# the modules below are assumed to follow the same comData.policylaw.<name> convention
from comData.policylaw.hu_nan import hu_nan
from comData.policylaw.gan_su import gan_su
from comData.policylaw.ning_xia import ning_xia
from comData.policylaw.xi_zang import xi_zang
from comData.policylaw.shanxi import shanxi
from comData.policylaw.qing_hai import qing_hai
from comData.policylaw.he_bei import he_bei
if __name__ == "__main__":
get_content1()
get_content3()
bei_jing()
nei_meng_gu()
ji_lin()
shang_hai()
zhe_jiang()
fu_jian()
shan_dong()
guang_dong()
hai_nan()
si_chuan()
guang_xi()
gui_zhou()
yun_nan()
chong_qing()
tian_jin()
xin_jiang()
shan_xi()
liao_ning()
hei_long_jiang()
jiang_su()
an_hui()
jiang_xi()
he_nan()
hu_nan()
gan_su()
ning_xia()
xi_zang()
shanxi()
qing_hai()
he_bei()
qing_hai()
get_content2()
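    # compute the seconds remaining until the next local midnight and sleep until then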
current_time = datetime.datetime.now()
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
time.sleep(sleep_seconds)
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 西藏
def xi_zang():
    start_time = time.time()
    pathType = 'policy/xizang/'
    num = 0
    count = 0
    url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
                'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
    for url in url_list:
try:
res = requests.get(url=url, headers=baseTool.headers)
res.encoding = res.apparent_encoding
res_text = res.text
# soup = BeautifulSoup(res_text, 'html.parser')
soup = baseTool.paserUrl(res_text, url)
result = soup.find('ul', class_='list')
li_list = result.find_all('li')
for li in li_list:
href = li.find('a')['href']
title = li.find('a').text
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
res_href = requests.get(url=href, headers=baseTool.headers)
res_href.encoding = res_href.apparent_encoding
res_href = res_href.text
# i_soup = BeautifulSoup(res_href, 'html.parser')
i_soup = baseTool.paserUrl(res_href, href)
i_result = i_soup.find(class_='inform')
div_list = i_result.find_all('div')
publishDate = str(div_list[0]).split('<div>')[1].split('</div>')[0].replace('发布时间:', '')
origin = str(div_list[1]).split('<div>')[1].split('</div>')[0].replace('来源:', '')
contentWithTag = str(i_soup.find(id='NewsContent'))
soup = BeautifulSoup(contentWithTag, 'html.parser')
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1695', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '西藏自治区国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
                            file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
# todo:替换完成之后,将附件上传至文件服务器
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1695", 'relationName': "西藏自治区国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': "",
'topicClassification': "",
'issuedNumber': "",
'publishDate': publishDate,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
if __name__ == "__main__":
xi_zang()
\ No newline at end of file
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 浙江
def zhe_jiang():
start = time.time()
num = 0
count = 0
url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
try:
        res = requests.get(url, headers=baseTool.headers).content
soup = BeautifulSoup(res, 'html.parser')
# print(soup)
# recordset = soup.find('recordset')
list_li = re.findall('CDATA\[\\n(.*?)\]\]></record>', str(soup))
# print(list_li)
for li in list_li:
fj_href_list = []
li = BeautifulSoup(li, 'lxml')
href = li.find('a')['href']
pub_time = li.find('a').find('span').text
title = li.find('a').text.replace(pub_time, '').strip()
# log.info(title)
if 'http' in href:
href = href
else:
href = 'http://gzw.zj.gov.cn/' + href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
href_text.encoding = href_text.apparent_encoding
i_html = href_text.text
i_soup = BeautifulSoup(i_html, 'html.parser')
# 将相对路径转化为绝对路径
i_soup = baseTool.paserUrl(i_soup, href)
# g_xxgk_table cf
i_info = i_soup.find_all(class_='g_xxgk_td')
if len(i_info) != 0:
try:
pub_source = str(i_info[4]).split('"g_xxgk_td">')[1].split('</div>')[0]
# pub_time = str(i_info[5]).split('"g_xxgk_td">')[1].split('</div>')[0]
pub_hao = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
content = i_soup.find(class_='g_content').text
contentWithTag = str(i_soup.find(class_='g_content'))
except:
# pub_source = str(i_info[3])
# print(pub_source)
pub_source = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
# pub_time = str(i_info[3]).split('"g_xxgk_td">')[1].split('</div>')[0]
pub_hao = ''
content = i_soup.find(class_='g_content').text
contentWithTag = str(i_soup.find(class_='g_content'))
else:
try:
source = i_soup.find('span', class_='rich_media_meta rich_media_meta_nickname')
pub_source = source.find('a').text
time_ = i_soup.find('em', id='publish_time')
pub_time = time_.text
pub_hao = ''
content = i_soup.find(
class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm').text
contentWithTag = str(i_soup.find(
class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm'))
except:
try:
source = i_soup.find_all(class_='ant-space-item')
# pub_time = str(source[1]).split('<span>')[1].split('</span>')[0]
pub_source = str(source[0]).split('<span>')[1].split('</span>')[0].replace('来源:', '')
pub_hao = ''
content = i_soup.find(class_='index_wrapper__L_zqV').text
contentWithTag = str(i_soup.find(class_='index_wrapper__L_zqV'))
except:
source = i_soup.find('div', class_='zsy_cotitle').find('p').text
pub_source = source.split('文章来源:')[1].split('发布时间:')[0]
pub_hao = ''
                            content = i_soup.find('div', class_='zsy_comain').text.replace('扫一扫在手机打开当前页', '').strip()
contentWithTag = str(i_soup.find('div', class_='zsy_comain')).replace('扫一扫在手机打开当前页',
'').strip()
# fujian_list = i_soup.find(class_='related').find_all('li')
# for fujian in fujian_list:
# # print(fujian)
# fujian_href = 'http://www.sasac.gov.cn/' + str(fujian.find('a')['href']).replace('../', '')
# fj_href_list.append(fujian_href)
# print(fj_href_list)
log.info(title)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
if content == '' or content == 'None':
log.info(f'{href}-----{title}----内容为空')
continue
dic_news = {
'attachmentIds': [],
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1672", 'relationName': "浙江省国资委", 'labelMark': "policy"}],
'origin': pub_source,
'organ': pub_source,
'topicClassification': '',
'issuedNumber': pub_hao,
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except:
pass
except:
pass
end = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end - start}')
if __name__ == "__main__":
zhe_jiang()
\ No newline at end of file