Commit baba9e5d  Author: 薛凌堃

Policy and regulation script maintenance

Parent f2ff6737
@@ -505,27 +505,36 @@ class BaseCore:
             for i in range(0, 3):
                 try:
                     response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                    file_size = int(response.headers.get('Content-Length'))
                     break
-                except:
+                except Exception as e:
                     time.sleep(3)
+                    if i ==2:
+                        return retData
                     continue
+            try:
+                if response.status_code == 200:
+                    file_size = int(response.headers.get('Content-Length'))
+                else:
+                    return retData
+            except:
+                file_size = ''
             for i in range(0, 3):
                 try:
                     name = str(self.getuuid()) + category
                     result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
                     break
                 except:
                     time.sleep(3)
                     continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
             retData['full_path'] = result['body']['objectUrl']
-            retData['file_size'] = self.convert_size(file_size)
+            try:
+                retData['file_size'] = self.convert_size(file_size)
+            except:
+                retData['file_size'] = ''
             retData['create_time'] = time_now
             return retData
         except Exception as e:
......
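The hunk above hardens the download step in BaseCore.uptoOBS: Content-Length is now read only after the response is checked, a failed third retry returns early, and the size conversion is wrapped so a missing header no longer aborts the upload. A minimal standalone sketch of the same pattern; the helper name fetch_with_size is an assumption for illustration and is not part of the repository:

import time
import requests

def fetch_with_size(url, headers, retries=3):
    # Illustrative sketch only: retry the download, then read Content-Length defensively.
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
            if i == retries - 1:
                return None, ''  # give up after the last retry, mirroring `return retData`
    try:
        if response.status_code == 200:
            file_size = int(response.headers.get('Content-Length'))
        else:
            return None, ''
    except (TypeError, ValueError):
        file_size = ''  # header missing or not numeric; the commit stores '' in that case
    return response, file_size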
@@ -34,8 +34,8 @@ def get_content3():
         doc_href = soup.find('div', class_='zsy_content')
         try:
             org_content = doc_href.select('.zsy_cotitle')[0]
-            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-        except:
+            org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
+        except Exception as e:
             org = ''
         try:
             contentWithTag = doc_href.find('div', class_='zsy_comain')
@@ -103,7 +103,7 @@ def get_content3():
             'id': '',  #
             'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
             # 关联标签id 关联标签名称 关联标签标识
-            'origin': '',  # 政策发布机关
+            'origin': org,  # 政策发布机关
             'organ': org,  # 政策发文机关
             'topicClassification': '',  # 政策文件分类
             'issuedNumber': pub_hao,  # 发文字号
@@ -168,10 +168,10 @@ def get_content3():
             href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
             # 判断是否已经爬取过
             is_href = baseTool.db_storage.find_one({'网址': href})
-            if is_href:
-                num += 1
-                log.info('已采集----------跳过')
-                continue
+            # if is_href:
+            #     num += 1
+            #     log.info('已采集----------跳过')
+            #     continue
             title = doc_item('a').attr('title')
             pub_time = doc_item('span').text().replace('[', '').replace(']', '')
         except:
@@ -184,9 +184,9 @@ def get_content3():
     end_time = time.time()
     log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
-# partOne()
+partOne()
 # 增量执行需要注释掉partTwo()
-partTwo()
+# partTwo()
 if __name__ == "__main__":
......
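The first hunk above fixes the source-organ extraction in get_content3: re.findall expects a string, but .select('.zsy_cotitle')[0] returns a BeautifulSoup Tag, so the old call raised TypeError and org always fell back to ''. A small self-contained illustration of the difference; the HTML snippet is invented for the example:

import re
from bs4 import BeautifulSoup

html = '<div class="zsy_cotitle"><p>文章来源:国务院国资委  发布时间:2023-09-01</p></div>'
org_content = BeautifulSoup(html, 'html.parser').select('.zsy_cotitle')[0]

# Old call: passing the Tag object raises "TypeError: expected string or bytes-like object".
# re.findall('文章来源:(.*?)发布时间:', org_content)

# New call: stringify the tag first, then extract the issuing organ.
org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
print(org)  # 国务院国资委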
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 吉林
def ji_lin():
start = time.time()
num = 0
count = 0
url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
try:
resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
resp_text.encoding = 'utf-8'
html = resp_text.text
soup = BeautifulSoup(html, 'html.parser')
result = soup.find(class_='list ej_list')
li_list = result.find_all('li')
for a in li_list:
id_list = []
a_text = str(a)
href = a.find('a')['href'] # 网站链接
if re.findall('http', href):
real_href = href
else:
real_href = url + a_text.split('href=".')[-1].split('" target="_blank')[0]
title = a.find('a').text.replace('\n', '')
is_href = baseTool.db_storage.find_one({'网址': real_href})
if is_href:
num += 1
continue
try:
# real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
i_html = href_text.text.encode("ISO-8859-1")
i_html = i_html.decode("utf-8")
i_soup = BeautifulSoup(i_html, 'html.parser')
# print(i_soup)
# 相对路径转化为绝对路径
soup = baseTool.paserUrl(i_soup, real_href)
soup.prettify()
try:
i_come = i_soup.find('span', class_='source')
i_time = i_soup.find('span', class_='time')
pub_come = i_come.text.split('.write(" ')[1].split('");')[0].strip()
pub_time = i_time.text.split('时间:')[1].strip()
except:
i_come = i_soup.find('div', class_='zsy_cotitle')
i_time = i_soup.find('div', class_='zsy_cotitle')
if (i_come):
# pub_come = i_come.find('p')
try:
pub_come = i_come.find('p').text.split('信息来源 > ')[1].split('发布时间:')[0].strip()
except:
pub_come = i_come.find('p').text.split('文章来源')[1].split('发布时间:')[0].strip()
# print(pub_time)
pub_time = i_time.find('p').text.split('发布时间:')[1].strip()
# print(pub_come)
else:
pub = i_soup.find(class_='share')
pub_time = pub.find(class_='left').find('span', class_='time').text
if '时间' in pub_time:
pub_time = pub_time.split('时间:')[1].strip()
pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
# print(pub_come)
i_content = soup.find(class_='zsy_comain')
if i_content:
# print(real_href)
# 去掉扫一扫
try:
soup.find('div', id='qr_container').decompose()
soup.find('div', id='div_div').decompose()
except:
i_content = soup
# 去掉style
# 去掉style标签
try:
for styleTag in soup.find_all('style'):
styleTag.extract()
except:
i_content = soup
contentWithTag = soup.find(class_='zsy_comain')
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 发文字号
find_hao = i_content.find_all('p')[:3]
pub_hao = ''
for j in find_hao:
if '号' in j.text:
pub_hao = j.text
else:
continue
fj = soup.find('div', style='width:920px; margin: 0 auto;')
if fj:
li_list = fj.find_all('li')
for li in li_list:
fu_jian_href = li.find('a')['href']
# 如果是附件
if '.pdf' in fu_jian_href or '.wps' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
file_name = li.find('a').text.strip()  # fu_jian_href is the URL string; take the display name from the anchor text
category = os.path.splitext(fu_jian_href)[1]
if category not in file_name:
file_name = file_name + category
# print(fu_jian_href)
retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num, pub_time)
id_list.append(att_id)
#
# # todo:将返回的地址更新到soup
li.find('a')['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
else:
continue
else:
i_content = soup.find(class_="content")
# 将文章中的附件字段删去
pattern = r'\d+\.'
# pattern = r"附件:\d+\.\s*(.*)"
for p in i_content.find_all('div')[-10:]:
p_text = p.text
matches = re.findall(pattern, p_text)
if matches:
for k in matches:
if k in p_text:
p.extract()
contentWithTag = i_content
content = contentWithTag.text.strip()
if content == '' or content == 'None':
log.info(f'{real_href}-----{title}----内容为空')
continue
# 找到附件上传至文件服务器
fj_soup = i_soup.find('div', class_='wenjianfujian')
fj_list = fj_soup.find_all('a')
# for fu_jian_href in fj_list:
# fj_href = fu_jian_href['href']
# file_name = fu_jian_href.text.strip()
# # 如果是附件
# if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
# or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
# or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
# # print(fj_href)
# category = os.path.splitext(fj_href)[1]
# if category not in file_name:
# file_name = file_name + category
# retData = baseCore.uptoOBS(fj_href, '1670', file_name)
# if retData['state']:
# pass
# else:
# continue
# att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num, pub_time)
# id_list.append(att_id)
# #
# # # todo:将返回的地址更新到soup
# fu_jian_href['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
# else:
# continue
if '扫一扫在手机打开当前页' in content:
content = content.replace('扫一扫在手机打开当前页', '')  # str.replace returns a new string; assign it back
soup.find('div', id='div_div').decompose()
soup.find('div', id='qr_container').decompose()
else:
pass
log.info(title)
# print('............................................................')
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1670", 'relationName': "吉林市国资委", 'labelMark': "policy"}],
'origin': pub_come,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'publishDate': pub_time,
'writtenDate': None,
'sid': '1697458829758697473',
'sourceAddress': real_href,
'summary': '',
'title': title
}
# 如果内容为空,则数据不传接口
if content == '' or content == 'None':
continue
else:
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num = num + 1
count += 1
except Exception as e:
log.info(e)
pass
except:
pass
end = time.time()
log.info(f'共{count}条...........共耗时 {end - start}秒')
if __name__ == "__main__":
ji_lin()
\ No newline at end of file
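Both the Jilin scraper above and the Tianjin scraper below repeat a long chain of '.pdf' in fu_jian_href or '.doc' in fu_jian_href ... checks before pushing a link through baseCore.uptoOBS. A compact equivalent is sketched here purely as an illustration; the helper name is_attachment and the exact extension set are assumptions, not part of the committed code:

import os

# Extensions treated as downloadable attachments in the scrapers above and below.
ATTACHMENT_EXTS = {'.pdf', '.wps', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rar', '.ppt'}

def is_attachment(href):
    # Illustrative sketch: case-insensitive check of the link's file extension.
    return os.path.splitext(href)[1].lower() in ATTACHMENT_EXTS

# Usage mirroring the attachment loops in ji_lin() and tian_jin():
# if is_attachment(fu_jian_href):
#     retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)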
import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()
# 天津
def tian_jin():
def tian_jin1():
num = 0
count = 0
start_time = time.time()
for page in range(0, 3):
if page == 0:
url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/'
else:
url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/index_{page}.html'
try:
baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
req = requests.get(url=url, headers=baseTool.headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
i_href = str(li.find('a').get('href'))
if 'ZTZL' in i_href:
href = i_href.replace('../../../', 'https://sasac.tj.gov.cn/')
elif './' in i_href:
href = i_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/')
else:
href = i_href
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
topicClassification = doc_href(
'div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
writtenDate_ = doc_href('div[class="top-container"]>div:nth-child(6)>:nth-child(2)').text()
publishDate_ = doc_href('div[class="top-container"]>div:nth-child(7)>:nth-child(2)').text()
date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
writtenDate = date_obj1.strftime("%Y-%m-%d")
date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj2.strftime("%Y-%m-%d")
doc_href('div[id="articlePlayer"]').remove()
contentWithTag = doc_href('div[id="xlrllt"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin2():
"""
http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
"""
num = 0
count = 0
start_time = time.time()
for page in range(0, 5):
if page == 0:
url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html'
else:
url = f'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index_{page}.html'
try:
baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
req = requests.get(url=url, headers=baseTool.headers, verify=False)
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
href = str(li.find('a').get('href'))
if 'http:' in href:
continue
else:
href = url.split('index')[0] + href.replace('./', '')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
topicClassification = doc_href(
'div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
writtenDate_ = doc_href('div[id="content_cwrq"]').text()
publishDate_ = doc_href('div[id="content_fbrq"]').text()
date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
writtenDate = date_obj1.strftime("%Y-%m-%d")
date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj2.strftime("%Y-%m-%d")
contentWithTag = doc_href('div[id="xlrllt"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
if id_list:
pass
else:
doc_href("ul[class='qt-attachments-list']").remove()
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
def tian_jin3():
num = 0
count = 0
start_time = time.time()
for page in range(1, 3):
if page == 1:
url = 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index.html'
else:
# https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_1.html
url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_{page - 1}.html'
try:
req = requests.get(url, headers=baseTool.headers, verify=False)  # pass headers by keyword; the second positional argument of requests.get is params
req_text = req.text.encode("ISO-8859-1")
req_text = req_text.decode("utf-8")
soup = BeautifulSoup(req_text, 'html.parser')
doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
li_list = doc_items.find_all('li')
for li in li_list:
title = str(li.find('a').text).replace('\n', '').lstrip().strip()
href = str(li.find('a').get('href'))
try:
publishDate = li.find('div', attrs={'class': 'other'}).text
except:
publishDate = None
if 'http' not in href:
if '../../../' in href:
href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
is_href = baseTool.db_storage.find_one({'网址': href})
if is_href:
num += 1
continue
try:
# res = requests.get(href, headers)
# page_text = res.text.encode("ISO-8859-1")
# page_text = page_text.decode("utf-8")
driver = baseTool.getDriver()
driver.get(href)
time.sleep(2)
href_text = driver.page_source
soup = baseTool.paserUrl(href_text, href)
doc_href = pq(str(soup))
title = doc_href('table[class="bd1"]>tbody>tr:nth-child(3)>td:nth-child(2)').text()
organ = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(2)').text()
issuedNumber = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(2)').text()
topicClassification = doc_href(
'table[class="bd1"]>tbody>tr:nth-child(1)>td:nth-child(4)').text()
writtenDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(4)').text()
publishDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(4)').text()
contentWithTag = doc_href('div[id="UCAP-CONTENT"]')
origin = ''
if len(title) < 1:
title = doc_href('div[class="common-content-mainTitle"]').text()
issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:',
'').strip()
publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:',
'').strip()
rmtag2 = doc_href('div[id="articlePlayer"]')
rmtag2.remove()
contentWithTag = doc_href('div[id="zoom"]')
if len(title) < 1:
doc_href = doc_href('div[aria-label="内容文本区"]')
doc_soup = BeautifulSoup(str(doc_href), 'html.parser')
info_list = doc_soup.find('tbody').find('tbody').find('tr').find_all('table')
title_tag = info_list[0]
organ = info_list[2].find('span', id="laiyuan").text
publishDate = info_list[2].find_all('td', class_="hui12")[-1].text
contentWithTag = info_list[-1]
if len(writtenDate) < 1:
writtenDate = None
if len(publishDate) < 1:
publishDate = doc_href('meta[name="PubDate"]').attr('content')
soup = baseTool.paserUrl(str(contentWithTag), href)
fu_jian_soup = soup.find_all('a')
id_list = []
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '1683', file_name)
if retData['state']:
pass
else:
continue
att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
id_list.append(att_id)
# todo:将返回的地址更新到soup
file['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
# id_ = redefid(id_list)
contentWithTag = str(soup.prettify())
if len(contentWithTag) < 1:
if len(fu_jian_soup) < 1:
continue
content = soup.text
if content == '' or content == None:
log.info(f'-----{href}----{title}----内容为空-----')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:传kafka字段
dic_news = {
'attachmentIds': id_list,
'author': '',
'content': str(content),
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'deleteFlag': 0,
'id': '',
'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
'origin': origin,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': issuedNumber,
'publishDate': publishDate,
'writtenDate': writtenDate,
'sid': '1697458829758697473',
'sourceAddress': href,
'summary': '',
'title': title
}
# print(dic_news)
flag = baseTool.sendKafka(dic_news)
if flag:
baseTool.save_data(dic_news)
num += 1
count += 1
except Exception as e:
pass
except:
pass
end_time = time.time()
log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
tian_jin1()
tian_jin2()
tian_jin3()
if __name__ == "__main__":
tian_jin()
\ No newline at end of file
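tian_jin1 and tian_jin2 normalise the Chinese-formatted dates read from the detail pages (for example 2023年09月01日) into the YYYY-MM-DD form used in the Kafka payload. A minimal sketch of that conversion with an empty-field fallback; the helper name and sample value are invented for the example:

import datetime

def normalize_cn_date(text):
    # Convert '2023年09月01日' to '2023-09-01'; return None when the field is empty.
    text = text.strip()
    if not text:
        return None
    return datetime.datetime.strptime(text, "%Y年%m月%d日").strftime("%Y-%m-%d")

print(normalize_cn_date('2023年09月01日'))  # 2023-09-01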