Commit 9ab6c127 Author: LiuLiYuan

Policies and regulations 9/9

Parent eeb41ef7
......@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore
baseCore = BaseCore()
......@@ -110,7 +110,7 @@ def sendKafka(dic_news):
# Transfer succeeded; write it to the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
# return True
return True
except Exception as e:
......@@ -124,6 +124,7 @@ def sendKafka(dic_news):
e = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
return False
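# sendKafka returns True only when the message is delivered successfully and False otherwise;
# callers use this flag to decide whether to persist the record with save_data.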
def redefid(idList):
id_ = ','.join(map(str, idList))
......@@ -132,7 +133,38 @@ def redefid(idList):
def remove_dup():
pass
# State Council documents
def get_content1():
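# get_content1 crawls the "State Council documents" library: for every document-number
# category in result_list it queries the search API for the page count, walks each page of
# results, skips URLs already stored in db_storage, parses the detail page with BeautifulSoup,
# uploads attachments via baseCore.uploadToserver, and pushes the assembled record to Kafka.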
def getPageConunt(a_list, url, headers, s):
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": 1}
data = json.dumps(data)
ip = baseCore.get_proxy()
res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
# The response is JSON
res_text = json.loads(res.text)
pageCount = res_text['result']['data']['pager']['pageCount']
return pageCount
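# getList issues the same POST query for a specific pageNo and returns that page's list of
# result records (pageSize is 20).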
def getList(a_list, url, headers, pageNo, s):
# Parameters for the POST request
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": pageNo}
data = json.dumps(data)
ip = baseCore.get_proxy()
res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
res_text = json.loads(res.text)
page_list = res_text['result']['data']['list']
return page_list
start_time = time.time()
num = 0
# athenaAppKey and athenaAppName are required to pass the site's verification
......@@ -163,86 +195,142 @@ def get_content1():
result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
['国办函', "1103"],
['国办发明电', "1102"], ['其他', "1101"]]
try:
for a_list in result_list:
s = requests.session()
s.mount('https://', HTTPAdapter(max_retries=3))
s.mount('http://', HTTPAdapter(max_retries=3))
s.keep_alive = False
pageNo = 1
pcodeJiguan = a_list[0]
# Parameters for the POST request
data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
"resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
"trackTotalHits": "true",
"searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
"sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
"pageSize": 20, "pageNo": pageNo}
data = json.dumps(data)
res = s.post(url=url, headers=headers, data=data, verify=False)
# The response is JSON
res_text = json.loads(res.text)
page_list = res_text['result']['data']['list']
try:
pageCount = getPageConunt(a_list, url, headers, s)
for pageNo in range(1, pageCount + 1):
try:
try:
page_list = getList(a_list, url, headers, pageNo, s)
except:
s.close()
page_list = getList(a_list, url, headers, pageNo, s)
for page in page_list:
id_list = []
# Extract the fields we need
title = page['maintitle']
pub_time1 = page['publish_time']
pub_time2 = page['cwrq']
pub_code = page['fwzh']
href = page['pub_url']
title = page['maintitle'] # title
pub_time1 = page['publish_time'] # publish date
pub_time2 = page['cwrq'] # written date
pub_code = page['fwzh'] # issue number
href = page['pub_url'] # URL
# Skip records that have already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
try:
resp_href = requests.get(url=href, headers=headers_, verify=False)
resp_href.encoding = resp_href.apparent_encoding
i_html = resp_href.text
if '您访问的页面不存在或已删除' in i_html:
# log.error(f'{title}...{href}...页面不存在或已删除')
continue
i_soup = BeautifulSoup(i_html, 'html.parser')
i_soup = paserUrl(i_soup, href)
source = str(i_soup.find_all('tbody')[0])
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]
content = str(i_soup.find('table', attrs={'class': 'pages_content'}))
fu_jian_result = re.findall('href="(.*?)"', content)
fu_jian_href_list = []
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_href = fu_jian_re
fu_jian_href_list.append(fu_jian_href)
result_dict = {
'标题': title,
'来源': '',
'发文机关': pub_org,
'发文字号': pub_code,
'内容-未去标签': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time1,
'成文时间': pub_time2,
'主题分类': child_type,
'网址': href,
'归属': pcodeJiguan,
'信息来源': '国务院文件',
'tid': 1766,
pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[
0] # issuing organ
child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0] # topic classification
contentWithTag = i_soup.find('div',class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table',class_='border-table noneBorder pages_content')
# Remove the "scan QR code" block
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
content = contentWithTag.text # body text without tags
fu_jian_soup = contentWithTag.find_all('a')
time.sleep(0.5)
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1766')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
except:
log.error(f'{title}...{href}...获取内容失败')
continue
#todo: after the replacement is done, upload the attachments to the file server
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': pub_org, # issuing organ
'topicClassification': child_type, # policy topic classification
'issuedNumber': pub_code, # issue number
'publishDate': pub_time1, # publish date
'writtenDate': pub_time2, # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
resp_href.close()
print(title)
# save_data(result_dict)
# time.sleep(1)
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
pass
log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
continue
except:
pass
log.error(f'{pcodeJiguan}...获取总数失败')
continue
end_time = time.time()
print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
# State Council department documents
def get_content2():
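# get_content2 crawls the "State Council department documents" library: for every department
# name (bmfl) in result_list it fetches the total page count, pages through the JSON search
# results, skips URLs already stored in db_storage, parses each detail page, uploads
# attachments, and sends the record to Kafka.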
def getTotalpage(bmfl,headers,session):
ip = baseCore.get_proxy()
pageNo = 1
time.sleep(2)
# Build the request URL
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
totalpage = resp_json['searchVO']['totalpage']
return totalpage
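# getContentList fetches one result page for the given department (bmfl) and returns the
# listVO array from the JSON response.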
def getContentList(bmfl,pageNo,headers,session):
ip = baseCore.get_proxy()
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
# The response is JSON
resp = session.get(url=url_, headers=headers, verify=False,proxies=ip)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
return content_list
session = requests.session()
session.mount('https://', HTTPAdapter(max_retries=3))
session.mount('http://', HTTPAdapter(max_retries=3))
session.keep_alive = False
start_time = time.time()
num = 0
result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
......@@ -262,20 +350,16 @@ def get_content2():
for bmfl in result_list:
try:
pageNo = 0
time.sleep(2)
# Build the request URL
url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
totalpage = getTotalpage(bmfl,headers,session)
for pageNo in range(1,totalpage+1):
try:
# The response is JSON
resp = requests.get(url=url_, headers=headers, verify=False)
resp_text = resp.text
resp_json = json.loads(resp_text)
content_list = resp_json['searchVO']['listVO']
resp.close()
try:
content_list = getContentList(bmfl,pageNo,headers,session)
except:
continue
session.close()
content_list = getContentList(bmfl,pageNo,headers,session)
for content_dict in content_list:
id_list = []
href = content_dict['url'] # detail page URL
title = content_dict['title'] # title
pub_code = content_dict['pcode'] # issue number
......@@ -294,55 +378,198 @@ def get_content2():
child_type = content_dict['childtype'] # topic classification
except:
child_type = ''
# Check whether the record has already been crawled
# # Check whether the record has already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
try:
resp = requests.get(url=href, headers=headers, verify=False)
resp.encoding = 'utf-8'
resp.encoding = resp.apparent_encoding
resp_text = resp.text
soup = BeautifulSoup(resp_text, 'html.parser')
time.sleep(1)
content = str(soup.find('div', attrs={'class': 'pages_content mhide'}))
fu_jian_result = re.findall('href="(.*?)"', content)
fu_jian_href_list = []
if len(fu_jian_result) > 0:
for fu_jian_re in fu_jian_result:
if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
fu_jian_href = href.split('content')[0] + fu_jian_re
fu_jian_href_list.append(fu_jian_href)
resp.close()
result_dict = {
'标题': title,
'来源': '',
'发文机关': pub_org,
'发文字号': pub_code,
'内容-未去标签': content,
'附件网址': fu_jian_href_list,
'发布时间': pub_time1,
'成文时间': pub_time2,
'主题分类': child_type,
'网址': href,
'归属': bmfl,
'信息来源': '国务院部门文件',
'tid': 1699,
soup = paserUrl(soup,href)
time.sleep(0.5)
contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
content = contentWithTag.text
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1699')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院文件',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
except:
print(f'{title}...{href}获取内容失败')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': pub_org, # issuing organ
'topicClassification': child_type, # policy topic classification
'issuedNumber': pub_code, # issue number
'publishDate': pub_time1, # publish date
'writtenDate': pub_time2, # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
print(title)
save_data(result_dict)
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
pass
print(f'{bmfl}...第{pageNo}页获取信息列表失败')
continue
except:
pass
print(f'{bmfl}...获取页数失败')
continue
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
# SASAC (State-owned Assets Supervision and Administration Commission of the State Council) - policy releases
def get_content3():
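# get_content3 crawls the SASAC "policy releases" column: getPage reads maxPageNum from the
# index page, sendContent parses one detail page and pushes it to Kafka, partTwo walks the
# numbered index pages, and partOne handles the first (unnumbered) index page.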
def getPage():
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
return int(totalpage)
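# sendContent downloads a detail page, extracts the source organ, issue number, body text
# and attachments, uploads the attachments, and sends the assembled record to Kafka.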
def sendContent(href, headers,title,pub_time,num):
id_list = []
resp_href = requests.request("GET", href, headers=headers, verify=False)
resp_href.encoding = resp_href.apparent_encoding
soup = BeautifulSoup(resp_href.text, 'lxml')
soup = paserUrl(soup, href)
doc_href = soup.find('div', class_='zsy_content')
try:
org_content = doc_href.select('.zsy_cotitle')[0].text
org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
contentWithTag = doc_href.find('div', class_='zsy_comain')
contentWithTag.select('#qr_container')[0].decompose()
contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
contentWithTag.find('div', class_='related').decompose()
contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
try:
p_list = contentWithTag.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 15:
pub_hao = ''
content = contentWithTag.text
fu_jian_soup = contentWithTag.find_all('a')
for file in fu_jian_soup:
try:
file_href = file['href']
except Exception as e:
log.info(f'---{href}--------{e}-------')
continue
if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
file_name = file.text.strip()
retData = baseCore.uploadToserver(file_href,'1642')
if retData['state']:
pass
else:
continue
att_id,full_path = baseCore.tableUpdate(retData,'国务院国资委',file_name,num)
id_list.append(att_id)
#todo: write the returned file-server path back into the soup
file['href'] = 'http://114.115.215.96/' + full_path
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
#todo: fields sent to Kafka
dic_news = {
'attachmentIds': id_list, # attachment ids
'author': '', # author
'content': content, # body text without tags
'contentWithTag': str(contentWithTag), # body text with tags
'createDate': time_now, # creation time
'deleteFlag': 0, # delete flag (0 = default, 1 = deleted)
'id': '', #
'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}], # related label id / name / mark
'origin': '', # publishing authority
'organ': org, # issuing organ
'topicClassification': '', # policy topic classification
'issuedNumber': pub_hao, # issue number
'publishDate': pub_time, # publish date
'writtenDate': '', # written date
'sid': '1697458829758697473', # information source id
'sourceAddress': href, # original URL
'summary': '', # summary
'title': title # title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
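# partTwo iterates the numbered index pages (index_2603340_{page}.html), splits each page
# into <li> entries, skips URLs already stored in db_storage, and hands new links to sendContent.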
def partTwo():
start_time = time.time()
num = 0
totalpage = getPage()
for page in range(1, totalpage):
url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
href_resp = requests.request("GET", url, headers=headers, verify=False)
resp_text = href_resp.content.decode('UTF-8')
li_list = resp_text.split('<li>')
del (li_list[0])
for li in li_list:
id_list = []
href_ = li.split('<a href="')[1].split('" target=')[0]
title = li.split('title="')[1].split('">')[0]
href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
pub_time = li.split('<span>[')[1].split(']</span>')[0]
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
sendContent(href, headers,title,pub_time,num)
num += 1
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
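# partOne processes the first (unnumbered) index page with pyquery, applies the same
# already-crawled check, and calls sendContent for each new link.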
def partOne():
start_time = time.time()
num = 0
url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
......@@ -363,70 +590,22 @@ def get_content3():
# Check whether the record has already been crawled
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info('已采集----------跳过')
continue
title = doc_item('a').attr('title')
pub_time = doc_item('span').text().replace('[', '').replace(']', '')
except:
continue
try:
try:
resp_href = requests.request("GET", href, headers=headers, verify=False)
doc_href = pq(resp_href.content)
time.sleep(1)
content_html = str(doc_href('.zsy_comain').remove('style').remove('#qr_container'))
content = pq(content_html).text()
except:
continue
if content.strip() == '':
continue
try:
org_content = doc_href('.zsy_cotitle').text()
org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
except:
org = ''
try:
resp_href.encoding = 'utf-8'
resp_text_ = BeautifulSoup(resp_href.text, 'html.parser')
zsy_comain = resp_text_.find('div', attrs={'class': 'zsy_comain'})
p_list = zsy_comain.findAll('p')
pub_hao = ''
for p in p_list:
p = str(p.text)
if '号' in p and '〔' in p and '〕' in p or '[' in p and ']' in p and '号' in p or '【' in p and '】' in p and '号' in p:
try:
pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
except:
pub_hao = p.strip().lstrip()
break
except:
pub_hao = ''
if len(pub_hao) > 45:
pub_hao = ''
result_dict = {
'标题': title,
'来源': org,
'发文机关': '',
'发文字号': pub_hao,
'内容-未去标签': content_html,
'附件网址': [],
'发布时间': pub_time,
'成文时间': '',
'主题分类': '',
'网址': href,
'归属': '国务院国资委',
'信息来源': '国务院国资委',
'tid': 1642,
}
save_data(result_dict)
print(title)
sendContent(href, headers,title,pub_time,num)
num += 1
except:
pass
except:
pass
end_time = time.time()
print(f'共抓取{num}条数据,耗时{end_time - start_time}')
partOne()
partTwo()
from bs4 import BeautifulSoup
from urllib.parse import urljoin
......@@ -569,7 +748,8 @@ def bei_jing():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -687,8 +867,9 @@ def nei_meng_gu():
'summary':'',
'title':title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
......@@ -872,7 +1053,8 @@ def ji_lin():
continue
else:
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
except Exception as e:
......@@ -1006,7 +1188,8 @@ def shang_hai():
'summary': '',
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
except:
......@@ -1123,7 +1306,8 @@ def zhe_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num = num + 1
......@@ -1278,7 +1462,8 @@ def fu_jian():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num += 1
......@@ -1386,7 +1571,8 @@ def shan_dong():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
if content == '' or content == 'None':
continue
......@@ -1485,7 +1671,8 @@ def guang_dong():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -1656,7 +1843,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
......@@ -1724,7 +1912,8 @@ def hai_nan():
'summary': '',
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -1826,7 +2015,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -1929,7 +2119,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -2012,7 +2203,8 @@ def hai_nan():
'title': title
}
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
href_text.close()
# save_data(result_dict)
......@@ -2182,7 +2374,8 @@ def si_chuan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
......@@ -2304,7 +2497,8 @@ def guang_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2409,7 +2603,8 @@ def gui_zhou():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -2518,7 +2713,8 @@ def yun_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2627,8 +2823,9 @@ def yun_nan():
'title': title
}
# print(dic_news)
# sendKafka(dic_news)
# save_data(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
num = num + 1
......@@ -2751,7 +2948,8 @@ def chong_qing():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
print(title)
# save_data(result_dict)
......@@ -2873,7 +3071,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -2992,7 +3191,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3115,7 +3315,8 @@ def tian_jin():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3221,7 +3422,8 @@ def xin_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3318,7 +3520,8 @@ def xin_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -3436,7 +3639,8 @@ def shan_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3544,7 +3748,8 @@ def liao_ning():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3638,7 +3843,8 @@ def hei_long_jiang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3751,7 +3957,8 @@ def jiang_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3841,7 +4048,8 @@ def an_hui():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -3935,7 +4143,8 @@ def an_hui():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -4062,7 +4271,8 @@ def jiang_xi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4154,7 +4364,8 @@ def he_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
href_res.close()
......@@ -4251,7 +4462,8 @@ def hu_nan():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4372,7 +4584,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4506,7 +4719,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4661,7 +4875,8 @@ def gan_su():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......@@ -4759,7 +4974,8 @@ def ning_xia():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -4857,7 +5073,8 @@ def shanxi():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
res_href.close()
......@@ -4951,7 +5168,8 @@ def xi_zang():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -5047,7 +5265,8 @@ def qing_hai():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -5164,7 +5383,8 @@ def qing_hai():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
# print(id)
# id_list.append(id)
......@@ -5262,7 +5482,8 @@ def he_bei():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except:
......@@ -5370,7 +5591,8 @@ def hu_bei():
'title': title
}
# print(dic_news)
sendKafka(dic_news)
flag = sendKafka(dic_news)
if flag:
save_data(dic_news)
num += 1
except Exception as e:
......