Commit 8fb1c602 by 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3
from requests.adapters import HTTPAdapter
from BaseCore import BaseCore
baseCore = BaseCore()
@@ -116,7 +116,7 @@ def sendKafka(dic_news):
    # send succeeded, record it in the log
    state = 1
    takeTime = baseCore.getTimeCost(start_time, time.time())
    return True
except Exception as e:
@@ -130,6 +130,7 @@ def sendKafka(dic_news):
    e = 'Kafka操作失败'
    state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    return False
def redefid(idList):
@@ -140,8 +141,39 @@ def redefid(idList):
def remove_dup():
    pass
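The point of the new return value: the collectors below now persist a record only after the Kafka send has succeeded. A minimal sketch of that call pattern (push_record is a hypothetical wrapper, not part of this commit; dic_news stands for any record a collector assembles):

def push_record(dic_news):
    # hypothetical wrapper illustrating the pattern used by every collector below
    flag = sendKafka(dic_news)   # True only when the producer send succeeded
    if flag:
        save_data(dic_news)      # write to Mongo only after Kafka accepted the record
    return flag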
# State Council documents
def get_content1():
    def getPageConunt(a_list, url, headers, s):
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": 1}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        # the response body is JSON
        res_text = json.loads(res.text)
        pageCount = res_text['result']['data']['pager']['pageCount']
        return pageCount

    def getList(a_list, url, headers, pageNo, s):
        # parameters for the POST request
        data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                "trackTotalHits": "true",
                "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
                "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}], "childrenInfoIds": [[a_list[1]]],
                "pageSize": 20, "pageNo": pageNo}
        data = json.dumps(data)
        ip = baseCore.get_proxy()
        res = s.post(url=url, headers=headers, data=data, verify=False, proxies=ip)
        res_text = json.loads(res.text)
        page_list = res_text['result']['data']['list']
        return page_list

    start_time = time.time()
    num = 0
    # athenaAppKey / athenaAppName, needed to get past the site's verification
@@ -172,86 +204,142 @@ def get_content1():
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"],
                   ['国办发明电', "1102"], ['其他', "1101"]]
    for a_list in result_list:
        s = requests.session()
        s.mount('https://', HTTPAdapter(max_retries=3))
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.keep_alive = False
        pcodeJiguan = a_list[0]
        try:
            pageCount = getPageConunt(a_list, url, headers, s)
            for pageNo in range(1, pageCount + 1):
                try:
                    try:
                        page_list = getList(a_list, url, headers, pageNo, s)
                    except:
                        s.close()
                        page_list = getList(a_list, url, headers, pageNo, s)
                    for page in page_list:
                        id_list = []
                        # fields we need
                        title = page['maintitle']  # title
                        pub_time1 = page['publish_time']  # publish date
                        pub_time2 = page['cwrq']  # written date
                        pub_code = page['fwzh']  # document number
                        href = page['pub_url']  # URL
                        # skip records that have already been crawled
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp_href = requests.get(url=href, headers=headers_, verify=False)
                            resp_href.encoding = resp_href.apparent_encoding
                            i_html = resp_href.text
                            if '您访问的页面不存在或已删除' in i_html:
                                # log.error(f'{title}...{href}...页面不存在或已删除')
                                continue
                            i_soup = BeautifulSoup(i_html, 'html.parser')
                            i_soup = paserUrl(i_soup, href)
                            source = str(i_soup.find_all('tbody')[0])
                            pub_org = source.split('<td><b>发文机关:</b></td>')[1].split('<td>')[1].split('</td>')[0]  # issuing organ
                            child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]  # topic classification
                            contentWithTag = i_soup.find('div', class_='wrap mxxgkwrap mxxgkwrap_gwywj').find('table', class_='border-table noneBorder pages_content')
                            # remove the "scan the QR code" block
                            contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
                            content = contentWithTag.text  # body text without tags
                            fu_jian_soup = contentWithTag.find_all('a')
                            time.sleep(0.5)
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1766')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo: update the returned address into the soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            log.error(f'{title}...{href}...获取内容失败')
                            continue
                        # todo: after replacing the links, the attachments are on the file server
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo: fields passed to Kafka
                        dic_news = {
                            'attachmentIds': id_list,  # attachment ids
                            'author': '',  # author
                            'content': content,  # body text without tags
                            'contentWithTag': str(contentWithTag),  # body text with tags
                            'createDate': time_now,  # creation time
                            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
                            'id': '',
                            'labels': [{'relationId': "1766", 'relationName': "国务院文件", 'labelMark': "policy"}],  # related label id / name / mark
                            'origin': '',  # publishing organ
                            'organ': pub_org,  # issuing organ
                            'topicClassification': child_type,  # policy classification
                            'issuedNumber': pub_code,  # document number
                            'publishDate': pub_time1,  # publish date
                            'writtenDate': pub_time2,  # written date
                            'sid': '1697458829758697473',  # source id
                            'sourceAddress': href,  # original link
                            'summary': '',  # summary
                            'title': title  # title
                        }
                        # print(dic_news)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    log.error(f'{pcodeJiguan}...第{pageNo}页获取列表失败')
                    continue
        except:
            log.error(f'{pcodeJiguan}...获取总数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
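get_content1 runs every detail page through paserUrl before collecting attachment links, so relative URLs resolve against the article URL. The helper itself is defined elsewhere in this repo; a minimal sketch of what it is assumed to do:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def paserUrl(soup: BeautifulSoup, base_url: str) -> BeautifulSoup:
    # assumed behaviour: rewrite relative href/src attributes against the page URL
    for tag in soup.find_all(True):
        if tag.get('href'):
            tag['href'] = urljoin(base_url, tag['href'])
        if tag.get('src'):
            tag['src'] = urljoin(base_url, tag['src'])
    return soup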
# State Council department documents
def get_content2():
    def getTotalpage(bmfl, headers, session):
        ip = baseCore.get_proxy()
        pageNo = 1
        time.sleep(2)
        # build the url
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        totalpage = resp_json['searchVO']['totalpage']
        return totalpage

    def getContentList(bmfl, pageNo, headers, session):
        ip = baseCore.get_proxy()
        url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
        # the response body is JSON
        resp = session.get(url=url_, headers=headers, verify=False, proxies=ip)
        resp_text = resp.text
        resp_json = json.loads(resp_text)
        content_list = resp_json['searchVO']['listVO']
        return content_list

    session = requests.session()
    session.mount('https://', HTTPAdapter(max_retries=3))
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.keep_alive = False
    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
@@ -271,171 +359,261 @@ def get_content2():
    for bmfl in result_list:
        try:
            totalpage = getTotalpage(bmfl, headers, session)
            for pageNo in range(1, totalpage + 1):
                try:
                    try:
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    except:
                        session.close()
                        content_list = getContentList(bmfl, pageNo, headers, session)
                    for content_dict in content_list:
                        id_list = []
                        href = content_dict['url']  # detail page
                        title = content_dict['title']  # title
                        pub_code = content_dict['pcode']  # document number
                        try:
                            pub_time = int(content_dict['pubtime'] / 1000)  # publish date
                            pub_time1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(pub_time))
                        except:
                            pub_time1 = ''
                        try:
                            p_time = int(content_dict['ptime'] / 1000)  # written date
                            pub_time2 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(p_time))
                        except:
                            pub_time2 = ''
                        pub_org = content_dict['puborg']  # issuing organ
                        try:
                            child_type = content_dict['childtype']  # topic classification
                        except:
                            child_type = ''
                        # skip records that have already been crawled
                        is_href = db_storage.find_one({'网址': href})
                        if is_href:
                            log.info('已采集----------跳过')
                            continue
                        try:
                            resp = requests.get(url=href, headers=headers, verify=False)
                            resp.encoding = resp.apparent_encoding
                            resp_text = resp.text
                            soup = BeautifulSoup(resp_text, 'html.parser')
                            soup = paserUrl(soup, href)
                            time.sleep(0.5)
                            contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                            content = contentWithTag.text
                            fu_jian_soup = contentWithTag.find_all('a')
                            for file in fu_jian_soup:
                                try:
                                    file_href = file['href']
                                except Exception as e:
                                    log.info(f'---{href}--------{e}-------')
                                    continue
                                if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                                        or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                        or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                    file_name = file.text.strip()
                                    retData = baseCore.uploadToserver(file_href, '1699')
                                    if retData['state']:
                                        pass
                                    else:
                                        continue
                                    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', file_name, num)
                                    id_list.append(att_id)
                                    # todo: update the returned address into the soup
                                    file['href'] = 'http://114.115.215.96/' + full_path
                        except:
                            print(f'{title}...{href}获取内容失败')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo: fields passed to Kafka
                        dic_news = {
                            'attachmentIds': id_list,  # attachment ids
                            'author': '',  # author
                            'content': content,  # body text without tags
                            'contentWithTag': str(contentWithTag),  # body text with tags
                            'createDate': time_now,  # creation time
                            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
                            'id': '',
                            'labels': [{'relationId': "1699", 'relationName': "国务院各部委文件", 'labelMark': "policy"}],  # related label id / name / mark
                            'origin': '',  # publishing organ
                            'organ': pub_org,  # issuing organ
                            'topicClassification': child_type,  # policy classification
                            'issuedNumber': pub_code,  # document number
                            'publishDate': pub_time1,  # publish date
                            'writtenDate': pub_time2,  # written date
                            'sid': '1697458829758697473',  # source id
                            'sourceAddress': href,  # original link
                            'summary': '',  # summary
                            'title': title  # title
                        }
                        # print(dic_news)
                        flag = sendKafka(dic_news)
                        if flag:
                            save_data(dic_news)
                        num += 1
                except:
                    print(f'{bmfl}...第{pageNo}页获取信息列表失败')
                    continue
        except:
            print(f'{bmfl}...获取页数失败')
            continue
    end_time = time.time()
    print(f'共抓取{num}条数据,耗时{end_time - start_time}')
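Each collector repeats the same long suffix check before uploading an attachment. A hypothetical helper (not part of this commit) with the same substring semantics, shown only to make the condition easier to read:

ATTACH_MARKERS = ('.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt')

def is_attachment(file_href: str) -> bool:
    # same semantics as the inline checks above: case-insensitive substring match
    lowered = file_href.lower()
    return any(marker in lowered for marker in ATTACH_MARKERS)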
# SASAC (State-owned Assets Supervision and Administration Commission of the State Council) - policy releases
def get_content3():
    def getPage():
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        req = requests.get(url, headers=headers, verify=False)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
        totalpage = re.findall("maxPageNum = (.*);", soup.select('#pag_2603340')[0].text)[0]
        return int(totalpage)

    def sendContent(href, headers, title, pub_time, num):
        id_list = []
        resp_href = requests.request("GET", href, headers=headers, verify=False)
        resp_href.encoding = resp_href.apparent_encoding
        soup = BeautifulSoup(resp_href.text, 'lxml')
        soup = paserUrl(soup, href)
        doc_href = soup.find('div', class_='zsy_content')
        try:
            org_content = doc_href.select('.zsy_cotitle')[0].text
            org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
        except:
            org = ''
        contentWithTag = doc_href.find('div', class_='zsy_comain')
        contentWithTag.select('#qr_container')[0].decompose()
        contentWithTag.find('div', attrs={'id': 'div_div'}).decompose()
        contentWithTag.find('div', class_='related').decompose()
        contentWithTag.find('div', class_='jiathis_style_24x24').decompose()
        try:
            p_list = contentWithTag.findAll('p')
            pub_hao = ''
            for p in p_list:
                p = str(p.text)
                if ('号' in p and '〔' in p and '〕' in p) or ('[' in p and ']' in p and '号' in p) or ('【' in p and '】' in p and '号' in p):
                    try:
                        pub_hao = p.split('日')[1].split('自')[0].strip().lstrip()
                    except:
                        pub_hao = p.strip().lstrip()
                    break
        except:
            pub_hao = ''
        if len(pub_hao) > 15:
            pub_hao = ''
        content = contentWithTag.text
        fu_jian_soup = contentWithTag.find_all('a')
        for file in fu_jian_soup:
            try:
                file_href = file['href']
            except Exception as e:
                log.info(f'---{href}--------{e}-------')
                continue
            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xls' in file_href or '.zip' in file_href \
                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                file_name = file.text.strip()
                retData = baseCore.uploadToserver(file_href, '1642')
                if retData['state']:
                    pass
                else:
                    continue
                att_id, full_path = baseCore.tableUpdate(retData, '国务院国资委', file_name, num)
                id_list.append(att_id)
                # todo: update the returned address into the soup
                file['href'] = 'http://114.115.215.96/' + full_path
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # todo: fields passed to Kafka
        dic_news = {
            'attachmentIds': id_list,  # attachment ids
            'author': '',  # author
            'content': content,  # body text without tags
            'contentWithTag': str(contentWithTag),  # body text with tags
            'createDate': time_now,  # creation time
            'deleteFlag': 0,  # delete flag (0 default, 1 deleted)
            'id': '',
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],  # related label id / name / mark
            'origin': '',  # publishing organ
            'organ': org,  # issuing organ
            'topicClassification': '',  # policy classification
            'issuedNumber': pub_hao,  # document number
            'publishDate': pub_time,  # publish date
            'writtenDate': '',  # written date
            'sid': '1697458829758697473',  # source id
            'sourceAddress': href,  # original link
            'summary': '',  # summary
            'title': title  # title
        }
        # print(dic_news)
        flag = sendKafka(dic_news)
        if flag:
            save_data(dic_news)

    def partTwo():
        start_time = time.time()
        num = 0
        totalpage = getPage()
        for page in range(1, totalpage):
            url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            li_list = resp_text.split('<li>')
            del (li_list[0])
            for li in li_list:
                id_list = []
                href_ = li.split('<a href="')[1].split('" target=')[0]
                title = li.split('title="')[1].split('">')[0]
                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                pub_time = li.split('<span>[')[1].split(']</span>')[0]
                is_href = db_storage.find_one({'网址': href})
                if is_href:
                    log.info('已采集----------跳过')
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    def partOne():
        start_time = time.time()
        num = 0
        url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
        try:
            # GET request; SSL verification must be disabled
            href_resp = requests.request("GET", url, headers=headers, verify=False)
            resp_text = href_resp.content.decode('UTF-8')
            doc_resp = pq(resp_text)
            doc_items = doc_resp('.zsy_conlist li').items()
            time.sleep(1)
            for doc_item in doc_items:
                # fields we need
                try:
                    href_ = doc_item('a').attr('href')
                    if href_ is None:
                        continue
                    href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                    # skip records that have already been crawled
                    is_href = db_storage.find_one({'网址': href})
                    if is_href:
                        log.info('已采集----------跳过')
                        continue
                    title = doc_item('a').attr('title')
                    pub_time = doc_item('span').text().replace('[', '').replace(']', '')
                except:
                    continue
                sendContent(href, headers, title, pub_time, num)
                num += 1
        except:
            pass
        end_time = time.time()
        print(f'共抓取{num}条数据,耗时{end_time - start_time}')

    partOne()
    partTwo()
from bs4 import BeautifulSoup
from urllib.parse import urljoin
@@ -580,8 +758,9 @@ def bei_jing():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -698,9 +877,10 @@ def nei_meng_gu():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
@@ -890,8 +1070,9 @@ def ji_lin():
    continue
else:
    # print(dic_news)
    flag = sendKafka(dic_news)
    if flag:
        save_data(dic_news)
    num = num + 1
except Exception as e:
    print(e)
@@ -1024,8 +1205,9 @@ def shang_hai():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
    pass
@@ -1143,8 +1325,9 @@ def zhe_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num = num + 1
except:
@@ -1301,8 +1484,9 @@ def fu_jian():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num += 1
except:
@@ -1410,8 +1594,9 @@ def shan_dong():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
if content == '' or content == 'None':
    continue
else:
@@ -1512,8 +1697,9 @@ def guang_dong():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
@@ -1697,8 +1883,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -1768,8 +1955,9 @@ def hai_nan():
    'summary': '',
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -1873,8 +2061,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -1979,8 +2168,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -2065,8 +2255,9 @@ def hai_nan():
    'title': title
}
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
href_text.close()
# save_data(result_dict)
print(title)
@@ -2238,8 +2429,9 @@ def si_chuan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -2363,8 +2555,9 @@ def guang_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
except:
@@ -2471,8 +2664,9 @@ def gui_zhou():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num = num + 1
@@ -2584,8 +2778,9 @@ def yun_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
except:
@@ -2696,8 +2891,9 @@ def yun_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
num = num + 1
@@ -2826,8 +3022,9 @@ def chong_qing():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
print(title)
# save_data(result_dict)
num += 1
@@ -2951,8 +3148,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3073,8 +3271,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3199,8 +3398,9 @@ def tian_jin():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3306,8 +3506,9 @@ def xin_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3403,8 +3604,9 @@ def xin_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
except:
@@ -3521,8 +3723,9 @@ def shan_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3630,8 +3833,9 @@ def liao_ning():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3723,8 +3927,9 @@ def hei_long_jiang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3836,8 +4041,9 @@ def jiang_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -3930,8 +4136,9 @@ def an_hui():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4025,8 +4232,9 @@ def an_hui():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
except:
@@ -4158,8 +4366,9 @@ def jiang_xi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4250,8 +4459,9 @@ def he_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
href_res.close()
resp_text.close()
@@ -4351,8 +4561,9 @@ def hu_nan():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4472,8 +4683,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4607,8 +4819,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4763,8 +4976,9 @@ def gan_su():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    print(e)
@@ -4862,8 +5076,9 @@ def ning_xia():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -4960,8 +5175,9 @@ def shanxi():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
res_href.close()
except:
@@ -5053,8 +5269,9 @@ def xi_zang():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -5148,8 +5365,9 @@ def qing_hai():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -5265,8 +5483,9 @@ def qing_hai():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
# print(id)
# id_list.append(id)
num += 1
@@ -5363,8 +5582,9 @@ def he_bei():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except:
    pass
@@ -5471,8 +5691,9 @@ def hu_bei():
    'title': title
}
# print(dic_news)
flag = sendKafka(dic_news)
if flag:
    save_data(dic_news)
num += 1
except Exception as e:
    pass
......