Commit 03de4b81, authored Oct 11, 2023 by 薛凌堃
Parent: d5ee8877

Commit message: 政策法规调整上传附件方式 — adjust how the policy & regulations crawlers upload attachments.

Showing 1 changed file with 388 additions and 152 deletions:

comData/policylaw/policy.py  (+388 −152)
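The recurring change in this commit: each crawler's attachment upload switches from baseCore.uploadToserver(file_href, tid) to baseCore.uptoOBS(file_href, tid, pathType, file_name), each function gains a pathType key prefix (e.g. 'policy/gwywj/'), and the rewritten attachment link drops the hard-coded host 'http://114.115.215.96/' in favor of the path the upload returns. A minimal sketch of the new flow, assuming only what the diff itself shows — baseCore.uptoOBS, baseCore.tableUpdate, and log are the project's own helpers; the wrapper function and its parameter names are hypothetical:

    ATTACHMENT_EXTS = ('.doc', '.docx', '.xlsx', '.pdf', '.xls', '.zip',
                       '.rar', '.ppt', '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def upload_attachments(soup, tid, pathType, source_name, title, num, baseCore, log):
        # Sketch of the per-article attachment flow this commit introduces.
        # baseCore.uptoOBS / baseCore.tableUpdate are called exactly as in the
        # diff below; everything else here is a stand-in for illustration.
        id_list = []
        for file in soup.find_all('a'):
            file_href = file.get('href', '')
            if not any(ext in file_href for ext in ATTACHMENT_EXTS):
                continue
            file_name = file.text.strip()
            # old call: baseCore.uploadToserver(file_href, tid)
            retData = baseCore.uptoOBS(file_href, tid, pathType, file_name)
            if not retData['state']:
                continue
            att_id, full_path = baseCore.tableUpdate(retData, source_name, title, num)
            id_list.append(att_id)
            # old: file['href'] = 'http://114.115.215.96/' + full_path
            file['href'] = full_path  # the returned path is now used as-is
        return id_list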
@@ -91,7 +91,8 @@ def save_data(dic_news):
         '网址': dic_news['sourceAddress'],
         'tid': dic_news['labels'][0]['relationId'],
         '来源': dic_news['labels'][0]['relationName'],
-        '创建时间': dic_news['createDate']
+        '创建时间': dic_news['createDate'],
+        '带标签内容': dic_news['contentWithTag'][:100]
     }
     db_storage.insert_one(aaa_dic)
...
@@ -138,6 +139,7 @@ def remove_dup():
 # 国务院文件
 def get_content1():
+    pathType = 'policy/gwywj/'
     def getPageConunt(a_list, url, headers, s):
         data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                 "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
...
@@ -256,7 +258,7 @@ def get_content1():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1766')
+                retData = baseCore.uptoOBS(file_href, '1766', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -265,7 +267,7 @@ def get_content1():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             except:
                 log.error(f'{title}...{href}...获取内容失败')
                 continue
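The same uploadToserver → uptoOBS substitution repeats below for the remaining call sites (attachment-table ids '1699' in get_content2 and '1642' in get_content3), each picking up its function's new pathType prefix; crawlers such as fu_jian and yun_nan already call uptoOBS ('1673', '1679') and keep those lines unchanged. Each converted site likewise stops prepending 'http://114.115.215.96/' to the attachment href.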
@@ -308,6 +310,7 @@ def get_content1():
 # 国务院部门文件
 def get_content2():
+    pathType = 'policy/gwybmwj/'
     def getTotalpage(bmfl, headers, session):
         ip = baseCore.get_proxy()
         pageNo = 1
...
@@ -336,6 +339,7 @@ def get_content2():
     session.keep_alive = False
     start_time = time.time()
     num = 0
+    count = 0
     result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
                    '人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
                    '国家卫生健康委员会',
...
@@ -396,6 +400,9 @@ def get_content2():
                 time.sleep(0.5)
                 contentWithTag = soup.find('div', attrs={'class': 'pages_content mhide'})
                 content = contentWithTag.text
+                if content == '' or content == 'None':
+                    log.info(f'----{href}---{title}---内容为空---')
+                    continue
                 fu_jian_soup = contentWithTag.find_all('a')
                 for file in fu_jian_soup:
                     try:
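From here on, most hunks also insert an empty-content guard right after the body text is extracted: when content is empty, the crawler logs 内容为空 ("content is empty") and skips the item instead of pushing a blank record to Kafka.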
@@ -407,7 +414,7 @@ def get_content2():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1699')
+                retData = baseCore.uptoOBS(file_href, '1699', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -416,7 +423,7 @@ def get_content2():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             except:
                 log.error(f'{title}...{href}获取内容失败')
                 continue
...
@@ -446,6 +453,7 @@ def get_content2():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
+                    count += 1
                 num += 1
             except:
                 log.error(f'{bmfl}...第{pageNo}页获取信息列表失败')
...
@@ -454,10 +462,11 @@ def get_content2():
             log.error(f'{bmfl}...获取页数失败')
             continue
     end_time = time.time()
-    log.info(f'共抓取国务院部门文件{num}条数据,耗时{end_time - start_time}')
+    log.info(f'共抓取国务院部门文件{count}条数据,耗时{end_time - start_time}')

 # 国务院国有资产监督管理委员会-政策发布
 def get_content3():
+    pathType = 'policy/gyzc/'
     def getPage():
         url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
         req = requests.get(url, headers=headers, verify=False)
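The other change threaded through the rest of the file is a second counter: alongside the existing num, each crawler gains a count that is incremented only after sendKafka succeeds and save_data runs, and the summary log switches from num to count, so it reports records actually saved rather than items merely visited (duplicates still bump num). A hypothetical minimal loop showing the convention — sendKafka, save_data, and log are the project's names from the diff; everything else is assumed:

    def crawl(items, is_dup, build_record, sendKafka, save_data, log):
        num = 0    # items visited, including duplicates that are skipped
        count = 0  # records actually pushed to Kafka and saved
        for item in items:
            if is_dup(item):
                num += 1
                continue
            dic_news = build_record(item)
            if sendKafka(dic_news):
                save_data(dic_news)
                count += 1
            num += 1
        log.info(f'共抓取{count}条数据')  # summary now reports real saves
        return count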
@@ -499,6 +508,9 @@ def get_content3():
         if len(pub_hao) > 15:
             pub_hao = ''
         content = contentWithTag.text
+        if content == '' or content == 'None':
+            log.info(f'----{href}----{title}----内容为空----')
+            return
         fu_jian_soup = contentWithTag.find_all('a')
         for file in fu_jian_soup:
             try:
...
@@ -510,7 +522,7 @@ def get_content3():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                 file_name = file.text.strip()
-                retData = baseCore.uploadToserver(file_href, '1642')
+                retData = baseCore.uptoOBS(file_href, '1642', pathType, file_name)
                 if retData['state']:
                     pass
                 else:
...
@@ -519,7 +531,7 @@ def get_content3():
                 id_list.append(att_id)
                 #todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         #todo:传kafka字段
         dic_news = {
...
@@ -542,7 +554,7 @@ def get_content3():
             'summary': '',   #摘要
             'title': title   #标题
         }
-        # print(title)
+        # log.info(title)
         flag = sendKafka(dic_news)
         if flag:
             save_data(dic_news)
...
@@ -550,6 +562,7 @@ def get_content3():
     def partTwo():
         start_time = time.time()
         num = 0
+        count = 0
         totalpage = getPage()
         for page in range(1, totalpage):
             url = f"http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index_2603340_{page}.html"
...
@@ -570,12 +583,14 @@ def get_content3():
                 continue
             sendContent(href, headers, title, pub_time, num)
             num += 1
+            count += 1
         end_time = time.time()
-        log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+        log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')

     def partOne():
         start_time = time.time()
         num = 0
+        count = 0
         url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
         try:
             # get请求,需要取消ssl验证
...
@@ -603,10 +618,11 @@ def get_content3():
                 continue
             sendContent(href, headers, title, pub_time, num)
             num += 1
+            count += 1
         except:
             pass
         end_time = time.time()
-        log.info(f'共抓取国资委文件{num}条数据,耗时{end_time - start_time}')
+        log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')

     partOne()
     # 增量执行需要注释掉partTwo()
@@ -614,7 +630,7 @@ def get_content3():
 # 北京
 def bei_jing():
-    num = 0
     start_time = time.time()
     pathType = 'policy/beijing/'
     # 有反爬需要使用selenium
@@ -662,6 +678,7 @@ def bei_jing():
         time.sleep(2)
         log.info(f'------{len(hrefs)}条数据-------------')
         num = 0
+        count = 0
         for href in hrefs:
             id_list = []
             title = href[1]
...
@@ -700,12 +717,15 @@ def bei_jing():
             soup = paserUrl(soup_cont, href[0])
             soup.prettify()
+            if soup.text == '' or soup.text == 'None':
+                log.info(f'----{href[0]}----{title}----内容为空----')
+                continue
             # todo:去掉扫一扫
             try:
                 soup.find('div', id='div_div').decompose()
             except:
                 continue
-            # print(title)
+            # log.info(title)
             fu_jian_soup = soup.find_all('a')
             for file in fu_jian_soup:
...
@@ -756,11 +776,10 @@ def bei_jing():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-            # print(id)
-            # id_list.append(id)
             num += 1
+            count += 1
         end_time = time.time()
-        log.info(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
         bro.quit()
     except Exception as e:
         log.info(e)
@@ -827,6 +846,9 @@ def nei_meng_gu():
             else:
                 i_content = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
                 content = str(i_content)
+            if i_content.text == '' or i_content.text == 'None':
+                log.info(f'{real_href}------{title}----内容为空-----')
+                continue
             # todo:内蒙古市的附件不在正文中,异步加载出来,替换不了标签,附件可上传att表中
             fujian = i_soup.find(class_='xy_zcwjxl_downloadPC_list')
             fu_jian_result = re.findall('href="(.*?)"', str(fujian))
...
@@ -849,7 +871,7 @@ def nei_meng_gu():
                     att_id, full_path = baseCore.tableUpdate(retData, '内蒙古自治区国资委', title, num)
                     id_list.append(att_id)
-            print(title)
+            log.info(title)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
@@ -892,6 +914,7 @@ def ji_lin():
     pathType = 'policy/jilin/'
     start = time.time()
     num = 0
+    count = 0
     url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
     try:
         resp_text = requests.get(url=url, headers=headers, verify=False)
...
@@ -964,6 +987,9 @@ def ji_lin():
                     i_content = soup
                     contentWithTag = soup.find(class_='zsy_comain')
                     content = contentWithTag.text.strip()
+                    if content == '' or content == 'None':
+                        log.info(f'{real_href}-----{title}----内容为空')
+                        continue
                     # 发文字号
                     find_hao = i_content.find_all('p')[:3]
                     pub_hao = ''
...
@@ -1010,6 +1036,9 @@ def ji_lin():
                             p.extract()
                     contentWithTag = i_content
                     content = contentWithTag.text.strip()
+                    if content == '' or content == 'None':
+                        log.info(f'{real_href}-----{title}----内容为空')
+                        continue
                 # 找到附件上传至文件服务器
                 fj_soup = i_soup.find('div', class_='wenjianfujian')
                 fj_list = fj_soup.find_all('a')
...
@@ -1040,7 +1069,7 @@ def ji_lin():
                     soup.find('div', id='qr_container').decompose()
                 else:
                     pass
-                print(title)
+                log.info(title)
                 # print('............................................................')
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
...
@@ -1073,13 +1102,14 @@ def ji_lin():
                 if flag:
                     save_data(dic_news)
                 num = num + 1
+                count += 1
             except Exception as e:
                 log.info(e)
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 上海
@@ -1087,6 +1117,7 @@ def shang_hai():
     start = time.time()
     pathType = 'policy/shanghai/'
     num = 0
+    count = 0
     for page in range(1, 7):
         if page == 1:
...
@@ -1111,7 +1142,7 @@ def shang_hai():
                 num += 1
                 continue
             try:
-                href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
+                # href = 'https://www.gzw.sh.gov.cn/shgzw_xxgk_zxgkxx/20230119/7c5e9691b2b54ff293e5d16d746d1a61.html'
                 href_text = requests.get(url=href, headers=headers, verify=False).text
                 doc_href = pq(href_text)
                 doc_href_ = BeautifulSoup(href_text, 'html.parser')
...
@@ -1120,6 +1151,9 @@ def shang_hai():
                 info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
                 pub_source = info_list[1].find('b').text.split('信息来源:')[1]
                 content = doc_href_.find('div', attrs={'class': 'detail_03'})
+                if content == '' or content == 'None':
+                    log.info(f'{href}-----{title}----内容为空')
+                    continue
                 # 将文章中的附件字段删去
                 pattern = r'\d+\.'
...
@@ -1181,7 +1215,7 @@ def shang_hai():
                 else:
                     continue
-                print(title)
+                log.info(title)
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
@@ -1209,18 +1243,19 @@ def shang_hai():
                 if flag:
                     save_data(dic_news)
                 num = num + 1
+                count += 1
             except:
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 浙江
 def zhe_jiang():
     start = time.time()
+    pathType = 'policy/zhejiang/'
     num = 0
+    count = 0
     url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
     try:
         res = requests.get(url, headers).content
@@ -1235,7 +1270,7 @@ def zhe_jiang():
             href = li.find('a')['href']
             pub_time = li.find('a').find('span').text
             title = li.find('a').text.replace(pub_time, '').strip()
-            # print(title)
+            # log.info(title)
             if 'http' in href:
                 href = href
             else:
...
@@ -1302,9 +1337,12 @@ def zhe_jiang():
                 # fj_href_list.append(fujian_href)
             # print(fj_href_list)
-            print(title)
+            log.info(title)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
+            if content == '' or content == 'None':
+                log.info(f'{href}-----{title}----内容为空')
+                continue
             dic_news = {
                 'attachmentIds': [],
                 'author': '',
...
@@ -1329,20 +1367,21 @@ def zhe_jiang():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
             num = num + 1
+            count += 1
         except:
             pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 福建
 def fu_jian():
     error_tag = str(404)
     pathType = 'policy/fujian/'
     num = 0
+    count = 0
     start_time = time.time()
     url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
     try:
@@ -1386,8 +1425,8 @@ def fu_jian():
                 i_html = href_text.text
                 i_soup = BeautifulSoup(i_html, 'html.parser')
                 real_href = href
-                # real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
-                # print(real_href)
+                # real_href = 'http://gzw.fujian.gov.cn/zwgk/zcfg/201806/t20180619_3065065.htm'
+                print(real_href)
                 is_href = db_storage.find_one({'网址': real_href})
                 if is_href:
                     num += 1
...
@@ -1437,6 +1476,7 @@ def fu_jian():
                         if '.doc' in fj_href or '.docx' in fj_href or '.xlsx' in fj_href or '.pdf' in fj_href or '.xls' in fj_href or '.zip' in fj_href \
                                 or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                                 or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
+                            print(fj_href)
                             # 找到附件后 上传至文件服务器
                             retData = baseCore.uptoOBS(fj_href, '1673', pathType, file_name)
                             if retData['state']:
@@ -1453,6 +1493,9 @@ def fu_jian():
                     pub_time = source_.split('发布时间:')[1].split('浏览量:')[0].strip().lstrip()
                     contentwithtag = i_soup.find('div', attrs={'class': 'xl_con1'})
                     content = i_soup.find('div', attrs={'class': 'xl_con1'}).text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     pub_hao = ''
                 except:
...
@@ -1460,6 +1503,9 @@ def fu_jian():
                     pub_time = ''
                     contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
                     content = contentwithtag.text.strip()
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     pub_hao = contentwithtag.find_all('div', class_='rules_tit1 b-free-read-leaf').text.dtrip()
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -1484,18 +1530,19 @@ def fu_jian():
                     'summary': '',
                     'title': title
                 }
-                # print(dic_news)
+                # log.info(dic_news)
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
                 num += 1
+                count += 1
             except:
                 pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 山东
 def shan_dong():
@@ -1505,6 +1552,7 @@ def shan_dong():
     }
     start = time.time()
     num = 0
+    count = 0
     url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
     for url in url_list:
         try:
...
@@ -1539,6 +1587,9 @@ def shan_dong():
                 # print(pub_time,pub_source,pub_hao)
                 content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
                 contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if pub_hao == '无':
                     p_list = content.find_all('p')
                     for p in p_list:
...
@@ -1571,6 +1622,9 @@ def shan_dong():
                         i = i + 1
                 content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle").text
                 contentwithtag = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -1597,23 +1651,22 @@ def shan_dong():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                if content == '' or content == 'None':
-                    log.info(title)
-                    continue
-                else:
-                    print(title)
                 num = num + 1
+                count += 1
             except:
                 pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 广东
 def guang_dong():
     start = time.time()
     pathType = 'policy/guangdong/'
     num = 0
+    count = 0
     url = 'http://gzw.gd.gov.cn/zcfg/index.html'
     try:
         resp_href = requests.get(url=url, headers=headers, verify=False)
@@ -1653,6 +1706,9 @@ def guang_dong():
             i_soup = paserUrl(i_soup, href)
             content = i_soup.find('div', attrs={'class', 'box_info'})
             contentwithTag = str(content)
+            if content == '' or content == None:
+                log.info(f'{href}-----{title}----内容为空----')
+                continue
             fu_jian_list = content.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
...
@@ -1701,15 +1757,15 @@ def guang_dong():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
             # save_data(result_dict)
             num = num + 1
+            count += 1
         except:
             pass
     except:
         pass
     end = time.time()
-    print('共', num, '条', '...........', '共耗时', end - start, '秒')
+    print('共', count, '条', '...........', '共耗时', end - start, '秒')

 # 海南
 def hai_nan():
...
@@ -1717,6 +1773,7 @@ def hai_nan():
     def hai_nan1():
         # 部门文件
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(13):
             if page == 0:
@@ -1770,6 +1827,9 @@ def hai_nan():
                 except:
                     pass
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 fu_jian_list = contentWithTag.find_all('a')
                 for fu_jian in fu_jian_list:
                     try:
...
@@ -1811,6 +1871,9 @@ def hai_nan():
                     topicClassification = tbody_text.split('分 类:')[1].split('发文机关:')[0].strip().lstrip()
                     contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     fu_jian_list = source.find_all('a')
                     try:
                         for fu_jian in fu_jian_list:
...
@@ -1862,6 +1925,9 @@ def hai_nan():
                     topicClassification = ''
                     contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
@@ -1888,19 +1954,20 @@ def hai_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
+                    count += 1
                 num = num + 1
             except:
                 pass
     except:
         pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def hai_nan2():
         def hai_nan_sw(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
@@ -1936,6 +2003,9 @@ def hai_nan():
                     pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
                     contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     # todo:传kafka字段
                     dic_news = {
...
@@ -1961,10 +2031,11 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
-                    href_text.close()
-                    # save_data(result_dict)
-                    print(title)
+                    log.info(title)
                     num += 1
+                    count += 1
+                    href_text.close()
                 except:
                     pass
             req.close()
@@ -1972,6 +2043,7 @@ def hai_nan():
         def hai_nan_szf(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
...
@@ -2010,6 +2082,9 @@ def hai_nan():
                         pub_time = str(pub_result[3]).split('发布日期:</strong>')[1].split('</span>')[0].strip()
                         contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     except:
                         # print(href)
                         pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
@@ -2021,6 +2096,9 @@ def hai_nan():
                         writtenDate = ''
                         contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     fu_jian_list = contentWithTag.find_all('a')
                     for fu_jian in fu_jian_list:
                         try:
...
@@ -2068,10 +2146,12 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
+                    log.info(title)
+                    num += 1
+                    count += 1
                     href_text.close()
                     # save_data(result_dict)
-                    print(title)
-                    num += 1
                 except:
                     pass
             req.close()
@@ -2079,6 +2159,7 @@ def hai_nan():
         def hai_nan_szfbgt(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.text, 'html.parser')
...
@@ -2127,6 +2208,9 @@ def hai_nan():
                         writtenDate = ''
                         contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                         content = contentWithTag.text
+                        if content == '' or content == None:
+                            log.info(f'-----{href}----{title}----内容为空-----')
+                            continue
                     fu_jian_list = contentWithTag.find_all('a')
                     if fu_jian_list:
                         for fu_jian in fu_jian_list:
@@ -2147,7 +2231,7 @@ def hai_nan():
                                 att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num)
                                 id_list.append(att_id)
                                 fu_jian['href'] = full_path
-                                print(f'----附件:{fu_jian_href}')
+                                # print(f'----附件:{fu_jian_href}')
                     else:
                         pass
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -2176,10 +2260,10 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
-                    href_text.close()
-                    # save_data(result_dict)
-                    print(title)
+                    log.info(title)
                     num += 1
+                    count += 1
+                    href_text.close()
                 except:
                     pass
             req.close()
@@ -2187,6 +2271,7 @@ def hai_nan():
         def hai_nan_zy(page_href):
             num = 0
+            count = 0
             req = requests.get(url=page_href, headers=headers, verify=False)
             req.encoding = req.apparent_encoding
             doc_resp = BeautifulSoup(req.content, 'html.parser')
...
@@ -2240,6 +2325,9 @@ def hai_nan():
                         pub_hao = ''
                     contentWithTag = doc_href.find(class_='pages_content')
                     content = contentWithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{i_href}----{title}----内容为空-----')
+                        continue
                     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     # todo:传kafka字段
                     dic_news = {
...
@@ -2266,10 +2354,12 @@ def hai_nan():
                     flag = sendKafka(dic_news)
                     if flag:
                         save_data(dic_news)
+                    log.info(title)
+                    num += 1
+                    count += 1
                     href_text.close()
                     # save_data(result_dict)
-                    print(title)
-                    num += 1
                 except:
                     pass
             req.close()
...
@@ -2277,6 +2367,7 @@ def hai_nan():
         def start():
             num = 0
+            count = 0
             start_time = time.time()
             url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
             try:
@@ -2306,7 +2397,7 @@ def hai_nan():
                     else:
                         page_href = str(url) + f'home_{page}.htm'
                     try:
-                        num += hai_nan_zy(page_href)
+                        count += hai_nan_zy(page_href)
                     except:
                         pass
                     time.sleep(1)
...
@@ -2320,7 +2411,7 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_sw(page_href)
+                        count += hai_nan_sw(page_href)
                     except:
                         pass
             elif url == leibie_href_list[2]:
...
@@ -2332,7 +2423,7 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_szf(page_href)
+                        count += hai_nan_szf(page_href)
                     except:
                         pass
             else:
...
@@ -2343,22 +2434,22 @@ def hai_nan():
                     else:
                         page_href = str(url).split('list3')[0] + 'list3_{}.shtml'.format(page + 1)
                     try:
-                        num += hai_nan_szfbgt(page_href)
+                        count += hai_nan_szfbgt(page_href)
                     except:
                         pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     start()
     hai_nan1()
     hai_nan2()

 # 四川
 def si_chuan():
     num = 0
+    count = 0
     pathType = 'policy/sichuan/'
     start_time = time.time()
     for page in range(1, 3):
@@ -2393,6 +2484,9 @@ def si_chuan():
             doc_href = paserUrl(doc_href, href)
             contentWithTag = doc_href.find('div', id='scrollBox')
             content = contentWithTag.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = doc_href.find_all('a')
             for fu_jian in fu_jian_list:
...
@@ -2441,19 +2535,20 @@ def si_chuan():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
+                count += 1
             num = num + 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 广西
 def guang_xi():
     num = 0
+    count = 0
     pathType = 'policy/guangxi/'
     start_time = time.time()
     url_all = """
...
@@ -2519,6 +2614,9 @@ def guang_xi():
             contentWithTag = BeautifulSoup(str(contentWithTag), 'html.parser')
             contentWithTag = paserUrl(contentWithTag, href)
             content = contentWithTag.text.strip()
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentWithTag.find_all('a')
             for fu_jian in fu_jian_list:
...
@@ -2568,14 +2666,14 @@ def guang_xi():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
             num = num + 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 贵州
 def gui_zhou():
@@ -2585,6 +2683,7 @@ def gui_zhou():
     """
     pathType = 'policy/guizhou/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(0, 11):
         if page == 0:
...
@@ -2630,6 +2729,9 @@ def gui_zhou():
             contentWithTag = paserUrl(contentWithTag, href)
             content = contentWithTag.text.strip()
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentWithTag.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
...
@@ -2678,8 +2780,8 @@ def gui_zhou():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
                 # save_data(result_dict)
+                count += 1
             num = num + 1
         except:
             pass
...
@@ -2697,6 +2799,7 @@ def yun_nan():
     http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml 1
     """
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 6):
         if page == 1:
...
@@ -2735,6 +2838,9 @@ def yun_nan():
             contentwithTag = \
                 doc_href.select('#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content')[0]
             content = contentwithTag.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             fu_jian_list = contentwithTag.find_all('a')
             for fu_jian in fu_jian_list:
                 try:
@@ -2793,18 +2899,20 @@ def yun_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
                 num = num + 1
+                count += 1
             except:
                 pass
             resp.close()
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def yun_nan2():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 4):
             if page == 1:
...
@@ -2828,7 +2936,7 @@ def yun_nan():
                 num += 1
                 continue
             try:
-                print(href)
+                # print(href)
                 if '.shtml' in href:
                     res_ = requests.get(href, headers)
                     page_text_ = res_.text.encode("ISO-8859-1")
@@ -2847,6 +2955,9 @@ def yun_nan():
                     pub_hao = ''
                     contentwithTag = page.find('div', attrs={'class': 'zfxxgk-right'})
                     content = contentwithTag.text
+                    if content == '' or content == None:
+                        log.info(f'-----{href}----{title}----内容为空-----')
+                        continue
                     fu_jian_list = contentwithTag.find_all('a')
                     for fu_jian in fu_jian_list:
                         try:
...
@@ -2857,7 +2968,7 @@ def yun_nan():
                             if '.doc' in fu_jian_href or '.pdf' in fu_jian_href or '.xls' in fu_jian_href or '.zip' in fu_jian_href \
                                     or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                     or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-                                print(fu_jian_href)
+                                # print(fu_jian_href)
                                 try:
                                     # 附件上传至文件服务器
                                     retData = baseCore.uptoOBS(fu_jian_href, '1679', pathType, file_name)
...
@@ -2876,9 +2987,7 @@ def yun_nan():
                 elif 'display' in href:
                     continue
                 else:
-                    content = ''
-                    contentwithTag = ''
-                    pub_hao = ''
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
@@ -2907,8 +3016,8 @@ def yun_nan():
                 flag = sendKafka(dic_news)
                 if flag:
                     save_data(dic_news)
-                    print(title)
+                    log.info(title)
+                    count += 1
                 num = num + 1
             except:
                 pass
...
@@ -2916,7 +3025,7 @@ def yun_nan():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     yun_nan1()
     yun_nan2()
...
@@ -2928,6 +3037,7 @@ def chong_qing():
     http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ 2
     """
     num = 0
+    count = 0
     pathType = 'policy/chongqing/'
     start_time = time.time()
     for page in range(0, 4):
@@ -2955,7 +3065,7 @@ def chong_qing():
                 num += 1
                 continue
             try:
-                print(href)
+                # print(href)
                 # href = 'https://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/202007/t20200728_7729850.html'
                 href_text = requests.get(url=href, headers=headers, verify=False).content
                 doc_href = pq(href_text)
...
@@ -2978,6 +3088,9 @@ def chong_qing():
                     pass
                 contentWithTag = doc_href.find('div', class_='zwxl-article')
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
             except:
                 origin = ''
                 topicClassification = ''
...
@@ -2986,7 +3099,9 @@ def chong_qing():
                 pub_hao = ''
                 contentWithTag = doc_href.find('div', class_='zwxl-content')
                 content = contentWithTag.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
             fu_jian_list = contentWithTag.find_all('a')
             # print(fu_jian_list)
             for fu_jian in fu_jian_list:
...
@@ -3039,21 +3154,22 @@ def chong_qing():
             flag = sendKafka(dic_news)
             if flag:
                 save_data(dic_news)
-                print(title)
+                log.info(title)
                 # save_data(result_dict)
+                count += 1
             num += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

 # 天津
 def tian_jin():
     pathType = 'policy/tianjin/'
     def tian_jin1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(0, 3):
             if page == 0:
@@ -3139,7 +3255,9 @@ def tian_jin():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -3167,18 +3285,20 @@ def tian_jin():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def tian_jin2():
         """
         http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
         """
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(0, 5):
             if page == 0:
@@ -3263,7 +3383,9 @@ def tian_jin():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -3291,15 +3413,17 @@ def tian_jin():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')

     def tian_jin3():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 3):
             if page == 1:
@@ -3391,7 +3515,9 @@ def tian_jin():
...
@@ -3391,7 +3515,9 @@ def tian_jin():
if
len
(
fu_jian_soup
)
<
1
:
if
len
(
fu_jian_soup
)
<
1
:
continue
continue
content
=
soup
.
text
content
=
soup
.
text
if
content
==
''
or
content
==
None
:
log
.
info
(
f
'-----{href}----{title}----内容为空-----'
)
continue
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
# todo:传kafka字段
# todo:传kafka字段
dic_news
=
{
dic_news
=
{
...
@@ -3419,12 +3545,13 @@ def tian_jin():
...
@@ -3419,12 +3545,13 @@ def tian_jin():
if
flag
:
if
flag
:
save_data
(
dic_news
)
save_data
(
dic_news
)
num
+=
1
num
+=
1
count
+=
1
except
:
except
:
pass
pass
except
:
except
:
pass
pass
end_time
=
time
.
time
()
end_time
=
time
.
time
()
print
(
f
'共抓取{
num
}条数据,共耗时{end_time - start_time}'
)
print
(
f
'共抓取{
count
}条数据,共耗时{end_time - start_time}'
)
tian_jin1
()
tian_jin1
()
tian_jin2
()
tian_jin2
()
...
@@ -3435,6 +3562,7 @@ def xin_jiang():
     pathType = 'policy/xinjiang/'
     def xin_jiang1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 10):
             if page == 1:
...
@@ -3493,6 +3621,9 @@ def xin_jiang():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
                 match_list = re.findall(pattern, content)
                 if len(match_list) > 0:
...
@@ -3528,15 +3659,17 @@ def xin_jiang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def xin_jiang_jsbt():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 6):
             if page == 1:
...
@@ -3592,6 +3725,9 @@ def xin_jiang():
                 if len(fu_jian_soup) < 1:
                     continue
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
                 match_list = re.findall(pattern, content)
                 if len(match_list) > 0:
...
@@ -3627,6 +3763,7 @@ def xin_jiang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
                 href_res.close()
             except:
                 pass
...
@@ -3634,7 +3771,7 @@ def xin_jiang():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     xin_jiang1()
     xin_jiang_jsbt()
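Both Xinjiang helpers recover the document number (发文字号) by scanning the page text with an alternation of prefixes, `r'(新国.{1,}?号)|(国资.{1,}?号)'`; the other provinces below swap in their own prefix (晋, 辽, 苏, 赣, 豫, 冀, ...). `re.findall` with a grouped alternation returns one tuple per match, one slot per group, so the usable value is the first non-empty slot of the first tuple. A small illustration on made-up text:

import re

pattern = r'(新国.{1,}?号)|(国资.{1,}?号)'
content = '经研究,现印发《……办法》(新国资发〔2023〕12号),请遵照执行。'
match_list = re.findall(pattern, content)
if len(match_list) > 0:
    # match_list[0] is a tuple like ('新国资发〔2023〕12号', '')
    pub_hao = next(g for g in match_list[0] if g)
    print(pub_hao)  # 新国资发〔2023〕12号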
...
@@ -3643,6 +3780,7 @@ def xin_jiang():
 def shan_xi():
     pathType = 'policy/shanxi/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 7):
         if page == 1:
...
@@ -3712,6 +3850,9 @@ def shan_xi():
             if len(fu_jian_soup) < 1:
                 continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(晋国资.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -3747,17 +3888,19 @@ def shan_xi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 辽宁
 def liao_ning():
     pathType = 'policy/liaoning/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 3):
         url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
...
@@ -3823,6 +3966,9 @@ def liao_ning():
             if len(contentWithTag) < 1:
                 continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(辽国资.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -3858,6 +4004,7 @@ def liao_ning():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
...
@@ -3869,6 +4016,7 @@ def liao_ning():
 def hei_long_jiang():
     pathType = 'policy/heilongjiang/'
     num = 0
+    count = 0
     start_time = time.time()
     for page in range(1, 3):
         url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
...
@@ -3926,6 +4074,9 @@ def hei_long_jiang():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -3953,6 +4104,7 @@ def hei_long_jiang():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
...
@@ -3960,11 +4112,12 @@ def hei_long_jiang():
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 江苏
 def jiang_su():
     num = 0
+    count = 0
     pathType = 'policy/jiangsu/'
     start_time = time.time()
     pagestart = 1
...
@@ -4034,6 +4187,9 @@ def jiang_su():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             if len(pub_hao) < 1:
                 pattern = r'(苏国.{1,}?号)|(国.{1,}?号)'
                 match_list = re.findall(pattern, content)
...
@@ -4068,18 +4224,20 @@ def jiang_su():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 安徽
 def an_hui():
     pathType = 'policy/anhui/'
     def an_hui1():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 4):
             url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...
@@ -4137,6 +4295,9 @@ def an_hui():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -4164,15 +4325,17 @@ def an_hui():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def an_hui2():
         num = 0
+        count = 0
         start_time = time.time()
         for page in range(1, 25):
             url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
...
@@ -4233,6 +4396,9 @@ def an_hui():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                 # todo:传kafka字段
                 dic_news = {
...
@@ -4260,6 +4426,7 @@ def an_hui():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
                 href_res.close()
             except:
                 pass
...
@@ -4267,7 +4434,7 @@ def an_hui():
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     an_hui1()
     an_hui2()
...
@@ -4280,6 +4447,7 @@ def jiang_xi():
     121-164
     """
     num = 0
+    count = 0
     pathType = 'policy/jiangxi/'
     start_time = time.time()
     startrecord = 1
...
@@ -4360,6 +4528,9 @@ def jiang_xi():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             if len(pub_hao) < 1:
                 pattern = r'(赣国资.{1,}?号)|(国.{1,}?号)'
                 match_list = re.findall(pattern, content)
...
@@ -4395,16 +4566,18 @@ def jiang_xi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 河南
 def he_nan():
     num = 0
+    count = 0
     pathType = 'policy/henan/'
     start_time = time.time()
     for page in range(0, 7):
...
@@ -4456,6 +4629,9 @@ def he_nan():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(豫国.{1,}?号)|(国.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -4489,16 +4665,18 @@ def he_nan():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
             href_res.close()
         resp_text.close()
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 湖南
 def hu_nan():
     num = 0
+    count = 0
     pathType = 'policy/hunan/'
     start_time = time.time()
     for page in range(1, 7):
...
@@ -4565,6 +4743,9 @@ def hu_nan():
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -4592,18 +4773,20 @@ def hu_nan():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 甘肃
 def gan_su():
     pathType = 'policy/gansu/'
     def gan_su1():
         num = 0
+        count = 0
         start_time = time.time()
         bro = getDriver()
         urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
...
@@ -4686,6 +4869,9 @@ def gan_su():
                 # id_ = redefid(id_list)
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
                 # publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
                 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -4715,6 +4901,7 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
                 pass
...
@@ -4724,6 +4911,7 @@ def gan_su():
     def gan_su2():
         num = 0
+        count = 0
         start_time = time.time()
         bro = getDriver()
         url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
...
@@ -4821,6 +5009,9 @@ def gan_su():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if len(content) < 2:
                     continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
...
@@ -4852,6 +5043,7 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
         except Exception as e:
...
@@ -4859,10 +5051,11 @@ def gan_su():
             pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def gan_su3():
         num = 0
+        count = 0
         start_time = time.time()
         # # service = Service(r'D:/chrome/103/chromedriver.exe')
         # chrome_options = webdriver.ChromeOptions()
...
@@ -4979,6 +5172,9 @@ def gan_su():
                 contentWithTag = str(soup.prettify())
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 if len(content) < 2:
                     continue
                 # t = time.strptime(publishDate, "%Y年%m月%d日")
...
@@ -5010,13 +5206,14 @@ def gan_su():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except Exception as e:
                 print(e)
         except:
             pass
         bro.quit()
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     gan_su1()
     gan_su2()
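The three Gansu crawlers render pages through Selenium via the project's `getDriver()` helper and tear the browser down with `bro.quit()`. The helper's body is outside this diff; a plausible minimal equivalent, assuming headless Chrome (the options and the usage below are illustrative, not the repo's actual configuration):

from selenium import webdriver

def getDriver():
    # hypothetical reconstruction of the helper used by gan_su1/2/3
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    return webdriver.Chrome(options=chrome_options)

bro = getDriver()
try:
    bro.get('http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml')
    html = bro.page_source
finally:
    bro.quit()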
...
@@ -5025,6 +5222,7 @@ def gan_su():
 # 宁夏
 def ning_xia():
     num = 0
+    count = 0
     pathType = 'policy/ningxia/'
     start_time = time.time()
     for page in range(0, 3):
...
@@ -5082,6 +5280,9 @@ def ning_xia():
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             t = time.strptime(publishDate, "%Y年%m月%d日")
             publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
...
@@ -5111,16 +5312,18 @@ def ning_xia():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
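`ning_xia` is the one crawler in this file that normalizes a Chinese-formatted publish date before filling the kafka dict: `time.strptime` parses the page's 2023年10月11日 style and `time.strftime` re-emits it in the `%Y-%m-%d %H:%M:%S` form the rest of the pipeline expects. Isolated, the conversion is:

import time

publishDate = '2023年10月11日'
t = time.strptime(publishDate, "%Y年%m月%d日")
publishDate = time.strftime("%Y-%m-%d %H:%M:%S", t)
print(publishDate)  # 2023-10-11 00:00:00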
 # 陕西
 def shanxi():
     num = 0
+    count = 0
     pathType = 'policy/shan_xi/'
     start_time = time.time()
     url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
...
@@ -5184,6 +5387,9 @@ def shanxi():
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -5211,6 +5417,7 @@ def shanxi():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
             res_href.close()
         except:
             pass
...
@@ -5218,7 +5425,7 @@ def shanxi():
     except:
         pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 西藏
 def xi_zang():
...
@@ -5228,6 +5435,7 @@ def xi_zang():
                 'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
     for url in url_list:
         num = 0
+        count = 0
         try:
             res = requests.get(url=url, headers=headers)
             res.encoding = res.apparent_encoding
...
@@ -5256,6 +5464,9 @@ def xi_zang():
                 contentWithTag = str(i_soup.find(id='NewsContent'))
                 soup = BeautifulSoup(contentWithTag, 'html.parser')
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{href}----{title}----内容为空-----')
+                    continue
                 fu_jian_soup = soup.find_all('a')
                 id_list = []
                 for file in fu_jian_soup:
...
@@ -5306,18 +5517,20 @@ def xi_zang():
                 if flag:
                     save_data(dic_news)
                     num += 1
+                    count += 1
             except:
                 pass
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
 # 青海
 def qing_hai():
     pathType = 'policy/qinghai/'
     def qing_hai1():
         num = 0
+        count = 0
         start_time = time.time()
         url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
         try:
...
@@ -5353,6 +5566,9 @@ def qing_hai():
                 origin = str(page.find('div', attrs={'class': 'foot-fb'}))
                 soup = BeautifulSoup(contentWithTag, 'html.parser')
                 content = soup.text
+                if content == '' or content == None:
+                    log.info(f'-----{durl}----{title}----内容为空-----')
+                    continue
                 fu_jian_soup = soup.find_all('a')
                 id_list = []
                 for file in fu_jian_soup:
...
@@ -5364,7 +5580,7 @@ def qing_hai():
                         or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                         or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                         file_name = file.text.strip()
-                        retData = baseCore.uploadToserver(file_href, '1681')
+                        retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                         if retData['state']:
                             pass
                         else:
...
@@ -5405,15 +5621,17 @@ def qing_hai():
                 # print(id)
                 # id_list.append(id)
                 num += 1
+                count += 1
             except:
                 pass
         except:
             pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     def qing_hai2():
         num = 0
+        count = 0
         start_time = time.time()
         urls = [
             'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
...
@@ -5446,6 +5664,7 @@ def qing_hai():
                     durl = tr.find('a').get('href')
                     is_href = db_storage.find_one({'网址': durl})
                     if is_href:
+                        num += 1
                         log.info('已采集----------跳过')
                         continue
                     title = tr.find('a').text
...
@@ -5471,6 +5690,9 @@ def qing_hai():
                     origin = ''
                     soup = BeautifulSoup(contentWithTag, 'html.parser')
                     content = soup.text
+                    if content == '' or content == None:
+                        log.info(f'-----{durl}----{title}----内容为空-----')
+                        continue
                     fu_jian_soup = soup.find_all('a')
                     id_list = []
                     for file in fu_jian_soup:
...
@@ -5482,7 +5704,7 @@ def qing_hai():
                             or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                             or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                             file_name = file.text.strip()
-                            retData = baseCore.uploadToserver(file_href, '1681')
+                            retData = baseCore.uptoOBS(file_href, '1681', pathType, file_name)
                             if retData['state']:
                                 pass
                             else:
...
@@ -5490,7 +5712,7 @@ def qing_hai():
                             att_id, full_path = baseCore.tableUpdate(retData, '青海省国资委', file_name, num)
                             id_list.append(att_id)
                             # todo:将返回的地址更新到soup
-                            file['href'] = 'http://114.115.215.96/' + full_path
+                            file['href'] = full_path
                         # id_ = redefid(id_list)
                         contentWithTag = str(soup.prettify())
                         # todo:替换完成之后,将附件上传至文件服务器
...
@@ -5523,13 +5745,14 @@ def qing_hai():
                     # print(id)
                     # id_list.append(id)
                     num += 1
+                    count += 1
                 except:
                     pass
                 res.close()
            except:
                pass
         end_time = time.time()
-        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+        print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
     qing_hai1()
     qing_hai2()
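These hunks show the change the commit message announces: `baseCore.uploadToserver(file_href, '1681')` becomes `baseCore.uptoOBS(file_href, '1681', pathType, file_name)`, so attachments land in OBS object storage under a per-site prefix (`policy/qinghai/` here) rather than on the old file server. Both helpers live in BaseCore outside this diff; a stand-in class just to contrast the two call shapes (the return values here are invented for the demo, only the signatures come from the call sites):

class FakeBaseCore:
    """Illustrative stub; the real BaseCore methods also download and upload the file."""
    def uploadToserver(self, file_href, source_id):
        # old style: the server path is derived from the source id
        return {'state': True, 'path': '/files/' + source_id + '/' + file_href.rsplit('/', 1)[-1]}
    def uptoOBS(self, file_href, source_id, pathType, file_name):
        # new style: the OBS key is the per-site prefix plus the attachment's display name
        return {'state': True, 'path': pathType + file_name}

baseCore = FakeBaseCore()
print(baseCore.uploadToserver('http://example.com/a.pdf', '1681')['path'])
print(baseCore.uptoOBS('http://example.com/a.pdf', '1681', 'policy/qinghai/', '附件.pdf')['path'])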
...
@@ -5537,6 +5760,8 @@ def qing_hai():
 # 河北
 def he_bei():
     num = 0
+    count = 0
+    pathType = 'policy/hebei/'
     start_time = time.time()
     url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
     try:
...
@@ -5551,6 +5776,7 @@ def he_bei():
             href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(id)
             is_href = db_storage.find_one({'网址': href})
             if is_href:
+                num += 1
                 continue
             pub_time_ = info['updated']
             m = round(pub_time_ / 1000)  # 四舍五入取10位时间戳(秒级)
...
@@ -5569,7 +5795,7 @@ def he_bei():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                     file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1668')
+                    retData = baseCore.uptoOBS(file_href, '1668', pathType, file_name)
                     if retData['state']:
                         pass
                     else:
...
@@ -5577,13 +5803,16 @@ def he_bei():
                     att_id, full_path = baseCore.tableUpdate(retData, '河北省国资委', file_name, num)
                     id_list.append(att_id)
                     # todo:将返回的地址更新到soup
-                    file['href'] = 'http://114.115.215.96/' + full_path
+                    file['href'] = full_path
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             if len(contentWithTag) < 1:
                 if len(fu_jian_soup) < 1:
                     continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             pattern = r'(冀国.{1,}?号)|(国资.{1,}?号)'
             match_list = re.findall(pattern, content)
             if len(match_list) > 0:
...
@@ -5619,14 +5848,17 @@ def he_bei():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except:
             pass
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
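The companion change in the same hunks: once an attachment is uploaded, the `<a>` tag inside the parsed page is repointed at the returned `full_path` alone (previously `'http://114.115.215.96/' + full_path`), and only then is `contentWithTag` re-serialized with `soup.prettify()`. A runnable miniature of that rewrite-then-serialize step (the sample HTML and path are made up):

from bs4 import BeautifulSoup

html = '<p>正文 <a href="http://hbsa.hebei.gov.cn/files/1.pdf">附件1.pdf</a></p>'
soup = BeautifulSoup(html, 'html.parser')
for file in soup.find_all('a'):
    full_path = 'policy/hebei/附件1.pdf'  # would come from baseCore.tableUpdate in the real code
    file['href'] = full_path             # new style: no http://114.115.215.96/ prefix
contentWithTag = str(soup.prettify())
print(contentWithTag)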
 # 湖北
 def hu_bei():
     num = 0
+    count = 0
+    pathType = 'policy/hubei/'
     start_time = time.time()
     hrefs = []
     url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
...
@@ -5649,6 +5881,7 @@ def hu_bei():
     for href in hrefs:
         is_href = db_storage.find_one({'网址': href})
         if is_href:
+            num += 1
             continue
         try:
             driver.get(href)
...
@@ -5684,7 +5917,7 @@ def hu_bei():
                     or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                     or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                     file_name = file.text.strip()
-                    retData = baseCore.uploadToserver(file_href, '1675')
+                    retData = baseCore.uptoOBS(file_href, '1675', pathType, file_name)
                     if retData['state']:
                         pass
                     else:
...
@@ -5692,14 +5925,16 @@ def hu_bei():
                 att_id, full_path = baseCore.tableUpdate(retData, '湖北省国资委', file_name, num)
                 id_list.append(att_id)
                 # todo:将返回的地址更新到soup
-                file['href'] = 'http://114.115.215.96/' + full_path
+                file['href'] = full_path
             # id_ = redefid(id_list)
             contentWithTag = str(soup.prettify())
             if len(contentWithTag) < 1:
                 if len(fu_jian_soup) < 1:
                     continue
             content = soup.text
+            if content == '' or content == None:
+                log.info(f'-----{href}----{title}----内容为空-----')
+                continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段
             dic_news = {
...
@@ -5727,48 +5962,49 @@ def hu_bei():
             if flag:
                 save_data(dic_news)
                 num += 1
+                count += 1
         except Exception as e:
             pass
     driver.close()
     end_time = time.time()
-    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')
+    print(f'共抓取{count}条数据,共耗时{end_time - start_time}')
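Like the other crawlers, `he_bei` and `hu_bei` dedupe against MongoDB before fetching: `db_storage.find_one({'网址': href})` asks whether a document with that URL was stored by an earlier run, and this commit adds `num += 1` inside the skip branch so previously collected items still show up in the totals. A sketch of the lookup against a local collection (the URI and the database/collection names are illustrative, not the repo's configuration):

from pymongo import MongoClient

db_storage = MongoClient('mongodb://127.0.0.1:27017/')['zzsn']['policy']  # illustrative names
num = 0
for href in ['http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/example.shtml']:
    is_href = db_storage.find_one({'网址': href})
    if is_href:
        num += 1  # seen on a previous run: counted, but not re-fetched
        continue
    # ... fetch, parse and save_data(...) here; save_data records the URL for next time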
 if __name__ == '__main__':
-    # get_content1()
+    get_content1()
-    # get_content2()
+    get_content2()
-    # get_content3()
+    get_content3()
-    # bei_jing()
+    bei_jing()
-    # nei_meng_gu()
+    nei_meng_gu()
     ji_lin()
-    # shang_hai()
+    shang_hai()
-    # zhe_jiang()
+    zhe_jiang()
-    # fu_jian()
+    fu_jian()
-    # shan_dong()
+    shan_dong()
-    # guang_dong()
+    guang_dong()
-    # hai_nan()
+    hai_nan()
-    # si_chuan()
+    si_chuan()
-    # guang_xi()
+    guang_xi()
-    # gui_zhou()
+    gui_zhou()
-    # yun_nan()
+    yun_nan()
-    # chong_qing()
+    chong_qing()
-    # tian_jin()
+    tian_jin()
-    # xin_jiang()
+    xin_jiang()
-    # shan_xi()
+    shan_xi()
-    # liao_ning()
+    liao_ning()
-    # hei_long_jiang()
+    hei_long_jiang()
-    # jiang_su()
+    jiang_su()
-    # an_hui()
+    an_hui()
-    # jiang_xi()
+    jiang_xi()
-    # he_nan()
+    he_nan()
-    # hu_nan()
+    hu_nan()
-    # gan_su()
+    gan_su()
-    # ning_xia()
+    ning_xia()
-    # xi_zang()
+    xi_zang()
-    # shanxi()
+    shanxi()
-    # qing_hai()
+    qing_hai()
-    # he_bei()
+    he_bei()
-    # qing_hai()
+    qing_hai()
-    # current_time = datetime.datetime.now()
+    current_time = datetime.datetime.now()
-    # midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
+    midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
-    # sleep_seconds = (midnight_time - current_time).total_seconds()
+    sleep_seconds = (midnight_time - current_time).total_seconds()
-    # time.sleep(sleep_seconds)
+    time.sleep(sleep_seconds)
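The tail of `__main__`, uncommented by this commit, makes the run self-rescheduling: it computes the seconds from now to the next midnight and sleeps that long, presumably so an outer loop or supervisor kicks off the next crawl at 00:00. The arithmetic stands alone:

import datetime
import time

current_time = datetime.datetime.now()
# today at 00:00:00, pushed forward one day = the coming midnight
midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
sleep_seconds = (midnight_time - current_time).total_seconds()
print(f'sleeping {sleep_seconds:.0f}s until {midnight_time}')
time.sleep(sleep_seconds)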