Commit eeb41ef7 by 薛凌堃

Policies and regulations

Parent d5722767
@@ -725,7 +725,7 @@ def ji_lin():
     if is_href:
         continue
     try:
-        # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj/202211/t20221123_2310750.html'
+        # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
         href_text = requests.get(url=real_href, headers=headers, verify=False)
         i_html = href_text.text.encode("ISO-8859-1")
         i_html = i_html.decode("utf-8")
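
Review note: the ISO-8859-1 round-trip in the hunk above is the standard fix for requests' charset fallback: when a server sends no charset header, requests decodes the body as ISO-8859-1, so UTF-8 bytes arrive garbled. A minimal sketch under that assumption (fetch_utf8 is a hypothetical helper, not part of this patch):

    import requests

    def fetch_utf8(url, headers=None):
        resp = requests.get(url, headers=headers, verify=False)
        # undo requests' ISO-8859-1 fallback and re-decode as UTF-8;
        # resp.content.decode("utf-8") is the simpler equivalent, and
        # resp.encoding = resp.apparent_encoding (used in fu_jian below)
        # lets requests sniff the charset instead
        return resp.text.encode("ISO-8859-1").decode("utf-8")
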
@@ -733,7 +733,8 @@ def ji_lin():
     # print(i_soup)
     # convert relative paths into absolute paths
     soup = paserUrl(i_soup, real_href)
-    text = str(soup.prettify())
+    soup.prettify()
     try:
         i_come = i_soup.find('span', class_='source')
         i_time = i_soup.find('span', class_='time')
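
Review note: paserUrl's body is not shown in this diff. Assuming it rewrites relative href/src attributes against the page URL, a sketch could look like the following (paser_url_sketch is hypothetical, not the project's implementation):

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def paser_url_sketch(soup, base_url):
        # rewrite relative links and image sources into absolute URLs
        for tag in soup.find_all(href=True):
            tag['href'] = urljoin(base_url, tag['href'])
        for tag in soup.find_all(src=True):
            tag['src'] = urljoin(base_url, tag['src'])
        return soup
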
@@ -756,9 +757,18 @@ def ji_lin():
     pub_time = pub.find(class_='left').find('span', class_='time').text
     pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
     # print(pub_come)
-    i_content = i_soup.find(class_='zsy_comain')
+    i_content = soup.find(class_='zsy_comain')
     if i_content:
-        content = str(i_content)
+        print(real_href)
+        # remove the "scan QR code" widget
+        soup.find('div', id='qr_container').decompose()
+        soup.find('div', id='div_div').decompose()
+        # remove style tags
+        for styleTag in soup.find_all('style'):
+            styleTag.extract()
+        contentWithTag = soup.find(class_='zsy_comain')
+        content = contentWithTag.text.strip()
         # issued document number
         find_hao = i_content.find_all('p')[:3]
         pub_hao = ''
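
Review note: decompose() destroys a tag in place while extract() detaches and returns it; both removal calls above are unguarded, so find() returning None (e.g. a page without the QR widget) raises AttributeError. A runnable sketch of the safer pattern:

    from bs4 import BeautifulSoup

    html = '<div id="qr_container">scan</div><style>p{}</style><p>body</p>'
    soup = BeautifulSoup(html, 'html.parser')
    qr = soup.find('div', id='qr_container')
    if qr:                       # guard: find() returns None when absent
        qr.decompose()           # removes and frees the tag
    for style_tag in soup.find_all('style'):
        style_tag.extract()      # detaches the tag (and returns it)
    print(soup)                  # -> <p>body</p>
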
@@ -767,7 +777,7 @@ def ji_lin():
             pub_hao = j.text
         else:
             continue
-    fj = i_soup.find('div', style='width:920px; margin: 0 auto;')
+    fj = soup.find('div', style='width:920px; margin: 0 auto;')
     if fj:
         li_list = fj.find_all('li')
         for li in li_list:
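
Review note: matching on the literal style string 'width:920px; margin: 0 auto;' breaks as soon as the inline CSS shifts by one space. BeautifulSoup also accepts a compiled regex as an attribute value, which would be sturdier here (a sketch, not part of the patch):

    import re
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div style="width:920px; margin:0 auto;">fj</div>', 'html.parser')
    fj = soup.find('div', style=re.compile(r'width:\s*920px'))  # survives spacing changes
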
@@ -790,16 +800,20 @@ def ji_lin():
         else:
             continue
     else:
-        i_content = i_soup.find(class_="content")
+        i_content = soup.find(class_="content")
         # remove the attachment entries from the article body
         pattern = r'\d+\.'
+        # pattern = r"附件:\d+\.\s*(.*)"
         for p in i_content.find_all('div')[-10:]:
             p_text = p.text
             matches = re.findall(pattern, p_text)
-            for k in matches:
-                if k in p_text:
-                    p.extract()
-        content = str(i_content)
+            if matches:
+                for k in matches:
+                    if k in p_text:
+                        p.extract()
+        contentWithTag = i_content
+        content = contentWithTag.text.strip()
     # find attachments and upload them to the file server
     fj_soup = i_soup.find('div', class_='wenjianfujian')
     fj_list = fj_soup.find_all('a')
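
Review note: the pattern r'\d+\.' flags numbered attachment entries; the new "if matches:" guard only skips an empty inner loop, and the "k in p_text" test is always true because findall's results are substrings of p_text. The pruning therefore condenses to this equivalent sketch:

    import re
    from bs4 import BeautifulSoup

    html = '<div class="content"><div>正文</div><div>附件:1.方案.pdf</div></div>'
    i_content = BeautifulSoup(html, 'html.parser').find(class_='content')
    for p in i_content.find_all('div')[-10:]:
        if re.findall(r'\d+\.', p.text):   # looks like a numbered attachment line
            p.extract()                    # drop it from the body
    print(i_content.text)                  # -> 正文
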
@@ -815,7 +829,7 @@ def ji_lin():
         pass
     else:
         continue
     att_id, full_path = baseCore.tableUpdate(retData, '吉林国资委', file_name, num)
     id_list.append(att_id)
     # TODO: update the returned address back into the soup
@@ -836,8 +850,8 @@ def ji_lin():
 dic_news = {
     'attachmentIds': id_list,
     'author': '',
-    'content': str(i_content.text),
-    'contentWithTag': content,
+    'content': content,
+    'contentWithTag': contentWithTag,
     'createDate': time_now,
     'deleteFlag': 0,
     'id': '',
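
Review note: sendKafka and save_data are defined elsewhere in this file and are not shown in the diff. Assuming a kafka-python producer serializing dic_news as JSON, a stand-in could look like this (broker address and topic name are placeholders):

    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',  # assumed broker address
        value_serializer=lambda d: json.dumps(d, ensure_ascii=False).encode('utf-8'),
    )

    def send_kafka_sketch(dic_news):
        producer.send('policy', dic_news)    # 'policy' topic is an assumption
        producer.flush()
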
@@ -1168,15 +1182,17 @@ def fu_jian():
     i_html = href_text.text
     i_soup = BeautifulSoup(i_html, 'html.parser')
     real_href = href
+    real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
     # print(real_href)
-    is_href = db_storage.find_one({'网址': real_href})
-    if is_href:
-        continue
+    # is_href = db_storage.find_one({'网址': real_href})
+    # if is_href:
+    #     continue
     try:
-        # # the article is a remote PDF
-        #download the file straight to the server and parse out the body text
+        # the article is a remote PDF
+        # download the file straight to the server and parse out the body text
         if '.pdf' in real_href:
+            # pass
             resp_content = requests.get(real_href, headers=headers, verify=False, timeout=20).content
             # parse the text content out of the PDF
             content = baseCore.pdf_content(resp_content)
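
Review note: baseCore.pdf_content is not shown here. Assuming it extracts plain text from the downloaded bytes, a stand-in using PyMuPDF could be:

    import fitz  # PyMuPDF

    def pdf_content_sketch(pdf_bytes):
        # open the in-memory PDF and concatenate the text of every page
        with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
            return ''.join(page.get_text() for page in doc)
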
@@ -1195,8 +1211,6 @@ def fu_jian():
 else:
     try:
-        real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
         href_text = requests.get(url=real_href, headers=headers, verify=False)
         href_text.encoding = href_text.apparent_encoding
         i_html = href_text.text
@@ -1208,6 +1222,7 @@ def fu_jian():
 try:
     fu_jian_list = i_soup.find('ul', class_='clearflx myzj_xl_list').find_all('a')
 except:
+    pass
     fu_jian_list = []
 for fu_jian in fu_jian_list:
     fj_href = fu_jian['href']
@@ -1234,29 +1249,12 @@ def fu_jian():
             pub_hao = ''
     except:
-        print(f'-------其他情况:{real_href}-------')
-        continue
-        # href_text = requests.get(url=real_href, headers=headers, verify=False)
-        # href_text.encoding = href_text.apparent_encoding
-        # i_html = href_text.text
-        # i_soup = BeautifulSoup(i_html, 'html.parser')
-        # i_soup = paserUrl(i_soup, real_href)
-        # # print(i_soup)
-        # source = str(i_soup.find('table', attrs={'class': 'tp-pho'}).text)
-        # pub_hao = source.split('文号')[1].split('发布机构')[0].strip().lstrip()
-        # pub_source = source.split('发布机构')[1].split('生成日期')[0].strip().lstrip()
-        # pub_time = source.split('生成日期')[1].split('标题')[0].strip().lstrip()
-        # content = i_soup.find('div', attrs={'class': 'xl-article-nr'})
-        # fu_jian_result = re.findall('href="(.*?)"', str(content))
-        # fu_jian_href_list = []
-        # if len(fu_jian_result) > 0:
-        #     for fu_jian_re in fu_jian_result:
-        #         if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-        #                 or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-        #                 or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-        #             fu_jian_href = fu_jian_re
-        #             print(fu_jian_href)
-        #             fu_jian_href_list.append(fu_jian_href)
+        pub_source = ''
+        pub_time = ''
+        contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')
+        content = contentwithtag.text.strip()
+        pub_hao = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf').text.strip()
 time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
 # TODO: fields sent to Kafka
 dic_news = {
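
Review note: BeautifulSoup treats a bare string passed to find() as a tag name, never a class, so multi-class containers such as the rules_con1 node in the hunk above must be matched with class_= (exact class-attribute string) or a CSS selector (order-independent). A runnable sketch:

    from bs4 import BeautifulSoup

    i_soup = BeautifulSoup('<div class="tabs tab_base_01 rules_con1">正文</div>', 'html.parser')
    node = i_soup.find(class_='tabs tab_base_01 rules_con1')    # exact class string
    node = i_soup.select_one('.tabs.tab_base_01.rules_con1')    # order-independent
    print(node.text)                                            # -> 正文
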
@@ -1283,7 +1281,6 @@ def fu_jian():
     sendKafka(dic_news)
     save_data(dic_news)
     print(title)
-    # save_data(result_dict)
     num += 1
 except:
     pass
@@ -1727,7 +1724,8 @@ def hai_nan():
     'summary': '',
     'title': title
 }
+sendKafka(dic_news)
+save_data(dic_news)
 href_text.close()
 # save_data(result_dict)
 print(title)
@@ -1777,7 +1775,7 @@ def hai_nan():
     contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
     content = contentWithTag.text
 except:
-    print(href)
+    # print(href)
     pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
     topicClassification = ''
     origin = str(pub_result.text).split('来源:')[1].split(' 【字体:')[0].lstrip().strip()
@@ -1991,31 +1989,31 @@ def hai_nan():
         pub_hao = ''
     contentWithTag = doc_href.find(class_='pages_content')
     content = contentWithTag.text
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     # TODO: fields sent to Kafka
     dic_news = {
         'attachmentIds': [],
         'author': '',
         'content': content,
         'contentWithTag': str(contentWithTag),
         'createDate': time_now,
         'deleteFlag': 0,
         'id': '',
         'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
         'origin': '',
         'organ': pub_source,
         'topicClassification': '',
         'issuedNumber': pub_hao,
         'publishDate': pub_time,
         'writtenDate': '',
         'sid': '1697458829758697473',
         'sourceAddress': i_href,
         'summary': '',
         'title': title
     }
     sendKafka(dic_news)
     save_data(dic_news)
     href_text.close()
     # save_data(result_dict)
     print(title)
@@ -2411,9 +2409,8 @@ def gui_zhou():
     'title': title
 }
 # print(dic_news)
-# sendKafka(dic_news)
-# save_data(dic_news)
+sendKafka(dic_news)
+save_data(dic_news)
 print(title)
 # save_data(result_dict)
 num = num + 1
@@ -2700,7 +2697,7 @@ def chong_qing():
     contentWithTag = doc_href.find('div', class_='zwxl-article')
     content = contentWithTag.text
 except:
-    pub_source = ''
+    origin = ''
     topicClassification = ''
     pub_time = ''
     writtenDate = ''
@@ -2742,7 +2739,7 @@ def chong_qing():
     'id': '',
     'labels': [{'relationId': "1693", 'relationName': "重庆市国资委",
                 'labelMark': "policy"}],
-    'origin': '',
+    'origin': origin,
     'organ': '',
     'topicClassification': topicClassification,
     'issuedNumber': pub_hao,
@@ -5392,7 +5389,7 @@ if __name__ == '__main__':
     # ji_lin()
     # shang_hai()
     # zhe_jiang()
-    # fu_jian()
+    fu_jian()
     # shan_dong()
     # guang_dong()
     # hai_nan()
...