福建省人民政府

3d6d75b6 · 薛凌堃 · db64a87a · 3d6d75b6
--- a/REITs专题数据/policy-fujian.py
+++ b/REITs专题数据/policy-fujian.py
 import time
@@ -5,10 +5,15 @@ import numpy as np
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '福建省人民政府'
 headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
@@ -42,10 +47,10 @@ def getDataJson(data_post):
 def getContent(num, url, publishDate):
+    id_list = []
    url_ = url.split('/')[-1]
    url_ = url.replace(url_, '')
-    fjhref_list = ''
-    fjtitle_list = ''
    soup = getSoup(url)
    contentWithTag = soup.find('div', class_='TRS_Editor')
    try:
@@ -63,27 +68,24 @@ def getContent(num, url, publishDate):
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        fj_href = a.get('href').replace('./', url_)
-        fjhref_list += fj_href + '\n'
        fj_title = a.text.lstrip().strip()
        category = os.path.splitext(fj_href)[1]
        if category not in fj_title:
            fj_title = fj_title + category
-        fj_title = f'{num}-{publishDate}-{fj_title}'
-        fjtitle_list += fj_title + '\n'
+        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
-        fjcontent = getFjContent(fj_href)
+        if att_id:
-        file = f'./相关政策/福建省人民政府/政策文件/{fj_title}'
+            id_list.append(att_id)
-        with open(file, 'wb') as f:
+            a['href'] = full_path
-            f.write(fjcontent)
-        log.info(f'{fj_title}===附件下载成功')
-    fjtitle_list = fjtitle_list.lstrip().strip()
-    fjhref_list = fjhref_list.lstrip().strip()
    content = contentWithTag.text.lstrip().strip()
-    return content, fjtitle_list, fjhref_list
+    return content, contentWithTag, id_list
 def doJob():
-    if not os.path.exists('./相关政策/福建省人民政府/政策文件'):
-        os.makedirs('./相关政策/福建省人民政府/政策文件')
    data_posts = [{
        'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '1', 'jiGuanList': '',
        'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
@@ -114,10 +116,15 @@ def doJob():
            publishDate = data_['crtime'].replace('.','-')
            origin = data_['docsourcename']
            href = data_['docpuburl']
+            # 根据链接判重
+            is_member = baseCore.r.sismember('REITs::' + webname, href)
+            if is_member:
+                continue
            try:
                writtenDate = data_['pubdate'].replace('.','-')
            except:
-                writtenDate = ''
+                writtenDate = None
            try:
                organ = data_['puborg']
            except:
@@ -128,16 +135,37 @@ def doJob():
                pub_hao = ''
            summary = data_['doccontent']
            summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
-            content, fjtitle_list, fjhref_list = getContent(num, href, publishDate[:10])
+            content, contentWithTag, id_list = getContent(num, href, publishDate[:10])
-            data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    fjhref_list]
+            contentWithTag_str = str(contentWithTag)
-            data_list.append(data)
+            dic_info = {
-            log.info(f'{title}===采集成功')
+                'attachmentIds': id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': contentWithTag_str,
+                'deleteFlag': 0,
+                'id': '',
+                'title': title,
+                'publishDate': publishDate,
+                'origin': origin,
+                'sourceAddress': href,
+                'writtenDate': writtenDate,
+                'organ': organ,
+                'topicClassification': '',
+                'issuedNumber': pub_hao,
+                'summary': summary,
+                'createDate': time_now,
+                'sid': '1729043067106865154',
+            }
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, href)
+                log.info(f'采集成功--{title}--{href}')
+            except Exception as e:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
            num += 1
            time.sleep(1)
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/福建省人民政府/福建省人民政府政策文件.xlsx', index=False)
 if __name__ == '__main__':