Commit 5bba0870  Author: 薛凌堃

江西省人民政府

Parent 07bd3604
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
from reits import Policy
policy = Policy()
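# BaseCore and Policy are in-house helpers. As used below, baseCore exposes a logger,
# a redis client (baseCore.r), sendkafka() and deliteATT(); policy.attuributefile()
# registers an attachment and returns its id and the stored file path.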
topic = 'policy'
webname = '江西省人民政府'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
@@ -54,15 +64,14 @@ def getDataJson():
def getContent(url, num, publishDate):
    fjhref_list = ''
    fjtitle_list = ''
    id_list = []
    soup = getSoup(url)
    contentWithTag = soup.find('div', attrs={'id': 'zoom'})
    img_list = contentWithTag.find_all('img')
    num_ = 1
    for img in img_list:
        fj_href = 'http://www.jiangxi.gov.cn' + img.get('src')
        fjhref_list += fj_href + '\n'
        fj_title = img.get('title')
        # img.get('title') returns None when the attribute is missing, so fall back on a counter
        if not fj_title:
            fj_title = str(num_)
@@ -70,13 +79,11 @@ def getContent(url, num, publishDate):
        category = os.path.splitext(fj_href)[1]
        if category not in fj_title:
            fj_title = fj_title + category
        fj_title = f'{num}-{publishDate}-{fj_title}'
        fjtitle_list += fj_title + '\n'
        fjcontent = getFjContent(fj_href)
        file = f'./相关政策/江西省人民政府/政策文件/{fj_title}'
        with open(file, 'wb') as f:
            f.write(fjcontent)
        log.info(f'{fj_title}===附件下载成功')
        # Register the attachment with the in-house helper; it returns an attachment id and the stored path
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            img['href'] = full_path
    try:
        scripts = contentWithTag.find_all('script')
        for script in scripts:
@@ -90,12 +97,11 @@ def getContent(url, num, publishDate):
    except:
        pass
    content = contentWithTag.text.lstrip().strip()
    return content, contentWithTag, id_list
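
# getSoup() and getFjContent() are defined elsewhere in this file and are collapsed out
# of this diff. A minimal sketch of what they presumably do, assuming plain requests +
# BeautifulSoup with the headers above (names match the calls above, bodies are guesses):
#
#     def getSoup(url):
#         req = requests.get(url, headers=headers, timeout=30)
#         req.encoding = req.apparent_encoding
#         return BeautifulSoup(req.text, 'html.parser')
#
#     def getFjContent(url):
#         # raw bytes of an attachment, ready to be written to disk
#         return requests.get(url, headers=headers, timeout=30).content
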
def doJob():
    if not os.path.exists('./相关政策/江西省人民政府/政策文件'):
        os.makedirs('./相关政策/江西省人民政府/政策文件')
    data_json = getDataJson()
    num = 1
@@ -111,18 +117,43 @@ def doJob():
            -1].text.lstrip().strip()
        summary = soup.find('table', class_='jcse-service-table').find_all('tr')[2].text.lstrip().strip()
        href = soup.find('table', class_='jcse-service-table').find_all('tr')[3].find('a').get('href')
        # Deduplicate by link: skip articles whose URL is already in the redis set
        is_member = baseCore.r.sismember('REITs::' + webname, href)
        if is_member:
            continue
        publishDate = writtenDate
        origin = '江西省人民政府'
        content, contentWithTag, id_list = getContent(href, num, publishDate)
        num += 1
        contentWithTag_str = str(contentWithTag)
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_info = {
            'attachmentIds': id_list,
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag_str,
            'deleteFlag': 0,
            'id': '',
            'title': title,
            'publishDate': publishDate,
            'origin': origin,
            'sourceAddress': href,
            'writtenDate': writtenDate,
            'organ': organ,
            'topicClassification': '',
            'issuedNumber': pub_hao,
            'summary': summary,
            'createDate': time_now,
            'sid': '1729043445107838978',
        }
        try:
            baseCore.sendkafka(dic_info, topic)
            baseCore.r.sadd('REITs::' + webname, href)
            log.info(f'采集成功--{title}--{href}')
        except Exception as e:
            # If the kafka send fails, remove the attachments already registered for this article
            for att_id in id_list:
                baseCore.deliteATT(att_id)


if __name__ == '__main__':
    doJob()
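
# For reference, the dedup set written by doJob() can be inspected from redis-cli,
# e.g. SMEMBERS "REITs::江西省人民政府" (illustrative; the key is 'REITs::' + webname above).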