Commit 401719e1  Author: 薛凌堃

11.29

Parent 80c02904
 import os
+import time
 from urllib.parse import urljoin
 import numpy as np
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-from base import BaseCore
+import BaseCore

 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+
+policy = Policy()
+topic = 'policy'
+webname = '上海证券交易所REITs'

 headers = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate',
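Note: the hunks below call two helpers whose definitions are elided from this diff, getSoup and paserUrl. Neither body is shown in the commit, so the following is only a plausible minimal sketch of what they appear to do (fetch and parse a page; rewrite relative href/src attributes to absolute URLs with urljoin), not the actual implementation:

def getSoup(url):
    # Hypothetical sketch: fetch the page with the headers above and parse it.
    req = requests.get(url, headers=headers, timeout=30)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    req.close()
    return soup

def paserUrl(soup, base):
    # Hypothetical sketch: make every relative link in the tree absolute.
    for tag in soup.find_all(True):
        if tag.get('href'):
            tag['href'] = urljoin(base, tag['href'])
        if tag.get('src'):
            tag['src'] = urljoin(base, tag['src'])
    return soup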
@@ -53,9 +60,8 @@ def getSoup(url):
     return soup

-def getContent(url, publishDate, num):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContent(url, publishDate, num, id_list):
     soup = getSoup(url)
     soup = paserUrl(soup, 'http://www.sse.com.cn/')
     contentWithTag = soup.find('div', class_='allZoom')
@@ -70,16 +76,14 @@ def getContent(url, publishDate, num):
             continue
         if category not in fj_title:
             fj_title = fj_title + category
-        fj_title = f'{num}-{publishDate}-{fj_title}'
-        fjtitle_list += fj_title + '\n'
-        fjhref_list += fj_href + '\n'
-        fjcontent = getFjContent(fj_href)
-        file = f'./相关政策/上海证券交易所/政策文件/{fj_title}'
-        with open(file, 'wb') as f:
-            f.write(fjcontent)
-        log.info(f'{fj_title}===附件下载成功')
+        # Upload the attachment to OBS
+        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+        if att_id:
+            id_list.append(att_id)
+            a['href'] = full_path
     content = contentWithTag.text
-    return pub_hao, content, fjtitle_list, fjhref_list
+    return pub_hao, content, id_list, contentWithTag
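Note: policy.attuributefile comes from reits.Policy and its body is not part of this diff. Judging from the call site, it downloads the attachment, uploads it to OBS, and returns a stored attachment id plus the new object path that replaces the original href. A self-contained sketch of that contract, with uuid and an f-string path standing in for the real OBS upload and DB insert (both are assumptions):

import uuid

def attuributefile_sketch(fj_title, fj_href, num, publishDate):
    # Download the attachment bytes.
    resp = requests.get(fj_href, headers=headers, timeout=30)
    if resp.status_code != 200:
        return None, None  # a falsy att_id tells the caller to skip the href rewrite
    # Stand-ins for the real OBS upload and attachment-table insert:
    full_path = f'obs://policy/{num}/{publishDate}/{fj_title}'  # assumed path scheme
    att_id = str(uuid.uuid4())  # assumed id; the real helper presumably returns a DB key
    return att_id, full_path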
 def doJob():
@@ -93,22 +97,48 @@ def doJob():
     soup = paserUrl(soup, 'http://www.sse.com.cn/')
     li_list = soup.find('ul', class_='list').find_all('li')
     for li in li_list:
+        id_list = []
         title = li.find('a').text.lstrip().strip()
         href = li.find('a').get('href')
+        # Deduplicate by link
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
         origin = '上海证券交易所'
         publishDate = li.find('i', class_='date').text.lstrip().strip()
         writtenDate = publishDate
         organ = '上海证券交易所'
         summary = ''
-        pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
-        data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-                fjhref_list]
-        data_list.append(data)
-        log.info(f'{title}===采集成功')
+        pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
+        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        dic_info = {
+            'attachmentIds': id_list,
+            'author': '',
+            'content': content,
+            'contentWithTag': str(contentWithTag),
+            'deleteFlag': 0,
+            'id': '',
+            'title': title,
+            'publishDate': publishDate,
+            'origin': origin,
+            'sourceAddress': href,
+            'writtenDate': writtenDate,
+            'organ': organ,
+            'topicClassification': '',
+            'issuedNumber': pub_hao,
+            'summary': summary,
+            'createDate': time_now,
+            'sid': '1729047166793469954',
+        }
+        try:
+            baseCore.sendkafka(dic_info, topic)
+            baseCore.r.sadd('REITs::' + webname, href)
+            log.info(f'采集成功--{title}--{href}')
+        except:
+            for att_id in id_list:
+                baseCore.deliteATT(att_id)
         num += 1
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./相关政策/上海证券交易所/上海证券交易所政策文件.xlsx', index=False)

 if __name__ == '__main__':
......
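Note: the rewritten doJob drops the old pandas/Excel export in favor of a Redis set for URL-level dedup and a Kafka publish, rolling back any uploaded attachments when the publish fails. baseCore.sendkafka, baseCore.r, and baseCore.deliteATT wrap this behavior; a standalone sketch of the same pattern with redis and kafka-python (connection details are assumptions):

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='127.0.0.1', port=6379, db=6)  # assumed connection details
producer = KafkaProducer(
    bootstrap_servers=['127.0.0.1:9092'],           # assumed broker address
    value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))

def publish(dic_info, href, id_list):
    # Skip links that were already collected.
    if r.sismember('REITs::' + webname, href):
        return
    try:
        producer.send(topic, dic_info).get(timeout=10)  # block until the broker acks
        r.sadd('REITs::' + webname, href)               # mark as collected only on success
    except Exception:
        for att_id in id_list:
            baseCore.deliteATT(att_id)                  # roll back attachments already uploaded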