Commit d7b3c3cf by LiuLiYuan

REITs topic 12/02

Parent 7ef6f432
import re

import requests
from bs4 import BeautifulSoup
from base import BaseCore
from retry import retry

baseCore = BaseCore.BaseCore()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContentA(url):
    # placeholder: not implemented in this commit
    pass


def getContentB(url):
    # fetch a detail page; attachment handling is still incomplete in this commit
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
    content = contentWithTag.text.strip()


def doJob():
    urls = ['http://reits.szse.cn/lawrule/laws/index.html',
            'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
            'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            # each entry's link and title are embedded in an inline <script>
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                getContentA(href)
            else:
                getContentB(href)


if __name__ == '__main__':
    doJob()
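
For context on the regular-expression step above: the SZSE list pages render each entry's link and title through an inline <script>, so the scraper pulls them out with re.findall. Below is a minimal, self-contained sketch of that step using a made-up <li> snippet; the markup and values are hypothetical, not copied from the live page.

# Illustration only: sample_li is a hypothetical reconstruction of one list entry.
import re
from bs4 import BeautifulSoup

sample_li = '''
<li>
  <script type="text/javascript">
    var curHref = './t20231201_123456.html';
    var curTitle ='关于发布基础设施REITs相关规则的通知';
  </script>
  <span class="time">2023-12-01</span>
</li>
'''

li = BeautifulSoup(sample_li, 'html.parser').find('li')
info = str(li.find('script'))
href = re.findall("curHref = '(.*?)';", info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
title = re.findall("curTitle ='(.*?)';", info)[0]
publishDate = li.find('span', class_='time').text.strip()
print(href, title, publishDate)

The second script below applies the same extraction but resolves the './' prefix against the listing URL itself rather than a hard-coded base, which keeps the links correct across the different rule sections.
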
import os

import re
import time

import requests
from bs4 import BeautifulSoup

import BaseCore
from reits import Policy

policy = Policy()
topic = 'policy'
webname = '深圳证券交易所REITs'
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}


def getContent(url, publishDate, num, id_list):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    # the issuing number usually sits in the first <p>; fall back to the second if it is empty
    pub_hao = contentWithTag.find('p').text.strip()
    if pub_hao == '':
        pub_hao = contentWithTag.find_all('p')[1].text.strip()
    if '号' not in pub_hao:
        pub_hao = ''
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        fj_href = a.get('href')
        if not fj_href:
            continue
        fj_title = a.text.strip()
        # keep only links whose extension looks like a downloadable file
        category = os.path.splitext(fj_href)[1]
        if '.' not in category or '.cn' in category:
            continue
        if category not in fj_title:
            fj_title = fj_title + category
        # upload the attachment to OBS
        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    content = contentWithTag.text.strip()
    return pub_hao, content, id_list, str(contentWithTag)
def doJob():
    urls = ['http://reits.szse.cn/lawrule/bussrules/latest/index.html',
            'http://reits.szse.cn/lawrule/bussrules/supervise/index.html']
    num = 1
    for url in urls:
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        li_list = soup.find('ul', class_='newslist').find_all('li')
        for li in li_list:
            id_list = []
            # the link and title are embedded in an inline <script>; resolve './' against the listing URL
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', url.replace(url.split('/')[-1], ''))
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            # deduplicate by link
            is_member = baseCore.r.sismember('REITs::' + webname, href)
            if is_member:
                log.info(f'{title}===already collected')
                continue
            origin = '深圳证券交易所'
            writtenDate = publishDate
            organ = '深圳证券交易所'
            summary = ''
            pub_hao, content, id_list, contentWithTag = getContent(href, publishDate, num, id_list)
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            dic_info = {
                'attachmentIds': id_list,
                'author': '',
                'content': content,
                'contentWithTag': str(contentWithTag),
                'deleteFlag': 0,
                'id': '',
                'title': title,
                'publishDate': publishDate,
                'origin': origin,
                'sourceAddress': href,
                'writtenDate': writtenDate,
                'organ': organ,
                'topicClassification': '',
                'issuedNumber': pub_hao,
                'summary': summary,
                'createDate': time_now,
                'sid': '1730508406971613186',
            }
            try:
                baseCore.sendkafka(dic_info, topic)
                baseCore.r.sadd('REITs::' + webname, href)
                log.info(f'collected successfully--{title}--{href}')
            except Exception:
                # if the Kafka push fails, delete the attachments that were already uploaded
                for att_id in id_list:
                    baseCore.deliteATT(att_id)
            num += 1
            time.sleep(3)


if __name__ == '__main__':
    doJob()
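
The crawl-once behaviour above relies on a Redis set keyed per source site: a link is skipped if sismember finds it, and it is only added with sadd after the Kafka push succeeds, so a failed push can be retried on the next run. A minimal sketch of that pattern with plain redis-py follows; the connection settings and helper names are illustrative, not part of the repository (baseCore.r is assumed to wrap a Redis client).

# Sketch only: hypothetical helpers around the link-dedup set used above.
import redis

r = redis.Redis(host='localhost', port=6379, db=0)   # hypothetical connection settings
dedup_key = 'REITs::深圳证券交易所REITs'

def already_collected(href: str) -> bool:
    # True if the link was recorded by an earlier, successful run
    return bool(r.sismember(dedup_key, href))

def mark_collected(href: str) -> None:
    # call only after the record was pushed to Kafka successfully
    r.sadd(dedup_key, href)
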