Commit ca21124d authored by 薛凌堃

深圳交易所 (Shenzhen Stock Exchange)

Parent f7a4f608
import re
import time

import fitz
import requests
from bs4 import BeautifulSoup
from retry import retry

from base import BaseCore
from reits import Policy

policy = Policy()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
topic = 'policy'
webname = '深圳交易所'
def getContentA(url):
    """Download a PDF and extract its plain text with PyMuPDF (fitz)."""
    content = ''
    req = requests.get(url, headers=headers)
    try:
        # Open the response body as an in-memory PDF and concatenate page text
        with fitz.open(stream=req.content, filetype='pdf') as doc:
            for page in doc.pages():
                content += page.get_text()
    except Exception:
        # Not a parseable PDF; callers treat '' as a failure
        return ''
    return content
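# Usage sketch (the URL below is hypothetical, for illustration only -- any
# direct PDF link taken from the listing pages would do):
#   text = getContentA('http://reits.szse.cn/example/notice.pdf')
#   if not text:
#       log.info('PDF could not be parsed')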
def getContentB(url, publishDate, num):
    """Parse an HTML detail page; return text, tagged HTML, attachment ids, counter."""
    id_list = []
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', attrs={'id': 'desContent'})
    content = contentWithTag.text.strip()
    # Upload each linked attachment and rewrite its href to the stored path
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
        file_name = a.text.strip()
        att_id, full_path = policy.attuributefile(file_name, href, num, publishDate)
        num += 1
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    contentWithTag_str = str(contentWithTag)
    return content, contentWithTag_str, id_list, num
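# Usage sketch (hypothetical URL; `num` is the running attachment counter and is
# threaded through calls so attachment numbering stays continuous per listing page):
#   content, html, ids, num = getContentB('http://reits.szse.cn/example/detail.html', '2023-11-27', 1)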
def doJob():
    # ... (lines collapsed in the diff view @@ -35,6 +57,7 @@)
    urls = [
        # ... earlier list entries not shown in this diff
        'http://reits.szse.cn/lawrule/regulations/csrcorder/index.html',
        'http://reits.szse.cn/lawrule/regulations/csrcannoun/index.html']
    for url in urls:
        num = 1  # running attachment counter for this listing page
        req = requests.get(url, headers=headers)
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        # ... (lines collapsed in the diff view @@ -42,13 +65,94 @@: li_list is built from `soup` here)
        for li in li_list:
            info = str(li.find('script'))
            href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/laws/')
            if 'csrcorder' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcorder/')
                origin = '国家发展改革委'
            elif 'csrcannoun' in url:
                href = re.findall('curHref = \'(.*?)\';', info)[0].replace('./', 'http://reits.szse.cn/lawrule/regulations/csrcannoun/')
                origin = '中国证监会'
            else:
                # Assumption: items on the laws index are published by the exchange
                # itself, matching the hardcoded origin of the PDF branch below
                origin = '深圳证券交易所'
            title = re.findall('curTitle =\'(.*?)\';', info)[0]
            publishDate = li.find('span', class_='time').text.strip()
            if '.html' in href:
                # HTML detail page: dedup by link against the Redis set, then parse
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content, contentWithTag_str, id_list, num = getContentB(href, publishDate, num)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': contentWithTag_str,
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': origin,
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')  # '采集成功' = collected successfully
                except Exception:
                    # Kafka send failed: delete the attachments uploaded for this item
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
            else:
                # Non-HTML links point at PDF files: dedup by link, extract the
                # text, and upload the PDF itself as the attachment
                id_list = []
                is_member = baseCore.r.sismember('REITs::' + webname, href)
                if is_member:
                    continue
                content = getContentA(href)
                if not content:
                    log.info(f'{title}---{href}')
                    continue
                att_id, full_path = policy.attuributefile(title, href, num, publishDate)
                if att_id:
                    id_list.append(att_id)
                num += 1
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_info = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': '',
                    'deleteFlag': 0,
                    'id': '',
                    'title': title,
                    'publishDate': publishDate,
                    'origin': '深圳证券交易所',
                    'sourceAddress': href,
                    'writtenDate': '',
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'summary': '',
                    'createDate': time_now,
                    'sid': '1729032681013825538',
                }
                try:
                    baseCore.sendkafka(dic_info, topic)
                    baseCore.r.sadd('REITs::' + webname, href)
                    log.info(f'采集成功--{title}--{href}')
                except Exception:
                    # Roll back uploaded attachments on send failure
                    for att_id in id_list:
                        baseCore.deliteATT(att_id)
if __name__ == '__main__':
    doJob()