Commit 5d788bc9 Author: 薛凌堃

REITs topic

Parent c702fb7b
# REITs topic core toolkit
......@@ -5,6 +5,7 @@ import random
import socket
import sys
import time
import uuid
import fitz
import logbook
......@@ -252,7 +253,7 @@ class BaseCore:
charset='utf8mb4')
self.cursor_ = self.cnx_.cursor()
# Connect to Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
self.pool_caiji = PooledDB(
creator=pymysql,
......@@ -451,6 +452,7 @@ class BaseCore:
# def doc_page(self,file_path):
# doc = Document(file_path)
# return len(doc.sections)
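# Delete an attachment record from clb_sys_attachment by id; used below to roll back registered attachments when a downstream send fails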
def deliteATT(self,id):
delitesql = f"delete from clb_sys_attachment where id = '{id}' "
self.cursor_.execute(delitesql)
......@@ -492,6 +494,9 @@ class BaseCore:
id = selects[0]
return id,full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1()  # generate a uuid from the timestamp, guaranteed globally unique
return get_timestamp_uuid
# Get the file size
def convert_size(self,size_bytes):
......@@ -520,37 +525,25 @@ class BaseCore:
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
# name = file_name
if category in file_name:
pass
else:
file_name = file_name + category
result = obsClient.putContent('zzsn', 'PolicyDocuments/' + file_name, content=response.content)
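# Name the uploaded object with a time-based uuid (getuuid) and store it under the PolicyDocument/ prefix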
file_name = str(self.getuuid()) + category
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
break
except:
time.sleep(3)
continue
if page_size < 1:
# PDF parsing failed
# print(f'======PDF parsing failed=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
def sendkafka(self, post_data, topic):
try:
......
import os
......@@ -107,7 +107,7 @@ class Policy():
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
retData = baseCore.uptoOBS(file_href, '9999', file_name)
retData = baseCore.uptoOBS(file_href, '', file_name)
if retData['state']:
pass
else:
......@@ -136,7 +136,7 @@ class Policy():
policy = Policy()
# National Development and Reform Commission https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform(wb,file_path):
def reform():
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -153,22 +153,30 @@ def reform(wb,file_path):
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
DataList = []
# DataList = []
num = 0
path = 'data/国家改革发展委员会'
if not os.path.exists(path):
os.makedirs(path)
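# webname keys the per-site Redis dedup set (REITs::<webname>) checked before each article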
webname = '中华人民共和国国家发展和改革委员会'
# path = 'data/国家改革发展委员会'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(1,3):
url = f'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page={page}&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
result = policy.getrequest_json(headers, url)
data_list = result['data']['resultList']
for info in data_list:
num += 1
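# Collect the attachment ids for this article so they can be removed again if the Kafka send fails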
id_list = []
# info = data_list[1]
publishDate_ = info['docDate']
title = info['title']
summary = info['summary'].replace('<em>','').replace('</em>','')
newsUrl = info['url']
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -190,6 +198,7 @@ def reform(wb,file_path):
}
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
policy.paserUrl(newssoup, newsUrl)
try:
pubHao = ''
source = ''
......@@ -229,20 +238,19 @@ def reform(wb,file_path):
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.match(pattern, publishDate)
if match:
date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
publishDate = date1.strftime("%Y-%m-%d")
pass
else:
publishDate = ''
policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
policy.deletek(contentWithTag)
content = contentWithTag.text
try:
policy.paserUrl(newssoup,newsUrl)
att = newssoup.find('div', class_='attachment_r')
fu_jian_name = ''
fu_jian_href = ''
except:
fu_jian_name = ''
fu_jian_href = ''
att = ''
if att:
for a in att.find_all('a'):
......@@ -255,49 +263,61 @@ def reform(wb,file_path):
pass
else:
file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
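# Upload the attachment through attuributefile, rewrite the in-page link to the stored copy, and keep the returned id for rollback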
att_id,full_path = policy.attuributefile(file_name,file_href,num,publishDate_)
if att_id:
id_list.append(att_id)
a['href'] = full_path
contentWithTag_str = str(contentWithTag) + str(newssoup.find('div', class_='attachment'))
else:
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate_,
'来源': source,
'原文链接': newsUrl,
'发文时间': publishDate,
'发文机构': '',
'发文字号': pubHao,
'摘要': summary,
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate_,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': publishDate,
'organ': '',
'topicClassification': '',
'issuedNumber': pubHao,
'summary': summary,
'createDate': time_now,
'sid': '1729029275400646658',
}
DataList.append(dic_info)
# DataList.append(dic_info)
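# Push the record to Kafka; only mark the URL as seen in Redis after a successful send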
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
sheet_name = "国家发展和改革委员会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# Create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# Save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
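# On failure, delete the attachment records that were already registered so they are not left orphaned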
for att_id in id_list:
baseCore.deliteATT(att_id)
# sheet_name = "国家发展和改革委员会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # Create a new worksheet
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # Save the Excel file
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{newsUrl}")
log.info({e})
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
# Securities and futures https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
def zhengquanqihuo(wb,file_path):
def zhengquanqihuo():
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -337,11 +357,12 @@ def zhengquanqihuo(wb,file_path):
total = pageUtil['rowCount']
page_size = pageUtil['pageSize']
Max_page = int(total / page_size)
DataList = []
# DataList = []
num = 0
path = 'data/证监会'
if not os.path.exists(path):
os.makedirs(path)
webname = '证券期货法规数据库系统'
# path = 'data/证监会'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(0, Max_page+1):
payload_page = {
'pageNo': page + 1,
......@@ -359,6 +380,7 @@ def zhengquanqihuo(wb,file_path):
data_page = policy.requestPost(headers, url, payload_page)
info_list = data_page['pageUtil']['pageList']
for info in info_list:
id_list = []
num += 1
try:
title = info['secFutrsLawName']
......@@ -369,41 +391,63 @@ def zhengquanqihuo(wb,file_path):
# print(publishDate)
secFutrsLawId = info['secFutrsLawId']
newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
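# Load the detail page in a Selenium driver and parse the rendered page_source with BeautifulSoup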
browser = policy.createDriver()
browser.get(newsUrl)
time.sleep(1)
page_source = browser.page_source
newssoup = BeautifulSoup(page_source, 'html.parser')
policy.paserUrl(newssoup,newsUrl)
# print(newssoup)
contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
content = contentWithTag.text.replace('显示注释', '')
# print(content)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate,
'来源': source,
'原文链接': newsUrl,
'发文时间': publishDate,
'发文机构': source,
'发文字号': pubHao,
'摘要': '',
'正文': content,
'附件名称': '',
'附件链接': '',
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': publishDate,
'organ': source,
'issuedNumber': pubHao,
'summary': '',
'topicClassification': '',
'createDate': time_now,
'sid': '1729030277461815298',
}
DataList.append(dic_info)
sheet_name = "证监会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# Create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# Save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
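# Same pattern as above: send to Kafka, record the URL in the Redis dedup set, and roll back attachment records on failure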
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
# DataList.append(dic_info)
# sheet_name = "证监会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # Create a new worksheet
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # Save the Excel file
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{num}")
log.info({e})
......@@ -428,9 +472,10 @@ def sse(wb,file_path):
total_page = result['data']['totalPage']
DataList = []
num = 0
path = 'data/上海交易所'
if not os.path.exists(path):
os.makedirs(path)
webname = '上海证券交易所'
# path = 'data/上海交易所'
# if not os.path.exists(path):
# os.makedirs(path)
for page in range(0, int(total_page)):
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
data = policy.getrequest_json(headers, url_page)
......@@ -456,9 +501,14 @@ def sse(wb,file_path):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
# Deduplicate by URL
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
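# PDF hits: download the file, extract its text with fitz, and register the PDF itself as the attachment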
if '.pdf' in newsUrl:
fu_jian_name = ''
fu_jian_href = ''
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
......@@ -466,10 +516,10 @@ def sse(wb,file_path):
content += page.get_text()
file_href = newsUrl
file_name = title
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
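# The attachment is now registered through attuributefile instead of being saved under the local data directory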
policy.attuributefile(title, newsUrl, num, publishDate)
dic_info = {
'序号': num,
'标题': title,
......@@ -553,100 +603,6 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name)
# Beijing Municipal People's Government https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size)
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
# print(info_list)
for info_ in info_list:
info = info_['data']
title = info['titleO']
titleLabel = info['titleLabel']['value']
publishDate = info['docDate']
# source = info['siteLabel']['value']
newsUrl = info['url']
if titleLabel == '政策解读':
newssoup = policy.getrequest_soup(headers, newsUrl)
print(newssoup)
contentWithTag = newssoup.find('div', id='mainText')
content = contentWithTag.text
source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
formatRows = info['formatRows']
num = 1
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
file_href = value.keys()
file_name = value.values()
# Upload attachments
policy.attuributefile(file_name,file_href,num,publishDate)
num += 1
value = col['value'][0]
dic_info[name] = value
dic_info = {
'title': title,
'publishDate': publishDate,
'source': source,
'newsUrl': newsUrl,
'file_href': file_href
}
# print(dic_info)
# break
# Hebei Provincial People's Government
def hebei():
path = 'data/河北省人民政府'
......@@ -851,10 +807,6 @@ def hebei():
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
# Guangdong Provincial People's Government
def guangdong():
pass
# Guizhou Provincial People's Government
def guizhou():
......@@ -963,12 +915,12 @@ def guizhou():
if __name__=="__main__":
file_path = f'data/REITs贵州省人民政府.xlsx'
wb = policy.createfile(file_path)
# reform(wb,file_path)
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
# reform()
# shenzhen()
# zhengquanqihuo(wb,file_path)
# sse(wb,file_path)
zhengquanqihuo()
# sse()
# hebei()
guizhou()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file