提交 6173700f 作者: 薛凌堃

11/16

上级 d3fd7612
import os import os
...@@ -36,6 +36,11 @@ class Policy(): ...@@ -36,6 +36,11 @@ class Policy():
data_json = req.json() data_json = req.json()
return data_json return data_json
def requestPost_html(self, headers, url, payload):
    """POST *payload* to *url* with the given headers and parse the reply as HTML.

    :param headers: HTTP request headers dict.
    :param url: target URL.
    :param payload: form data for the POST body.
    :return: a BeautifulSoup tree built with the 'html.parser' backend.
    """
    response = requests.post(url=url, headers=headers, data=payload)
    return BeautifulSoup(response.content, 'html.parser')
def createDriver(self): def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe' chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver) path = Service(chrome_driver)
...@@ -48,11 +53,22 @@ class Policy(): ...@@ -48,11 +53,22 @@ class Policy():
return driver return driver
def deletep(self,soup,i,tag,attribute_to_delete,value_to_delete): def deletep(self,soup,i,tag,attribute_to_delete,value_to_delete):
# 查找带有指定属性的P标签并删除 # 查找带有指定属性的标签并删除
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete}) tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]: for tag in tags[:i]:
tag.decompose() tag.decompose()
def deletespan(self, td):
    """Remove every <span> element (and its contents) from *td* in place.

    Thin wrapper around :meth:`deletetag` — the original duplicated its
    loop for the span-only case; delegating keeps the two consistent.

    :param td: a BeautifulSoup Tag to clean up.
    """
    self.deletetag(td, 'span')  # 删除span标签
def deletetag(self, td, tag):
    """Remove every occurrence of *tag* from *td* in place.

    :param td: a BeautifulSoup Tag to clean up.
    :param tag: tag name to strip, e.g. 'span' or 'strong'.
    """
    # 删除指定标签 — extract() detaches the element from the tree.
    for found in td.find_all(tag):
        found.extract()
def deletek(self,soup): def deletek(self,soup):
# 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外) # 删除空白标签(例如<p></p>、<p><br></p>, img、video、hr除外)
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '): for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' '):
...@@ -386,7 +402,6 @@ def zhengquanqihuo(wb,file_path): ...@@ -386,7 +402,6 @@ def zhengquanqihuo(wb,file_path):
#深圳交易所 http://www.szse.cn/lawrules/index.html #深圳交易所 http://www.szse.cn/lawrules/index.html
#上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse(wb,file_path): def sse(wb,file_path):
url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761' url = 
'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
...@@ -483,20 +498,24 @@ def sse(wb,file_path): ...@@ -483,20 +498,24 @@ def sse(wb,file_path):
fu_jian_name = '' fu_jian_name = ''
fu_jian_href = '' fu_jian_href = ''
for fujian in fujian_list: for fujian in fujian_list:
file_href = fujian['href'] try:
file_href = fujian['href']
except:
continue
file_name = fujian.text.strip(' ') file_name = fujian.text.strip(' ')
category = os.path.splitext(file_href)[1] category = os.path.splitext(file_href)[1]
if category in file_name: if category in file_name:
pass pass
else: else:
file_name = file_name + category file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','') rename_file = f'{str(num)}_{publishDate[:10]}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','').replace('-','')
fu_jian_name += rename_file + '\n' fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n' fu_jian_href += file_href + '\n'
try: try:
policy.downloadfile(file_href, f'{path}/{rename_file}') policy.downloadfile(file_href, f'{path}/{rename_file}')
except: except:
log.info(f'--{page}-{num}======{newsUrl}') log.info(f'--{page}-{num}======{newsUrl}')
continue
dic_info = { dic_info = {
'序号': num, '序号': num,
'标题': title, '标题': title,
...@@ -525,10 +544,7 @@ def sse(wb,file_path): ...@@ -525,10 +544,7 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name) baseCore.writerToExcel(DataList, file_path, sheet_name)
#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs #北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing(): def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s' url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = { payload = {
...@@ -622,12 +638,264 @@ def beijing(): ...@@ -622,12 +638,264 @@ def beijing():
# print(dic_info) # print(dic_info)
# break # break
# 河北省人民政府
def _hebei_payload(appName, pNo):
    """Build the search form payload for one request to hebei.gov.cn.

    The stray leading spaces in most field values are preserved verbatim —
    they are what the captured live request sent; changing them is untested.

    :param appName: channel name to search within, e.g. '热点专题'.
    :param pNo: page number as a string (the probe request uses ' 1').
    :return: dict suitable for an application/x-www-form-urlencoded POST.
    """
    return {'qAnd': ' ',
            'qOr': ' ',
            'qAll': ' ',
            'qNot': ' ',
            'startTime': ' ',
            'endTime': ' ',
            'advSearch': ' ',
            'originalSearchUrl': ' /search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
            'originalSearch': ' ',
            'app': ' 20c723b3a36e4906b0d91e6950d3dc29,8b157f193fb54ea7837d6380a37bb84a,0ad7369c794e4b2fbd6a4e76f9b84e9c,47fb4bc5c08d49d3b937c56c7960a909,9f54f8001d8747e4826d542fedcc6abc,b42baf238f43435ea7f796bec4ef7592,c943f166fb9042d288743397b12978fc,4b2050e6bb5d48dc9b200385dd99b4e3,7b5b083a6d254960ab34e34009e7e8d7,aa9d0848dcb84e8b919fd02b2da090b4,54e1a38a0e2846a4bc60258af5ced450,b88b6ee476494a16b66ea9cacc0456ee,4d0e00783a2e4037a6d3bdcd1fe98fb1,a8cb58e7494e4ae4a682b0e79df63dc6,f70c53427500439cbdeee467c5a185a6,d3f6aaca16c54e7b8626993314ad27b7,4d63955d8ec441018e8fddc6131997b0',
            'searchArea': ' ',
            'appName': appName,
            'sr': ' score desc',
            'advtime': ' ',
            'advrange': ' ',
            'articleType': ' ',
            'siteId': ' ',
            'siteName': ' ',
            'ext': ' ',
            'pNo': pNo,
            'deviceType': ' pc',
            'q2': ' ',
            'q': ' REITs'}


def _hebei_parse_detail(news_soup):
    """Extract article fields from a Hebei government article page.

    Two known page layouts are tried in turn: the ``div#zoom`` layout
    (with either an ``xxgk_bmxl`` info table or free-form <p> text) and
    the ``xxgk_gfxwjk_xqy-wznr`` layout.  Every text field defaults to ''
    and contentWithTag to None when nothing matches.

    :param news_soup: BeautifulSoup tree of the article page.
    :return: tuple (content, contentWithTag, source, pub_origin, pub_hao, writeDate).
    """
    writeDate = ''
    pub_hao = ''
    source = ''
    content = ''
    pub_origin = ''
    contentWithTag = None
    try:
        contentWithTag = news_soup.find('div', id='zoom')
        content = contentWithTag.text  # raises if div#zoom is absent -> fall through
        try:
            source = news_soup.find('div', class_='article_tit').find('li', class_='xl_laiyuan').text
        except Exception:
            source = ''
        try:
            info_ = news_soup.find('div', class_='xxgk_bmxl')
            policy.deletetag(info_, 'strong')
            policy.deletek(info_)
            info_list = info_.find_all('td')
            pub_origin = info_list[1].text
            pub_hao = info_list[2].text
        except Exception:
            # 处理空标签 — no info table: scan <p> text for the document
            # number (冀政办字〔…〕…号) or a Chinese-format write date,
            # stopping at whichever is found first.
            policy.deletek(news_soup)
            for p in news_soup.find_all('p'):
                text_pubhao = p.text
                if '号' in text_pubhao and '〔' in text_pubhao:
                    match = re.search(r"冀政办字〔\d+〕\d+号", text_pubhao)
                    if match:
                        pub_hao = match.group(0)
                        break
                    continue  # looked like a 文号 but didn't parse; skip date check
                match = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", p.text)
                if match:
                    writeDate = match.group(0)
                    break
    except Exception:
        # Alternative layout (规范性文件 pages).
        try:
            contentWithTag = news_soup.find('div', class_='xxgk_gfxwjk_xqy-wznr')
            content = contentWithTag.text
            info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
            policy.deletespan(info)
            pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
            pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
            writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
        except Exception:
            pass
    return content, contentWithTag, source, pub_origin, pub_hao, writeDate


def hebei():
    """Crawl REITs policy articles from the Hebei provincial government search.

    Downloads attachments into data/河北省人民政府 and appends one row per
    article to an Excel sheet named after the searched channel.

    Relies on module-level globals: ``policy``, ``log``, ``baseCore``,
    ``wb`` and ``file_path`` (set in ``__main__``).
    """
    path = 'data/河北省人民政府'
    if not os.path.exists(path):
        os.makedirs(path)
    num = 0
    url = "https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684"
    appNames = ['热点专题']
    for appName in appNames:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # BUG FIX: the captured 'Content-Length: 907' header was dropped —
            # the body length varies with pNo/appName, and requests computes
            # the correct value automatically.
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cookie': 'aisearchbehavior=42b33c1f2d22475bb571093346193219; JSESSIONID=251311215A6447AE509141936F4569D4; arialoadData=true',
            'Host': 'www.hebei.gov.cn',
            'Origin': 'https://www.hebei.gov.cn',
            'Referer': 'https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"'
        }
        # 第一次请求获取页数 — probe request only to learn the page count.
        soup_ = policy.requestPost_html(headers, url, _hebei_payload(appName, ' 1'))
        pages = int(soup_.find('span', class_='default-result-tolal-records').find('span').text)
        DataList = []
        for page in range(1, pages + 1):
            soup = policy.requestPost_html(headers, url, _hebei_payload(appName, str(page)))
            list_news = soup.find_all('div', class_='szf-data-tpl1-item')
            for news in list_news:
                num += 1
                title = news.find('h3').text
                summary = news.find('div').find('p', class_='txtCon').text
                publishDate = news.find('div').find('p', class_='dates').text.replace('发布日期:', '').replace('\n', '')
                news_href = news.find('div').find('p', class_='txtCon').find('a')['href']
                # BUG FIX: headers must be passed as a keyword argument —
                # requests.get(url, headers) binds the dict to `params`
                # (the query string), not to the request headers.
                news_req = requests.get(news_href, headers=headers)
                news_soup = BeautifulSoup(news_req.content, 'html.parser')
                content, contentWithTag, source, pub_origin, pub_hao, writeDate = _hebei_parse_detail(news_soup)
                # 附件: collect and download attachments linked from the body.
                fu_jian_name = ''
                fu_jian_href = ''
                try:
                    fujian_href = contentWithTag.find_all('a')
                    # paserUrl rewrites relative hrefs in place; the tags found
                    # above are live references, so they see the absolute URLs.
                    policy.paserUrl(contentWithTag, news_href)
                    for file_href_ in fujian_href:
                        file_href = file_href_['href']
                        file_name = file_href_.text
                        category = os.path.splitext(file_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # NOTE(review): unlike sse(), illegal filename characters
                        # are not stripped here — confirm whether that is needed.
                        rename_file = f'{str(num)}_{publishDate}_{file_name}'
                        fu_jian_name += rename_file + '\n'
                        fu_jian_href += file_href + '\n'
                        policy.downloadfile(file_href, f'{path}/{rename_file}')
                except Exception:
                    # Best-effort: a missing/unparsable body simply means
                    # no attachments are recorded for this article.
                    pass
                if content == '':
                    continue  # article body not found in any known layout
                dic_info = {
                    '序号': num,
                    '标题': title.replace('\n', ''),
                    '发布时间': publishDate,
                    '来源': source,
                    '原文链接': news_href,
                    '发文时间': writeDate,
                    '发文机构': pub_origin,
                    '发文字号': pub_hao,
                    '摘要': summary.replace('\n', ''),
                    '正文': content,
                    '附件名称': fu_jian_name,
                    '附件链接': fu_jian_href,
                }
                print(dic_info)
                DataList.append(dic_info)
        sheet_name = appName
        if sheet_name in wb.sheetnames:
            log.info(f"{sheet_name}工作表已存在!")
        else:
            # 创建新工作表
            wb.create_sheet(sheet_name)
            print(f"{sheet_name}新工作表创建完成!")
        # 保存Excel文件
        wb.save(file_path)
        baseCore.writerToExcel(DataList, file_path, sheet_name)
        break
# 广东省人民政府
def guangdong():
    """Placeholder for the Guangdong provincial government crawler — not implemented yet."""
    pass
# 贵州省人民政府
def guizhou():
    """Fetch REITs search results from the Guizhou provincial government portal.

    Work in progress: each hit and its article page are fetched, but nothing
    is persisted yet — the loop body ends in ``pass``.  Relies on the
    module-level ``policy`` helper for HTTP requests.
    """
    url = "https://www.guizhou.gov.cn/irs/front/search"
    # Raw JSON body as a string (not a dict) — posted as-is by requestPost.
    payload = "{\"tenantId\":\"186\",\"configTenantId\":\"\",\"tenantIds\":\"\",\"searchWord\":\"REITs\",\"historySearchWords\":[\"REITs\"],\"dataTypeId\":\"965\",\"orderBy\":\"related\",\"searchBy\":\"all\",\"appendixType\":\"\",\"granularity\":\"ALL\",\"beginDateTime\":\"\",\"endDateTime\":\"\",\"isSearchForced\":0,\"filters\":[],\"pageNo\":1,\"pageSize\":9}"
    # Headers captured from a live browser session, including its cookie.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Length': '291',
        'Content-Type': 'application/json',
        'Cookie': 'SESSION=MGY2NWQ3NjctZTNhZC00OTJhLWIzNGQtMDI1MmQ5MWVlZmNm; _trs_uv=lp15qktj_367_a56u; _trs_ua_s_1=lp15qktj_367_lac; yfx_c_g_u_id_10000921=_ck23111620182819813554574558557; yfx_f_l_v_t_10000921=f_t_1700137108976__r_t_1700137108976__v_t_1700137108976__r_c_0; arialoadData=false',
        'Host': 'www.guizhou.gov.cn',
        'Origin': 'https://www.guizhou.gov.cn',
        'Referer': 'https://www.guizhou.gov.cn/so/search.shtml?tenantId=186&tenantIds=&configTenantId=&searchWord=REITs&dataTypeId=965&sign=6bd8592c-2e19-4f22-ae6d-f129f729e795',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    jsonData = policy.requestPost(headers, url, payload)
    # Search hits live under data.middle.list in the response JSON.
    result_list = jsonData['data']['middle']["list"]
    for datainfo in result_list:
        title = datainfo['title']
        publishData = datainfo['time']
        source = datainfo['source']
        summary = datainfo['content']
        newsUrl = datainfo['url']
        soup = policy.getrequest_soup(headers,newsUrl)
        # print(soup)
        # NOTE(review): this is the page's <head><title> Tag object, not a
        # document number string — .text extraction and real 发文字号 parsing
        # still to be implemented.
        pub_hao = soup.find('head').find('title')
        print(pub_hao)
        pass
if __name__=="__main__": if __name__=="__main__":
file_path = f'data/REITs国家改革发展委员会.xlsx' file_path = f'data/REITs深圳交易所.xlsx'
wb = policy.createfile(file_path) wb = policy.createfile(file_path)
# reform(wb,file_path) # reform(wb,file_path)
# shenzhen()
# zhengquanqihuo(wb,file_path) # zhengquanqihuo(wb,file_path)
sse(wb,file_path) # sse(wb,file_path)
# hebei()
guizhou()
# zhengquanqihuo() # zhengquanqihuo()
\ No newline at end of file
""" """
...@@ -291,18 +291,18 @@ def run_threads(num_threads,esMethod): ...@@ -291,18 +291,18 @@ def run_threads(num_threads,esMethod):
thread.join() thread.join()
if __name__ == '__main__': if __name__ == '__main__':
# while True: for i in range(0,5):
esMethod = EsMethod() esMethod = EsMethod()
p = 0 p = 0
# result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p) result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
# total = result['hits']['total']['value'] total = result['hits']['total']['value']
# if total == 0: if total == 0:
# log.info('++++已没有数据+++++') log.info('++++已没有数据+++++')
# break break
start = time.time() start = time.time()
num_threads = 8 num_threads = 10
run_threads(num_threads,esMethod) run_threads(num_threads,esMethod)
log.info(f'8线程 每个处理200条数据 总耗时{time.time()-start}秒') log.info(f'10线程 每个处理200条数据 总耗时{time.time()-start}秒')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论