Commit 3ad4b1e5  Author: 薛凌堃

11.29

Parent 5d788bc9
import os

@@ -234,15 +234,16 @@ def reform():
         try:
             publishDate = newssoup.find('div',class_="article_con article_con_title").find_all('span')[-1].text
         except:
-            publishDate = ''
-        pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
-        match = re.match(pattern, publishDate)
-        if match:
-            date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
-            publishDate = date1.strftime("%Y-%m-%d")
-            pass
-        else:
-            publishDate = ''
+            publishDate = None
+        if publishDate:
+            pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
+            match = re.match(pattern, publishDate)
+            if match:
+                date1 = datetime.strptime(publishDate, "%Y年%m月%d日")
+                publishDate = date1.strftime("%Y-%m-%d")
+                pass
+            else:
+                publishDate = None
         policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
         policy.deletek(contentWithTag)
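
The rewritten branch above normalizes dates like '2023年11月29日' to '2023-11-29' and now guards against a missing date. A minimal standalone sketch of the same pattern (the helper name normalize_cn_date is illustrative, not from the repo):

    import re
    from datetime import datetime

    def normalize_cn_date(raw):
        # Return 'YYYY-MM-DD' for strings starting with a date like '2023年11月29日', else None.
        m = re.match(r"\d{4}年\d{1,2}月\d{1,2}日", raw or "")
        if m:
            return datetime.strptime(m.group(0), "%Y年%m月%d日").strftime("%Y-%m-%d")
        return None

    print(normalize_cn_date("2023年11月29日"))  # 2023-11-29
    print(normalize_cn_date(None))              # None

Parsing m.group(0) rather than the whole string also avoids the ValueError that strptime raises when trailing text follows the date, which the hunk above is still exposed to.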
@@ -313,7 +314,7 @@ def reform():
             # baseCore.writerToExcel(DataList, file_path, sheet_name)
         except Exception as e:
             log.info(f"error!!!{newsUrl}")
-            log.info({e})
+            log.info(e)
         log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

 #证券期货 https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
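
The one-character fix above is worth noting: {e} is a Python set literal, so log.info({e}) logs the exception wrapped in a one-element set rather than its message. For example:

    e = ValueError("bad date")
    print({e})  # {ValueError('bad date')}
    print(e)    # bad date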
@@ -456,7 +457,7 @@ def zhengquanqihuo():

 #深圳交易所 http://www.szse.cn/lawrules/index.html
 #上海交易所 http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
-def sse(wb,file_path):
+def sse():
     url = 'http://query.sse.com.cn/search/getESSearchDoc.do?page=0&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
     headers = {
         'Accept': '*/*',
@@ -470,7 +471,7 @@ def sse(wb,file_path):
     }
     result = policy.getrequest_json(headers,url)
     total_page = result['data']['totalPage']
-    DataList = []
+    # DataList = []
     num = 0
     webname = '上海证券交易所'
     # path = 'data/上海交易所'
@@ -481,8 +482,9 @@ def sse(wb,file_path):
         data = policy.getrequest_json(headers, url_page)
         newslist = data['data']['knowledgeList']
         # print(newslist)
-        for news in newslist[:1]:
+        for news in newslist:
             num += 1
+            id_list = []
             title = news['title'].replace("<em>",'').replace('</em>','')
             publishDate = news['createTime']
             # print(newsUrl)
@@ -506,111 +508,131 @@ def sse(wb,file_path):
             is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
             if is_member:
                 continue
-            if '.pdf' in newsUrl:
-                content = ''
-                response = requests.get(newsUrl, timeout=20)
-                with fitz.open(stream=response.content, filetype='pdf') as doc:
-                    for page in doc.pages():
-                        content += page.get_text()
-                file_href = newsUrl
-                file_name = title
-                policy.attuributefile(title, newsUrl, num, publishDate)
-                dic_info = {
-                    '序号': num,
-                    '标题': title,
-                    '发布时间': publishDate,
-                    '来源': source,
-                    '原文链接': newsUrl,
-                    '发文时间': '',
-                    '发文机构': '',
-                    '发文字号': '',
-                    '摘要': summary,
-                    '正文': content,
-                    '附件名称': fu_jian_name,
-                    '附件链接': fu_jian_href,
-                }
-                DataList.append(dic_info)
-            else:
-                newssoup = policy.getrequest_soup(header, newsUrl)
-                # print(newssoup)
-                content_ = newssoup.find('div', class_='allZoom')
-                # print(content_)
-                # # 将链接替换为绝对路径
-                contentWithTag = policy.paserUrl(content_, newsUrl)
-                try:
-                    pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
-                    if '〔' in pubHao:
-                        pass
-                    else:
-                        pubHao = ''
-                except:
-                    pubHao = ''
-                # print(contentWithTag)
-                content = contentWithTag.text
-                fujian_list = contentWithTag.find_all('a')
-                fu_jian_name = ''
-                fu_jian_href = ''
-                for fujian in fujian_list:
-                    try:
-                        file_href = fujian['href']
-                    except:
-                        continue
-                    file_name = fujian.text.strip(' ')
-                    category = os.path.splitext(file_href)[1]
-                    if category in file_name:
-                        pass
-                    else:
-                        file_name = file_name + category
-                    rename_file = f'{str(num)}_{publishDate[:10]}_{file_name}'.replace('\\','').replace('/','').replace('|','').replace('>','').replace('<','').replace('*','').replace(':','').replace('?','').replace('—','').replace('-','')
-                    fu_jian_name += rename_file + '\n'
-                    fu_jian_href += file_href + '\n'
-                    try:
-                        policy.downloadfile(file_href, f'{path}/{rename_file}')
-                    except:
-                        log.info(f'--{page}-{num}======{newsUrl}')
-                        continue
-                dic_info = {
-                    '序号': num,
-                    '标题': title,
-                    '发布时间': publishDate,
-                    '来源': source,
-                    '原文链接': newsUrl,
-                    '发文时间': '',
-                    '发文机构': '',
-                    '发文字号': pubHao,
-                    '摘要': summary,
-                    '正文': content,
-                    '附件名称': fu_jian_name,
-                    '附件链接': fu_jian_href,
-                }
-                DataList.append(dic_info)
-    sheet_name = "上海交易所"
-    if sheet_name in wb.sheetnames:
-        log.info(f"{sheet_name}工作表已存在!")
-    else:
-        # 创建新工作表
-        wb.create_sheet(sheet_name)
-        print(f"{sheet_name}新工作表创建完成!")
-    # 保存Excel文件
-    wb.save(file_path)
-    baseCore.writerToExcel(DataList, file_path, sheet_name)
+            try:
+                if '.pdf' in newsUrl:
+                    # pass
+                    content = ''
+                    response = requests.get(newsUrl, timeout=20)
+                    with fitz.open(stream=response.content, filetype='pdf') as doc:
+                        for page in doc.pages():
+                            content += page.get_text()
+                    file_href = newsUrl
+                    file_name = title
+                    att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    dic_info = {
+                        'attachmentIds':id_list,
+                        'author': '',
+                        'content': content,
+                        'contentWithTag': '',
+                        'deleteFlag': 0,
+                        'id': '',
+                        'title': title,
+                        'publishDate': publishDate,
+                        'origin': source,
+                        'sourceAddress': newsUrl,
+                        'writtenDate': None,
+                        'organ': '',
+                        'topicClassification': '',
+                        'issuedNumber': '',
+                        'summary': summary,
+                        'createDate': time_now,
+                        'sid': '1729035244826374145',
+                    }
+                    # DataList.append(dic_info)
+                    try:
+                        baseCore.sendkafka(dic_info, topic)
+                        baseCore.r.sadd('REITs::' + webname, newsUrl)
+                        log.info(f'采集成功--{title}--{newsUrl}')
+                    except:
+                        for att_id in id_list:
+                            baseCore.deliteATT(att_id)
+                else:
+                    newssoup = policy.getrequest_soup(header, newsUrl)
+                    # print(newssoup)
+                    policy.paserUrl(newssoup, newsUrl)
+                    content_ = newssoup.find('div', class_='allZoom')
+                    # print(content_)
+                    # # 将链接替换为绝对路径
+                    contentWithTag = policy.paserUrl(content_, newsUrl)
+                    try:
+                        pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
+                        if '〔' in pubHao:
+                            pass
+                        else:
+                            pubHao = ''
+                    except:
+                        pubHao = ''
+                    # print(contentWithTag)
+                    content = contentWithTag.text
+                    fujian_list = contentWithTag.find_all('a')
+                    for fujian in fujian_list:
+                        try:
+                            file_href = fujian['href']
+                        except:
+                            continue
+                        file_name = fujian.text.strip(' ')
+                        category = os.path.splitext(file_href)[1]
+                        if category in file_name:
+                            pass
+                        else:
+                            file_name = file_name + category
+                        att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
+                        if att_id:
+                            id_list.append(att_id)
+                            fujian['href'] = full_path
+                    contentWithTag_str = str(contentWithTag)
+                    # print(contentWithTag_str)
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    dic_info = {
+                        'attachmentIds':id_list,
+                        'author': '',
+                        'content': content,
+                        'contentWithTag': contentWithTag_str,
+                        'deleteFlag': 0,
+                        'id': '',
+                        'title': title,
+                        'publishDate': publishDate,
+                        'origin': source,
+                        'sourceAddress': newsUrl,
+                        'writtenDate': None,
+                        'organ': '',
+                        'issuedNumber': pubHao,
+                        'summary': summary,
+                        'createDate': time_now,
+                        'sid': '1729035244826374145'
+                    }
+                    try:
+                        baseCore.sendkafka(dic_info, topic)
+                        baseCore.r.sadd('REITs::' + webname, newsUrl)
+                        log.info(f'采集成功--{title}--{newsUrl}')
+                    except:
+                        for att_id in id_list:
+                            baseCore.deliteATT(att_id)
+            except Exception as e:
+                log.info(f"error!!!{newsUrl}")
+                log.info(e)
+        log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

 # 河北省人民政府
 def hebei():
-    path = 'data/河北省人民政府'
-    if not os.path.exists(path):
-        os.makedirs(path)
+    # path = 'data/河北省人民政府'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
     num = 0
+    webname = '河北省人民政府'
     url = "https://www.hebei.gov.cn/search/pcRender?pageId=b97a38833f7343cebc31dec44544f684"
-    appNames = ['热点专题']
+    appNames = ['信息公开']
     for appName in appNames:
         payload = {'qAnd': ' ',
                    'qOr': ' ',
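
The new sse() flow uploads each attachment, sends the assembled record to Kafka, and only then marks the URL as collected; on a failed send it deletes what it uploaded. A compact sketch of that compensation pattern, with upload, delete, send, and mark_seen standing in for policy.attuributefile, baseCore.deliteATT, baseCore.sendkafka, and baseCore.r.sadd (hypothetical wrappers):

    def publish_with_rollback(record, attachments, upload, delete, send, mark_seen):
        att_ids = []
        for name, href in attachments:
            att_id, _full_path = upload(name, href)  # upload first, remember ids
            if att_id:
                att_ids.append(att_id)
        record['attachmentIds'] = att_ids
        try:
            send(record)                        # e.g. produce to Kafka
            mark_seen(record['sourceAddress'])  # dedup marker only on success
        except Exception:
            for att_id in att_ids:              # roll back orphaned uploads
                delete(att_id)
            raise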
@@ -661,7 +683,7 @@ def hebei():
         soup_ = policy.requestPost_html(headers, url, payload)
         # 第一次请求获取页数
         pages = int(soup_.find('span',class_='default-result-tolal-records').find('span').text)
-        DataList = []
+        # DataList = []
         for page in range(1, pages+1):
             payload_page = {
                 'qAnd': ' ',
@@ -692,130 +714,142 @@ def hebei():
             list_news = soup.find_all('div',class_='szf-data-tpl1-item')
             for news in list_news:
                 num += 1
+                id_list = []
                 title = news.find('h3').text
                 summary = news.find('div').find('p', class_='txtCon').text
-                publishDate = news.find('div').find('p', class_='dates').text.replace('发布日期:', '').replace('\n', '')
+                publishDate_ = news.find('div').find('p', class_='dates').text.replace('发布日期:', '').replace('\n', '')
+                date1 = datetime.strptime(publishDate_, "%Y年%m月%d日")
+                publishDate = date1.strftime("%Y-%m-%d")
                 news_href = news.find('div').find('p', class_='txtCon').find('a')['href']
                 # news_href = 'http://info.hebei.gov.cn//hbszfxxgk/6898876/7026469/7026511/7026506/7033297/index.html'
-                news_req = requests.get(news_href, headers)
-                news_soup = BeautifulSoup(news_req.content, 'html.parser')
-                writeDate = ''
-                pub_hao = ''
-                source = ''
-                content = ''
-                pub_origin = ''
                 try:
-                    content = news_soup.find('div', id='zoom').text
-                    contentWithTag = news_soup.find('div', id='zoom')
+                    # 根据链接判重
+                    is_member = baseCore.r.sismember('REITs::' + webname, news_href)
+                    if is_member:
+                        continue
+
+                    news_req = requests.get(news_href, headers)
+                    news_soup = BeautifulSoup(news_req.content, 'html.parser')
+                    policy.paserUrl(news_soup, news_href)
+                    writeDate = None
+                    pub_hao = ''
+                    source = ''
+                    content = ''
+                    pub_origin = ''
                     try:
-                        source = news_soup.find('div', class_='article_tit').find('li', class_='xl_laiyuan').text
-                    except:
-                        source = ''
-                    try:
-                        info_ = news_soup.find('div',class_='xxgk_bmxl')
-                        policy.deletetag(info_, 'strong')
-                        policy.deletek(info_)
-                        info_list = info_.find_all('td')
-                        pub_origin = info_list[1].text
-                        pub_hao = info_list[2].text
-                    except:
-                        # 处理空标签
-                        policy.deletek(news_soup)
-                        p_list = news_soup.find_all('p')
-                        for p in p_list:
-                            text_pubhao = p.text
-                            if '号' in text_pubhao and '〔' in text_pubhao:
-                                pattern = r"冀政办字〔\d+〕\d+号"
-                                match = re.search(pattern, text_pubhao)
-                                if match:
-                                    pub_hao = match.group(0)
-                                    break
-                                else:
-                                    continue
-                            writeDate_ = p.text
-                            pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
-                            match = re.search(pattern, writeDate_)
-                            if match:
-                                writeDate = match.group(0)
-                                break
-                            else:
-                                continue
-                except:
-                    try:
-                        contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
-                        content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
-                        info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
-                        policy.deletespan(info)
-                        pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
-                        pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
-                        writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
-                    except:
-                        pass
-                # 附件:
-                fu_jian_name = ''
-                fu_jian_href = ''
-                try:
-                    fujian_href = contentWithTag.find_all('a')
-                    policy.paserUrl(contentWithTag, news_href)
-                    for file_href_ in fujian_href:
-                        file_href = file_href_['href']
-                        file_name = file_href_.text
-                        category = os.path.splitext(file_href)[1]
-                        if category in file_name:
-                            pass
-                        else:
-                            file_name = file_name + category
-                        rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                        fu_jian_name += rename_file + '\n'
-                        fu_jian_href += file_href + '\n'
-                        policy.downloadfile(file_href, f'{path}/{rename_file}')
+                        content = news_soup.find('div', id='zoom').text
+                        contentWithTag = news_soup.find('div', id='zoom')
+                        try:
+                            source = news_soup.find('div', class_='article_tit').find('li', class_='xl_laiyuan').text
+                        except:
+                            source = ''
+                        try:
+                            info_ = news_soup.find('div',class_='xxgk_bmxl')
+                            policy.deletetag(info_, 'strong')
+                            policy.deletek(info_)
+                            info_list = info_.find_all('td')
+                            pub_origin = info_list[1].text
+                            pub_hao = info_list[2].text
+                        except:
+                            # 处理空标签
+                            policy.deletek(news_soup)
+                            p_list = news_soup.find_all('p')
+                            for p in p_list:
+                                text_pubhao = p.text
+                                if '号' in text_pubhao and '〔' in text_pubhao:
+                                    pattern = r"冀政办字〔\d+〕\d+号"
+                                    match = re.search(pattern, text_pubhao)
+                                    if match:
+                                        pub_hao = match.group(0)
+                                        break
+                                    else:
+                                        continue
+                                writeDate_ = p.text
+                                pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
+                                match = re.search(pattern, writeDate_)
+                                if match:
+                                    writeDate1 = match.group(0)
+                                    date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
+                                    writeDate = date2.strftime("%Y-%m-%d")
+                                    break
+                                else:
+                                    continue
+                    except:
+                        try:
+                            contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
+                            content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
+                            info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
+                            policy.deletespan(info)
+                            pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
+                            pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
+                            writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
+                        except:
+                            pass
+                    # 附件:
+                    try:
+                        fujian_href = contentWithTag.find_all('a')
+                        for file_href_ in fujian_href:
+                            file_href = file_href_['href']
+                            file_name = file_href_.text
+                            category = os.path.splitext(file_href)[1]
+                            if category in file_name:
+                                pass
+                            else:
+                                file_name = file_name + category
+                            att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
+                            if att_id:
+                                id_list.append(att_id)
+                                file_href_['href'] = full_path
+                        contentWithTag_str = str(contentWithTag)
+                    except Exception as e:
+                        contentWithTag_str = str(contentWithTag)
+
+                    if content == '':
+                        continue
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                    dic_info = {
+                        'attachmentIds':id_list,
+                        'author': '',
+                        'content': content,
+                        'contentWithTag': contentWithTag_str,
+                        'title': title.replace('\n', ''),
+                        'publishDate': publishDate,
+                        'origin': source,
+                        'sourceAddress': news_href,
+                        'writtenDate': writeDate,
+                        'organ': pub_origin,
+                        'issuedNumber': pub_hao,
+                        'summary': summary.replace('\n', ''),
+                        'createDate': time_now,
+                        'sid': '1729041576348274689',
+                    }
+                    # print(dic_info)
+                    try:
+                        baseCore.sendkafka(dic_info, topic)
+                        baseCore.r.sadd('REITs::' + webname, news_href)
+                        log.info(f'采集成功--{title}--{news_href}')
+                    except:
+                        for att_id in id_list:
+                            baseCore.deliteATT(att_id)
                 except Exception as e:
-                    pass
-                if content == '':
-                    continue
-                dic_info = {
-                    '序号': num,
-                    '标题': title.replace('\n', ''),
-                    '发布时间': publishDate,
-                    '来源': source,
-                    '原文链接': news_href,
-                    '发文时间': writeDate,
-                    '发文机构': pub_origin,
-                    '发文字号': pub_hao,
-                    '摘要': summary.replace('\n', ''),
-                    '正文': content,
-                    '附件名称': fu_jian_name,
-                    '附件链接': fu_jian_href,
-                }
-                print(dic_info)
-                DataList.append(dic_info)
-        sheet_name = appName
-        if sheet_name in wb.sheetnames:
-            log.info(f"{sheet_name}工作表已存在!")
-        else:
-            # 创建新工作表
-            wb.create_sheet(sheet_name)
-            print(f"{sheet_name}新工作表创建完成!")
-        # 保存Excel文件
-        wb.save(file_path)
-        baseCore.writerToExcel(DataList, file_path, sheet_name)
-        break
+                    log.info(f"error!!!{news_href}")
+                    log.info(e)
+            log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

 # 贵州省人民政府
 def guizhou():
     url = "https://www.guizhou.gov.cn/irs/front/search"
     num = 0
-    path = 'data/贵州省人民政府'
-    if not os.path.exists(path):
-        os.makedirs(path)
-    DataList = []
+    # path = 'data/贵州省人民政府'
+    # if not os.path.exists(path):
+    #     os.makedirs(path)
+    # DataList = []
+    webname = '贵州省人民政府'
     payload = "{\"tenantId\":\"186\",\"configTenantId\":\"\",\"tenantIds\":\"\",\"searchWord\":\"REITs\",\"historySearchWords\":[\"REITs\"],\"dataTypeId\":\"965\",\"orderBy\":\"related\",\"searchBy\":\"all\",\"appendixType\":\"\",\"granularity\":\"ALL\",\"beginDateTime\":\"\",\"endDateTime\":\"\",\"isSearchForced\":0,\"filters\":[],\"pageNo\":1,\"pageSize\":9}"
     headers = {
         'Accept': 'application/json, text/javascript, */*; q=0.01',
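
hebei() now skips already-collected URLs through a per-site Redis set before fetching, the same check sse() uses. The pattern in isolation (connection parameters illustrative):

    import redis

    r = redis.Redis(host="localhost", port=6379, db=0)

    def is_collected(webname, url):
        # One set per site, keyed 'REITs::<site>', e.g. 'REITs::河北省人民政府'.
        return r.sismember('REITs::' + webname, url)

    def mark_collected(webname, url):
        # Added only after the Kafka send succeeds, so failed items are retried.
        r.sadd('REITs::' + webname, url)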
@@ -841,77 +875,85 @@ def guizhou():
     result_list = jsonData['data']['middle']["list"]
     for datainfo in result_list:
         num += 1
+        id_list = []
         title = datainfo['title']
         publishDate = datainfo['time']
         source = datainfo['source']
         summary = datainfo['content']
         newsUrl = datainfo['url']
-        soup = policy.getrequest_soup(headers, newsUrl)
-        # print(soup)
-        pub_hao_ = soup.find('head').find('title').text
-        start_index = pub_hao_.find("(") + 1
-        end_index = pub_hao_.find(")")
-        pub_hao = pub_hao_[start_index:end_index]
-        print(pub_hao)
-        # 删除包含特定字段的标签
-        contentWithTag = soup.find('div', class_='Zoom Box')
-        policy.deletetext(contentWithTag, 'p', title)
-        policy.deletetext(contentWithTag, 'p', pub_hao)
-        content = contentWithTag.text
-        # 附件:
-        fu_jian_name = ''
-        fu_jian_href = ''
+        # 根据链接判重
+        is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
+        if is_member:
+            continue
         try:
-            fujian_href = contentWithTag.find_all('a')
-            policy.paserUrl(contentWithTag, newsUrl)
-            for file_href_ in fujian_href:
-                file_href = file_href_['href']
-                file_name = file_href_.text
-                category = os.path.splitext(file_href)[1]
-                if category in file_name:
-                    pass
-                else:
-                    file_name = file_name + category
-                rename_file = f'{str(num)}_{publishDate.replace("-", "")[:8]}_{file_name}'
-                fu_jian_name += rename_file + '\n'
-                fu_jian_href += file_href + '\n'
-                policy.downloadfile(file_href, f'{path}/{rename_file}')
-        except:
-            pass
-        dic_info = {
-            '序号': num,
-            '标题': title.replace('\n', ''),
-            '发布时间': publishDate,
-            '来源': source,
-            '原文链接': newsUrl,
-            '发文时间': publishDate,
-            '发文机构': '',
-            '发文字号': pub_hao,
-            '摘要': summary.replace('\n', '').replace('<em>', '').replace('</em>', ''),
-            '正文': content,
-            '附件名称': fu_jian_name,
-            '附件链接': fu_jian_href,
-        }
-        print(dic_info)
-        DataList.append(dic_info)
-    sheet_name = '贵州省人民政府政策文件'
-    if sheet_name in wb.sheetnames:
-        log.info(f"{sheet_name}工作表已存在!")
-    else:
-        # 创建新工作表
-        wb.create_sheet(sheet_name)
-        print(f"{sheet_name}新工作表创建完成!")
-    # 保存Excel文件
-    wb.save(file_path)
-    baseCore.writerToExcel(DataList, file_path, sheet_name)
+            soup = policy.getrequest_soup(headers, newsUrl)
+            # print(soup)
+            policy.paserUrl(soup, newsUrl)
+            pub_hao_ = soup.find('head').find('title').text
+            start_index = pub_hao_.find("(") + 1
+            end_index = pub_hao_.find(")")
+            pub_hao = pub_hao_[start_index:end_index]
+            # print(pub_hao)
+            # 删除包含特定字段的标签
+            contentWithTag = soup.find('div', class_='Zoom Box')
+            policy.deletetext(contentWithTag, 'p', title)
+            policy.deletetext(contentWithTag, 'p', pub_hao)
+            content = contentWithTag.text
+            # 附件:
+            try:
+                fujian_href = contentWithTag.find_all('a')
+                for file_href_ in fujian_href:
+                    file_href = file_href_['href']
+                    file_name = file_href_.text
+                    category = os.path.splitext(file_href)[1]
+                    if category in file_name:
+                        pass
+                    else:
+                        file_name = file_name + category
+                    att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                        file_href_['href'] = full_path
+            except:
+                pass
+            contentWithTag_str = str(contentWithTag)
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            dic_info = {
+                'attachmentIds':id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': contentWithTag_str,
+                'deleteFlag': 0,
+                'id': '',
+                'title': title.replace('\n', ''),
+                'publishDate': publishDate,
+                'source': source,
+                'sourceAddress': newsUrl,
+                'writtenDate': publishDate,
+                'organ': '',
+                'issuedNumber': pub_hao,
+                'summary': summary.replace('\n', '').replace('<em>', '').replace('</em>', ''),
+                'createDate': time_now,
+                'sid': '1729046185945182210',
+            }
+            # print(dic_info)
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, newsUrl)
+                log.info(f'采集成功--{title}--{newsUrl}')
+            except:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
+        except Exception as e:
+            log.info(f"error!!!{newsUrl}")
+            log.info(e)
+    log.info(f'====处理结束,已采集{num}条数据=================')
+    pass
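
Both hebei() and guizhou() now re-point each attachment link inside contentWithTag at the stored copy before serializing it. A sketch of that rewrite, where upload stands in for policy.attuributefile and returns (att_id, full_path):

    def rewrite_attachment_links(content_with_tag, upload):
        # Mutates the soup in place: every resolvable <a href> is redirected to
        # the archived file, and the uploaded attachment ids are collected.
        att_ids = []
        for a in content_with_tag.find_all('a'):
            href = a.get('href')
            if not href:
                continue
            att_id, full_path = upload(a.text.strip(), href)
            if att_id:
                att_ids.append(att_id)
                a['href'] = full_path
        return att_ids, str(content_with_tag)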
@@ -919,8 +961,9 @@ if __name__=="__main__":
     # wb = policy.createfile(file_path)
     # reform()
     # shenzhen()
-    zhengquanqihuo()
+    # zhengquanqihuo()
     # sse()
     # hebei()
-    # guizhou()
+    guizhou()
     # zhengquanqihuo()
\ No newline at end of file
@@ -32,7 +32,7 @@ class EsMethod(object):
     def __init__(self):
         # 创建Elasticsearch对象,并提供账号信息
         self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
-        self.index_name='researchreportdata'
+        self.index_name='policy'

     '''
     删除
@@ -52,7 +52,10 @@ if __name__ == "__main__":
         if item:
             log.info(item)
             id = item.decode()
-            esMethod.delete(esMethod.index_name,id)
+            try:
+                esMethod.delete(esMethod.index_name,id)
+            except:
+                continue
         else:
             log.info('已删除完毕')
             break
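
The bare except added above most plausibly guards against ids that have already been deleted. A tighter version of the same drain loop using elasticsearch-py's specific exception (the Redis key name is illustrative):

    from elasticsearch import NotFoundError

    def drain_delete(es, index_name, redis_conn, key='Policy:id'):
        # Pop ids from a Redis list and delete the matching docs until the list is empty.
        while True:
            item = redis_conn.lpop(key)
            if not item:
                break
            try:
                es.delete(index=index_name, id=item.decode())
            except NotFoundError:
                continue  # already gone; keep draining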
......
import json
import threading
import time
import uuid
import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# 使用连接池
# cnx_ = baseCore.pool_11.connection()
# cursor_ = cnx_.cursor()
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()

lock = threading.Lock()
pathType = 'QYNotice/'
taskType = '企业研报/东方财富网'

pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)


class EsMethod(object):
    def __init__(self):
        # 创建Elasticsearch对象,并提供账号信息
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'policy'

    def queryatt(self, index_name, pnum):
        # 按 sid 与创建时间窗口分页查询,一页 200 条
        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "sid.keyword": {
                                    "value": "1697458829758697473"
                                }
                            }
                        },
                        {
                            "range": {
                                "createDate": {
                                    "gte": "2023-11-28T10:00:00",
                                    "lte": "2023-11-29T10:00:00"
                                }
                            }
                        }
                    ]
                }
            },
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        filter_path = ['hits.hits._id',
                       'hits.total.value',
                       'hits.hits._source.title',
                       'hits.hits._source.sourceAddress',
                       'hits.hits._source.createDate',
                       ]  # 字段2
        result = self.es.search(index=index_name, doc_type='_doc',
                                filter_path=filter_path, body=body)
        # log.info(result)
        return result


def main(page, p, esMethod):
    redis_conn = redis.Redis(connection_pool=pool)
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']
    if total == 0:
        log.info('++++已没有数据+++++')
        return
    msglist = result['hits']['hits']
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        sourceAddress = mms['_source']['sourceAddress']
        log.info(f'{id}--{title}--{sourceAddress}---')
        if redis_conn.lrem('YanBao:id', 0, id) == 0:
            redis_conn.lpush('YanBao:id', id)
        else:
            continue


def run_threads(num_threads, esMethod, j):
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(5):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
\ No newline at end of file
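
This new script fans out 5 threads per round, each pulling one 200-document page, and advances the base offset by 1000 per round. The offsets it generates, traced in a few lines:

    # Round j=0:    threads read from offsets 0, 200, 400, 600, 800
    # Round j=1000: offsets 1000, 1200, 1400, 1600, 1800, and so on
    for j in (0, 1000, 2000, 3000, 4000):
        for i in range(5):
            print(f'page label {j + i + 1}, from={j + i * 200}')

Note the page label jumps from 5 to 1001 between rounds while the offsets stay contiguous; the label is only used in log lines.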
@@ -84,14 +84,15 @@ if __name__ == "__main__":
         id_ = redis_conn.lpop('YanBao:up')
         # id = "23112104300"
-        if id:
+        if id_:
             pass
         else:
             log.info('已无数据')
+            break
         id = id_.decode()
         result_ = esMethod.queryatt(index_name=esMethod.index_name, id=id)
         result = result_['hits']['hits'][0]
         num = 0
         publishDate = result['_source']['publishDate']
-        u_publishDate = '2023-08-31' #+ publishDate.split('T')[1]
+        u_publishDate = '2022-12-31' #+ publishDate.split('T')[1]
         esMethod.updateaunn(esMethod.index_name, str(id), u_publishDate)
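
With the corrected if id_: test and the added break, the loop now drains the Redis list and stops cleanly when it is empty. The intended shape of the loop, condensed (names from the script above; the ES lookup is elided):

    while True:
        raw = redis_conn.lpop('YanBao:up')
        if raw is None:          # lpop returns None when the list is empty
            log.info('已无数据')
            break
        doc_id = raw.decode()
        esMethod.updateaunn(esMethod.index_name, doc_id, u_publishDate)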
import os

@@ -323,7 +323,10 @@ def spider(browser, code, social_code, com_name):
     # span_tag = browser.find_element(By.CLASS_NAME,'mbox')
     span_tag = browser.find_element(By.XPATH, '//div[@class="mbox"]/span[2]')
     current_page = int(span_tag.text)
-    totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-1].text)
+    try:
+        totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-1].text)
+    except:
+        totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-2].text)
     if current_page < totalpage:
         # 说明还未到最后一页
         span_tag.find_element(By.XPATH, './following-sibling::a[1]').click()
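
The fallback above handles pagers whose last anchor is a control rather than a page number. An equivalent that scans for the last numeric anchor instead of hard-coding positions (helper name illustrative):

    def total_pages(soup):
        links = soup.find_all('div', class_='mbox')[-1].find_all('a')
        for a in reversed(links):
            text = a.text.strip()
            if text.isdigit():
                return int(text)
        return 1  # no numeric pager link: assume a single page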
......
@@ -508,7 +508,7 @@ class BaseCore:
             except:
                 time.sleep(3)
                 continue
-        page_size = 0
+        # page_size = 0
         for i in range(0, 3):
             try:
                 # name = file_name
@@ -522,23 +522,23 @@ class BaseCore:
                 time.sleep(3)
                 continue
-        if page_size < 1:
-            # pdf解析失败
-            # print(f'======pdf解析失败=====')
-            return retData
-        else:
-            try:
-                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                retData['state'] = True
-                retData['path'] = result['body']['objectUrl'].split('.com')[1]
-                retData['full_path'] = unquote(result['body']['objectUrl'])
-                retData['file_size'] = self.convert_size(file_size)
-                retData['create_time'] = time_now
-            except Exception as e:
-                print(f'error:{e}')
-                return retData
+        # if page_size < 1:
+        #     # pdf解析失败
+        #     # print(f'======pdf解析失败=====')
+        #     return retData
+        # else:
+        try:
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+            retData['state'] = True
+            retData['path'] = result['body']['objectUrl'].split('.com')[1]
+            retData['full_path'] = unquote(result['body']['objectUrl'])
+            retData['file_size'] = self.convert_size(file_size)
+            retData['create_time'] = time_now
+        except Exception as e:
+            print(f'error:{e}')
+            return retData
         return retData
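
The surrounding method attempts its upload and PDF-parse calls up to three times with a 3-second pause, the same shape each time. That control flow factored into a helper, as a sketch (names illustrative):

    import time

    def with_retries(fn, attempts=3, delay=3):
        for _ in range(attempts):
            try:
                return fn()
            except Exception:
                time.sleep(delay)
        return None  # caller checks for failure, mirroring retData['state']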
......
This source diff could not be displayed because it is too large.