Commit fa46345c Author: 薛凌堃

11.29

Parent 7e42c8e8
import os
@@ -508,52 +508,67 @@ def sse():
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
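# Aside: a minimal, self-contained sketch of the Redis-set dedup used above,
# assuming a local Redis reachable through redis-py; key and URL are placeholders.
import redis

r_demo = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
dedup_key = 'REITs::sse'                        # one set per source site
url = 'https://example.com/notice/1.pdf'
if not r_demo.sismember(dedup_key, url):        # skip URLs seen on earlier runs
    # ... collect the page here ...
    r_demo.sadd(dedup_key, url)                 # mark as collected only on success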
try:
if '.pdf' in newsUrl:
# pass
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
file_href = newsUrl
file_name = title
att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
if att_id:
id_list.append(att_id)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if '.pdf' in newsUrl:
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
file_href = newsUrl
file_name = title
policy.attuributefile(title, newsUrl, num, publishDate)
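# Aside: a self-contained sketch of the in-memory PDF text extraction above,
# using requests plus PyMuPDF (imported as fitz); the URL is a placeholder.
import fitz      # PyMuPDF
import requests

resp = requests.get('https://example.com/doc.pdf', timeout=20)
pdf_text = ''
with fitz.open(stream=resp.content, filetype='pdf') as doc:
    for page in doc:                            # iterate pages without touching disk
        pdf_text += page.get_text()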
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate,
'来源': source,
'原文链接': newsUrl,
'发文时间': '',
'发文机构': '',
'发文字号': '',
'摘要': summary,
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
}
DataList.append(dic_info)
else:
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
content_ = newssoup.find('div', class_='allZoom')
# print(content_)
# replace links with absolute paths
contentWithTag = policy.paserUrl(content_, newsUrl)
try:
pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
if '〔' in pubHao:
pass
else:
dic_info = {
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': '',
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': None,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'summary': summary,
'createDate': time_now,
'sid': '1729035244826374145',
}
# DataList.append(dic_info)
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
else:
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
policy.paserUrl(newssoup, newsUrl)
content_ = newssoup.find('div', class_='allZoom')
# print(content_)
# replace links with absolute paths
contentWithTag = policy.paserUrl(content_, newsUrl)
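# Aside: policy.paserUrl is project code; a plausible equivalent that rewrites
# relative hrefs to absolute URLs with BeautifulSoup and urljoin (illustrative HTML).
from urllib.parse import urljoin
from bs4 import BeautifulSoup

demo = BeautifulSoup('<div class="allZoom"><a href="/files/a.pdf">annex</a></div>',
                     'html.parser')
for a in demo.find_all('a', href=True):
    a['href'] = urljoin('https://example.com/news/1.html', a['href'])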
try:
pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
if '〔' in pubHao:
pass
else:
pubHao = ''
except:
pubHao = ''
# print(contentWithTag)
content = contentWithTag.text
fujian_list = contentWithTag.find_all('a')
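# Aside: a minimal sketch of the send-to-Kafka-then-mark-collected pattern in
# sse() above, assuming kafka-python; the broker address and topic are placeholders.
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
record = {'title': 'demo', 'attachmentIds': []}
try:
    producer.send('policy-topic', record).get(timeout=10)  # block until acked
    # only after a successful send would the URL be added to the Redis set
except Exception:
    pass  # on failure the scraper deletes the already-uploaded attachments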
@@ -753,78 +768,78 @@ def hebei():
else:
continue
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
writeDate = match.group(0)
break
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
writeDate1 = match.group(0)
date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
writeDate = date2.strftime("%Y-%m-%d")
break
else:
continue
except:
try:
contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
policy.deletespan(info)
pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
except:
pass
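# Aside: a standalone sketch of the date normalisation in hebei() above — pull a
# "YYYY年M月D日" date out of free text and reformat it as YYYY-MM-DD.
import re
from datetime import datetime

sample = '印发日期:2023年11月29日'
m = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', sample)
if m:
    print(datetime.strptime(m.group(0), '%Y年%m月%d日').strftime('%Y-%m-%d'))  # 2023-11-29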
# Attachments:
try:
fujian_href = contentWithTag.find_all('a')
for file_href_ in fujian_href:
file_href = file_href_['href']
file_name = file_href_.text
category = os.path.splitext(file_href)[1]
if category in file_name:
pass
else:
continue
except:
file_name = file_name + category
att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
if att_id:
id_list.append(att_id)
file_href_['href'] = full_path
contentWithTag_str = str(contentWithTag)
except Exception as e:
contentWithTag_str = str(contentWithTag)
if content == '':
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'title': title.replace('\n', ''),
'publishDate': publishDate,
'origin': source,
'sourceAddress': news_href,
'writtenDate': writeDate,
'organ': pub_origin,
'issuedNumber': pub_hao,
'summary': summary.replace('\n', ''),
'createDate': time_now,
'sid': '1729041576348274689',
}
# print(dic_info)
try:
contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
policy.deletespan(info)
pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, news_href)
log.info(f'采集成功--{title}--{news_href}')
except:
pass
# Attachments:
fu_jian_name = ''
fu_jian_href = ''
try:
fujian_href = contentWithTag.find_all('a')
policy.paserUrl(contentWithTag, news_href)
for file_href_ in fujian_href:
file_href = file_href_['href']
file_name = file_href_.text
category = os.path.splitext(file_href)[1]
if category in file_name:
pass
else:
file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
for att_id in id_list:
baseCore.deliteATT(att_id)
except Exception as e:
pass
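# Aside: a minimal sketch of the attachment-naming rule above — append the
# extension taken from the URL when the link text does not already contain it.
import os

num_demo, publish_demo = 1, '2023-11-29'        # illustrative values
file_href = 'https://example.com/files/annex.pdf'
file_name = 'annex'
ext = os.path.splitext(file_href)[1]            # '.pdf'
if ext not in file_name:
    file_name += ext
print(f'{num_demo}_{publish_demo}_{file_name}')  # 1_2023-11-29_annex.pdf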
if content == '':
continue
dic_info = {
'序号': num,
'标题': title.replace('\n', ''),
'发布时间': publishDate,
'来源': source,
'原文链接': news_href,
'发文时间': writeDate,
'发文机构': pub_origin,
'发文字号': pub_hao,
'摘要': summary.replace('\n', ''),
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
}
print(dic_info)
DataList.append(dic_info)
sheet_name = appName
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
log.info(f"error!!!{news_href}")
log.info(e)
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
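# Aside: a self-contained sketch of the ensure-worksheet-then-save step above,
# using openpyxl directly; file and sheet names are placeholders.
from openpyxl import Workbook, load_workbook

xlsx_path, sheet_demo = 'reits.xlsx', 'hebei'
try:
    wb_demo = load_workbook(xlsx_path)
except FileNotFoundError:
    wb_demo = Workbook()
if sheet_demo not in wb_demo.sheetnames:
    wb_demo.create_sheet(sheet_demo)            # create the sheet on first use
wb_demo[sheet_demo].append(['标题', '发布时间', '原文链接'])  # one row of data
wb_demo.save(xlsx_path)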
# Guizhou Provincial People's Government
def guizhou():
@@ -948,6 +963,7 @@ if __name__=="__main__":
# shenzhen()
# zhengquanqihuo()
# sse()
# hebei()
hebei()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file