Commit fa46345c authored by 薛凌堃

11.29

Parent 7e42c8e8
 import os
 ...
@@ -508,9 +508,9 @@ def sse():
             is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
             if is_member:
                 continue
+            try:
                 if '.pdf' in newsUrl:
-                    # pass
                     content = ''
                     response = requests.get(newsUrl, timeout=20)
                     with fitz.open(stream=response.content, filetype='pdf') as doc:
@@ -519,27 +519,42 @@ def sse():
                     file_href = newsUrl
                     file_name = title
-                    policy.attuributefile(title, newsUrl, num, publishDate)
+                    att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     dic_info = {
-                        '序号': num,
-                        '标题': title,
-                        '发布时间': publishDate,
-                        '来源': source,
-                        '原文链接': newsUrl,
-                        '发文时间': '',
-                        '发文机构': '',
-                        '发文字号': '',
-                        '摘要': summary,
-                        '正文': content,
-                        '附件名称': fu_jian_name,
-                        '附件链接': fu_jian_href,
+                        'attachmentIds': id_list,
+                        'author': '',
+                        'content': content,
+                        'contentWithTag': '',
+                        'deleteFlag': 0,
+                        'id': '',
+                        'title': title,
+                        'publishDate': publishDate,
+                        'origin': source,
+                        'sourceAddress': newsUrl,
+                        'writtenDate': None,
+                        'organ': '',
+                        'topicClassification': '',
+                        'issuedNumber': '',
+                        'summary': summary,
+                        'createDate': time_now,
+                        'sid': '1729035244826374145',
                     }
-                    DataList.append(dic_info)
+                    # DataList.append(dic_info)
+                    try:
+                        baseCore.sendkafka(dic_info, topic)
+                        baseCore.r.sadd('REITs::' + webname, newsUrl)
+                        log.info(f'采集成功--{title}--{newsUrl}')
+                    except:
+                        for att_id in id_list:
+                            baseCore.deliteATT(att_id)
                 else:
                     newssoup = policy.getrequest_soup(header, newsUrl)
                     # print(newssoup)
+                    policy.paserUrl(newssoup, newsUrl)
                     content_ = newssoup.find('div', class_='allZoom')
                     # print(content_)
                     # # 将链接替换为绝对路径
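Note on the sse() hunk above: the Excel row (DataList.append) is replaced by a Kafka record, and the URL is added to the Redis dedup set only after the send succeeds; on failure, the attachments already registered for the item are deleted so it can be retried cleanly. A minimal, self-contained sketch of that flow, assuming baseCore wraps calls equivalent to the redis-py and kafka-python ones below (broker and Redis addresses are placeholders, and delete_attachment stands in for baseCore.deliteATT):

    import json
    import fitz                        # PyMuPDF, as used in the '.pdf' branch above
    import redis
    import requests
    from kafka import KafkaProducer

    r = redis.Redis(host='localhost', port=6379)        # assumed Redis address
    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',             # assumed broker address
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))

    def pdf_text(url):
        # Download a PDF and concatenate the text of every page, as sse()
        # does with fitz.open(stream=response.content, filetype='pdf').
        response = requests.get(url, timeout=20)
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            return ''.join(page.get_text() for page in doc)

    def publish(dic_info, topic, webname, news_url, id_list, delete_attachment):
        # Send one record, then mark the URL as seen; roll back any attachments
        # already registered if the send fails (cf. baseCore.deliteATT above).
        try:
            producer.send(topic, dic_info).get(timeout=10)   # block until acked
            r.sadd('REITs::' + webname, news_url)            # dedup marker
        except Exception:
            for att_id in id_list:
                delete_attachment(att_id)
            raise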
@@ -757,7 +772,9 @@ def hebei():
                 pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
                 match = re.search(pattern, writeDate_)
                 if match:
-                    writeDate = match.group(0)
+                    writeDate1 = match.group(0)
+                    date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
+                    writeDate = date2.strftime("%Y-%m-%d")
                     break
                 else:
                     continue
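The hunk above stops storing the raw Chinese-formatted date and normalizes it to ISO form before it is written out as writtenDate. A minimal sketch of the conversion; normalize_write_date is a hypothetical helper, not part of the diff:

    import re
    from datetime import datetime

    def normalize_write_date(text):
        # Extract a date such as "2023年11月29日" and re-emit it as "2023-11-29".
        match = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", text)
        if not match:
            return None
        date2 = datetime.strptime(match.group(0), "%Y年%m月%d日")
        return date2.strftime("%Y-%m-%d")

    print(normalize_write_date("印发日期:2023年11月29日"))   # -> 2023-11-29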
@@ -773,11 +790,9 @@ def hebei():
             except:
                 pass
             # 附件:
-            fu_jian_name = ''
-            fu_jian_href = ''
             try:
                 fujian_href = contentWithTag.find_all('a')
-                policy.paserUrl(contentWithTag, news_href)
                 for file_href_ in fujian_href:
                     file_href = file_href_['href']
                     file_name = file_href_.text
@@ -786,45 +801,45 @@ def hebei():
                         pass
                     else:
                         file_name = file_name + category
-                    rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                    fu_jian_name += rename_file + '\n'
-                    fu_jian_href += file_href + '\n'
-                    policy.downloadfile(file_href, f'{path}/{rename_file}')
+                    att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                        file_href_['href'] = full_path
+                contentWithTag_str = str(contentWithTag)
             except Exception as e:
-                pass
+                contentWithTag_str = str(contentWithTag)
             if content == '':
                 continue
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
-                '序号': num,
-                '标题': title.replace('\n', ''),
-                '发布时间': publishDate,
-                '来源': source,
-                '原文链接': news_href,
-                '发文时间': writeDate,
-                '发文机构': pub_origin,
-                '发文字号': pub_hao,
-                '摘要': summary.replace('\n', ''),
-                '正文': content,
-                '附件名称': fu_jian_name,
-                '附件链接': fu_jian_href,
+                'attachmentIds': id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': contentWithTag_str,
+                'title': title.replace('\n', ''),
+                'publishDate': publishDate,
+                'origin': source,
+                'sourceAddress': news_href,
+                'writtenDate': writeDate,
+                'organ': pub_origin,
+                'issuedNumber': pub_hao,
+                'summary': summary.replace('\n', ''),
+                'createDate': time_now,
+                'sid': '1729041576348274689',
             }
-            print(dic_info)
-            DataList.append(dic_info)
-
-            sheet_name = appName
-            if sheet_name in wb.sheetnames:
-                log.info(f"{sheet_name}工作表已存在!")
-            else:
-                # 创建新工作表
-                wb.create_sheet(sheet_name)
-                print(f"{sheet_name}新工作表创建完成!")
-            # 保存Excel文件
-            wb.save(file_path)
-            baseCore.writerToExcel(DataList, file_path, sheet_name)
-            break
+            # print(dic_info)
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, news_href)
+                log.info(f'采集成功--{title}--{news_href}')
+            except:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
+        except Exception as e:
+            log.info(f"error!!!{news_href}")
+            log.info(e)
+        log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

 # 贵州省人民政府
 def guizhou():
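Note on the attachment handling in hebei() above: rather than accumulating fu_jian_name/fu_jian_href strings and downloading files to disk, each <a> tag's href is rewritten in place to the stored path returned by policy.attuributefile, and the modified tree is serialized into contentWithTag. A minimal BeautifulSoup sketch of that rewrite; upload_attachment is a hypothetical stand-in for the project helper:

    from bs4 import BeautifulSoup

    def upload_attachment(name, href):
        # Hypothetical stand-in for policy.attuributefile();
        # returns (att_id, stored_path).
        return 1, 'https://files.example.com/' + name

    html = '<div><a href="https://example.com/plan.pdf">实施方案.pdf</a></div>'
    soup = BeautifulSoup(html, 'html.parser')
    id_list = []
    for a in soup.find_all('a'):
        att_id, full_path = upload_attachment(a.text, a['href'])
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path          # rewrite the link in place
    contentWithTag_str = str(soup)         # body now carries the stored links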
@@ -948,6 +963,7 @@ if __name__=="__main__":
     # shenzhen()
     # zhengquanqihuo()
     # sse()
-    # hebei()
+    hebei()
     # guizhou()
     # zhengquanqihuo()
\ No newline at end of file