提交 fa46345c 作者: 薛凌堃

11.29

上级 7e42c8e8
import os
import os
......@@ -508,9 +508,9 @@ def sse():
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
try:
if '.pdf' in newsUrl:
# pass
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
......@@ -519,27 +519,42 @@ def sse():
file_href = newsUrl
file_name = title
policy.attuributefile(title, newsUrl, num, publishDate)
att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
if att_id:
id_list.append(att_id)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate,
'来源': source,
'原文链接': newsUrl,
'发文时间': '',
'发文机构': '',
'发文字号': '',
'摘要': summary,
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': '',
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': None,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'summary': summary,
'createDate': time_now,
'sid': '1729035244826374145',
}
DataList.append(dic_info)
# DataList.append(dic_info)
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
else:
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
policy.paserUrl(newssoup, newsUrl)
content_ = newssoup.find('div', class_='allZoom')
# print(content_)
# # 将链接替换为绝对路径
......@@ -757,7 +772,9 @@ def hebei():
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
writeDate = match.group(0)
writeDate1 = match.group(0)
date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
writeDate = date2.strftime("%Y-%m-%d")
break
else:
continue
......@@ -773,11 +790,9 @@ def hebei():
except:
pass
# 附件:
fu_jian_name = ''
fu_jian_href = ''
try:
fujian_href = contentWithTag.find_all('a')
policy.paserUrl(contentWithTag, news_href)
for file_href_ in fujian_href:
file_href = file_href_['href']
file_name = file_href_.text
......@@ -786,45 +801,45 @@ def hebei():
pass
else:
file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
if att_id:
id_list.append(att_id)
file_href_['href'] = full_path
contentWithTag_str = str(contentWithTag)
except Exception as e:
pass
contentWithTag_str = str(contentWithTag)
if content == '':
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'序号': num,
'标题': title.replace('\n', ''),
'发布时间': publishDate,
'来源': source,
'原文链接': news_href,
'发文时间': writeDate,
'发文机构': pub_origin,
'发文字号': pub_hao,
'摘要': summary.replace('\n', ''),
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'title': title.replace('\n', ''),
'publishDate': publishDate,
'origin': source,
'sourceAddress': news_href,
'writtenDate': writeDate,
'organ': pub_origin,
'issuedNumber': pub_hao,
'summary': summary.replace('\n', ''),
'createDate': time_now,
'sid': '1729041576348274689',
}
print(dic_info)
DataList.append(dic_info)
sheet_name = appName
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# 创建新工作表
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# 保存Excel文件
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
# print(dic_info)
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, news_href)
log.info(f'采集成功--{title}--{news_href}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
except Exception as e:
log.info(f"error!!!{news_href}")
log.info(e)
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
# 贵州省人民政府
def guizhou():
......@@ -948,6 +963,7 @@ if __name__=="__main__":
# shenzhen()
# zhengquanqihuo()
# sse()
# hebei()
hebei()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论