Commit fa46345c authored by 薛凌堃

11.29

Parent 7e42c8e8
 import os
 ...
@@ -508,9 +508,9 @@ def sse():
             is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
             if is_member:
                 continue
+            try:
                 if '.pdf' in newsUrl:
-                    # pass
                     content = ''
                     response = requests.get(newsUrl, timeout=20)
                     with fitz.open(stream=response.content, filetype='pdf') as doc:
@@ -519,27 +519,42 @@ def sse():
                     file_href = newsUrl
                     file_name = title
-                    policy.attuributefile(title, newsUrl, num, publishDate)
+                    att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                     dic_info = {
-                        '序号': num,
-                        '标题': title,
-                        '发布时间': publishDate,
-                        '来源': source,
-                        '原文链接': newsUrl,
-                        '发文时间': '',
-                        '发文机构': '',
-                        '发文字号': '',
-                        '摘要': summary,
-                        '正文': content,
-                        '附件名称': fu_jian_name,
-                        '附件链接': fu_jian_href,
+                        'attachmentIds': id_list,
+                        'author': '',
+                        'content': content,
+                        'contentWithTag': '',
+                        'deleteFlag': 0,
+                        'id': '',
+                        'title': title,
+                        'publishDate': publishDate,
+                        'origin': source,
+                        'sourceAddress': newsUrl,
+                        'writtenDate': None,
+                        'organ': '',
+                        'topicClassification': '',
+                        'issuedNumber': '',
+                        'summary': summary,
+                        'createDate': time_now,
+                        'sid': '1729035244826374145',
                     }
-                    DataList.append(dic_info)
+                    # DataList.append(dic_info)
+                    try:
+                        baseCore.sendkafka(dic_info, topic)
+                        baseCore.r.sadd('REITs::' + webname, newsUrl)
+                        log.info(f'采集成功--{title}--{newsUrl}')
+                    except:
+                        for att_id in id_list:
+                            baseCore.deliteATT(att_id)
                 else:
                     newssoup = policy.getrequest_soup(header, newsUrl)
                     # print(newssoup)
+                    policy.paserUrl(newssoup, newsUrl)
                     content_ = newssoup.find('div', class_='allZoom')
                     # print(content_)
                     # # 将链接替换为绝对路径
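Note on the sse() hunk above: the Excel row (DataList.append) is replaced by a Kafka record, and the URL is added to the Redis dedup set only after the send succeeds; on failure, the attachments already registered for the item are deleted so it can be retried cleanly. A minimal, self-contained sketch of that flow, assuming baseCore wraps calls equivalent to the redis-py and kafka-python ones below (broker and Redis addresses are placeholders, and delete_attachment stands in for baseCore.deliteATT):

    import json
    import fitz                        # PyMuPDF, as used in the '.pdf' branch above
    import redis
    import requests
    from kafka import KafkaProducer

    r = redis.Redis(host='localhost', port=6379)        # assumed Redis address
    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',             # assumed broker address
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))

    def pdf_text(url):
        # Download a PDF and concatenate the text of every page, as sse()
        # does with fitz.open(stream=response.content, filetype='pdf').
        response = requests.get(url, timeout=20)
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            return ''.join(page.get_text() for page in doc)

    def publish(dic_info, topic, webname, news_url, id_list, delete_attachment):
        # Send one record, then mark the URL as seen; roll back any attachments
        # already registered if the send fails (cf. baseCore.deliteATT above).
        try:
            producer.send(topic, dic_info).get(timeout=10)   # block until acked
            r.sadd('REITs::' + webname, news_url)            # dedup marker
        except Exception:
            for att_id in id_list:
                delete_attachment(att_id)
            raise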
@@ -757,7 +772,9 @@ def hebei():
                 pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
                 match = re.search(pattern, writeDate_)
                 if match:
-                    writeDate = match.group(0)
+                    writeDate1 = match.group(0)
+                    date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
+                    writeDate = date2.strftime("%Y-%m-%d")
                     break
                 else:
                     continue
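The hunk above stops storing the raw Chinese-formatted date and normalizes it to ISO form before it is written out as writtenDate. A minimal sketch of the conversion; normalize_write_date is a hypothetical helper, not part of the diff:

    import re
    from datetime import datetime

    def normalize_write_date(text):
        # Extract a date such as "2023年11月29日" and re-emit it as "2023-11-29".
        match = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", text)
        if not match:
            return None
        date2 = datetime.strptime(match.group(0), "%Y年%m月%d日")
        return date2.strftime("%Y-%m-%d")

    print(normalize_write_date("印发日期:2023年11月29日"))   # -> 2023-11-29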
@@ -773,11 +790,9 @@ def hebei():
             except:
                 pass
             # 附件:
-            fu_jian_name = ''
-            fu_jian_href = ''
             try:
                 fujian_href = contentWithTag.find_all('a')
-                policy.paserUrl(contentWithTag, news_href)
                 for file_href_ in fujian_href:
                     file_href = file_href_['href']
                     file_name = file_href_.text
@@ -786,45 +801,45 @@ def hebei():
                         pass
                     else:
                         file_name = file_name + category
-                    rename_file = f'{str(num)}_{publishDate}_{file_name}'
-                    fu_jian_name += rename_file + '\n'
-                    fu_jian_href += file_href + '\n'
-                    policy.downloadfile(file_href, f'{path}/{rename_file}')
+                    att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
+                    if att_id:
+                        id_list.append(att_id)
+                        file_href_['href'] = full_path
+                contentWithTag_str = str(contentWithTag)
             except Exception as e:
-                pass
+                contentWithTag_str = str(contentWithTag)
             if content == '':
                 continue
+            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             dic_info = {
-                '序号': num,
-                '标题': title.replace('\n', ''),
-                '发布时间': publishDate,
-                '来源': source,
-                '原文链接': news_href,
-                '发文时间': writeDate,
-                '发文机构': pub_origin,
-                '发文字号': pub_hao,
-                '摘要': summary.replace('\n', ''),
-                '正文': content,
-                '附件名称': fu_jian_name,
-                '附件链接': fu_jian_href,
+                'attachmentIds': id_list,
+                'author': '',
+                'content': content,
+                'contentWithTag': contentWithTag_str,
+                'title': title.replace('\n', ''),
+                'publishDate': publishDate,
+                'origin': source,
+                'sourceAddress': news_href,
+                'writtenDate': writeDate,
+                'organ': pub_origin,
+                'issuedNumber': pub_hao,
+                'summary': summary.replace('\n', ''),
+                'createDate': time_now,
+                'sid': '1729041576348274689',
             }
-            print(dic_info)
-            DataList.append(dic_info)
-
-            sheet_name = appName
-            if sheet_name in wb.sheetnames:
-                log.info(f"{sheet_name}工作表已存在!")
-            else:
-                # 创建新工作表
-                wb.create_sheet(sheet_name)
-                print(f"{sheet_name}新工作表创建完成!")
-            # 保存Excel文件
-            wb.save(file_path)
-            baseCore.writerToExcel(DataList, file_path, sheet_name)
-            break
+            # print(dic_info)
+            try:
+                baseCore.sendkafka(dic_info, topic)
+                baseCore.r.sadd('REITs::' + webname, news_href)
+                log.info(f'采集成功--{title}--{news_href}')
+            except:
+                for att_id in id_list:
+                    baseCore.deliteATT(att_id)
+        except Exception as e:
+            log.info(f"error!!!{news_href}")
+            log.info(e)
+        log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')

 # 贵州省人民政府
 def guizhou():
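Note on the attachment handling in hebei() above: rather than accumulating fu_jian_name/fu_jian_href strings and downloading files to disk, each <a> tag's href is rewritten in place to the stored path returned by policy.attuributefile, and the modified tree is serialized into contentWithTag. A minimal BeautifulSoup sketch of that rewrite; upload_attachment is a hypothetical stand-in for the project helper:

    from bs4 import BeautifulSoup

    def upload_attachment(name, href):
        # Hypothetical stand-in for policy.attuributefile();
        # returns (att_id, stored_path).
        return 1, 'https://files.example.com/' + name

    html = '<div><a href="https://example.com/plan.pdf">实施方案.pdf</a></div>'
    soup = BeautifulSoup(html, 'html.parser')
    id_list = []
    for a in soup.find_all('a'):
        att_id, full_path = upload_attachment(a.text, a['href'])
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path          # rewrite the link in place
    contentWithTag_str = str(soup)         # body now carries the stored links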
@@ -948,6 +963,7 @@ if __name__=="__main__":
     # shenzhen()
     # zhengquanqihuo()
     # sse()
-    # hebei()
+    hebei()
     # guizhou()
     # zhengquanqihuo()
\ No newline at end of file