Commit fa46345c Author: 薛凌堃

11.29

Parent 7e42c8e8
import os
@@ -508,52 +508,67 @@ def sse():
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
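# Aside: a minimal, self-contained sketch of the Redis-set dedup used above,
# assuming a local Redis reachable through redis-py; key and URL are placeholders.
import redis

r_demo = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
dedup_key = 'REITs::sse'                        # one set per source site
url = 'https://example.com/notice/1.pdf'
if not r_demo.sismember(dedup_key, url):        # skip URLs seen on earlier runs
    # ... collect the page here ...
    r_demo.sadd(dedup_key, url)                 # mark as collected only on success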
try:
if '.pdf' in newsUrl:
# pass
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
file_href = newsUrl
file_name = title
att_id, full_path = policy.attuributefile(title, newsUrl, num, publishDate)
if att_id:
id_list.append(att_id)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if '.pdf' in newsUrl:
content = ''
response = requests.get(newsUrl, timeout=20)
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
content += page.get_text()
file_href = newsUrl
file_name = title
policy.attuributefile(title, newsUrl, num, publishDate)
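# Aside: a self-contained sketch of the in-memory PDF text extraction above,
# using requests plus PyMuPDF (imported as fitz); the URL is a placeholder.
import fitz      # PyMuPDF
import requests

resp = requests.get('https://example.com/doc.pdf', timeout=20)
pdf_text = ''
with fitz.open(stream=resp.content, filetype='pdf') as doc:
    for page in doc:                            # iterate pages without touching disk
        pdf_text += page.get_text()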
dic_info = {
'序号': num,
'标题': title,
'发布时间': publishDate,
'来源': source,
'原文链接': newsUrl,
'发文时间': '',
'发文机构': '',
'发文字号': '',
'摘要': summary,
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
}
DataList.append(dic_info)
else:
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
content_ = newssoup.find('div', class_='allZoom')
# print(content_)
# replace links with absolute paths
contentWithTag = policy.paserUrl(content_, newsUrl)
try:
pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
if '〔' in pubHao:
pass
else:
dic_info = {
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': '',
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': source,
'sourceAddress': newsUrl,
'writtenDate': None,
'organ': '',
'topicClassification': '',
'issuedNumber': '',
'summary': summary,
'createDate': time_now,
'sid': '1729035244826374145',
}
# DataList.append(dic_info)
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
else:
newssoup = policy.getrequest_soup(header, newsUrl)
# print(newssoup)
policy.paserUrl(newssoup, newsUrl)
content_ = newssoup.find('div', class_='allZoom')
# print(content_)
# replace links with absolute paths
contentWithTag = policy.paserUrl(content_, newsUrl)
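# Aside: policy.paserUrl is project code; a plausible equivalent that rewrites
# relative hrefs to absolute URLs with BeautifulSoup and urljoin (illustrative HTML).
from urllib.parse import urljoin
from bs4 import BeautifulSoup

demo = BeautifulSoup('<div class="allZoom"><a href="/files/a.pdf">annex</a></div>',
                     'html.parser')
for a in demo.find_all('a', href=True):
    a['href'] = urljoin('https://example.com/news/1.html', a['href'])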
try:
pubHao = contentWithTag.find('p',style='text-align: center;').text.strip(' ')
if '〔' in pubHao:
pass
else:
pubHao = ''
except:
pubHao = ''
# print(contentWithTag)
content = contentWithTag.text
fujian_list = contentWithTag.find_all('a')
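# Aside: a minimal sketch of the send-to-Kafka-then-mark-collected pattern in
# sse() above, assuming kafka-python; the broker address and topic are placeholders.
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
record = {'title': 'demo', 'attachmentIds': []}
try:
    producer.send('policy-topic', record).get(timeout=10)  # block until acked
    # only after a successful send would the URL be added to the Redis set
except Exception:
    pass  # on failure the scraper deletes the already-uploaded attachments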
@@ -753,78 +768,78 @@ def hebei():
else:
continue
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
writeDate = match.group(0)
break
writeDate_ = p.text
pattern = r"\d{4}年\d{1,2}月\d{1,2}日"
match = re.search(pattern, writeDate_)
if match:
writeDate1 = match.group(0)
date2 = datetime.strptime(writeDate1, "%Y年%m月%d日")
writeDate = date2.strftime("%Y-%m-%d")
break
else:
continue
except:
try:
contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
policy.deletespan(info)
pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
except:
pass
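# Aside: a standalone sketch of the date normalisation in hebei() above — pull a
# "YYYY年M月D日" date out of free text and reformat it as YYYY-MM-DD.
import re
from datetime import datetime

sample = '印发日期:2023年11月29日'
m = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', sample)
if m:
    print(datetime.strptime(m.group(0), '%Y年%m月%d日').strftime('%Y-%m-%d'))  # 2023-11-29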
# Attachments:
try:
fujian_href = contentWithTag.find_all('a')
for file_href_ in fujian_href:
file_href = file_href_['href']
file_name = file_href_.text
category = os.path.splitext(file_href)[1]
if category in file_name:
pass
else:
continue
except:
file_name = file_name + category
att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
if att_id:
id_list.append(att_id)
file_href_['href'] = full_path
contentWithTag_str = str(contentWithTag)
except Exception as e:
contentWithTag_str = str(contentWithTag)
if content == '':
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info = {
'attachmentIds':id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'title': title.replace('\n', ''),
'publishDate': publishDate,
'origin': source,
'sourceAddress': news_href,
'writtenDate': writeDate,
'organ': pub_origin,
'issuedNumber': pub_hao,
'summary': summary.replace('\n', ''),
'createDate': time_now,
'sid': '1729041576348274689',
}
# print(dic_info)
try:
contentWithTag = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr')
content = news_soup.find('div',class_='xxgk_gfxwjk_xqy-wznr').text
info = news_soup.find('div', class_='xxgk_gfxwjk-xqy-touxx')
policy.deletespan(info)
pub_hao = info.find('p', class_='xxgk_gfxwjk-xqy-touxx4').text
pub_origin = info.find('p', class_='xxgk_gfxwjk-xqy-touxx3').text
writeDate = info.find('p', class_='xxgk_gfxwjk-xqy-touxx5').text
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, news_href)
log.info(f'采集成功--{title}--{news_href}')
except:
pass
# Attachments:
fu_jian_name = ''
fu_jian_href = ''
try:
fujian_href = contentWithTag.find_all('a')
policy.paserUrl(contentWithTag, news_href)
for file_href_ in fujian_href:
file_href = file_href_['href']
file_name = file_href_.text
category = os.path.splitext(file_href)[1]
if category in file_name:
pass
else:
file_name = file_name + category
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
for att_id in id_list:
baseCore.deliteATT(att_id)
except Exception as e:
pass
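# Aside: a minimal sketch of the attachment-naming rule above — append the
# extension taken from the URL when the link text does not already contain it.
import os

num_demo, publish_demo = 1, '2023-11-29'        # illustrative values
file_href = 'https://example.com/files/annex.pdf'
file_name = 'annex'
ext = os.path.splitext(file_href)[1]            # '.pdf'
if ext not in file_name:
    file_name += ext
print(f'{num_demo}_{publish_demo}_{file_name}')  # 1_2023-11-29_annex.pdf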
if content == '':
continue
dic_info = {
'序号': num,
'标题': title.replace('\n', ''),
'发布时间': publishDate,
'来源': source,
'原文链接': news_href,
'发文时间': writeDate,
'发文机构': pub_origin,
'发文字号': pub_hao,
'摘要': summary.replace('\n', ''),
'正文': content,
'附件名称': fu_jian_name,
'附件链接': fu_jian_href,
}
print(dic_info)
DataList.append(dic_info)
sheet_name = appName
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# create a new worksheet
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# save the Excel file
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
log.info(f"error!!!{news_href}")
log.info(e)
log.info(f'====第{page}页====处理结束,已采集{num}条数据=================')
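# Aside: a self-contained sketch of the ensure-worksheet-then-save step above,
# using openpyxl directly; file and sheet names are placeholders.
from openpyxl import Workbook, load_workbook

xlsx_path, sheet_demo = 'reits.xlsx', 'hebei'
try:
    wb_demo = load_workbook(xlsx_path)
except FileNotFoundError:
    wb_demo = Workbook()
if sheet_demo not in wb_demo.sheetnames:
    wb_demo.create_sheet(sheet_demo)            # create the sheet on first use
wb_demo[sheet_demo].append(['标题', '发布时间', '原文链接'])  # one row of data
wb_demo.save(xlsx_path)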
# Guizhou Provincial People's Government
def guizhou():
@@ -948,6 +963,7 @@ if __name__=="__main__":
# shenzhen()
# zhengquanqihuo()
# sse()
# hebei()
hebei()
# guizhou()
# zhengquanqihuo()
\ No newline at end of file