提交 bcb04605 作者: 薛凌堃

证监会年报:title 加上 .pdf 后缀

上级 9fa0c999
import json import json
...@@ -99,9 +99,19 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -99,9 +99,19 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
pdf_url_info = td_list[2] pdf_url_info = td_list[2]
# print(pdf_url) # print(pdf_url)
pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'') pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'') pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
# todo:判断发布日期是否是日期格式
pattern = r"^\d{4}-\d{2}-\d{2}$" # 正则表达式匹配YYYY-MM-DD格式的日期
date_time_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
if re.match(pattern, pub_time):
pass
else:
if re.match(date_time_pattern, pub_time):
pass
else:
continue
# print(name) # print(name)
report_type = td_list[4].text.strip() report_type = td_list[4].text.strip()
# print(report_type) # print(report_type)
...@@ -154,7 +164,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -154,7 +164,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
'origin': '证监会', 'origin': '证监会',
'publishDate': pub_time, 'publishDate': pub_time,
'sid': '1684032033495392257', 'sid': '1684032033495392257',
'sourceAddress': '', # 原文链接 'sourceAddress': pdf_url, # 原文链接
'summary': '', 'summary': '',
'title': name_pdf, 'title': name_pdf,
'type': 1, 'type': 1,
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论