提交 7bc6080a 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

......@@ -12,6 +12,7 @@ import time
from urllib.parse import urljoin
from base.BaseCore import BaseCore
from datetime import datetime
baseCore = BaseCore()
import requests
import urllib3
......@@ -126,7 +127,16 @@ def spider(com_name,cik,up_okCount):
accessionNumber = accessionNumber_list[i-1]
#发布日期
filingDate = filingDate_list[i-1]
year = filingDate[:4]
# filingDate = '2022-04-01'
date = datetime.strptime(filingDate, '%Y-%m-%d') # 将日期字符串转换为datetime对象
month = date.month # 获取月份
if month <= 6:
year = date.year - 1
elif month > 6:
year = date.year
# year = filingDate[:4]
u_1 = cik
# u_1 = '1395064'
u_2 = accessionNumber.replace('-','')
......
import json
import json
......@@ -73,7 +73,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
......@@ -113,15 +113,17 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
# id = ''
# return id
# else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size,path,'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
try:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size,path,'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
except Exception as e:
print(e)
log.info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, pdf_name, type_id,order_by)
id = selects[0]
......@@ -251,11 +253,11 @@ def ifInstert(short_name, social_code, pdf_url):
else:
return ifexist
def InsterInto(social_code, pdf_url,pub_time):
def InsterInto(social_code, pdf_url,pub_time,pdf_name):
insert = False
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
list_info = [
social_code,
......@@ -263,6 +265,7 @@ def InsterInto(social_code, pdf_url,pub_time):
'证监会',
'1',
pub_time,
pdf_name,
]
#144数据库
cursor.execute(insert_sql, tuple(list_info))
......@@ -412,10 +415,14 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
#todo:判断发布日期是否是日期格式
pattern = r"^\d{4}-\d{2}-\d{2}$" # 正则表达式匹配YYYY-MM-DD格式的日期
date_time_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
if re.match(pattern, pub_time):
pass
else:
continue
if re.match(date_time_pattern, pub_time):
pass
else:
continue
year = pub_time[:4]
report_type = td_list[4].text.strip()
......@@ -434,7 +441,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
#发送kafka成功之后 再插入数据库
insert = InsterInto(social_code,pdf_url,pub_time)
insert = InsterInto(social_code,pdf_url,pub_time,name_pdf)
if insert:
log.info(f'===={social_code}========{name_pdf}=====插入库成功')
pass
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论