提交 7bc6080a 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

...@@ -12,6 +12,7 @@ import time ...@@ -12,6 +12,7 @@ import time
from urllib.parse import urljoin from urllib.parse import urljoin
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
from datetime import datetime
baseCore = BaseCore() baseCore = BaseCore()
import requests import requests
import urllib3 import urllib3
...@@ -126,7 +127,16 @@ def spider(com_name,cik,up_okCount): ...@@ -126,7 +127,16 @@ def spider(com_name,cik,up_okCount):
accessionNumber = accessionNumber_list[i-1] accessionNumber = accessionNumber_list[i-1]
#发布日期 #发布日期
filingDate = filingDate_list[i-1] filingDate = filingDate_list[i-1]
year = filingDate[:4] # filingDate = '2022-04-01'
date = datetime.strptime(filingDate, '%Y-%m-%d') # 将日期字符串转换为datetime对象
month = date.month # 获取月份
if month <= 6:
year = date.year - 1
elif month > 6:
year = date.year
# year = filingDate[:4]
u_1 = cik u_1 = cik
# u_1 = '1395064' # u_1 = '1395064'
u_2 = accessionNumber.replace('-','') u_2 = accessionNumber.replace('-','')
......
import json import json
...@@ -73,7 +73,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code): ...@@ -73,7 +73,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
try: try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1] retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl']) retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size) retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now retData['create_time'] = time_now
...@@ -113,15 +113,17 @@ def tableUpdate(retData, com_name, year, pdf_name, num): ...@@ -113,15 +113,17 @@ def tableUpdate(retData, com_name, year, pdf_name, num):
# id = '' # id = ''
# return id # return id
# else: # else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' try:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, values = (
status, create_by, year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
create_time, page_size,path,'zzsn') status, create_by,
create_time, page_size,path,'zzsn')
cursor_.execute(Upsql, values) # 插入 cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交 cnx_.commit() # 提交
except Exception as e:
print(e)
log.info("更新完成:{}".format(Upsql)) log.info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, pdf_name, type_id,order_by) selects = secrchATT(item_id, pdf_name, type_id,order_by)
id = selects[0] id = selects[0]
...@@ -251,11 +253,11 @@ def ifInstert(short_name, social_code, pdf_url): ...@@ -251,11 +253,11 @@ def ifInstert(short_name, social_code, pdf_url):
else: else:
return ifexist return ifexist
def InsterInto(social_code, pdf_url,pub_time): def InsterInto(social_code, pdf_url,pub_time,pdf_name):
insert = False insert = False
# 信息插入数据库 # 信息插入数据库
try: try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())''' insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
list_info = [ list_info = [
social_code, social_code,
...@@ -263,6 +265,7 @@ def InsterInto(social_code, pdf_url,pub_time): ...@@ -263,6 +265,7 @@ def InsterInto(social_code, pdf_url,pub_time):
'证监会', '证监会',
'1', '1',
pub_time, pub_time,
pdf_name,
] ]
#144数据库 #144数据库
cursor.execute(insert_sql, tuple(list_info)) cursor.execute(insert_sql, tuple(list_info))
...@@ -412,10 +415,14 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -412,10 +415,14 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'') pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split('\',')[2].strip('\'')
#todo:判断发布日期是否是日期格式 #todo:判断发布日期是否是日期格式
pattern = r"^\d{4}-\d{2}-\d{2}$" # 正则表达式匹配YYYY-MM-DD格式的日期 pattern = r"^\d{4}-\d{2}-\d{2}$" # 正则表达式匹配YYYY-MM-DD格式的日期
date_time_pattern = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"
if re.match(pattern, pub_time): if re.match(pattern, pub_time):
pass pass
else: else:
continue if re.match(date_time_pattern, pub_time):
pass
else:
continue
year = pub_time[:4] year = pub_time[:4]
report_type = td_list[4].text.strip() report_type = td_list[4].text.strip()
...@@ -434,7 +441,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库 ...@@ -434,7 +441,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
#发送kafka成功之后 再插入数据库 #发送kafka成功之后 再插入数据库
insert = InsterInto(social_code,pdf_url,pub_time) insert = InsterInto(social_code,pdf_url,pub_time,name_pdf)
if insert: if insert:
log.info(f'===={social_code}========{name_pdf}=====插入库成功') log.info(f'===={social_code}========{name_pdf}=====插入库成功')
pass pass
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论