提交 4abbca5f 作者: 薛凌堃

证监会年报文件上传至obs

上级 102b4511
import json
import json
......@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/证监会'
pathType = 'ZJHAnnualReport/'
def RequestUrl(url, payload, item_id, start_time):
# ip = get_proxy()[random.randint(0, 3)]
......@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time):
return soup
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, page_size):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'{name_pdf},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time, page_size)
cursor_.execute(Upsql, values) # 插入
cnx.commit() # 提交
print("更新完成:{}".format(Upsql))
# def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
# create_by, create_time, page_size):
#
# sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
# cursor_.execute(sel_sql, (item_id, year))
# selects = cursor_.fetchone()
# if selects:
# print(f'{name_pdf},{year}已存在')
#
# else:
# Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
#
# values = (
# year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
# create_by,
# create_time, page_size)
#
# cursor_.execute(Upsql, values) # 插入
# cnx.commit() # 提交
# print("更新完成:{}".format(Upsql))
# 采集信息
def SpiderByZJH(url, payload, dic_info, num, start_time):
......@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone()
if selects:
print(f'com_name:{short_name}、{year}已存在')
log.info(f'com_name:{short_name}、{year}已存在')
continue
else:
retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
retData = baseCore.uptoOBS(pdf_url,name_pdf, 1, social_code,pathType,taskType,start_time)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
#插入数据库获取att_id
num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
content = retData['content']
if retData['state']:
if att_id:
pass
else:
log.info(f'====pdf解析失败====')
return False
content = retData['content']
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': att_id,
......@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
'message': '操作成功',
'code': '200',
}
print(dic_result)
log.info(dic_result)
return True
except Exception as e:
dic_result = {
......@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result)
log.info(dic_result)
return False
else:
continue
......@@ -311,7 +317,8 @@ if __name__ == '__main__':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
count = dic_info[16]
log.info(f'====正在采集{social_code}=====')
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
url_parms = ['101111', '101811', '102611']
......@@ -322,7 +329,7 @@ if __name__ == '__main__':
dic_parms = getUrl(code, url_parms, Catagory2_parms)
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
end_time = time.time()
print(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
log.info(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论