提交 4abbca5f 作者: 薛凌堃

证监会年报文件上传至obs

上级 102b4511
import json import json
...@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf') ...@@ -21,6 +21,7 @@ tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf) client = Fdfs_client(tracker_conf)
taskType = '企业年报/证监会' taskType = '企业年报/证监会'
pathType = 'ZJHAnnualReport/'
def RequestUrl(url, payload, item_id, start_time): def RequestUrl(url, payload, item_id, start_time):
# ip = get_proxy()[random.randint(0, 3)] # ip = get_proxy()[random.randint(0, 3)]
...@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time): ...@@ -43,26 +44,26 @@ def RequestUrl(url, payload, item_id, start_time):
return soup return soup
def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, # def tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, page_size): # create_by, create_time, page_size):
#
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1''' # sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
cursor_.execute(sel_sql, (item_id, year)) # cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone() # selects = cursor_.fetchone()
if selects: # if selects:
print(f'{name_pdf},{year}已存在') # print(f'{name_pdf},{year}已存在')
#
else: # else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''' # Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
#
values = ( # values = (
year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status, # year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, # create_by,
create_time, page_size) # create_time, page_size)
#
cursor_.execute(Upsql, values) # 插入 # cursor_.execute(Upsql, values) # 插入
cnx.commit() # 提交 # cnx.commit() # 提交
print("更新完成:{}".format(Upsql)) # print("更新完成:{}".format(Upsql))
# 采集信息 # 采集信息
def SpiderByZJH(url, payload, dic_info, num, start_time): def SpiderByZJH(url, payload, dic_info, num, start_time):
...@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -121,19 +122,24 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
cursor_.execute(sel_sql, (item_id, year)) cursor_.execute(sel_sql, (item_id, year))
selects = cursor_.fetchone() selects = cursor_.fetchone()
if selects: if selects:
print(f'com_name:{short_name}、{year}已存在') log.info(f'com_name:{short_name}、{year}已存在')
continue continue
else: else:
retData = baseCore.upLoadToServe(pdf_url, 1, social_code) retData = baseCore.uptoOBS(pdf_url,name_pdf, 1, social_code,pathType,taskType,start_time)
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
#插入数据库获取att_id #插入数据库获取att_id
num = num + 1 num = num + 1
att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num) att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
content = retData['content'] if att_id:
if retData['state']:
pass pass
else: else:
log.info(f'====pdf解析失败====')
return False return False
content = retData['content']
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = { dic_news = {
'attachmentIds': att_id, 'attachmentIds': att_id,
...@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -169,7 +175,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
'message': '操作成功', 'message': '操作成功',
'code': '200', 'code': '200',
} }
print(dic_result) log.info(dic_result)
return True return True
except Exception as e: except Exception as e:
dic_result = { dic_result = {
...@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -181,7 +187,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败') baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
print(dic_result) log.info(dic_result)
return False return False
else: else:
continue continue
...@@ -311,7 +317,8 @@ if __name__ == '__main__': ...@@ -311,7 +317,8 @@ if __name__ == '__main__':
time.sleep(20) time.sleep(20)
continue continue
dic_info = baseCore.getInfomation(social_code) dic_info = baseCore.getInfomation(social_code)
count = dic_info[15] count = dic_info[16]
log.info(f'====正在采集{social_code}=====')
# 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html # 沪市http://eid.csrc.gov.cn/101111/index.html 深市http://eid.csrc.gov.cn/101811/index.html 北交所http://eid.csrc.gov.cn/102611/index.html
# url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html # url 变量 翻页 栏目 http://eid.csrc.gov.cn/101811/index_3_f.html
url_parms = ['101111', '101811', '102611'] url_parms = ['101111', '101811', '102611']
...@@ -322,7 +329,7 @@ if __name__ == '__main__': ...@@ -322,7 +329,7 @@ if __name__ == '__main__':
dic_parms = getUrl(code, url_parms, Catagory2_parms) dic_parms = getUrl(code, url_parms, Catagory2_parms)
SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time) SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, num, start_time)
end_time = time.time() end_time = time.time()
print(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}') log.info(f'{dic_info[4]} ---- 该企业耗时 ---- {end_time - start_time}')
count += 1 count += 1
runType = 'AnnualReportCount' runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count) baseCore.updateRun(social_code, runType, count)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论