提交 61beba6a 作者: 薛凌堃

9/8

上级 f5fc57ce
......@@ -130,56 +130,14 @@ def spider(com_name,cik):
soup = paserUrl(soup,news_url)
content = soup.text.strip()
# url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
# browser.get(url)
# time.sleep(3)
# page_source = browser.page_source
# soup = BeautifulSoup(page_source, 'html.parser')
# # print(soup)
# select_ann = soup.find_all('tr', class_='odd')
#
# for tr in select_ann:
# form_type = tr.find('td').text
# if form_type == '20-F':
# # print(tr)
# # 获取原文链接
# href = tr.find('a', class_='document-link')['href']
# print(href)
# if 'ix?doc' in href:
# href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
# else:
# href = 'https://www.sec.gov' + href
# print(href)
# # 获取发布时间
# a_list = tr.find_all('a')
# # print(a_list)
# for a in a_list:
# text = a.text
# match = re.search(pattern, text)
# if match:
# pub_date = match.group(0)
# # print(pub_date)
# year = pub_date[:4]
# break
# else:
# pub_date = ''
# year = ''
# # 根据年报的链接,请求年报内容,不需要上传文件服务器,直接发送kafka
# browser.get(href)
# time.sleep(3)
# i_page_source = browser.page_source
# i_soup = BeautifulSoup(i_page_source, 'html.parser')
# # print(i_page_source)
# content = i_soup.text
# 采集下来正文内容,直接传输kafka
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
title = f'{com_name}:{year}年年度报告'
log.info(f'---{title}----采集完成----发送数据----')
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': soup,
'contentWithTag': str(soup),
'createDate': time_now,
'deleteFlag': '0',
'id': '',
......@@ -218,6 +176,7 @@ def spider(com_name,cik):
'code': '204',
'e': e
}
log.info(f'{dic_result}---{e}')
def getrequest(social_code,url,headers,data):
......@@ -320,11 +279,13 @@ if __name__ == '__main__':
baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
continue
if cik is None:
exeception = 'cik为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
continue
cik = getCIK(social_code,code)
if cik == '':
exeception = 'cik为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
continue
# code = 'BP'
# com_name = '英国石油公司'
# cik = ''
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论