Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
7baf2215
提交
7baf2215
authored
9月 08, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
9/8
上级
780a2b5e
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
203 行增加
和
2 行删除
+203
-2
annualreportUS.py
comData/annualReport_XQW/annualreportUS.py
+201
-0
雪球网-年报.py
comData/annualReport_ZJH/雪球网-年报.py
+2
-2
没有找到文件。
comData/annualReport_XQW/annualreportUS.py
0 → 100644
浏览文件 @
7baf2215
"""
打开SEC网址——【FILINGS】——【Company Filing】——输入证券代码——选10-K和20-F为年报
1. 根据美股代码 拿到企业对应的cik
2. 根据cik 拼接链接拿到json数据
3. 遍历json数组文件 拼接详情链接
4. 解析详情文章 通过kafka发送数据
"""
import
json
import
re
import
time
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
import
requests
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
selenium
import
webdriver
def _filing_url(href):
    """Turn a document link taken from the filing table into an absolute URL."""
    if 'ix?doc' in href:
        # Viewer-style link: keep only the document path that follows the
        # '/ix?doc=/' prefix and re-root it on www.sec.gov.
        return 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
    return 'https://www.sec.gov' + href


def _publish_date(anchors):
    """Return ``(pub_date, year)`` from the first anchor whose text has a date.

    The date is located with the module-level ``pattern`` regex. When no
    anchor matches (or ``anchors`` is empty) both values are ``''``.
    """
    for anchor in anchors:
        match = re.search(pattern, anchor.text)
        if match:
            pub_date = match.group(0)
            return pub_date, pub_date[:4]
    return '', ''


def spider(com_name, cik, form_types=('20-F',)):
    """Open a company's EDGAR filing list and build a record per annual report.

    For every table row (``<tr class="odd">``) whose first cell text is one of
    ``form_types``, the linked document is loaded in the shared ``browser`` and
    a ``dic_news`` payload is assembled from its text and HTML.

    Args:
        com_name: Company name, used only to build the record title.
        cik: Company CIK, interpolated into the EDGAR browse URL.
        form_types: Form-type strings to treat as annual reports. Defaults to
            ``('20-F',)``, the original behavior; pass ``('10-K', '20-F')`` to
            also pick up 10-K filings.

    Relies on the module-level names ``browser``, ``pattern`` and
    ``social_code``.

    NOTE(review): the Kafka send at the end is commented out, so the payload
    is currently built and then discarded.
    NOTE(review): only rows with class ``odd`` are scanned; confirm whether the
    table also has rows with another class that should be included.
    """
    url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
    browser.get(url)
    # Fixed pause before reading page_source (presumably to let the page render).
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # print(soup)

    for tr in soup.find_all('tr', class_='odd'):
        first_cell = tr.find('td')
        # Skip rows without cells instead of failing on None.
        if first_cell is None or first_cell.text not in form_types:
            continue

        # Link to the original document; skip rows that do not carry one.
        link = tr.find('a', class_='document-link')
        if link is None or not link.get('href'):
            continue
        href = link['href']
        print(href)
        href = _filing_url(href)
        print(href)

        # Publication date: always (re)computed per row, so a row without a
        # date gets '' rather than an unbound name or the previous row's value.
        pub_date, year = _publish_date(tr.find_all('a'))

        # Load the report itself; the content is not uploaded to the file
        # server, it is meant to go straight to Kafka.
        browser.get(href)
        time.sleep(3)
        i_page_source = browser.page_source
        i_soup = BeautifulSoup(i_page_source, 'html.parser')
        # print(i_page_source)
        content = i_soup.text

        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        title = f'{com_name}:{year}年年度报告'
        dic_news = {
            'attachmentIds': '',
            'author': '',
            'content': content,
            'contentWithTag': i_page_source,
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': 'zh',
            'origin': 'SEC美国证券交易委员会',
            'publishDate': pub_date,
            'sid': '1684032033495392257',
            'sourceAddress': href,  # link to the original document
            'summary': '',
            'title': title,
            'type': 1,
            'socialCreditCode': social_code,
            'year': year
        }
        # print(dic_news)
        # Disabled: send the record through Kafka.
        # try:
        #     producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        #     kafka_result = producer.send("researchReportTopic",
        #                                  json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
        #
        #     print(kafka_result.get(timeout=10))
        #
        #     dic_result = {
        #         'success': 'ture',
        #         'message': '操作成功',
        #         'code': '200',
        #     }
        #     print(dic_result)
        #
        # except Exception as e:
        #     dic_result = {
        #         'success': 'false',
        #         'message': '操作失败',
        #         'code': '204',
        #         'e': e
        #     }
def getrequest(social_code, url, headers, data, timeout=30):
    """POST ``data`` to ``url`` and return the decoded JSON response.

    Used to query the search endpoint that yields a company's CIK.

    Args:
        social_code: Identifier written to the failure log entry.
        url: Endpoint to POST to.
        headers: Request headers passed through to ``requests.post``.
        data: Request body passed through to ``requests.post``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the crawler forever.

    Returns:
        The parsed JSON body when the server answers with HTTP 200; otherwise
        the empty string ``''`` (non-200 status, network error, or a body that
        is not valid JSON). Callers must check for the empty value.

    Relies on the module-level names ``baseCore``, ``start_time`` and
    ``taskType`` when recording a failure.
    """
    try:
        response = requests.post(url=url, headers=headers, data=data, timeout=timeout)
        # ,proxies=ip)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            # Success: hand back the decoded payload.
            # print(response.text)
            return response.json()
        # Failure reported by the server: print status and body.
        print('请求失败:', response.status_code, response.text)
    except (requests.RequestException, ValueError) as e:
        # Connection problems/timeouts, or a 200 response whose body is not
        # JSON, follow the same failure path instead of crashing the run.
        print('请求失败:', e)

    state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
    return ''
# Browser automation: one Chrome instance is created at module load and shared
# by spider(); the driver executable is taken from a fixed local path.
chromedriver = "D:/chrome/chromedriver.exe"
browser = webdriver.Chrome(chromedriver)

# Regex for a YYYY-MM-DD date; spider() uses it to find the publication date.
pattern = r"\d{4}-\d{2}-\d{2}"
if __name__ == '__main__':
    # Script entry point: resolve the CIK for one ticker through the search
    # endpoint, then crawl that company's annual reports with spider().
    #
    # NOTE(review): these headers look like a browser capture ('authority',
    # 'method', 'path', 'scheme', a fixed 'content-length'); several values
    # appear to have lost their commas. They are kept exactly as found --
    # confirm which ones the endpoint actually needs.
    headers = {
        'authority': 'efts.sec.gov',
        'method': 'POST',
        'path': '/LATEST/search-index',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip deflate br',
        'accept-language': 'zh-CNzh;q=0.9en;q=0.8',
        'content-length': '34',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.sec.gov',
        'referer': 'https://www.sec.gov/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
    url = 'https://efts.sec.gov/LATEST/search-index'
    num = 0
    taskType = '企业年报/SEC'
    try:
        while True:
            start_time = time.time()
            social_code = ''
            # Disabled: per-company lookup that normally supplies the ticker
            # (code) and company name (com_name).
            # if not social_code:
            #     time.sleep(20)
            #     continue
            # if social_code == 'None':
            #     time.sleep(20)
            #     continue
            # if social_code == '':
            #     time.sleep(20)
            #     continue
            # dic_info = baseCore.getInfomation(social_code)
            # count = dic_info[15]
            # code = dic_info[3]
            # com_name = dic_info[4]
            # if code is None:
            #     exeception = '股票代码为空'
            #     state = 0
            #     takeTime = baseCore.getTimeCost(start_time, time.time())
            #     baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
            #     continue
            code = 'BP'  # "MNSO"
            # com_name was only assigned in the disabled lookup above, which
            # made the spider() call below fail with NameError. Fall back to
            # the ticker until the lookup is restored.
            # TODO: restore the real company name.
            com_name = code

            # POST request that returns candidate companies for the ticker.
            # payload = {"keysTyped":f"{code}","narrow":flag}
            payload = {"keysTyped": code, "narrow": True}
            data = json.dumps(payload)
            result = getrequest(social_code, url, headers, data)
            # print(result)
            if not result:
                # getrequest() returns '' on failure (already logged there);
                # indexing it would raise TypeError.
                break

            # Pick the hit whose ticker equals the requested code.
            cik = None
            for ticker in result['hits']['hits']:
                if ticker['_source']['tickers'] == code:
                    cik = ticker['_id']
                    print(cik)
                    break
            if cik is None:
                # No hit matched: stop instead of calling spider() with an
                # unbound cik.
                print(f'No CIK found for ticker {code}')
                break

            spider(com_name, cik)
            # Single pass for now; the loop exists for the disabled queue logic.
            break
    finally:
        # Always shut the shared Chrome instance down so no driver/browser
        # process is left behind, even when the crawl fails.
        browser.quit()
comData/annualReport_ZJH/雪球网-年报.py
浏览文件 @
7baf2215
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
...
@@ -240,7 +240,7 @@ if __name__ == '__main__':
...
@@ -240,7 +240,7 @@ if __name__ == '__main__':
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code
=
'91330
20071331910XJ
'
social_code
=
'91330
60072360502XQ
'
if
not
social_code
:
if
not
social_code
:
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论