Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
c1b41f41
提交
c1b41f41
authored
9月 15, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
新三板基本信息
上级
2697722b
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
4 行增加
和
200 行删除
+4
-200
annualreport_US.py
comData/annualReport_US/annualreport_US.py
+0
-196
NQbase_info.py
comData/dfcfwGpdm/NQenterprise/NQbase_info.py
+3
-3
get_tokenCookies.py
comData/weixin_solo/get_tokenCookies.py
+1
-1
没有找到文件。
comData/annualReport_US/annualreport_US.py
deleted
100644 → 0
浏览文件 @
2697722b
"""
打开SEC网址——【FILINGS】——【Company Filing】——输入证券代码——选10-K和20-F为年报
"""
import
json
import
re
import
time
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
import
requests
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
selenium
import
webdriver
def spider(com_name, cik):
    """Scrape 20-F annual-report filings for one company from SEC EDGAR.

    Loads the EDGAR browse page for *cik* in the module-level selenium
    ``browser``, walks the filing table, resolves each 20-F document link
    to an absolute URL, extracts the filing date, fetches the document,
    and assembles a ``dic_news`` record (the kafka send is disabled, as
    in the original code).

    :param com_name: company display name, used in the report title
    :param cik: SEC CIK identifier of the company
    """
    url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
    browser.get(url)
    time.sleep(3)  # allow the JS-rendered filing table to load
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    # NOTE(review): only class="odd" rows are scanned, so every second
    # filing row (class="even") is skipped — confirm this is intended.
    select_ann = soup.find_all('tr', class_='odd')
    for tr in select_ann:
        form_type = tr.find('td').text
        if form_type != '20-F':
            continue
        # Resolve the filing's document link to an absolute URL.
        href = tr.find('a', class_='document-link')['href']
        print(href)
        if 'ix?doc' in href:
            # Inline-XBRL viewer link: strip the viewer prefix to get
            # the raw document path.
            href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
        else:
            href = 'https://www.sec.gov' + href
        print(href)
        # Extract the filing date (YYYY-MM-DD, module-level ``pattern``)
        # from the row's links. Fix: initialize defaults up front — the
        # original left pub_date/year undefined (UnboundLocalError) when
        # the row contained no <a> tags at all.
        pub_date = ''
        year = ''
        for a in tr.find_all('a'):
            match = re.search(pattern, a.text)
            if match:
                pub_date = match.group(0)
                year = pub_date[:4]
                break
        # Fetch the report body itself; content goes straight to kafka,
        # no file-server upload needed.
        browser.get(href)
        time.sleep(3)
        i_page_source = browser.page_source
        i_soup = BeautifulSoup(i_page_source, 'html.parser')
        content = i_soup.text
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        title = f'{com_name}:{year}年年度报告'
        dic_news = {
            'attachmentIds': '',
            'author': '',
            'content': content,
            'contentWithTag': i_page_source,
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': 'zh',
            'origin': 'SEC美国证券交易委员会',
            'publishDate': pub_date,
            'sid': '1684032033495392257',
            'sourceAddress': href,  # original document link
            'summary': '',
            'title': title,
            'type': 1,
            # NOTE(review): social_code is a module-level global set in
            # __main__, not a parameter — confirm this coupling.
            'socialCreditCode': social_code,
            'year': year
        }
        # Kafka delivery was disabled in the original and stays disabled:
        # producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        # producer.send("researchReportTopic",
        #               json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
def getrequest(social_code, url, headers, data):
    """POST to the SEC search endpoint and return the decoded JSON body.

    Used to look up a company's CIK from its ticker.

    :param social_code: unified social credit code, used only for
        failure logging via baseCore.recordLog
    :param url: endpoint URL
    :param headers: HTTP request headers
    :param data: JSON-encoded request payload string
    :return: parsed JSON dict on HTTP 200, '' on any other status
    """
    response = requests.post(url=url, headers=headers, data=data)
    # ,proxies=ip)  -- proxy support was disabled in the original
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        # Success: decode the JSON payload. (Removed a leftover bare
        # ``pass`` that followed this assignment in the original.)
        result = response.json()
    else:
        # Failure: log the error and fall back to an empty result.
        print('请求失败:', response.status_code, response.text)
        state = 0
        # NOTE(review): start_time and taskType are module-level globals
        # assigned in __main__ — this branch raises NameError if the
        # function is called before they exist. Confirm and consider
        # passing them as parameters.
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
        result = ''
    return result
# Simulated browser: module-level selenium Chrome instance shared by spider().
chromedriver = "D:/chrome/chromedriver.exe"  # local Windows chromedriver path
browser = webdriver.Chrome(chromedriver)
# Regex for filing dates of the form YYYY-MM-DD (used by spider()).
pattern = r"\d{4}-\d{2}-\d{2}"
if __name__ == '__main__':
    # Static headers captured from a browser session against the SEC
    # full-text-search API.
    headers = {
        'authority': 'efts.sec.gov',
        'method': 'POST',
        'path': '/LATEST/search-index',
        'scheme': 'https',
        'accept': '*/*',
        'accept-encoding': 'gzip deflate br',
        'accept-language': 'zh-CNzh;q=0.9en;q=0.8',
        'content-length': '34',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.sec.gov',
        'referer': 'https://www.sec.gov/',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    }
    url = 'https://efts.sec.gov/LATEST/search-index'
    num = 0
    # NOTE(review): label says 雪球网 (xueqiu) but the code scrapes SEC —
    # confirm the intended task-type string.
    taskType = '企业年报/雪球网'
    while True:
        start_time = time.time()
        # social_code is empty in this test harness; the disabled code in
        # the original pulled it (and the company name) from baseCore.
        social_code = ''
        code = 'BP'  # hard-coded ticker; POST below resolves it to a CIK
        payload = {"keysTyped": "BP", "narrow": True}
        data = json.dumps(payload)
        result = getrequest(social_code, url, headers, data)
        # Pick the hit whose ticker matches, and take its _id as the CIK.
        # Fix: initialize cik — the original raised NameError when no
        # ticker matched.
        cik = ''
        tickers = result['hits']['hits']
        for ticker in tickers:
            if ticker['_source']['tickers'] == code:
                cik = ticker['_id']
                print(cik)
                break
        if cik:
            # Fix: spider() requires (com_name, cik); the original call
            # spider(cik) raised TypeError. The ticker code stands in
            # for the company name until the baseCore lookup is restored.
            spider(code, cik)
        break
comData/dfcfwGpdm/NQenterprise/NQbase_info.py
浏览文件 @
c1b41f41
...
@@ -327,13 +327,13 @@ if __name__ == '__main__':
...
@@ -327,13 +327,13 @@ if __name__ == '__main__':
#从redis里拿数据
#从redis里拿数据
while
True
:
while
True
:
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
# TODO:需要隔两个小时左右抓包修改,token从数据库中获得
token
=
baseCore
.
GetToken
()
token
=
'027ea02da6d901a724ecca47930379b4'
list_weicha
=
[]
list_weicha
=
[]
list_all_info
=
[]
list_all_info
=
[]
name_list
=
[]
name_list
=
[]
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
com_code
=
baseCore
.
redicPullData
(
'EnterpriseIpo
qccid
:nq_gpdm'
)
com_code
=
baseCore
.
redicPullData
(
'EnterpriseIpo:nq_gpdm'
)
if
'.NQ'
in
com_code
:
if
'.NQ'
in
com_code
:
com_code1
=
com_code
com_code1
=
com_code
else
:
else
:
...
@@ -344,7 +344,7 @@ if __name__ == '__main__':
...
@@ -344,7 +344,7 @@ if __name__ == '__main__':
if
not
company_id
:
if
not
company_id
:
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
log
.
info
(
com_code
+
":企业ID获取失败===重新放入redis"
)
list_weicha
.
append
(
com_code
+
":企业ID获取失败"
)
list_weicha
.
append
(
com_code
+
":企业ID获取失败"
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo
qccid
:nq_gpdm'
,
com_code
)
baseCore
.
rePutIntoR
(
'EnterpriseIpo:nq_gpdm'
,
com_code
)
log
.
info
(
'-----已重新放入redis-----'
)
log
.
info
(
'-----已重新放入redis-----'
)
time
.
sleep
(
20
)
time
.
sleep
(
20
)
continue
continue
...
...
comData/weixin_solo/get_tokenCookies.py
浏览文件 @
c1b41f41
...
@@ -57,7 +57,7 @@ if __name__=="__main__":
...
@@ -57,7 +57,7 @@ if __name__=="__main__":
url
=
"https://mp.weixin.qq.com/"
url
=
"https://mp.weixin.qq.com/"
browser
.
get
(
url
)
browser
.
get
(
url
)
# 可改动
# 可改动
time
.
sleep
(
2
0
)
time
.
sleep
(
6
0
)
s
=
requests
.
session
()
s
=
requests
.
session
()
#获取到token和cookies
#获取到token和cookies
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论