Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
9c3eea0f
提交
9c3eea0f
authored
4月 15, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
美国证监会年报
上级
5a10db80
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
12 行增加
和
15 行删除
+12
-15
RedisPPData.py
base/RedisPPData.py
+1
-1
annualreportUS.py
comData/annualReport_US/annualreportUS.py
+9
-9
es_search_ct.py
comData/annualReport_US/es_search_ct.py
+2
-5
没有找到文件。
base/RedisPPData.py
浏览文件 @
9c3eea0f
...
@@ -423,7 +423,7 @@ def SEC_CIK():
...
@@ -423,7 +423,7 @@ def SEC_CIK():
for
item
in
cik_list
:
for
item
in
cik_list
:
# r.rpush('Sec_cik_US:uscik_baseinfo',item)
# r.rpush('Sec_cik_US:uscik_baseinfo',item)
r
.
rpush
(
'Sec_cik_US:uscik_annualReport'
,
item
)
r
.
rpush
(
'Sec_cik_US:uscik_annualReport'
,
item
)
closeSql
(
cnx
,
cursor
)
closeSql
(
cnx
,
cursor
)
#福布斯=====从数据库中读取信息放入redis
#福布斯=====从数据库中读取信息放入redis
...
...
comData/annualReport_US/annualreportUS.py
浏览文件 @
9c3eea0f
...
@@ -99,12 +99,12 @@ def spider(com_name,cik,up_okCount):
...
@@ -99,12 +99,12 @@ def spider(com_name,cik,up_okCount):
}
}
ip_dic
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
ip_dic
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
#正式
#正式
#
url_json = f'https://data.sec.gov/submissions/CIK{cik}.json'
url_json
=
f
'https://data.sec.gov/submissions/CIK{cik}.json'
#测试
#测试
url_json
=
'https://data.sec.gov/submissions/CIK0000104169.json'
#
url_json = 'https://data.sec.gov/submissions/CIK0000104169.json'
#解析页面
#解析页面
for
nnn
in
range
(
0
,
4
):
for
nnn
in
range
(
0
,
4
):
try
:
try
:
req
=
requests
.
get
(
url
=
url_json
,
headers
=
header
,
proxies
=
ip_dic
,
verify
=
False
,
timeout
=
30
)
req
=
requests
.
get
(
url
=
url_json
,
headers
=
header
,
proxies
=
ip_dic
,
verify
=
False
,
timeout
=
30
)
# req = requests.get(url=url_json, headers=header, verify=False, timeout=30)
# req = requests.get(url=url_json, headers=header, verify=False, timeout=30)
...
@@ -115,7 +115,7 @@ def spider(com_name,cik,up_okCount):
...
@@ -115,7 +115,7 @@ def spider(com_name,cik,up_okCount):
try
:
try
:
data
=
req
.
json
()
data
=
req
.
json
()
except
:
except
:
baseCore
.
rePutIntoR
(
'Sec_cik_US:uscik_annualReport'
,
social_code
)
baseCore
.
rePutIntoR
(
'Sec_cik_US:uscik_annualReport'
,
social_code
)
return
return
req
.
close
()
req
.
close
()
info
=
data
[
'filings'
][
'recent'
]
info
=
data
[
'filings'
][
'recent'
]
...
@@ -138,9 +138,9 @@ def spider(com_name,cik,up_okCount):
...
@@ -138,9 +138,9 @@ def spider(com_name,cik,up_okCount):
date
=
datetime
.
strptime
(
filingDate
,
'
%
Y-
%
m-
%
d'
)
# 将日期字符串转换为datetime对象
date
=
datetime
.
strptime
(
filingDate
,
'
%
Y-
%
m-
%
d'
)
# 将日期字符串转换为datetime对象
month
=
date
.
month
# 获取月份
month
=
date
.
month
# 获取月份
if
month
<
=
6
:
if
month
<
12
:
year
=
date
.
year
-
1
year
=
date
.
year
-
1
el
if
month
>
6
:
el
se
:
year
=
date
.
year
year
=
date
.
year
# year = filingDate[:4]
# year = filingDate[:4]
...
@@ -246,7 +246,7 @@ def spider(com_name,cik,up_okCount):
...
@@ -246,7 +246,7 @@ def spider(com_name,cik,up_okCount):
def
getrequest
(
social_code
,
url
,
headers
,
data
):
def
getrequest
(
social_code
,
url
,
headers
,
data
):
ip_dic
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
ip_dic
=
{
'https'
:
'http://127.0.0.1:1080'
,
'http'
:
'http://127.0.0.1:1080'
}
#通过请求post接口获取企业的CIK
#通过请求post接口获取企业的CIK
response
=
requests
.
post
(
url
=
url
,
headers
=
headers
,
data
=
data
,
proxies
=
ip_dic
)
response
=
requests
.
post
(
url
=
url
,
headers
=
headers
,
data
=
data
,
proxies
=
ip_dic
)
response
.
encoding
=
response
.
apparent_encoding
response
.
encoding
=
response
.
apparent_encoding
# 检查响应状态码
# 检查响应状态码
if
response
.
status_code
==
200
:
if
response
.
status_code
==
200
:
...
@@ -326,8 +326,8 @@ if __name__ == '__main__':
...
@@ -326,8 +326,8 @@ if __name__ == '__main__':
while
True
:
while
True
:
start_time
=
time
.
time
()
start_time
=
time
.
time
()
# 获取企业信息
# 获取企业信息
#
cik = baseCore.redicPullData('Sec_cik_US:uscik_annualReport')
cik
=
baseCore
.
redicPullData
(
'Sec_cik_US:uscik_annualReport'
)
cik
=
'789019'
#
cik = '789019'
data
=
fromcikgetinfo
(
cik
)
data
=
fromcikgetinfo
(
cik
)
com_name
=
data
[
2
]
com_name
=
data
[
2
]
com_code
=
data
[
3
]
com_code
=
data
[
3
]
...
...
comData/annualReport_US/es_search_ct.py
浏览文件 @
9c3eea0f
...
@@ -113,11 +113,8 @@ def main(page, p, esMethod):
...
@@ -113,11 +113,8 @@ def main(page, p, esMethod):
socialCode
=
mms
[
'_source'
][
'labels'
][
0
][
'relationId'
]
socialCode
=
mms
[
'_source'
][
'labels'
][
0
][
'relationId'
]
log
.
info
(
f
'{id}--{title}--{origin}--{sourceAddress}---'
)
log
.
info
(
f
'{id}--{title}--{origin}--{sourceAddress}---'
)
if
origin
==
'SEC美国证券交易委员会'
:
if
origin
==
'SEC美国证券交易委员会'
:
redis_conn
.
lrem
(
'NianbaoUS:id'
,
0
,
id
+
"|"
+
title
+
"|"
+
sourceAddress
+
"|"
+
year
+
"|"
+
socialCode
)
# redis_conn.lrem('NianbaoUS:id', 0, id+"|"+title+"|"+sourceAddress+"|"+year+"|"+socialCode)
redis_conn
.
lpush
(
'NianbaoUS:id'
,
id
+
"|"
+
title
+
"|"
+
sourceAddress
+
"|"
+
year
+
"|"
+
socialCode
)
redis_conn
.
sadd
(
'NianbaoUS:id'
,
id
+
"|"
+
title
+
"|"
+
sourceAddress
+
"|"
+
year
+
"|"
+
socialCode
)
else
:
redis_conn
.
lrem
(
f
'NianbaoOT_{origin}:id'
,
0
,
id
+
"|"
+
title
+
"|"
+
sourceAddress
+
"|"
+
year
+
"|"
+
socialCode
)
redis_conn
.
lpush
(
f
'NianbaoOT_{origin}:id'
,
id
+
"|"
+
title
+
"|"
+
sourceAddress
+
"|"
+
year
+
"|"
+
socialCode
)
def
run_threads
(
num_threads
,
esMethod
,
j
):
def
run_threads
(
num_threads
,
esMethod
,
j
):
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论