丁双波 / zzsn_spider · Commits

Commit 7dcd1a4c, authored Sep 08, 2023 by 薛凌堃
Commit message: 9/8
Parent commit: 7baf2215
Showing 2 changed files with 270 additions and 136 deletions:

base/RedisPPData.py: +44 -28
comData/annualReport_XQW/annualreportUS.py: +226 -108
base/RedisPPData.py
@@ -116,33 +116,6 @@ def NoticeEnterprise_task():
     print('定时采集异常', e)
     pass
-# Corporate annual reports
-def AnnualEnterprise():
-    cnx, cursor = connectSql()
-    # Fetch domestic companies
-    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
-    cursor.execute(gn_query)
-    gn_result = cursor.fetchall()
-    gn_social_list = [item[0] for item in gn_result]
-    print('=======')
-    for item in gn_social_list:
-        r.rpush('AnnualEnterprise:gnqy_socialCode', item)
-    closeSql(cnx, cursor)
-
-# Scheduled task for corporate annual reports
-def AnnualEnterprise_task():
-    # Instantiate a scheduler
-    scheduler = BlockingScheduler()
-    # Run once a year
-    scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
-    try:
-        # Run once before the schedule starts
-        AnnualEnterprise()
-        scheduler.start()
-    except Exception as e:
-        print('定时采集异常', e)
-        pass
 # Basic company information
 def BaseInfoEnterprise():
     cnx, cursor = connectSql()
@@ -245,6 +218,33 @@ def weixin_task():
     print('定时采集异常', e)
     pass
+# Corporate annual reports (CSRC)
+def AnnualEnterprise():
+    cnx, cursor = connectSql()
+    # Fetch domestic companies
+    gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
+    cursor.execute(gn_query)
+    gn_result = cursor.fetchall()
+    gn_social_list = [item[0] for item in gn_result]
+    print('=======')
+    for item in gn_social_list:
+        r.rpush('AnnualEnterprise:gnqy_socialCode', item)
+    closeSql(cnx, cursor)
+
+# Scheduled task for corporate annual reports
+def AnnualEnterprise_task():
+    # Instantiate a scheduler
+    scheduler = BlockingScheduler()
+    # Run once a year
+    scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
+    try:
+        # Run once before the schedule starts
+        AnnualEnterprise()
+        scheduler.start()
+    except Exception as e:
+        print('定时采集异常', e)
+        pass
 # Corporate annual reports (Xueqiu)
 def AnnualEnterpriseXueQ():
     cnx, cursor = connectSql()
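With APScheduler's cron trigger, only the fields that are passed are constrained, so the `second='*/10'` trigger in AnnualEnterprise_task above fires the job every 10 seconds rather than once a year as the comment says. A minimal sketch of a trigger that actually runs yearly, assuming the same BlockingScheduler and the AnnualEnterprise callable from this file (the date fields below are illustrative, not taken from this commit):

from apscheduler.schedulers.blocking import BlockingScheduler

# AnnualEnterprise is assumed to be importable from this module
scheduler = BlockingScheduler()
# Fire at 00:00 on January 1st every year (illustrative values, not this project's settings)
scheduler.add_job(AnnualEnterprise, 'cron', month=1, day=1, hour=0, minute=0)
scheduler.start()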
@@ -271,6 +271,21 @@ def AnnualEnterpriseXueQ_task():
     print('定时采集异常', e)
     pass
+# Corporate annual reports (US Securities and Exchange Commission)
+def AnnualEnterpriseUS():
+    cnx, cursor = connectSql()
+    # Fetch US-listed companies
+    us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null"
+    # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
+    #ZZSN22080900000025
+    cursor.execute(us_query)
+    us_result = cursor.fetchall()
+    us_social_list = [item[0] for item in us_result]
+    print('=======')
+    for item in us_social_list:
+        r.rpush('AnnualEnterprise:usqy_socialCode', item)
+    closeSql(cnx, cursor)
 # Basic info for foreign companies; ids are pushed into redis
 def BaseInfoEnterpriseAbroad():
     cnx, cursor = connectSql()
@@ -383,7 +398,8 @@ if __name__ == "__main__":
     # NewsEnterprise()
     # BaseInfoEnterprise()
     # FBS()
-    MengZhi()
+    # MengZhi()
+    AnnualEnterpriseUS()
     # NoticeEnterprise_task()
     # AnnualEnterprise_task()
     # NoticeEnterprise()
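Taken together, the changes in base/RedisPPData.py add AnnualEnterpriseUS() as the producer that fills the AnnualEnterprise:usqy_socialCode Redis list via r.rpush, and wire it into __main__. The matching consumer in this repo is baseCore.redicPullData, whose implementation is not part of this diff; a minimal redis-py sketch of such a pull, with placeholder connection settings rather than the project's real ones:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # placeholder host/port/db

def pull_social_code(key='AnnualEnterprise:usqy_socialCode'):
    # Pop one social credit code from the head of the list; None means the queue is empty
    item = r.lpop(key)
    return item.decode('utf-8') if item else None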
comData/annualReport_XQW/annualreportUS.py
@@ -9,54 +9,165 @@
 import json
 import re
 import time
+from urllib.parse import urljoin
 from base.BaseCore import BaseCore
 baseCore = BaseCore()
 import requests
+import urllib3
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
-from selenium import webdriver
+# from selenium import webdriver
+
+def paserUrl(html, listurl):
+    # soup = BeautifulSoup(html, 'html.parser')
+    # Get all <a> and <img> tags
+    links = html.find_all(['a', 'img'])
+    # Walk the tags and convert relative URLs into absolute URLs
+    for link in links:
+        if 'href' in link.attrs:
+            link['href'] = urljoin(listurl, link['href'])
+        elif 'src' in link.attrs:
+            link['src'] = urljoin(listurl, link['src'])
+    return html
+
+def get_news(news_url, ip_dic):
+    header = {
+        'Host': 'www.sec.gov',
+        'Connection': 'keep-alive',
+        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-User': '?1',
+        'Sec-Fetch-Dest': 'document',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832zq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
+    }
+    response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
+    # response = requests.get(url=news_url, verify=False, proxies=ip_dic, timeout=30)
+    if response.status_code == 200:
+        # Request succeeded; process the response data
+        # print(response.text)
+        result = BeautifulSoup(response.content, 'html.parser')
+        # print(result)
+        pass
+    else:
+        # Request failed; log the error
+        print('请求失败:', response.status_code, response.text)
+        state = 0
+        takeTime = baseCore.getTimeCost(start_time, time.time())
+        baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
+        result = ''
+    return result
+
 def spider(com_name, cik):
-    url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
-    browser.get(url)
-    time.sleep(3)
-    page_source = browser.page_source
-    soup = BeautifulSoup(page_source, 'html.parser')
-    # print(soup)
-    select_ann = soup.find_all('tr', class_='odd')
-
-    for tr in select_ann:
-        form_type = tr.find('td').text
-        if form_type == '20-F':
-            # print(tr)
-            # Get the source link
-            href = tr.find('a', class_='document-link')['href']
-            print(href)
-            if 'ix?doc' in href:
-                href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
-            else:
-                href = 'https://www.sec.gov' + href
-            print(href)
-            # Get the publish date
-            a_list = tr.find_all('a')
-            # print(a_list)
-            for a in a_list:
-                text = a.text
-                match = re.search(pattern, text)
-                if match:
-                    pub_date = match.group(0)
-                    # print(pub_date)
-                    year = pub_date[:4]
-                    break
-                else:
-                    pub_date = ''
-                    year = ''
-            # Request the annual report from its link; no file-server upload needed, send to kafka directly
-            browser.get(href)
-            time.sleep(3)
-            i_page_source = browser.page_source
-            i_soup = BeautifulSoup(i_page_source, 'html.parser')
-            # print(i_page_source)
-            content = i_soup.text
+    header = {
+        'Host': 'data.sec.gov',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Cache-Control': 'no-cache',
+        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-User': '?1',
+        'Sec-Fetch-Dest': 'document',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cookie': '_4c_=%7B%22_4c_s_%22%3A%22fVPLbtswEPwVg2fLJimSonwrUqDoIS1apO0xYMi1JcSRBIqx4hr%2B9%2B5act6pLiKHM8PR7urAhgoathKmzFWhpZFG2Dm7hX3PVgcW60CvHVsx4Zz2XOiMB6czJUXIrHZlBrAuxFob73PP5uwBvQoupNJalIXUxznz3eRxYL4NQF7lQtgFz9Y9KtJfRJTluOxiG%2B59uk77jmgD3Mz6cIsHAXa1h%2BuhDqkifW7ME1pBvakSwoWxhHaRKLga6ia0w2vVhD6qjCoRvYnt0AMpL6rY3sFMCCK3WAb256SgrBHWEOOJhru%2BThSzB7%2FYtLsJwNKNWDZiv2tCw%2Bzq4ifi354hPy6%2BX05QRxXOcbFtvduSKTZlzr58uv719fMpellqxctcLk6dMqUUVLD7uMXTKqWuXy2XwzAspjBLCBsXlz246Ktx7du7zjX7EUItNHRpFwMFB5%2FqthmD4%2F4q1psNxEtIVYsTgHsXamK4LVWYiBEC9PWGYgYqI%2B5uU9s9wsdxFjCtNsIYrqXEXifMa43i9BzH7z6NRv7E1kZyYXnxlj32KKPaQvMfqX0rDbA%2BD7IFl6t1YTLBwWaqUDIrC5Nn%2FMaALVTgXjj20lNK855nc7Z8Voun%2BbcoKxTy6i5NxKl3luc8z19yCSHu2dKxd8%2FjcLY6HyhFP%2BzDK4RqG1%2Ff%2BgH1ePwH%22%7D; _ga_300V1CHKH1=GS1.1.1694142118.3.0.1694142118.0.0.0; _ga=GA1.2.1399540932.1693469210; _gid=GA1.2.1824845345.1694142136; ak_bmsc=CB437E1B69906A01E58692EFBAA8A225~000000000000000000000000000000~YAAQ8BQgFyY6AFaKAQAAbKy9chWzUG2FvPYSvQ1oaw2RdgKemipNBxwFJPC71bps8Pe4B7LG80Yn8Gg+yVD84WX1d+lVZqdaPr8pbsd3N8NWzwiWUcN7PSoKK1Ej/G2WgOv8Nl0s2E8E8x/5XVYtGyFwKSl5mUGNsfsL4WYI++6imjaYHtyTDxtmKhvnWHMwXCMiJgqvRCr9yf5CeXKJuhpRrSZV/GZa8qlDr5PmF1LPu2RKv1jNRfLqq+BKaO4jKN8ETA0RUxhvXEpI1cc0bxFp9t/mD6iTVhzbxJ17qiBn9DLPcXoX1yheRONu9M//SyeHfETezU2RagRHONIPZXB2oN/8Qlu+Rjz9NIZk532RTj0qCSRu48EH8nmYFcwvGXb8YNhotygum3P+ELZSCzlgolFBQp+qciKBTsuJ3JL99/HMDHO9OyheN5yw6RH/hu6/xVW95acmV925q/yjoXITR+mcZWkrH4iRncHGQmwWQR+d+pNqeBYUNNm2; bm_sv=2C2708DF01ED851C6C481514DDA7F381~YAAQ8BQgF409AFaKAQAAFsm9chW4u/u6J8XhmAzFpGSqZr1ktVU8veuhu+tJ9h+G3Lf52nquY6mUDlkG1ZBMRAkAB3WCPBGWiKSbGR6sB29QOE9LOosBZKzL742Z5a0k6rOWyoByvjl75i7j68RIqGt0h87YwwLLqnH6gx6H0uqCkg+J405BKwHjvVhnQOF3eAD5CCbaJY5GQdS8bKDjOaX7e1WVr5aqdlNdEciyrs9hxhPZSPLLXuCFIDH+~1'
+    }
+    ip_dic = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
+    # Production
+    url_json = f'https://data.sec.gov/submissions/CIK{cik}.json'
+    # Test
+    # url_json = 'https://data.sec.gov/submissions/CIK0001395064.json'
+    # Parse the page
+    req = requests.get(url=url_json, headers=header, proxies=ip_dic, verify=False, timeout=30)
+    # ,proxies=ip_dic)
+    data = req.json()
+    info = data['filings']['recent']
+    form_type_list = info['form']
+    accessionNumber_list = info['accessionNumber']
+    primaryDocument_list = info['primaryDocument']
+    filingDate_list = info['filingDate']
+    i = 0
+    for form in form_type_list:
+        i += 1
+        if form == '10-K' or form == '20-F':
+            print(form, i)
+            accessionNumber = accessionNumber_list[i]
+            # Filing (publish) date
+            filingDate = filingDate_list[i]
+            year = filingDate[:4]
+            # u_1 = cik
+            u_1 = '1395064'
+            u_2 = accessionNumber.replace('-', '')
+            u_3 = primaryDocument_list[i]
+            news_url = 'https://www.sec.gov/Archives/edgar/data/' + u_1 + '/' + u_2 + '/' + u_3
+            soup = get_news(news_url, ip_dic)
+            if soup:
+                pass
+            else:
+                continue
+            # Convert relative paths to absolute paths
+            soup = paserUrl(soup, news_url)
+            content = soup.text.strip()
+            # url = f'https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude'
+            # browser.get(url)
+            # time.sleep(3)
+            # page_source = browser.page_source
+            # soup = BeautifulSoup(page_source, 'html.parser')
+            # # print(soup)
+            # select_ann = soup.find_all('tr', class_='odd')
+            #
+            # for tr in select_ann:
+            #     form_type = tr.find('td').text
+            #     if form_type == '20-F':
+            #         # print(tr)
+            #         # Get the source link
+            #         href = tr.find('a', class_='document-link')['href']
+            #         print(href)
+            #         if 'ix?doc' in href:
+            #             href = 'https://www.sec.gov/' + href.split('/ix?doc=/')[1]
+            #         else:
+            #             href = 'https://www.sec.gov' + href
+            #         print(href)
+            #         # Get the publish date
+            #         a_list = tr.find_all('a')
+            #         # print(a_list)
+            #         for a in a_list:
+            #             text = a.text
+            #             match = re.search(pattern, text)
+            #             if match:
+            #                 pub_date = match.group(0)
+            #                 # print(pub_date)
+            #                 year = pub_date[:4]
+            #                 break
+            #             else:
+            #                 pub_date = ''
+            #                 year = ''
+            #         # Request the annual report from its link; no file-server upload needed, send to kafka directly
+            #         browser.get(href)
+            #         time.sleep(3)
+            #         i_page_source = browser.page_source
+            #         i_soup = BeautifulSoup(i_page_source, 'html.parser')
+            #         # print(i_page_source)
+            #         content = i_soup.text
             # Send the collected body text straight to kafka
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
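For background on the rewritten spider() above: the SEC EDGAR submissions endpoint at https://data.sec.gov/submissions/ expects the CIK zero-padded to ten digits (as in the commented test URL CIK0001395064.json), and filings.recent holds parallel arrays (form, accessionNumber, filingDate, primaryDocument) that are indexed together. Below is a hedged sketch of assembling one filing URL from those arrays; unlike the committed code, which hard-codes u_1 = '1395064', it derives the path segment from the cik argument, so treat it as an assumption rather than the repository's behaviour:

def submissions_url(cik):
    # The submissions API itself wants the ten-digit, zero-padded CIK
    return f"https://data.sec.gov/submissions/CIK{int(cik):010d}.json"

def build_filing_url(cik, recent, idx):
    # recent is data['filings']['recent'] from the submissions JSON
    accession = recent['accessionNumber'][idx].replace('-', '')  # dashes are stripped in Archives paths
    primary_doc = recent['primaryDocument'][idx]
    # Archives URLs use the CIK without leading zeros
    return f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession}/{primary_doc}"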
@@ -65,45 +176,45 @@ def spider(com_name,cik):
                 'attachmentIds': '',
                 'author': '',
                 'content': content,
-                'contentWithTag': i_page_source,
+                'contentWithTag': soup,
                 'createDate': time_now,
                 'deleteFlag': '0',
                 'id': '',
                 'keyWords': '',
                 'lang': 'zh',
                 'origin': 'SEC美国证券交易委员会',
-                'publishDate': pub_date,
+                'publishDate': filingDate,
                 'sid': '1684032033495392257',
-                'sourceAddress': href,  # source link
+                'sourceAddress': news_url,  # source link
                 'summary': '',
                 'title': title,
                 'type': 1,
-                'socialCreditCode': social_code,
+                'socialCreditCode': '',
                 'year': year
             }
-            # print(dic_news)
+            print(dic_news)
             # Send these fields to kafka for storage
-            # try:
-            #     producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
-            #     kafka_result = producer.send("researchReportTopic",
-            #                                  json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
-            #
-            #     print(kafka_result.get(timeout=10))
-            #
-            #     dic_result = {
-            #         'success': 'ture',
-            #         'message': '操作成功',
-            #         'code': '200',
-            #     }
-            #     print(dic_result)
-            #
-            # except Exception as e:
-            #     dic_result = {
-            #         'success': 'false',
-            #         'message': '操作失败',
-            #         'code': '204',
-            #         'e': e
-            #     }
+            try:
+                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                kafka_result = producer.send("researchReportTopic",
+                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+                print(kafka_result.get(timeout=10))
+                dic_result = {
+                    'success': 'ture',
+                    'message': '操作成功',
+                    'code': '200',
+                }
+                print(dic_result)
+            except Exception as e:
+                dic_result = {
+                    'success': 'false',
+                    'message': '操作失败',
+                    'code': '204',
+                    'e': e
+                }

 def getrequest(social_code, url, headers, data):
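The producer block that this hunk un-comments pushes each report to the researchReportTopic Kafka topic as UTF-8 encoded JSON. A minimal kafka-python consumer sketch for inspecting what lands on that topic; the broker address is copied from the code above, while the group id and offset policy are placeholders:

import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'researchReportTopic',
    bootstrap_servers=['114.115.159.144:9092'],
    group_id='annual-report-debug',  # placeholder group id
    value_deserializer=lambda v: json.loads(v.decode('utf8')),
    auto_offset_reset='earliest',
)
for message in consumer:
    # Each value mirrors the dic_news dictionary built in spider()
    print(message.value.get('title'), message.value.get('publishDate'))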
@@ -126,11 +237,6 @@ def getrequest(social_code,url,headers,data):
         result = ''
     return result
-
-# Simulated browser
-chromedriver = "D:/chrome/chromedriver.exe"
-browser = webdriver.Chrome(chromedriver)
-pattern = r"\d{4}-\d{2}-\d{2}"

 if __name__ == '__main__':
     headers = {
         'authority': 'efts.sec.gov',
@@ -154,44 +260,56 @@ if __name__ == '__main__':
     taskType = '企业年报/SEC'
     while True:
         start_time = time.time()
-        social_code = ''
-        # if not social_code:
-        #     time.sleep(20)
-        #     social_code = ''
-        #     continue
-        # if social_code == 'None':
-        #     time.sleep(20)
-        #     continue
-        # if social_code == '':
-        #     time.sleep(20)
-        #     continue
-        # dic_info = baseCore.getInfomation(social_code)
-        # count = dic_info[15]
-        # code = dic_info[3]
-        # com_name = dic_info[4]
-        # if code is None:
-        #     exeception = '股票代码为空'
-        #     state = 0
-        #     takeTime = baseCore.getTimeCost(start_time, time.time())
-        #     baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
-        #     continue
-        code = 'BP'
-        # "MNSO" POST request to fetch the company CIK
-        # payload = {"keysTyped":f"{code}","narrow":flag}
-        payload = {"keysTyped": "BP", "narrow": True}
-        data = json.dumps(payload)
-        result = getrequest(social_code, url, headers, data)
-        # print(result)
-        # Work out which record returned by the API is this company, by ticker code
-        tickers = result['hits']['hits']
-        for ticker in tickers:
-            i_t = ticker['_source']['tickers']
-            if i_t == code:
-                cik = ticker['_id']
-                print(cik)
-                break
+        # Fetch company info
+        social_code = baseCore.redicPullData('AnnualEnterprise:usqy_socialCode')
+        if not social_code:
+            time.sleep(20)
+            continue
+        if social_code == 'None':
+            time.sleep(20)
+            continue
+        if social_code == '':
+            time.sleep(20)
+            continue
+        dic_info = baseCore.getInfomation(social_code)
+        count = dic_info[15]
+        code = dic_info[3]
+        com_name = dic_info[4]
+        cik = dic_info[13]
+        if code is None:
+            exeception = '股票代码为空'
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
+            continue
+        if cik is None:
+            exeception = 'cik为空'
+            state = 0
+            takeTime = baseCore.getTimeCost(start_time, time.time())
+            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
+            continue
+        # code = 'BP'
+        # com_name = '英国石油公司'
+        # cik = ''
+        # "MNSO" POST request to fetch the company CIK (production)
+        # payload = {"keysTyped":f"{code}","narrow":True}
+        # # Test
+        # # payload = {"keysTyped": "BP", "narrow":True}
+        # data = json.dumps(payload)
+        # result = getrequest(social_code,url,headers,data)
+        # # print(result)
+        # # Work out which record returned by the API is this company, by ticker code
+        # tickers = result['hits']['hits']
+        # for ticker in tickers:
+        #     i_t = ticker['_source']['tickers']
+        #     if i_t == code:
+        #         cik = ticker['_id']
+        #         print(cik)
+        #         break
+        #     break
+        # break
         spider(com_name, cik)
-        break
+        # break
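In the old loop above, the CIK was resolved from a hard-coded ticker through getrequest against the efts.sec.gov search endpoint; the new loop reads it from dic_info[13] instead. If a standalone ticker-to-CIK lookup were ever needed again, one commonly used source is the SEC's company_tickers.json mapping; the sketch below is an assumption about that file's layout and is not part of this commit (the User-Agent value is a placeholder):

import requests

def ticker_to_cik(ticker):
    # SEC asks automated clients to identify themselves; the value here is a placeholder
    resp = requests.get('https://www.sec.gov/files/company_tickers.json',
                        headers={'User-Agent': 'example@example.com'}, timeout=30)
    for entry in resp.json().values():
        if entry['ticker'] == ticker.upper():
            return str(entry['cik_str'])
    return None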