王景浩 / zzsn_spider / Commits

Commit b127603f, authored Dec 18, 2023 by 薛凌堃
Annual report deployment (年报部署)

Parent: 3ed701a1
Showing 2 changed files with 34 additions and 16 deletions:

comData/annualReport/证监会-年报.py  +16 -10
comData/annualReport/雪球网-年报.py  +18 -6

comData/annualReport/证监会-年报.py
 import json
...
@@ -6,11 +6,13 @@ from kafka import KafkaProducer
 from bs4 import BeautifulSoup
 import requests, re, time, pymysql, fitz
 import urllib3
-from base import BaseCore
+import sys
+sys.path.append('D:\\kkwork\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
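
This hunk swaps the packaged `from base import BaseCore` for a hard-coded Windows path pushed onto sys.path. A minimal sketch of a more portable version of the same pattern, assuming (not taken from the repo) that this spider sits two directories below the repo root and that base/ holds BaseCore.py:

    import os
    import sys

    # Locate base/ relative to this file rather than via an absolute,
    # machine-specific path (directory layout assumed).
    BASE_DIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'base'))
    sys.path.append(BASE_DIR)

    import BaseCore

    baseCore = BaseCore.BaseCore()
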
...
@@ -66,7 +68,11 @@ def RequestUrl(url, payload, item_id, start_time):
 def SpiderByZJH(url, payload, dic_info, num, start_time):
     item_id = dic_info[2]
     # years = dic_info['call_year']
-    short_name = dic_info[4]
+    short_name_ = dic_info[4]
+    if short_name_:
+        short_name = short_name_
+    else:
+        short_name = dic_info[1]
     soup = RequestUrl(url, payload, item_id, start_time)
     if soup == '':
         return
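
Since the added branch only tests `dic_info[4]` for truthiness, the same fallback can be written with Python's `or` idiom:

    short_name = dic_info[4] or dic_info[1]
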
...
@@ -96,7 +102,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         pdf_url_info = td_list[2]
         # print(pdf_url)
         pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
-        name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
+        name_pdf_ = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'') + '.pdf'
         pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
         # todo:判断发布日期是否是日期格式
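
A note on these lines: `str.strip('downloadPdf1(')` removes leading and trailing characters drawn from the set {d, o, w, n, l, a, P, f, 1, (}, not the literal prefix, so the chain happens to work for this data but is fragile. A sketch of the same extraction done with a regex, over a hypothetical onclick value of the shape the code expects:

    import re

    # Hypothetical onclick value; the real attribute comes from the CSRC page.
    onclick = "downloadPdf1('/information/pdf/xxx','某公司2022年年度报告','2023-04-28')"

    # Capture the three quoted arguments in one pass.
    m = re.match(r"downloadPdf1\('([^']*)',\s*'([^']*)',\s*'([^']*)'\)", onclick)
    if m:
        pdf_url, name_pdf_, pub_time = m.groups()
        name_pdf_ += '.pdf'
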
...
@@ -118,18 +124,18 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
         report_type = td_list[4].text.strip()
         # print(report_type)
         if report_type == '年报':
-            if '摘要' in name_pdf:
+            if '摘要' in name_pdf_:
                 continue
             # 年份还从pdf名称里抽取
             try:
-                year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
+                year = re.findall('\d{4}\s*年', name_pdf_)[0].replace('年', '')
             except Exception as e:
                 # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
                 year = int(pub_time[:4]) - 1
-            # year = str(year)
+            year = str(year)
             # page_size = 0
+            name_pdf = f'{short_name}:{year}年年度报告.pdf'
             sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id=1'''
             cursor_.execute(sel_sql, (item_id, year))
             selects = cursor_.fetchone()
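
The year logic here first tries to read a four-digit year out of the report title, then falls back to the publication year minus one. A self-contained sketch with hypothetical inputs:

    import re

    name_pdf_ = '某公司2022年年度报告.pdf'  # hypothetical title
    pub_time = '2023-04-28'                 # hypothetical publication date

    try:
        # Grab the first four digits followed by 年 in the title.
        year = re.findall(r'\d{4}\s*年', name_pdf_)[0].replace('年', '')
    except IndexError:
        # No year in the title: an annual report is normally published
        # the year after the period it covers.
        year = str(int(pub_time[:4]) - 1)
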
...
@@ -322,7 +328,7 @@ if __name__ == '__main__':
         start_time = time.time()
         # 获取企业信息
         social_code = baseCore.redicPullData('AnnualEnterprise:gnqy_socialCode')
-        # social_code = '91210800765420138L'
+        # social_code = '91100000100003962T'
         if not social_code:
             time.sleep(20)
             continue
...
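
`redicPullData` is a BaseCore helper; judging by the key name and the empty-queue sleep, it pops one social credit code per iteration from a Redis list. A sketch of that consumer pattern with redis-py, with connection details assumed:

    import time

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0,
                    decode_responses=True)  # connection details assumed

    while True:
        # Pop the next company identifier; None means the queue is empty.
        social_code = r.lpop('AnnualEnterprise:gnqy_socialCode')
        if not social_code:
            time.sleep(20)
            continue
        # ... fetch and store that company's annual reports ...
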
comData/annualReport/雪球网-年报.py
 # -*- coding: utf-8 -*-
...
@@ -11,9 +11,10 @@ import json
 from datetime import datetime
 from kafka import KafkaProducer
-from base.BaseCore import BaseCore
-baseCore = BaseCore()
+import sys
+sys.path.append('D:\\KK\\zzsn_spider\\base')
+import BaseCore
+baseCore = BaseCore.BaseCore()
 import requests, re, time, pymysql, fitz
 from bs4 import BeautifulSoup as bs
 from selenium import webdriver
...
@@ -35,6 +36,7 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
 browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
 log = baseCore.getLogger()
 requests.adapters.DEFAULT_RETRIES = 3
 #11数据库
 cnx = baseCore.cnx_
 cursor = baseCore.cursor_
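
The `chrome_options=` and `executable_path=` keywords shown in context are the Selenium 3 spelling, which current Selenium 4 releases no longer accept. An equivalent construction under Selenium 4, reusing the driver path from the hunk header:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    opt = Options()                                   # flags as configured elsewhere
    service = Service(r'D:/cmd100/chromedriver.exe')  # path from the diff
    browser = webdriver.Chrome(service=service, options=opt)
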
...
@@ -259,10 +261,20 @@ if __name__ == '__main__':
     while True:
         start_time = time.time()
         # 获取企业信息
-        # social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
-        social_code = '91330000734507783B'
+        social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
+        # social_code = '91440300192176077R'
         if not social_code:
             time.sleep(20)
+            if not baseCore.check_mysql_conn(cnx):
+                # 11数据库
+                cnx = baseCore.cnx_
+                cursor = baseCore.cursor_
+                log.info('===11数据库重新连接成功===')
+            if not baseCore.check_mysql_conn(cnx_):
+                # 144数据库
+                cnx_ = baseCore.cnx
+                cursor_ = baseCore.cursor
+                log.info('===144数据库重新连接成功===')
             continue
         if social_code == 'None':
             time.sleep(20)
...
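
`check_mysql_conn` is a BaseCore helper; pymysql itself offers the standard liveness guard through `Connection.ping`, which can transparently reopen a dropped connection. A sketch of that alternative:

    import pymysql

    def ensure_alive(conn: pymysql.connections.Connection) -> None:
        # ping(reconnect=True) re-opens the socket if the server closed it,
        # so cursors created afterwards are safe to use.
        conn.ping(reconnect=True)
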