Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
13609e3d
提交
13609e3d
authored
10月 25, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
雪球网年报
上级
d30620e6
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
86 行增加
和
86 行删除
+86
-86
雪球网-年报.py
comData/annualReport/雪球网-年报.py
+86
-86
没有找到文件。
comData/annualReport/雪球网-年报.py
浏览文件 @
13609e3d
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
...
@@ -35,8 +35,10 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
...
@@ -35,8 +35,10 @@ chromedriver = r'D:/cmd100/chromedriver.exe'
browser
=
webdriver
.
Chrome
(
chrome_options
=
opt
,
executable_path
=
chromedriver
)
browser
=
webdriver
.
Chrome
(
chrome_options
=
opt
,
executable_path
=
chromedriver
)
log
=
baseCore
.
getLogger
()
log
=
baseCore
.
getLogger
()
requests
.
adapters
.
DEFAULT_RETRIES
=
3
requests
.
adapters
.
DEFAULT_RETRIES
=
3
#11数据库
cnx
=
pymysql
.
connect
(
host
=
'114.116.44.11'
,
user
=
'caiji'
,
password
=
'f7s0&7qqtK'
,
db
=
'clb_project'
,
charset
=
'utf8mb4'
)
cnx
=
baseCore
.
cnx_
cursor
=
baseCore
.
cursor_
#144数据库
cnx_
=
baseCore
.
cnx
cnx_
=
baseCore
.
cnx
cursor_
=
baseCore
.
cursor
cursor_
=
baseCore
.
cursor
...
@@ -159,94 +161,92 @@ def spider_annual_report(dict_info,num):
...
@@ -159,94 +161,92 @@ def spider_annual_report(dict_info,num):
# name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
# name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
# name_pdf = pdf_name_a + '.pdf'
# name_pdf = pdf_name_a + '.pdf'
with
cnx
.
cursor
()
as
cursor
:
if
'年'
in
year
:
if
'年'
in
year
:
year
=
year
.
split
(
'年'
)[
0
]
year
=
year
.
split
(
'年'
)[
0
]
else
:
else
:
pass
sel_sql
=
'''select item_id,year from clb_sys_attachment where item_id =
%
s and year =
%
s and type_id="1" '''
cursor
.
execute
(
sel_sql
,
(
social_code
,
int
(
year
)))
selects
=
cursor
.
fetchone
()
if
selects
:
log
.
info
(
f
'com_name:{com_name}、{year}已存在'
)
continue
else
:
#上传文件至obs服务器
retData
=
baseCore
.
uptoOBS
(
pdf_url
,
name_pdf
,
1
,
social_code
,
pathType
,
taskType
,
start_time
,
'XueLingKun'
)
if
retData
[
'state'
]:
pass
pass
sel_sql
=
'''select item_id,year from clb_sys_attachment where item_id =
%
s and year =
%
s and type_id="1" '''
cursor
.
execute
(
sel_sql
,
(
social_code
,
int
(
year
)))
selects
=
cursor
.
fetchone
()
if
selects
:
log
.
info
(
f
'com_name:{com_name}、{year}已存在'
)
continue
else
:
else
:
#上传文件至obs服务器
log
.
info
(
f
'====pdf解析失败===='
)
retData
=
baseCore
.
uptoOBS
(
pdf_url
,
name_pdf
,
1
,
social_code
,
pathType
,
taskType
,
start_time
,
'XueLingKun'
)
continue
if
retData
[
'state'
]:
num
=
num
+
1
pass
try
:
else
:
origin
=
'雪球网'
log
.
info
(
f
'====pdf解析失败===='
)
att_id
=
baseCore
.
tableUpdate
(
retData
,
com_name
,
year
,
name_pdf
,
num
,
pub_time
,
origin
)
continue
content
=
retData
[
'content'
]
num
=
num
+
1
state
=
1
try
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
origin
=
'雪球网'
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
year_url
,
'成功'
)
att_id
=
baseCore
.
tableUpdate
(
retData
,
com_name
,
year
,
name_pdf
,
num
,
pub_time
,
origin
)
except
Exception
as
e
:
content
=
retData
[
'content'
]
exception
=
'数据库传输失败'
state
=
1
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
year_url
,
'成功'
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
year_url
,
f
'{exception} - --{e}'
)
except
Exception
as
e
:
return
False
exception
=
'数据库传输失败'
#发送数据到kafka
state
=
0
lang
=
baseCore
.
detect_language
(
content
)
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
if
lang
==
'cn'
:
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
year_url
,
f
'{exception} - --{e}'
)
lang
=
'zh'
return
False
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
#发送数据到kafka
dic_news
=
{
lang
=
baseCore
.
detect_language
(
content
)
'attachmentIds'
:
att_id
,
if
lang
==
'cn'
:
'author'
:
''
,
lang
=
'zh'
'content'
:
content
,
time_now
=
time
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
,
time
.
localtime
())
'contentWithTag'
:
''
,
dic_news
=
{
'createDate'
:
time_now
,
'attachmentIds'
:
att_id
,
'deleteFlag'
:
'0'
,
'author'
:
''
,
'id'
:
''
,
'content'
:
content
,
'keyWords'
:
''
,
'contentWithTag'
:
''
,
'lang'
:
lang
,
'createDate'
:
time_now
,
'origin'
:
origin
,
'deleteFlag'
:
'0'
,
'publishDate'
:
datetime_string
,
'id'
:
''
,
'sid'
:
'1684032033495392257'
,
'keyWords'
:
''
,
'sourceAddress'
:
year_url
,
# 原文链接
'lang'
:
lang
,
'summary'
:
''
,
'origin'
:
origin
,
'title'
:
name_pdf
.
replace
(
',pdf'
,
''
),
'publishDate'
:
datetime_string
,
'type'
:
1
,
'sid'
:
'1684032033495392257'
,
'socialCreditCode'
:
social_code
,
'sourceAddress'
:
year_url
,
# 原文链接
'year'
:
year
'summary'
:
''
,
}
'title'
:
name_pdf
.
replace
(
',pdf'
,
''
),
# 将相应字段通过kafka传输保存
'type'
:
1
,
try
:
'socialCreditCode'
:
social_code
,
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
max_request_size
=
1024
*
1024
*
20
)
'year'
:
year
kafka_result
=
producer
.
send
(
"researchReportTopic"
,
}
json
.
dumps
(
dic_news
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
# 将相应字段通过kafka传输保存
try
:
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
max_request_size
=
1024
*
1024
*
20
)
kafka_result
=
producer
.
send
(
"researchReportTopic"
,
json
.
dumps
(
dic_news
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
print
(
kafka_result
.
get
(
timeout
=
10
))
print
(
kafka_result
.
get
(
timeout
=
10
))
dic_result
=
{
dic_result
=
{
'success'
:
'ture'
,
'success'
:
'ture'
,
'message'
:
'操作成功'
,
'message'
:
'操作成功'
,
'code'
:
'200'
,
'code'
:
'200'
,
}
}
log
.
info
(
dic_result
)
log
.
info
(
dic_result
)
# return True
# return True
except
Exception
as
e
:
except
Exception
as
e
:
dic_result
=
{
dic_result
=
{
'success'
:
'false'
,
'success'
:
'false'
,
'message'
:
'操作失败'
,
'message'
:
'操作失败'
,
'code'
:
'204'
,
'code'
:
'204'
,
'e'
:
e
'e'
:
e
}
}
state
=
0
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
pdf_url
,
'Kafka操作失败'
)
baseCore
.
recordLog
(
social_code
,
taskType
,
state
,
takeTime
,
pdf_url
,
'Kafka操作失败'
)
log
.
info
(
dic_result
)
log
.
info
(
dic_result
)
return
False
return
False
# num = num + 1
time
.
sleep
(
2
)
time
.
sleep
(
2
)
# browser.quit()
# browser.quit()
return
True
return
True
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论