Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
127b1931
提交
127b1931
authored
8月 28, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
雪球网年报
上级
335b0090
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
281 行增加
和
0 行删除
+281
-0
雪球网-年报.py
comData/annualReport_ZJH/雪球网-年报.py
+281
-0
没有找到文件。
comData/annualReport_ZJH/雪球网-年报.py
0 → 100644
浏览文件 @
127b1931
# -*- coding: utf-8 -*-
"""Collect missing company annual reports from sina.com.cn (雪球网/新浪财经).

Reads companies whose annual-report years are missing from the database and
scrapes the corresponding reports. Two cases exist: the listing title contains
a year, or it does not. Titles with a year were already collected, so those
are skipped. Titles without a year are resolved by parsing the article body
and regex-matching a year. On success the task state counter is incremented;
on failure state is set via recordLog for separate handling.

Original (Chinese) description:
从数据库中读取年报缺失年份,采集对应网站上的年报,存在两种情况,标题中有年份,标题中无年份。
如果标题中有年份的话,按照原方式命名,有年份的应该都已经采过,跳过不插入更新
如果标题中无年份的话,则解析正文内容,正则表达式匹配年份,
采集一条,state 加1 如果报错的话就将state改为100,单独处理。
"""
import json
import re
import time

import fitz  # PyMuPDF; kept even though unused directly in this file
import pymysql
import requests
from bs4 import BeautifulSoup as bs
from fdfs_client.client import get_tracker_conf, Fdfs_client  # kept: FastDFS client used by upload helpers
from kafka import KafkaProducer
from selenium import webdriver

from base.BaseCore import BaseCore

# Shared project helper: logging, DB handles, redis queue, upload utilities.
baseCore = BaseCore()
log = baseCore.getLogger()

# One shared Chrome instance created at import time and reused for every page
# fetch in spider_annual_report().
chromedriver = "D:/chrome/chromedriver.exe"
browser = webdriver.Chrome(chromedriver)

# Cap connection retries for all requests usage in this process.
requests.adapters.DEFAULT_RETRIES = 3

# conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
# Attachment/metadata database (clb_project) used for duplicate checks.
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK',
                      db='clb_project', charset='utf8mb4')

# Secondary connection/cursor provided by BaseCore (see commented history below).
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# cnx_ = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
# # cnx_ip = pymysql.connect(host='114.115.159.144',user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
# cursor_ = cnx_.cursor()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
def clean_text(text):
    """Strip HTML tags and collapse redundant blank lines.

    :param text: raw HTML fragment to clean
    :return: plain text with tabs and carriage returns removed and runs of
        consecutive newlines collapsed to a single newline
    """
    # Extract visible text only, discarding markup.
    plain = bs(text, 'html.parser').get_text()
    # Drop tabs and CRs, then squeeze newline runs down to one.
    stripped = plain.replace('\t', '').replace('\r', '')
    return re.sub('\n+', '\n', stripped)
def spider_annual_report(dict_info, num):
    """Scrape annual-report PDFs for one company from sina.com.cn.

    :param dict_info: dict with keys 'social_code', 'com_name', 'code'
        (stock code used to build the listing URL)
    :param num: running attachment counter, passed through to tableUpdate
    :return: True after the first report is pushed to Kafka, False on
        upload/Kafka failure, None when the listing page has no reports.

    NOTE(review): relies on module-level globals `browser`, `cnx`, and the
    caller's globals `start_time` / `taskType` — confirm callers set them.
    """
    social_code = dict_info['social_code']
    com_name = dict_info['com_name']
    code = dict_info['code']
    # Listing page of annual-report announcements for this stock code.
    url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'
    browser.get(url_1)
    time.sleep(3)
    page_source = browser.page_source
    soup = bs(page_source, 'html.parser')
    # res_1 = requests.get(url_1, proxies=ip)
    # soup = bs(res_1.content, 'html.parser')
    try:
        # Every <a> inside the date list is one annual-report announcement.
        list_all = soup.find('div', {'class': 'datelist'}).find_all('a')
    except:
        # .find() returned None -> no report list on the page; log and bail out.
        log.info(f'{social_code}.........年度报告列表为空')
        exception = '年度报告列表为空'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        return
    for i in list_all:
        # ip = get_proxy()[random.randint(0, 3)]
        pdf_name_a = i.text
        year_url = 'https://vip.stock.finance.sina.com.cn' + i.get('href')
        year_name = i.text
        # Load the individual announcement page.
        browser.get(year_url)
        time.sleep(5)
        page_source_2 = browser.page_source
        # res_2 = requests.get(year_url, proxies=ip)
        soup_2 = bs(page_source_2, 'html.parser')
        try:
            # Direct PDF download link in the centered header cell.
            pdf_url = soup_2.find('th', {'style': 'text-align:center'}).find('a').get('href')
        except:
            # TODO: page has body content but no download link — handle that case.
            log.error(f'{social_code}....{year_url}....无下载链接')
            exception = '无下载链接'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
            continue
        # Announcement date: text after the '公告日期' label in the head cell.
        pub_time = soup_2.find('td', {'class': 'head'}).text.split('公告日期')[1]
        try:
            # Case 1: the title itself contains a 4-digit year.
            year = re.findall('\d{4}', year_name)[0]
            if com_name != 'null':
                # '*' is stripped because it is illegal in file names.
                name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
            else:
                name_pdf = pdf_name_a + '.pdf'
        except:
            # Case 2: no year in the title — parse the article body instead.
            content = soup_2.find('div', {'id': 'content'}).text
            # Collapse blank lines so line-by-line scanning below works.
            content_c = clean_text(content)
            # NOTE(review): this loop variable shadows the outer `i`; safe only
            # because `i` is not used again after this point in the iteration.
            for i in range(0, 4):
                # Scan the first four lines of the body for a year marker.
                try:
                    line = content_c.split('\n')[i]
                    try:
                        # Match e.g. "2022 年年度报告" and pull out the year.
                        year_ = re.findall('\d{4}\s*年年度报告', line)[0]
                        year = re.findall('\d{4}', year_)[0]
                        if com_name != '':
                            name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
                        else:
                            name_pdf = pdf_name_a + '.pdf'
                        break
                    except:
                        # Fallback: take the year from the head cell and
                        # subtract 1 (a report published in year N covers N-1).
                        try:
                            result = soup_2.find('td', class_='head').text
                            year = str(int(re.findall('\d{4}', result)[0]) - 1)
                            if com_name != '':
                                name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
                            else:
                                name_pdf = pdf_name_a + '.pdf'
                        except:
                            continue
                except:
                    # Fewer than i+1 lines in the body: fall back to the
                    # announcement date minus 1 as the report year.
                    # result = soup_2.find('td', class_='head').text
                    year = str(int(re.findall('\d{4}', pub_time)[0]) - 1)
                    if com_name != '':
                        name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
                    else:
                        name_pdf = pdf_name_a + '.pdf'
        # name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
        # name_pdf = pdf_name_a + '.pdf'
        with cnx.cursor() as cursor:
            # Skip reports already recorded for this company and year.
            sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
            cursor.execute(sel_sql, (social_code, int(year)))
            selects = cursor.fetchone()
            if selects:
                print(f'com_name:{com_name}、{year}已存在')
                continue
            else:
                page_size = 0
                # Upload the PDF to the file server (FastDFS via BaseCore).
                retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
                num = num + 1
                try:
                    # Register the attachment and get back its id.
                    att_id = baseCore.tableUpdate(retData, com_name, year, name_pdf, num)
                    content = retData['content']
                    if retData['state']:
                        pass
                    else:
                        log.info(f'====pdf解析失败====')
                        return False
                    state = 1
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
                except:
                    # NOTE(review): att_id/content may be undefined past this
                    # point if tableUpdate raised — the Kafka payload below
                    # would then NameError; confirm intended behavior.
                    exception = '数据库传输失败'
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
                # Send the record to Kafka.
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                dic_news = {
                    'attachmentIds': att_id,
                    'author': '',
                    'content': content,
                    'contentWithTag': '',
                    'createDate': time_now,
                    'deleteFlag': '0',
                    'id': '',
                    'keyWords': '',
                    'lang': 'zh',
                    'origin': '雪球网',
                    'publishDate': pub_time,
                    'sid': '1684032033495392257',
                    'sourceAddress': year_url,  # original article link
                    'summary': '',
                    'title': name_pdf,
                    'type': 1,
                    'socialCreditCode': social_code,
                    'year': year
                }
                # Persist the fields by publishing through Kafka.
                try:
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    kafka_result = producer.send("researchReportTopic",
                                                 json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                    print(kafka_result.get(timeout=10))
                    dic_result = {
                        'success': 'ture',
                        'message': '操作成功',
                        'code': '200',
                    }
                    print(dic_result)
                    # NOTE(review): returns after the FIRST successfully
                    # pushed report — remaining list entries are not visited.
                    return True
                except Exception as e:
                    dic_result = {
                        'success': 'false',
                        'message': '操作失败',
                        'code': '204',
                        'e': e
                    }
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
                    print(dic_result)
                    return False
        # num = num + 1
        # Only reached via `continue` above (duplicate found): pace requests.
        time.sleep(2)
    # browser.quit()
    # state1
if __name__ == '__main__':
    num = 0
    taskType = '企业年报/雪球网/福布斯'
    # Main worker loop: pull one social credit code from the redis queue per
    # iteration and collect its annual reports.
    while True:
        start_time = time.time()
        # 获取企业信息 — fetch the next company to process.
        social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
        # social_code = '911100007109288314'
        # Queue empty or sentinel value: back off and poll again.
        # (`not social_code` also covers the empty-string case.)
        if not social_code or social_code == 'None':
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
        count = dic_info[15]      # current AnnualReportCount for this company
        code = dic_info[3]        # stock code (may be None)
        com_name = dic_info[4]    # company display name
        if code is None:
            # Cannot build the listing URL without a stock code; log and skip.
            exeception = '股票代码为空'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', exeception)
            continue
        # Left-pad the stock code with zeros to 6 digits (e.g. '600' -> '000600').
        code = code.zfill(6)
        # years = tuple(call_year)
        dict_info = {
            'social_code': social_code,
            'com_name': com_name,
            'code': code,
        }
        # list_info.append(dict_info)
        spider_annual_report(dict_info, num)
        count += 1
        runType = 'AnnualReportCount'
        baseCore.updateRun(social_code, runType, count)
    # cursor.close()
    # NOTE(review): unreachable — the while True above never breaks; kept for
    # parity with the original cleanup intent.
    cnx_.close()
    # Release resources.
    baseCore.close()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论