王景浩 / zzsn_spider · Commits · c3f4587f

Commit c3f4587f, authored Aug 28, 2023 by 薛凌堃
Commit message: 证监会年报 (CSRC annual reports)
Parent commit: 2d7e135f

Showing 1 changed file with 136 additions and 94 deletions.

comData/annualReport_ZJH/证监会-年报.py (+136 −94)
 import json
+from kafka import KafkaProducer
 from fdfs_client.client import get_tracker_conf, Fdfs_client
...
@@ -9,6 +12,7 @@ from base import BaseCore
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
 # conn = cx_Oracle.connect('cis/ZZsn9988_1qaz@114.116.91.1:1521/orcl')
 cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
 cursor_ = cnx.cursor()
...
@@ -18,28 +22,6 @@ client = Fdfs_client(tracker_conf)
 taskType = '企业年报/证监会'
-# def get_proxy():
-#     cursor = cnx_ip.cursor()
-#     sql = "select proxy from clb_proxy"
-#     cursor.execute(sql)
-#     proxy_lists = cursor.fetchall()
-#     ip_list = []
-#     for proxy_ in proxy_lists:
-#         ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
-#     proxy_list = []
-#     for str_ip in ip_list:
-#         str_ip_list = str_ip.split('-')
-#         proxyMeta = "http://%(host)s:%(port)s" % {
-#             "host": str_ip_list[0],
-#             "port": str_ip_list[1],
-#         }
-#         proxy = {
-#             "HTTP": proxyMeta,
-#             "HTTPS": proxyMeta
-#         }
-#         proxy_list.append(proxy)
-#     return proxy_list
 def RequestUrl(url, payload, item_id, start_time):
     # ip = get_proxy()[random.randint(0, 3)]
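For reference, the get_proxy helper deleted above turned 'host-port' rows from the clb_proxy table into requests-style proxy dictionaries. A minimal standalone sketch of that transformation (the function name and sample row below are illustrative, not part of the repository):

```python
# Sketch of the proxy-dict construction performed by the removed get_proxy helper.
# build_proxy_list and the sample row are hypothetical; the real code read rows
# from the clb_proxy table via a pymysql cursor.
def build_proxy_list(rows):
    proxy_list = []
    for str_ip in rows:  # each row looks like 'host-port', e.g. '1.2.3.4-8080'
        host, port = str_ip.split('-')
        proxy_meta = "http://%(host)s:%(port)s" % {"host": host, "port": port}
        proxy_list.append({"HTTP": proxy_meta, "HTTPS": proxy_meta})
    return proxy_list

print(build_proxy_list(['1.2.3.4-8080']))
# [{'HTTP': 'http://1.2.3.4:8080', 'HTTPS': 'http://1.2.3.4:8080'}]
```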
...
@@ -118,7 +100,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             pdf_url = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[0].strip('\'')
             name_pdf = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[1].strip('\'')
-            # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
+            pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')
             # print(name)
             report_type = td_list[4].text.strip()
             # print(report_type)
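The pdf_url / name_pdf / pub_time lines above all parse the same onclick attribute of the download link. Note that str.strip() removes a set of characters from both ends rather than a literal prefix; it happens to work here because the first character after downloadPdf1( is a quote. A small sketch with a made-up onclick value:

```python
# Hypothetical onclick value; the real attribute comes from the CSRC results page.
onclick = "downloadPdf1('http://example.com/reports/demo.pdf','某公司2022年年度报告','2023-04-28','0')"

# .strip('downloadPdf1(') drops those characters from both ends, leaving the
# quoted argument list; .split(',') then separates the individual arguments.
pdf_url = onclick.strip('downloadPdf1(').split(',')[0].strip('\'')
name_pdf = onclick.strip('downloadPdf1(').split(',')[1].strip('\'')
pub_time = onclick.strip('downloadPdf1(').split(',')[2].strip('\'')

print(pdf_url)   # http://example.com/reports/demo.pdf
print(name_pdf)  # 某公司2022年年度报告
print(pub_time)  # 2023-04-28
```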
...
@@ -129,11 +111,11 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             try:
                 year = re.findall('\d{4}\s*年', name_pdf)[0].replace('年', '')
             except Exception as e:
-                pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
+                # pub_time = pdf_url_info['onclick'].strip('downloadPdf1(').split(',')[2].strip('\'')[:4]
                 year = int(pub_time) - 1
                 year = str(year)
-            page_size = 0
+            # page_size = 0
             sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s'''
             cursor_.execute(sel_sql, (item_id, year))
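The year is taken from the report title when possible and otherwise derived from the publish date; since this commit, pub_time already holds the value extracted from the onclick attribute above, so the int(pub_time) - 1 fallback assumes it is a bare year string. A hedged sketch of that logic with made-up titles (this sketch slices the first four characters so the fallback also works for a full date):

```python
import re

def extract_year(name_pdf, pub_time):
    try:
        # Four digits followed by "年" in the report title, e.g. "2022年年度报告".
        return re.findall(r'\d{4}\s*年', name_pdf)[0].replace('年', '')
    except IndexError:
        # No year in the title: fall back to publish year minus one, mirroring
        # year = int(pub_time) - 1 in the diff.
        return str(int(pub_time[:4]) - 1)

print(extract_year('某公司2022年年度报告', '2023-04-28'))  # 2022
print(extract_year('某公司年度报告', '2023-04-28'))        # 2022
```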
...
@@ -142,77 +124,137 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
                 print(f'com_name:{short_name}、{year}已存在')
                 continue
             else:
-                # 类型为年报的话就解析该年报pdf,并入库
-                for i in range(0, 3):
-                    try:
-                        resp_content = requests.request("GET", pdf_url).content
-                        # 获取pdf页数
-                        with fitz.open(stream=resp_content, filetype='pdf') as doc:
-                            page_size = doc.page_count
-                        break
-                    except Exception as e:
-                        print(e)
-                        time.sleep(3)
-                        continue
-                if page_size < 1:
-                    # pdf解析失败
-                    print(f'==={short_name}、{year}===pdf解析失败')
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
-                    continue
-                result = ''
-                for i in range(0, 3):
-                    try:
-                        result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
-                        break
-                    except Exception as e:
-                        print(e)
-                        time.sleep(3)
-                        continue
-                if result == '':
-                    e = '上传服务器失败'
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                    continue
-                if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
-                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-                    type_id = '1'
-                    item_id = dic_info['social_code']
-                    group_name = 'group1'
-                    path = bytes.decode(result['Remote file_id']).replace('group1', '')
-                    full_path = bytes.decode(result['Remote file_id'])
-                    category = 'pdf'
-                    file_size = result['Uploaded size']
-                    order_by = num
-                    status = 1
-                    create_by = 'XueLingKun'
-                    create_time = time_now
-                    page_size = page_size
-                    try:
-                        tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
-                                    category, file_size, order_by, status, create_by, create_time, page_size)
-                        state = 1
-                        takeTime = baseCore.getTimeCost(start_time, time.time())
-                        baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
-                    except:
-                        e = '数据库传输失败'
-                        state = 0
-                        takeTime = baseCore.getTimeCost(start_time, time.time())
-                        baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                    num = num + 1
-                    time.sleep(2)
-                else:
-                    e = '采集失败'
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-                    continue
+                # # 类型为年报的话就解析该年报pdf,并入库
+                # for i in range(0, 3):
+                #     try:
+                #         resp_content = requests.request("GET", pdf_url).content
+                #         # 获取pdf页数
+                #         with fitz.open(stream=resp_content, filetype='pdf') as doc:
+                #             page_size = doc.page_count
+                #         break
+                #     except Exception as e:
+                #         print(e)
+                #         time.sleep(3)
+                #         continue
+                # if page_size < 1:
+                #     # pdf解析失败
+                #     print(f'==={short_name}、{year}===pdf解析失败')
+                #     state = 0
+                #     takeTime = baseCore.getTimeCost(start_time, time.time())
+                #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
+                #     continue
+                # result = ''
+                # for i in range(0, 3):
+                #     try:
+                #         result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
+                #         break
+                #     except Exception as e:
+                #         print(e)
+                #         time.sleep(3)
+                #         continue
+                # if result == '':
+                #     e = '上传服务器失败'
+                #     state = 0
+                #     takeTime = baseCore.getTimeCost(start_time, time.time())
+                #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
+                #     continue
+                # if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
+                #     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                #     type_id = '1'
+                #     item_id = dic_info['social_code']
+                #     group_name = 'group1'
+                #     path = bytes.decode(result['Remote file_id']).replace('group1', '')
+                #     full_path = bytes.decode(result['Remote file_id'])
+                #     category = 'pdf'
+                #     file_size = result['Uploaded size']
+                #     order_by = num
+                #     status = 1
+                #     create_by = 'XueLingKun'
+                #     create_time = time_now
+                #     page_size = page_size
+                #     try:
+                #         tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
+                #                     category, file_size, order_by, status, create_by, create_time, page_size)
+                #         state = 1
+                #         takeTime = baseCore.getTimeCost(start_time, time.time())
+                #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
+                #     except:
+                #         e = '数据库传输失败'
+                #         state = 0
+                #         takeTime = baseCore.getTimeCost(start_time, time.time())
+                #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
+                #     num = num + 1
+                #     time.sleep(2)
+                # else:
+                #     e = '采集失败'
+                #     state = 0
+                #     takeTime = baseCore.getTimeCost(start_time, time.time())
+                #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
+                #     continue
+                #上传至文件服务器
+                retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
+                #插入数据库获取att_id
+                num = num + 1
+                att_id = baseCore.tableUpdate(retData, short_name, year, name_pdf, num)
+                content = retData['content']
+                if retData['state']:
+                    pass
+                else:
+                    e = '采集失败'
+                    log.info(f'====pdf解析失败====')
+                    return False
+                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+                dic_news = {
+                    'attachmentIds': att_id,
+                    'author': '',
+                    'content': content,
+                    'contentWithTag': '',
+                    'createDate': time_now,
+                    'deleteFlag': '0',
+                    'id': '',
+                    'keyWords': '',
+                    'lang': 'zh',
+                    'origin': '证监会',
+                    'publishDate': pub_time,
+                    'sid': '1684032033495392257',
+                    'sourceAddress': '',  # 原文链接
+                    'summary': '',
+                    'title': name_pdf,
+                    'type': 1,
+                    'socialCreditCode': social_code,
+                    'year': year
+                }
+                # print(dic_news)
+                # 将相应字段通过kafka传输保存
+                try:
+                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
+                    kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
+                    print(kafka_result.get(timeout=10))
+                    dic_result = {
+                        'success': 'ture',
+                        'message': '操作成功',
+                        'code': '200',
+                    }
+                    print(dic_result)
+                    return True
+                except Exception as e:
+                    dic_result = {
+                        'success': 'false',
+                        'message': '操作失败',
+                        'code': '204',
+                        'e': e
+                    }
+                    state = 0
+                    takeTime = baseCore.getTimeCost(start_time, time.time())
+                    baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
+                    print(dic_result)
+                    return False
         else:
             continue
...
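The main change in this commit: instead of fetching the PDF, uploading it to FastDFS, and writing clb_sys_attachment directly, the spider now calls baseCore.upLoadToServe / baseCore.tableUpdate and publishes the report metadata to Kafka. A minimal sketch of that producer hand-off with kafka-python, using the broker address and topic from the diff and a trimmed-down payload:

```python
import json
from kafka import KafkaProducer

# Broker address and topic taken from the diff; dic_news is reduced to a few
# representative fields for illustration.
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
dic_news = {'title': '某公司2022年年度报告', 'origin': '证监会', 'year': '2022'}

# send() is asynchronous and returns a future; get(timeout=10) blocks until the
# broker acknowledges the record (or raises), which is how the new code decides
# between the success and failure branches.
future = producer.send('researchReportTopic',
                       json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(future.get(timeout=10))
```

kafka-python can also handle serialization itself via a value_serializer passed to KafkaProducer; the diff keeps the explicit json.dumps(...).encode('utf8') call instead.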