Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
a0ee390b
提交
a0ee390b
authored
6月 15, 2024
作者:
XveLingKun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
证监会公告
上级
79448081
显示空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
29 行增加
和
10 行删除
+29
-10
证监会-公告.py
comData/noticeReport/证监会-公告.py
+29
-10
没有找到文件。
comData/noticeReport/证监会-公告.py
浏览文件 @
a0ee390b
...
...
@@ -51,18 +51,29 @@ def convert_size(size_bytes):
return
f
"{size_bytes:.2f} {units[i]}"
def
uptoOBS
(
pdf_url
,
pdf_name
,
type_id
,
social_code
):
headers
=
{}
headers
=
{
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'Accept-Encoding'
:
'gzip, deflate'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Cache-Control'
:
'max-age=0'
,
'Connection'
:
'keep-alive'
,
'Cookie'
:
'acw_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; cdn_sec_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; acw_sc__v3=666be84cf4454ec8c2436572df9f9e6dc78b409b; tfstk=fqG-Yit_Tnxu599nN49m-_AIYB8Dsb3ru0u1tkXQNmEI7DCnr3uQAkigmbqor2ZKpoisr_xrxDEIS2DuVDDuAyiif42H87mY9mn0qT0H46hKjmf3V3PSp6FriYf3q3PKRcVpjhAMs4uzaWtMjvDJxuN_WgaCxuNbhEjiRARMs4u5rzTilCYevQh4AkNCFk6Xky48OzwSAowb5Pj7OWiIlEU37k1QAzgfGzznwK58vaZN9vHWr405ar5CObNUelUOK6CLOzwRU4Zxy4hYy8ETn4oFqbimRbcz3KW0TqDtvvi6mTqSBPnYIYKOwcnzRmFo1eJz3JGK9rk2v9Etd4DZd-LWNqF82-U8eaBQwvir98kR8FubNmkab8920rhosJEaHitSoqE7Bvnk06ZoBqiYIjjcs5MZDXe_1gPEsfIB8GqT-TTvk9WUFrfOmApHgpiI6rEMumWFL-U4klYvk9WUFrzYjEyVL9yYu'
,
'Host'
:
'static.sse.com.cn'
,
'Upgrade-Insecure-Requests'
:
'1'
,
'User-Agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}
retData
=
{
'state'
:
False
,
'type_id'
:
type_id
,
'item_id'
:
social_code
,
'group_name'
:
''
,
'path'
:
''
,
'full_path'
:
''
,
'category'
:
'pdf'
,
'file_size'
:
''
,
'status'
:
1
,
'create_by'
:
'XueLingKun'
,
'create_time'
:
''
,
'page_size'
:
''
,
'content'
:
''
}
headers
[
'User-Agent'
]
=
baseCore
.
getRandomUserAgent
()
#
headers['User-Agent'] = baseCore.getRandomUserAgent()
for
i
in
range
(
0
,
3
):
try
:
response
=
requests
.
get
(
pdf_url
,
headers
=
headers
,
verify
=
False
,
timeout
=
20
)
try
:
file_size
=
int
(
response
.
headers
.
get
(
'Content-Length'
))
break
except
:
file_size
=
0
break
except
Exception
as
e
:
time
.
sleep
(
3
)
continue
page_size
=
0
...
...
@@ -78,7 +89,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
page_size
=
doc
.
page_count
for
page
in
doc
.
pages
():
retData
[
'content'
]
+=
page
.
get_text
()
except
:
except
Exception
as
e
:
log
.
error
(
f
'文件损坏'
)
return
retData
...
...
@@ -156,10 +167,11 @@ def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
@retry
(
tries
=
3
,
delay
=
5
)
def
RequestUrl
(
url
,
payload
,
social_code
,
start_time
):
ip
=
baseCore
.
get_proxy
()
#
ip = baseCore.get_proxy()
# proxy = {'https': 'http://127.0.0.1:8888', 'http': 'http://127.0.0.1:8888'}
response
=
requests
.
post
(
url
=
url
,
headers
=
headers
,
data
=
payload
,
proxies
=
ip
)
# response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
response
=
requests
.
post
(
url
=
url
,
headers
=
headers
,
data
=
payload
)
# response = requests.post(url=url, data=payload)
response
.
encoding
=
response
.
apparent_encoding
if
response
.
status_code
==
200
:
...
...
@@ -463,6 +475,13 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time
=
date_object
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
year
=
pub_time
[:
4
]
report_type
=
td_list
[
4
]
.
text
.
strip
()
# 获取当前年份
current_year
=
datetime
.
now
()
.
year
# print(current_year)
if
int
(
current_year
)
<
int
(
year
):
continue
if
str
(
current_year
)[:
1
]
<
year
[:
1
]:
# 防止年份出现6005这种切出来股票代码的情况
continue
# 判断数据库中是否有该条资讯
ifexist
=
ifInstert
(
short_name
,
social_code
,
pdf_url
)
...
...
@@ -489,7 +508,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
else
:
log
.
info
(
f
'======={short_name}========{code}===已存在'
)
# continue
break
return
if
__name__
==
'__main__'
:
num
=
0
...
...
@@ -528,8 +547,8 @@ if __name__ == '__main__':
while
True
:
start_time
=
time
.
time
()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode
')
social_code
=
'91370000163446410B'
social_code
=
baseCore
.
redicPullData
(
'NoticeEnterprise:gnqy_socialCode_add
'
)
#
social_code = '91370000163446410B'
# 判断 如果Redis中已经没有数据,则等待
if
social_code
==
None
:
time
.
sleep
(
20
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论