王景浩 / zzsn_spider · Commits

Commit cb908caf
Authored Sep 01, 2023 by 薛凌堃
Commit message: 政策法规 (policies and regulations)
Parent: 593410c8

Showing 3 changed files with 22 additions and 115 deletions (+22 −115)
comData/annualReport_ZJH/证监会-年报.py  (+1 −73)
comData/policylaw/2.py  (+20 −41)
comData/weixin_solo/get_tokenCookies.py  (+1 −1)
comData/annualReport_ZJH/证监会-年报.py
 import json
...
@@ -124,78 +124,6 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
             print(f'com_name:{short_name}、{year}已存在')
             continue
         else:
-            # # If the type is an annual report, parse that report's PDF and save it to the database
-            # for i in range(0, 3):
-            #     try:
-            #         resp_content = requests.request("GET", pdf_url).content
-            #         # Get the PDF page count
-            #         with fitz.open(stream=resp_content, filetype='pdf') as doc:
-            #             page_size = doc.page_count
-            #         break
-            #     except Exception as e:
-            #         print(e)
-            #         time.sleep(3)
-            #         continue
-            # if page_size < 1:
-            #     # PDF parsing failed
-            #     print(f'==={short_name}、{year}===pdf解析失败')
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, 'pdf解析失败')
-            #     continue
-            # result = ''
-            # for i in range(0, 3):
-            #     try:
-            #         result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
-            #         break
-            #     except Exception as e:
-            #         print(e)
-            #         time.sleep(3)
-            #         continue
-            # if result == '':
-            #     e = '上传服务器失败'
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     continue
-            #
-            # if 'Remote file_id' in str(result) and 'Uploaded size' in str(result):
-            #
-            #     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            #
-            #     type_id = '1'
-            #     item_id = dic_info['social_code']
-            #     group_name = 'group1'
-            #
-            #     path = bytes.decode(result['Remote file_id']).replace('group1', '')
-            #     full_path = bytes.decode(result['Remote file_id'])
-            #     category = 'pdf'
-            #     file_size = result['Uploaded size']
-            #     order_by = num
-            #     status = 1
-            #     create_by = 'XueLingKun'
-            #     create_time = time_now
-            #     page_size = page_size
-            #     try:
-            #         tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path,
-            #                     category, file_size, order_by, status, create_by, create_time, page_size)
-            #         state = 1
-            #         takeTime = baseCore.getTimeCost(start_time, time.time())
-            #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, '')
-            #     except:
-            #         e = '数据库传输失败'
-            #         state = 0
-            #         takeTime = baseCore.getTimeCost(start_time, time.time())
-            #         baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     num = num + 1
-            #     time.sleep(2)
-            # else:
-            #     e = '采集失败'
-            #     state = 0
-            #     takeTime = baseCore.getTimeCost(start_time, time.time())
-            #     baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
-            #     continue
+            # Upload to the file server
+            retData = baseCore.upLoadToServe(pdf_url, 1, social_code)
             # Insert into the database to get the att_id
             num = num + 1
...
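Note: the net effect of this hunk is that the inline retry / FastDFS-upload / logging sequence is collapsed into a single baseCore.upLoadToServe call. A minimal sketch of what such a helper might look like, assuming a FastDFS-style client object exposing upload_by_buffer as in the deleted code; the real signature and return shape of baseCore.upLoadToServe are not shown in this diff:

    import time
    import requests

    def upLoadToServe(pdf_url, type_id, social_code, client, retries=3):
        """Hypothetical sketch: download a PDF with retries, push it to the
        file server, and return the metadata needed for the att_id insert."""
        content = None
        for _ in range(retries):  # same 3-attempt retry loop the deleted code used
            try:
                resp = requests.get(pdf_url, timeout=30)
                resp.raise_for_status()
                content = resp.content
                break
            except requests.RequestException as e:
                print(e)
                time.sleep(3)
        if content is None:
            return None  # caller records the failure, as recordLog did before
        # upload_by_buffer is the FastDFS client call visible in the deleted block
        result = client.upload_by_buffer(content, file_ext_name='pdf')
        return {
            'type_id': type_id,
            'item_id': social_code,
            'full_path': bytes.decode(result['Remote file_id']),
            'file_size': result['Uploaded size'],
        }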
comData/policylaw/2.py
...
@@ -47,41 +47,6 @@ def replaceUrl(hostUrl,src):
         finnal_href = hostUrl + src
     return finnal_href

-def attachjob(fu_jian_soup, href):
-    for fu_jian_tag in fu_jian_soup:
-        try:
-            # Attachment link
-            fu_jian_href = fu_jian_tag['href']
-            pass
-        except:
-            continue
-        # todo: convert the link to an absolute path
-        # todo: upload the attachment to the file server and get back the server path and att_id, then substitute them; the content itself does not need parsing
-        if '.html' in fu_jian_href or '.pdf' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
-                or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
-                or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
-            if 'http' in fu_jian_href:
-                pass
-            else:
-                # Count how many '../' segments there are
-                if '../' in fu_jian_href:
-                    count = fu_jian_href.count("../")
-                    if count == 1:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/zcfg/'
-                    if count == 2:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
-                    if count == 3:
-                        hostUrl = 'https://gzw.beijing.gov.cn/xxfb/'
-                else:
-                    if './' in fu_jian_href:
-                        hostUrl = href.split('/t')[0]
-                # Replace with the absolute path
-                fin_fj_href = replaceUrl(hostUrl, fu_jian_href)
-                # Substitute the new path into the tag
-                fu_jian_tag['href'] = fin_fj_href
-    return fu_jian_soup

 def save_data(result_dict):
     try:
         aa = result_dict['信息来源']
...
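The deleted attachjob resolved relative attachment links by counting '../' prefixes against hard-coded gzw.beijing.gov.cn hosts, which breaks whenever the directory depth changes. urljoin, used by the paserUrl helper added in the next hunk, performs the same resolution generically. A quick demonstration; the detail-page URL is invented:

    from urllib.parse import urljoin

    page = 'https://gzw.beijing.gov.cn/xxfb/zcfg/202308/t_detail.html'  # hypothetical detail page
    print(urljoin(page, '../fujian/a.pdf'))
    # -> https://gzw.beijing.gov.cn/xxfb/zcfg/fujian/a.pdf
    print(urljoin(page, './fujian/b.doc'))
    # -> https://gzw.beijing.gov.cn/xxfb/zcfg/202308/fujian/b.doc
    print(urljoin(page, 'https://example.com/c.zip'))
    # -> https://example.com/c.zip  (absolute URLs pass through unchanged)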
@@ -487,6 +452,21 @@ def get_content3():
             end_time = time.time()
             print(f'共抓取{num}条数据,耗时{end_time - start_time}')

+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+
+# Convert relative addresses in the HTML into absolute addresses
+def paserUrl(html, listurl):
+    # soup = BeautifulSoup(html, 'html.parser')
+    # Collect all <a> tags and <img> tags
+    links = html.find_all(['a', 'img'])
+    # Walk the tags, converting relative addresses into absolute ones
+    for link in links:
+        if 'href' in link.attrs:
+            link['href'] = urljoin(listurl, link['href'])
+        elif 'src' in link.attrs:
+            link['src'] = urljoin(listurl, link['src'])
+    return html

 # 北京 (Beijing)
 def bei_jing():
...
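A short usage sketch for the new paserUrl helper, assuming it is in scope; the sample HTML is invented:

    from bs4 import BeautifulSoup

    html = '<a href="../fujian/a.pdf">附件</a><img src="./images/1.png">'
    soup = BeautifulSoup(html, 'lxml')
    soup = paserUrl(soup, 'https://gzw.beijing.gov.cn/xxfb/zcfg/t_detail.html')
    print(soup.prettify())  # every href/src now carries the full host prefix

Because paserUrl mutates and returns the soup it receives, the caller can keep working with the same object, which is how bei_jing() uses it in the next hunk.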
@@ -556,15 +536,14 @@ def bei_jing():
             cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
             soup_cont = BeautifulSoup(cont, 'lxml')
-            fu_jian_soup = soup_cont.find_all('a')
-            attachjob(fu_jian_soup, href[0])
-            print(fu_jian_soup)
-            print(soup_cont)
-            print(title)
+            # print(fu_jian_soup)
+            soup = paserUrl(soup_cont, href)
+            text = str(soup.prettify())
+            print(text)
             # print(title)
             num = 0
             fu_jian_soup = soup.find_all('a')
             for file in fu_jian_soup:
                 num += 1
                 file_href = file['href']
...
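For context, the pattern bei_jing() now follows: render the page in Selenium, pull the article container's innerHTML, and absolutize every link through paserUrl before the text is stored. A hedged sketch, assuming a Chrome driver and paserUrl in scope; the URL is invented:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from bs4 import BeautifulSoup

    bro = webdriver.Chrome()  # the script's real driver setup is outside this hunk
    bro.get('https://gzw.beijing.gov.cn/xxfb/zcfg/t_detail.html')  # hypothetical URL
    cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
    soup = paserUrl(BeautifulSoup(cont, 'lxml'), bro.current_url)
    print(soup.prettify())
    bro.quit()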
comData/weixin_solo/get_tokenCookies.py
...
@@ -58,7 +58,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # Adjustable
-    time.sleep(30)
+    time.sleep(70)
     s = requests.session()
     # Grab the token and cookies
...
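The only change here widens the pause from 30 to 70 seconds, giving the operator more time to scan the WeChat login QR code before the token and cookies are harvested. A sketch of the surrounding pattern; the cookie hand-off into requests is assumed rather than shown in this hunk:

    import time
    import requests
    from selenium import webdriver

    browser = webdriver.Chrome()
    browser.get("https://mp.weixin.qq.com/")
    time.sleep(70)  # operator scans the login QR code during this window

    s = requests.session()
    for c in browser.get_cookies():           # copy the logged-in browser cookies
        s.cookies.set(c['name'], c['value'])  # into the requests session

An explicit WebDriverWait on a post-login element would be a more robust alternative to the fixed sleep.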