丁双波 / zzsn_spider / Commits / eeb41ef7

Commit eeb41ef7, authored Sep 09, 2023 by 薛凌堃
Commit message: 政策法规 (policies and regulations)
Parent: d5722767

Showing 1 changed file with 47 additions and 50 deletions:

comData/policylaw/2.py (+47 −50)
@@ -725,7 +725,7 @@ def ji_lin():
             if is_href:
                 continue
             try:
-                # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj/202211/t20221123_2310750.html'
+                # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
                 href_text = requests.get(url=real_href, headers=headers, verify=False)
                 i_html = href_text.text.encode("ISO-8859-1")
                 i_html = i_html.decode("utf-8")
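The encode("ISO-8859-1") / decode("utf-8") round-trip above is the usual repair when requests falls back to Latin-1 on a UTF-8 page whose server omits the charset. A minimal sketch of the more direct equivalent (placeholder URL, for illustration only):

    import requests

    # Minimal sketch, not the committed code: override the guessed charset so
    # resp.text decodes correctly — equivalent to .encode('ISO-8859-1').decode('utf-8').
    resp = requests.get('http://gzw.jl.gov.cn/zwgk/zcwj/', timeout=10)  # placeholder URL
    resp.encoding = 'utf-8'   # or resp.apparent_encoding, as fu_jian() does later in this file
    html = resp.text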
@@ -733,7 +733,8 @@ def ji_lin():
                 # print(i_soup)
                 # 相对路径转化为绝对路径 (convert relative URLs to absolute)
                 soup = paserUrl(i_soup, real_href)
                 text = str(soup.prettify())
+                soup.prettify()
                 try:
                     i_come = i_soup.find('span', class_='source')
                     i_time = i_soup.find('span', class_='time')
@@ -756,9 +757,18 @@ def ji_lin():
                     pub_time = pub.find(class_='left').find('span', class_='time').text
                     pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
                     # print(pub_come)
-                i_content = i_soup.find(class_='zsy_comain')
+                i_content = soup.find(class_='zsy_comain')
                 if i_content:
-                    content = str(i_content)
+                    print(real_href)
                     # 去掉扫一扫 (remove the "scan QR code" widget)
+                    soup.find('div', id='qr_container').decompose()
+                    soup.find('div', id='div_div').decompose()
-                    #去掉style
+                    # 去掉style标签 (remove <style> tags)
+                    for styleTag in soup.find_all('style'):
+                        styleTag.extract()
+                    contentWithTag = soup.find(class_='zsy_comain')
+                    content = contentWithTag.text.strip()
                     # 发文字号 (issuing document number)
                     find_hao = i_content.find_all('p')[:3]
                     pub_hao = ''
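One fragility in the added block: soup.find('div', id='qr_container') returns None on pages without the QR widget, and .decompose() then raises AttributeError. A None-safe variant of the same clean-up, on an invented sample page:

    from bs4 import BeautifulSoup

    html = ('<div id="qr_container">scan</div><style>p{}</style>'
            '<div class="zsy_comain">body</div>')          # invented sample
    soup = BeautifulSoup(html, 'html.parser')
    for div_id in ('qr_container', 'div_div'):             # ids taken from the diff
        node = soup.find('div', id=div_id)
        if node is not None:                               # .find() returns None when absent
            node.decompose()                               # remove the tag and its contents
    for style_tag in soup.find_all('style'):
        style_tag.extract()                                # detach <style> blocks
    print(soup.find(class_='zsy_comain').text.strip())     # -> 'body'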
@@ -767,7 +777,7 @@ def ji_lin():
                             pub_hao = j.text
                         else:
                             continue
-                    fj = i_soup.find('div', style='width:920px; margin: 0 auto;')
+                    fj = soup.find('div', style='width:920px; margin: 0 auto;')
                     if fj:
                         li_list = fj.find_all('li')
                         for li in li_list:
@@ -790,16 +800,20 @@ def ji_lin():
                         else:
                             continue
                 else:
-                    i_content = i_soup.find(class_="content")
+                    i_content = soup.find(class_="content")
                     # 将文章中的附件字段删去 (strip attachment lines from the article)
                     pattern = r'\d+\.'
                     # pattern = r"附件:\d+\.\s*(.*)"
                     for p in i_content.find_all('div')[-10:]:
                         p_text = p.text
                         matches = re.findall(pattern, p_text)
                         if matches:
                             for k in matches:
                                 if k in p_text:
                                     p.extract()
-                    content = str(i_content)
+                    contentWithTag = i_content
+                    content = contentWithTag.text.strip()
                 # 找到附件上传至文件服务器 (find attachments and upload them to the file server)
                 fj_soup = i_soup.find('div', class_='wenjianfujian')
                 fj_list = fj_soup.find_all('a')
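The loop above drops any of the article's last ten <div>s whose text contains a numbered-list marker like "1." (the inner `if k in p_text` check is always true, since re.findall returned those substrings from p_text). A self-contained illustration on invented sample HTML:

    import re
    from bs4 import BeautifulSoup

    # Invented sample: an article whose tail carries numbered attachment links.
    html = ('<div class="content"><div>正文 body text</div>'
            '<div>附件: 1.通知.pdf</div><div>2.名单.xls</div></div>')
    i_content = BeautifulSoup(html, 'html.parser').find(class_='content')
    pattern = r'\d+\.'                       # same pattern as the diff
    for p in i_content.find_all('div')[-10:]:
        if re.findall(pattern, p.text):      # any "1." style marker => drop the div
            p.extract()
    print(i_content.text.strip())            # -> '正文 body text'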
@@ -815,7 +829,7 @@ def ji_lin():
                         pass
                     else:
                         continue
-                    att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num)
+                    att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num)
                     id_list.append(att_id)
                     # todo:将返回的地址更新到soup (todo: write the returned address back into soup)
@@ -836,8 +850,8 @@ def ji_lin():
                 dic_news = {
                     'attachmentIds': id_list,
                     'author': '',
-                    'content': str(i_content.text),
-                    'contentWithTag': content,
+                    'content': content,
+                    'contentWithTag': contentWithTag,
                     'createDate': time_now,
                     'deleteFlag': 0,
                     'id': '',
@@ -1168,15 +1182,17 @@ def fu_jian():
             i_html = href_text.text
             i_soup = BeautifulSoup(i_html, 'html.parser')
             real_href = href
+            real_href = 'http://gzw.fujian.gov.cn/zwgk/xxgkzl/xxgkml/gfxwj/202211/t20221129_6064610.htm'
             # print(real_href)
-            is_href = db_storage.find_one({'网址': real_href})
-            if is_href:
-                continue
+            # is_href = db_storage.find_one({'网址': real_href})
+            # if is_href:
+            #     continue
             try:
-                # 文章是远程pdf 直接下载文件至服务器,解析出正文内容
+                # 文章是远程pdf (the article is a remote PDF)
+                # 直接下载文件至服务器,解析出正文内容 (download it to the server and parse out the body text)
                 if '.pdf' in real_href:
                     # pass
                     resp_content = requests.get(real_href, headers=headers, verify=False, timeout=20).content
                     # 解析出pdf内容 (extract the PDF text)
                     content = baseCore.pdf_content(resp_content)
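baseCore.pdf_content() is project-internal and not shown in this diff; assuming it simply extracts text from the downloaded bytes, an equivalent sketch with pdfplumber (an assumption, not the project's actual dependency) would be:

    import io
    import pdfplumber  # assumed stand-in; baseCore.pdf_content is not shown here

    def pdf_content_sketch(pdf_bytes: bytes) -> str:
        # Hedged sketch of what baseCore.pdf_content presumably does:
        # read the downloaded bytes and concatenate per-page text.
        text_parts = []
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            for page in pdf.pages:
                text_parts.append(page.extract_text() or '')
        return '\n'.join(text_parts)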
@@ -1195,8 +1211,6 @@ def fu_jian():
             else:
                 try:
-                    real_href = 'http://gzw.fujian.gov.cn/ztzl/gzjgfzjs/gfxwj_7426/201809/t20180911_4492105.htm'
                     href_text = requests.get(url=real_href, headers=headers, verify=False)
                     href_text.encoding = href_text.apparent_encoding
                     i_html = href_text.text
@@ -1208,6 +1222,7 @@ def fu_jian():
                     try:
                         fu_jian_list = i_soup.find('ul', class_='clearflx myzj_xl_list').find_all('a')
                     except:
                         pass
+                        fu_jian_list = []
                     for fu_jian in fu_jian_list:
                         fj_href = fu_jian['href']
@@ -1234,29 +1249,12 @@ def fu_jian():
                     pub_hao = ''
             except:
                 print(f'-------其他情况:{real_href}-------')
                 continue
-            # href_text = requests.get(url=real_href, headers=headers, verify=False)
-            # href_text.encoding = href_text.apparent_encoding
-            # i_html = href_text.text
-            # i_soup = BeautifulSoup(i_html, 'html.parser')
-            # i_soup = paserUrl(i_soup, real_href)
-            # # print(i_soup)
-            # source = str(i_soup.find('table', attrs={'class': 'tp-pho'}).text)
-            # pub_hao = source.split('文号')[1].split('发布机构')[0].strip().lstrip()
-            # pub_source = source.split('发布机构')[1].split('生成日期')[0].strip().lstrip()
-            # pub_time = source.split('生成日期')[1].split('标题')[0].strip().lstrip()
-            # content = i_soup.find('div', attrs={'class': 'xl-article-nr'})
-            # fu_jian_result = re.findall('href="(.*?)"', str(content))
-            # fu_jian_href_list = []
-            # if len(fu_jian_result) > 0:
-            #     for fu_jian_re in fu_jian_result:
-            #         if '.doc' in fu_jian_re or '.pdf' in fu_jian_re or '.xls' in fu_jian_re or '.zip' in fu_jian_re \
-            #                 or '.rar' in fu_jian_re or '.ppt' in fu_jian_re or '.PDF' in fu_jian_re or '.DOC' in fu_jian_re \
-            #                 or '.XLS' in fu_jian_re or '.ZIP' in fu_jian_re or '.RAR' in fu_jian_re:
-            #             fu_jian_href = fu_jian_re
-            #             print(fu_jian_href)
-            #             fu_jian_href_list.append(fu_jian_href)
+            pub_source = ''
+            pub_time = ''
+            contentwithtag = i_soup.find('tabs tab_base_01 rules_con1')
+            content = contentwithtag.text.strip()
+            pub_hao = contentwithtag.find_all('div', class_='rules_tit1 b-free-read-leaf').text.dtrip()
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             # todo:传kafka字段 (todo: fields sent to Kafka)
             dic_news = {
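As committed, the three added extraction lines cannot run: find('tabs tab_base_01 rules_con1') searches for a tag *named* that string rather than for the class, find_all(...) returns a ResultSet that has no .text, and .dtrip() is a typo for .strip(). A runnable sketch of the apparent intent, on invented sample HTML:

    from bs4 import BeautifulSoup

    # Corrected sketch of the added lines (apparent intent, not the committed code);
    # sample HTML and document number are invented to make the fragment runnable.
    html = ('<div class="tabs tab_base_01 rules_con1">'
            '<div class="rules_tit1 b-free-read-leaf">闽国资〔2022〕1号</div>正文</div>')
    i_soup = BeautifulSoup(html, 'html.parser')
    contentwithtag = i_soup.find(class_='tabs tab_base_01 rules_con1')  # class lookup, not tag name
    content = contentwithtag.text.strip()
    hao_div = contentwithtag.find('div', class_='rules_tit1 b-free-read-leaf')  # find(), so .text exists
    pub_hao = hao_div.text.strip()                                      # strip(), not dtrip()
    print(pub_hao)  # -> '闽国资〔2022〕1号'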
@@ -1283,7 +1281,6 @@ def fu_jian():
             sendKafka(dic_news)
             save_data(dic_news)
             print(title)
             # save_data(result_dict)
             num += 1
         except:
             pass
@@ -1727,7 +1724,8 @@ def hai_nan():
                 'summary': '',
                 'title': title
             }
             sendKafka(dic_news)
             save_data(dic_news)
+            href_text.close()
             # save_data(result_dict)
             print(title)
@@ -1777,7 +1775,7 @@ def hai_nan():
                 contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                 content = contentWithTag.text
             except:
-                print(href)
+                # print(href)
             pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
             topicClassification = ''
             origin = str(pub_result.text).split('来源:')[1].split(' 【字体:')[0].lstrip().strip()
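The origin extraction above relies on fixed '来源:' and '【字体:' delimiters in the page header. A minimal illustration on an invented sample string:

    # Invented sample of the header text this split chain expects.
    pub_text = '来源:海南省国资委 【字体:大 中 小】'
    origin = pub_text.split('来源:')[1].split(' 【字体:')[0].strip()
    print(origin)  # -> '海南省国资委'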
@@ -2411,9 +2409,8 @@ def gui_zhou():
                 'title': title
             }
-            # print(dic_news)
-            # sendKafka(dic_news)
-            # save_data(dic_news)
+            sendKafka(dic_news)
+            save_data(dic_news)
             print(title)
             # save_data(result_dict)
             num = num + 1
@@ -2700,7 +2697,7 @@ def chong_qing():
             contentWithTag = doc_href.find('div', class_='zwxl-article')
             content = contentWithTag.text
         except:
-            pub_source = ''
+            origin = ''
             topicClassification = ''
             pub_time = ''
             writtenDate = ''
@@ -2742,7 +2739,7 @@ def chong_qing():
             'id': '',
             'labels': [{'relationId': "1693", 'relationName': "重庆市国资委", 'labelMark': "policy"}],
-            'origin': '',
+            'origin': origin,
             'organ': '',
             'topicClassification': topicClassification,
             'issuedNumber': pub_hao,
@@ -5392,7 +5389,7 @@ if __name__ == '__main__':
     # ji_lin()
     # shang_hai()
     # zhe_jiang()
-    # fu_jian()
+    fu_jian()
     # shan_dong()
     # guang_dong()
     # hai_nan()