zzsn_spider / Commits / baba9e5d

Commit baba9e5d, authored Dec 11, 2023 by 薛凌堃
Parent commit: f2ff6737

Commit message: 政策法规脚本维护 (maintenance of the policy and regulation crawler scripts)

Showing 4 changed files with 692 additions (+692) and 14 deletions (-14):
comData/policylaw/BaseCore.py    +14   -5
comData/policylaw/gwysasac.py     +9   -9
comData/policylaw/ji_lin.py     +225   -0
comData/policylaw/tian_jin.py   +444   -0
comData/policylaw/BaseCore.py  (+14 -5)

@@ -505,27 +505,36 @@ class BaseCore:
 ...
             for i in range(0, 3):
                 try:
                     response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                    file_size = int(response.headers.get('Content-Length'))
                     break
-                except:
+                except Exception as e:
                     time.sleep(3)
+                    if i == 2:
+                        return retData
                     continue
+            try:
+                if response.status_code == 200:
+                    file_size = int(response.headers.get('Content-Length'))
+                else:
+                    return retData
+            except:
+                file_size = ''
             for i in range(0, 3):
                 try:
                     name = str(self.getuuid()) + category
                     result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
                     break
                 except:
                     time.sleep(3)
                     continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
             retData['full_path'] = result['body']['objectUrl']
-            retData['file_size'] = self.convert_size(file_size)
+            try:
+                retData['file_size'] = self.convert_size(file_size)
+            except:
+                retData['file_size'] = ''
             retData['create_time'] = time_now
             return retData
         except Exception as e:
 ...
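The BaseCore.py hunk reworks the attachment download in three ways: Content-Length is now read only after the retry loop has produced a response and the status code is 200, the loop gives up and returns retData after the third failed attempt, and convert_size is wrapped so a missing or non-numeric Content-Length degrades to an empty file_size instead of raising. A minimal, self-contained sketch of that pattern follows; the function name and the (None, '') failure value are illustrative, not part of BaseCore:

# Sketch only: mirrors the retry-then-measure pattern from the hunk above.
import time
import requests

def fetch_with_size(url, headers, retries=3):
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, verify=False, timeout=20)
            break
        except Exception:
            time.sleep(3)
            if i == retries - 1:
                return None, ''   # all attempts failed: bail out early
    try:
        if response.status_code == 200:
            file_size = int(response.headers.get('Content-Length'))
        else:
            return None, ''       # non-200 responses are not uploaded
    except Exception:
        file_size = ''            # header absent or not an integer
    return response, file_size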
comData/policylaw/gwysasac.py  (+9 -9)

@@ -34,8 +34,8 @@ def get_content3():
     doc_href = soup.find('div', class_='zsy_content')
     try:
         org_content = doc_href.select('.zsy_cotitle')[0]
-        org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-    except:
+        org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
+    except Exception as e:
         org = ''
     try:
         contentWithTag = doc_href.find('div', class_='zsy_comain')

@@ -103,7 +103,7 @@ def get_content3():
             'id': '',  #
             'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
             # 关联标签id 关联标签名称 关联标签标识
-            'origin': '',  # 政策发布机关
+            'origin': org,  # 政策发布机关
             'organ': org,  # 政策发文机关
             'topicClassification': '',  # 政策文件分类
             'issuedNumber': pub_hao,  # 发文字号

@@ -168,10 +168,10 @@ def get_content3():
             href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
             # 判断是否已经爬取过
             is_href = baseTool.db_storage.find_one({'网址': href})
-            if is_href:
-                num += 1
-                log.info('已采集----------跳过')
-                continue
+            # if is_href:
+            #     num += 1
+            #     log.info('已采集----------跳过')
+            #     continue
             title = doc_item('a').attr('title')
             pub_time = doc_item('span').text().replace('[', '').replace(']', '')
         except:

@@ -184,9 +184,9 @@ def get_content3():
     end_time = time.time()
     log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
-# partOne()
+partOne()
 # 增量执行需要注释掉partTwo()
-partTwo()
+# partTwo()
 if __name__ == "__main__":
 ...
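The functional fix in the first hunk is the str() wrapper: re.findall expects a string or bytes-like object, so passing the BeautifulSoup Tag directly raises TypeError and the source field always fell back to the empty string in the old except branch. A small demonstration, with made-up sample HTML:

# Illustration only; the sample markup is invented, the pattern is the one used above.
import re
from bs4 import BeautifulSoup

tag = BeautifulSoup('<div class="zsy_cotitle">文章来源:某单位 发布时间:2023-12-11</div>', 'html.parser').div
# re.findall('文章来源:(.*?)发布时间:', tag)       # TypeError: expected string or bytes-like object
org = re.findall('文章来源:(.*?)发布时间:', str(tag))[0].strip()
print(org)   # 某单位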
comData/policylaw/ji_lin.py  (new file, mode 100644)

import os
import re
import time
import requests
from bs4 import BeautifulSoup
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()


# 吉林
def ji_lin():
    start = time.time()
    num = 0
    count = 0
    url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
    try:
        resp_text = requests.get(url=url, headers=baseTool.headers, verify=False)
        resp_text.encoding = 'utf-8'
        html = resp_text.text
        soup = BeautifulSoup(html, 'html.parser')
        result = soup.find(class_='list ej_list')
        li_list = result.find_all('li')
        for a in li_list:
            id_list = []
            a_text = str(a)
            href = a.find('a')['href']
            # 网站链接
            if re.findall('http', href):
                real_href = href
            else:
                real_href = url + a_text.split('href=".')[-1].split('" target="_blank')[0]
            title = a.find('a').text.replace('\n', '')
            is_href = baseTool.db_storage.find_one({'网址': real_href})
            if is_href:
                num += 1
                continue
            try:
                # real_href = 'http://gzw.jl.gov.cn/zwgk/zcwj//201906/t20190624_2310742.html'
                href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False)
                i_html = href_text.text.encode("ISO-8859-1")
                i_html = i_html.decode("utf-8")
                i_soup = BeautifulSoup(i_html, 'html.parser')
                # print(i_soup)
                # 相对路径转化为绝对路径
                soup = baseTool.paserUrl(i_soup, real_href)
                soup.prettify()
                try:
                    i_come = i_soup.find('span', class_='source')
                    i_time = i_soup.find('span', class_='time')
                    pub_come = i_come.text.split('.write(" ')[1].split('");')[0].strip()
                    pub_time = i_time.text.split('时间:')[1].strip()
                except:
                    i_come = i_soup.find('div', class_='zsy_cotitle')
                    i_time = i_soup.find('div', class_='zsy_cotitle')
                    if (i_come):
                        # pub_come = i_come.find('p')
                        try:
                            pub_come = i_come.find('p').text.split('信息来源 > ')[1].split('发布时间:')[0].strip()
                        except:
                            pub_come = i_come.find('p').text.split('文章来源')[1].split('发布时间:')[0].strip()
                        # print(pub_time)
                        pub_time = i_time.find('p').text.split('发布时间:')[1].strip()
                        # print(pub_come)
                    else:
                        pub = i_soup.find(class_='share')
                        pub_time = pub.find(class_='left').find('span', class_='time').text
                        if '时间' in pub_time:
                            pub_time = pub_time.split('时间:')[1].strip()
                        pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源:')[1].strip()
                        # print(pub_come)
                i_content = soup.find(class_='zsy_comain')
                if i_content:
                    # print(real_href)
                    # 去掉扫一扫
                    try:
                        soup.find('div', id='qr_container').decompose()
                        soup.find('div', id='div_div').decompose()
                    except:
                        i_content = soup
                    # 去掉style
                    # 去掉style标签
                    try:
                        for styleTag in soup.find_all('style'):
                            styleTag.extract()
                    except:
                        i_content = soup
                    contentWithTag = soup.find(class_='zsy_comain')
                    content = contentWithTag.text.strip()
                    if content == '' or content == 'None':
                        log.info(f'{real_href}-----{title}----内容为空')
                        continue
                    # 发文字号
                    find_hao = i_content.find_all('p')[:3]
                    pub_hao = ''
                    for j in find_hao:
                        if '号' in j.text:
                            pub_hao = j.text
                        else:
                            continue
                    fj = soup.find('div', style='width:920px; margin: 0 auto;')
                    if fj:
                        li_list = fj.find_all('li')
                        for li in li_list:
                            fu_jian_href = li.find('a')['href']
                            # 如果是附件
                            if '.pdf' in fu_jian_href or '.wps' in fu_jian_href or '.docx' in fu_jian_href or '.doc' in fu_jian_href or 'xls' in fu_jian_href or '.zip' in fu_jian_href \
                                    or '.rar' in fu_jian_href or '.ppt' in fu_jian_href or '.PDF' in fu_jian_href or '.DOC' in fu_jian_href \
                                    or '.XLS' in fu_jian_href or '.ZIP' in fu_jian_href or '.RAR' in fu_jian_href:
                                file_name = fu_jian_href.text.strip()
                                category = os.path.splitext(fu_jian_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                # print(fu_jian_href)
                                retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num, pub_time)
                                id_list.append(att_id)
                                #
                                # # todo:将返回的地址更新到soup
                                li.find('a')['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                            else:
                                continue
                else:
                    i_content = soup.find(class_="content")
                    # 将文章中的附件字段删去
                    pattern = r'\d+\.'
                    # pattern = r"附件:\d+\.\s*(.*)"
                    for p in i_content.find_all('div')[-10:]:
                        p_text = p.text
                        matches = re.findall(pattern, p_text)
                        if matches:
                            for k in matches:
                                if k in p_text:
                                    p.extract()
                    contentWithTag = i_content
                    content = contentWithTag.text.strip()
                    if content == '' or content == 'None':
                        log.info(f'{real_href}-----{title}----内容为空')
                        continue
                    # 找到附件上传至文件服务器
                    fj_soup = i_soup.find('div', class_='wenjianfujian')
                    fj_list = fj_soup.find_all('a')
                    # for fu_jian_href in fj_list:
                    #     fj_href = fu_jian_href['href']
                    #     file_name = fu_jian_href.text.strip()
                    #     # 如果是附件
                    #     if '.pdf' in fj_href or '.wps' in fj_href or '.docx' in fj_href or '.doc' in fj_href or 'xls' in fj_href or '.zip' in fj_href \
                    #             or '.rar' in fj_href or '.ppt' in fj_href or '.PDF' in fj_href or '.DOC' in fj_href \
                    #             or '.XLS' in fj_href or '.ZIP' in fj_href or '.RAR' in fj_href:
                    #         # print(fj_href)
                    #         category = os.path.splitext(fj_href)[1]
                    #         if category not in file_name:
                    #             file_name = file_name + category
                    #         retData = baseCore.uptoOBS(fj_href, '1670', file_name)
                    #         if retData['state']:
                    #             pass
                    #         else:
                    #             continue
                    #         att_id, full_path = baseCore.tableUpdate(retData, '吉林省国资委', file_name, num, pub_time)
                    #         id_list.append(att_id)
                    #         #
                    #         # # todo:将返回的地址更新到soup
                    #         fu_jian_href['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                    #     else:
                    #         continue
                if '扫一扫在手机打开当前页' in content:
                    content.replace('扫一扫在手机打开当前页', '')
                    soup.find('div', id='div_div').decompose()
                    soup.find('div', id='qr_container').decompose()
                else:
                    pass
                log.info(title)
                # print('............................................................')
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # todo:传kafka字段
                dic_news = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': '',
                    'labels': [{'relationId': "1670", 'relationName': "吉林市国资委", 'labelMark': "policy"}],
                    'origin': pub_come,
                    'organ': '',
                    'topicClassification': '',
                    'issuedNumber': '',
                    'publishDate': pub_time,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': real_href,
                    'summary': '',
                    'title': title
                }
                # 如果内容为空,则数据不传接口
                if content == '' or content == 'None':
                    continue
                else:
                    # print(dic_news)
                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                        num = num + 1
                    count += 1
            except Exception as e:
                log.info(e)
                pass
    except:
        pass
    end = time.time()
    log.info(f'共{count}条...........共耗时 {end - start}秒')


if __name__ == "__main__":
    ji_lin()
\ No newline at end of file
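Both new crawlers decide whether a link is a downloadable attachment by chaining a dozen substring checks over mixed-case extensions. For reference only (this is not part of the commit and is not byte-for-byte equivalent, since the original also matches a bare 'xls' anywhere in the URL), the same intent can be expressed by normalising the extension once:

# Reference sketch, not from the commit: extension test via splitext + lower().
import os

ATTACHMENT_EXTS = {'.pdf', '.wps', '.doc', '.docx', '.xls', '.xlsx',
                   '.zip', '.rar', '.ppt', '.pptx'}

def is_attachment(href):
    # splitext keeps the leading dot; lower() folds '.PDF', '.DOC', '.RAR', ...
    return os.path.splitext(href)[1].lower() in ATTACHMENT_EXTS

print(is_attachment('/uploads/notice.PDF'))       # True
print(is_attachment('/zwgk/zcwj/index.html'))     # False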
comData/policylaw/tian_jin.py  (new file, mode 100644)

import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from ClassTool import ClassTool
baseTool = ClassTool()
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()


# 天津
def tian_jin():
    def tian_jin1():
        num = 0
        count = 0
        start_time = time.time()
        for page in range(0, 3):
            if page == 0:
                url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/'
            else:
                url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/index_{page}.html'
            try:
                baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
                req = requests.get(url=url, headers=baseTool.headers, verify=False)
                req_text = req.text.encode("ISO-8859-1")
                req_text = req_text.decode("utf-8")
                soup = BeautifulSoup(req_text, 'html.parser')
                doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
                li_list = doc_items.find_all('li')
                for li in li_list:
                    title = str(li.find('a').text).replace('\n', '').lstrip().strip()
                    i_href = str(li.find('a').get('href'))
                    if 'ZTZL' in i_href:
                        href = i_href.replace('../../../', 'https://sasac.tj.gov.cn/')
                    elif './' in i_href:
                        href = i_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/')
                    else:
                        href = i_href
                    is_href = baseTool.db_storage.find_one({'网址': href})
                    if is_href:
                        num += 1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
                        driver = baseTool.getDriver()
                        driver.get(href)
                        time.sleep(2)
                        href_text = driver.page_source
                        soup = baseTool.paserUrl(href_text, href)
                        doc_href = pq(str(soup))
                        title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
                        organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
                        issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
                        topicClassification = doc_href('div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
                        writtenDate_ = doc_href('div[class="top-container"]>div:nth-child(6)>:nth-child(2)').text()
                        publishDate_ = doc_href('div[class="top-container"]>div:nth-child(7)>:nth-child(2)').text()
                        date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
                        writtenDate = date_obj1.strftime("%Y-%m-%d")
                        date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
                        publishDate = date_obj2.strftime("%Y-%m-%d")
                        doc_href('div[id="articlePlayer"]').remove()
                        contentWithTag = doc_href('div[id="xlrllt"]')
                        origin = ''
                        if len(title) < 1:
                            title = doc_href('div[class="common-content-mainTitle"]').text()
                            issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
                            origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:', '').strip()
                            publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:', '').strip()
                            rmtag2 = doc_href('div[id="articlePlayer"]')
                            rmtag2.remove()
                            contentWithTag = doc_href('div[id="zoom"]')
                        if len(writtenDate) < 1:
                            writtenDate = None
                        if len(publishDate) < 1:
                            publishDate = doc_href('meta[name="PubDate"]').attr('content')
                        soup = baseTool.paserUrl(str(contentWithTag), href)
                        fu_jian_soup = soup.find_all('a')
                        id_list = []
                        for file in fu_jian_soup:
                            try:
                                file_href = file['href']
                            except Exception as e:
                                continue
                            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
                                category = os.path.splitext(file_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                retData = baseCore.uptoOBS(file_href, '1683', file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
                                file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
                            if len(fu_jian_soup) < 1:
                                continue
                        content = soup.text
                        if content == '' or content == None:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
                        dic_news = {
                            'attachmentIds': id_list,
                            'author': '',
                            'content': str(content),
                            'contentWithTag': str(contentWithTag),
                            'createDate': time_now,
                            'deleteFlag': 0,
                            'id': '',
                            'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
                            'origin': origin,
                            'organ': organ,
                            'topicClassification': topicClassification,
                            'issuedNumber': issuedNumber,
                            'publishDate': publishDate,
                            'writtenDate': writtenDate,
                            'sid': '1697458829758697473',
                            'sourceAddress': href,
                            'summary': '',
                            'title': title
                        }
                        # print(dic_news)
                        flag = baseTool.sendKafka(dic_news)
                        if flag:
                            baseTool.save_data(dic_news)
                            num += 1
                        count += 1
                    except Exception as e:
                        pass
            except:
                pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def tian_jin2():
        """
        http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html 4
        """
        num = 0
        count = 0
        start_time = time.time()
        for page in range(0, 5):
            if page == 0:
                url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html'
            else:
                url = f'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index_{page}.html'
            try:
                baseTool.headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
                req = requests.get(url=url, headers=baseTool.headers, verify=False)
                req_text = req.text.encode("ISO-8859-1")
                req_text = req_text.decode("utf-8")
                soup = BeautifulSoup(req_text, 'html.parser')
                doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
                li_list = doc_items.find_all('li')
                for li in li_list:
                    title = str(li.find('a').text).replace('\n', '').lstrip().strip()
                    href = str(li.find('a').get('href'))
                    if 'http:' in href:
                        continue
                    else:
                        href = url.split('index')[0] + href.replace('./', '')
                    is_href = baseTool.db_storage.find_one({'网址': href})
                    if is_href:
                        num += 1
                        continue
                    try:
                        # href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
                        driver = baseTool.getDriver()
                        driver.get(href)
                        time.sleep(2)
                        href_text = driver.page_source
                        soup = baseTool.paserUrl(href_text, href)
                        doc_href = pq(str(soup))
                        title = doc_href('div[class="top-container"]>div:nth-child(1)>:nth-child(2)').text()
                        organ = doc_href('div[class="top-container"]>div:nth-child(3)>:nth-child(2)').text()
                        issuedNumber = doc_href('div[class="top-container"]>div:nth-child(4)>:nth-child(2)').text()
                        topicClassification = doc_href('div[class="top-container"]>div:nth-child(5)>:nth-child(2)').text()
                        writtenDate_ = doc_href('div[id="content_cwrq"]').text()
                        publishDate_ = doc_href('div[id="content_fbrq"]').text()
                        date_obj1 = datetime.datetime.strptime(writtenDate_, "%Y年%m月%d日")
                        writtenDate = date_obj1.strftime("%Y-%m-%d")
                        date_obj2 = datetime.datetime.strptime(publishDate_, "%Y年%m月%d日")
                        publishDate = date_obj2.strftime("%Y-%m-%d")
                        contentWithTag = doc_href('div[id="xlrllt"]')
                        origin = ''
                        if len(title) < 1:
                            title = doc_href('div[class="common-content-mainTitle"]').text()
                            issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
                            origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:', '').strip()
                            publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:', '').strip()
                            rmtag2 = doc_href('div[id="articlePlayer"]')
                            rmtag2.remove()
                            contentWithTag = doc_href('div[id="zoom"]')
                        if len(writtenDate) < 1:
                            writtenDate = None
                        if len(publishDate) < 1:
                            publishDate = doc_href('meta[name="PubDate"]').attr('content')
                        soup = baseTool.paserUrl(str(contentWithTag), href)
                        fu_jian_soup = soup.find_all('a')
                        id_list = []
                        for file in fu_jian_soup:
                            try:
                                file_href = file['href']
                            except Exception as e:
                                continue
                            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
                                category = os.path.splitext(file_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                retData = baseCore.uptoOBS(file_href, '1683', file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
                                file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                        # id_ = redefid(id_list)
                        if id_list:
                            pass
                        else:
                            doc_href("ul[class='qt-attachments-list']").remove()
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
                            if len(fu_jian_soup) < 1:
                                continue
                        content = soup.text
                        if content == '' or content == None:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
                        dic_news = {
                            'attachmentIds': id_list,
                            'author': '',
                            'content': str(content),
                            'contentWithTag': str(contentWithTag),
                            'createDate': time_now,
                            'deleteFlag': 0,
                            'id': '',
                            'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
                            'origin': origin,
                            'organ': organ,
                            'topicClassification': topicClassification,
                            'issuedNumber': issuedNumber,
                            'publishDate': publishDate,
                            'writtenDate': writtenDate,
                            'sid': '1697458829758697473',
                            'sourceAddress': href,
                            'summary': '',
                            'title': title
                        }
                        # print(dic_news)
                        flag = baseTool.sendKafka(dic_news)
                        if flag:
                            baseTool.save_data(dic_news)
                            num += 1
                        count += 1
                    except:
                        pass
            except:
                pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def tian_jin3():
        num = 0
        count = 0
        start_time = time.time()
        for page in range(1, 3):
            if page == 1:
                url = 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index.html'
            else:
                # https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_1.html
                url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_{page - 1}.html'
            try:
                req = requests.get(url, baseTool.headers, verify=False)
                req_text = req.text.encode("ISO-8859-1")
                req_text = req_text.decode("utf-8")
                soup = BeautifulSoup(req_text, 'html.parser')
                doc_items = soup.select('#content > div.mainContent > div > div.mBd > ul')[0]
                li_list = doc_items.find_all('li')
                for li in li_list:
                    title = str(li.find('a').text).replace('\n', '').lstrip().strip()
                    href = str(li.find('a').get('href'))
                    try:
                        publishDate = li.find('div', attrs={'class': 'other'}).text
                    except:
                        publishDate = None
                    if 'http' not in href:
                        if '../../../' in href:
                            href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
                        href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
                    is_href = baseTool.db_storage.find_one({'网址': href})
                    if is_href:
                        num += 1
                        continue
                    try:
                        # res = requests.get(href, headers)
                        # page_text = res.text.encode("ISO-8859-1")
                        # page_text = page_text.decode("utf-8")
                        driver = baseTool.getDriver()
                        driver.get(href)
                        time.sleep(2)
                        href_text = driver.page_source
                        soup = baseTool.paserUrl(href_text, href)
                        doc_href = pq(str(soup))
                        title = doc_href('table[class="bd1"]>tbody>tr:nth-child(3)>td:nth-child(2)').text()
                        organ = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(2)').text()
                        issuedNumber = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(2)').text()
                        topicClassification = doc_href('table[class="bd1"]>tbody>tr:nth-child(1)>td:nth-child(4)').text()
                        writtenDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(2)>td:nth-child(4)').text()
                        publishDate = doc_href('table[class="bd1"]>tbody>tr:nth-child(4)>td:nth-child(4)').text()
                        contentWithTag = doc_href('div[id="UCAP-CONTENT"]')
                        origin = ''
                        if len(title) < 1:
                            title = doc_href('div[class="common-content-mainTitle"]').text()
                            issuedNumber = doc_href('div[class="common-content-subTitle"]').text()
                            origin = doc_href('div[class="property"]>span:nth-child(1)').text().replace('文章来源:', '').strip()
                            publishDate = doc_href('div[class="property"]>span:nth-child(2)').text().replace('发布时间:', '').strip()
                            rmtag2 = doc_href('div[id="articlePlayer"]')
                            rmtag2.remove()
                            contentWithTag = doc_href('div[id="zoom"]')
                        if len(title) < 1:
                            doc_href = doc_href('div[aria-label="内容文本区"]')
                            doc_soup = BeautifulSoup(str(doc_href), 'html.parser')
                            info_list = doc_soup.find('tbody').find('tbody').find('tr').find_all('table')
                            title_tag = info_list[0]
                            organ = info_list[2].find('span', id="laiyuan").text
                            publishDate = info_list[2].find_all('td', class_="hui12")[-1].text
                            contentWithTag = info_list[-1]
                        if len(writtenDate) < 1:
                            writtenDate = None
                        if len(publishDate) < 1:
                            publishDate = doc_href('meta[name="PubDate"]').attr('content')
                        soup = baseTool.paserUrl(str(contentWithTag), href)
                        fu_jian_soup = soup.find_all('a')
                        id_list = []
                        for file in fu_jian_soup:
                            try:
                                file_href = file['href']
                            except Exception as e:
                                continue
                            if '.pdf' in file_href or '.docx' in file_href or '.doc' in file_href or 'xlsx' in file_href or '.zip' in file_href \
                                    or '.rar' in file_href or '.ppt' in file_href or '.PDF' in file_href or '.DOC' in file_href \
                                    or '.XLS' in file_href or '.ZIP' in file_href or '.RAR' in file_href:
                                file_name = file.text.strip()
                                category = os.path.splitext(file_href)[1]
                                if category not in file_name:
                                    file_name = file_name + category
                                retData = baseCore.uptoOBS(file_href, '1683', file_name)
                                if retData['state']:
                                    pass
                                else:
                                    continue
                                att_id, full_path = baseCore.tableUpdate(retData, '天津市国资委', file_name, num, publishDate)
                                id_list.append(att_id)
                                # todo:将返回的地址更新到soup
                                file['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                        # id_ = redefid(id_list)
                        contentWithTag = str(soup.prettify())
                        if len(contentWithTag) < 1:
                            if len(fu_jian_soup) < 1:
                                continue
                        content = soup.text
                        if content == '' or content == None:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # todo:传kafka字段
                        dic_news = {
                            'attachmentIds': id_list,
                            'author': '',
                            'content': str(content),
                            'contentWithTag': str(contentWithTag),
                            'createDate': time_now,
                            'deleteFlag': 0,
                            'id': '',
                            'labels': [{'relationId': "1683", 'relationName': "天津市国资委", 'labelMark': "policy"}],
                            'origin': origin,
                            'organ': organ,
                            'topicClassification': topicClassification,
                            'issuedNumber': issuedNumber,
                            'publishDate': publishDate,
                            'writtenDate': writtenDate,
                            'sid': '1697458829758697473',
                            'sourceAddress': href,
                            'summary': '',
                            'title': title
                        }
                        # print(dic_news)
                        flag = baseTool.sendKafka(dic_news)
                        if flag:
                            baseTool.save_data(dic_news)
                            num += 1
                        count += 1
                    except Exception as e:
                        pass
            except:
                pass
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    tian_jin1()
    tian_jin2()
    tian_jin3()


if __name__ == "__main__":
    tian_jin()
\ No newline at end of file
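All three tian_jin sub-crawlers repeat the same date normalisation: parse strings such as '2023年12月11日' with strptime and reformat to ISO style, treating an empty field as missing. A standalone sketch of that step follows; the helper name and the None fallback for unparsable input are illustrative, not from the commit:

# Sketch of the '%Y年%m月%d日' -> '%Y-%m-%d' conversion used in tian_jin1/2/3.
import datetime

def normalize_cn_date(text):
    text = (text or '').strip()
    if not text:
        return None                       # empty field: treat as missing
    try:
        return datetime.datetime.strptime(text, "%Y年%m月%d日").strftime("%Y-%m-%d")
    except ValueError:
        return None                       # unparsable input

print(normalize_cn_date('2023年12月11日'))   # 2023-12-11
print(normalize_cn_date(''))                 # None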