王景浩 / zzsn_spider / Commits / 5f3288f3

Commit 5f3288f3, authored Jun 27, 2024 by XveLingKun

Merge remote-tracking branch 'origin/master'

Parents: e471e82e, 29d5214b

Showing 2 changed files with 547 additions and 0 deletions
comData/policylaw/flk.py  +145 −0
comData/policylaw/flk_buchong.py  +402 −0
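In brief: both new files crawl the National Database of Laws and Regulations (flk.npc.gov.cn). flk.py searches the database by keyword, uploads each law's WORD attachment to OBS, and pushes a metadata record to Kafka. flk_buchong.py is a supplementary pass over the same search results: it re-reads previously collected records, extracts body text from the stored .doc/.docx attachments, and back-fills missing publish and implementation dates from a local Excel sheet.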
comData/policylaw/flk.py (new file, mode 100644)
import datetime
import time
import urllib.parse

import requests

from ClassTool import ClassTool
from BaseCore import BaseCore

baseTool = ClassTool()
baseCore = BaseCore()
log = baseCore.getLogger()

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'flk.npc.gov.cn',
    'Referer': 'https://flk.npc.gov.cn/fl.html',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
def getDataJson(url):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    datasJson = req.json()['result']['data']
    totalSizes = req.json()['result']['totalSizes']
    req.close()
    return datasJson, totalSizes
def getPdf(id_, title, publishDate):
    id_list = []
    url = 'https://flk.npc.gov.cn/api/detail'
    payload = {'id': id_}
    req = requests.post(url, headers=headers, data=payload)
    req.encoding = req.apparent_encoding
    datasJson = req.json()['result']['body']
    req.close()
    href = ''
    for dataJson in datasJson:
        if dataJson['type'] == 'WORD':
            href = 'https://wb.flk.npc.gov.cn' + dataJson['path']
            break
    if not href:
        log.error(f'{title}===failed to get the attachment link')
        return ''
    retData = baseCore.uptoOBS(href, '1699', title)
    if not retData['state']:
        return ''
    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', title, 0, publishDate)
    id_list.append(att_id)
    return id_list
def getDic(title, office, publishDate, expiry, type, timeliness, href, id_):
    id_list = getPdf(id_, title, publishDate)
    if not id_list:
        log.error(f'{title}===attachment download failed')
        return ''
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    dic_news = {
        'attachmentIds': id_list,  # attachment ids
        'author': '',  # author
        'content': title,  # body text without tags (flk.py stores the title here)
        'contentWithTag': '',  # body text with tags
        'createDate': now,  # creation time
        'deleteFlag': 0,  # delete flag (0 = default, 1 = deleted)
        'id': '',
        'labels': [{'relationId': "1788847783801794562",
                    'relationName': "国资国企法律法规",
                    'labelMark': "policy"}],  # related label id, name, and mark
        'origin': '',  # issuing authority
        'organ': office,  # drafting organ
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document number
        'publishDate': publishDate,  # publication date of the law
        'writtenDate': None,  # date written
        'implementDate': expiry,  # date of entry into force
        'sid': '1788838266435284993',  # information source id
        'sourceAddress': href,  # link to the original text
        'summary': '',  # summary
        'title': title,  # title
        'legalPrecedenceHierarchy': type,  # hierarchy of legal force
        'effectiveness': timeliness,  # validity status
    }
    return dic_news
def doJob():
    searchList = ['国有资产', '国资', '国有企业', '企业', '公司']
    for search in searchList:
        search_ = urllib.parse.quote(search)
        url = f'https://flk.npc.gov.cn/api/?type=&fgbt={search_}&searchType=title%3Baccurate%3B1&sortTr=f_bbrq_s%3Bdesc&gbrqStart=&gbrqEnd=&sxrqStart=&sxrqEnd=&page=1&size=10'
        datasJson, totalSizes = getDataJson(url)
        # integer division keeps totalPage usable by range() below
        if totalSizes % 10 == 0:
            totalPage = totalSizes // 10
        else:
            totalPage = totalSizes // 10 + 1
        for page in range(1, totalPage + 1):
            if page != 1:
                url = url.replace(f'&page={page - 1}', f'&page={page}')
                datasJson, totalSizes = getDataJson(url)
            for dataJson in datasJson:
                id_ = dataJson['id']
                title = dataJson['title']
                office = dataJson['office']
                publishDate = dataJson['publish']
                expiry = dataJson['expiry']
                type = dataJson['type']
                status = dataJson['status']
                if status == '1':
                    timeliness = '有效'  # in force
                elif status == '5':
                    timeliness = '已修改'  # amended
                elif status == '9':
                    timeliness = '已废止'  # repealed
                elif status == '3':
                    timeliness = '尚未生效'  # not yet in force
                else:
                    timeliness = ''  # unknown status; avoids an unbound variable below
                href = dataJson['url'].replace('./', 'https://flk.npc.gov.cn/')
                is_href = baseTool.db_storage.find_one({'网址': href})
                if is_href:
                    log.info(f'{title}===already collected')
                    continue
                dic = getDic(title, office, publishDate, expiry, type, timeliness, href, id_)
                if dic:
                    flag = baseTool.sendKafka(dic)
                    if flag:
                        baseTool.save_data(dic)
                else:
                    log.error(f'{title}==={href}===collection failed')
                time.sleep(2)


if __name__ == '__main__':
    doJob()
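The total-page computation above uses a modulo check (adjusted here to integer division, since range() rejects floats). A minimal equivalent sketch, assuming totalSizes is a non-negative int:

import math

totalPage = math.ceil(totalSizes / 10)  # same value as the if/else branch above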
comData/policylaw/flk_buchong.py (new file, mode 100644)
import datetime
import io
import json
import os
import re
import time
import urllib.parse

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from docx import Document
from retry import retry
from win32com.client import Dispatch

from ClassTool import ClassTool
from BaseCore import BaseCore

baseTool = ClassTool()
baseCore = BaseCore()
log = baseCore.getLogger()

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'flk.npc.gov.cn',
    'Referer': 'https://flk.npc.gov.cn/fl.html',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

searchHeaders = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin',
                                 password='ZZsn@9988').ZZSN['国务院_国资委_copy1']
@retry(tries=2, delay=10)
def searchDate(key):
    url = 'https://www.pkulaw.com/law/chl'
    dataPost = f'Menu=law&Keywords={urllib.parse.quote(key)}&PreKeywords={urllib.parse.quote(key)}&SearchKeywordType=Title&MatchType=Exact&RangeType=Piece&Library=chl&ClassFlag=chl&GroupLibraries=&QuerySearchCondition=Title%2BExact%2BPiece%2B0&QueryOnClick=False&AfterSearch=True&RequestFrom=btnSearch&SearchInResult=&PreviousLib=chl&IsSynonymSearch=false&RecordShowType=List&ClassCodeKey=&IsSearchErrorKeyword=&FirstQueryKeywords={urllib.parse.quote(key)}&FirstQueryKeywordType=Title&IsSynonymSearch=false&X-Requested-With=XMLHttpRequest'
    req = requests.post(url, data=dataPost, headers=searchHeaders)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'lxml')
    divList = soup.find_all('div', class_='accompanying-wrap')
    for divTag in divList:
        itemList = divTag.select('> div.item')
        if len(divList) == 1 and len(itemList) == 1:
            itemTag = itemList[0]
            href = 'https://www.pkulaw.com' + itemTag.find('div', class_='col').find('div', class_='t').find('h4').find('a').get('href')
            req_ = requests.get(href, headers=searchHeaders)
            req_.encoding = req_.apparent_encoding
            soup_ = BeautifulSoup(req_.text, 'html.parser')
            liList = soup_.find('div', class_='fields').find('ul').find_all('li')
            publishDate = ''
            expiry = ''
            for liTag in liList:
                if '公布日期' in liTag.text:
                    publishDate = liTag.text.split('公布日期:')[1].strip()
                    publishDate = datetime.datetime.strptime(publishDate, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
                if '施行日期' in liTag.text:
                    expiry = liTag.text.split('施行日期:')[1].strip()
                    expiry = datetime.datetime.strptime(expiry, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
            return publishDate, expiry
        else:
            for itemTag in itemList:
                title = itemTag.find('div', class_='col').find('div', class_='t').find('h4').find('a').text
                href = 'https://www.pkulaw.com' + itemTag.find('div', class_='col').find('div', class_='t').find('h4').find('a').get('href')
                if title == key:
                    req_ = requests.get(href, headers=searchHeaders)
                    req_.encoding = req_.apparent_encoding
                    soup_ = BeautifulSoup(req_.text, 'html.parser')
                    liList = soup_.find('div', class_='fields').find('ul').find_all('li')
                    publishDate = ''
                    expiry = ''
                    for liTag in liList:
                        if '公布日期' in liTag.text:
                            publishDate = liTag.text.split('公布日期:')[1].strip()
                            publishDate = datetime.datetime.strptime(publishDate, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
                        if '施行日期' in liTag.text:
                            expiry = liTag.text.split('施行日期:')[1].strip()
                            expiry = datetime.datetime.strptime(expiry, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
                    return publishDate, expiry
            else:
                return '', ''
    return '', ''  # nothing matched; avoid returning None to callers that unpack two values
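searchDate is only referenced from the commented-out block in getDic below. It queries pkulaw.com for an exact title match and returns (publishDate, expiry) as '%Y-%m-%d %H:%M:%S' strings, or ('', '') when nothing matches. A usage sketch with a hypothetical law title:

publishDate, expiry = searchDate('中华人民共和国公司法')  # hypothetical title; network access assumed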
@retry(tries=2, delay=10)
def getDataJson(url):
    req = requests.get(url, headers=headers)
    req.encoding = req.apparent_encoding
    datasJson = req.json()['result']['data']
    totalSizes = req.json()['result']['totalSizes']
    req.close()
    return datasJson, totalSizes
def getPdf(id_, title, publishDate):
    id_list = []
    url = 'https://flk.npc.gov.cn/api/detail'
    payload = {'id': id_}
    req = requests.post(url, headers=headers, data=payload)
    req.encoding = req.apparent_encoding
    datasJson = req.json()['result']['body']
    req.close()
    href = ''
    for dataJson in datasJson:
        if dataJson['type'] == 'WORD':
            href = 'https://wb.flk.npc.gov.cn' + dataJson['path']
            break
    if not href:
        log.error(f'{title}===failed to get the attachment link')
        return ''
    retData = baseCore.uptoOBS(href, '1699', title)
    if not retData['state']:
        return ''
    att_id, full_path = baseCore.tableUpdate(retData, '国务院文件', title, 0, publishDate)
    id_list.append(att_id)
    return id_list
def is_member_containing_string(string):
    cursor = '0'
    while True:
        # walk the Redis set with the SCAN-family SSCAN command
        cursor, members = baseCore.r.sscan(f'flk_ok', cursor)
        for member in members:
            # check whether the member contains the given substring
            if string in member.decode("utf-8"):
                return True
        if cursor == b'0' or cursor == 0:
            break
    return False


def is_member_containing_string_bucai(string):
    cursor = '0'
    while True:
        # walk the Redis set with the SCAN-family SSCAN command
        cursor, members = baseCore.r.sscan(f'flk_bucai', cursor)
        for member in members:
            # check whether the member contains the given substring
            if string in member.decode("utf-8"):
                return True
        if cursor == b'0' or cursor == 0:
            break
    return False
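Both helpers scan the entire Redis set on every call because membership is tested by substring. A minimal sketch of the same check using redis-py's sscan_iter, which hides the cursor bookkeeping (the connection details are assumptions; the crawler itself goes through baseCore.r). If members were stored as exact URLs, r.sismember('flk_ok', href) would answer in O(1) instead:

import redis

r = redis.Redis(host='localhost', port=6379)  # assumed connection details

def set_contains_substring(set_key: str, needle: str) -> bool:
    # sscan_iter drives SSCAN internally and yields members as bytes
    for member in r.sscan_iter(set_key):
        if needle in member.decode('utf-8'):
            return True
    return False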
def selectMongo(url):
    data = db_storage.find_one({'来源': '国资国企法律法规', '网址': url})
    id_list = data['附件id']
    return id_list
@retry(tries=2, delay=10)
def getReqContent(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    req = requests.get(url, headers=headers)
    content = req.content
    req.close()
    return content
def readDocx(byteStream, flg):
    contentWithTag = BeautifulSoup('', 'html.parser')
    if flg:
        byteStreamIo = io.BytesIO(byteStream)
    else:
        byteStreamIo = byteStream
    doc = Document(byteStreamIo)
    content = ''
    for para in doc.paragraphs:
        content += f'{para.text}\n'
        newTag = contentWithTag.new_tag('p')
        newTag.string = para.text
        contentWithTag.append(newTag)
    return content, str(contentWithTag)
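readDocx accepts either raw bytes (flg=True, which wraps them in io.BytesIO) or an on-disk path (flg=False); python-docx's Document() opens both file-like objects and paths. Both call sites appear in getDic further down; as a sketch (fullPath stands for the attachment URL read from MySQL there):

content, contentWithTag = readDocx(getReqContent(fullPath), True)  # from downloaded bytes
content, contentWithTag = readDocx(r'./tmp/tmp.docx', False)       # from the converted temp file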
def convert_doc_to_docx_stream(url):
    doc_path = r'./tmp/tmp.doc'
    reqContent = getReqContent(url)
    with open(doc_path, 'wb') as f:
        f.write(reqContent)
    # start the Word (WPS) application
    word = Dispatch("kwps.Application")
    word.Visible = False  # run in the background without showing the Word UI
    # open the .doc file
    doc = word.Documents.Open(doc_path)
    # create an in-memory byte stream object
    # docx_stream = io.BytesIO()
    # save the .doc file as .docx
    doc.SaveAs(r'./tmp/tmp.docx', 16)
    # close the document
    doc.Close()
    # quit the Word application
    word.Quit()
    log.info('the .doc file has been converted to .docx')
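Dispatch("kwps.Application") binds to WPS Office over COM, so this conversion only runs on a Windows host with WPS installed (the SaveAs format code 16 is wdFormatDocumentDefault, i.e. .docx). A cross-platform sketch, assuming LibreOffice's soffice binary is on the PATH:

import subprocess

def convert_doc_with_libreoffice(doc_path: str, out_dir: str) -> None:
    # 'soffice --convert-to docx' writes <basename>.docx into out_dir
    subprocess.run(
        ['soffice', '--headless', '--convert-to', 'docx', '--outdir', out_dir, doc_path],
        check=True,
    )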
def clearTmp():
    folder_path = r'./tmp'
    # walk every file and sub-folder in the folder
    for filename in os.listdir(folder_path):
        # build the full file path
        file_path = os.path.join(folder_path, filename)
        # delete the file or sub-folder
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            os.rmdir(file_path)
    log.info('temporary files deleted')
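Note that os.rmdir only removes empty directories, so clearTmp would raise OSError if ./tmp ever contained a non-empty sub-folder. A sketch of a more robust variant, assuming nested content should be deleted as well:

import os
import shutil

def clear_tmp_recursive(folder_path: str = './tmp') -> None:
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)  # unlike os.rmdir, handles non-empty folders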
def getDic(title, office, publishDate, expiry, type, timeliness, href, id_, dateDic):
    # id_list = getPdf(id_, title, publishDate)
    # if not id_list:
    #     log.error(f'{title}===attachment download failed')
    #     return ''
    # If either the publish date or the implementation date is missing, run a search:
    # if not publishDate or not expiry:
    #     searchExpiry, searchPublishDate = searchDate(title)
    #     # if the search found no implementation date and none was collected the first time, fail
    #     if not searchExpiry and not expiry:
    #         log.error(f'{title}===still no implementation date after searching')
    #         return {}
    #     # if the search found no publish date and none was collected the first time, fail
    #     if not searchPublishDate and not publishDate:
    #         log.error(f'{title}===still no publish date after searching')
    #         return {}
    #     # if a publish date was collected
    #     if publishDate:
    #         # fail when the searched publish date disagrees with the collected one
    #         if publishDate != searchPublishDate:
    #             log.error(f'{title}===searched publish date differs from the collected one')
    #             return {}
    #     # otherwise take the searched publish date
    #     else:
    #         publishDate = searchPublishDate
    #     # if an implementation date was collected
    #     if expiry:
    #         # fail when the searched implementation date disagrees with the collected one
    #         if expiry != searchExpiry:
    #             log.error(f'{title}===searched implementation date differs from the collected one')
    #             return {}
    #     # otherwise take the searched implementation date
    #     else:
    #         expiry = searchExpiry
    publishDate = dateDic[title]['publishDate']
    expiry = dateDic[title]['expiry']
    try:
        id_list = selectMongo(href)
    except:
        log.info(f'not collected before')
        return {}
    attachmentId = id_list[0]
    sql = f'select full_path,category from clb_sys_attachment where id="{attachmentId}"'
    baseCore.cursor_.execute(sql)
    info = baseCore.cursor_.fetchone()
    fullPath = info[0]
    category = info[1].strip('.')
    log.info(f'{title}===fetching body text===file type is {category}')
    try:
        if category == 'doc':
            convert_doc_to_docx_stream(fullPath)
            content, contentWithTag = readDocx(r'./tmp/tmp.docx', False)
            clearTmp()
        else:
            byteStream = getReqContent(fullPath)
            content, contentWithTag = readDocx(byteStream, True)
    except Exception as e:
        log.error(f'{title}===file parsing failed==={e}')
        return {}
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    dic_news = {
        'attachmentIds': id_list,  # attachment ids
        'author': '',  # author
        'content': content,  # body text without tags
        'contentWithTag': contentWithTag,  # body text with tags
        'createDate': now,  # creation time
        'deleteFlag': 0,  # delete flag (0 = default, 1 = deleted)
        'id': '',
        'labels': [{'relationId': "1788847783801794562",
                    'relationName': "国资国企法律法规",
                    'labelMark': "policy"}],  # related label id, name, and mark
        'origin': '',  # issuing authority
        'organ': office,  # drafting organ
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document number
        'publishDate': publishDate,  # publication date of the law
        'writtenDate': None,  # date written
        'implementDate': expiry,  # date of entry into force
        'sid': '1788838266435284993',  # information source id
        'sourceAddress': href,  # link to the original text
        'summary': '',  # summary
        'title': title,  # title
        'legalPrecedenceHierarchy': type,  # hierarchy of legal force
        'effectiveness': timeliness,  # validity status
    }
    return dic_news
def doJob():
    dateDic = getDate()
    searchList = ['国有资产', '国资', '国有企业', '企业', '公司']
    for search in searchList:
        search_ = urllib.parse.quote(search)
        url = f'https://flk.npc.gov.cn/api/?type=&fgbt={search_}&searchType=title%3Baccurate%3B1&sortTr=f_bbrq_s%3Bdesc&gbrqStart=&gbrqEnd=&sxrqStart=&sxrqEnd=&page=1&size=10'
        datasJson, totalSizes = getDataJson(url)
        if totalSizes % 10 == 0:
            totalPage = totalSizes / 10
        else:
            totalPage = totalSizes // 10 + 1
        for page in range(1, int(totalPage) + 1):
            if page != 1:
                url = url.replace(f'&page={page - 1}', f'&page={page}')
                datasJson, totalSizes = getDataJson(url)
            for dataJson in datasJson:
                id_ = dataJson['id']
                title = dataJson['title']
                office = dataJson['office']
                publishDate = dataJson['publish']
                expiry = dataJson['expiry']
                type = dataJson['type']
                status = dataJson['status']
                if status == '1':
                    timeliness = '有效'  # in force
                elif status == '5':
                    timeliness = '已修改'  # amended
                elif status == '9':
                    timeliness = '已废止'  # repealed
                elif status == '3':
                    timeliness = '尚未生效'  # not yet in force
                else:
                    timeliness = ''  # unknown status; avoids an unbound variable below
                href = dataJson['url'].replace('./', 'https://flk.npc.gov.cn/')
                if is_member_containing_string_bucai(href):
                    if is_member_containing_string(href):
                        log.info(f'{title}===already re-collected')
                        continue
                    log.info(f'start re-collecting==={title}')
                    dic = getDic(title, office, publishDate, expiry, type, timeliness, href, id_, dateDic)
                    if dic:
                        flag = baseTool.sendKafka(dic)
                        if flag:
                            baseCore.r.sadd('flk_ok', href)
                    else:
                        log.error(f'{title}==={href}===collection failed')
                    time.sleep(2)
def getDate():
    dic = {}
    df = pd.read_excel('./副本YJZX_国资国企法律法规-缺少时间补充v3.xlsx', sheet_name='Sheet1')
    titles = df['标题'].to_list()
    publishDates = df['发布时间'].to_list()
    expiries = df['实施时间'].to_list()
    for i in range(len(titles)):
        title = titles[i]
        publishDate = publishDates[i]
        publishDate = datetime.datetime.strptime(publishDate, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
        expiry = expiries[i]
        expiry = datetime.datetime.strptime(expiry, '%Y.%m.%d').strftime('%Y-%m-%d %H:%M:%S')
        dic[title] = {'publishDate': publishDate, 'expiry': expiry}
    return dic
if __name__ == '__main__':
    doJob()