提交 2e3b5585 作者: LiuLiYuan

flk数据采集 6/25

上级 721c31d7
import datetime
import time
import urllib.parse
import requests
from ClassTool import ClassTool
from BaseCore import BaseCore
# Shared helper objects, created once at import time and used by every
# function below.
# NOTE(review): ClassTool and BaseCore are project-local classes; their
# behavior is not visible in this file.
baseTool = ClassTool()
baseCore = BaseCore()
# Module-wide logger obtained from the BaseCore helper.
log = baseCore.getLogger()
# Browser-like HTTP headers sent with every request to flk.npc.gov.cn
# (both the search-list GET and the detail POST reuse this dict).
# Presumably copied from a real Edge/Chromium session so the API treats the
# crawler like the site's own XHR calls -- TODO confirm which are required.
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Connection': 'keep-alive',
'Host': 'flk.npc.gov.cn',
'Referer': 'https://flk.npc.gov.cn/fl.html',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
def getDataJson(url):
    """Fetch one page of search results from the law-database list API.

    Args:
        url: Fully built search URL, query string included.

    Returns:
        A ``(datasJson, totalSizes)`` tuple: the ``data`` and ``totalSizes``
        entries of the ``result`` object in the JSON response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If ``result``, ``data`` or ``totalSizes`` is missing.
    """
    # The context manager releases the connection even if decoding or a key
    # lookup raises; the timeout keeps a stalled server from hanging the
    # crawl indefinitely.
    with requests.get(url, headers=headers, timeout=30) as req:
        req.encoding = req.apparent_encoding
        # Decode the body once instead of re-parsing it for every field.
        result = req.json()['result']
    return result['data'], result['totalSizes']
def getPdf(id_, title, publishDate):
    """Locate a law's WORD attachment, upload it, and return its attachment id.

    Posts ``id_`` to the detail API, picks the first entry of
    ``result['body']`` whose ``type`` is ``'WORD'``, and hands its download
    link to ``baseCore.uptoOBS`` followed by ``baseCore.tableUpdate``.
    NOTE(review): those two helpers are project-local; presumably they upload
    the file to object storage and record it in an attachment table --
    confirm against BaseCore.

    Args:
        id_: Document id as returned by the search-list API.
        title: Document title, used for logging and passed to the helpers.
        publishDate: Publish date, passed through to ``tableUpdate``.

    Returns:
        A one-element list ``[att_id]`` on success, or ``''`` when no WORD
        attachment is listed or the upload reports a falsy ``state``.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks the expected keys.
    """
    url = 'https://flk.npc.gov.cn/api/detail'
    payload = {'id': id_}
    # The context manager guarantees the connection is released even when
    # JSON decoding fails; the timeout prevents an indefinite hang.
    with requests.post(url, headers=headers, data=payload, timeout=30) as req:
        req.encoding = req.apparent_encoding
        datasJson = req.json()['result']['body']

    # First WORD-type attachment wins; '' signals that none was listed.
    href = next(
        (
            'https://wb.flk.npc.gov.cn' + dataJson['path']
            for dataJson in datasJson
            if dataJson['type'] == 'WORD'
        ),
        '',
    )
    if not href:
        log.error(f'{title}===附件链接获取失败')
        return ''

    retData = baseCore.uptoOBS(href, '1699', title)
    if not retData['state']:
        return ''

    # tableUpdate returns (att_id, full_path); only the id is needed here.
    att_id, _ = baseCore.tableUpdate(retData, '国务院文件', title, 0, publishDate)
    return [att_id]
def getDic(title, office, publishDate, expiry, type, timeliness, href, id_):
    """Assemble the policy record for one law, including its attachment ids.

    The attachment is fetched through ``getPdf`` first; when that yields
    nothing the failure is logged and ``''`` is returned instead of a record.

    Args:
        title: Document title (also stored as the tag-free content).
        office: Issuing / enacting body.
        publishDate: Publication date of the law.
        expiry: Value stored as the implementation date.
        type: Legal hierarchy level (the name shadows the builtin, but it is
            part of the signature).
        timeliness: Effectiveness label for the document.
        href: Link to the original page.
        id_: Document id used to look up the attachment.

    Returns:
        The record dict, or ``''`` when the attachment could not be obtained.
    """
    attachment_ids = getPdf(id_, title, publishDate)
    if not attachment_ids:
        log.error(f'{title}===附件下载失败')
        return ''

    created_at = f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}'
    # Related label: id, display name and marker.
    label = {
        'relationId': "1788847783801794562",
        'relationName': "国资国企法律法规",
        'labelMark': "policy",
    }
    return {
        'attachmentIds': attachment_ids,  # attachment ids
        'author': '',  # author
        'content': title,  # body text without tags
        'contentWithTag': '',  # body text with tags
        'createDate': created_at,  # creation time
        'deleteFlag': 0,  # deletion flag (0 = default, 1 = deleted)
        'id': '',
        'labels': [label],
        'origin': '',  # publishing authority
        'organ': office,  # issuing / enacting authority
        'topicClassification': '',  # policy document category
        'issuedNumber': '',  # document number
        'publishDate': publishDate,  # policy publish time / promulgation date
        'writtenDate': None,  # date written
        'implementDate': expiry,  # implementation date
        'sid': '1788838266435284993',  # information source id
        'sourceAddress': href,  # link to the original text
        'summary': '',  # summary
        'title': title,  # title
        'legalPrecedenceHierarchy': type,  # legal hierarchy level
        'effectiveness': timeliness,  # effectiveness status
    }
def doJob():
    """Crawl the law database for each search keyword and publish new records.

    For every keyword the title search is paged through 10 results at a time.
    Each hit whose link is not already found in ``baseTool.db_storage`` is
    turned into a record by ``getDic``, sent with ``baseTool.sendKafka`` and,
    when that call returns a truthy flag, stored with ``baseTool.save_data``.
    NOTE(review): ``baseTool`` is a project-local helper; the storage and
    Kafka semantics are assumed from the method names -- confirm.
    """
    page_size = 10
    # API status code -> effectiveness label stored on the record.
    status_labels = {
        '1': '有效',
        '5': '已修改',
        '9': '已废止',
        '3': '尚未生效',
    }
    searchList = ['国有资产', '国资', '国有企业', '企业', '公司']
    for search in searchList:
        search_ = urllib.parse.quote(search)
        # Everything except the paging parameters, which are appended per page.
        base_url = (
            f'https://flk.npc.gov.cn/api/?type=&fgbt={search_}'
            '&searchType=title%3Baccurate%3B1&sortTr=f_bbrq_s%3Bdesc'
            '&gbrqStart=&gbrqEnd=&sxrqStart=&sxrqEnd='
        )
        datasJson, totalSizes = getDataJson(f'{base_url}&page=1&size={page_size}')
        # Integer ceiling division: true division ("/") would produce a float
        # and make range() raise TypeError when the total is a multiple of
        # the page size.
        totalPage = -(-int(totalSizes) // page_size)
        for page in range(1, totalPage + 1):
            # Page 1 was already fetched above to learn the total count.
            if page != 1:
                datasJson, totalSizes = getDataJson(
                    f'{base_url}&page={page}&size={page_size}'
                )
            for dataJson in datasJson:
                id_ = dataJson['id']
                title = dataJson['title']
                office = dataJson['office']
                publishDate = dataJson['publish']
                expiry = dataJson['expiry']
                level = dataJson['type']
                # Unknown status codes map to '' instead of leaving the
                # variable unbound or carrying over the previous row's label.
                timeliness = status_labels.get(str(dataJson['status']), '')
                href = dataJson['url'].replace('./', 'https://flk.npc.gov.cn/')
                # Skip documents whose link is already present in storage.
                is_href = baseTool.db_storage.find_one({'网址': href})
                if is_href:
                    log.info(f'{title}===已采集')
                    continue
                dic = getDic(title, office, publishDate, expiry, level, timeliness, href, id_)
                if dic:
                    flag = baseTool.sendKafka(dic)
                    if flag:
                        baseTool.save_data(dic)
                else:
                    log.error(f'{title}==={href}===获取失败')
                # Pause between processed items to avoid hammering the site.
                time.sleep(2)
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    doJob()
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论