flk数据采集 6/25

2e3b5585 · LiuLiYuan · 721c31d7 · 2e3b5585 · 2e3b5585
--- a/comData/policylaw/flk.py
+++ b/comData/policylaw/flk.py
+import datetime
+import time
+import urllib.parse
+import requests
+from ClassTool import ClassTool
+from BaseCore import BaseCore
+# Shared project helpers. In this file baseTool provides the de-duplication
+# lookup (db_storage), sendKafka and save_data; baseCore provides the logger,
+# uptoOBS and tableUpdate.
+baseTool = ClassTool()
+baseCore = BaseCore()
+log = baseCore.getLogger()
+# Headers sent with every request to flk.npc.gov.cn. The values describe a
+# desktop Edge/Chromium 124 browser issuing a same-origin XMLHttpRequest from
+# the site's own fl.html page.
+headers = {
+    'Accept': 'application/json, text/javascript, */*; q=0.01',
+    'Accept-Encoding': 'gzip, deflate, br, zstd',
+    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+    'Connection': 'keep-alive',
+    'Host': 'flk.npc.gov.cn',
+    'Referer': 'https://flk.npc.gov.cn/fl.html',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
+    'X-Requested-With': 'XMLHttpRequest',
+    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+}
+def getDataJson(url):
+    """Fetch one page of search results from the list API.
+
+    Args:
+        url: Complete search URL, query string included.
+
+    Returns:
+        A ``(datasJson, totalSizes)`` tuple read from ``result.data`` and
+        ``result.totalSizes`` of the JSON reply.
+
+    Raises:
+        requests.RequestException: If the HTTP request itself fails.
+        ValueError: If the response body is not valid JSON.
+        KeyError: If the reply lacks ``result``, ``data`` or ``totalSizes``.
+    """
+    # The context manager closes the response even when decoding or a key
+    # lookup raises; the original leaked the connection in that case.
+    with requests.get(url, headers=headers) as req:
+        req.encoding = req.apparent_encoding
+        # Decode the body once instead of re-parsing it for every field.
+        result = req.json()['result']
+    return result['data'], result['totalSizes']
+def getPdf(id_, title, publishDate):
+    """Store the WORD attachment of one record and return its attachment ids.
+
+    Despite the name, the file picked from the detail reply is the first
+    entry whose ``type`` is ``'WORD'``.
+
+    Args:
+        id_: Record id posted to the detail API.
+        title: Record title; used in log messages and passed to the
+            storage helpers.
+        publishDate: Publish date forwarded to ``baseCore.tableUpdate``.
+
+    Returns:
+        A one-element list with the attachment id returned by
+        ``baseCore.tableUpdate``, or ``''`` when no WORD attachment is
+        listed or ``baseCore.uptoOBS`` reports a falsy ``state``.
+
+    Raises:
+        requests.RequestException: If the HTTP request itself fails.
+        ValueError: If the response body is not valid JSON.
+        KeyError: If the reply lacks the expected keys.
+    """
+    url = 'https://flk.npc.gov.cn/api/detail'
+    payload = {'id': id_}
+    # The context manager closes the response even when decoding raises.
+    with requests.post(url, headers=headers, data=payload) as req:
+        req.encoding = req.apparent_encoding
+        datasJson = req.json()['result']['body']
+    # Take the path of the first WORD entry, if any.
+    wordPath = next(
+        (dataJson['path'] for dataJson in datasJson if dataJson['type'] == 'WORD'),
+        None,
+    )
+    if wordPath is None:
+        log.error(f'{title}===附件链接获取失败')
+        return ''
+    href = 'https://wb.flk.npc.gov.cn' + wordPath
+    # NOTE(review): uptoOBS presumably uploads the file to object storage and
+    # reports success through retData['state'] - confirm against BaseCore.
+    retData = baseCore.uptoOBS(href, '1699', title)
+    if not retData['state']:
+        return ''
+    # The second element of the returned pair is not needed here.
+    att_id, _ = baseCore.tableUpdate(retData, '国务院文件', title, 0, publishDate)
+    return [att_id]
+def getDic(title, office, publishDate, expiry, type, timeliness, href, id_):
+    """Assemble the record dictionary for one law.
+
+    The attachment is fetched first through ``getPdf``; no record is built
+    without one.
+
+    Args:
+        title: Record title; also stored as ``content``.
+        office: Enacting body, stored as ``organ``.
+        publishDate: Publish date, stored as ``publishDate``.
+        expiry: Stored as ``implementDate``.
+        type: Stored as ``legalPrecedenceHierarchy``.
+        timeliness: Stored as ``effectiveness``.
+        href: Link to the original page, stored as ``sourceAddress``.
+        id_: Record id handed to ``getPdf``.
+
+    Returns:
+        The record dictionary, or ``''`` when ``getPdf`` yields no
+        attachment ids.
+    """
+    attachment_ids = getPdf(id_, title, publishDate)
+    if not attachment_ids:
+        log.error(f'{title}===附件下载失败')
+        return ''
+    # Timestamp is taken after the attachment step, as creation time.
+    created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    return {
+        # attachment ids
+        'attachmentIds': attachment_ids,
+        # author
+        'author': '',
+        # body text without markup (the title is used here)
+        'content': title,
+        # body text with markup
+        'contentWithTag': '',
+        # creation time
+        'createDate': created_at,
+        # delete flag (0 = default, 1 = deleted)
+        'deleteFlag': 0,
+        'id': '',
+        # label relation id / relation name / label mark
+        'labels': [{'relationId': "1788847783801794562", 'relationName': "国资国企法律法规", 'labelMark': "policy"}],
+        # policy publishing authority
+        'origin': '',
+        # issuing / enacting body
+        'organ': office,
+        # policy document classification
+        'topicClassification': '',
+        # document number
+        'issuedNumber': '',
+        # policy publish date / law promulgation date
+        'publishDate': publishDate,
+        # written date
+        'writtenDate': None,
+        # implementation date
+        'implementDate': expiry,
+        # information source id
+        'sid': '1788838266435284993',
+        # link to the original page
+        'sourceAddress': href,
+        # summary
+        'summary': '',
+        # title
+        'title': title,
+        # legal precedence hierarchy
+        'legalPrecedenceHierarchy': type,
+        # effectiveness status
+        'effectiveness': timeliness,
+    }
+def doJob():
+    """Crawl the list API for every search keyword and store new records.
+
+    For each keyword the result pages are walked in order. Records whose
+    link is already present in ``baseTool.db_storage`` are skipped; the
+    others are built with ``getDic``, sent with ``baseTool.sendKafka`` and,
+    when that reports success, saved with ``baseTool.save_data``.
+    """
+    searchList = ['国有资产', '国资', '国有企业', '企业', '公司']
+    # Status code from the API -> stored effectiveness label.
+    statusLabels = {
+        '1': '有效',
+        '5': '已修改',
+        '9': '已废止',
+        '3': '尚未生效',
+    }
+    pageSize = 10
+    for search in searchList:
+        search_ = urllib.parse.quote(search)
+        urlPrefix = f'https://flk.npc.gov.cn/api/?type=&fgbt={search_}&searchType=title%3Baccurate%3B1&sortTr=f_bbrq_s%3Bdesc&gbrqStart=&gbrqEnd=&sxrqStart=&sxrqEnd='
+        datasJson, totalSizes = getDataJson(f'{urlPrefix}&page=1&size={pageSize}')
+        # Integer ceiling division. The original used "/" for exact
+        # multiples of 10, which gives a float and makes range() raise
+        # TypeError.
+        totalPage = -(-totalSizes // pageSize)
+        for page in range(1, totalPage + 1):
+            # Page 1 was already fetched to learn the total count.
+            if page != 1:
+                datasJson, _ = getDataJson(f'{urlPrefix}&page={page}&size={pageSize}')
+            for dataJson in datasJson:
+                id_ = dataJson['id']
+                title = dataJson['title']
+                office = dataJson['office']
+                publishDate = dataJson['publish']
+                expiry = dataJson['expiry']
+                lawType = dataJson['type']
+                # An unknown status maps to an empty label instead of an
+                # unbound name or the value left over from the previous
+                # record.
+                timeliness = statusLabels.get(dataJson['status'], '')
+                href = dataJson['url'].replace('./', 'https://flk.npc.gov.cn/')
+                is_href = baseTool.db_storage.find_one({'网址': href})
+                if is_href:
+                    log.info(f'{title}===已采集')
+                    continue
+                dic = getDic(title, office, publishDate, expiry, lawType, timeliness, href, id_)
+                if dic:
+                    flag = baseTool.sendKafka(dic)
+                    if flag:
+                        baseTool.save_data(dic)
+                else:
+                    log.error(f'{title}==={href}===获取失败')
+                # Pause between records that triggered detail requests.
+                time.sleep(2)
+# Start the crawl only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    doJob()
--- a/comData/policylaw/flk_buchong.py
+++ b/comData/policylaw/flk_buchong.py