Commit 3fdea62f authored by 薛凌堃

北京市人民政府

Parent 401719e1
import os
import random
import re
import fitz
import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from openpyxl import load_workbook
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.parse import urljoin
import BaseCore
@@ -23,8 +11,14 @@ import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
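# Policy imported from the shared reits module appears to supply the HTTP/upload helpers used below (requestPost, attuributefile).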
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '北京市人民政府'
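# Local helper class, renamed from Policy to Policy1, presumably so it does not shadow the Policy class imported from reits above.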
class Policy1():
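# Fetches a page through a BaseCore proxy and (judging by the name) returns it as a BeautifulSoup document; retried up to 3 times with a 10 s delay.
# A fallback variant getrequest_soup_ is called below when this one keeps failing.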
@retry(tries=3, delay=10)
def getrequest_soup(self, url):
ip = baseCore.get_proxy()
@@ -137,7 +131,7 @@ def getFjContent(url):
def beijing():
if not os.path.exists('./相关政策/北京市人民政府/政策文件'):
os.makedirs('./相关政策/北京市人民政府/政策文件')
policy1 = Policy1()
url = 'https://www.beijing.gov.cn/so/ss/query/s'
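# Search API of beijing.gov.cn; results are fetched page by page via POST (payload_page below carries the page number, presumably built from this base payload).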
payload = {
'siteCode': '1100000088',
@@ -191,24 +185,31 @@ def beijing():
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
for info_ in info_list:
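# Each search hit keeps its fields under 'data': source site, title, label, publish date, detail URL, summary and the formatRows metadata table.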
fjtitle_list = ''
fjhref_list = ''
id_list = []
info = info_['data']
origin = info['siteLabel']['value'].lstrip().strip()
title = info['titleO'].lstrip().strip()
titleLabel = info['titleLabel']['value'].lstrip().strip()
publishDate = info['docDate'].lstrip().strip()
newsUrl = info['url'].lstrip().strip()
# Deduplicate by URL: skip items already recorded in the Redis set
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
summary = info['summary'].lstrip().strip()
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
writtenDate = None
pub_hao = ''
organ = ''
content = ''
topicClassification = ''
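# Detail pages come in two layouts that are parsed differently: '政策解读' (policy interpretation) and '政策文件' (policy document).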
if titleLabel == '政策解读':
try:
newssoup = policy1.getrequest_soup(newsUrl)
except:
newssoup = policy1.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
@@ -223,13 +224,14 @@ def beijing():
except:
pass
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
organ = newssoup.find('div', class_='othermessage').find('p', class_='fl').text.split('来源:')[
1].lstrip().strip()
elif titleLabel == '政策文件':
try:
newssoup = policy1.getrequest_soup(newsUrl)
except:
newssoup = policy1.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
@@ -248,39 +250,66 @@ def beijing():
if '成文日期' in li.text:
writtenDate = li.find('span').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
formatRows = info['formatRows']
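# formatRows is the structured metadata table of the document page: related attachments, issue number, issuing organ and topic classification.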
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
tag_str = ''
value = col['value']
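# value maps attachment URL -> display name; each file is downloaded locally, uploaded to OBS, and the matching link in the page HTML is rewritten to the OBS path.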
for i in range(len(value.keys())):
file_href = list(value.keys())[i]
file_name = list(value.values())[i]
fjcontent = getFjContent(file_href)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
file_name = f'{num}-{publishDate}-{file_name}'
file = f'./相关政策/北京市人民政府/政策文件/{file_name}'
fjtitle_list += file_name + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{file_name}===附件下载成功')
# Upload the attachment to OBS
att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
if att_id:
id_list.append(att_id)
tag = newssoup.find('ul', class_='fujian').find_all('a')[i]
tag['href'] = full_path
tag_str += str(tag) + '<br>'
contentWithTag_str += tag_str
elif '号' in name:
pub_hao = col['value'].lstrip().strip()
elif '发文机构' in name:
organ = col['value'][0].lstrip().strip()
elif '主题分类' in name:
topicClassification = col['value'][0].lstrip().strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if content == '':
continue
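# Record pushed to Kafka; the fixed sid looks like the source/subject id expected by the downstream consumer (assumption).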
dic_info = {
'attachmentIds': id_list,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
'id': '',
'title': title,
'publishDate': publishDate,
'origin': origin,
'sourceAddress': newsUrl,
'writtenDate': writtenDate,
'organ': organ,
'topicClassification': topicClassification,
'issuedNumber': pub_hao,
'summary': summary,
'createDate': time_now,
'sid': '1729041207245328385',
}
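# Only mark the URL as collected in Redis after the Kafka send succeeds; on failure the already-uploaded attachments are deleted again.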
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
time.sleep(random.randint(10, 20))
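# Also keep a flat row for the Excel export written at the end of the run.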
data = [num, title, publishDate, origin, newsUrl, writtenDate, organ, pub_hao, summary, content,
fjtitle_list, fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/北京市人民政府/北京市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
beijing()