Commit 98200599 Author: 薛凌堃

辽宁省人民政府 (Liaoning Provincial People's Government)

Parent 362b085c
import time
@@ -6,11 +6,17 @@ import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
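# BaseCore is a project-internal helper: as used below it provides the logger,
# a Redis client (baseCore.r) and the Kafka producer (baseCore.sendkafka).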
from reits import Policy
policy = Policy()
topic = 'policy'
webname = '辽宁省人民政府'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
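# Only the User-Agent is set; getContent() presumably passes these headers to
# requests.get (the top of that function is collapsed out of the hunk below).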
@@ -22,13 +28,14 @@ def getContent(url):
    soup = BeautifulSoup(req.text, 'html.parser')
    contentWithTag = soup.find('div', class_='zfwj_detail')
    pub_hao = contentWithTag.find('p', class_='wjh').text.lstrip().strip()
    content = contentWithTag.text.lstrip().strip()
    return content, pub_hao
    return contentWithTag, pub_hao
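    # Changed in this commit: getContent now returns the soup tag rather than
    # extracted text, so the caller can keep both the HTML and the plain text.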
def doJob():
    url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c'
    driver = baseCore.buildDriver()
    # driver = baseCore.buildDriver()
    driver = policy.createDriver()
    driver.get(url)
    time.sleep(1)
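    # Drill into the last entry of the 'conFl_con' link list to reach the policy
    # documents; the chained call wraps onto lines collapsed in the hunk below.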
    driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')[-1].find_element(By.TAG_NAME,
@@ -39,24 +46,50 @@ def doJob():
    time.sleep(1)
    div_list = driver.find_elements(By.CLASS_NAME, 'searchMod')
    num = 1
    data_list = []
    for div in div_list:
        title = div.find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip()
        href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
        # Dedupe by link: skip any href already recorded in Redis
        is_member = baseCore.r.sismember('REITs::' + webname, href)
        if is_member:
            continue
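        # The set 'REITs::<webname>' accumulates every href that was pushed
        # successfully, so re-runs only collect articles not seen before.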
        summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip()
        publishDate = div.find_element(By.CLASS_NAME, 'dates').text.split('时间:')[1].replace('年', '-').replace('月', '-').replace('日', '').lstrip().strip()
        content, pub_hao = getContent(href)
        data = [num, title, publishDate, '辽宁省人民政府', href, '', '', pub_hao, summary, content, '', '']
        data_list.append(data)
        log.info(f'{title}===采集成功')
        contentWithTag, pub_hao = getContent(href)
        content = contentWithTag.text.lstrip().strip()
        contentWithTag_str = str(contentWithTag)
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_info = {
            'attachmentIds': [],
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag_str,
            'deleteFlag': 0,
            'id': '',
            'title': title,
            'publishDate': publishDate,
            'origin': '辽宁省人民政府',
            'sourceAddress': href,
            'writtenDate': '',
            'organ': '',
            'topicClassification': '',
            'issuedNumber': pub_hao,
            'summary': summary,
            'createDate': time_now,
            'sid': '1729042213737967618',
        }
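        # 'sid' looks like a fixed source/channel id expected downstream; the
        # record is sent to the 'policy' Kafka topic via baseCore.sendkafka.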
        try:
            baseCore.sendkafka(dic_info, topic)
            baseCore.r.sadd('REITs::' + webname, href)
            log.info(f'采集成功--{title}--{href}')
        except Exception as e:
            log.error(f'采集失败--{title}--{e}')
            continue
        num += 1
    driver.close()
    df = pd.DataFrame(np.array(data_list))
    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
    df.to_excel('./辽宁省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
......
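For reference, a minimal, self-contained sketch of the dedupe-then-send pattern this commit switches to, assuming a plain redis-py client reachable on localhost; send_record is a hypothetical stand-in for the project-internal baseCore.sendkafka:

import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)  # assumed local Redis
SEEN_KEY = 'REITs::辽宁省人民政府'  # same key layout the scraper uses

def send_record(record):
    # Hypothetical stand-in for baseCore.sendkafka(record, topic).
    print('would send:', record.get('sourceAddress'))

def collect(href, record):
    # Skip anything a previous run already pushed.
    if r.sismember(SEEN_KEY, href):
        return False
    send_record(record)
    # Mark as seen only after a successful send, so failed sends retry next run.
    r.sadd(SEEN_KEY, href)
    return True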