Commit db64a87a Author: 薛凌堃

江苏省人民政府

Parent 78a94cdb
 import os
@@ -8,11 +8,18 @@ import requests
 from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
+from reits import Policy
+policy = Policy()
+topic = 'policy'
+webname = '江苏省人民政府'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'Content-Type': 'application/x-www-form-urlencoded',
@@ -35,9 +42,8 @@ def getFjContent(url):
     return req.content
-def getContentA(url, num, publishDate, title):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContentA(url, num, publishDate, title, origin, summary):
+    id_list = []
     soup = getSoup(url)
     organ = soup.find('div', class_='sp_time').text.split('来源:')[1].split('字体')[0].lstrip().strip()
     contentWithTag = soup.find('div', attrs={'id': 'zoom'})
@@ -60,31 +66,56 @@ def getContentA(url, num, publishDate, title):
             fj_href = img.get('src')
             try:
                 fj_href = 'http://www.jiangsu.gov.cn' + fj_href
-                fjhref_list += fj_href + '\n'
                 fj_title = img.get('title').lstrip().strip()
-                fj_title = f'{num}-{publishDate}-{fj_title}'
-                fjtitle_list += fj_title + '\n'
             except:
                 if 'img/png' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
+                    fj_title = f'{title}-{num_}.png'
                 elif 'img/jpg' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
+                    fj_title = f'{title}-{num_}.jpg'
                 num_ += 1
-            fjcontent = getFjContent(fj_href)
-            file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
-            with open(file, 'wb') as f:
-                f.write(fjcontent)
-            log.info(f'{fj_title}===附件下载成功')
+            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            if att_id:
+                id_list.append(att_id)
+                img['href'] = full_path
+            else:
+                pass
         except:
             pass
     content = contentWithTag.text
-    return organ, content, fjtitle_list, fjhref_list
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': None,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': '',
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042894974537730',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return
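
Reviewer note: the new attachment flow above hinges on policy.attuributefile returning a pair (att_id, full_path). The sketch below is only the contract this diff appears to assume for that helper, not the actual reits.Policy code; every name and path shape in it is illustrative.

import hashlib
import requests

# Hypothetical sketch of the assumed Policy.attuributefile contract:
# fetch the attachment, persist it, and hand back an id plus the stored path.
def attuributefile(fj_title, fj_href, num, publishDate):
    # num is accepted to match the call sites above but unused in this sketch
    resp = requests.get(fj_href, timeout=30)
    if resp.status_code != 200:
        return '', ''  # falsy att_id tells the caller to skip this attachment
    att_id = hashlib.md5(resp.content).hexdigest()  # stand-in for a real attachment id
    full_path = f'/attachment/{publishDate}/{att_id}/{fj_title}'  # assumed path shape
    return att_id, full_path

Under that contract, the img['href'] = full_path rewrite makes contentWithTag point at the stored copy instead of the jiangsu.gov.cn original.
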
-def getContentB(url, num, publishDate, title):
-    fjhref_list = ''
-    fjtitle_list = ''
+def getContentB(url, num, publishDate, title, origin, summary):
+    id_list = []
     soup = getSoup(url)
     info = soup.find('table', class_='xxgk_table').text.replace(' ', '')
     organ = info.split('发布机构:')[1].split('发文日期')[0].lstrip().strip()
@@ -110,61 +141,88 @@ def getContentB(url, num, publishDate, title):
             fj_href = img.get('src')
             try:
                 fj_title = img.get('title').lstrip().strip()
-                fj_title = f'{num}-{publishDate}-{fj_title}'
-                fjtitle_list += fj_title + '\n'
                 fj_href = 'http://www.jiangsu.gov.cn' + fj_href
-                fjhref_list += fj_href + '\n'
-                fjcontent = getFjContent(fj_href)
-                file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
-                with open(file, 'wb') as f:
-                    f.write(fjcontent)
-                log.info(f'{fj_title}===附件下载成功')
             except:
                 if 'image/png' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
+                    fj_title = f'{title}-{num_}.png'
                 elif 'image/jpg' in fj_href:
-                    fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
+                    fj_title = f'{title}-{num_}.jpg'
                 num_ += 1
-                fjtitle_list += fj_title + '\n'
+            try:
+                att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
+            except:
+                att_id = ''
+            if att_id:
+                id_list.append(att_id)
+                img['href'] = full_path
+            else:
+                pass
     content = contentWithTag.text.lstrip().strip()
-    return organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list
+    contentWithTag_str = str(contentWithTag)
+    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    dic_info = {
+        'attachmentIds': id_list,
+        'author': '',
+        'content': content,
+        'contentWithTag': contentWithTag_str,
+        'deleteFlag': 0,
+        'id': '',
+        'title': title,
+        'publishDate': publishDate,
+        'origin': origin,
+        'sourceAddress': url,
+        'writtenDate': writtenDate,
+        'organ': organ,
+        'topicClassification': '',
+        'issuedNumber': pub_hao,
+        'summary': summary,
+        'createDate': time_now,
+        'sid': '1729042894974537730',
+    }
+    try:
+        baseCore.sendkafka(dic_info, topic)
+        baseCore.r.sadd('REITs::' + webname, url)
+        log.info(f'采集成功--{title}--{url}')
+    except Exception as e:
+        for att_id in id_list:
+            baseCore.deliteATT(att_id)
+    return
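
getContentA and getContentB now end with an identical publish step, which could be factored into one helper. A condensed sketch using the module's own baseCore, topic, webname, and log (the error log line is an addition for illustration; everything else mirrors the diff):

def publish(dic_info, id_list, url, title):
    # Send downstream first; only mark the URL as seen once the send succeeds,
    # so a failed item is retried on the next run instead of silently skipped.
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, url)
        log.info(f'采集成功--{title}--{url}')
    except Exception as e:
        log.error(f'发送失败--{title}--{url}--{e}')  # assumed: failures are worth logging
        for att_id in id_list:
            baseCore.deliteATT(att_id)  # roll back attachments already uploaded
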
 def doJob():
-    if not os.path.exists('./相关政策/江苏省人民政府/政策文件'):
-        os.makedirs('./相关政策/江苏省人民政府/政策文件')
     pattern = r"\d{4}-\d{2}-\d{2}"
     url = 'http://www.jiangsu.gov.cn/jsearchfront/search.do?websiteid=320000000100000&searchid=12&pg=&p=1&tpl=38&serviceType=&cateid=27&q=REITs&pq=&oq=&eq=&pos=&sortType=0&begin=&end='
-    driver = baseCore.buildDriver()
+    # driver = baseCore.buildDriver()
+    driver = policy.createDriver()
     driver.get(url)
     time.sleep(5)
     div_list = driver.find_elements(By.CLASS_NAME, 'news-result')
     num = 1
-    data_list = []
     for div in div_list:
+        id_list = []
         title = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME, 'a').get_attribute('title').lstrip().strip()
         href = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME, 'a').get_attribute('href')
+        # Dedupe on the article link
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
         type = div.find_element(By.CLASS_NAME, 'biaoqian').text.lstrip().strip()
         summary = div.find_element(By.CLASS_NAME, 'jcse-news-abs-content').text.lstrip().strip()
         dateInfo = div.find_element(By.CLASS_NAME, 'jcse-news-date').text
         publishDate = re.findall(pattern, dateInfo)[0]
         origin = dateInfo.replace(publishDate, '').lstrip().strip()
         if type == '政务公开':
-            organ, content, fjtitle_list, fjhref_list = getContentA(href, num, publishDate, title)
-            writtenDate = ''
-            pub_hao = ''
+            getContentA(href, num, publishDate, title, origin, summary)
         else:
-            organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list = getContentB(href, num, publishDate, title)
-        data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
-                fjhref_list]
-        data_list.append(data)
-        log.info(f'{title}===采集成功')
+            getContentB(href, num, publishDate, title, origin, summary)
         num += 1
         time.sleep(5)
     driver.close()
-    df = pd.DataFrame(np.array(data_list))
-    df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
-    df.to_excel('./江苏省人民政府政策文件.xlsx', index=False)
 if __name__ == '__main__':
......
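
One consequence of the new dedup scheme: the Redis set 'REITs::' + webname is read at the top of the doJob loop but written only after a successful Kafka send, so an item that fails mid-flight is picked up again on the next run. A minimal sketch of the round trip, assuming baseCore.r is a standard redis-py client:

def already_collected(r, href):
    # Read side, checked in the doJob loop before fetching the detail page.
    return r.sismember('REITs::' + webname, href)

def mark_collected(r, href):
    # Write side, called only after baseCore.sendkafka() has succeeded.
    r.sadd('REITs::' + webname, href)
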