浙江省人民政府

07bd3604 · 薛凌堃 · 3d6d75b6 · 07bd3604
--- a/REITs专题数据/policy-zhejiang.py
+++ b/REITs专题数据/policy-zhejiang.py
 import os
@@ -6,7 +6,7 @@ import requests
 from bs4 import BeautifulSoup
 from retry import retry
-from base import BaseCore
+import BaseCore
 baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
@@ -96,13 +96,19 @@ class Policy():
            category = os.path.splitext(file_href)[1]
            if category not in file_name:
                file_name = file_name + category
+            try:
                retData = baseCore.uptoOBS(file_href, '', file_name)
+            except:
+                return '', ''
            if retData['state']:
                pass
            else:
                return '', ''
+            try:
                att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
                return att_id, full_path
+            except:
+                return '', ''
        else:
            return '', ''
@@ -422,6 +428,8 @@ def getContent(url, publishDate, num):
        contentWithTag = soup.find('body > div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(3)')
    if not contentWithTag:
        contentWithTag = soup.find('div',class_='detail-pic')
+    if not contentWithTag:
+        contentWithTag = soup.find('div',class_='mian')
    try:
        contentWithTag.find('video').decompose()
        contentWithTag = None
@@ -439,9 +447,18 @@ def getContent(url, publishDate, num):
            style.decompose()
    except:
        pass
+    if contentWithTag:
+        pass
+    else:
+        log.info(f"内容未解析出来===={url}")
+        return '','',[]
    a_list = contentWithTag.find_all('a')
    for a in a_list:
        href = a.get('href')
+        if href:
+            pass
+        else:
+            continue
        fj_title = a.text.strip().lstrip()
        category = os.path.splitext(href)[1]
        if category not in fj_title:
@@ -464,6 +481,10 @@ def getDatas(page):
            '\r\n', ' ')
        href = soup.find('div', class_='titleWrapper').find('a').get('href')
        href = href.split('url=')[1].split('.html')[0].replace('%3A', ':').replace('%2F', '/') + '.html'
+        # De-duplicate by link: skip hrefs already in the 'REITs::' + webname set
+        is_member = baseCore.r.sismember('REITs::' + webname, href)
+        if is_member:
+            continue
        try:
            info = soup.find('table', class_='fgwj_table_list').text
            organ = info.split('发布机构：')[1].split('成文日期：')[0].lstrip().strip()
@@ -475,6 +496,10 @@ def getDatas(page):
            ' ', '').replace(' ', '').replace('\r\n', '')
        publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
        contentWithTag, content, id_list = getContent(href, publishDate, num)
+        if contentWithTag:
+            pass
+        else:
+            continue
        num += 1
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_info = {
@@ -494,7 +519,7 @@ def getDatas(page):
            'issuedNumber': '',
            'summary': '',
            'createDate': time_now,
-            'sid': '1729041791539326977',
+            'sid': '1730472253306552321',
        }
        try:
            baseCore.sendkafka(dic_info, topic)