Commit 574620c3  Author: LiuLiYuan

REITs topic 11/28

Parent 5d788bc9
import json
import re
import time
import numpy as np
import pandas as pd
import requests
import os
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
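# Script overview (SSE REITs announcement downloader): pages through the SSE
# REITS_BULLETIN query API, collects announcement metadata, downloads each PDF
# into ./公告_2/<code>-<name>/ and writes a summary Excel file.
# BaseCore is an in-house helper not shown here; it is assumed that get_proxy()
# returns a requests-style proxies dict and getLogger() returns a logger.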
# Fetch JSON data
def getJson(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
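# Note: the SSE endpoint returns JSONP, i.e. the JSON body is wrapped in a callback
# such as jsonpCallback42283(...). The regex above strips that wrapper so json.loads()
# can parse the payload; dropping the jsonCallBack parameter from the URL may make the
# endpoint return plain JSON, though that is not relied on here.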
# Get the total number of pages
def getTotal():
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# Get basic metadata for the announcement PDFs
def getDataList(page):
info_list = []
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)['result']
for data in data_json:
name = data['fundExtAbbr']
title = data['title']
pub_time = data['sseDate']
code = data['securityCode']
href = 'http://www.sse.com.cn' + data['url'].replace('\\', '')
info_list.append([title, pub_time, href, name, code])
return info_list
# Download the PDF file as a byte stream
def getContent(href):
ip = baseCore.get_proxy()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
content = req.content
req.close()
return content
def doJob():
data_list = []
total = getTotal()
for page in range(1, total + 1):
info_list = getDataList(page)
for info in info_list:
title = info[0]
pub_time = info[1]
href = info[2]
name = info[3]
code = info[4]
data_list.append([code,name,title,pub_time,href,'上海交易所','http://www.sse.com.cn/reits/announcements/'])
try:
content = getContent(href)
except:
log.error(f'第{page}页==={title}===连接失败')
continue
file = f'./公告_2/{code}-{name}/{title}-{pub_time}.pdf'
# num = 2
# while True:
# flg = os.path.isfile(file)
# if flg:
# print(f'{title}===有重名')
# file = f'./公告/{code}-{name}/{title}-{num}.pdf'
# num += 1
# else:
# break
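# Note: the nested try/except below creates the target directories on demand the
# first time a write fails. An equivalent, simpler approach would be
#   os.makedirs(f'./公告_2/{code}-{name}', exist_ok=True)
# before opening the file; the original control flow is kept unchanged here.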
try:
try:
with open(file, 'wb') as f:
f.write(content)
except:
try:
os.mkdir(f'./公告_2/{code}-{name}')
with open(file, 'wb') as f:
f.write(content)
except:
os.mkdir(f'./公告_2')
os.mkdir(f'./公告_2/{code}-{name}')
with open(file, 'wb') as f:
f.write(content)
log.info(f'{title}===成功')
except:
log.error(f'第{page}页==={title}===保存失败')
df = pd.DataFrame(np.array(data_list))
df.columns = ['公募REITs代码','扩位简称','公告标题','披露日期','公告网址','来源','来源网址']
df.to_excel('./上海交易所信息披露.xlsx',index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
import numpy as np
import pandas as pd
import requests
import os
import json
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/disc/announcement/annList'
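# Script overview (SZSE fund announcement downloader): pages through the SZSE annList
# API per fund code, saves each announcement PDF under ./市场板块/基金公告_2/<code>-<name>/,
# and exports the collected metadata to Excel.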
# Get the list of fund codes
def getCodeList():
code_list = []
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()[0]['data']
for data_ in data_list:
code = re.findall('<u>(.*?)</u>', data_['sys_key'])[0]
code_list.append(code)
return code_list
# Get the total number of pages
def getPageSize(id):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": 1}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['announceCount'])
if total % 50 == 0:
pageSize = int(total / 50)
else:
pageSize = int(total / 50) + 1
return pageSize
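# Note: the if/else above is ceiling division of announceCount by the page size of 50;
# an equivalent one-liner would be pageSize = -(-total // 50).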
# Fetch JSON data
def getDataList(id, page):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": page}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()['data']
return data_list
# Download the PDF file as a byte stream
def getContent(href):
ip = baseCore.get_proxy()
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
content = req.content
return content
def doJob():
if not os.path.exists('./市场板块/基金公告_2'):
os.makedirs('./市场板块/基金公告_2')
info_list = []
code_list = getCodeList()
for code in code_list:
pageSize = getPageSize(code)
for page in range(1, pageSize + 1):
data_list = getDataList(code, page)
for data in data_list:
title = data['title']
name = data['secName'][0]
if not os.path.exists(f'./市场板块/基金公告_2/{code}-{name}'):
os.makedirs(f'./市场板块/基金公告_2/{code}-{name}')
pub_time = data['publishTime']
href = 'http://www.szse.cn/api/disc/info/download?id=' + data['id']
info = [code, name, title, pub_time, href, '深圳交易所', 'http://www.szse.cn/disclosure/index.html']
info_list.append(info)
content = getContent(href)
file = rf'./市场板块/基金公告_2/{code}-{name}/{title}-{pub_time[:10]}.pdf'
if os.path.exists(file):
log.info(f'{title}===已采集')
time.sleep(3)
continue
try:
with open(file, 'wb') as f:
f.write(content)
log.info(f'{title}===成功')
except Exception as e:
log.error(f'第{page}页==={title}===失败')
time.sleep(2)
df = pd.DataFrame(np.array(info_list))
df.columns = ['证券代码', '证券简称', '公告标题', '发布时间', '公告网址', '来源', '来源网址']
df.to_excel('./市场板块/深圳交易所基金公告_2.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pymongo
import requests
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心[
'REITs基金行情-深圳']
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
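# Script overview (SZSE REITs daily quotes collector): walks each fund's history from
# its listing date in 5-day windows via the SZSE 1815_stock_snapshot report and stores
# one document per trading day in the MongoDB collection "REITs基金行情-深圳",
# skipping days that are already present.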
# Get fund codes and listing dates
@retry(tries=3, delay=3)
def getData():
data_list = []
ip = baseCore.get_proxy()
url = 'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_fund_list&PAGENO=1&PAGESIZE=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data']
for data_ in data_json:
jjjcurl = re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0].lstrip().strip()
sys_key = data_['sys_key'].lstrip().strip()
ssrq = data_['ssrq'].lstrip().strip()
# fund short name, fund code, listing date
data = [jjjcurl, sys_key, ssrq]
data_list.append(data)
return data_list
# Get basic fund information
@retry(delay=5)
def getInfoList():
code_list = []
url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()[0]['data']
for data_ in data_list:
# data = {
# '基金代码': re.findall('<u>(.*?)</u>', data_['sys_key'])[0],
# '基金简称': re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0],
# '基金类别': data_['jjlb'],
# '投资类别': data_['tzlb'],
# '上市日期': data_['ssrq'],
# '当前规模(万份)': data_['dqgm'],
# '基金管理人': data_['glrmc'],
# '最新基金净值': data_['cxjqhq'],
# }
data = [re.findall('<u>(.*?)</u>', data_['sys_key'])[0], re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0],
data_['jjlb'], data_['tzlb'], data_['ssrq'], data_['dqgm'], data_['glrmc'], data_['cxjqhq'], ]
name_list = ['基金代码', '基金简称', '基金类别', '投资类别', '上市日期', '当前规模(万份)', '基金管理人', '最新基金净值']
code_list.append(data)
return code_list
# Get fund trading data
@retry(tries=5, delay=20)
def getDataList(code, start_date, end_date):
ip = baseCore.get_proxy()
url = f'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1815_stock_snapshot&TABKEY=tab2&txtDMorJC={code}&txtBeginDate={str(start_date)[:10]}&txtEndDate={str(end_date)[:10]}&archiveDate=2021-11-01'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data'][::-1]
req.close()
for data_ in data_json:
jyrq = data_['jyrq']
zqdm = data_['zqdm']
zqjc = data_['zqjc']
qss = data_['qss']
ks = data_['ks']
zg = data_['zg']
zd = data_['zd']
ss = data_['ss']
sdf = data_['sdf']
cjgs = data_['cjgs']
cjje = data_['cjje']
syl1 = data_['syl1']
is_insert = db_storage.find_one({'code': code, 'date': jyrq, 'exchange': '深圳证券交易所'})
if is_insert:
log.info(f'{code}==={jyrq}===已采集')
continue
dic_info = {
'code': zqdm, # 代码
'shortName': zqjc, # 简称
'opening': float(ks), # 开盘价
'max': float(zg), # 最高价
'min': float(zd), # 最低价
'closed': float(ss), # 收盘价
'beforeClosed': float(qss), # 前收价
'volume': cjgs, # 交易量
'amount': cjje, # 交易金额
'date': jyrq, # 时间
'country': '中国', # 国家
'exchange': '深圳证券交易所' # 交易所
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={jyrq}===采集成功')
time.sleep(1)
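# Note: de-duplication above relies on a Mongo lookup keyed on code + date + exchange,
# so the script can be re-run without inserting duplicate rows.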
def doJob():
data_list = getData()
log.info('开始采集')
for data in data_list:
# getData() returns [short name, code, listing date]
name = data[0]
code = data[1]
log.info(f'{code}==={name}===开始采集')
start_date = data[2]
start_date = datetime.strptime(start_date, "%Y-%m-%d")
current_date = datetime.now()
end_date = start_date + timedelta(days=5)
while end_date != current_date:
time.sleep(1)
try:
getDataList(code, start_date, end_date)
except:
log.error(f'{code}==={start_date}-{end_date}===采集失败')
start_date = end_date + timedelta(days=1)
end_date = start_date + timedelta(days=5)
if end_date > current_date:
end_date = current_date
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
# 'Referer': 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=["rules","csrcrules","szseBussrules","memorandumServicedirect","publicadvice","lawruleSearch"]&range=content&searchtype=0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
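# Script overview (SZSE rule/policy search scraper, Selenium): opens the SZSE law-rules
# search page for the keyword "REITs" with a Selenium driver (baseCore.buildDriver() is
# assumed to return a configured webdriver), downloads each attachment into
# ./相关政策/深圳证券交易所/政策文件/, and exports the results to Excel.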
def paserUrl(html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
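# Note: paserUrl() rewrites relative href/src attributes into absolute URLs with
# urllib.parse.urljoin, so attachment links extracted later can be requested directly.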
def getFjContent(url):
ip = baseCore.get_proxy()
session = requests.session()
session.get('http://www.szse.cn/',headers=headers,proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
content = req.content
session.close()
return content
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
soup = paserUrl(soup, 'http://www.szse.cn/')
contentWithTag = soup.find('div', class_='des-content')
a_list = contentWithTag.find_all('a')
num_ = 1
for a in a_list:
fj_href = a.get('href')
if not fj_href:
continue
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category,f'-{num_}{category}')
num_ += 1
file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
pub_hao = contentWithTag.find('p').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return pub_hao, content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/深圳证券交易所/政策文件'):
os.makedirs('./相关政策/深圳证券交易所/政策文件')
url = 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=%5B%22rules%22,%22csrcrules%22,%22szseBussrules%22,%22memorandumServicedirect%22,%22publicadvice%22,%22lawruleSearch%22%5D&range=content&searchtype=0'
driver = baseCore.buildDriver()
driver.get(url)
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'article-item'))
)
div_list = driver.find_elements(By.CLASS_NAME, 'article-item')
num = 0
data_list = []
for div in div_list:
title = div.find_element(By.TAG_NAME, 'a').text.lstrip().strip()
href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
publishDate = div.find_element(By.CLASS_NAME, 'pull-right').text.lstrip().strip()
writtenDate = publishDate
origin = '深圳证券交易所'
organ = origin
if '.pdf' in href:
content = ''
summary = ''
fjtitle_list = title + '.pdf'
fjhref_list = href
pub_hao = ''
fjcontent = getFjContent(href)
file = f'./相关政策/深圳证券交易所/政策文件/{title}.pdf'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
else:
summary = div.find_element(By.CLASS_NAME, 'item-content').text.lstrip().strip()
pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pymongo
import requests
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心['REITs市场概况-深圳']
db_storage_ = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心['REITs基金列表']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'reits.szse.cn',
'Origin': 'https://newmedia.szse.cn',
'Pragma': 'no-cache',
'Referer': 'https://newmedia.szse.cn/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
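# Script overview (SZSE REITs market overview collector): iterates day by day from
# 2022-01-01, pulls the reits_scgk_oa daily snapshot, stores each record in MongoDB
# ("REITs市场概况-深圳"), and also writes a two-sheet Excel file (daily overview plus
# the current fund list).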
def getData():
data_list = []
ip = baseCore.get_proxy()
url = 'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_fund_list&PAGENO=1&PAGESIZE=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data']
for data_ in data_json:
jjjcurl = re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0].lstrip().strip()
sys_key = data_['sys_key'].lstrip().strip()
dqgm = data_['dqgm'].lstrip().strip()
ltgm = data_['ltgm'].lstrip().strip()
try:
glrmc = re.findall('\'>(.*?)</a>', data_['glrmc'])[0].lstrip().strip()
except:
glrmc = data_['glrmc']
tzlb = data_['tzlb'].lstrip().strip()
jjlb = data_['jjlb'].lstrip().strip()
ssrq = data_['ssrq'].lstrip().strip()[:10]
data = [jjjcurl, sys_key, dqgm, ltgm, glrmc, tzlb, jjlb, ssrq]
into_dict = {
'基金简称':jjjcurl,
'基金代码':sys_key,
'当前规模(万份)':dqgm,
'流通规模(万份)':ltgm,
'基金管理人':glrmc,
'投资类别':tzlb,
'基金类别':jjlb,
'上市日期':ssrq
}
db_storage_.insert_one(into_dict)
time.sleep(1)
data_list.append(data)
df = pd.DataFrame(np.array(data_list))
df.columns = ['基金简称', '基金代码', '当前规模(万份)', '流通规模(万份)', '基金管理人', '投资类别', '基金类别', '上市日期']
return df
@retry(tries=5,delay=10)
def getDataJson(date):
# ip = baseCore.get_proxy()
url = f'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_scgk_oa&txtQueryDate={date}'
# req = requests.get(url, headers=headers, proxies=ip)
req = requests.get(url,headers=headers)
data_json = req.json()[0]['data']
req.close()
return data_json
# 2021-06-21
def doJob():
log.info('=====开始采集=====')
start_time = time.time()
writer = pd.ExcelWriter('市场板块/深圳交易所市场概况.xlsx')
start_date = datetime(2022, 1, 1)
end_date = datetime.today()
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
data_list = []
for date in date_range:
data_json = getDataJson(date)
for data_ in data_json:
data = [data_['lbmc'], data_['zqsl'], data_['zgb'], data_['cjsl'], data_['cjje'], data_['sjzz'],str(date)]
dic = {
'产品数量(只)':data_['zqsl'],
'股份余额(万份)':data_['zgb'],
'日成交份额(万份)':data_['cjsl'],
'日成交金额(万元)':data_['cjje'],
'总市值(亿元)':data_['sjzz'],
'日期':str(date)
}
db_storage.insert_one(dic)
log.info(f'{date}===采集成功')
data_list.append(data)
time.sleep(5)
df_1 = pd.DataFrame(np.array(data_list))
df_1.columns = ['基金品种', '产品数量(只)', '股份余额(万份)', '日成交份额(万份)', '日成交金额(万元)', '总市值(亿元)','日期']
df_1.to_excel(writer, sheet_name='基础设施公募', index=False)
df_2 = getData()
df_2.to_excel(writer, sheet_name='基金列表', index=False)
writer.save()
log.info(f'=====采集结束=====耗时{baseCore.getTimeCost(start_time,time.time())}')
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import random
import re
import numpy as np
import pandas as pd
import pymongo
import requests
import time
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'RETIsProdQuot']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
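# Script overview (SSE REITs daily k-line collector): pulls up to 1000 daily k-line
# bars per fund from yunhq.sse.com.cn, enriches each bar with market-value and turnover
# figures from query.sse.com.cn, and stores one document per trading day in MongoDB
# ("RETIsProdQuot").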
# Get the list of fund codes
@retry(tries=3, delay=10)
def getCode():
code_list = []
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/reits?callback=jQuery112407214866998855156_1699360786929&select=code%2Cname%2Clast%2Cchg_rate%2Cchange%2Cvolume%2Camount%2Cprev_close%2Copen%2Chigh%2Clow%2Camp_rate%2Ccpxxtype%2Ccpxxsubtype%2Ccpxxextendname&order=code%2Case&begin=0&end=25&_={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['list']
for data in data_json:
code_list.append([data[0], data[-1]])
req.close()
return code_list
@retry(tries=4, delay=20)
def getDataJson(code):
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/dayk/{code}?callback=jQuery1124021168281852977966_1699359286492&begin=-1000&end=-1&period=day&_={int(time.time())}'
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['kline']
req.close()
return data_json
@retry(tries=5, delay=10)
def getDataB(code, date_):
# ip = baseCore.get_proxy()
date = str(date_)[:4] + '-' + str(date_)[4:6] + '-' + str(date_)[6:]
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback99984&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&TRADE_DATE={date_}&SEARCH_DAY={date}&FUND_TYPE=01&_={int(time.time())}'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json_ = re.findall(r'\((.*)\)', req.text)[0]
data_json_ = json.loads(data_json_)['result'][0]
totalValue = data_json_['TOTAL_VALUE']
negoValue = data_json_['NEGO_VALUE']
toRate = data_json_['TO_RATE']
req.close()
time.sleep(2)
return totalValue, negoValue, toRate
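# Note: getDataB() returns three supplementary fields for a given trading day:
# TOTAL_VALUE (total market value), NEGO_VALUE (negotiable market value) and
# TO_RATE (turnover rate).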
# Get fund trading data
def getData():
codes_list = getCode()
for codes in codes_list:
start_time = time.time()
code = codes[0]
name = codes[1]
log.info(f'{code}==={name}===开始采集')
try:
data_json = getDataJson(code)
del data_json[-1]
num = 1
for data_ in data_json:
year = str(data_[0])[:4]
month = str(data_[0])[4:6]
day = str(data_[0])[6:]
date = datetime.datetime(int(year), int(month), int(day))
date_ytd = date - datetime.timedelta(days=1)
if num != 1:
while True:
beforeClosed = db_storage.find_one({'code':code,'date':date_ytd,'exchange':'上海证券交易所'})
if beforeClosed:
beforeClosed = beforeClosed['closed']
break
else:
date_ytd = date_ytd - datetime.timedelta(days=1)
num += 1
else:
beforeClosed = 0
num +=1
is_insert = db_storage.find_one({'code': code, 'date': date, 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{code}==={date}===已采集')
time.sleep(1)
continue
try:
totalValue, negoValue, toRate = getDataB(code, data_[0])
except Exception as e:
log.error(e)
log.error(f'{code}==={date}===采集失败')
continue
info_dic = {
'code': code, # 代码
'shortName': name, # 简称
'opening': float(data_[1]), # 开盘价
'max': float(data_[2]), # 最高价
'min': float(data_[3]), # 最低价
'closed': float(data_[4]), # 收盘价
'ytdClosed':float(beforeClosed), # 前收价
'volume': float(data_[5]), # 交易量
'amount': float(data_[6]), # 交易金额
'totalValue': float(totalValue), # 市价总值
'negoValue': float(negoValue), # 流通总值
'toRate': float(toRate), # 换手率
'date': date, # 时间
'country': '中国', # 国家
'exchange': '上海证券交易所' # 交易所
}
db_storage.insert_one(info_dic)
time.sleep(2)
log.info(f'{date}===采集成功')
except Exception as e:
log.error(e)
log.error(f'{code}===采集失败')
time.sleep(5)
log.info(f'{code}==={name}===记录完成===耗时{baseCore.getTimeCost(start_time, time.time())}')
if __name__ == '__main__':
getData()
# getInfo()
baseCore.close()
import json
import re
import time
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from base import BaseCore
baseCore = BaseCore.BaseCore()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
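# Script overview (SSE REITs project-status exporter): lists REITs issuance projects
# (bond_type=4), resolves each project's audit status and basic attributes, and writes
# everything to 上海交易所项目动态.xlsx.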
# Fetch JSON data
def getJson(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
return data_json
# Get the total number of pages
def getTotal():
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# Get the list of fund audit IDs
def getInfoList(page):
info_list = []
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage={page}&_={int(time.time())}'
data_json = getJson(url)
data_json = data_json['result']
for data in data_json:
id = data['BOND_NUM']
type = data['REITS_TYPE']
if type == '0':
info_list.append([id,'首次发售'])
elif type == '1':
info_list.append([id,'扩募发售'])
else:
info_list.append([id,'-'])
return info_list
# Get basic project information
def getData(id,type):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback72929&isPagination=false&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX&_={time.time()}'
data_ = getJson(url)['result'][0]
# Map SSE audit status codes to their labels; unknown codes fall back to '-'
status_map = {'0': '已申报', '1': '已受理', '2': '已反馈', '3': '已接收反馈意见', '4': '通过', '5': '未通过', '8': '终止', '9': '中止', '901': '承销商/管理人超期中止', '10': '已回复交易所意见', '111': '提交注册', '12': '注册生效'}
audit_status = status_map.get(data_['AUDIT_STATUS'], '-')
if data_['BOND_TYPE'] == '4':
bond_type = '基础设施公募REITs'
else:
bond_type = '其它'
# data = {
# '公募REITs名称': data_['AUDIT_NAME'],
# '品种': bond_type,
# '发起人': data_['LIST1'],
# '管理人': data_['PRIORITY_MANAGER'],
# '专项计划名称': data_['PRIORITY_NAME'],
# '专项计划管理人': data_['LIST2'],
# '无异议函文号': data_['REG_APRV_WEN_HAO'],
# '项目状态': audit_status,
# '更新日期': data_['PUBLISH_DATE'],
# '受理日期': data_['ACCEPT_DATE'],
# }
data = [data_['AUDIT_NAME'], bond_type, data_['LIST1'], data_['PRIORITY_MANAGER'],
data_['PRIORITY_NAME'], data_['LIST2'], data_['REG_APRV_WEN_HAO'], audit_status, data_['PUBLISH_DATE'],
data_['ACCEPT_DATE'], type]
return data
def doJob():
data_list = []
total = getTotal()
for page in range(1, total + 1):
info_list = getInfoList(page)
for info in info_list:
id = info[0]
type = info[1]
data = getData(id,type)
data_list.append(data)
# break
df = pd.DataFrame(np.array(data_list))
df.columns = ['公募REITs名称', '品种', '发起人', '管理人', '专项计划名称', '专项计划管理人', '无异议函文号', '项目状态', '更新日期', '受理日期','申报类型']
df.to_excel('./上海交易所项目动态.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
import numpy as np
import pandas as pd
import requests
import os
import json
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json;charset=utf-8',
'Host': 'reits.szse.cn',
'Pragma': 'no-cache',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
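# Script overview (SZSE REITs project-status exporter): pages through the
# reits.szse.cn projectrends API and writes the project list to
# ./市场板块/深圳交易所项目动态.xlsx.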
def getPageSize():
ip = baseCore.get_proxy()
url = 'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex=0&pageSize=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['totalSize'])
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
def getDataJson(page):
ip = baseCore.get_proxy()
url = f'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex={page}&pageSize=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['data']
return data_json
def doJob():
info_list = []
pageSize = getPageSize()
for page in range(pageSize):
data_json = getDataJson(page)
for data_ in data_json:
cmpnm = data_['cmpnm']
specialPlanName = data_['specialPlanName']
issueTargetName = data_['issueTargetName']
primitiveInterestsor = data_['primitiveInterestsor']
acctfm = data_['acctfm']
sprinst = data_['sprinst']
lawfm = data_['lawfm']
biztypsbName = data_['biztypsbName']
prjst = data_['prjst']
updtdt = data_['updtdt']
acptdt = data_['acptdt']
info_list.append([cmpnm,specialPlanName,issueTargetName,primitiveInterestsor,acctfm,sprinst,lawfm,biztypsbName,prjst,updtdt,acptdt])
df = pd.DataFrame(np.array(info_list))
df.columns = ['基金名称','专项计划名称','基础设施项目类型','原始权益人','基金管理人','专项计划管理人','托管人','申报类型','审核状态','更新日期','受理日期']
df.to_excel('./市场板块/深圳交易所项目动态.xlsx',index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import json
import re
import time
import calendar
import pymongo
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsTxnStat']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
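# Script overview (SSE REITs trading statistics collector): getDayData() walks every
# calendar day since 2021-06-21 and stores the daily trading summary in MongoDB
# ("REITsTxnStat"); the weekly and monthly helpers are kept but currently disabled
# in doJob().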
@retry(tries=5, delay=20)
def getJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# 2021-06-26
# Daily overview
def getDayData():
start_date = datetime(2021, 6, 21)
end_date = datetime.today() - timedelta(days=1)
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
for date in date_range:
date_ = date.strftime('%Y-%m-%d')
is_insert = db_storage.find_one({'date': date, 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{date}===已采集')
time.sleep(1)
continue
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback89728&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_DAY_L&TRADE_DATE={date_}&FUND_TYPE=01&_={int(time.time())}'
try:
data_json = getJson(url)['result']
except Exception as e:
log.error(f'{date}===连接失败==={e}')
time.sleep(3)
continue
try:
for data_ in data_json:
dic_info = {
'number': int(data_['LIST_NUM']), # 挂牌数
'volume': float(data_['TRADE_VOL'])*10000, # 成交量
'amount': float(data_['TRADE_AMT'])*10000, # 成交金额
'totalValue': float(data_['TOTAL_VALUE'])*10000, # 市价总额
'negoValue': float(data_['NEGO_VALUE'])*10000, # 流通市值
'toRate': float(data_['TO_RATE']), # 换手率
'date': date,
'country': '中国',
'exchange': '上海证券交易所'
}
db_storage.insert_one(dic_info)
log.info(f'{date}===采集成功')
except Exception as e:
log.error(f'{date}===数据存储失败==={e}')
time.sleep(3)
# Weekly overview
def getWeekData(writer):
data_list = []
start_date = datetime(2021, 6, 21)
end_date = datetime.today()
date_range = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1, 7)]
for date_1 in date_range:
date_2 = (date_1 + timedelta(days=6)).strftime('%Y-%m-%d')
date_1 = date_1.strftime('%Y-%m-%d')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback65413&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_WEEK_L&START_DATE={date_1}&END_DATE={date_2}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], f'{date_1}至{date_2}']
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': f'{date_1}至{date_2}',
'类别': '每周概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date_1}至{date_2}===采集完成')
data_list.append(data)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每周概况', index=False)
# Monthly overview
def getMonthData(writer):
data_list = []
start_date = datetime.strptime('2021-06-01', '%Y-%m-%d')
current_date = datetime.now()
while start_date <= current_date:
year = start_date.year
month = start_date.month
date = start_date.strftime('%Y-%m')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback76435&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_MONTH_L&TRADE_DATE={date}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], date]
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': date,
'类别': '月度概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date}===采集完成')
data_list.append(data)
if month == 12:
start_date = start_date.replace(year=year + 1, month=1)
else:
start_date = start_date.replace(month=month + 1)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每月概况', index=False)
def doJob():
log.info('======开始采集======')
getDayData()
log.info('===每天数据采集完===')
# getWeekData(writer)
# log.info('===每周数据采集完===')
# getMonthData(writer)
# log.info('===每月数据采集完===')
if __name__ == '__main__':
doJob()
import os
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/reits/regulation/rules/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
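# Script overview (SSE REITs policy/rule scraper): crawls the SSE regulation "rules"
# and "guide" pages, downloads attachments into ./相关政策/上海证券交易所/政策文件/,
# and exports the article metadata and body text to Excel.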
def paserUrl(html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
soup = paserUrl(soup, 'http://www.sse.com.cn/')
contentWithTag = soup.find('div', class_='allZoom')
pub_hao = contentWithTag.find('p').text.lstrip().strip()
a_list = contentWithTag.find_all('a')
# Download the attachments
for a in a_list:
fj_href = a.get('href')
fj_title = a.get('title')
category = os.path.splitext(fj_href)[1]
if '.' not in category or '.cn' in category:
continue
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjhref_list += fj_href + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/上海证券交易所/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text
return pub_hao, content,fjtitle_list,fjhref_list
def doJob():
if not os.path.exists('./相关政策/上海证券交易所/政策文件'):
os.makedirs('./相关政策/上海证券交易所/政策文件')
data_list = []
urls = ['http://www.sse.com.cn/reits/regulation/rules/', 'http://www.sse.com.cn/reits/regulation/guide/']
num = 1
for url in urls:
soup = getSoup(url)
soup = paserUrl(soup, 'http://www.sse.com.cn/')
li_list = soup.find('ul', class_='list').find_all('li')
for li in li_list:
title = li.find('a').text.lstrip().strip()
href = li.find('a').get('href')
origin = '上海证券交易所'
publishDate = li.find('i', class_='date').text.lstrip().strip()
writtenDate = publishDate
organ = '上海证券交易所'
summary = ''
pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/上海证券交易所/上海证券交易所政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
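# Script overview (Cushman & Wakefield research-report downloader): pages through the
# research-report listing (the page-count helper assumes 4 items per page), downloads
# each report PDF into ./研究咨询/戴德梁兴/行业视角-研究报告/, and records the metadata
# in an Excel file.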
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize():
url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
soup = getSoup(url)
total = int(re.findall(r'\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
if total % 4 == 0:
pageSize = int(total / 4)
else:
pageSize = int(total / 4) + 1
return pageSize
def getContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def doJob():
if not os.path.exists('./研究咨询/戴德梁兴/行业视角-研究报告'):
os.makedirs('./研究咨询/戴德梁兴/行业视角-研究报告')
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
soup = getSoup(url)
div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
for div in div_list:
fjtitle_list = ''
fjhref_list = ''
name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
href = div.find('a', class_='zhuanyuan_xinxi').get('href')
origin = '戴德梁兴'
try:
content = getContent(href)
except:
log.error(f'第{page}页==={name}===连接失败')
continue
title = name.replace('/',' ').replace('|',' ').replace('?',' ').replace('"','”')
file = f'./研究咨询/戴德梁兴/行业视角-研究报告/{title}.pdf'
num_ = 2
while True:
flg = os.path.isfile(file)
if flg:
log.info(f'{name}===有重名')
title_ = f'{title}-{num_}'
file = f'./研究咨询/戴德梁兴/行业视角-研究报告/{title_}.pdf'
num_ += 1
else:
try:
title = title_
except:
pass
break
try:
with open(file, 'wb') as f:
f.write(content)
log.info(f'{name}===成功')
fjtitle_list += title + '\n'
fjhref_list += href + '\n'
data = [num, name, origin, href, summary, fjtitle_list, fjhref_list]
data_list.append(data)
except:
log.error(f'第{page}页==={name}===保存失败')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '来源', '原文链接', '摘要', '附件名称', '附件连接']
df.to_excel('./研究咨询/戴德梁兴/行业视角-研究报告.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import random
import re
import numpy as np
import pandas as pd
import pymongo
import requests
import time
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
db_storage_1 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsProdOverview']
db_storage_2 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsFinancing']
db_storage_3 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsEquityDist']
db_storage_4 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsShareStruct']
db_storage_5 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsNetWorth']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
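# Script overview (SSE REITs product detail collector): for each listed REIT, pulls the
# product overview, fundraising, dividend distribution, share structure and unit net
# value, and stores them in five MongoDB collections (REITsProdOverview, REITsFinancing,
# REITsEquityDist, REITsShareStruct, REITsNetWorth).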
@retry(tries=4, delay=20)
def getDataJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# Get the list of fund codes
@retry(tries=3, delay=10)
def getCode():
code_list = []
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/reits?callback=jQuery112407214866998855156_1699360786929&select=code%2Cname%2Clast%2Cchg_rate%2Cchange%2Cvolume%2Camount%2Cprev_close%2Copen%2Chigh%2Clow%2Camp_rate%2Ccpxxtype%2Ccpxxsubtype%2Ccpxxextendname&order=code%2Case&begin=0&end=25&_={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['list']
for data in data_json:
code_list.append([data[0], data[-1]])
req.close()
return code_list
# Product overview
def productOverview(code):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback3638&isPagination=false&sqlId=FUND_BASIC_INFO&fundCode={code}&_={int(time.time())}'
try:
data_json = getDataJson(url)['result'][0]
except:
log.error(f'{code}===产品概况连接失败')
time.sleep(1)
return
if db_storage_1.find_one({'code': data_json['fundCode'], 'exchange': '上海证券交易所'}):
log.info(f"{data_json['fundCode']}===产品概况已采集")
time.sleep(1)
return
dic_info = {
'code': data_json['fundCode'], # 代码
'shortName': data_json['fundExpansionAbbr'], # 扩位简称
'office': data_json['lawFirm'], # 律师事务所
'caretaker': data_json['companyName'], # 管理人
'caretakerPhone': data_json['contactMobile'], # 管理人联系方式
'custodian': data_json['trusteeName'], # 托管人
'country': '中国', # 国家
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_1.insert_one(dic_info)
log.info(f"{data_json['fundCode']}===产品概况采集成功")
except:
log.info(f"{data_json['fundCode']}===产品概况保存失败")
time.sleep(1)
# Fundraising details
def financing(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback74728&isPagination=true&sqlId=REITS_FXYKM&fundCode={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===筹资情况连接失败')
time.sleep(1)
return
for data_json in data_jsons:
saleStartDate = datetime.datetime.strptime(data_json['saleStartDate'], '%Y-%m-%d')
saleEndDate = datetime.datetime.strptime(data_json['saleEndDate'], '%Y-%m-%d')
try:
listingDate = datetime.datetime.strptime(data_json['listingDate'], '%Y-%m-%d')
except:
listingDate = ''
if db_storage_2.find_one(
{'code': code, 'saleStartDate': saleStartDate, 'saleEndDate': saleEndDate, 'exchange': '上海证券交易所'}):
log.info(f"{code}===筹资情况已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'price': data_json['salePrice'], # 发售价格
'saleStartDate': saleStartDate, # 发售起始日期
'saleEndDate': saleEndDate, # 发售终止日期
'saleCopies': data_json['saleCopies'], # 发售总份数(亿份)
'listingDate': listingDate, # 上市日期
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_2.insert_one(dic_info)
log.info(f'{code}===筹资情况采集成功')
except:
log.error(f'{code}===筹资情况保存失败')
time.sleep(1)
# Equity distribution (dividends)
def equityDistribution(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback10108&isPagination=true&sqlId=REITS_FH&fundCode={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===权益分配连接失败')
time.sleep(1)
return
for data_json in data_jsons:
rightsRegistDate = datetime.datetime.strptime(data_json['rightsRegistDate'], '%Y-%m-%d')
exrightDate = datetime.datetime.strptime(data_json['exrightDate'], '%Y-%m-%d')
if db_storage_3.find_one(
{'code': code, 'rightsRegistDate': rightsRegistDate, 'exrightDate': exrightDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={rightsRegistDate}===权益分配已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'year': data_json['year'], # 年份
'fundDividends': data_json['fundDividends'], # 红利(元)
'rightsRegistDate': rightsRegistDate, # 权益登记日
'exrightDate': exrightDate, # 除权基准日
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_3.insert_one(dic_info)
log.info(f'{code}==={rightsRegistDate}===权益分配采集成功')
except:
log.error(f'{code}==={rightsRegistDate}===权益分配保存失败')
time.sleep(1)
# Share structure
def shareStructure(code, name):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback66502&isPagination=true&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&FUND_TYPE=01&_={int(time.time())}'
try:
total = getDataJson(url)['pageHelp']['pageCount']
except:
log.error(f'{code}===份额结构总数获取失败')
time.sleep(1)
return
for page in range(1, total + 1):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback66502&isPagination=true&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&FUND_TYPE=01&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===份额结构第{page}页连接失败')
time.sleep(1)
continue
for data_json in data_jsons:
tradeDate = datetime.datetime.strptime(str(data_json['TRADE_DATE']), '%Y%m%d')
if db_storage_4.find_one(
{'code': code, 'tradeDate': tradeDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={tradeDate}===份额结构已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'limitVol': data_json['LIMIT_VOL'], # 场内限售份额(万份)
'unlimitVol': data_json['UNLIMIT_VOL'], # 场内非限售份额(万份)
'totalVol': data_json['TOTAL_VOL'], # 场内总份额(万份)
'tradeDate': tradeDate, # 最新份额日期
'sellVol': data_json['SELL_VOL'], # 总份额(万份)
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_4.insert_one(dic_info)
log.info(f'{code}==={tradeDate}===份额结构采集成功')
except:
log.error(f'{code}==={tradeDate}===份额结构保存失败')
time.sleep(1)
# Net asset value
def netWorth(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback66035&isPagination=true&sqlId=REITS_JZ&fundCode={code}&order=appraiseDate%7Cdesc&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===净值连接失败')
time.sleep(1)
return
for data_json in data_jsons:
if not data_json['fundUnitnetWorth']:
continue
if float(data_json['fundUnitnetWorth']) == 0:
continue
appraiseDate = datetime.datetime.strptime(data_json['appraiseDate'], '%Y-%m-%d')
if db_storage_5.find_one(
{'code': code, 'appraiseDate': appraiseDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={appraiseDate}===净值已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'appraiseDate': appraiseDate, # 估值日期
'fundUnitnetWorth': data_json['fundUnitnetWorth'], # REITs单位净值(元)
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_5.insert_one(dic_info)
log.info(f'{code}==={appraiseDate}===净值采集成功')
except:
log.error(f'{code}==={appraiseDate}===净值保存失败')
time.sleep(1)
def getInfo():
codes_list = getCode()
for codes in codes_list:
code = codes[0]
name = codes[1]
log.info(f'{code}==={name}===开始采集')
productOverview(code)
financing(code, name)
equityDistribution(code, name)
shareStructure(code, name)
netWorth(code, name)
log.info(f'{code}==={name}===采集结束')
time.sleep(5)
if __name__ == '__main__':
getInfo()
import os
import random
import re
import fitz
import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from openpyxl import load_workbook
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.parse import urljoin
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
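# Script overview (government policy scrapers): Policy is a small helper class bundling
# request/parse utilities (proxy GET/POST, BeautifulSoup helpers, relative-to-absolute
# URL rewriting); beijing() uses it to crawl REITs policy documents from the Beijing
# municipal government search API and save attachments plus an Excel summary.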
class Policy():
@retry(tries=3, delay=10)
def getrequest_soup(self, url):
ip = baseCore.get_proxy()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(url, headers=headers, proxies=ip)
if req.status_code != 200:
raise requests.HTTPError(f'unexpected status {req.status_code}')
req.encoding = req.apparent_encoding
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
@retry(tries=3, delay=10)
def getrequest_soup_(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(url, headers=headers)
if req.status_code != 200:
raise requests.HTTPError(f'unexpected status {req.status_code}')
req.encoding = req.apparent_encoding
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
def getrequest_json(self, headers, url):
ip = baseCore.get_proxy()
req = requests.get(headers=headers, url=url, proxies=ip)
result = req.json()
req.close()
return result
def requestPost(self, headers, url, payload):
# ip = baseCore.get_proxy()
req = requests.post(headers=headers, url=url, data=payload)
data_json = req.json()
req.close()
return data_json
def requestPost_html(self, headers, url, payload):
ip = baseCore.get_proxy()
req = requests.post(headers=headers, url=url, data=payload, proxies=ip)
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
# Find tags with the specified attribute value and remove them
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]:
tag.decompose()
def deletespan(self, td):
spans = td.find_all('span')
for span in spans:
span.extract() # 删除span标签
def deletetag(self, td, tag):
tags = td.find_all(tag)
for tag_ in tags:
tag_.extract() # 删除指定标签
def deletetext(self, soup, tag, text): # 删除带有特定内容的标签
tags = soup.find_all(tag)[:10]
for tag_ in tags:
text_ = tag_.text
if text in text_:
tag_.extract()
def deletek(self, soup):
# Remove empty tags (e.g. <p></p>, <p><br></p>); img, video and hr are kept
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video",
"br"] and tag.name != "br" or tag.get_text() == ' '):
for j in i.descendants:
if j.name in ["img", "video", "br"]:
break
else:
i.decompose()
def paserUrl(self, html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getFjContent(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
content = req.content
req.close()
time.sleep(5)
return content
# Beijing Municipal People's Government https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
if not os.path.exists('./相关政策/北京市人民政府/政策文件'):
os.makedirs('./相关政策/北京市人民政府/政策文件')
policy = Policy()
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
data_list = []
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size) + 1
num = 1
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
for info_ in info_list:
fjtitle_list = ''
fjhref_list = ''
info = info_['data']
origin = info['siteLabel']['value'].lstrip().strip()
title = info['titleO'].lstrip().strip()
titleLabel = info['titleLabel']['value'].lstrip().strip()
publishDate = info['docDate'].lstrip().strip()
newsUrl = info['url'].lstrip().strip()
summary = info['summary'].lstrip().strip()
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
writtenDate = ''
pub_hao = ''
organ = ''
if titleLabel == '政策解读':
try:
newssoup = policy.getrequest_soup(newsUrl)
except:
newssoup = policy.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
organ = newssoup.find('div', class_='othermessage').find('p', class_='fl').text.split('来源:')[
1].lstrip().strip()
elif titleLabel == '政策文件':
try:
newssoup = policy.getrequest_soup(newsUrl)
except:
newssoup = policy.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
li_list = newssoup.find('ol', class_='doc-info').find_all('li')
for li in li_list:
if '成文日期' in li.text:
writtenDate = li.find('span').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
formatRows = info['formatRows']
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
for file_href, file_name in value.items():
fjcontent = getFjContent(file_href)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
file_name = f'{num}-{publishDate}-{file_name}'
file = f'./相关政策/北京市人民政府/政策文件/{file_name}'
fjtitle_list += file_name + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{file_name}===附件下载成功')
elif '号' in name:
pub_hao = col['value'].lstrip().strip()
elif '发文机构' in name:
organ = col['value'][0].lstrip().strip()
time.sleep(random.randint(10, 20))
data = [num, title, publishDate, origin, newsUrl, writtenDate, organ, pub_hao, summary, content,
fjtitle_list, fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/北京市人民政府/北京市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
beijing()
baseCore.close()
import json
import time
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
headers_ = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
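# getTotal(): POST the fixed REITs filter to www.cq.gov.cn/irs/front/list and read
# the total page count from the pager block of the response.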
def getTotal():
url = 'http://www.cq.gov.cn/irs/front/list'
data_post = {"customFilter": {"operator": "and", "properties": [], "filters": [{"operator": "or", "properties": [
{"property": "f_202121500898", "operator": "eq", "value": "REITs"},
{"property": "f_202142777829", "operator": "eq", "value": "REITs"}], "filters": []}, {"operator": "or",
"properties": [{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-17 11:21:40"},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-17 11:21:40"}],
"filters": [
{"operator": "and",
"properties": [{
"property": "f_202146838317",
"operator": "eq",
"value": None},
{
"property": "f_202146235090",
"operator": "eq",
"value": None}]}]}]},
"sorts": [{"sortField": "save_time", "sortOrder": "DESC"}], "tableName": "t_1775cd018c6",
"tenantId": "7", "pageSize": 10, "pageNo": 1}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, data=data_post, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['pager']['pageCount'])
return total
def getDataJson(page):
url = 'http://www.cq.gov.cn/irs/front/list'
data_post = {"customFilter": {"operator": "and", "properties": [], "filters": [{"operator": "or", "properties": [
{"property": "f_202121500898", "operator": "eq", "value": "REITs"},
{"property": "f_202142777829", "operator": "eq", "value": "REITs"}], "filters": []}, {"operator": "or",
"properties": [{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-17 11:21:40"},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-17 11:21:40"}],
"filters": [
{"operator": "and",
"properties": [{
"property": "f_202146838317",
"operator": "eq",
"value": None},
{
"property": "f_202146235090",
"operator": "eq",
"value": None}]}]}]},
"sorts": [{"sortField": "save_time", "sortOrder": "DESC"}], "tableName": "t_1775cd018c6",
"tenantId": "7", "pageSize": 10, "pageNo": page}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, data=data_post, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['list']
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
time.sleep(3)
return soup
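# getContent_(): detail pages rendered from a policyId have no static body, so the
# text is fetched from the getPolicyDetail.html endpoint instead.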
def getContent_(id):
url = 'http://www.cq.gov.cn/govserver/tors/getPolicyDetail.html'
data_post = {
'policyId': f'{id}'
}
ip = baseCore.get_proxy()
req = requests.post(url,headers=headers_,data=data_post,proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['DETAIL']['ZCYW']
def getContent(url):
if 'policyId' in url:
id = url.split('policyId=')[1]
contentWithTag = getContent_(id)
contentWithTag = BeautifulSoup(contentWithTag,'lxml')
else:
soup = getSoup(url)
contentWithTag = soup.find('div', class_='view')
if not contentWithTag:
contentWithTag = soup.find('div',class_='document')
contentWithTag.find('div',class_='item').decompose()
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content
def getData(data_, num):
title = data_['f_202121500898']
publishDate = data_['save_time']
origin = data_['f_2021325755960']
href = data_['doc_pub_url']
try:
writtenDate = data_['f_202121607647']
except:
writtenDate = ''
try:
organ = data_['f_202121437464']
except:
organ = ''
try:
pub_hao = data_['f_202121837479']
except:
pub_hao = ''
summary = data_['f_202142777829']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content = getContent(href)
fjtitle_list = ''
fjhref_list = ''
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/重庆市人民政府/政策文件'):
os.makedirs('./相关政策/重庆市人民政府/政策文件')
total = getTotal()
num = 1
data_list = []
for page in range(1, total + 1):
data_json = getDataJson(page)
for data_ in data_json:
data = getData(data_, num)
num += 1
time.sleep(3)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/重庆市人民政府/重庆市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import os
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': 'https://www.fujian.gov.cn/ssp/main/index.html?key=REITs&siteId=ff808081624641aa0162476c0e0e0055&isMain='
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson(data_post):
url = f'https://www.fujian.gov.cn/ssp/search/api/search?time={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['datas']
return data_json
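# getContent(): parse the div.TRS_Editor body of a detail page and download every
# linked attachment, resolving relative links against the article URL.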
def getContent(num, url, publishDate):
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='TRS_Editor')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href').replace('./', url_)
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/福建省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/福建省人民政府/政策文件'):
os.makedirs('./相关政策/福建省人民政府/政策文件')
data_posts = [{
'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '1', 'jiGuanList': '',
'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
'siteId': 'ff808081624641aa0162476c0e0e0055', 'depSiteId': 'ff808081624641aa0162476c0e0e0055', 'type': '0',
'page': '1', 'rows': '10', 'historyId': '8a289fe18ba97b6a018bd6aee981642d', 'sourceType': 'SSP_DOCUMENT_ZC',
'isChange': '0', 'fullKey': 'N', 'wbServiceType': '13', 'fileType': '', 'feaTypeName': '', 'fileNo': '',
'pubOrg': '', 'zfgbPubOrg': '', 'themeType': '', 'searchTime': '', 'startDate': '', 'endDate': '',
'sortFiled': 'RELEVANCE', 'searchFiled': '', 'dirUseLevel': '', 'issueYear': '', 'publishYear': '',
'issueMonth': '', 'allKey': '', 'fullWord': '', 'oneKey': '', 'notKey': '', 'totalIssue': '', 'chnlName': '',
'zfgbTitle': '', 'zfgbContent': '', 'bsDeptId': '', 'siteName': '', 'keyWord': 'REITs', 'isProvince': '',
}, {
'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '', 'jiGuanList': '',
'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
'siteId': 'ff808081624641aa0162476c0e0e0055', 'depSiteId': 'ff808081624641aa0162476c0e0e0055', 'type': '0',
'page': '1', 'rows': '10', 'historyId': '8a28289e8ba97b6b018bd6cee1c26aa7', 'sourceType': 'SSP_JDHY',
'isChange': '0', 'fullKey': 'N', 'wbServiceType': '13', 'fileType': '', 'feaTypeName': '', 'fileNo': '',
'pubOrg': '', 'zfgbPubOrg': '', 'themeType': '', 'searchTime': '', 'startDate': '', 'endDate': '',
'sortFiled': 'RELEVANCE', 'searchFiled': '', 'dirUseLevel': '', 'issueYear': '', 'publishYear': '',
'issueMonth': '', 'allKey': '', 'fullWord': '', 'oneKey': '', 'notKey': '', 'totalIssue': '', 'chnlName': '',
'zfgbTitle': '', 'zfgbContent': '', 'bsDeptId': '', 'siteName': '', 'keyWord': 'REITs', 'isProvince': '',
}]
data_list = []
num = 1
for data_post in data_posts:
data_json = getDataJson(data_post)
for data_ in data_json:
title = data_['_doctitle']
publishDate = data_['crtime'].replace('.','-')
origin = data_['docsourcename']
href = data_['docpuburl']
try:
writtenDate = data_['pubdate'].replace('.','-')
except:
writtenDate = ''
try:
organ = data_['puborg']
except:
organ = ''
try:
pub_hao = data_['fileno']
except:
pub_hao = ''
summary = data_['doccontent']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content, fjtitle_list, fjhref_list = getContent(num, href, publishDate[:10])
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/福建省人民政府/福建省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import time
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-XSRF-TOKEN': 'eyJpdiI6InhWUlhvRWpuUUp4ejFsQ0VVb29CaFE9PSIsInZhbHVlIjoiOUp5dHJ2SVVoNWl0K0s3UVlaZGZcL3p0a0gxc09sclRVU2JZTjg3dVUyTER4WVE4Qm1Ta2dyWUJndENmMURYVmwiLCJtYWMiOiJjNGU5YTU1MTJmZmZmZjdhZjRkNDE0NmM4Y2I3OTNkMmExYmJjZGRmYTk5MGMyMmQyM2FhYjVjMjRhZTY0NjA2In0=',
}
@retry(tries=5, delay=5)
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
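# getPageSize(): the search API returns 12 hits per page, so the page count is
# derived from the reported total.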
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.gd.gov.cn/api/search/file'
data_post = {"page": "1", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2", "range": "site",
"recommand": 1, "gdbsDivision": "440000", "service_area": 1}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['total'])
if total % 12 == 0:
pageSize = int(total / 12)
else:
pageSize = int(total / 12) + 1
return pageSize
def getDataJson(url, data_post):
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
try:
data_json = req.json()['data']['list']
except:
data_json = req.json()['data']['news']['list']
return data_json
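# getContent(): the detail pages use several templates, so a chain of selectors is
# tried; inline images and non-HTML links are saved as attachments, with a numeric
# suffix appended when a file name already exists.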
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
time.sleep(2)
try:
try:
contentWithTag = soup.select('body > div.con > div.viewList > div.zw')[0]
except:
contentWithTag = soup.select('body > div.con > div:nth-of-type(3) > div.content > div.viewList > div.zw')[0]
except:
contentWithTag = soup.find('div', class_='article-content').find('center')
if not contentWithTag:
contentWithTag = soup.find('div', class_='article-content')
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = img.get('src')
if "http" not in fj_href and '//www' in fj_href:
fj_href = 'http:' + fj_href
fjhref_list += fj_href + '\n'
fj_title = img.get('alt')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
file = file.replace(category, f'-{num_}{category}')
num_ += 1
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
file = file.replace(category, f'-{num_}{category}')
num_ += 1
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return content, fjtitle_list, fjhref_list
def ST(txt):
txt = BeautifulSoup(txt, 'lxml').text
return txt
def getData(data_, num):
title = ST(data_['title'])
log.info(f'{title}===开始采集')
publishDate = data_['pub_time']
origin = data_['publisher_src']
href = data_['url']
log.info(href)
writtenDate = data_['date']
if writtenDate:
writtenDate = datetime.datetime.fromtimestamp(writtenDate).strftime('%Y-%m-%d')
organ = data_['source']
pub_hao = data_['document_number']
summary = ST(data_['content'])
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob_1():
if not os.path.exists('./相关政策/广东省人民政府/政策文件'):
os.makedirs('./相关政策/广东省人民政府/政策文件')
pageSize = getPageSize()
data_list = []
num = 1
url = 'https://search.gd.gov.cn/api/search/file'
for page in range(1, pageSize + 1):
data_post = {"page": f"{page}", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
"range": "site",
"recommand": 1, "gdbsDivision": "440000", "service_area": 1}
data_post = json.dumps(data_post)
data_json = getDataJson(url, data_post)
for data_ in data_json:
data = getData(data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
return data_list, num
def doJob_2(num):
url = 'https://search.gd.gov.cn/api/search/all'
types = ['政策解读', '计划规划']
data_list = []
for type in types:
data_post = {"label": f"{type}", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
"range": "site", "page": 1, "tag_name": f"{type}", "recommand": 1, "gdbsDivision": "440000",
"service_area": 1}
data_post = json.dumps(data_post)
data_json = getDataJson(url, data_post)
for data_ in data_json:
data = getData(data_, num)
time.sleep(1)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
return data_list
def doJob():
data_list = []
data_list_, num = doJob_1()
data_list += data_list_
data_list_ = doJob_2(num)
data_list += data_list_
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/广东省人民政府/广东省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
# doJob_1()
# doJob_2(2)
# url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
# soup = getSoup(url)
#
# print(contentWithTag)
baseCore.close()
import json
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/json',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getTotal():
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return int(req.json()['data']['pager']['pageCount'])
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['middle']['listAndBox']
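# getContent(): the body sits in div#articleFile; inline images and the links in
# div.downloadfile are downloaded as attachments.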
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
soup = getSoup(url)
contentWithTag = soup.find('div', attrs={'id': 'articleFile'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = img.get('src')
fjhref_list += fj_href + '\n'
if 'http' not in fj_href:
fj_href = url_ + fj_href
fj_title = img.get('title')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广西壮族自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
a_list = soup.find('div', class_='downloadfile').find_all('a')
for a in a_list:
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = url_ + fj_href
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广西壮族自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjhref_list = fjhref_list.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
return content,fjtitle_list,fjhref_list
def getData(data_, num):
title = data_['data']['title']
publishDate = data_['data']['time']
origin = '广西壮族自治区人民政府'
href = data_['data']['url']
writtenDate = data_['data']['table-10']
organ = data_['data']['source']
pub_hao = data_['data']['table-5']
summary = data_['data']['table-7']
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/广西壮族自治区人民政府/政策文件'):
os.makedirs('./相关政策/广西壮族自治区人民政府/政策文件')
data_list = []
num = 1
total = getTotal()
for page in range(1, total + 1):
data_json = getDataJson(page)
title_list = []
for data_ in data_json:
title = data_['data']['title']
if title not in title_list:
title_list.append(title)
data = getData(data_, num)
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(2)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/广西壮族自治区人民政府/广西壮族自治区人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'sousuo.www.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
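# getSoup(): www.gov.cn expects the cookies set by the home page, so a session first
# requests the site root and then the target URL.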
def getSoup(url):
ip = baseCore.get_proxy()
URL = 'https://www.gov.cn/'
session = requests.session()
session.get(URL,headers=headers,proxies=ip)
# req = requests.get(url, headers=headers, proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
# req.close()
session.close()
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
URL = 'https://www.gov.cn/'
session = requests.session()
session.get(URL,headers=headers,proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
content = req.content
session.close()
return content
def getPageSize(types):
total = 0
ip = baseCore.get_proxy()
url = 'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p=1&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
for type in types:
num = int(req.json()['searchVO']['catMap'][f'{type}']['totalCount'])
total += num
print(total)
if total % 20 == 0:
pageSize = int(total / 20)
else:
pageSize = int(total / 20) + 1
req.close()
return pageSize
def getDataJson(page, types):
data_list = []
ip = baseCore.get_proxy()
url = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p={page}&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
for type in types:
data_list_ = req.json()['searchVO']['catMap'][f'{type}']['listVO']
data_list += data_list_
req.close()
return data_list
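# getContent(): the body is in div.TRS_UEDITOR (or div.pages_content as a fallback);
# non-HTML links are downloaded as attachments and the issuing organ is backfilled
# from the 来源 span when the search result did not provide it.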
def getContent(url, publishDate, num, organ):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
if organ == '':
try:
organ = soup.find('div', class_='pages-date').find('span', class_='font').text.split('来源:')[1].lstrip().strip()
except:
organ = ''
contentWithTag = soup.find('div', class_='TRS_UEDITOR')
if not contentWithTag:
contentWithTag = soup.find('div',class_='pages_content')
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if '.htm' not in fj_href and '.html' not in fj_href and '.shtml' not in fj_href and '.shtm' not in fj_href:
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/国务院/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list, organ
def getData(data_, num):
title = data_['title'].replace('\n', '').replace('\r', '')
title = BeautifulSoup(title,'lxml').text
publishDate = data_['pubtimeStr'].replace('.', '-')
origin = '国务院'
href = data_['url']
writtenDate = data_['ptime']
try:
organ = data_['puborg']
except:
organ = ''
pub_hao = data_['pcode']
summary = data_['summary']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content, fjtitle_list, fjhref_list, organ = getContent(href, publishDate, num, organ)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/国务院/政策文件'):
os.makedirs('./相关政策/国务院/政策文件')
data_list = []
href_list = []
num = 1
types = ['bumenfile', 'gongwen', 'otherfile', 'gongbao']
pageSize = 7
for page in range(1, pageSize + 1):
data_json = getDataJson(page, types)
for data_ in data_json:
href = data_['url']
if href not in href_list:
data = getData(data_, num)
num += 1
data_list.append(data)
href_list.append(href)
log.info(f'{data[1]}===采集成功')
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/国务院/国务院政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
if req.url == 'https://www.hainan.gov.cn/hainan/xhtml/404.html':
return ''
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize(type):
url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
soup = getSoup(url)
total = int(soup.find('div', class_='results-list').find('span').text.lstrip().strip())
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
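# getContent(): extract 成文日期 from the zwgk_comr1 block and the body from div#font;
# empty strings are returned when the detail page redirects to the 404 page.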
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
if soup == '':
return '','','',''
try:
writtenDate = soup.find('div', class_='zwgk_comr1').text.replace(' ', '').split('成文日期:')[1].split('标题')[
0].lstrip().strip()
except:
writtenDate = ''
contentWithTag = soup.find('div', attrs={'id':'font'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
content = contentWithTag.text.lstrip().strip()
except:
content = ''
log.error(f'{url}===正文获取失败')
return writtenDate, content, fjtitle_list, fjhref_list
def getData(div, num):
title = div.find('a', class_='titlec').get('title').replace('\n', '').replace('\r', '').lstrip().strip()
href = div.find('a', class_='titlec').get('href')
publishDate = div.find('span', class_='quily-con').text.lstrip().strip()
origin = div.find('a', class_='address-con').text.lstrip().strip()
try:
table = div.find('div', class_='search-results').find('table').text
organ = table.split('发文机关:')[1].split('文号:')[0].lstrip().strip()
pub_hao = table.split('文号:')[1].lstrip().strip()
except:
organ = ''
pub_hao = ''
try:
summary = div.find('p', class_='p-text-color').text.lstrip().strip()
except:
summary = ''
writtenDate, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
if content == '':
return []
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/海南省人民政府/政策文件'):
os.makedirs('./相关政策/海南省人民政府/政策文件')
data_list = []
href_list = []
num = 1
types = [2682,2677]
for type in types:
pageSize = getPageSize(type)
for page in range(pageSize):
url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={page}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
soup = getSoup(url)
div_list = soup.select('#showPage > div')
del (div_list[-1])
for div in div_list:
href = div.find('a', class_='titlec').get('href')
if href not in href_list:
data = getData(div, num)
if data:
href_list.append(href)
data_list.append(data)
num += 1
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/海南省人民政府/海南省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
'Token': 'db345f2c-20fd-4cc8-9799-b9cd08b96392',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
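# getDataJson(): the policyWikipedia endpoint is a POST that tunnels GET semantics via
# _method=get; a single request for the first 20 REITs hits is enough here.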
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
data_post = {
'sort': 'smartIndex',
'order': 'asc',
'start': '0',
'length': '20',
'filter.all': 'REITs',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['content']['content']
return data_json
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url,headers=headers,proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.json()['content']['html'],'lxml')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
fjhref_list = ''
fjtitle_list = ''
url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
writtenDate = ''
if type == '政策解读':
origin = organ
organ = ''
href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyResolve/{id}'
else:
origin = '黑龙江省人民政府'
href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyFile/findById/{id}'
soup = getSoup(href_)
try:
a_list = soup.find_all('a')
for a in a_list:
href = a.get('href')
if '.html' in href or '.shtml' in href or '.htm' in href:
continue
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/黑龙江省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except Exception as e:
log.error(f'{title}====={e}')
try:
scripts = soup.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = soup.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = soup.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data_
def doJob():
if not os.path.exists('./相关政策/黑龙江省人民政府/政策文件'):
os.makedirs('./相关政策/黑龙江省人民政府/政策文件')
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
title = data_['title']
publishDate = data_['date']
summary = data_['content']
id = data_['dataId']
type = data_['typeName']
try:
pub_hao = data_['writtenText']
except:
pub_hao = ''
try:
organ = data_['unitShowName']
except:
organ = ''
data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
data_list.append(data)
num += 1
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/黑龙江省人民政府/黑龙江省人民政府政策文件.xlsx', index=False)
if __name__ == "__main__":
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from base import BaseCore
import time
from selenium.webdriver import Firefox
from selenium import webdriver
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
headers_ = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'token=db51d0a6-06e1-49f6-8e4f-8cec52c47bec; uuid=db51d0a6-06e1-49f6-8e4f-8cec52c47bec;',
'Host': 'www.hubei.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson(page):
ip = baseCore.get_proxy()
url = f'http://www.hubei.gov.cn/igs/front/search/list.html?index=hb-govdoc&type=govdoc&pageNumber={page}&pageSize=10&filter[AVAILABLE]=true&filter[fileNum-like]=&filter[Effectivestate]=&filter[fileYear]=&filter[fileYear-lte]=&filter[Subjectclass]=&filter[CateGory]=&filter[DOCTITLE,DOCCONTENT,fileNum-or]=REITs&code=872801132c71495bbe5a938f6acff5aa&siteId=50&filter[SITEID]=54&orderProperty=PUBDATE&orderDirection=desc&6LDjm9Ls=0MADqxalqEiunxfMA3PwdIsvIxiRRQzDxXUAXPlbOXcZq0Rg0iIRTAWPM5NCpsIcnfs9rjzmAOc6t7j5dB4VBmMHY3KtuQHQ6bnSkbepFXgB0I.UuQKzMa5IqQB19wRAMEmnB7VYU4cW'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['page']['content']
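# getContent(): the detail pages are rendered by JavaScript, so Selenium/Firefox loads
# them; images inside hbgov-article-content are saved as attachments.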
def getContent(driver, url, num):
driver.get(url)
time.sleep(5)
fjhref_list = ''
fjtitle_list = ''
publishDate = driver.find_element(By.CLASS_NAME,'hbgov-article-meta-time').text.split('发布时间:')[1].lstrip().strip()
contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content')
img_list = contentWithTag.find_elements(By.TAG_NAME,'img')
num_ = 1
for img in img_list:
fj_title = img.get_attribute('title')
fj_href = img.get_attribute('src')
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/湖北省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
file = f'./相关政策/湖北省人民政府/政策文件/{fj_title}'
num_ += 1
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return publishDate, content, fjtitle_list, fjhref_list
def getData(driver, data_, num):
title = data_['DOCTITLE']
origin = data_['SITENAME']
pub_hao = data_['fileNum']
writtenDate = data_['PUBDATE']
organ = data_['publisher']
summary = data_['highlight']['DOCCONTENT'][0]
href = data_['DOCPUBURL']
publishDate, content, fjtitle_list, fjhref_list = getContent(driver, href, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
service = Service(r'F:\spider\firefox\geckodriver.exe')
options = Options()
options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
driver = webdriver.Firefox(options=options, service=service)
if not os.path.exists('./相关政策/湖北省人民政府/政策文件'):
os.makedirs('./相关政策/湖北省人民政府/政策文件')
data_list = []
num = 1
for page in range(1, 3):
data_json = getDataJson(page)
for data_ in data_json:
data = getData(driver, data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/湖北省人民政府/湖北省人民政府政策文件.xlsx', index=False)
#
if __name__ == '__main__':
doJob()
# service = Service(r'F:\spider\firefox\geckodriver.exe')
# options = Options()
# options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
# driver = webdriver.Firefox(options=options, service=service)
# driver.get('http://www.hubei.gov.cn/zfwj/ezf/202208/t20220801_4245008.shtml')
# time.sleep(5)
# contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content')
# img_list = contentWithTag.find_elements(By.TAG_NAME,'img')
# num = 1
# for img in img_list:
# fj_href = img.get_attribute('src')
# fjcontent = getFjContent(fj_href)
# with open(f'./{num}.png','wb') as f:
# f.write(fjcontent)
# num += 1
baseCore.close()
import os
import re
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'application/json, text/javascript, */*; q=0.01',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
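# Two detail templates exist: getContentA() handles 政务公开 pages (body in div#zoom),
# getContentB() handles 政策文件 pages whose metadata sits in the xxgk_table block.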
def getContentA(url, num, publishDate, title):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
organ = soup.find('div', class_='sp_time').text.split('来源:')[1].split('字体')[0].lstrip().strip()
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
num_ = 1
img_list = contentWithTag.find_all('img')
for img in img_list:
fj_href = img.get('src')
try:
fj_href = 'http://www.jiangsu.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fj_title = img.get('title').lstrip().strip()
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
except:
if 'img/png' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
elif 'img/jpg' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
num_ += 1
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
content = contentWithTag.text
return organ, content, fjtitle_list, fjhref_list
def getContentB(url, num, publishDate, title):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
info = soup.find('table', class_='xxgk_table').text.replace(' ','')
organ = info.split('发布机构:')[1].split('发文日期')[0].lstrip().strip()
writtenDate = info.split('发文日期:')[1].split('标题:')[0].lstrip().strip()
pub_hao = info.split('文号:')[1].split('内容概述:')[0].lstrip().strip()
contentWithTag = soup.find('div', class_='article_content')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
# try:
num_ = 1
img_list = contentWithTag.find_all('img')
for img in img_list:
fj_href = img.get('src')
try:
fj_title = img.get('title').lstrip().strip()
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fj_href = 'http://www.jiangsu.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
if 'image/png' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
elif 'image/jpg' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
num_ += 1
fjtitle_list += fj_title + '\n'
content = contentWithTag.text.lstrip().strip()
return organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/江苏省人民政府/政策文件'):
os.makedirs('./相关政策/江苏省人民政府/政策文件')
pattern = r"\d{4}-\d{2}-\d{2}"
url = 'http://www.jiangsu.gov.cn/jsearchfront/search.do?websiteid=320000000100000&searchid=12&pg=&p=1&tpl=38&serviceType=&cateid=27&q=REITs&pq=&oq=&eq=&pos=&sortType=0&begin=&end='
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(5)
div_list = driver.find_elements(By.CLASS_NAME,'news-result')
num = 1
data_list = []
for div in div_list:
title = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME,'a').get_attribute('title').lstrip().strip()
href = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME,'a').get_attribute('href')
type = div.find_element(By.CLASS_NAME, 'biaoqian').text.lstrip().strip()
summary = div.find_element(By.CLASS_NAME, 'jcse-news-abs-content').text.lstrip().strip()
dateInfo = div.find_element(By.CLASS_NAME, 'jcse-news-date').text
publishDate = re.findall(pattern, dateInfo)[0]
origin = dateInfo.replace(publishDate, '').lstrip().strip()
if type == '政务公开':
organ, content, fjtitle_list, fjhref_list = getContentA(href, num, publishDate, title)
writtenDate = ''
pub_hao = ''
else:
organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list = getContentB(href, num, publishDate, title)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(5)
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./江苏省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson():
ip = baseCore.get_proxy()
url = 'http://sousuo.jiangxi.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '360000000000000',
'q': 'REITs',
'p': '1',
'pg': '20',
'cateid': '1517',
'pos': 'content',
'pq': '',
'oq': '',
'eq': '',
'begin': '',
'end': '',
'tpl': '49',
'sortType': '',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
return data_json
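# getContent(): each search hit links to a page whose body is in div#zoom; inline
# images are downloaded as attachments before scripts/styles are stripped.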
def getContent(url, num, publishDate):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = 'http://www.jiangxi.gov.cn' + img.get('src')
fjhref_list += fj_href + '\n'
fj_title = img.get('title')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江西省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/江西省人民政府/政策文件'):
os.makedirs('./相关政策/江西省人民政府/政策文件')
data_json = getDataJson()
data_list = []
num = 1
for data_ in data_json:
data_ = data_.replace('\\', '')
soup = BeautifulSoup(data_, 'lxml')
title = soup.select('body > div > div:nth-of-type(1) > span:nth-of-type(2) > a')[0].text.lstrip().strip()
pub_hao = soup.find('table', class_='jcse-service-table').find_all('tr')[0].find_all('td')[
-1].text.lstrip().strip()
organ = soup.find('table', class_='jcse-service-table').find_all('tr')[1].find_all('td')[
1].text.lstrip().strip()
writtenDate = soup.find('table', class_='jcse-service-table').find_all('tr')[1].find_all('td')[
-1].text.lstrip().strip()
summary = soup.find('table', class_='jcse-service-table').find_all('tr')[2].text.lstrip().strip()
href = soup.find('table', class_='jcse-service-table').find_all('tr')[3].find('a').get('href')
publishDate = writtenDate
origin = '江西省人民政府'
content, fjtitle_list, fjhref_list = getContent(href, num, publishDate)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/江西省人民政府/江西省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
def getTotal(url):
ip = baseCore.get_proxy()
data_post = 'params=%7B%22word%22%3A%22REITs%22%2C%22page%22%3A1%2C%22size%22%3A20%2C%22stype%22%3A%223%22%2C%22area%22%3A%22220000%22%2C%22atype%22%3A%221%22%2C%22dept%22%3A%22%22%2C%22ttype%22%3A%220%22%2C%22start%22%3A%22%22%2C%22end%22%3A%22%22%2C%22itype%22%3A%22%22%2C%22mattType%22%3A%220%22%2C%22serverType%22%3A%220%22%2C%22sort%22%3A0%2C%22aword%22%3A%22%22%2C%22hword%22%3A%22%22%2C%22nword%22%3A%22%22%2C%22dtword%22%3A%22%22%2C%22scope%22%3A%221%22%2C%22selecttp%22%3A%220%22%2C%22filetype%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22fileyear%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22stypeChild%22%3A%220%22%2C%22hs%22%3A%220%22%2C%22flag%22%3A%22%22%2C%22satisfiedId%22%3A%224FD2493B0F0D447E955C4BB94F42228C634%22%7D'
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['data']['data']['totalPage']
return int(total)
def getDataJson(url, page):
ip = baseCore.get_proxy()
data_post = f'params=%7B%22word%22%3A%22REITs%22%2C%22page%22%3A{page}%2C%22size%22%3A20%2C%22stype%22%3A%223%22%2C%22area%22%3A%22220000%22%2C%22atype%22%3A%221%22%2C%22dept%22%3A%22%22%2C%22ttype%22%3A%220%22%2C%22start%22%3A%22%22%2C%22end%22%3A%22%22%2C%22itype%22%3A%22%22%2C%22mattType%22%3A%220%22%2C%22serverType%22%3A%220%22%2C%22sort%22%3A0%2C%22aword%22%3A%22%22%2C%22hword%22%3A%22%22%2C%22nword%22%3A%22%22%2C%22dtword%22%3A%22%22%2C%22scope%22%3A%221%22%2C%22selecttp%22%3A%220%22%2C%22filetype%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22fileyear%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22stypeChild%22%3A%220%22%2C%22hs%22%3A%220%22%2C%22flag%22%3A%22%22%2C%22satisfiedId%22%3A%224FD2493B0F0D447E955C4BB94F42228C634%22%7D'
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['data']['data']['list']
return data_json
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
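# getData(): jl.gov.cn uses several page templates, so a chain of container ids and
# classes is tried and the metadata (发文机关/成文日期/发文字号) is read from whichever
# info table is present.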
def getData(num, title, href, origin, publishDate, summary):
writtenDate = ''
pub_hao = ''
organ = ''
fjhref_list = ''
fjtitle_list = ''
ip = baseCore.get_proxy()
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
try:
scripts = soup.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = soup.find_all('style')
for style in styles:
style.decompose()
except:
pass
contentWithTag = soup.find('div', class_='contents_div')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zlyxwz')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zlyxwz'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'Zoom'})
if not contentWithTag:
contentWithTag = soup.find('div',class_='sycon_bg')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
try:
organ = soup.find('div', class_='xqy').text
organ = organ.split('来源:')[1].split('字体:')[0].lstrip().strip()
except:
info = soup.find('div', class_='zlylb_dy').find('table').text
organ = info.split('发文机关:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].split('标')[0].lstrip().strip().replace('年', '-').replace('月',
'-').replace(
'日', '')
pub_hao = info.split('发文字号:')[1].split('发布日期:')[0].lstrip().strip()
except:
try:
organ = soup.find('div', class_='mqj_jtyst_xxnry_top_title_left_box').text.split('来源:')[1].lstrip().strip()
except:
table_list = soup.find_all('table')
for table in table_list:
if '发文机关' in table.text:
info = table.text
organ = info.split('发文机关:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].split('标')[0].lstrip().strip().replace('年', '-').replace('月', '-').replace('日', '')
pub_hao = info.split('发文字号:')[1].split('发布日期:')[0].lstrip().strip()
continue
if pub_hao == '无':
pub_hao = ''
try:
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href or '.html' in fj_href or '.shtml' in fj_href or '.htm' in fj_href:
continue
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/吉林省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
try:
a_list = soup.find('div', class_='wjfj-1026').find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href or '.html' in fj_href or '.shtml' in fj_href or '.htm' in fj_href:
continue
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/吉林省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data_
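# 采集吉林省人民政府REITs相关政策文件并导出Excel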
def doJob():
if not os.path.exists('./相关政策/吉林省人民政府/政策文件'):
os.makedirs('./相关政策/吉林省人民政府/政策文件')
data_list = []
num = 1
url = 'https://intellsearch.jl.gov.cn/api/data/list'
total = getTotal(url)
for page in range(1, total + 1):
data_json = getDataJson(url, page)
for data_ in data_json:
title = data_['title']
title = BeautifulSoup(title, 'lxml').find('p').text.lstrip().strip()
href = data_['url']
origin = data_['websiteName']
publishDate = data_['pubtime'].replace('/', '-')
summary = data_['content']
summary = BeautifulSoup(summary, 'lxml').find('p').text.lstrip().strip()
data = getData(num, title, href, origin, publishDate, summary)
data_list.append(data)
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./吉林省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
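# 获取政策详情页正文及发文字号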
def getContent(url):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
contentWithTag = soup.find('div', class_='zfwj_detail')
pub_hao = contentWithTag.find('p', class_='wjh').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return content, pub_hao
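# 通过浏览器检索辽宁省人民政府REITs政策并导出Excel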
def doJob():
url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c'
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')[-1].find_element(By.TAG_NAME,
'label').click()
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'search_inps').send_keys('REITs')
driver.find_element(By.CLASS_NAME, 'search_btns').click()
time.sleep(1)
div_list = driver.find_elements(By.CLASS_NAME, 'searchMod')
num = 1
data_list = []
for div in div_list:
title = div.find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip()
href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(By.TAG_NAME, 'a').text.replace('\n',
'').lstrip().strip()
publishDate = div.find_element(By.CLASS_NAME, 'dates').text.split('时间:')[1].replace('年', '-').replace('月', '-').replace('日', '').lstrip().strip()
content, pub_hao = getContent(href)
data = [num, title, publishDate, '辽宁省人民政府', href, '', '', pub_hao, summary, content, '', '']
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./辽宁省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.nmg.gov.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.nmg.gov.cn/nmg_search/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
headers_ = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
if req.status_code != 200:
return ''
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
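# 计算检索结果总页数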
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum=1&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['total'])
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
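# 获取指定页的检索结果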
def getJson(page):
ip = baseCore.get_proxy()
url = f'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum={page}&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['data']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
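# 解析政策详情页,提取成文日期、正文并下载附件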
def getContent(num, data):
fjhref_list = ''
fjtitle_list = ''
title = data['title']
pub_hao = data['docno']
origin = data['sitedesc']
organ = data['publisher']
publishDate = data['docpubtime']
try:
writtenDate = data['scrq']
except:
writtenDate = ''
summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
url = data['docpuburl']
soup = getSoup(url)
if soup == '':
return ''
url_ = url.split('/')[-1]
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('div', attrs={'id': 'pare'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'd_show'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'class': 'zoomCon'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'pagecontent'})
if writtenDate == '':
try:
tr_list = soup.find('table', class_='m-detailtb').find_all('tr')
for tr in tr_list:
if '成文日期' in tr.text:
writtenDate = tr.text.split('成文日期:')[1].split('发布日期:')[0].lstrip().strip()
except:
tr_list = soup.find('div', class_='main').find('table').find_all('tr')
for tr in tr_list:
if '成文时间' in tr.text:
writtenDate = tr.text.split('成文时间:')[1].lstrip().strip()
try:
contentWithTag.find('div', class_='clearfix').decompose()
except:
pass
try:
contentWithTag.find('div', class_='cc_shangxiaye').decompose()
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except Exception as e:
log.error(f'{title}====={e}')
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data_
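# 采集内蒙古自治区人民政府REITs相关政策文件并导出Excel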
def doJob():
if not os.path.exists('./相关政策/内蒙古自治区人民政府/政策文件'):
os.makedirs('./相关政策/内蒙古自治区人民政府/政策文件')
data_list = []
pageSize = getPageSize()
num = 1
for page in range(1, pageSize + 1):
data_json = getJson(page)
for data_ in data_json:
if data_['chnldesc'] == '政策文件':
data = getContent(num, data_)
if data:
data_list.append(data)
num += 1
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./内蒙古自治区人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
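# 调用检索接口获取当前页结果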
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.shandong.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '370000000088000',
'q': 'REITs',
'p': f'{page}',
'pg': '12',
'cateid': '18002',
'pos': '',
'pq': '',
'oq': '',
'eq': '',
'begin': '',
'end': '',
'tpl': '2204',
'sortFields': "[{'name':'top','clause':1},{'name':'score','clause':1}]",
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['result']
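# 解析政策详情页,提取正文并下载附件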
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='wip_art_con')
a_list = contentWithTag.find_all('a')
num_ = 1
for a in a_list:
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'http://www.shandong.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip().replace(' ', '')
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/山东省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
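# 从检索结果片段中提取标题、链接、发文机构等信息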
def getData(soup, num):
origin = '山东省人民政府'
organ = ''
writtenDate = ''
pub_hao = ''
try:
type = soup.find('span', class_='szf_lmmc').text
title = soup.find('div', class_='szf_title').find('a').text.lstrip().strip()
if '山东省政府文件库' in type:
summary = soup.find('div', class_='szf_ms').text.lstrip().strip()
organ = soup.find('table', class_='szf_xxgk').find_all('tr')[0].find_all('td')[1].text.lstrip().strip()
writtenDate = soup.find('table', class_='szf_xxgk').find_all('tr')[0].find_all('td')[-1].text.lstrip().strip()
pub_hao = soup.find('table', class_='szf_xxgk').find_all('tr')[1].find_all('td')[-1].text.lstrip().strip()
href = soup.find('a', class_='szf_url').text.lstrip().strip()
publishDate = soup.find('span', class_='szf_rq').text.lstrip().strip()
else:
summary = soup.find('div', class_='szf_ms').text.lstrip().strip()
href = soup.find('a', class_='szf_url').text.lstrip().strip()
publishDate = soup.find('span', class_='szf_rq').text.lstrip().strip()
except:
title = soup.find('div', class_='jcse-news-title').find('a').text.lstrip().strip()
summary = soup.find('div', class_='jcse-news-abs-content').text.lstrip().strip()
href = soup.find('div', class_='jcse-news-url').text.lstrip().strip()
publishDate = soup.find('span', class_='jcse-news-date').text.lstrip().strip()
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
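# 采集山东省人民政府REITs相关政策文件并导出Excel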
def doJob():
if not os.path.exists('./相关政策/山东省人民政府/政策文件'):
os.makedirs('./相关政策/山东省人民政府/政策文件')
data_list = []
num = 1
for page in range(1, 3):
data_json = getDataJson(page)
for data_ in data_json:
data_ = data_.replace('\\', '')
soup = BeautifulSoup(data_, 'lxml')
data = getData(soup, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/山东省人民政府/山东省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import json
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'text/plain',
'Host': 'ss.shanghai.gov.cn',
'Origin': 'https://www.shanghai.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
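# 调用检索接口获取结果列表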
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://ss.shanghai.gov.cn/manda-app/api/app/search/v1/1drao49/search'
data_post = {"cid": "lyHojYviSD3dOfgVFV4aGIu8Ytk7zEWy", "uid": "lyHojYviSD3dOfgVFV4aGIu8Ytk7zEWy", "query": "REITs",
"current": 1, "size": 20, "disable_correction": False,
"facets": {"fwjg": [{"type": "value", "name": "fwjg", "sort": {"count": "desc"}, "size": 100}]},
"input_type": "Input"}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']['items']
return data_json
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
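# 通过浏览器打开详情页,提取正文、印发日期及附件信息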
def getData(data_, driver, num):
fjhref_list = ''
fjtitle_list = ''
title = data_['title']['raw']
publishDate = data_['date']['raw']
origin = '上海市人民政府'
href = data_['url']['raw']
organ = data_['fwjg']['raw']
pub_hao = data_['wh']['raw']
summary = data_['content']['snippet']
driver.get(href)
time.sleep(1)
content = driver.find_element(By.CLASS_NAME, 'Article_content').text.lstrip().strip()
timeTag = driver.find_element(By.CLASS_NAME, 'PBtime').text
try:
try:
try:
writtenDate = timeTag.split('印发日期:')[1].split('发布日期')[0].lstrip().strip()
except:
writtenDate = timeTag.split('印发日期:')[1].split(f'{pub_hao}')[0].lstrip().strip()
except:
writtenDate = timeTag.split('印发日期:')[1].lstrip().strip()
except:
writtenDate = ''
try:
a_list = driver.find_element(By.CLASS_NAME, 'gaoj-list').find_elements(By.TAG_NAME, 'a')
for a in a_list:
fj_href = a.get_attribute('href')
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
# fjcontent = getFjContent(href)
# file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
# with open(file, 'wb') as f:
# f.write(fjcontent)
# log.info(f'{fj_title}===附件下载成功')
except:
pass
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
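# 采集上海市人民政府REITs相关政策文件并导出Excel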
def doJob():
if not os.path.exists('./相关政策/上海市人民政府/政策文件'):
os.makedirs('./相关政策/上海市人民政府/政策文件')
driver = baseCore.buildDriver()
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
data = getData(data_, driver, num)
log.info(f'{data[1]}===采集成功')
data_list.append(data)
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/上海市人民政府/上海市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.shanxi.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.shanxi.gov.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
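# 计算检索结果总页数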
def getPageSize():
ip = baseCore.get_proxy()
url = 'http://www.shanxi.gov.cn/trs-search/trssearch/v2/searchAll.do?siteId=110&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum=1&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt='
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['data']['total']
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
def getJson(page):
ip = baseCore.get_proxy()
url = f'http://www.shanxi.gov.cn/trs-search/trssearch/v2/searchAll.do?siteId=110&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum={page}&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt='
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['data']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
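# 解析政策详情页,提取正文并下载附件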
def getContent(num, data):
fjhref_list = ''
fjtitle_list = ''
title = data['title']
pub_hao = data['docno']
origin = data['sitedesc']
organ = data['publisher']
publishDate = data['docpubtime']
writtenDate = data['scrq']
summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
url = data['docpuburl']
url_ = url.split('/')[-1]
soup = getSoup(url)
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('dt', class_='fl_pc')
if not contentWithTag:
contentWithTag = soup.find('div', class_='sxgzk-detail-con')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/山西省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data_
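# 采集山西省人民政府REITs相关政策文件并导出Excel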
def doJob():
if not os.path.exists('./相关政策/山西省人民政府/政策文件'):
os.makedirs('./相关政策/山西省人民政府/政策文件')
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
data_json = getJson(page)
for i in range(len(data_json)):
if data_json[i]['chnldesc'] == '政策文件':
data = getContent(num, data_json[i])
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./山西省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
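# 调用检索接口获取结果列表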
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://api.so-gov.cn/query/s'
data_post = {
'siteCode': '5100000062',
'tab': 'zcwj',
'qt': 'REITs',
'keyPlace': '0',
'sort': 'dateDesc',
'fileType': '',
'timeOption': '0',
'locationCode': '510000000000',
'page': '1',
'pageSize': '20',
'ie': 'c0e059a8-7a00-4fa9-9d70-873a5284d8a0',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['resultDocs']
return data_json
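# 解析政策详情页,提取成文日期、正文并下载正文中的图片附件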
def getContent(url, publishDate, num):
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
try:
writtenDate = soup.select('#szfcontentwrap2022 > div.zfwjwzcontent > div.topbox > ul > li')[3].text.split('成文日期:')[1].lstrip().strip()
except:
writtenDate = ''
try:
contentWithTag = soup.select('.contText')[0]
except:
contentWithTag = soup.select('#cmsArticleContent')[0]
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = url_ + img.get('src')
fjhref_list += fj_href + '\n'
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/四川省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return writtenDate, content, fjtitle_list, fjhref_list
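# 提取单条检索结果的基本信息,PDF链接直接下载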
def getData(data_, num):
title = data_['data']['title']
publishDate = data_['data']['docDate']
origin = data_['data']['siteLabel']['value']
href = data_['data']['url']
organ = data_['data']['myValues']['DOCPUBNAME']
pub_hao = data_['data']['myValues']['DOCNOVAL']
summary = ''
if '.pdf' in href or '.PDF' in href:
content = ''
writtenDate = ''
fjtitle_list = title + '.pdf'
fjhref_list = href
fjcontent = getFjContent(href)
file = f'./相关政策/四川省人民政府/政策文件/{title}.pdf'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
else:
writtenDate, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
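# 采集四川省人民政府REITs相关政策文件并导出Excel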
def doJob():
if not os.path.exists('./相关政策/四川省人民政府/政策文件'):
os.makedirs('./相关政策/四川省人民政府/政策文件')
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
data = getData(data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/四川省人民政府/四川省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
from datetime import datetime
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.tj.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
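# 获取检索结果总页数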
def getTotal():
ip = baseCore.get_proxy()
url = 'https://www.tj.gov.cn/igs/front/search.jhtml?code=78778b9ded5140d4984030cf8f469303&pageNumber=1&pageSize=10&searchWord=REITs&siteId=34&sortByFocus=true&type=21515&type1=21519'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['page']['totalPages']
return int(total)
def getJson(page):
ip = baseCore.get_proxy()
url = f'https://www.tj.gov.cn/igs/front/search.jhtml?code=78778b9ded5140d4984030cf8f469303&pageNumber={page}&pageSize=10&searchWord=REITs&siteId=34&sortByFocus=true&type=21515&type1=21519'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['page']['content']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
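# 解析政策详情页,提取正文并下载附件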
def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
url_ = url.split('/')[-1]
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('div', class_='article_content')
try:
contentWithTag.find('div', class_='articlePlayer').decompose()
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
a_list = contentWithTag.find('div', class_='qt-attachments').find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{pub_time}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/天津市人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
except:
pass
try:
contentWithTag.find('div', class_='qt-attachments').decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
data = [num, title, pub_time, origin, url, pub_time, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data
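# 采集天津市人民政府REITs相关政策文件并导出Excel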
def doJob():
if not os.path.exists('./相关政策/天津市人民政府/政策文件'):
os.makedirs('./相关政策/天津市人民政府/政策文件')
data_list = []
total = getTotal()
num = 1
for page in range(1, total + 1):
data_json = getJson(page)
for i in range(len(data_json)):
title = data_json[i]['title']
pub_time = datetime.strptime(data_json[i]['trs_time'], "%Y-%m-%dT%H:%M:%S.%f%z").date()
origin = data_json[i]['trs_site']
organ = data_json[i]['department']
href = data_json[i]['url']
pub_hao = data_json[i]['wh']
summary = ''
data = getContent(num, title, pub_time, origin, organ, href, pub_hao, summary)
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./天津市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import re
import time
import datetime
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
@retry(tries=3, delay=5)
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
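# 解析政策详情页,提取正文并下载附件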
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='TRS_UEDITOR')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
num_ = 1
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_title = a.text.lstrip().strip()
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'https://www.yn.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
num_ += 1
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
a_list = soup.find('ul', class_='apfile').find_all('a')
for a in a_list:
fj_title = a.text.lstrip().strip()
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'https://www.yn.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
num_ += 1
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
return content, fjtitle_list, fjhref_list
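# 从检索结果条目中提取标题、发文机构、发文字号等信息,PDF链接直接下载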
def getData(div, num):
pattern = r"\d{4}-\d{2}-\d{2}"
title = div.find_element(By.CLASS_NAME, 'title').find_element(By.CLASS_NAME, 'fontlan').get_attribute(
'title').lstrip().strip()
href = div.find_element(By.CLASS_NAME, 'fontlan').get_attribute('href')
origin = '云南省人民政府'
try:
publishDate = re.findall(pattern, div.find_element(By.CLASS_NAME, 'content').text)[0]
except:
publishDate = ''
try:
organ = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')[1].find_element(By.CLASS_NAME, 'txt').text.lstrip().strip()
pub_hao = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')[0].find_element(By.CLASS_NAME, 'txt').text.lstrip().strip()
if pub_hao == '无':
pub_hao = ''
except:
organ = ''
pub_hao = ''
summary = ''
writtenDate = ''
if '.pdf' in href or '.PDF' in href:
content = ''
fjhref_list = href
fj_title = title + '.pdf'
fjcontent = getFjContent(fjhref_list)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjtitle_list = fj_title
else:
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
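# 通过浏览器检索云南省人民政府REITs政策并导出Excel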
def doJob():
if not os.path.exists('./相关政策/云南省人民政府/政策文件'):
os.makedirs('./相关政策/云南省人民政府/政策文件')
data_list = []
url = 'https://sheng.so-gov.cn/s?siteCode=5300000033&qt=REITs'
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(2)
num = 1
for type in range(3, 5):
driver.find_elements(By.XPATH, '/html/body/div/div[6]/div[2]/div[3]/ul/li')[type].click()
time.sleep(2)
if type == 3:
driver.find_element(By.ID, 'key_place_context_id').click()
time.sleep(2)
try:
total = int(driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')[-2].text)
except:
total = 1
for page in range(total):
time.sleep(2)
div_list = driver.find_elements(By.XPATH, '//*[@id="results"]/div')
for div in div_list:
data = getData(div, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
try:
driver.find_element(By.CLASS_NAME, 'pagination').find_element(By.CLASS_NAME, 'next').click()
except:
pass
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/云南省人民政府/云南省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
@retry(tries=3,delay=10)
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
'pg': '10',
'p': '1',
'tpl': '1569',
'cateid': '372',
'word': 'REITs',
'checkError': '1',
'isContains': '1',
'q': 'REITs',
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['total']
if total % 10 == 0:
pageSize = total // 10
else:
pageSize = total // 10 + 1
req.close()
return pageSize
@retry(tries=3,delay=10)
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
'pg': '10',
'p': f'{page}',
'tpl': '1569',
'cateid': '372',
'word': 'REITs',
'checkError': '1',
'isContains': '1',
'q': 'REITs',
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
return data_json
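# 解析当前页检索结果,提取标题、链接、发布机构等信息(目前仅输出日志,未汇总导出)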
def getDatas(page):
data_json = getDataJson(page)
for data_ in data_json:
soup = BeautifulSoup(data_, 'lxml')
title = soup.find('div', class_='titleWrapper').find('a', class_='textTitle').text.lstrip().strip().replace(' ','').replace('\r\n',' ')
href = soup.find('div', class_='titleWrapper').find('a', class_='textTitle').get('href')
href = href.split('url=')[1].split('.html')[0].replace('%3A',':').replace('%2F','/') + '.html'
try:
info = soup.find('table', class_='fgwj_table_list').text
organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].lstrip().strip()
except:
organ = ''
writtenDate = None
origin = soup.find('div', class_='sourceTime').text.split('来源:')[1].split('时间:')[0].lstrip().strip().replace(' ','').replace(' ', '').replace('\r\n', '')
publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
log.info(origin)
time.sleep(5)
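# 遍历所有页,采集浙江省人民政府REITs政策检索结果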
def doJob():
pageSize = getPageSize()
for page in range(1, pageSize + 1):
datas = getDatas(page)
if __name__ == '__main__':
doJob()
# url = 'http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html'
# req = requests.get(url,headers=headers)
# req.encoding = req.apparent_encoding
baseCore.close()
import os
@@ -136,7 +136,7 @@ class Policy():
policy = Policy()
#国家发展和改革委员会 https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform():
def reform(wb,file_path):
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
@@ -171,12 +171,6 @@ def reform():
title = info['title']
summary = info['summary'].replace('<em>','').replace('</em>','')
newsUrl = info['url']
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
@@ -245,7 +239,6 @@ def reform():
publishDate = ''
policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
policy.deletek(contentWithTag)
content = contentWithTag.text
try:
policy.paserUrl(newssoup,newsUrl)
@@ -357,12 +350,11 @@ def zhengquanqihuo():
total = pageUtil['rowCount']
page_size = pageUtil['pageSize']
Max_page = int(total / page_size)
# DataList = []
DataList = []
num = 0
webname = '证券期货法规数据库系统'
# path = 'data/证监会'
# if not os.path.exists(path):
# os.makedirs(path)
path = 'data/证监会'
if not os.path.exists(path):
os.makedirs(path)
for page in range(0, Max_page+1):
payload_page = {
'pageNo': page + 1,
@@ -380,7 +372,6 @@ def zhengquanqihuo():
data_page = policy.requestPost(headers, url, payload_page)
info_list = data_page['pageUtil']['pageList']
for info in info_list:
id_list = []
num += 1
try:
title = info['secFutrsLawName']
@@ -391,12 +382,6 @@ def zhengquanqihuo():
# print(publishDate)
secFutrsLawId = info['secFutrsLawId']
newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
browser = policy.createDriver()
browser.get(newsUrl)
time.sleep(1)
@@ -429,25 +414,18 @@ def zhengquanqihuo():
'createDate': time_now,
'sid': '1729030277461815298',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
# DataList.append(dic_info)
# sheet_name = "证监会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # 创建新工作表
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # 保存Excel文件
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
DataList.append(dic_info)
sheet_name = "证监会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# 创建新工作表
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# 保存Excel文件
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{num}")
log.info({e})
@@ -472,10 +450,9 @@ def sse(wb,file_path):
total_page = result['data']['totalPage']
DataList = []
num = 0
webname = '上海证券交易所'
# path = 'data/上海交易所'
# if not os.path.exists(path):
# os.makedirs(path)
path = 'data/上海交易所'
if not os.path.exists(path):
os.makedirs(path)
for page in range(0, int(total_page)):
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
data = policy.getrequest_json(headers, url_page)
@@ -516,10 +493,10 @@ def sse(wb,file_path):
content += page.get_text()
file_href = newsUrl
file_name = title
policy.attuributefile(title, newsUrl, num, publishDate)
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
dic_info = {
'序号': num,
'标题': title,
@@ -603,6 +580,100 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name)
#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size)
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
# print(info_list)
for info_ in info_list:
info = info_['data']
title = info['titleO']
titleLabel = info['titleLabel']['value']
publishDate = info['docDate']
# source = info['siteLabel']['value']
newsUrl = info['url']
if titleLabel == '政策解读':
newssoup = policy.getrequest_soup(headers, newsUrl)
print(newssoup)
contentWithTag = newssoup.find('div', id='mainText')
content = contentWithTag.text
source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
dic_info = {
'title': title,
'publishDate': publishDate,
'source': source,
'newsUrl': newsUrl,
}
formatRows = info['formatRows']
num = 1
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
file_href = value.keys()
file_name = value.values()
# 附件上传
policy.attuributefile(file_name, file_href, num, publishDate)
num += 1
dic_info['file_href'] = file_href
value = col['value'][0]
dic_info[name] = value
# print(dic_info)
# break
# 河北省人民政府
def hebei():
path = 'data/河北省人民政府'
@@ -807,6 +878,10 @@ def hebei():
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
# 广东省人民政府
def guangdong():
pass
# 贵州省人民政府
def guizhou():
@@ -915,12 +990,12 @@ def guizhou():
if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
# reform()
file_path = f'data/REITs贵州省人民政府.xlsx'
wb = policy.createfile(file_path)
# reform(wb,file_path)
# shenzhen()
zhengquanqihuo()
# sse()
# hebei()
# guizhou()
#guizhou()
# zhengquanqihuo()
\ No newline at end of file