Commit b3fa91e8 Author: LiuLiYuan

REITs policies and regulations 03/21

Parent 88209302
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '北京市人民政府'
+topic = 'research_center_fourth'
+webname = '北京市人民政府_'
class Policy1():
@retry(tries=3, delay=10)
......@@ -282,14 +282,17 @@ def beijing():
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if content == '':
continue
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang':lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -312,6 +315,6 @@ def beijing():
time.sleep(random.randint(10, 20))
num += 1
-# if __name__ == '__main__':
-# beijing()
-# baseCore.close()
+if __name__ == '__main__':
+beijing()
+baseCore.close()
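
Every spider touched by this commit replaces the empty 'id' with the subjectId concatenated to the current Unix timestamp in whole seconds. A minimal sketch of that scheme, plus a hypothetical millisecond variant (not part of this commit) that would make collisions between records minted in the same second less likely:

import time

SUBJECT_ID = '1729315113088765953'

def make_record_id():
    # committed scheme: subjectId + whole-second Unix timestamp
    return SUBJECT_ID + str(int(time.time()))

def make_record_id_ms():
    # hypothetical variant: same prefix, millisecond resolution
    return SUBJECT_ID + str(int(time.time() * 1000))

Two records built within the same second would share an id under the first form; in practice the random 10-20 s sleeps between articles make that unlikely here.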
import json
import time
import os
......@@ -12,8 +13,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '福建省人民政府'
+topic = 'research_center_fourth'
+webname = '福建省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......@@ -66,6 +67,7 @@ def getContent(num, url, publishDate):
style.decompose()
except:
pass
+try:
a_list = soup.find('div', class_='xl_list1').find_all('a')
for a in a_list:
fj_href = a.get('href')
......@@ -82,6 +84,8 @@ def getContent(num, url, publishDate):
if att_id:
id_list.append(att_id)
a['href'] = full_path
+except:
+pass
content = contentWithTag.text.lstrip().strip()
......@@ -116,7 +120,10 @@ def doJob():
for data_post in data_posts:
data_json = getDataJson(data_post)
for data_ in data_json:
+try:
title = data_['_doctitle']
+except:
+title = data_['doctitle']
publishDate = data_['crtime'].replace('.','-')
origin = data_['docsourcename']
href = data_['docpuburl']
......@@ -142,14 +149,17 @@ def doJob():
content, contentWithTag, id_list = getContent(num, href, publishDate[:10])
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
contentWithTag_str = str(contentWithTag)
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
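Each dic_info also gains a 'lang' field from baseCore.detect_language(content). That helper is not shown in this diff; a stand-in built on the langdetect package might look like the following (the package choice and the empty-string fallback are assumptions):

from langdetect import detect, LangDetectException

def detect_language(content):
    # hypothetical stand-in for baseCore.detect_language
    try:
        return detect(content)  # e.g. 'zh-cn', 'en'
    except LangDetectException:
        return ''  # empty or undetectable text
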
......@@ -20,7 +20,7 @@ policy = Policy()
topic = 'research_center_fourth'
-webname = '广东省人民政府'
+webname = '广东省人民政府_'
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
......@@ -225,10 +225,4 @@ def doJob():
if __name__ == '__main__':
doJob()
-# doJob_1()
-# doJob_2(2)
-# url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
-# soup = getSoup(url)
-#
-# print(contentWithTag)
baseCore.close()
......@@ -12,11 +12,11 @@ baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '广西壮族自治区人民政府'
+topic = 'research_center_fourth'
+webname = '广西壮族自治区人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/json',
......@@ -41,11 +41,27 @@ def getFjContent(url):
def getTotal():
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
# data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
# "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
# "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
# "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
# "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+data_post = {'advancedFilters': None,
+'appendixType': "",
+'code': "181aedaa542",
+'configCode': "",
+'dataTypeId': "241",
+'filters': [],
+'granularity': "ALL",
+'historySearchWords': [],
+'isAdvancedSearch': None,
+'isDefaultAdvanced': None,
+'isSearchForced': "0",
+'orderBy': "related",
+'pageNo': 1,
+'pageSize': 10,
+'searchBy': "all",
+'searchWord': "REITs", }
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -55,11 +71,27 @@ def getTotal():
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
# data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
# "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
# "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
# "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
# "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+data_post = {'advancedFilters': None,
+'appendixType': "",
+'code': "181aedaa542",
+'configCode': "",
+'dataTypeId': "241",
+'filters': [],
+'granularity': "ALL",
+'historySearchWords': [],
+'isAdvancedSearch': None,
+'isDefaultAdvanced': None,
+'isSearchForced': "0",
+'orderBy': "related",
+'pageNo': page,
+'pageSize': 10,
+'searchBy': "all",
+'searchWord': "REITs", }
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -117,14 +149,17 @@ def getData(data_, num):
content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'author': '',
'subjectId': '1729315113088765953',
+'lang': lang,
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953' + str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -163,7 +198,6 @@ def doJob():
time.sleep(2)
if __name__ == '__main__':
doJob()
baseCore.close()
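
After this rewrite, getTotal and getDataJson build identical payloads except for 'pageNo', and the old dict's 'sign' field and stray duplicate key "advancedFilters " (note the trailing space) are gone. A shared builder would remove the remaining duplication; a refactor sketch, not part of the commit:

import json

def build_search_payload(page):
    # same keys and values as the two committed dicts; only pageNo varies
    payload = {'advancedFilters': None,
               'appendixType': "",
               'code': "181aedaa542",
               'configCode': "",
               'dataTypeId': "241",
               'filters': [],
               'granularity': "ALL",
               'historySearchWords': [],
               'isAdvancedSearch': None,
               'isDefaultAdvanced': None,
               'isSearchForced': "0",
               'orderBy': "related",
               'pageNo': page,
               'pageSize': 10,
               'searchBy': "all",
               'searchWord': "REITs"}
    return json.dumps(payload)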
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '海南省人民政府'
+topic = 'research_center_fourth'
+webname = '海南省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -108,14 +108,17 @@ def getData(div, num):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': [],
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
-#coding=utf-8
+# coding=utf-8
import os
import time
......@@ -10,14 +10,14 @@ baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '黑龙江省人民政府'
+topic = 'research_center_fourth'
+webname = '黑龙江省人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
-'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
+'Token': 'b946cd4e-77a4-42f5-bcaf-a9c4f26b5191',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -26,11 +26,12 @@ def getDataJson():
ip = baseCore.get_proxy()
url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
data_post = {
-'sort': 'smartIndex',
-'order': 'asc',
+'sort': 'date',
+'order': 'desc',
'start': '0',
'length': '20',
'filter.all': 'REITs',
+'filter.tyoe': '0'
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -54,7 +55,7 @@ def getFjContent(url):
return req.content
-def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
+def getContent(num, title, publishDate, summary, id, pub_hao, organ, type):
id_list = []
url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
writtenDate = None
......@@ -83,7 +84,7 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
-att_id, full_path = policy.attuributefile(fj_title,href,num,publishDate)
+att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
......@@ -104,14 +105,17 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
content = soup.text.lstrip().strip()
contentWithTag_str = str(soup)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953' + str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -135,7 +139,6 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
def doJob():
num = 1
data_json = getDataJson()
for data_ in data_json:
......@@ -152,7 +155,7 @@ def doJob():
organ = data_['unitShowName']
except:
organ = ''
-data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
+data = getContent(num, title, publishDate, summary, id, pub_hao, organ, type)
# data_list.append(data)
num += 1
time.sleep(3)
......
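The Heilongjiang query now sorts by 'date' descending instead of 'smartIndex' ascending, and adds a 'filter.tyoe' parameter (spelled as committed; possibly a typo for 'filter.type'). The 'start'/'length' fields read like offset paging, so later pages would presumably be fetched by advancing 'start'; a hedged sketch:

import requests

URL = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'

def fetch_page(start, length=20, headers=None, proxies=None):
    # newest-first ordering as committed; offset paging is an assumption
    data = {
        'sort': 'date',
        'order': 'desc',
        'start': str(start),
        'length': str(length),
        'filter.all': 'REITs',
        'filter.tyoe': '0',  # parameter name exactly as committed
    }
    req = requests.post(URL, headers=headers, data=data, proxies=proxies)
    req.encoding = req.apparent_encoding
    return req.json()  # assuming a JSON body, as getDataJson's iteration suggests
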
......@@ -21,8 +21,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '湖北省人民政府'
+topic = 'research_center_fourth'
+webname = '湖北省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -114,14 +114,17 @@ def getData(driver, data_, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -145,7 +148,8 @@ def getData(driver, data_, num):
def doJob():
-service = Service(r'D:/soft/geckodriver.exe')
+# service = Service(r'D:/soft/geckodriver.exe')
+service = Service(r'F:\spider\firefox\geckodriver_1.exe')
options = Options()
options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
driver = webdriver.Firefox(options=options, service=service)
......
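The Hubei change only swaps one hard-coded geckodriver path for another. Passing the path in as a parameter would avoid accumulating commented-out leftovers; a sketch using the same Selenium 4 calls as the diff:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def create_firefox(driver_path):
    # driver_path is the machine-specific bit (D:/soft/... vs F:\spider\... above)
    options = Options()
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
    return webdriver.Firefox(options=options, service=Service(driver_path))
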
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '江苏省人民政府'
+topic = 'research_center_fourth'
+webname = '江苏省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -85,14 +85,17 @@ def getContentA(url, num, publishDate, title, origin, summary):
content = contentWithTag.text
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -163,13 +166,16 @@ def getContentB(url, num, publishDate, title, origin, summary):
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,7 +16,7 @@ policy = Policy()
topic = 'research_center_fourth'
-webname = '江西省人民政府'
+webname = '江西省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......
......@@ -14,8 +14,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '吉林市人民政府'
+topic = 'research_center_fourth'
+webname = '吉林市人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -155,14 +155,17 @@ def getData(num, title, url, origin, publishDate, summary):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '辽宁省人民政府'
+topic = 'research_center_fourth'
+webname = '辽宁省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
......@@ -63,14 +63,17 @@ def doJob():
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': [],
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': '辽宁省人民政府',
......
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '内蒙古自治区人民政府'
+topic = 'research_center_fourth'
+webname = '内蒙古自治区人民政府_'
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -188,14 +188,17 @@ def getContent(num, data):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -11,8 +11,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '山东省人民政府'
+topic = 'research_center_fourth'
+webname = '山东省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......@@ -131,14 +131,17 @@ def getData(soup, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '上海市人民政府'
+topic = 'research_center_fourth'
+webname = '上海市人民政府_'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -111,14 +111,17 @@ def getData(data_, driver, num):
# fjhref_list]
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -14,8 +14,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '山西省人民政府'
+topic = 'research_center_fourth'
+webname = '山西省人民政府_'
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate',
......@@ -130,14 +130,17 @@ def getContent(num, data):
a['href'] = full_path
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -14,8 +14,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '四川省人民政府'
+topic = 'research_center_fourth'
+webname = '四川省人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -106,14 +106,17 @@ def getData(data_, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,8 +16,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '天津市人民政府'
+topic = 'research_center_fourth'
+webname = '天津市人民政府_'
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
......@@ -137,14 +137,17 @@ def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': pub_time,
'origin': origin,
......
......@@ -19,8 +19,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '云南省人民政府'
+topic = 'research_center_fourth'
+webname = '云南省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -149,14 +149,17 @@ def getData(div, num):
content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,8 +16,8 @@ headers = {
'X-Requested-With': 'XMLHttpRequest',
}
-topic = 'policy'
-webname = '浙江省人民政府'
+topic = 'research_center_fourth'
+webname = '浙江省人民政府_'
class Policy():
def getrequest_soup(self, headers, url):
......@@ -502,14 +502,17 @@ def getDatas(page):
continue
num += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -42,10 +42,12 @@ class Policy():
return result
def createDriver(self):
-chrome_driver = r'D:\cmd100\chromedriver.exe'
+# chrome_driver = r'D:\cmd100\chromedriver.exe'
+chrome_driver = r'F:\spider\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
-chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+# chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+chrome_options.binary_location = r'F:\spider\85\Google\Chrome\Application\chrome.exe'
# set up a proxy
# proxy = "127.0.0.1:8080"  # proxy address and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
......
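createDriver gets the same treatment: both machine-specific Windows paths are swapped rather than parameterised. A sketch with the paths (and the commented-out proxy) passed in; the signature is hypothetical:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def create_driver(driver_path, binary_path, proxy=None):
    options = webdriver.ChromeOptions()
    options.binary_location = binary_path  # path to chrome.exe
    if proxy:  # e.g. '127.0.0.1:8080', per the commented-out lines
        options.add_argument('--proxy-server=http://' + proxy)
    return webdriver.Chrome(service=Service(driver_path), options=options)
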
import datetime
......@@ -250,6 +250,7 @@ def doJob(obsOperate):
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
num += 1
+createDate = datetime.datetime.now().strftime('%Y-%m-%d')
dic_info = {
'code': code, # code
'name': name, # fund name
......@@ -260,6 +261,7 @@ def doJob(obsOperate):
'date': date, # date (datetime type)
'strDate': str(date)[:10], # date (string type)
'exchange': '香港交易所', # exchange
+'createDate':createDate # creation time
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={title}===采集成功')
......
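The new 'createDate' field records the insertion day as 'YYYY-MM-DD', matching the format 'strDate' already uses; a quick check that the two idioms in this file agree:

import datetime

now = datetime.datetime.now()
create_date = now.strftime('%Y-%m-%d')  # how 'createDate' is built
str_date = str(now)[:10]                # how 'strDate' is built from 'date'
assert create_date == str_date          # both yield 'YYYY-MM-DD'
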
import re
import fitz
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize():
# url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=1'
soup = getSoup(url)
total = int(re.findall(r'\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
if total % 4 == 0:
pageSize = int(total / 4)
else:
pageSize = int(total / 4) + 1
return pageSize
def getContent(url):
content = ''
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
# req.encoding = req.apparent_encoding
with fitz.open(stream=req.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
content += page.get_text()
return content
def doJob():
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
# url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=1'
soup = getSoup(url)
div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
for div in div_list:
fjtitle_list = ''
fjhref_list = ''
name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
href = div.find('a', class_='zhuanyuan_xinxi').get('href')
origin = '戴德梁行'
try:
content = getContent(href)
# print(content)
except Exception as e:
log.error(f'第{page}页==={name}===连接失败')
continue
title = name.replace('/',' ').replace('|',' ').replace('?',' ').replace('"','”')
if __name__ == '__main__':
doJob()
baseCore.close()
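
getPageSize's if/else over total % 4 is ceiling division (four reports per listing page). math.ceil says the same thing in one line; a small equivalence check:

import math

def page_count(total, per_page=4):
    # equivalent to the committed if/else: round total/per_page up
    return math.ceil(total / per_page)

assert page_count(7) == 2 and page_count(8) == 2 and page_count(9) == 3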