Commit b3fa91e8 Author: LiuLiYuan

REITs policies and regulations 03/21

Parent 88209302
@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '北京市人民政府'
+topic = 'research_center_fourth'
+webname = '北京市人民政府_'
 class Policy1():
     @retry(tries=3, delay=10)
@@ -282,14 +282,17 @@ def beijing():
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         if content == '':
             continue
+        lang = baseCore.detect_language(content)
         dic_info = {
            'attachmentIds': id_list,
+           'subjectId': '1729315113088765953',
+           'lang':lang,
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag_str,
            'deleteFlag': 0,
            'checkStatus': 1,
-           'id': '',
+           'id': '1729315113088765953'+str(int(time.time())),
            'title': title,
            'publishDate': publishDate,
            'origin': origin,
@@ -312,6 +315,6 @@ def beijing():
         time.sleep(random.randint(10, 20))
         num += 1
-# if __name__ == '__main__':
-#     beijing()
-#     baseCore.close()
+if __name__ == '__main__':
+    beijing()
+    baseCore.close()
+import json
 import time
 import os
@@ -12,8 +13,8 @@ log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '福建省人民政府'
+topic = 'research_center_fourth'
+webname = '福建省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'X-Requested-With': 'XMLHttpRequest',
@@ -66,6 +67,7 @@ def getContent(num, url, publishDate):
         style.decompose()
     except:
         pass
+    try:
         a_list = soup.find('div', class_='xl_list1').find_all('a')
         for a in a_list:
             fj_href = a.get('href')
@@ -82,6 +84,8 @@ def getContent(num, url, publishDate):
             if att_id:
                 id_list.append(att_id)
             a['href'] = full_path
+    except:
+        pass
     content = contentWithTag.text.lstrip().strip()
@@ -116,7 +120,10 @@ def doJob():
     for data_post in data_posts:
         data_json = getDataJson(data_post)
         for data_ in data_json:
+            try:
                 title = data_['_doctitle']
+            except:
+                title = data_['doctitle']
             publishDate = data_['crtime'].replace('.','-')
             origin = data_['docsourcename']
             href = data_['docpuburl']
@@ -142,14 +149,17 @@ def doJob():
             content, contentWithTag, id_list = getContent(num, href, publishDate[:10])
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             contentWithTag_str = str(contentWithTag)
+            lang = baseCore.detect_language(content)
             dic_info = {
                 'attachmentIds': id_list,
+                'subjectId': '1729315113088765953',
+                'lang': lang,
                 'author': '',
                 'content': content,
                 'contentWithTag': contentWithTag_str,
                 'deleteFlag': 0,
                 'checkStatus': 1,
-                'id': '',
+                'id': '1729315113088765953'+str(int(time.time())),
                 'title': title,
                 'publishDate': publishDate,
                 'origin': origin,
...
@@ -20,7 +20,7 @@ policy = Policy()
 topic = 'research_center_fourth'
-webname = '广东省人民政府'
+webname = '广东省人民政府_'
 headers = {
     'Content-Type': 'application/json',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
@@ -225,10 +225,4 @@ def doJob():
 if __name__ == '__main__':
     doJob()
-    # doJob_1()
-    # doJob_2(2)
-    # url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
-    # soup = getSoup(url)
-    #
-    # print(contentWithTag)
     baseCore.close()
@@ -12,11 +12,11 @@ baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '广西壮族自治区人民政府'
+topic = 'research_center_fourth'
+webname = '广西壮族自治区人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'Content-Type': 'application/json',
@@ -41,11 +41,27 @@ def getFjContent(url):
 def getTotal():
     ip = baseCore.get_proxy()
     url = 'http://www.gxzf.gov.cn/irs/front/search'
-    data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
-                 "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
-                 "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
-                 "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
-                 "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    # data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
+    #              "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
+    #              "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
+    #              "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
+    #              "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    data_post = {'advancedFilters': None,
+                 'appendixType': "",
+                 'code': "181aedaa542",
+                 'configCode': "",
+                 'dataTypeId': "241",
+                 'filters': [],
+                 'granularity': "ALL",
+                 'historySearchWords': [],
+                 'isAdvancedSearch': None,
+                 'isDefaultAdvanced': None,
+                 'isSearchForced': "0",
+                 'orderBy': "related",
+                 'pageNo': 1,
+                 'pageSize': 10,
+                 'searchBy': "all",
+                 'searchWord': "REITs", }
     data_post = json.dumps(data_post)
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding
@@ -55,11 +71,27 @@ def getTotal():
 def getDataJson(page):
     ip = baseCore.get_proxy()
     url = 'http://www.gxzf.gov.cn/irs/front/search'
-    data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
-                 "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
-                 "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
-                 "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
-                 "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    # data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
+    #              "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
+    #              "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
+    #              "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
+    #              "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+    data_post = {'advancedFilters': None,
+                 'appendixType': "",
+                 'code': "181aedaa542",
+                 'configCode': "",
+                 'dataTypeId': "241",
+                 'filters': [],
+                 'granularity': "ALL",
+                 'historySearchWords': [],
+                 'isAdvancedSearch': None,
+                 'isDefaultAdvanced': None,
+                 'isSearchForced': "0",
+                 'orderBy': "related",
+                 'pageNo': page,
+                 'pageSize': 10,
+                 'searchBy': "all",
+                 'searchWord': "REITs", }
     data_post = json.dumps(data_post)
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding
@@ -117,14 +149,17 @@ def getData(data_, num):
     content, contentWithTag, id_list = getContent(href, publishDate, num)
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
         'author': '',
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
@@ -163,7 +198,6 @@ def doJob():
         time.sleep(2)
 if __name__ == '__main__':
     doJob()
     baseCore.close()
@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '海南省人民政府'
+topic = 'research_center_fourth'
+webname = '海南省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
@@ -108,14 +108,17 @@ def getData(div, num):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': [],
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
-#coding=utf-8
+# coding=utf-8
 import os
 import time
@@ -10,14 +10,14 @@ baseCore = BaseCore.BaseCore()
 log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '黑龙江省人民政府'
+topic = 'research_center_fourth'
+webname = '黑龙江省人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded',
-    'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
+    'Token': 'b946cd4e-77a4-42f5-bcaf-a9c4f26b5191',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }
@@ -26,11 +26,12 @@ def getDataJson():
     ip = baseCore.get_proxy()
     url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
     data_post = {
-        'sort': 'smartIndex',
-        'order': 'asc',
+        'sort': 'date',
+        'order': 'desc',
         'start': '0',
         'length': '20',
         'filter.all': 'REITs',
+        'filter.tyoe': '0'
     }
     req = requests.post(url, headers=headers, data=data_post, proxies=ip)
     req.encoding = req.apparent_encoding
@@ -54,7 +55,7 @@ def getFjContent(url):
     return req.content
-def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
+def getContent(num, title, publishDate, summary, id, pub_hao, organ, type):
     id_list = []
     url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
     writtenDate = None
@@ -83,7 +84,7 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
         fj_title = fj_title.replace('<', '').replace('>', '')
         if category not in fj_title:
             fj_title = fj_title + category
-        att_id, full_path = policy.attuributefile(fj_title,href,num,publishDate)
+        att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
         if att_id:
             id_list.append(att_id)
         a['href'] = full_path
@@ -104,14 +105,17 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
     content = soup.text.lstrip().strip()
     contentWithTag_str = str(soup)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953' + str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
@@ -135,7 +139,6 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
 def doJob():
     num = 1
     data_json = getDataJson()
     for data_ in data_json:
@@ -152,7 +155,7 @@ def doJob():
             organ = data_['unitShowName']
         except:
             organ = ''
-        data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
+        data = getContent(num, title, publishDate, summary, id, pub_hao, organ, type)
         # data_list.append(data)
         num += 1
         time.sleep(3)
...
@@ -21,8 +21,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '湖北省人民政府'
+topic = 'research_center_fourth'
+webname = '湖北省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }
@@ -114,14 +114,17 @@ def getData(driver, data_, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
@@ -145,7 +148,8 @@ def getData(driver, data_, num):
 def doJob():
-    service = Service(r'D:/soft/geckodriver.exe')
+    # service = Service(r'D:/soft/geckodriver.exe')
+    service = Service(r'F:\spider\firefox\geckodriver_1.exe')
     options = Options()
     options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
     driver = webdriver.Firefox(options=options, service=service)
...
@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '江苏省人民政府'
+topic = 'research_center_fourth'
+webname = '江苏省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
@@ -85,14 +85,17 @@ def getContentA(url, num, publishDate, title, origin, summary):
     content = contentWithTag.text
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
@@ -163,13 +166,16 @@ def getContentB(url, num, publishDate, title, origin, summary):
     content = contentWithTag.text.lstrip().strip()
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -16,7 +16,7 @@ policy = Policy()
 topic = 'research_center_fourth'
-webname = '江西省人民政府'
+webname = '江西省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'X-Requested-With': 'XMLHttpRequest',
...
@@ -14,8 +14,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '吉林市人民政府'
+topic = 'research_center_fourth'
+webname = '吉林市人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
@@ -155,14 +155,17 @@ def getData(num, title, url, origin, publishDate, summary):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '辽宁省人民政府'
+topic = 'research_center_fourth'
+webname = '辽宁省人民政府_'
 headers = {
     'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
 }
@@ -63,14 +63,17 @@ def doJob():
         content = contentWithTag.text.lstrip().strip()
         contentWithTag_str = str(contentWithTag)
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        lang = baseCore.detect_language(content)
         dic_info = {
            'attachmentIds': [],
+           'subjectId': '1729315113088765953',
+           'lang': lang,
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag_str,
            'deleteFlag': 0,
            'checkStatus': 1,
-           'id': '',
+           'id': '1729315113088765953'+str(int(time.time())),
            'title': title,
            'publishDate': publishDate,
            'origin': '辽宁省人民政府',
...
@@ -15,8 +15,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '内蒙古自治区人民政府'
+topic = 'research_center_fourth'
+webname = '内蒙古自治区人民政府_'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate, br',
@@ -188,14 +188,17 @@ def getContent(num, data):
         return
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -11,8 +11,8 @@ log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '山东省人民政府'
+topic = 'research_center_fourth'
+webname = '山东省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
     'X-Requested-With': 'XMLHttpRequest',
@@ -131,14 +131,17 @@ def getData(soup, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -17,8 +17,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '上海市人民政府'
+topic = 'research_center_fourth'
+webname = '上海市人民政府_'
 headers = {
     'Accept': '*/*',
     'Accept-Encoding': 'gzip, deflate, br',
@@ -111,14 +111,17 @@ def getData(data_, driver, num):
     #               fjhref_list]
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -14,8 +14,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '山西省人民政府'
+topic = 'research_center_fourth'
+webname = '山西省人民政府_'
 headers = {
     'Accept': 'application/json, text/plain, */*',
     'Accept-Encoding': 'gzip, deflate',
@@ -130,14 +130,17 @@ def getContent(num, data):
         a['href'] = full_path
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -14,8 +14,8 @@ log = baseCore.getLogger()
 from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '四川省人民政府'
+topic = 'research_center_fourth'
+webname = '四川省人民政府_'
 headers = {
     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
@@ -106,14 +106,17 @@ def getData(data_, num):
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -16,8 +16,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '天津市人民政府'
+topic = 'research_center_fourth'
+webname = '天津市人民政府_'
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -137,14 +137,17 @@ def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
     content = contentWithTag.text.lstrip().strip()
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': pub_time,
         'origin': origin,
...
@@ -19,8 +19,8 @@ from reits import Policy
 policy = Policy()
-topic = 'policy'
-webname = '云南省人民政府'
+topic = 'research_center_fourth'
+webname = '云南省人民政府_'
 headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 }
@@ -149,14 +149,17 @@ def getData(div, num):
     content, contentWithTag, id_list = getContent(href, publishDate, num)
     contentWithTag_str = str(contentWithTag)
     time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    lang = baseCore.detect_language(content)
     dic_info = {
         'attachmentIds': id_list,
+        'subjectId': '1729315113088765953',
+        'lang': lang,
         'author': '',
         'content': content,
         'contentWithTag': contentWithTag_str,
         'deleteFlag': 0,
         'checkStatus': 1,
-        'id': '',
+        'id': '1729315113088765953'+str(int(time.time())),
         'title': title,
         'publishDate': publishDate,
         'origin': origin,
...
@@ -16,8 +16,8 @@ headers = {
     'X-Requested-With': 'XMLHttpRequest',
 }
-topic = 'policy'
-webname = '浙江省人民政府'
+topic = 'research_center_fourth'
+webname = '浙江省人民政府_'
 class Policy():
     def getrequest_soup(self, headers, url):
@@ -502,14 +502,17 @@ def getDatas(page):
             continue
         num += 1
         time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        lang = baseCore.detect_language(content)
         dic_info = {
            'attachmentIds': id_list,
+           'subjectId': '1729315113088765953',
+           'lang': lang,
            'author': '',
            'content': content,
            'contentWithTag': contentWithTag,
            'deleteFlag': 0,
            'checkStatus': 1,
-           'id': '',
+           'id': '1729315113088765953'+str(int(time.time())),
            'title': title,
            'publishDate': publishDate,
            'origin': origin,
...
@@ -42,10 +42,12 @@ class Policy():
         return result
     def createDriver(self):
-        chrome_driver = r'D:\cmd100\chromedriver.exe'
+        # chrome_driver = r'D:\cmd100\chromedriver.exe'
+        chrome_driver = r'F:\spider\cmd100\chromedriver.exe'
         path = Service(chrome_driver)
         chrome_options = webdriver.ChromeOptions()
-        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+        # chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+        chrome_options.binary_location = r'F:\spider\85\Google\Chrome\Application\chrome.exe'
         # 设置代理
         # proxy = "127.0.0.1:8080" # 代理地址和端口
         # chrome_options.add_argument('--proxy-server=http://' + proxy)
...
 import datetime
@@ -250,6 +250,7 @@ def doJob(obsOperate):
             continue
         att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
         num += 1
+        createDate = datetime.datetime.now().strftime('%Y-%m-%d')
         dic_info = {
             'code': code,  # 代码
             'name': name,  # 基金名称
@@ -260,6 +261,7 @@ def doJob(obsOperate):
             'date': date,  # 时间(datetime 类型)
             'strDate': str(date)[:10],  # 时间(字符串类型)
             'exchange': '香港交易所',  # 交易所
+            'createDate': createDate  # 创建时间
         }
         db_storage.insert_one(dic_info)
         log.info(f'{code}==={title}===采集成功')
...
+import re
+import fitz
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+import numpy as np
+from base import BaseCore
+from requests.models import Response
+baseCore = BaseCore.BaseCore()
+log = baseCore.getLogger()
+headers = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
+    'Cache-Control': 'no-cache',
+    'Pragma': 'no-cache',
+    'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
+    'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+    'Sec-Ch-Ua-Mobile': '?0',
+    'Sec-Ch-Ua-Platform': '"Windows"',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'same-origin',
+    'Sec-Fetch-User': '?1',
+    'Upgrade-Insecure-Requests': '1',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
+}
+def getSoup(url):
+    ip = baseCore.get_proxy()
+    req = requests.get(url, headers=headers, proxies=ip)
+    req.encoding = req.apparent_encoding
+    soup = BeautifulSoup(req.text, 'html.parser')
+    return soup
+def getPageSize():
+    # url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
+    url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=1'
+    soup = getSoup(url)
+    total = int(re.findall('\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
+    if total % 4 == 0:
+        pageSize = int(total / 4)
+    else:
+        pageSize = int(total / 4) + 1
+    return pageSize
+def getContent(url):
+    content = ''
+    ip = baseCore.get_proxy()
+    req = requests.get(url, headers=headers, proxies=ip)
+    # req.encoding = req.apparent_encoding
+    with fitz.open(stream=req.content, filetype='pdf') as doc:
+        page_size = doc.page_count
+        for page in doc.pages():
+            content += page.get_text()
+    return content
+def doJob():
+    num = 1
+    data_list = []
+    pageSize = getPageSize()
+    for page in range(1, pageSize + 1):
+        # url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
+        url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=1'
+        soup = getSoup(url)
+        div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
+        for div in div_list:
+            fjtitle_list = ''
+            fjhref_list = ''
+            name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
+            summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
+            href = div.find('a', class_='zhuanyuan_xinxi').get('href')
+            origin = '戴德梁兴'
+            try:
+                content = getContent(href)
+                # print(content)
+            except Exception as e:
+                log.error(f'第{page}页==={name}===连接失败')
+                continue
+            title = name.replace('/',' ').replace('|',' ').replace('?',' ').replace('"','”')
+if __name__ == '__main__':
+    doJob()
+    baseCore.close()