Commit b3fa91e8 Author: LiuLiYuan

REITs policies and regulations 03/21

Parent 88209302
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '北京市人民政府'
+topic = 'research_center_fourth'
+webname = '北京市人民政府_'
class Policy1():
@retry(tries=3, delay=10)
......@@ -282,14 +282,17 @@ def beijing():
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if content == '':
continue
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang':lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -312,6 +315,6 @@ def beijing():
time.sleep(random.randint(10, 20))
num += 1
-# if __name__ == '__main__':
-# beijing()
-# baseCore.close()
+if __name__ == '__main__':
+beijing()
+baseCore.close()
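
Every spider touched by this commit replaces the empty 'id' with the subjectId concatenated to the current Unix timestamp in whole seconds. A minimal sketch of that scheme, plus a hypothetical millisecond variant (not part of this commit) that would make collisions between records minted in the same second less likely:

import time

SUBJECT_ID = '1729315113088765953'

def make_record_id():
    # committed scheme: subjectId + whole-second Unix timestamp
    return SUBJECT_ID + str(int(time.time()))

def make_record_id_ms():
    # hypothetical variant: same prefix, millisecond resolution
    return SUBJECT_ID + str(int(time.time() * 1000))

Two records built within the same second would share an id under the first form; in practice the random 10-20 s sleeps between articles make that unlikely here.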
import json
import time
import os
......@@ -12,8 +13,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '福建省人民政府'
+topic = 'research_center_fourth'
+webname = '福建省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......@@ -66,6 +67,7 @@ def getContent(num, url, publishDate):
style.decompose()
except:
pass
+try:
a_list = soup.find('div', class_='xl_list1').find_all('a')
for a in a_list:
fj_href = a.get('href')
......@@ -82,6 +84,8 @@ def getContent(num, url, publishDate):
if att_id:
id_list.append(att_id)
a['href'] = full_path
+except:
+pass
content = contentWithTag.text.lstrip().strip()
......@@ -116,7 +120,10 @@ def doJob():
for data_post in data_posts:
data_json = getDataJson(data_post)
for data_ in data_json:
+try:
title = data_['_doctitle']
+except:
+title = data_['doctitle']
publishDate = data_['crtime'].replace('.','-')
origin = data_['docsourcename']
href = data_['docpuburl']
......@@ -142,14 +149,17 @@ def doJob():
content, contentWithTag, id_list = getContent(num, href, publishDate[:10])
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
contentWithTag_str = str(contentWithTag)
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
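Each dic_info also gains a 'lang' field from baseCore.detect_language(content). That helper is not shown in this diff; a stand-in built on the langdetect package might look like the following (the package choice and the empty-string fallback are assumptions):

from langdetect import detect, LangDetectException

def detect_language(content):
    # hypothetical stand-in for baseCore.detect_language
    try:
        return detect(content)  # e.g. 'zh-cn', 'en'
    except LangDetectException:
        return ''  # empty or undetectable text
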
......@@ -20,7 +20,7 @@ policy = Policy()
topic = 'research_center_fourth'
-webname = '广东省人民政府'
+webname = '广东省人民政府_'
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
......@@ -225,10 +225,4 @@ def doJob():
if __name__ == '__main__':
doJob()
-# doJob_1()
-# doJob_2(2)
-# url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
-# soup = getSoup(url)
-#
-# print(contentWithTag)
baseCore.close()
......@@ -12,11 +12,11 @@ baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '广西壮族自治区人民政府'
+topic = 'research_center_fourth'
+webname = '广西壮族自治区人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/json',
......@@ -41,11 +41,27 @@ def getFjContent(url):
def getTotal():
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
# data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
# "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
# "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
# "pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
# "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+data_post = {'advancedFilters': None,
+'appendixType': "",
+'code': "181aedaa542",
+'configCode': "",
+'dataTypeId': "241",
+'filters': [],
+'granularity': "ALL",
+'historySearchWords': [],
+'isAdvancedSearch': None,
+'isDefaultAdvanced': None,
+'isSearchForced': "0",
+'orderBy': "related",
+'pageNo': 1,
+'pageSize': 10,
+'searchBy': "all",
+'searchWord': "REITs", }
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -55,11 +71,27 @@ def getTotal():
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
# data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
# "sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
# "searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
# "pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
# "advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
+data_post = {'advancedFilters': None,
+'appendixType': "",
+'code': "181aedaa542",
+'configCode': "",
+'dataTypeId': "241",
+'filters': [],
+'granularity': "ALL",
+'historySearchWords': [],
+'isAdvancedSearch': None,
+'isDefaultAdvanced': None,
+'isSearchForced': "0",
+'orderBy': "related",
+'pageNo': page,
+'pageSize': 10,
+'searchBy': "all",
+'searchWord': "REITs", }
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -117,14 +149,17 @@ def getData(data_, num):
content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'author': '',
'subjectId': '1729315113088765953',
+'lang': lang,
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953' + str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -163,7 +198,6 @@ def doJob():
time.sleep(2)
if __name__ == '__main__':
doJob()
baseCore.close()
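
After this rewrite, getTotal and getDataJson build identical payloads except for 'pageNo', and the old dict's 'sign' field and stray duplicate key "advancedFilters " (note the trailing space) are gone. A shared builder would remove the remaining duplication; a refactor sketch, not part of the commit:

import json

def build_search_payload(page):
    # same keys and values as the two committed dicts; only pageNo varies
    payload = {'advancedFilters': None,
               'appendixType': "",
               'code': "181aedaa542",
               'configCode': "",
               'dataTypeId': "241",
               'filters': [],
               'granularity': "ALL",
               'historySearchWords': [],
               'isAdvancedSearch': None,
               'isDefaultAdvanced': None,
               'isSearchForced': "0",
               'orderBy': "related",
               'pageNo': page,
               'pageSize': 10,
               'searchBy': "all",
               'searchWord': "REITs"}
    return json.dumps(payload)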
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '海南省人民政府'
+topic = 'research_center_fourth'
+webname = '海南省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -108,14 +108,17 @@ def getData(div, num):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': [],
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
-#coding=utf-8
+# coding=utf-8
import os
import time
......@@ -10,14 +10,14 @@ baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '黑龙江省人民政府'
+topic = 'research_center_fourth'
+webname = '黑龙江省人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
-'Token': '9a9ff46e-f534-43b8-bad1-063d80af7e51',
+'Token': 'b946cd4e-77a4-42f5-bcaf-a9c4f26b5191',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -26,11 +26,12 @@ def getDataJson():
ip = baseCore.get_proxy()
url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
data_post = {
-'sort': 'smartIndex',
-'order': 'asc',
+'sort': 'date',
+'order': 'desc',
'start': '0',
'length': '20',
'filter.all': 'REITs',
+'filter.tyoe': '0'
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
......@@ -54,7 +55,7 @@ def getFjContent(url):
return req.content
-def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
+def getContent(num, title, publishDate, summary, id, pub_hao, organ, type):
id_list = []
url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
writtenDate = None
......@@ -83,7 +84,7 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
-att_id, full_path = policy.attuributefile(fj_title,href,num,publishDate)
+att_id, full_path = policy.attuributefile(fj_title, href, num, publishDate)
if att_id:
id_list.append(att_id)
a['href'] = full_path
......@@ -104,14 +105,17 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
content = soup.text.lstrip().strip()
contentWithTag_str = str(soup)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953' + str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -135,7 +139,6 @@ def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
def doJob():
num = 1
data_json = getDataJson()
for data_ in data_json:
......@@ -152,7 +155,7 @@ def doJob():
organ = data_['unitShowName']
except:
organ = ''
-data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
+data = getContent(num, title, publishDate, summary, id, pub_hao, organ, type)
# data_list.append(data)
num += 1
time.sleep(3)
......
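The Heilongjiang query now sorts by 'date' descending instead of 'smartIndex' ascending, and adds a 'filter.tyoe' parameter (spelled as committed; possibly a typo for 'filter.type'). The 'start'/'length' fields read like offset paging, so later pages would presumably be fetched by advancing 'start'; a hedged sketch:

import requests

URL = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'

def fetch_page(start, length=20, headers=None, proxies=None):
    # newest-first ordering as committed; offset paging is an assumption
    data = {
        'sort': 'date',
        'order': 'desc',
        'start': str(start),
        'length': str(length),
        'filter.all': 'REITs',
        'filter.tyoe': '0',  # parameter name exactly as committed
    }
    req = requests.post(URL, headers=headers, data=data, proxies=proxies)
    req.encoding = req.apparent_encoding
    return req.json()  # assuming a JSON body, as getDataJson's iteration suggests
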
......@@ -21,8 +21,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '湖北省人民政府'
+topic = 'research_center_fourth'
+webname = '湖北省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -114,14 +114,17 @@ def getData(driver, data_, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -145,7 +148,8 @@ def getData(driver, data_, num):
def doJob():
-service = Service(r'D:/soft/geckodriver.exe')
+# service = Service(r'D:/soft/geckodriver.exe')
+service = Service(r'F:\spider\firefox\geckodriver_1.exe')
options = Options()
options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
driver = webdriver.Firefox(options=options, service=service)
......
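The Hubei change only swaps one hard-coded geckodriver path for another. Passing the path in as a parameter would avoid accumulating commented-out leftovers; a sketch using the same Selenium 4 calls as the diff:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def create_firefox(driver_path):
    # driver_path is the machine-specific bit (D:/soft/... vs F:\spider\... above)
    options = Options()
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
    return webdriver.Firefox(options=options, service=Service(driver_path))
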
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '江苏省人民政府'
+topic = 'research_center_fourth'
+webname = '江苏省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -85,14 +85,17 @@ def getContentA(url, num, publishDate, title, origin, summary):
content = contentWithTag.text
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......@@ -163,13 +166,16 @@ def getContentB(url, num, publishDate, title, origin, summary):
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,7 +16,7 @@ policy = Policy()
topic = 'research_center_fourth'
-webname = '江西省人民政府'
+webname = '江西省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......
......@@ -14,8 +14,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '吉林市人民政府'
+topic = 'research_center_fourth'
+webname = '吉林市人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -155,14 +155,17 @@ def getData(num, title, url, origin, publishDate, summary):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '辽宁省人民政府'
+topic = 'research_center_fourth'
+webname = '辽宁省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
......@@ -63,14 +63,17 @@ def doJob():
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': [],
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': '辽宁省人民政府',
......
......@@ -15,8 +15,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '内蒙古自治区人民政府'
+topic = 'research_center_fourth'
+webname = '内蒙古自治区人民政府_'
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -188,14 +188,17 @@ def getContent(num, data):
return
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -11,8 +11,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '山东省人民政府'
+topic = 'research_center_fourth'
+webname = '山东省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
......@@ -131,14 +131,17 @@ def getData(soup, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -17,8 +17,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '上海市人民政府'
+topic = 'research_center_fourth'
+webname = '上海市人民政府_'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -111,14 +111,17 @@ def getData(data_, driver, num):
# fjhref_list]
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -14,8 +14,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '山西省人民政府'
+topic = 'research_center_fourth'
+webname = '山西省人民政府_'
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate',
......@@ -130,14 +130,17 @@ def getContent(num, data):
a['href'] = full_path
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -14,8 +14,8 @@ log = baseCore.getLogger()
from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '四川省人民政府'
+topic = 'research_center_fourth'
+webname = '四川省人民政府_'
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
......@@ -106,14 +106,17 @@ def getData(data_, num):
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,8 +16,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '天津市人民政府'
+topic = 'research_center_fourth'
+webname = '天津市人民政府_'
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
......@@ -137,14 +137,17 @@ def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
content = contentWithTag.text.lstrip().strip()
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': pub_time,
'origin': origin,
......
......@@ -19,8 +19,8 @@ from reits import Policy
policy = Policy()
-topic = 'policy'
-webname = '云南省人民政府'
+topic = 'research_center_fourth'
+webname = '云南省人民政府_'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
......@@ -149,14 +149,17 @@ def getData(div, num):
content, contentWithTag, id_list = getContent(href, publishDate, num)
contentWithTag_str = str(contentWithTag)
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag_str,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -16,8 +16,8 @@ headers = {
'X-Requested-With': 'XMLHttpRequest',
}
-topic = 'policy'
-webname = '浙江省人民政府'
+topic = 'research_center_fourth'
+webname = '浙江省人民政府_'
class Policy():
def getrequest_soup(self, headers, url):
......@@ -502,14 +502,17 @@ def getDatas(page):
continue
num += 1
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+lang = baseCore.detect_language(content)
dic_info = {
'attachmentIds': id_list,
'subjectId': '1729315113088765953',
+'lang': lang,
'author': '',
'content': content,
'contentWithTag': contentWithTag,
'deleteFlag': 0,
+'checkStatus': 1,
-'id': '',
+'id': '1729315113088765953'+str(int(time.time())),
'title': title,
'publishDate': publishDate,
'origin': origin,
......
......@@ -42,10 +42,12 @@ class Policy():
return result
def createDriver(self):
-chrome_driver = r'D:\cmd100\chromedriver.exe'
+# chrome_driver = r'D:\cmd100\chromedriver.exe'
+chrome_driver = r'F:\spider\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
-chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+# chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+chrome_options.binary_location = r'F:\spider\85\Google\Chrome\Application\chrome.exe'
# set up a proxy
# proxy = "127.0.0.1:8080"  # proxy address and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
......
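createDriver gets the same treatment: both machine-specific Windows paths are swapped rather than parameterised. A sketch with the paths (and the commented-out proxy) passed in; the signature is hypothetical:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def create_driver(driver_path, binary_path, proxy=None):
    options = webdriver.ChromeOptions()
    options.binary_location = binary_path  # path to chrome.exe
    if proxy:  # e.g. '127.0.0.1:8080', per the commented-out lines
        options.add_argument('--proxy-server=http://' + proxy)
    return webdriver.Chrome(service=Service(driver_path), options=options)
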
import datetime
......@@ -250,6 +250,7 @@ def doJob(obsOperate):
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
num += 1
+createDate = datetime.datetime.now().strftime('%Y-%m-%d')
dic_info = {
'code': code, # code
'name': name, # fund name
......@@ -260,6 +261,7 @@ def doJob(obsOperate):
'date': date, # date (datetime type)
'strDate': str(date)[:10], # date (string type)
'exchange': '香港交易所', # exchange
+'createDate':createDate # creation time
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={title}===采集成功')
......
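The new 'createDate' field records the insertion day as 'YYYY-MM-DD', matching the format 'strDate' already uses; a quick check that the two idioms in this file agree:

import datetime

now = datetime.datetime.now()
create_date = now.strftime('%Y-%m-%d')  # how 'createDate' is built
str_date = str(now)[:10]                # how 'strDate' is built from 'date'
assert create_date == str_date          # both yield 'YYYY-MM-DD'
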
import re
import fitz
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize():
# url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=1'
soup = getSoup(url)
total = int(re.findall(r'\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
if total % 4 == 0:
pageSize = int(total / 4)
else:
pageSize = int(total / 4) + 1
return pageSize
def getContent(url):
content = ''
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
# req.encoding = req.apparent_encoding
with fitz.open(stream=req.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
content += page.get_text()
return content
def doJob():
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
# url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=1'
soup = getSoup(url)
div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
for div in div_list:
fjtitle_list = ''
fjhref_list = ''
name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
href = div.find('a', class_='zhuanyuan_xinxi').get('href')
origin = '戴德梁行'
try:
content = getContent(href)
# print(content)
except Exception as e:
log.error(f'第{page}页==={name}===连接失败')
continue
title = name.replace('/',' ').replace('|',' ').replace('?',' ').replace('"','”')
if __name__ == '__main__':
doJob()
baseCore.close()
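
getPageSize's if/else over total % 4 is ceiling division (four reports per listing page). math.ceil says the same thing in one line; a small equivalence check:

import math

def page_count(total, per_page=4):
    # equivalent to the committed if/else: round total/per_page up
    return math.ceil(total / per_page)

assert page_count(7) == 2 and page_count(8) == 2 and page_count(9) == 3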