Commit 574620c3  Author: LiuLiYuan

REITs topic 11/28

Parent 5d788bc9
import json
import re
import time
import numpy as np
import pandas as pd
import requests
import os
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
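# Script overview (SSE REITs announcement downloader): pages through the SSE
# REITS_BULLETIN query API, collects announcement metadata, downloads each PDF
# into ./公告_2/<code>-<name>/ and writes a summary Excel file.
# BaseCore is an in-house helper not shown here; it is assumed that get_proxy()
# returns a requests-style proxies dict and getLogger() returns a logger.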
# Fetch JSON data
def getJson(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
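# Note: the SSE endpoint returns JSONP, i.e. the JSON body is wrapped in a callback
# such as jsonpCallback42283(...). The regex above strips that wrapper so json.loads()
# can parse the payload; dropping the jsonCallBack parameter from the URL may make the
# endpoint return plain JSON, though that is not relied on here.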
# Get the total number of pages
def getTotal():
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# Get basic metadata for the announcement PDFs
def getDataList(page):
info_list = []
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)['result']
for data in data_json:
name = data['fundExtAbbr']
title = data['title']
pub_time = data['sseDate']
code = data['securityCode']
href = 'http://www.sse.com.cn' + data['url'].replace('\\', '')
info_list.append([title, pub_time, href, name, code])
return info_list
# Download the PDF file as a byte stream
def getContent(href):
ip = baseCore.get_proxy()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
content = req.content
req.close()
return content
def doJob():
data_list = []
total = getTotal()
for page in range(1, total + 1):
info_list = getDataList(page)
for info in info_list:
title = info[0]
pub_time = info[1]
href = info[2]
name = info[3]
code = info[4]
data_list.append([code,name,title,pub_time,href,'上海交易所','http://www.sse.com.cn/reits/announcements/'])
try:
content = getContent(href)
except:
log.error(f'第{page}页==={title}===连接失败')
continue
file = f'./公告_2/{code}-{name}/{title}-{pub_time}.pdf'
# num = 2
# while True:
# flg = os.path.isfile(file)
# if flg:
# print(f'{title}===有重名')
# file = f'./公告/{code}-{name}/{title}-{num}.pdf'
# num += 1
# else:
# break
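# Note: the nested try/except below creates the target directories on demand the
# first time a write fails. An equivalent, simpler approach would be
#   os.makedirs(f'./公告_2/{code}-{name}', exist_ok=True)
# before opening the file; the original control flow is kept unchanged here.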
try:
try:
with open(file, 'wb') as f:
f.write(content)
except:
try:
os.mkdir(f'./公告_2/{code}-{name}')
with open(file, 'wb') as f:
f.write(content)
except:
os.mkdir(f'./公告_2')
os.mkdir(f'./公告_2/{code}-{name}')
with open(file, 'wb') as f:
f.write(content)
log.info(f'{title}===成功')
except:
log.error(f'第{page}页==={title}===保存失败')
df = pd.DataFrame(np.array(data_list))
df.columns = ['公募REITs代码','扩位简称','公告标题','披露日期','公告网址','来源','来源网址']
df.to_excel('./上海交易所信息披露.xlsx',index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
import numpy as np
import pandas as pd
import requests
import os
import json
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/disc/announcement/annList'
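# Script overview (SZSE fund announcement downloader): pages through the SZSE annList
# API per fund code, saves each announcement PDF under ./市场板块/基金公告_2/<code>-<name>/,
# and exports the collected metadata to Excel.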
# Get the list of fund codes
def getCodeList():
code_list = []
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()[0]['data']
for data_ in data_list:
code = re.findall('<u>(.*?)</u>', data_['sys_key'])[0]
code_list.append(code)
return code_list
# Get the total number of pages
def getPageSize(id):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": 1}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['announceCount'])
if total % 50 == 0:
pageSize = int(total / 50)
else:
pageSize = int(total / 50) + 1
return pageSize
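# Note: the if/else above is ceiling division of announceCount by the page size of 50;
# an equivalent one-liner would be pageSize = -(-total // 50).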
# Fetch JSON data
def getDataList(id, page):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": page}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()['data']
return data_list
# Download the PDF file as a byte stream
def getContent(href):
ip = baseCore.get_proxy()
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
content = req.content
return content
def doJob():
if not os.path.exists('./市场板块/基金公告_2'):
os.makedirs('./市场板块/基金公告_2')
info_list = []
code_list = getCodeList()
for code in code_list:
pageSize = getPageSize(code)
for page in range(1, pageSize + 1):
data_list = getDataList(code, page)
for data in data_list:
title = data['title']
name = data['secName'][0]
if not os.path.exists(f'./市场板块/基金公告_2/{code}-{name}'):
os.makedirs(f'./市场板块/基金公告_2/{code}-{name}')
pub_time = data['publishTime']
href = 'http://www.szse.cn/api/disc/info/download?id=' + data['id']
info = [code, name, title, pub_time, href, '深圳交易所', 'http://www.szse.cn/disclosure/index.html']
info_list.append(info)
content = getContent(href)
file = rf'./市场板块/基金公告_2/{code}-{name}/{title}-{pub_time[:10]}.pdf'
if os.path.exists(file):
log.info(f'{title}===已采集')
time.sleep(3)
continue
try:
with open(file, 'wb') as f:
f.write(content)
log.info(f'{title}===成功')
except Exception as e:
log.error(f'第{page}页==={title}===失败')
time.sleep(2)
df = pd.DataFrame(np.array(info_list))
df.columns = ['证券代码', '证券简称', '公告标题', '发布时间', '公告网址', '来源', '来源网址']
df.to_excel('./市场板块/深圳交易所基金公告_2.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pymongo
import requests
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心[
'REITs基金行情-深圳']
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
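# Script overview (SZSE REITs daily quotes collector): walks each fund's history from
# its listing date in 5-day windows via the SZSE 1815_stock_snapshot report and stores
# one document per trading day in the MongoDB collection "REITs基金行情-深圳",
# skipping days that are already present.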
# Get fund codes and listing dates
@retry(tries=3, delay=3)
def getData():
data_list = []
ip = baseCore.get_proxy()
url = 'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_fund_list&PAGENO=1&PAGESIZE=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data']
for data_ in data_json:
jjjcurl = re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0].lstrip().strip()
sys_key = data_['sys_key'].lstrip().strip()
ssrq = data_['ssrq'].lstrip().strip()
# fund short name, fund code, listing date
data = [jjjcurl, sys_key, ssrq]
data_list.append(data)
return data_list
# Get basic fund information
@retry(delay=5)
def getInfoList():
code_list = []
url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_list = req.json()[0]['data']
for data_ in data_list:
# data = {
# '基金代码': re.findall('<u>(.*?)</u>', data_['sys_key'])[0],
# '基金简称': re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0],
# '基金类别': data_['jjlb'],
# '投资类别': data_['tzlb'],
# '上市日期': data_['ssrq'],
# '当前规模(万份)': data_['dqgm'],
# '基金管理人': data_['glrmc'],
# '最新基金净值': data_['cxjqhq'],
# }
data = [re.findall('<u>(.*?)</u>', data_['sys_key'])[0], re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0],
data_['jjlb'], data_['tzlb'], data_['ssrq'], data_['dqgm'], data_['glrmc'], data_['cxjqhq'], ]
name_list = ['基金代码', '基金简称', '基金类别', '投资类别', '上市日期', '当前规模(万份)', '基金管理人', '最新基金净值']
code_list.append(data)
return code_list
# Get fund trading data
@retry(tries=5, delay=20)
def getDataList(code, start_date, end_date):
ip = baseCore.get_proxy()
url = f'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1815_stock_snapshot&TABKEY=tab2&txtDMorJC={code}&txtBeginDate={str(start_date)[:10]}&txtEndDate={str(end_date)[:10]}&archiveDate=2021-11-01'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data'][::-1]
req.close()
for data_ in data_json:
jyrq = data_['jyrq']
zqdm = data_['zqdm']
zqjc = data_['zqjc']
qss = data_['qss']
ks = data_['ks']
zg = data_['zg']
zd = data_['zd']
ss = data_['ss']
sdf = data_['sdf']
cjgs = data_['cjgs']
cjje = data_['cjje']
syl1 = data_['syl1']
is_insert = db_storage.find_one({'code': code, 'date': jyrq, 'exchange': '深圳证券交易所'})
if is_insert:
log.info(f'{code}==={jyrq}===已采集')
continue
dic_info = {
'code': zqdm, # 代码
'shortName': zqjc, # 简称
'opening': float(ks), # 开盘价
'max': float(zg), # 最高价
'min': float(zd), # 最低价
'closed': float(ss), # 收盘价
'beforeClosed': float(qss), # 前收价
'volume': cjgs, # 交易量
'amount': cjje, # 交易金额
'date': jyrq, # 时间
'country': '中国', # 国家
'exchange': '深圳证券交易所' # 交易所
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={jyrq}===采集成功')
time.sleep(1)
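# Note: de-duplication above relies on a Mongo lookup keyed on code + date + exchange,
# so the script can be re-run without inserting duplicate rows.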
def doJob():
data_list = getData()
log.info('开始采集')
for data in data_list:
# getData() returns [short name, code, listing date]
name = data[0]
code = data[1]
log.info(f'{code}==={name}===开始采集')
start_date = data[2]
start_date = datetime.strptime(start_date, "%Y-%m-%d")
current_date = datetime.now()
end_date = start_date + timedelta(days=5)
while end_date != current_date:
time.sleep(1)
try:
getDataList(code, start_date, end_date)
except:
log.error(f'{code}==={start_date}-{end_date}===采集失败')
start_date = end_date + timedelta(days=1)
end_date = start_date + timedelta(days=5)
if end_date > current_date:
end_date = current_date
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
# 'Referer': 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=["rules","csrcrules","szseBussrules","memorandumServicedirect","publicadvice","lawruleSearch"]&range=content&searchtype=0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
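# Script overview (SZSE rule/policy search scraper, Selenium): opens the SZSE law-rules
# search page for the keyword "REITs" with a Selenium driver (baseCore.buildDriver() is
# assumed to return a configured webdriver), downloads each attachment into
# ./相关政策/深圳证券交易所/政策文件/, and exports the results to Excel.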
def paserUrl(html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
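# Note: paserUrl() rewrites relative href/src attributes into absolute URLs with
# urllib.parse.urljoin, so attachment links extracted later can be requested directly.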
def getFjContent(url):
ip = baseCore.get_proxy()
session = requests.session()
session.get('http://www.szse.cn/',headers=headers,proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
content = req.content
session.close()
return content
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
soup = paserUrl(soup, 'http://www.szse.cn/')
contentWithTag = soup.find('div', class_='des-content')
a_list = contentWithTag.find_all('a')
num_ = 1
for a in a_list:
fj_href = a.get('href')
if not fj_href:
continue
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category,f'-{num_}{category}')
num_ += 1
file = f'./相关政策/深圳证券交易所/政策文件/{fj_title}'
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
pub_hao = contentWithTag.find('p').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return pub_hao, content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/深圳证券交易所/政策文件'):
os.makedirs('./相关政策/深圳证券交易所/政策文件')
url = 'http://www.szse.cn/lawrules/search/index.html?rulekeyword=REITs&channelCode=%5B%22rules%22,%22csrcrules%22,%22szseBussrules%22,%22memorandumServicedirect%22,%22publicadvice%22,%22lawruleSearch%22%5D&range=content&searchtype=0'
driver = baseCore.buildDriver()
driver.get(url)
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'article-item'))
)
div_list = driver.find_elements(By.CLASS_NAME, 'article-item')
num = 0
data_list = []
for div in div_list:
title = div.find_element(By.TAG_NAME, 'a').text.lstrip().strip()
href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
publishDate = div.find_element(By.CLASS_NAME, 'pull-right').text.lstrip().strip()
writtenDate = publishDate
origin = '深圳证券交易所'
organ = origin
if '.pdf' in href:
content = ''
summary = ''
fjtitle_list = title + '.pdf'
fjhref_list = href
pub_hao = ''
fjcontent = getFjContent(href)
file = f'./相关政策/深圳证券交易所/政策文件/{title}.pdf'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
else:
summary = div.find_element(By.CLASS_NAME, 'item-content').text.lstrip().strip()
pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/深圳证券交易所/深圳证券交易所政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pymongo
import requests
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心['REITs市场概况-深圳']
db_storage_ = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').研究中心['REITs基金列表']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'reits.szse.cn',
'Origin': 'https://newmedia.szse.cn',
'Pragma': 'no-cache',
'Referer': 'https://newmedia.szse.cn/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
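# Script overview (SZSE REITs market overview collector): iterates day by day from
# 2022-01-01, pulls the reits_scgk_oa daily snapshot, stores each record in MongoDB
# ("REITs市场概况-深圳"), and also writes a two-sheet Excel file (daily overview plus
# the current fund list).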
def getData():
data_list = []
ip = baseCore.get_proxy()
url = 'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_fund_list&PAGENO=1&PAGESIZE=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data']
for data_ in data_json:
jjjcurl = re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0].lstrip().strip()
sys_key = data_['sys_key'].lstrip().strip()
dqgm = data_['dqgm'].lstrip().strip()
ltgm = data_['ltgm'].lstrip().strip()
try:
glrmc = re.findall('\'>(.*?)</a>', data_['glrmc'])[0].lstrip().strip()
except:
glrmc = data_['glrmc']
tzlb = data_['tzlb'].lstrip().strip()
jjlb = data_['jjlb'].lstrip().strip()
ssrq = data_['ssrq'].lstrip().strip()[:10]
data = [jjjcurl, sys_key, dqgm, ltgm, glrmc, tzlb, jjlb, ssrq]
into_dict = {
'基金简称':jjjcurl,
'基金代码':sys_key,
'当前规模(万份)':dqgm,
'流通规模(万份)':ltgm,
'基金管理人':glrmc,
'投资类别':tzlb,
'基金类别':jjlb,
'上市日期':ssrq
}
db_storage_.insert_one(into_dict)
time.sleep(1)
data_list.append(data)
df = pd.DataFrame(np.array(data_list))
df.columns = ['基金简称', '基金代码', '当前规模(万份)', '流通规模(万份)', '基金管理人', '投资类别', '基金类别', '上市日期']
return df
@retry(tries=5,delay=10)
def getDataJson(date):
# ip = baseCore.get_proxy()
url = f'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_scgk_oa&txtQueryDate={date}'
# req = requests.get(url, headers=headers, proxies=ip)
req = requests.get(url,headers=headers)
data_json = req.json()[0]['data']
req.close()
return data_json
# 2021-06-21
def doJob():
log.info('=====开始采集=====')
start_time = time.time()
writer = pd.ExcelWriter('市场板块/深圳交易所市场概况.xlsx')
start_date = datetime(2022, 1, 1)
end_date = datetime.today()
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
data_list = []
for date in date_range:
data_json = getDataJson(date)
for data_ in data_json:
data = [data_['lbmc'], data_['zqsl'], data_['zgb'], data_['cjsl'], data_['cjje'], data_['sjzz'],str(date)]
dic = {
'产品数量(只)':data_['zqsl'],
'股份余额(万份)':data_['zgb'],
'日成交份额(万份)':data_['cjsl'],
'日成交金额(万元)':data_['cjje'],
'总市值(亿元)':data_['sjzz'],
'日期':str(date)
}
db_storage.insert_one(dic)
log.info(f'{date}===采集成功')
data_list.append(data)
time.sleep(5)
df_1 = pd.DataFrame(np.array(data_list))
df_1.columns = ['基金品种', '产品数量(只)', '股份余额(万份)', '日成交份额(万份)', '日成交金额(万元)', '总市值(亿元)','日期']
df_1.to_excel(writer, sheet_name='基础设施公募', index=False)
df_2 = getData()
df_2.to_excel(writer, sheet_name='基金列表', index=False)
writer.save()
log.info(f'=====采集结束=====耗时{baseCore.getTimeCost(start_time,time.time())}')
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import random
import re
import numpy as np
import pandas as pd
import pymongo
import requests
import time
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'RETIsProdQuot']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
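# Script overview (SSE REITs daily k-line collector): pulls up to 1000 daily k-line
# bars per fund from yunhq.sse.com.cn, enriches each bar with market-value and turnover
# figures from query.sse.com.cn, and stores one document per trading day in MongoDB
# ("RETIsProdQuot").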
# Get the list of fund codes
@retry(tries=3, delay=10)
def getCode():
code_list = []
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/reits?callback=jQuery112407214866998855156_1699360786929&select=code%2Cname%2Clast%2Cchg_rate%2Cchange%2Cvolume%2Camount%2Cprev_close%2Copen%2Chigh%2Clow%2Camp_rate%2Ccpxxtype%2Ccpxxsubtype%2Ccpxxextendname&order=code%2Case&begin=0&end=25&_={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['list']
for data in data_json:
code_list.append([data[0], data[-1]])
req.close()
return code_list
@retry(tries=4, delay=20)
def getDataJson(code):
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/dayk/{code}?callback=jQuery1124021168281852977966_1699359286492&begin=-1000&end=-1&period=day&_={int(time.time())}'
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['kline']
req.close()
return data_json
@retry(tries=5, delay=10)
def getDataB(code, date_):
# ip = baseCore.get_proxy()
date = str(date_)[:4] + '-' + str(date_)[4:6] + '-' + str(date_)[6:]
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback99984&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&TRADE_DATE={date_}&SEARCH_DAY={date}&FUND_TYPE=01&_={int(time.time())}'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json_ = re.findall(r'\((.*)\)', req.text)[0]
data_json_ = json.loads(data_json_)['result'][0]
totalValue = data_json_['TOTAL_VALUE']
negoValue = data_json_['NEGO_VALUE']
toRate = data_json_['TO_RATE']
req.close()
time.sleep(2)
return totalValue, negoValue, toRate
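# Note: getDataB() returns three supplementary fields for a given trading day:
# TOTAL_VALUE (total market value), NEGO_VALUE (negotiable market value) and
# TO_RATE (turnover rate).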
# Get fund trading data
def getData():
codes_list = getCode()
for codes in codes_list:
start_time = time.time()
code = codes[0]
name = codes[1]
log.info(f'{code}==={name}===开始采集')
try:
data_json = getDataJson(code)
del data_json[-1]
num = 1
for data_ in data_json:
year = str(data_[0])[:4]
month = str(data_[0])[4:6]
day = str(data_[0])[6:]
date = datetime.datetime(int(year), int(month), int(day))
date_ytd = date - datetime.timedelta(days=1)
if num != 1:
while True:
beforeClosed = db_storage.find_one({'code':code,'date':date_ytd,'exchange':'上海证券交易所'})
if beforeClosed:
beforeClosed = beforeClosed['closed']
break
else:
date_ytd = date_ytd - datetime.timedelta(days=1)
num += 1
else:
beforeClosed = 0
num +=1
is_insert = db_storage.find_one({'code': code, 'date': date, 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{code}==={date}===已采集')
time.sleep(1)
continue
try:
totalValue, negoValue, toRate = getDataB(code, data_[0])
except Exception as e:
log.error(e)
log.error(f'{code}==={date}===采集失败')
continue
info_dic = {
'code': code, # 代码
'shortName': name, # 简称
'opening': float(data_[1]), # 开盘价
'max': float(data_[2]), # 最高价
'min': float(data_[3]), # 最低价
'closed': float(data_[4]), # 收盘价
'ytdClosed':float(beforeClosed), # 前收价
'volume': float(data_[5]), # 交易量
'amount': float(data_[6]), # 交易金额
'totalValue': float(totalValue), # 市价总值
'negoValue': float(negoValue), # 流通总值
'toRate': float(toRate), # 换手率
'date': date, # 时间
'country': '中国', # 国家
'exchange': '上海证券交易所' # 交易所
}
db_storage.insert_one(info_dic)
time.sleep(2)
log.info(f'{date}===采集成功')
except Exception as e:
log.error(e)
log.error(f'{code}===采集失败')
time.sleep(5)
log.info(f'{code}==={name}===记录完成===耗时{baseCore.getTimeCost(start_time, time.time())}')
if __name__ == '__main__':
getData()
# getInfo()
baseCore.close()
import json
import re
import time
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from base import BaseCore
baseCore = BaseCore.BaseCore()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
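# Script overview (SSE REITs project-status exporter): lists REITs issuance projects
# (bond_type=4), resolves each project's audit status and basic attributes, and writes
# everything to 上海交易所项目动态.xlsx.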
# Fetch JSON data
def getJson(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
return data_json
# Get the total number of pages
def getTotal():
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# Get the list of fund audit IDs
def getInfoList(page):
info_list = []
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage={page}&_={int(time.time())}'
data_json = getJson(url)
data_json = data_json['result']
for data in data_json:
id = data['BOND_NUM']
type = data['REITS_TYPE']
if type == '0':
info_list.append([id,'首次发售'])
elif type == '1':
info_list.append([id,'扩募发售'])
else:
info_list.append([id,'-'])
return info_list
# Get basic project information
def getData(id,type):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback72929&isPagination=false&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX&_={time.time()}'
data_ = getJson(url)['result'][0]
# Map SSE audit status codes to their labels; unknown codes fall back to '-'
status_map = {'0': '已申报', '1': '已受理', '2': '已反馈', '3': '已接收反馈意见', '4': '通过', '5': '未通过', '8': '终止', '9': '中止', '901': '承销商/管理人超期中止', '10': '已回复交易所意见', '111': '提交注册', '12': '注册生效'}
audit_status = status_map.get(data_['AUDIT_STATUS'], '-')
if data_['BOND_TYPE'] == '4':
bond_type = '基础设施公募REITs'
else:
bond_type = '其它'
# data = {
# '公募REITs名称': data_['AUDIT_NAME'],
# '品种': bond_type,
# '发起人': data_['LIST1'],
# '管理人': data_['PRIORITY_MANAGER'],
# '专项计划名称': data_['PRIORITY_NAME'],
# '专项计划管理人': data_['LIST2'],
# '无异议函文号': data_['REG_APRV_WEN_HAO'],
# '项目状态': audit_status,
# '更新日期': data_['PUBLISH_DATE'],
# '受理日期': data_['ACCEPT_DATE'],
# }
data = [data_['AUDIT_NAME'], bond_type, data_['LIST1'], data_['PRIORITY_MANAGER'],
data_['PRIORITY_NAME'], data_['LIST2'], data_['REG_APRV_WEN_HAO'], audit_status, data_['PUBLISH_DATE'],
data_['ACCEPT_DATE'], type]
return data
def doJob():
data_list = []
total = getTotal()
for page in range(1, total + 1):
info_list = getInfoList(page)
for info in info_list:
id = info[0]
type = info[1]
data = getData(id,type)
data_list.append(data)
# break
df = pd.DataFrame(np.array(data_list))
df.columns = ['公募REITs名称', '品种', '发起人', '管理人', '专项计划名称', '专项计划管理人', '无异议函文号', '项目状态', '更新日期', '受理日期','申报类型']
df.to_excel('./上海交易所项目动态.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import time
import numpy as np
import pandas as pd
import requests
import os
import json
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json;charset=utf-8',
'Host': 'reits.szse.cn',
'Pragma': 'no-cache',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
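# Script overview (SZSE REITs project-status exporter): pages through the
# reits.szse.cn projectrends API and writes the project list to
# ./市场板块/深圳交易所项目动态.xlsx.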
def getPageSize():
ip = baseCore.get_proxy()
url = 'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex=0&pageSize=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['totalSize'])
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
def getDataJson(page):
ip = baseCore.get_proxy()
url = f'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex={page}&pageSize=10'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['data']
return data_json
def doJob():
info_list = []
pageSize = getPageSize()
for page in range(pageSize):
data_json = getDataJson(page)
for data_ in data_json:
cmpnm = data_['cmpnm']
specialPlanName = data_['specialPlanName']
issueTargetName = data_['issueTargetName']
primitiveInterestsor = data_['primitiveInterestsor']
acctfm = data_['acctfm']
sprinst = data_['sprinst']
lawfm = data_['lawfm']
biztypsbName = data_['biztypsbName']
prjst = data_['prjst']
updtdt = data_['updtdt']
acptdt = data_['acptdt']
info_list.append([cmpnm,specialPlanName,issueTargetName,primitiveInterestsor,acctfm,sprinst,lawfm,biztypsbName,prjst,updtdt,acptdt])
df = pd.DataFrame(np.array(info_list))
df.columns = ['基金名称','专项计划名称','基础设施项目类型','原始权益人','基金管理人','专项计划管理人','托管人','申报类型','审核状态','更新日期','受理日期']
df.to_excel('./市场板块/深圳交易所项目动态.xlsx',index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import json
import re
import time
import calendar
import pymongo
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsTxnStat']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
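# Script overview (SSE REITs trading statistics collector): getDayData() walks every
# calendar day since 2021-06-21 and stores the daily trading summary in MongoDB
# ("REITsTxnStat"); the weekly and monthly helpers are kept but currently disabled
# in doJob().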
@retry(tries=5, delay=20)
def getJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# 2021-06-26
# Daily overview
def getDayData():
start_date = datetime(2021, 6, 21)
end_date = datetime.today() - timedelta(days=1)
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
for date in date_range:
date_ = date.strftime('%Y-%m-%d')
is_insert = db_storage.find_one({'date': date, 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{date}===已采集')
time.sleep(1)
continue
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback89728&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_DAY_L&TRADE_DATE={date_}&FUND_TYPE=01&_={int(time.time())}'
try:
data_json = getJson(url)['result']
except Exception as e:
log.error(f'{date}===连接失败==={e}')
time.sleep(3)
continue
try:
for data_ in data_json:
dic_info = {
'number': int(data_['LIST_NUM']), # 挂牌数
'volume': float(data_['TRADE_VOL'])*10000, # 成交量
'amount': float(data_['TRADE_AMT'])*10000, # 成交金额
'totalValue': float(data_['TOTAL_VALUE'])*10000, # 市价总额
'negoValue': float(data_['NEGO_VALUE'])*10000, # 流通市值
'toRate': float(data_['TO_RATE']), # 换手率
'date': date,
'country': '中国',
'exchange': '上海证券交易所'
}
db_storage.insert_one(dic_info)
log.info(f'{date}===采集成功')
except Exception as e:
log.error(f'{date}===数据存储失败==={e}')
time.sleep(3)
# Weekly overview
def getWeekData(writer):
data_list = []
start_date = datetime(2021, 6, 21)
end_date = datetime.today()
date_range = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1, 7)]
for date_1 in date_range:
date_2 = (date_1 + timedelta(days=6)).strftime('%Y-%m-%d')
date_1 = date_1.strftime('%Y-%m-%d')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback65413&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_WEEK_L&START_DATE={date_1}&END_DATE={date_2}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], f'{date_1}至{date_2}']
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': f'{date_1}至{date_2}',
'类别': '每周概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date_1}至{date_2}===采集完成')
data_list.append(data)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每周概况', index=False)
# Monthly overview
def getMonthData(writer):
data_list = []
start_date = datetime.strptime('2021-06-01', '%Y-%m-%d')
current_date = datetime.now()
while start_date <= current_date:
year = start_date.year
month = start_date.month
date = start_date.strftime('%Y-%m')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback76435&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_MONTH_L&TRADE_DATE={date}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], date]
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': date,
'类别': '月度概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date}===采集完成')
data_list.append(data)
if month == 12:
start_date = start_date.replace(year=year + 1, month=1)
else:
start_date = start_date.replace(month=month + 1)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每月概况', index=False)
def doJob():
log.info('======开始采集======')
getDayData()
log.info('===每天数据采集完===')
# getWeekData(writer)
# log.info('===每周数据采集完===')
# getMonthData(writer)
# log.info('===每月数据采集完===')
if __name__ == '__main__':
doJob()
import os
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/reits/regulation/rules/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
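# Script overview (SSE REITs policy/rule scraper): crawls the SSE regulation "rules"
# and "guide" pages, downloads attachments into ./相关政策/上海证券交易所/政策文件/,
# and exports the article metadata and body text to Excel.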
def paserUrl(html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
soup = paserUrl(soup, 'http://www.sse.com.cn/')
contentWithTag = soup.find('div', class_='allZoom')
pub_hao = contentWithTag.find('p').text.lstrip().strip()
a_list = contentWithTag.find_all('a')
# Download the attachments
for a in a_list:
fj_href = a.get('href')
fj_title = a.get('title')
category = os.path.splitext(fj_href)[1]
if '.' not in category or '.cn' in category:
continue
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjhref_list += fj_href + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/上海证券交易所/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text
return pub_hao, content,fjtitle_list,fjhref_list
def doJob():
if not os.path.exists('./相关政策/上海证券交易所/政策文件'):
os.makedirs('./相关政策/上海证券交易所/政策文件')
data_list = []
urls = ['http://www.sse.com.cn/reits/regulation/rules/', 'http://www.sse.com.cn/reits/regulation/guide/']
num = 1
for url in urls:
soup = getSoup(url)
soup = paserUrl(soup, 'http://www.sse.com.cn/')
li_list = soup.find('ul', class_='list').find_all('li')
for li in li_list:
title = li.find('a').text.lstrip().strip()
href = li.find('a').get('href')
origin = '上海证券交易所'
publishDate = li.find('i', class_='date').text.lstrip().strip()
writtenDate = publishDate
organ = '上海证券交易所'
summary = ''
pub_hao, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/上海证券交易所/上海证券交易所政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from base import BaseCore
from requests.models import Response
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.cushmanwakefield.com.cn/research-report/p94.html?expert=0',
'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
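# Script overview (Cushman & Wakefield research-report downloader): pages through the
# research-report listing (the page-count helper assumes 4 items per page), downloads
# each report PDF into ./研究咨询/戴德梁兴/行业视角-研究报告/, and records the metadata
# in an Excel file.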
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize():
url = 'https://www.cushmanwakefield.com.cn/research-report/p1.html?expert=0'
soup = getSoup(url)
total = int(re.findall(r'\d+', soup.find('dl', class_='sousuo_result').text.lstrip().strip())[0])
if total % 4 == 0:
pageSize = int(total / 4)
else:
pageSize = int(total / 4) + 1
return pageSize
def getContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def doJob():
if not os.path.exists('./研究咨询/戴德梁兴/行业视角-研究报告'):
os.makedirs('./研究咨询/戴德梁兴/行业视角-研究报告')
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
url = f'https://www.cushmanwakefield.com.cn/research-report/p{page}.html?expert=0'
soup = getSoup(url)
div_list = soup.find('div', class_='guwen_list_box').find_all('div', class_='zhuangyuan_guwen_box')
for div in div_list:
fjtitle_list = ''
fjhref_list = ''
name = div.find('div', class_='zhuanyuan_name').text.lstrip().strip()
summary = div.find('div', class_='zhuanyuan_info').text.lstrip().strip()
href = div.find('a', class_='zhuanyuan_xinxi').get('href')
origin = '戴德梁兴'
try:
content = getContent(href)
except:
log.error(f'第{page}页==={name}===连接失败')
continue
title = name.replace('/',' ').replace('|',' ').replace('?',' ').replace('"','”')
file = f'./研究咨询/戴德梁兴/行业视角-研究报告/{title}.pdf'
num_ = 2
while True:
flg = os.path.isfile(file)
if flg:
log.info(f'{name}===有重名')
title_ = f'{title}-{num_}'
file = f'./研究咨询/戴德梁兴/行业视角-研究报告/{title_}.pdf'
num_ += 1
else:
try:
title = title_
except:
pass
break
try:
with open(file, 'wb') as f:
f.write(content)
log.info(f'{name}===成功')
fjtitle_list += title + '\n'
fjhref_list += href + '\n'
data = [num, name, origin, href, summary, fjtitle_list, fjhref_list]
data_list.append(data)
except:
log.error(f'第{page}页==={name}===保存失败')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '来源', '原文链接', '摘要', '附件名称', '附件连接']
df.to_excel('./研究咨询/戴德梁兴/行业视角-研究报告.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import random
import re
import numpy as np
import pandas as pd
import pymongo
import requests
import time
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
db_storage_1 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsProdOverview']
db_storage_2 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsFinancing']
db_storage_3 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsEquityDist']
db_storage_4 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsShareStruct']
db_storage_5 = pymongo.MongoClient('mongodb://192.168.1.36:27017', username='admin', password='zzsn@9988').RESCenter[
'REITsNetWorth']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
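# Script overview (SSE REITs product detail collector): for each listed REIT, pulls the
# product overview, fundraising, dividend distribution, share structure and unit net
# value, and stores them in five MongoDB collections (REITsProdOverview, REITsFinancing,
# REITsEquityDist, REITsShareStruct, REITsNetWorth).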
@retry(tries=4, delay=20)
def getDataJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# Get the list of fund codes
@retry(tries=3, delay=10)
def getCode():
code_list = []
url = f'http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/reits?callback=jQuery112407214866998855156_1699360786929&select=code%2Cname%2Clast%2Cchg_rate%2Cchange%2Cvolume%2Camount%2Cprev_close%2Copen%2Chigh%2Clow%2Camp_rate%2Ccpxxtype%2Ccpxxsubtype%2Ccpxxextendname&order=code%2Case&begin=0&end=25&_={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)['list']
for data in data_json:
code_list.append([data[0], data[-1]])
req.close()
return code_list
# Product overview
def productOverview(code):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback3638&isPagination=false&sqlId=FUND_BASIC_INFO&fundCode={code}&_={int(time.time())}'
try:
data_json = getDataJson(url)['result'][0]
except:
log.error(f'{code}===产品概况连接失败')
time.sleep(1)
return
if db_storage_1.find_one({'code': data_json['fundCode'], 'exchange': '上海证券交易所'}):
log.info(f"{data_json['fundCode']}===产品概况已采集")
time.sleep(1)
return
dic_info = {
'code': data_json['fundCode'], # 代码
'shortName': data_json['fundExpansionAbbr'], # 扩位简称
'office': data_json['lawFirm'], # 律师事务所
'caretaker': data_json['companyName'], # 管理人
'caretakerPhone': data_json['contactMobile'], # 管理人联系方式
'custodian': data_json['trusteeName'], # 托管人
'country': '中国', # 国家
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_1.insert_one(dic_info)
log.info(f"{data_json['fundCode']}===产品概况采集成功")
except:
log.info(f"{data_json['fundCode']}===产品概况保存失败")
time.sleep(1)
# Fundraising details
def financing(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback74728&isPagination=true&sqlId=REITS_FXYKM&fundCode={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===筹资情况连接失败')
time.sleep(1)
return
for data_json in data_jsons:
saleStartDate = datetime.datetime.strptime(data_json['saleStartDate'], '%Y-%m-%d')
saleEndDate = datetime.datetime.strptime(data_json['saleEndDate'], '%Y-%m-%d')
try:
listingDate = datetime.datetime.strptime(data_json['listingDate'], '%Y-%m-%d')
except:
listingDate = ''
if db_storage_2.find_one(
{'code': code, 'saleStartDate': saleStartDate, 'saleEndDate': saleEndDate, 'exchange': '上海证券交易所'}):
log.info(f"{code}===筹资情况已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'price': data_json['salePrice'], # 发售价格
'saleStartDate': saleStartDate, # 发售起始日期
'saleEndDate': saleEndDate, # 发售终止日期
'saleCopies': data_json['saleCopies'], # 发售总份数(亿份)
'listingDate': listingDate, # 上市日期
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_2.insert_one(dic_info)
log.info(f'{code}===筹资情况采集成功')
except:
log.error(f'{code}===筹资情况保存失败')
time.sleep(1)
# Equity distribution (dividends)
def equityDistribution(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback10108&isPagination=true&sqlId=REITS_FH&fundCode={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===权益分配连接失败')
time.sleep(1)
return
for data_json in data_jsons:
rightsRegistDate = datetime.datetime.strptime(data_json['rightsRegistDate'], '%Y-%m-%d')
exrightDate = datetime.datetime.strptime(data_json['exrightDate'], '%Y-%m-%d')
if db_storage_3.find_one(
{'code': code, 'rightsRegistDate': rightsRegistDate, 'exrightDate': exrightDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={rightsRegistDate}===权益分配已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'year': data_json['year'], # 年份
'fundDividends': data_json['fundDividends'], # 红利(元)
'rightsRegistDate': rightsRegistDate, # 权益登记日
'exrightDate': exrightDate, # 除权基准日
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_3.insert_one(dic_info)
log.info(f'{code}==={rightsRegistDate}===权益分配采集成功')
except:
log.error(f'{code}==={rightsRegistDate}===权益分配保存失败')
time.sleep(1)
# Share structure
def shareStructure(code, name):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback66502&isPagination=true&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&FUND_TYPE=01&_={int(time.time())}'
try:
total = getDataJson(url)['pageHelp']['pageCount']
except:
log.error(f'{code}===份额结构总数获取失败')
time.sleep(1)
return
for page in range(1, total + 1):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback66502&isPagination=true&sqlId=COMMON_SSE_SJ_JJSJ_JJGM_REITSGM_L&FUND_CODE={code}&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&FUND_TYPE=01&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===份额结构第{page}页连接失败')
time.sleep(1)
continue
for data_json in data_jsons:
tradeDate = datetime.datetime.strptime(str(data_json['TRADE_DATE']), '%Y%m%d')
if db_storage_4.find_one(
{'code': code, 'tradeDate': tradeDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={tradeDate}===份额结构已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'limitVol': data_json['LIMIT_VOL'], # 场内限售份额(万份)
'unlimitVol': data_json['UNLIMIT_VOL'], # 场内非限售份额(万份)
'totalVol': data_json['TOTAL_VOL'], # 场内总份额(万份)
'tradeDate': tradeDate, # 最新份额日期
'sellVol': data_json['SELL_VOL'], # 总份额(万份)
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_4.insert_one(dic_info)
log.info(f'{code}==={tradeDate}===份额结构采集成功')
except:
log.error(f'{code}==={tradeDate}===份额结构保存失败')
time.sleep(1)
# Net asset value
def netWorth(code, name):
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback66035&isPagination=true&sqlId=REITS_JZ&fundCode={code}&order=appraiseDate%7Cdesc&pageHelp.pageSize=10&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
try:
data_jsons = getDataJson(url)['result']
except:
log.error(f'{code}===净值连接失败')
time.sleep(1)
return
for data_json in data_jsons:
if not data_json['fundUnitnetWorth']:
continue
if float(data_json['fundUnitnetWorth']) == 0:
continue
appraiseDate = datetime.datetime.strptime(data_json['appraiseDate'], '%Y-%m-%d')
if db_storage_5.find_one(
{'code': code, 'appraiseDate': appraiseDate,
'exchange': '上海证券交易所'}):
log.info(f"{code}==={appraiseDate}===净值已采集")
time.sleep(1)
continue
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'appraiseDate': appraiseDate, # 估值日期
'fundUnitnetWorth': data_json['fundUnitnetWorth'], # REITs单位净值(元)
'exchange': '上海证券交易所' # 交易所
}
try:
db_storage_5.insert_one(dic_info)
log.info(f'{code}==={appraiseDate}===净值采集成功')
except:
log.error(f'{code}==={appraiseDate}===净值保存失败')
time.sleep(1)
def getInfo():
codes_list = getCode()
for codes in codes_list:
code = codes[0]
name = codes[1]
log.info(f'{code}==={name}===开始采集')
productOverview(code)
financing(code, name)
equityDistribution(code, name)
shareStructure(code, name)
netWorth(code, name)
log.info(f'{code}==={name}===采集结束')
time.sleep(5)
if __name__ == '__main__':
getInfo()
import os
import random
import re
import fitz
import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from openpyxl import load_workbook
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.parse import urljoin
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
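# Script overview (government policy scrapers): Policy is a small helper class bundling
# request/parse utilities (proxy GET/POST, BeautifulSoup helpers, relative-to-absolute
# URL rewriting); beijing() uses it to crawl REITs policy documents from the Beijing
# municipal government search API and save attachments plus an Excel summary.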
class Policy():
@retry(tries=3, delay=10)
def getrequest_soup(self, url):
ip = baseCore.get_proxy()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(url, headers=headers, proxies=ip)
if req.status_code != 200:
raise requests.HTTPError(f'unexpected status {req.status_code}')
req.encoding = req.apparent_encoding
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
@retry(tries=3, delay=10)
def getrequest_soup_(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
req = requests.get(url, headers=headers)
if req.status_code != 200:
raise requests.HTTPError(f'unexpected status {req.status_code}')
req.encoding = req.apparent_encoding
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
def getrequest_json(self, headers, url):
ip = baseCore.get_proxy()
req = requests.get(headers=headers, url=url, proxies=ip)
result = req.json()
req.close()
return result
def requestPost(self, headers, url, payload):
# ip = baseCore.get_proxy()
req = requests.post(headers=headers, url=url, data=payload)
data_json = req.json()
req.close()
return data_json
def requestPost_html(self, headers, url, payload):
ip = baseCore.get_proxy()
req = requests.post(headers=headers, url=url, data=payload, proxies=ip)
result = BeautifulSoup(req.content, 'html.parser')
req.close()
return result
def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
# Find tags with the specified attribute value and remove them
tags = soup.find_all(tag, {attribute_to_delete: value_to_delete})
for tag in tags[:i]:
tag.decompose()
def deletespan(self, td):
spans = td.find_all('span')
for span in spans:
span.extract() # 删除span标签
def deletetag(self, td, tag):
tags = td.find_all(tag)
for tag_ in tags:
tag_.extract() # 删除指定标签
def deletetext(self, soup, tag, text): # 删除带有特定内容的标签
tags = soup.find_all(tag)[:10]
for tag_ in tags:
text_ = tag_.text
if text in text_:
tag_.extract()
def deletek(self, soup):
# Remove empty tags (e.g. <p></p>, <p><br></p>); img, video and hr are kept
for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video",
"br"] and tag.name != "br" or tag.get_text() == ' '):
for j in i.descendants:
if j.name in ["img", "video", "br"]:
break
else:
i.decompose()
def paserUrl(self, html, listurl):
# Collect all <a> and <img> tags
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# Walk the tags and convert relative URLs to absolute URLs
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getFjContent(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
content = req.content
req.close()
time.sleep(5)
return content
# Beijing Municipal People's Government https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
if not os.path.exists('./相关政策/北京市人民政府/政策文件'):
os.makedirs('./相关政策/北京市人民政府/政策文件')
policy = Policy()
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
data_list = []
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size) + 1
num = 1
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
for info_ in info_list:
fjtitle_list = ''
fjhref_list = ''
info = info_['data']
origin = info['siteLabel']['value'].lstrip().strip()
title = info['titleO'].lstrip().strip()
titleLabel = info['titleLabel']['value'].lstrip().strip()
publishDate = info['docDate'].lstrip().strip()
newsUrl = info['url'].lstrip().strip()
summary = info['summary'].lstrip().strip()
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
writtenDate = ''
pub_hao = ''
organ = ''
if titleLabel == '政策解读':
try:
newssoup = policy.getrequest_soup(newsUrl)
except:
newssoup = policy.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
organ = newssoup.find('div', class_='othermessage').find('p', class_='fl').text.split('来源:')[
1].lstrip().strip()
elif titleLabel == '政策文件':
try:
newssoup = policy.getrequest_soup(newsUrl)
except:
newssoup = policy.getrequest_soup_(newsUrl)
contentWithTag = newssoup.find('div', id='mainText')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
li_list = newssoup.find('ol', class_='doc-info').find_all('li')
for li in li_list:
if '成文日期' in li.text:
writtenDate = li.find('span').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
formatRows = info['formatRows']
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
for file_href, file_name in value.items():
fjcontent = getFjContent(file_href)
category = os.path.splitext(file_href)[1]
if category not in file_name:
file_name = file_name + category
file_name = f'{num}-{publishDate}-{file_name}'
file = f'./相关政策/北京市人民政府/政策文件/{file_name}'
fjtitle_list += file_name + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{file_name}===附件下载成功')
elif '号' in name:
pub_hao = col['value'].lstrip().strip()
elif '发文机构' in name:
organ = col['value'][0].lstrip().strip()
time.sleep(random.randint(10, 20))
data = [num, title, publishDate, origin, newsUrl, writtenDate, organ, pub_hao, summary, content,
fjtitle_list, fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/北京市人民政府/北京市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
beijing()
baseCore.close()
import json
import time
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
headers_ = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
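# getTotal(): POST the fixed REITs filter to www.cq.gov.cn/irs/front/list and read
# the total page count from the pager block of the response.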
def getTotal():
url = 'http://www.cq.gov.cn/irs/front/list'
data_post = {"customFilter": {"operator": "and", "properties": [], "filters": [{"operator": "or", "properties": [
{"property": "f_202121500898", "operator": "eq", "value": "REITs"},
{"property": "f_202142777829", "operator": "eq", "value": "REITs"}], "filters": []}, {"operator": "or",
"properties": [{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-17 11:21:40"},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-17 11:21:40"}],
"filters": [
{"operator": "and",
"properties": [{
"property": "f_202146838317",
"operator": "eq",
"value": None},
{
"property": "f_202146235090",
"operator": "eq",
"value": None}]}]}]},
"sorts": [{"sortField": "save_time", "sortOrder": "DESC"}], "tableName": "t_1775cd018c6",
"tenantId": "7", "pageSize": 10, "pageNo": 1}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, data=data_post, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['pager']['pageCount'])
return total
def getDataJson(page):
url = 'http://www.cq.gov.cn/irs/front/list'
data_post = {"customFilter": {"operator": "and", "properties": [], "filters": [{"operator": "or", "properties": [
{"property": "f_202121500898", "operator": "eq", "value": "REITs"},
{"property": "f_202142777829", "operator": "eq", "value": "REITs"}], "filters": []}, {"operator": "or",
"properties": [{
"property": "f_202146838317",
"operator": "gte",
"value": "2023-11-17 11:21:40"},
{
"property": "f_202146235090",
"operator": "gte",
"value": "2023-11-17 11:21:40"}],
"filters": [
{"operator": "and",
"properties": [{
"property": "f_202146838317",
"operator": "eq",
"value": None},
{
"property": "f_202146235090",
"operator": "eq",
"value": None}]}]}]},
"sorts": [{"sortField": "save_time", "sortOrder": "DESC"}], "tableName": "t_1775cd018c6",
"tenantId": "7", "pageSize": 10, "pageNo": page}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, data=data_post, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['list']
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
time.sleep(3)
return soup
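# getContent_(): detail pages rendered from a policyId have no static body, so the
# text is fetched from the getPolicyDetail.html endpoint instead.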
def getContent_(id):
url = 'http://www.cq.gov.cn/govserver/tors/getPolicyDetail.html'
data_post = {
'policyId': f'{id}'
}
ip = baseCore.get_proxy()
req = requests.post(url,headers=headers_,data=data_post,proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['DETAIL']['ZCYW']
def getContent(url):
if 'policyId' in url:
id = url.split('policyId=')[1]
contentWithTag = getContent_(id)
contentWithTag = BeautifulSoup(contentWithTag,'lxml')
else:
soup = getSoup(url)
contentWithTag = soup.find('div', class_='view')
if not contentWithTag:
contentWithTag = soup.find('div',class_='document')
contentWithTag.find('div',class_='item').decompose()
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content
def getData(data_, num):
title = data_['f_202121500898']
publishDate = data_['save_time']
origin = data_['f_2021325755960']
href = data_['doc_pub_url']
try:
writtenDate = data_['f_202121607647']
except:
writtenDate = ''
try:
organ = data_['f_202121437464']
except:
organ = ''
try:
pub_hao = data_['f_202121837479']
except:
pub_hao = ''
summary = data_['f_202142777829']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content = getContent(href)
fjtitle_list = ''
fjhref_list = ''
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/重庆市人民政府/政策文件'):
os.makedirs('./相关政策/重庆市人民政府/政策文件')
total = getTotal()
num = 1
data_list = []
for page in range(1, total + 1):
data_json = getDataJson(page)
for data_ in data_json:
data = getData(data_, num)
num += 1
time.sleep(3)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/重庆市人民政府/重庆市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import os
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Referer': 'https://www.fujian.gov.cn/ssp/main/index.html?key=REITs&siteId=ff808081624641aa0162476c0e0e0055&isMain='
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson(data_post):
url = f'https://www.fujian.gov.cn/ssp/search/api/search?time={int(time.time())}'
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['datas']
return data_json
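# getContent(): parse the div.TRS_Editor body of a detail page and download every
# linked attachment, resolving relative links against the article URL.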
def getContent(num, url, publishDate):
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='TRS_Editor')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href').replace('./', url_)
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/福建省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/福建省人民政府/政策文件'):
os.makedirs('./相关政策/福建省人民政府/政策文件')
data_posts = [{
'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '1', 'jiGuanList': '',
'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
'siteId': 'ff808081624641aa0162476c0e0e0055', 'depSiteId': 'ff808081624641aa0162476c0e0e0055', 'type': '0',
'page': '1', 'rows': '10', 'historyId': '8a289fe18ba97b6a018bd6aee981642d', 'sourceType': 'SSP_DOCUMENT_ZC',
'isChange': '0', 'fullKey': 'N', 'wbServiceType': '13', 'fileType': '', 'feaTypeName': '', 'fileNo': '',
'pubOrg': '', 'zfgbPubOrg': '', 'themeType': '', 'searchTime': '', 'startDate': '', 'endDate': '',
'sortFiled': 'RELEVANCE', 'searchFiled': '', 'dirUseLevel': '', 'issueYear': '', 'publishYear': '',
'issueMonth': '', 'allKey': '', 'fullWord': '', 'oneKey': '', 'notKey': '', 'totalIssue': '', 'chnlName': '',
'zfgbTitle': '', 'zfgbContent': '', 'bsDeptId': '', 'siteName': '', 'keyWord': 'REITs', 'isProvince': '',
}, {
'isCollapse': '', 'siteType': '1', 'typeQueryJsonToMap': '', 'pubOrgType': '', 'jiGuanList': '',
'siteCode': '', 'zhuTiIdList': '', 'isCrdept': '', 'mainSiteId': 'ff808081624641aa0162476c0e0e0055',
'siteId': 'ff808081624641aa0162476c0e0e0055', 'depSiteId': 'ff808081624641aa0162476c0e0e0055', 'type': '0',
'page': '1', 'rows': '10', 'historyId': '8a28289e8ba97b6b018bd6cee1c26aa7', 'sourceType': 'SSP_JDHY',
'isChange': '0', 'fullKey': 'N', 'wbServiceType': '13', 'fileType': '', 'feaTypeName': '', 'fileNo': '',
'pubOrg': '', 'zfgbPubOrg': '', 'themeType': '', 'searchTime': '', 'startDate': '', 'endDate': '',
'sortFiled': 'RELEVANCE', 'searchFiled': '', 'dirUseLevel': '', 'issueYear': '', 'publishYear': '',
'issueMonth': '', 'allKey': '', 'fullWord': '', 'oneKey': '', 'notKey': '', 'totalIssue': '', 'chnlName': '',
'zfgbTitle': '', 'zfgbContent': '', 'bsDeptId': '', 'siteName': '', 'keyWord': 'REITs', 'isProvince': '',
}]
data_list = []
num = 1
for data_post in data_posts:
data_json = getDataJson(data_post)
for data_ in data_json:
title = data_['_doctitle']
publishDate = data_['crtime'].replace('.','-')
origin = data_['docsourcename']
href = data_['docpuburl']
try:
writtenDate = data_['pubdate'].replace('.','-')
except:
writtenDate = ''
try:
organ = data_['puborg']
except:
organ = ''
try:
pub_hao = data_['fileno']
except:
pub_hao = ''
summary = data_['doccontent']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content, fjtitle_list, fjhref_list = getContent(num, href, publishDate[:10])
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/福建省人民政府/福建省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import datetime
import json
import time
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-XSRF-TOKEN': 'eyJpdiI6InhWUlhvRWpuUUp4ejFsQ0VVb29CaFE9PSIsInZhbHVlIjoiOUp5dHJ2SVVoNWl0K0s3UVlaZGZcL3p0a0gxc09sclRVU2JZTjg3dVUyTER4WVE4Qm1Ta2dyWUJndENmMURYVmwiLCJtYWMiOiJjNGU5YTU1MTJmZmZmZjdhZjRkNDE0NmM4Y2I3OTNkMmExYmJjZGRmYTk5MGMyMmQyM2FhYjVjMjRhZTY0NjA2In0=',
}
@retry(tries=5, delay=5)
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
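# getPageSize(): the search API returns 12 hits per page, so the page count is
# derived from the reported total.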
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.gd.gov.cn/api/search/file'
data_post = {"page": "1", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2", "range": "site",
"recommand": 1, "gdbsDivision": "440000", "service_area": 1}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['total'])
if total % 12 == 0:
pageSize = int(total / 12)
else:
pageSize = int(total / 12) + 1
return pageSize
def getDataJson(url, data_post):
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
try:
data_json = req.json()['data']['list']
except:
data_json = req.json()['data']['news']['list']
return data_json
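# getContent(): the detail pages use several templates, so a chain of selectors is
# tried; inline images and non-HTML links are saved as attachments, with a numeric
# suffix appended when a file name already exists.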
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
time.sleep(2)
try:
try:
contentWithTag = soup.select('body > div.con > div.viewList > div.zw')[0]
except:
contentWithTag = soup.select('body > div.con > div:nth-of-type(3) > div.content > div.viewList > div.zw')[0]
except:
contentWithTag = soup.find('div', class_='article-content').find('center')
if not contentWithTag:
contentWithTag = soup.find('div', class_='article-content')
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = img.get('src')
if "http" not in fj_href and '//www' in fj_href:
fj_href = 'http:' + fj_href
fjhref_list += fj_href + '\n'
fj_title = img.get('alt')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
file = file.replace(category, f'-{num_}{category}')
num_ += 1
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广东省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
file = file.replace(category, f'-{num_}{category}')
num_ += 1
fjtitle_list += fj_title + '\n'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return content, fjtitle_list, fjhref_list
def ST(txt):
txt = BeautifulSoup(txt, 'lxml').text
return txt
def getData(data_, num):
title = ST(data_['title'])
log.info(f'{title}===开始采集')
publishDate = data_['pub_time']
origin = data_['publisher_src']
href = data_['url']
log.info(href)
writtenDate = data_['date']
if writtenDate:
writtenDate = datetime.datetime.fromtimestamp(writtenDate).strftime('%Y-%m-%d')
organ = data_['source']
pub_hao = data_['document_number']
summary = ST(data_['content'])
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob_1():
if not os.path.exists('./相关政策/广东省人民政府/政策文件'):
os.makedirs('./相关政策/广东省人民政府/政策文件')
pageSize = getPageSize()
data_list = []
num = 1
url = 'https://search.gd.gov.cn/api/search/file'
for page in range(1, pageSize + 1):
data_post = {"page": f"{page}", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
"range": "site",
"recommand": 1, "gdbsDivision": "440000", "service_area": 1}
data_post = json.dumps(data_post)
data_json = getDataJson(url, data_post)
for data_ in data_json:
data = getData(data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
return data_list, num
def doJob_2(num):
url = 'https://search.gd.gov.cn/api/search/all'
types = ['政策解读', '计划规划']
data_list = []
for type in types:
data_post = {"label": f"{type}", "position": "all", "keywords": "REITs", "sort": "smart", "site_id": "2",
"range": "site", "page": 1, "tag_name": f"{type}", "recommand": 1, "gdbsDivision": "440000",
"service_area": 1}
data_post = json.dumps(data_post)
data_json = getDataJson(url, data_post)
for data_ in data_json:
data = getData(data_, num)
time.sleep(1)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
return data_list
def doJob():
data_list = []
data_list_, num = doJob_1()
data_list += data_list_
data_list_ = doJob_2(num)
data_list += data_list_
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/广东省人民政府/广东省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
# doJob_1()
# doJob_2(2)
# url = 'http://www.gd.gov.cn/gkmlpt/content/4/4022/post_4022955.html#8'
# soup = getSoup(url)
#
# print(contentWithTag)
baseCore.close()
import json
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/json',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getTotal():
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": 1, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return int(req.json()['data']['pager']['pageCount'])
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.gxzf.gov.cn/irs/front/search'
data_post = {"code": "181aedaa542", "dataTypeId": "241", "configCode": "",
"sign": "9cc99c9d-94aa-44b4-aa79-41227a5385d7", "searchWord": "REITs", "orderBy": "related",
"searchBy": "all", "appendixType": "", "granularity": "ALL", "isSearchForced": "0", "filters": [],
"pageNo": page, "pageSize": 10, "isAdvancedSearch": None, "isDefaultAdvanced": None,
"advancedFilters": None, "advancedFilters ": None, "historySearchWords": []}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['middle']['listAndBox']
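# getContent(): the body sits in div#articleFile; inline images and the links in
# div.downloadfile are downloaded as attachments.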
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
soup = getSoup(url)
contentWithTag = soup.find('div', attrs={'id': 'articleFile'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = img.get('src')
fjhref_list += fj_href + '\n'
if 'http' not in fj_href:
fj_href = url_ + fj_href
fj_title = img.get('title')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广西壮族自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
a_list = soup.find('div', class_='downloadfile').find_all('a')
for a in a_list:
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = url_ + fj_href
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/广西壮族自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjhref_list = fjhref_list.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
return content,fjtitle_list,fjhref_list
def getData(data_, num):
title = data_['data']['title']
publishDate = data_['data']['time']
origin = '广西壮族自治区人民政府'
href = data_['data']['url']
writtenDate = data_['data']['table-10']
organ = data_['data']['source']
pub_hao = data_['data']['table-5']
summary = data_['data']['table-7']
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/广西壮族自治区人民政府/政策文件'):
os.makedirs('./相关政策/广西壮族自治区人民政府/政策文件')
data_list = []
num = 1
total = getTotal()
for page in range(1, total + 1):
data_json = getDataJson(page)
title_list = []
for data_ in data_json:
title = data_['data']['title']
if title not in title_list:
title_list.append(title)
data = getData(data_, num)
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(2)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/广西壮族自治区人民政府/广西壮族自治区人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'sousuo.www.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
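# getSoup(): www.gov.cn expects the cookies set by the home page, so a session first
# requests the site root and then the target URL.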
def getSoup(url):
ip = baseCore.get_proxy()
URL = 'https://www.gov.cn/'
session = requests.session()
session.get(URL,headers=headers,proxies=ip)
# req = requests.get(url, headers=headers, proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
# req.close()
session.close()
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
URL = 'https://www.gov.cn/'
session = requests.session()
session.get(URL,headers=headers,proxies=ip)
req = session.get(url)
req.encoding = req.apparent_encoding
content = req.content
session.close()
return content
def getPageSize(types):
total = 0
ip = baseCore.get_proxy()
url = 'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p=1&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
for type in types:
num = int(req.json()['searchVO']['catMap'][f'{type}']['totalCount'])
total += num
print(total)
if total % 20 == 0:
pageSize = int(total / 20)
else:
pageSize = int(total / 20) + 1
req.close()
return pageSize
def getDataJson(page, types):
data_list = []
ip = baseCore.get_proxy()
url = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary&q=REITs&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=&puborg=&pcodeYear=&pcodeNum=&filetype=&p={page}&n=5&inpro=&dup=&orpro=&type=gwyzcwjk'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
for type in types:
data_list_ = req.json()['searchVO']['catMap'][f'{type}']['listVO']
data_list += data_list_
req.close()
return data_list
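# getContent(): the body is in div.TRS_UEDITOR (or div.pages_content as a fallback);
# non-HTML links are downloaded as attachments and the issuing organ is backfilled
# from the 来源 span when the search result did not provide it.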
def getContent(url, publishDate, num, organ):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
if organ == '':
try:
organ = soup.find('div', class_='pages-date').find('span', class_='font').text.split('来源:')[1].lstrip().strip()
except:
organ = ''
contentWithTag = soup.find('div', class_='TRS_UEDITOR')
if not contentWithTag:
contentWithTag = soup.find('div',class_='pages_content')
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if '.htm' not in fj_href and '.html' not in fj_href and '.shtml' not in fj_href and '.shtm' not in fj_href:
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/国务院/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list, organ
def getData(data_, num):
title = data_['title'].replace('\n', '').replace('\r', '')
title = BeautifulSoup(title,'lxml').text
publishDate = data_['pubtimeStr'].replace('.', '-')
origin = '国务院'
href = data_['url']
writtenDate = data_['ptime']
try:
organ = data_['puborg']
except:
organ = ''
pub_hao = data_['pcode']
summary = data_['summary']
summary = BeautifulSoup(summary, 'lxml').text.lstrip().strip()
content, fjtitle_list, fjhref_list, organ = getContent(href, publishDate, num, organ)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/国务院/政策文件'):
os.makedirs('./相关政策/国务院/政策文件')
data_list = []
href_list = []
num = 1
types = ['bumenfile', 'gongwen', 'otherfile', 'gongbao']
pageSize = 7
for page in range(1, pageSize + 1):
data_json = getDataJson(page, types)
for data_ in data_json:
href = data_['url']
if href not in href_list:
data = getData(data_, num)
num += 1
data_list.append(data)
href_list.append(href)
log.info(f'{data[1]}===采集成功')
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/国务院/国务院政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
if req.url == 'https://www.hainan.gov.cn/hainan/xhtml/404.html':
return ''
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getPageSize(type):
url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum=0&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
soup = getSoup(url)
total = int(soup.find('div', class_='results-list').find('span').text.lstrip().strip())
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
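# getContent(): extract 成文日期 from the zwgk_comr1 block and the body from div#font;
# empty strings are returned when the detail page redirects to the 404 page.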
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
if soup == '':
return '','','',''
try:
writtenDate = soup.find('div', class_='zwgk_comr1').text.replace(' ', '').split('成文日期:')[1].split('标题')[
0].lstrip().strip()
except:
writtenDate = ''
contentWithTag = soup.find('div', attrs={'id':'font'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
content = contentWithTag.text.lstrip().strip()
except:
content = ''
log.error(f'{url}===正文获取失败')
return writtenDate, content, fjtitle_list, fjhref_list
def getData(div, num):
title = div.find('a', class_='titlec').get('title').replace('\n', '').replace('\r', '').lstrip().strip()
href = div.find('a', class_='titlec').get('href')
publishDate = div.find('span', class_='quily-con').text.lstrip().strip()
origin = div.find('a', class_='address-con').text.lstrip().strip()
try:
table = div.find('div', class_='search-results').find('table').text
organ = table.split('发文机关:')[1].split('文号:')[0].lstrip().strip()
pub_hao = table.split('文号:')[1].lstrip().strip()
except:
organ = ''
pub_hao = ''
try:
summary = div.find('p', class_='p-text-color').text.lstrip().strip()
except:
summary = ''
writtenDate, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
if content == '':
return []
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
if not os.path.exists('./相关政策/海南省人民政府/政策文件'):
os.makedirs('./相关政策/海南省人民政府/政策文件')
data_list = []
href_list = []
num = 1
types = [2682,2677]
for type in types:
pageSize = getPageSize(type)
for page in range(pageSize):
url = f'https://www.hainan.gov.cn/s?siteCode=4600000001&searchWord=REITs&column={type}&wordPlace=0&orderBy=1&startTime=&endTime=&isSecondSearch=undefined&pageSize=10&pageNum={page}&timeStamp=0&labelHN=&uc=0&checkHandle=1&strFileType=0&countKey=%200&sonSiteCode=&areaSearchFlag=&secondSearchWords=&left_right_flag=1'
soup = getSoup(url)
div_list = soup.select('#showPage > div')
del (div_list[-1])
for div in div_list:
href = div.find('a', class_='titlec').get('href')
if href not in href_list:
data = getData(div, num)
if data:
href_list.append(href)
data_list.append(data)
num += 1
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/海南省人民政府/海南省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
'Token': 'db345f2c-20fd-4cc8-9799-b9cd08b96392',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
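# getDataJson(): the policyWikipedia endpoint is a POST that tunnels GET semantics via
# _method=get; a single request for the first 20 REITs hits is enough here.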
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://www.hlj.gov.cn/znwd/policy/policy/policy/home/public/policyWikipedia?_method=get'
data_post = {
'sort': 'smartIndex',
'order': 'asc',
'start': '0',
'length': '20',
'filter.all': 'REITs',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['content']['content']
return data_json
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url,headers=headers,proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.json()['content']['html'],'lxml')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getContent(num, title, publishDate, summary, id, pub_hao, organ,type):
fjhref_list = ''
fjtitle_list = ''
url = f'https://www.hlj.gov.cn/znwd/policy/#/readDetails?id={id}'
writtenDate = ''
if type == '政策解读':
origin = organ
organ = ''
href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyResolve/{id}'
else:
origin = '黑龙江省人民政府'
href_ = f'https://www.hlj.gov.cn/znwd/policy/policy/policy/ctrl/public/chatPolicyFile/findById/{id}'
soup = getSoup(href_)
try:
a_list = soup.find_all('a')
for a in a_list:
href = a.get('href')
if '.html' in href or '.shtml' in href or '.htm' in href:
continue
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/黑龙江省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except Exception as e:
log.error(f'{title}====={e}')
try:
scripts = soup.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = soup.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = soup.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data_
def doJob():
if not os.path.exists('./相关政策/黑龙江省人民政府/政策文件'):
os.makedirs('./相关政策/黑龙江省人民政府/政策文件')
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
title = data_['title']
publishDate = data_['date']
summary = data_['content']
id = data_['dataId']
type = data_['typeName']
try:
pub_hao = data_['writtenText']
except:
pub_hao = ''
try:
organ = data_['unitShowName']
except:
organ = ''
data = getContent(num, title, publishDate, summary, id, pub_hao, organ,type)
data_list.append(data)
num += 1
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/黑龙江省人民政府/黑龙江省人民政府政策文件.xlsx', index=False)
if __name__ == "__main__":
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from base import BaseCore
import time
from selenium.webdriver import Firefox
from selenium import webdriver
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
headers_ = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': 'token=db51d0a6-06e1-49f6-8e4f-8cec52c47bec; uuid=db51d0a6-06e1-49f6-8e4f-8cec52c47bec;',
'Host': 'www.hubei.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.hubei.gov.cn/site/hubei/search.html',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson(page):
ip = baseCore.get_proxy()
url = f'http://www.hubei.gov.cn/igs/front/search/list.html?index=hb-govdoc&type=govdoc&pageNumber={page}&pageSize=10&filter[AVAILABLE]=true&filter[fileNum-like]=&filter[Effectivestate]=&filter[fileYear]=&filter[fileYear-lte]=&filter[Subjectclass]=&filter[CateGory]=&filter[DOCTITLE,DOCCONTENT,fileNum-or]=REITs&code=872801132c71495bbe5a938f6acff5aa&siteId=50&filter[SITEID]=54&orderProperty=PUBDATE&orderDirection=desc&6LDjm9Ls=0MADqxalqEiunxfMA3PwdIsvIxiRRQzDxXUAXPlbOXcZq0Rg0iIRTAWPM5NCpsIcnfs9rjzmAOc6t7j5dB4VBmMHY3KtuQHQ6bnSkbepFXgB0I.UuQKzMa5IqQB19wRAMEmnB7VYU4cW'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['page']['content']
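# getContent(): the detail pages are rendered by JavaScript, so Selenium/Firefox loads
# them; images inside hbgov-article-content are saved as attachments.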
def getContent(driver, url, num):
driver.get(url)
time.sleep(5)
fjhref_list = ''
fjtitle_list = ''
publishDate = driver.find_element(By.CLASS_NAME,'hbgov-article-meta-time').text.split('发布时间:')[1].lstrip().strip()
contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content')
img_list = contentWithTag.find_elements(By.TAG_NAME,'img')
num_ = 1
for img in img_list:
fj_title = img.get_attribute('title')
fj_href = img.get_attribute('src')
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate[:10]}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/湖北省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
file = f'./相关政策/湖北省人民政府/政策文件/{fj_title}'
num_ += 1
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return publishDate, content, fjtitle_list, fjhref_list
def getData(driver, data_, num):
title = data_['DOCTITLE']
origin = data_['SITENAME']
pub_hao = data_['fileNum']
writtenDate = data_['PUBDATE']
organ = data_['publisher']
summary = data_['highlight']['DOCCONTENT'][0]
href = data_['DOCPUBURL']
publishDate, content, fjtitle_list, fjhref_list = getContent(driver, href, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
def doJob():
service = Service(r'F:\spider\firefox\geckodriver.exe')
options = Options()
options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
driver = webdriver.Firefox(options=options, service=service)
if not os.path.exists('./相关政策/湖北省人民政府/政策文件'):
os.makedirs('./相关政策/湖北省人民政府/政策文件')
data_list = []
num = 1
for page in range(1, 3):
data_json = getDataJson(page)
for data_ in data_json:
data = getData(driver, data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/湖北省人民政府/湖北省人民政府政策文件.xlsx', index=False)
#
if __name__ == '__main__':
doJob()
# service = Service(r'F:\spider\firefox\geckodriver.exe')
# options = Options()
# options.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
# driver = webdriver.Firefox(options=options, service=service)
# driver.get('http://www.hubei.gov.cn/zfwj/ezf/202208/t20220801_4245008.shtml')
# time.sleep(5)
# contentWithTag = driver.find_element(By.CLASS_NAME,'hbgov-article-content')
# img_list = contentWithTag.find_elements(By.TAG_NAME,'img')
# num = 1
# for img in img_list:
# fj_href = img.get_attribute('src')
# fjcontent = getFjContent(fj_href)
# with open(f'./{num}.png','wb') as f:
# f.write(fjcontent)
# num += 1
baseCore.close()
import os
import re
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'application/json, text/javascript, */*; q=0.01',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
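# Two detail templates exist: getContentA() handles 政务公开 pages (body in div#zoom),
# getContentB() handles 政策文件 pages whose metadata sits in the xxgk_table block.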
def getContentA(url, num, publishDate, title):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
organ = soup.find('div', class_='sp_time').text.split('来源:')[1].split('字体')[0].lstrip().strip()
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
num_ = 1
img_list = contentWithTag.find_all('img')
for img in img_list:
fj_href = img.get('src')
try:
fj_href = 'http://www.jiangsu.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fj_title = img.get('title').lstrip().strip()
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
except:
if 'img/png' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
elif 'img/jpg' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
num_ += 1
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
content = contentWithTag.text
return organ, content, fjtitle_list, fjhref_list
def getContentB(url, num, publishDate, title):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
info = soup.find('table', class_='xxgk_table').text.replace(' ','')
organ = info.split('发布机构:')[1].split('发文日期')[0].lstrip().strip()
writtenDate = info.split('发文日期:')[1].split('标题:')[0].lstrip().strip()
pub_hao = info.split('文号:')[1].split('内容概述:')[0].lstrip().strip()
contentWithTag = soup.find('div', class_='article_content')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
# try:
num_ = 1
img_list = contentWithTag.find_all('img')
for img in img_list:
fj_href = img.get('src')
try:
fj_title = img.get('title').lstrip().strip()
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fj_href = 'http://www.jiangsu.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江苏省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
if 'image/png' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.png'
elif 'image/jpg' in fj_href:
fj_title = f'{num}-{publishDate}-{title}-{num_}.jpg'
num_ += 1
fjtitle_list += fj_title + '\n'
content = contentWithTag.text.lstrip().strip()
return organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/江苏省人民政府/政策文件'):
os.makedirs('./相关政策/江苏省人民政府/政策文件')
pattern = r"\d{4}-\d{2}-\d{2}"
url = 'http://www.jiangsu.gov.cn/jsearchfront/search.do?websiteid=320000000100000&searchid=12&pg=&p=1&tpl=38&serviceType=&cateid=27&q=REITs&pq=&oq=&eq=&pos=&sortType=0&begin=&end='
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(5)
div_list = driver.find_elements(By.CLASS_NAME,'news-result')
num = 1
data_list = []
for div in div_list:
title = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME,'a').get_attribute('title').lstrip().strip()
href = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME,'a').get_attribute('href')
type = div.find_element(By.CLASS_NAME, 'biaoqian').text.lstrip().strip()
summary = div.find_element(By.CLASS_NAME, 'jcse-news-abs-content').text.lstrip().strip()
dateInfo = div.find_element(By.CLASS_NAME, 'jcse-news-date').text
publishDate = re.findall(pattern, dateInfo)[0]
origin = dateInfo.replace(publishDate, '').lstrip().strip()
if type == '政务公开':
organ, content, fjtitle_list, fjhref_list = getContentA(href, num, publishDate, title)
writtenDate = ''
pub_hao = ''
else:
organ, writtenDate, pub_hao, content, fjtitle_list, fjhref_list = getContentB(href, num, publishDate, title)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
time.sleep(5)
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./江苏省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
def getDataJson():
ip = baseCore.get_proxy()
url = 'http://sousuo.jiangxi.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '360000000000000',
'q': 'REITs',
'p': '1',
'pg': '20',
'cateid': '1517',
'pos': 'content',
'pq': '',
'oq': '',
'eq': '',
'begin': '',
'end': '',
'tpl': '49',
'sortType': '',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
return data_json
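# getContent(): each search hit links to a page whose body is in div#zoom; inline
# images are downloaded as attachments before scripts/styles are stripped.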
def getContent(url, num, publishDate):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = 'http://www.jiangxi.gov.cn' + img.get('src')
fjhref_list += fj_href + '\n'
fj_title = img.get('title')
if not fj_title:
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/江西省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
def doJob():
if not os.path.exists('./相关政策/江西省人民政府/政策文件'):
os.makedirs('./相关政策/江西省人民政府/政策文件')
data_json = getDataJson()
data_list = []
num = 1
for data_ in data_json:
data_ = data_.replace('\\', '')
soup = BeautifulSoup(data_, 'lxml')
title = soup.select('body > div > div:nth-of-type(1) > span:nth-of-type(2) > a')[0].text.lstrip().strip()
pub_hao = soup.find('table', class_='jcse-service-table').find_all('tr')[0].find_all('td')[
-1].text.lstrip().strip()
organ = soup.find('table', class_='jcse-service-table').find_all('tr')[1].find_all('td')[
1].text.lstrip().strip()
writtenDate = soup.find('table', class_='jcse-service-table').find_all('tr')[1].find_all('td')[
-1].text.lstrip().strip()
summary = soup.find('table', class_='jcse-service-table').find_all('tr')[2].text.lstrip().strip()
href = soup.find('table', class_='jcse-service-table').find_all('tr')[3].find('a').get('href')
publishDate = writtenDate
origin = '江西省人民政府'
content, fjtitle_list, fjhref_list = getContent(href, num, publishDate)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/江西省人民政府/江西省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
def getTotal(url):
ip = baseCore.get_proxy()
data_post = 'params=%7B%22word%22%3A%22REITs%22%2C%22page%22%3A1%2C%22size%22%3A20%2C%22stype%22%3A%223%22%2C%22area%22%3A%22220000%22%2C%22atype%22%3A%221%22%2C%22dept%22%3A%22%22%2C%22ttype%22%3A%220%22%2C%22start%22%3A%22%22%2C%22end%22%3A%22%22%2C%22itype%22%3A%22%22%2C%22mattType%22%3A%220%22%2C%22serverType%22%3A%220%22%2C%22sort%22%3A0%2C%22aword%22%3A%22%22%2C%22hword%22%3A%22%22%2C%22nword%22%3A%22%22%2C%22dtword%22%3A%22%22%2C%22scope%22%3A%221%22%2C%22selecttp%22%3A%220%22%2C%22filetype%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22fileyear%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22stypeChild%22%3A%220%22%2C%22hs%22%3A%220%22%2C%22flag%22%3A%22%22%2C%22satisfiedId%22%3A%224FD2493B0F0D447E955C4BB94F42228C634%22%7D'
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['data']['data']['totalPage']
return int(total)
def getDataJson(url, page):
ip = baseCore.get_proxy()
data_post = f'params=%7B%22word%22%3A%22REITs%22%2C%22page%22%3A{page}%2C%22size%22%3A20%2C%22stype%22%3A%223%22%2C%22area%22%3A%22220000%22%2C%22atype%22%3A%221%22%2C%22dept%22%3A%22%22%2C%22ttype%22%3A%220%22%2C%22start%22%3A%22%22%2C%22end%22%3A%22%22%2C%22itype%22%3A%22%22%2C%22mattType%22%3A%220%22%2C%22serverType%22%3A%220%22%2C%22sort%22%3A0%2C%22aword%22%3A%22%22%2C%22hword%22%3A%22%22%2C%22nword%22%3A%22%22%2C%22dtword%22%3A%22%22%2C%22scope%22%3A%221%22%2C%22selecttp%22%3A%220%22%2C%22filetype%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22fileyear%22%3A%22%E5%85%A8%E9%83%A8%22%2C%22stypeChild%22%3A%220%22%2C%22hs%22%3A%220%22%2C%22flag%22%3A%22%22%2C%22satisfiedId%22%3A%224FD2493B0F0D447E955C4BB94F42228C634%22%7D'
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['data']['data']['list']
return data_json
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
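# getData(): jl.gov.cn uses several page templates, so a chain of container ids and
# classes is tried and the metadata (发文机关/成文日期/发文字号) is read from whichever
# info table is present.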
def getData(num, title, href, origin, publishDate, summary):
writtenDate = ''
pub_hao = ''
organ = ''
fjhref_list = ''
fjtitle_list = ''
ip = baseCore.get_proxy()
req = requests.get(href, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
try:
scripts = soup.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = soup.find_all('style')
for style in styles:
style.decompose()
except:
pass
contentWithTag = soup.find('div', class_='contents_div')
if not contentWithTag:
contentWithTag = soup.find('div', class_='zlyxwz')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zlyxwz'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'Zoom'})
if not contentWithTag:
contentWithTag = soup.find('div',class_='sycon_bg')
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
try:
try:
organ = soup.find('div', class_='xqy').text
organ = organ.split('来源:')[1].split('字体:')[0].lstrip().strip()
except:
info = soup.find('div', class_='zlylb_dy').find('table').text
organ = info.split('发文机关:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].split('标')[0].lstrip().strip().replace('年', '-').replace('月',
'-').replace(
'日', '')
pub_hao = info.split('发文字号:')[1].split('发布日期:')[0].lstrip().strip()
except:
try:
organ = soup.find('div', class_='mqj_jtyst_xxnry_top_title_left_box').text.split('来源:')[1].lstrip().strip()
except:
table_list = soup.find_all('table')
for table in table_list:
if '发文机关' in table.text:
info = table.text
organ = info.split('发文机关:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].split('标')[0].lstrip().strip().replace('年', '-').replace('月', '-').replace('日', '')
pub_hao = info.split('发文字号:')[1].split('发布日期:')[0].lstrip().strip()
continue
if pub_hao == '无':
pub_hao = ''
try:
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href or '.html' in fj_href or '.shtml' in fj_href or '.htm' in fj_href:
continue
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/吉林省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
try:
a_list = soup.find('div', class_='wjfj-1026').find_all('a')
for a in a_list:
fj_href = a.get('href')
if not fj_href or '.html' in fj_href or '.shtml' in fj_href or '.htm' in fj_href:
continue
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/吉林省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data_
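# 采集吉林省人民政府REITs相关政策文件并导出Excel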
def doJob():
if not os.path.exists('./相关政策/吉林省人民政府/政策文件'):
os.makedirs('./相关政策/吉林省人民政府/政策文件')
data_list = []
num = 1
url = 'https://intellsearch.jl.gov.cn/api/data/list'
total = getTotal(url)
for page in range(1, total + 1):
data_json = getDataJson(url, page)
for data_ in data_json:
title = data_['title']
title = BeautifulSoup(title, 'lxml').find('p').text.lstrip().strip()
href = data_['url']
origin = data_['websiteName']
publishDate = data_['pubtime'].replace('/', '-')
summary = data_['content']
summary = BeautifulSoup(summary, 'lxml').find('p').text.lstrip().strip()
data = getData(num, title, href, origin, publishDate, summary)
data_list.append(data)
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./吉林省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
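# 获取政策详情页正文及发文字号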
def getContent(url):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
contentWithTag = soup.find('div', class_='zfwj_detail')
pub_hao = contentWithTag.find('p', class_='wjh').text.lstrip().strip()
content = contentWithTag.text.lstrip().strip()
return content, pub_hao
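# 通过浏览器检索辽宁省人民政府REITs政策并导出Excel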
def doJob():
url = 'https://www.ln.gov.cn/search/pcRender?pageId=7b2aa485f97e40e4a0b4b635f36eda6c'
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'conFl_con').find_elements(By.TAG_NAME, 'a')[-1].find_element(By.TAG_NAME,
'label').click()
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'search_inps').send_keys('REITs')
driver.find_element(By.CLASS_NAME, 'search_btns').click()
time.sleep(1)
div_list = driver.find_elements(By.CLASS_NAME, 'searchMod')
num = 1
data_list = []
for div in div_list:
title = div.find_element(By.TAG_NAME, 'a').text.replace('\n', '').lstrip().strip()
href = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
summary = div.find_element(By.CLASS_NAME, 'txtCon').find_element(By.TAG_NAME, 'a').text.replace('\n',
'').lstrip().strip()
publishDate = div.find_element(By.CLASS_NAME, 'dates').text.split('时间:')[1].replace('年', '-').replace('月', '-').replace('日', '').lstrip().strip()
content, pub_hao = getContent(href)
data = [num, title, publishDate, '辽宁省人民政府', href, '', '', pub_hao, summary, content, '', '']
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
driver.close()
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./辽宁省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import time
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.nmg.gov.cn',
'Pragma': 'no-cache',
'Referer': 'https://www.nmg.gov.cn/nmg_search/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
headers_ = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
if req.status_code != 200:
return ''
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
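# 计算检索结果总页数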
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum=1&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = int(req.json()['data']['total'])
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
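# 获取指定页的检索结果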
def getJson(page):
ip = baseCore.get_proxy()
url = f'https://www.nmg.gov.cn/nmsearch/trssearch/searchAll.do?siteId=32&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum={page}&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt=&isAccurate=1'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['data']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers_, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
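# 解析政策详情页,提取成文日期、正文并下载附件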
def getContent(num, data):
fjhref_list = ''
fjtitle_list = ''
title = data['title']
pub_hao = data['docno']
origin = data['sitedesc']
organ = data['publisher']
publishDate = data['docpubtime']
try:
writtenDate = data['scrq']
except:
writtenDate = ''
summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
url = data['docpuburl']
soup = getSoup(url)
if soup == '':
return ''
url_ = url.split('/')[-1]
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('div', attrs={'id': 'pare'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'zoom'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'd_show'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'class': 'zoomCon'})
if not contentWithTag:
contentWithTag = soup.find('div', attrs={'id': 'pagecontent'})
if writtenDate == '':
try:
tr_list = soup.find('table', class_='m-detailtb').find_all('tr')
for tr in tr_list:
if '成文日期' in tr.text:
writtenDate = tr.text.split('成文日期:')[1].split('发布日期:')[0].lstrip().strip()
except:
tr_list = soup.find('div', class_='main').find('table').find_all('tr')
for tr in tr_list:
if '成文时间' in tr.text:
writtenDate = tr.text.split('成文时间:')[1].lstrip().strip()
try:
contentWithTag.find('div', class_='clearfix').decompose()
except:
pass
try:
contentWithTag.find('div', class_='cc_shangxiaye').decompose()
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except Exception as e:
log.error(f'{title}====={e}')
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data_
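# 采集内蒙古自治区人民政府REITs相关政策文件并导出Excel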
def doJob():
if not os.path.exists('./相关政策/内蒙古自治区人民政府/政策文件'):
os.makedirs('./相关政策/内蒙古自治区人民政府/政策文件')
data_list = []
pageSize = getPageSize()
num = 1
for page in range(1, pageSize + 1):
data_json = getJson(page)
for data_ in data_json:
if data_['chnldesc'] == '政策文件':
data = getContent(num, data_)
if data:
data_list.append(data)
num += 1
log.info(f'{data[1]}===采集成功')
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./内蒙古自治区人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
'Content-Type': 'application/x-www-form-urlencoded',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
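# 调用检索接口获取当前页结果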
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'http://www.shandong.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '370000000088000',
'q': 'REITs',
'p': f'{page}',
'pg': '12',
'cateid': '18002',
'pos': '',
'pq': '',
'oq': '',
'eq': '',
'begin': '',
'end': '',
'tpl': '2204',
'sortFields': "[{'name':'top','clause':1},{'name':'score','clause':1}]",
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['result']
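# 解析政策详情页,提取正文并下载附件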
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='wip_art_con')
a_list = contentWithTag.find_all('a')
num_ = 1
for a in a_list:
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'http://www.shandong.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
fj_title = a.text.lstrip().strip().replace(' ', '')
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/山东省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
return content, fjtitle_list, fjhref_list
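# 从检索结果片段中提取标题、链接、发文机构等信息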
def getData(soup, num):
origin = '山东省人民政府'
organ = ''
writtenDate = ''
pub_hao = ''
try:
type = soup.find('span', class_='szf_lmmc').text
title = soup.find('div', class_='szf_title').find('a').text.lstrip().strip()
if '山东省政府文件库' in type:
summary = soup.find('div', class_='szf_ms').text.lstrip().strip()
organ = soup.find('table', class_='szf_xxgk').find_all('tr')[0].find_all('td')[1].text.lstrip().strip()
writtenDate = soup.find('table', class_='szf_xxgk').find_all('tr')[0].find_all('td')[-1].text.lstrip().strip()
pub_hao = soup.find('table', class_='szf_xxgk').find_all('tr')[1].find_all('td')[-1].text.lstrip().strip()
href = soup.find('a', class_='szf_url').text.lstrip().strip()
publishDate = soup.find('span', class_='szf_rq').text.lstrip().strip()
else:
summary = soup.find('div', class_='szf_ms').text.lstrip().strip()
href = soup.find('a', class_='szf_url').text.lstrip().strip()
publishDate = soup.find('span', class_='szf_rq').text.lstrip().strip()
except:
title = soup.find('div', class_='jcse-news-title').find('a').text.lstrip().strip()
summary = soup.find('div', class_='jcse-news-abs-content').text.lstrip().strip()
href = soup.find('div', class_='jcse-news-url').text.lstrip().strip()
publishDate = soup.find('span', class_='jcse-news-date').text.lstrip().strip()
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
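# 采集山东省人民政府REITs相关政策文件并导出Excel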
def doJob():
if not os.path.exists('./相关政策/山东省人民政府/政策文件'):
os.makedirs('./相关政策/山东省人民政府/政策文件')
data_list = []
num = 1
for page in range(1, 3):
data_json = getDataJson(page)
for data_ in data_json:
data_ = data_.replace('\\', '')
soup = BeautifulSoup(data_, 'lxml')
data = getData(soup, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
time.sleep(3)
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/山东省人民政府/山东省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import json
import os
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'text/plain',
'Host': 'ss.shanghai.gov.cn',
'Origin': 'https://www.shanghai.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
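# 调用检索接口获取结果列表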
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://ss.shanghai.gov.cn/manda-app/api/app/search/v1/1drao49/search'
data_post = {"cid": "lyHojYviSD3dOfgVFV4aGIu8Ytk7zEWy", "uid": "lyHojYviSD3dOfgVFV4aGIu8Ytk7zEWy", "query": "REITs",
"current": 1, "size": 20, "disable_correction": False,
"facets": {"fwjg": [{"type": "value", "name": "fwjg", "sort": {"count": "desc"}, "size": 100}]},
"input_type": "Input"}
data_post = json.dumps(data_post)
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']['items']
return data_json
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
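# 通过浏览器打开详情页,提取正文、印发日期及附件信息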
def getData(data_, driver, num):
fjhref_list = ''
fjtitle_list = ''
title = data_['title']['raw']
publishDate = data_['date']['raw']
origin = '上海市人民政府'
href = data_['url']['raw']
organ = data_['fwjg']['raw']
pub_hao = data_['wh']['raw']
summary = data_['content']['snippet']
driver.get(href)
time.sleep(1)
content = driver.find_element(By.CLASS_NAME, 'Article_content').text.lstrip().strip()
timeTag = driver.find_element(By.CLASS_NAME, 'PBtime').text
try:
try:
try:
writtenDate = timeTag.split('印发日期:')[1].split('发布日期')[0].lstrip().strip()
except:
writtenDate = timeTag.split('印发日期:')[1].split(f'{pub_hao}')[0].lstrip().strip()
except:
writtenDate = timeTag.split('印发日期:')[1].lstrip().strip()
except:
writtenDate = ''
try:
a_list = driver.find_element(By.CLASS_NAME, 'gaoj-list').find_elements(By.TAG_NAME, 'a')
for a in a_list:
fj_href = a.get_attribute('href')
fjhref_list += fj_href + '\n'
category = os.path.splitext(fj_href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
# fjcontent = getFjContent(href)
# file = f'./相关政策/内蒙古自治区人民政府/政策文件/{fj_title}'
# with open(file, 'wb') as f:
# f.write(fjcontent)
# log.info(f'{fj_title}===附件下载成功')
except:
pass
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
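# 采集上海市人民政府REITs相关政策文件并导出Excel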
def doJob():
if not os.path.exists('./相关政策/上海市人民政府/政策文件'):
os.makedirs('./相关政策/上海市人民政府/政策文件')
driver = baseCore.buildDriver()
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
data = getData(data_, driver, num)
log.info(f'{data[1]}===采集成功')
data_list.append(data)
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/上海市人民政府/上海市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.shanxi.gov.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.shanxi.gov.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
return soup
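# 计算检索结果总页数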
def getPageSize():
ip = baseCore.get_proxy()
url = 'http://www.shanxi.gov.cn/trs-search/trssearch/v2/searchAll.do?siteId=110&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum=1&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt='
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['data']['total']
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
return pageSize
def getJson(page):
ip = baseCore.get_proxy()
url = f'http://www.shanxi.gov.cn/trs-search/trssearch/v2/searchAll.do?siteId=110&searchTag=zc&allKeywords=REITs&fullKeywords=&orKeywords=&notKeywords=&sort=&position=0&organization=&pageNum={page}&pageSize=10&zcYear=&zcMonth=&docno=&cdesc=&publisher=&cityName=&isAlways=1&isSearchRmzfAndBgt='
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['data']['data']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
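# 解析政策详情页,提取正文并下载附件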
def getContent(num, data):
fjhref_list = ''
fjtitle_list = ''
title = data['title']
pub_hao = data['docno']
origin = data['sitedesc']
organ = data['publisher']
publishDate = data['docpubtime']
writtenDate = data['scrq']
summary = BeautifulSoup(data['zc_doccontent'], 'html.parser').text.lstrip().strip()
url = data['docpuburl']
url_ = url.split('/')[-1]
soup = getSoup(url)
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('dt', class_='fl_pc')
if not contentWithTag:
contentWithTag = soup.find('div', class_='sxgzk-detail-con')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{publishDate}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/山西省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text.lstrip().strip()
data_ = [num, title, publishDate, origin, url, writtenDate, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data_
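# 采集山西省人民政府REITs相关政策文件并导出Excel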
def doJob():
if not os.path.exists('./相关政策/山西省人民政府/政策文件'):
os.makedirs('./相关政策/山西省人民政府/政策文件')
num = 1
data_list = []
pageSize = getPageSize()
for page in range(1, pageSize + 1):
data_json = getJson(page)
for i in range(len(data_json)):
if data_json[i]['chnldesc'] == '政策文件':
data = getContent(num, data_json[i])
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./山西省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import requests
from bs4 import BeautifulSoup
from base import BaseCore
import os
import pandas as pd
import numpy as np
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
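# 调用检索接口获取结果列表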
def getDataJson():
ip = baseCore.get_proxy()
url = 'https://api.so-gov.cn/query/s'
data_post = {
'siteCode': '5100000062',
'tab': 'zcwj',
'qt': 'REITs',
'keyPlace': '0',
'sort': 'dateDesc',
'fileType': '',
'timeOption': '0',
'locationCode': '510000000000',
'page': '1',
'pageSize': '20',
'ie': 'c0e059a8-7a00-4fa9-9d70-873a5284d8a0',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['resultDocs']
return data_json
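# 解析政策详情页,提取成文日期、正文并下载正文中的图片附件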
def getContent(url, publishDate, num):
url_ = url.split('/')[-1]
url_ = url.replace(url_, '')
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
try:
writtenDate = soup.select('#szfcontentwrap2022 > div.zfwjwzcontent > div.topbox > ul > li')[3].text.split('成文日期:')[1].lstrip().strip()
except:
writtenDate = ''
try:
contentWithTag = soup.select('.contText')[0]
except:
contentWithTag = soup.select('#cmsArticleContent')[0]
img_list = contentWithTag.find_all('img')
num_ = 1
for img in img_list:
fj_href = url_ + img.get('src')
fjhref_list += fj_href + '\n'
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/四川省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
content = contentWithTag.text
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
return writtenDate, content, fjtitle_list, fjhref_list
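# 提取单条检索结果的基本信息,PDF链接直接下载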
def getData(data_, num):
title = data_['data']['title']
publishDate = data_['data']['docDate']
origin = data_['data']['siteLabel']['value']
href = data_['data']['url']
organ = data_['data']['myValues']['DOCPUBNAME']
pub_hao = data_['data']['myValues']['DOCNOVAL']
summary = ''
if '.pdf' in href or '.PDF' in href:
content = ''
writtenDate = ''
fjtitle_list = title + '.pdf'
fjhref_list = href
fjcontent = getFjContent(href)
file = f'./相关政策/四川省人民政府/政策文件/{title}.pdf'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
else:
writtenDate, content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
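# 采集四川省人民政府REITs相关政策文件并导出Excel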
def doJob():
if not os.path.exists('./相关政策/四川省人民政府/政策文件'):
os.makedirs('./相关政策/四川省人民政府/政策文件')
data_list = []
num = 1
data_json = getDataJson()
for data_ in data_json:
data = getData(data_, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/四川省人民政府/四川省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
from datetime import datetime
from urllib.parse import urljoin
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.tj.gov.cn',
'Pragma': 'no-cache',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'sec-ch-ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
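# 获取检索结果总页数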
def getTotal():
ip = baseCore.get_proxy()
url = 'https://www.tj.gov.cn/igs/front/search.jhtml?code=78778b9ded5140d4984030cf8f469303&pageNumber=1&pageSize=10&searchWord=REITs&siteId=34&sortByFocus=true&type=21515&type1=21519'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['page']['totalPages']
return int(total)
def getJson(page):
ip = baseCore.get_proxy()
url = f'https://www.tj.gov.cn/igs/front/search.jhtml?code=78778b9ded5140d4984030cf8f469303&pageNumber={page}&pageSize=10&searchWord=REITs&siteId=34&sortByFocus=true&type=21515&type1=21519'
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.json()['page']['content']
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
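# 解析政策详情页,提取正文并下载附件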
def getContent(num, title, pub_time, origin, organ, url, pub_hao, summary):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
url_ = url.split('/')[-1]
soup = paserUrl(soup, url.replace(url_, ''))
contentWithTag = soup.find('div', class_='article_content')
try:
contentWithTag.find('div', class_='articlePlayer').decompose()
except:
pass
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
try:
a_list = contentWithTag.find('div', class_='qt-attachments').find_all('a')
for a in a_list:
href = a.get('href')
fjhref_list += href + '\n'
category = os.path.splitext(href)[1]
fj_title = f'{num}-{pub_time}-{a.text.lstrip().strip()}'
if '<' in fj_title or '>' in fj_title:
fj_title = fj_title.replace('<', '').replace('>', '')
if category not in fj_title:
fj_title = fj_title + category
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(href)
file = f'./相关政策/天津市人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{title}===附件下载成功')
except:
pass
try:
contentWithTag.find('div', class_='qt-attachments').decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
fjtitle_list = fjtitle_list.lstrip().strip()
fjhref_list = fjhref_list.lstrip().strip()
data = [num, title, pub_time, origin, url, pub_time, organ, pub_hao, summary, content, fjtitle_list, fjhref_list]
return data
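# 采集天津市人民政府REITs相关政策文件并导出Excel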
def doJob():
if not os.path.exists('./相关政策/天津市人民政府/政策文件'):
os.makedirs('./相关政策/天津市人民政府/政策文件')
data_list = []
total = getTotal()
num = 1
for page in range(1, total + 1):
data_json = getJson(page)
for i in range(len(data_json)):
title = data_json[i]['title']
pub_time = datetime.strptime(data_json[i]['trs_time'], "%Y-%m-%dT%H:%M:%S.%f%z").date()
origin = data_json[i]['trs_site']
organ = data_json[i]['department']
href = data_json[i]['url']
pub_hao = data_json[i]['wh']
summary = ''
data = getContent(num, title, pub_time, origin, organ, href, pub_hao, summary)
data_list.append(data)
log.info(f'{title}===采集成功')
num += 1
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./天津市人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import os
import re
import time
import datetime
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
def getSoup(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
@retry(tries=3, delay=5)
def getFjContent(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
return req.content
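# 解析政策详情页,提取正文并下载附件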
def getContent(url, publishDate, num):
fjhref_list = ''
fjtitle_list = ''
soup = getSoup(url)
contentWithTag = soup.find('div', class_='content')
if not contentWithTag:
contentWithTag = soup.find('div', class_='TRS_UEDITOR')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
content = contentWithTag.text.lstrip().strip()
num_ = 1
a_list = contentWithTag.find_all('a')
for a in a_list:
fj_title = a.text.lstrip().strip()
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'https://www.yn.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
num_ += 1
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
try:
a_list = soup.find('ul', class_='apfile').find_all('a')
for a in a_list:
fj_title = a.text.lstrip().strip()
fj_href = a.get('href')
if 'http' not in fj_href:
fj_href = 'https://www.yn.gov.cn' + fj_href
fjhref_list += fj_href + '\n'
if fj_title == '':
fj_title = str(num_)
num_ += 1
category = os.path.splitext(fj_href)[1]
if category not in fj_title:
fj_title = fj_title + category
fj_title = f'{num}-{publishDate}-{fj_title}'
fjtitle_list += fj_title + '\n'
fjcontent = getFjContent(fj_href)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
if os.path.exists(file):
fj_title = fj_title.replace(category, f'-{num_}{category}')
num_ += 1
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
except:
pass
return content, fjtitle_list, fjhref_list
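# 从检索结果条目中提取标题、发文机构、发文字号等信息,PDF链接直接下载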
def getData(div, num):
pattern = r"\d{4}-\d{2}-\d{2}"
title = div.find_element(By.CLASS_NAME, 'title').find_element(By.CLASS_NAME, 'fontlan').get_attribute(
'title').lstrip().strip()
href = div.find_element(By.CLASS_NAME, 'fontlan').get_attribute('href')
origin = '云南省人民政府'
try:
publishDate = re.findall(pattern, div.find_element(By.CLASS_NAME, 'content').text)[0]
except:
publishDate = ''
try:
organ = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')[1].find_element(By.CLASS_NAME, 'txt').text.lstrip().strip()
pub_hao = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')[0].find_element(By.CLASS_NAME, 'txt').text.lstrip().strip()
if pub_hao == '无':
pub_hao = ''
except:
organ = ''
pub_hao = ''
summary = ''
writtenDate = ''
if '.pdf' in href or '.PDF' in href:
content = ''
fjhref_list = href
fj_title = title + '.pdf'
fjcontent = getFjContent(fjhref_list)
file = f'./相关政策/云南省人民政府/政策文件/{fj_title}'
with open(file, 'wb') as f:
f.write(fjcontent)
log.info(f'{fj_title}===附件下载成功')
fjtitle_list = fj_title
else:
content, fjtitle_list, fjhref_list = getContent(href, publishDate, num)
data = [num, title, publishDate, origin, href, writtenDate, organ, pub_hao, summary, content, fjtitle_list,
fjhref_list]
return data
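# 通过浏览器检索云南省人民政府REITs政策并导出Excel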
def doJob():
if not os.path.exists('./相关政策/云南省人民政府/政策文件'):
os.makedirs('./相关政策/云南省人民政府/政策文件')
data_list = []
url = 'https://sheng.so-gov.cn/s?siteCode=5300000033&qt=REITs'
driver = baseCore.buildDriver()
driver.get(url)
time.sleep(2)
num = 1
for type in range(3, 5):
driver.find_elements(By.XPATH, '/html/body/div/div[6]/div[2]/div[3]/ul/li')[type].click()
time.sleep(2)
if type == 3:
driver.find_element(By.ID, 'key_place_context_id').click()
time.sleep(2)
try:
total = int(driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')[-2].text)
except:
total = 1
for page in range(total):
time.sleep(2)
div_list = driver.find_elements(By.XPATH, '//*[@id="results"]/div')
for div in div_list:
data = getData(div, num)
data_list.append(data)
log.info(f'{data[1]}===采集成功')
num += 1
try:
driver.find_element(By.CLASS_NAME, 'pagination').find_element(By.CLASS_NAME, 'next').click()
except:
pass
df = pd.DataFrame(np.array(data_list))
df.columns = ['序号', '标题', '发布时间', '来源', '原文链接', '发文时间', '发文机构', '发文字号', '摘要', '正文', '附件名称', '附件连接']
df.to_excel('./相关政策/云南省人民政府/云南省人民政府政策文件.xlsx', index=False)
if __name__ == '__main__':
doJob()
baseCore.close()
import time
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
@retry(tries=3,delay=10)
def getPageSize():
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
'pg': '10',
'p': '1',
'tpl': '1569',
'cateid': '372',
'word': 'REITs',
'checkError': '1',
'isContains': '1',
'q': 'REITs',
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
total = req.json()['total']
if total % 10 == 0:
pageSize = total // 10
else:
pageSize = total // 10 + 1
req.close()
return pageSize
@retry(tries=3,delay=10)
def getDataJson(page):
ip = baseCore.get_proxy()
url = 'https://search.zj.gov.cn/jsearchfront/interfaces/cateSearch.do'
data_post = {
'websiteid': '330000000000000',
'pg': '10',
'p': f'{page}',
'tpl': '1569',
'cateid': '372',
'word': 'REITs',
'checkError': '1',
'isContains': '1',
'q': 'REITs',
'pos': 'content,filenumber',
'sortType': '1',
}
req = requests.post(url, headers=headers, data=data_post, proxies=ip)
req.encoding = req.apparent_encoding
data_json = req.json()['result']
return data_json
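# 解析当前页检索结果,提取标题、链接、发布机构等信息(目前仅输出日志,未汇总导出)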
def getDatas(page):
data_json = getDataJson(page)
for data_ in data_json:
soup = BeautifulSoup(data_, 'lxml')
title = soup.find('div', class_='titleWrapper').find('a', class_='textTitle').text.lstrip().strip().replace(' ','').replace('\r\n',' ')
href = soup.find('div', class_='titleWrapper').find('a', class_='textTitle').get('href')
href = href.split('url=')[1].split('.html')[0].replace('%3A',':').replace('%2F','/') + '.html'
try:
info = soup.find('table', class_='fgwj_table_list').text
organ = info.split('发布机构:')[1].split('成文日期:')[0].lstrip().strip()
writtenDate = info.split('成文日期:')[1].lstrip().strip()
except:
organ = ''
writtenDate = None
origin = soup.find('div', class_='sourceTime').text.split('来源:')[1].split('时间:')[0].lstrip().strip().replace(' ','').replace(' ', '').replace('\r\n', '')
publishDate = soup.find('div', class_='sourceTime').text.split('时间:')[1].lstrip().strip()
log.info(origin)
time.sleep(5)
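# 遍历所有页,采集浙江省人民政府REITs政策检索结果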
def doJob():
pageSize = getPageSize()
for page in range(1, pageSize + 1):
datas = getDatas(page)
if __name__ == '__main__':
doJob()
# url = 'http%3A%2F%2Fwww.zj.gov.cn%2Fart%2F2022%2F4%2F18%2Fart_1229630461_2401403.html'
# req = requests.get(url,headers=headers)
# req.encoding = req.apparent_encoding
baseCore.close()
import os
@@ -136,7 +136,7 @@ class Policy():
policy = Policy()
#国家发展和改革委员会 https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform():
def reform(wb,file_path):
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
@@ -171,12 +171,6 @@ def reform():
title = info['title']
summary = info['summary'].replace('<em>','').replace('</em>','')
newsUrl = info['url']
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
@@ -245,7 +239,6 @@ def reform():
publishDate = ''
policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
policy.deletek(contentWithTag)
content = contentWithTag.text
try:
policy.paserUrl(newssoup,newsUrl)
@@ -357,12 +350,11 @@ def zhengquanqihuo():
total = pageUtil['rowCount']
page_size = pageUtil['pageSize']
Max_page = int(total / page_size)
# DataList = []
DataList = []
num = 0
webname = '证券期货法规数据库系统'
# path = 'data/证监会'
# if not os.path.exists(path):
# os.makedirs(path)
path = 'data/证监会'
if not os.path.exists(path):
os.makedirs(path)
for page in range(0, Max_page+1):
payload_page = {
'pageNo': page + 1,
@@ -380,7 +372,6 @@ def zhengquanqihuo():
data_page = policy.requestPost(headers, url, payload_page)
info_list = data_page['pageUtil']['pageList']
for info in info_list:
id_list = []
num += 1
try:
title = info['secFutrsLawName']
@@ -391,12 +382,6 @@ def zhengquanqihuo():
# print(publishDate)
secFutrsLawId = info['secFutrsLawId']
newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
# 根据链接判重
is_member = baseCore.r.sismember('REITs::' + webname, newsUrl)
if is_member:
continue
browser = policy.createDriver()
browser.get(newsUrl)
time.sleep(1)
@@ -429,25 +414,18 @@ def zhengquanqihuo():
'createDate': time_now,
'sid': '1729030277461815298',
}
try:
baseCore.sendkafka(dic_info, topic)
baseCore.r.sadd('REITs::' + webname, newsUrl)
log.info(f'采集成功--{title}--{newsUrl}')
except:
for att_id in id_list:
baseCore.deliteATT(att_id)
# DataList.append(dic_info)
# sheet_name = "证监会"
# if sheet_name in wb.sheetnames:
# log.info(f"{sheet_name}工作表已存在!")
# else:
# # 创建新工作表
# wb.create_sheet(sheet_name)
# print(f"{sheet_name}新工作表创建完成!")
# # 保存Excel文件
# wb.save(file_path)
#
# baseCore.writerToExcel(DataList, file_path, sheet_name)
DataList.append(dic_info)
sheet_name = "证监会"
if sheet_name in wb.sheetnames:
log.info(f"{sheet_name}工作表已存在!")
else:
# 创建新工作表
wb.create_sheet(sheet_name)
print(f"{sheet_name}新工作表创建完成!")
# 保存Excel文件
wb.save(file_path)
baseCore.writerToExcel(DataList, file_path, sheet_name)
except Exception as e:
log.info(f"error!!!{num}")
log.info({e})
@@ -472,10 +450,9 @@ def sse(wb,file_path):
total_page = result['data']['totalPage']
DataList = []
num = 0
webname = '上海证券交易所'
# path = 'data/上海交易所'
# if not os.path.exists(path):
# os.makedirs(path)
path = 'data/上海交易所'
if not os.path.exists(path):
os.makedirs(path)
for page in range(0, int(total_page)):
url_page = f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'
data = policy.getrequest_json(headers, url_page)
@@ -516,10 +493,10 @@ def sse(wb,file_path):
content += page.get_text()
file_href = newsUrl
file_name = title
policy.attuributefile(title, newsUrl, num, publishDate)
rename_file = f'{str(num)}_{publishDate}_{file_name}'
fu_jian_name += rename_file + '\n'
fu_jian_href += file_href + '\n'
policy.downloadfile(file_href, f'{path}/{rename_file}')
dic_info = {
'序号': num,
'标题': title,
@@ -603,6 +580,100 @@ def sse(wb,file_path):
baseCore.writerToExcel(DataList, file_path, sheet_name)
#北京市人民政府 https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
url = 'https://www.beijing.gov.cn/so/ss/query/s'
payload = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': '1',
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '148',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
'Host': 'www.beijing.gov.cn',
'Origin': 'https://www.beijing.gov.cn',
'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
result = policy.requestPost(headers, url, payload)
total = result['totalHits']
page_size = result['currentHits']
Max_page = int(total / page_size)
for page in range(0, Max_page):
payload_page = {
'siteCode': '1100000088',
'tab': 'zcfg',
'qt': 'REITs',
'sort': 'relevance',
'keyPlace': '0',
'locationCode': '110000000000',
'page': page + 1,
'pageSize': '20',
'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
}
data = policy.requestPost(headers, url, payload_page)
info_list = data['resultDocs']
# print(info_list)
for info_ in info_list:
info = info_['data']
title = info['titleO']
titleLabel = info['titleLabel']['value']
publishDate = info['docDate']
# source = info['siteLabel']['value']
newsUrl = info['url']
if titleLabel == '政策解读':
newssoup = policy.getrequest_soup(headers, newsUrl)
print(newssoup)
contentWithTag = newssoup.find('div', id='mainText')
content = contentWithTag.text
source = newssoup.select('p[class="fl"]>span')[1].replace('来源:', '')
dic_info = {
'title': title,
'publishDate': publishDate,
'source': source,
'newsUrl': newsUrl,
}
formatRows = info['formatRows']
num = 1
for row in formatRows:
for col in row['col']:
name = col['text']
if name == '相关附件':
value = col['value']
file_href = value.keys()
file_name = value.values()
# 附件上传
policy.attuributefile(file_name, file_href, num, publishDate)
num += 1
dic_info['file_href'] = file_href
value = col['value'][0]
dic_info[name] = value
# print(dic_info)
# break
# 河北省人民政府
def hebei():
path = 'data/河北省人民政府'
@@ -807,6 +878,10 @@ def hebei():
baseCore.writerToExcel(DataList, file_path, sheet_name)
break
# 广东省人民政府
def guangdong():
pass
# 贵州省人民政府
def guizhou():
@@ -915,12 +990,12 @@ def guizhou():
if __name__=="__main__":
# file_path = f'data/REITs贵州省人民政府.xlsx'
# wb = policy.createfile(file_path)
# reform()
file_path = f'data/REITs贵州省人民政府.xlsx'
wb = policy.createfile(file_path)
# reform(wb,file_path)
# shenzhen()
zhengquanqihuo()
# sse()
# hebei()
# guizhou()
#guizhou()
# zhengquanqihuo()
\ No newline at end of file