Commit a98a7ff0 Author: LiuLiYuan

lly 01/11

Parent 08e4725c
# -*- coding: UTF-8 -*-
import os
import pandas as pd
from obs import ObsClient
from base import BaseCore
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
from flask import Flask, request, jsonify
from gevent import pywsgi
app = Flask(__name__)
# 获取excel表格,解析内容送入redis
@app.route('/enterprise/readExcel', methods=['GET'])
def getExcel():
baseCore = BaseCore.BaseCore()
r = baseCore.r
log = baseCore.getLogger()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
filePath = request.args.get('filePath')
data = getDF(filePath, obsClient)
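# Each row is serialized below as 'col1|col2|...|' (NaN cells become empty slots) and pushed to the domestic or overseas redis queue according to the 国内外 flag.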
for i in range(data.shape[0]):
data_ = ''
for j in range(len(data.iloc[i])):
if pd.isna(data.iloc[i][j]):
data_ += f'|'
else:
data_ += f'{data.iloc[i][j]}|'
if data.iloc[i]['国内外(1-国内;0-国外)'] == 1 or data.iloc[i]['国内外(1-国内;0-国外)'] == '1':
r.rpush('BaseInfoEnterprise:gnqy_socialCode', data_)
elif data.iloc[i]['国内外(1-国内;0-国外)'] == 0 or data.iloc[i]['国内外(1-国内;0-国外)'] == '0':
r.rpush('BaseInfoEnterprise:gwqy_socialCode', data_)
r.rpush('BaseInfoEnterprise:gnqy_socialCode', 'end')
r.rpush('BaseInfoEnterprise:gwqy_socialCode', 'end')
log.info('文件接收成功')
obsClient.close()
baseCore.close()
return jsonify({'success': 'success'})
@app.route('/enterprise/getInfo', methods=['GET'])
def getInfo():
baseCore = BaseCore.BaseCore()
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
gpdm = request.args.get('securitiesCode')
xydm = request.args.get('socialCode')
name = request.args.get('name')
ename = request.args.get('englishName')
place = request.args.get('place')
if not xydm or not place:
baseCore.close()
return jsonify({'error':'缺少参数'})
if not name or not ename:
baseCore.close()
return jsonify({'error':'缺少参数'})
sql = f"select * from EnterpriseInfo where SocialCode='{xydm}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
if str(place) == '1':
if gpdm:
sql_up = f"UPDATE EnterpriseInfo set SecuritiesCode='{gpdm}',CompanyName='{name}',Place={place} WHERE SocialCode='{xydm}'"
cursor.execute(sql_up)
cnx.commit()
log.info(f'更新企业信息==={xydm}')
else:
sql_up = f"UPDATE EnterpriseInfo set CompanyName='{name}',Place={place} WHERE SocialCode='{xydm}'"
cursor.execute(sql_up)
cnx.commit()
log.info(f'更新企业信息==={xydm}')
elif str(place) == '0':
if gpdm:
sql_up = f"UPDATE EnterpriseInfo set SecuritiesCode='{gpdm}',CompanyName='{ename}',Place={place} WHERE SocialCode='{xydm}'"
cursor.execute(sql_up)
cnx.commit()
log.info(f'更新企业信息==={xydm}')
else:
log.error(f'{xydm}===国内外标志错误')
baseCore.close()
return jsonify({'error':f'{xydm}===国内外标志错误'})
else:
if str(place) == '1':
if gpdm:
sql_up = f"INSERT INTO EnterpriseInfo (SocialCode,SecuritiesCode,CompanyName,Place) VALUE ('{xydm}','{gpdm}','{name}','{place}')"
cursor.execute(sql_up)
cnx.commit()
log.info(f'新增企业信息==={xydm}')
else:
sql_up = f"INSERT INTO EnterpriseInfo (SocialCode,CompanyName,Place) VALUE ('{xydm}','{name}','{place}')"
cursor.execute(sql_up)
cnx.commit()
log.info(f'新增企业信息==={xydm}')
elif str(place) == '0':
if gpdm:
sql_up = f"INSERT INTO EnterpriseInfo (SocialCode,SecuritiesCode,CompanyName,Place) VALUE ('{xydm}','{gpdm}','{ename}','{place}')"
cursor.execute(sql_up)
cnx.commit()
log.info(f'新增企业信息==={xydm}')
else:
log.error(f'{xydm}===国内外标志错误')
baseCore.close()
return jsonify({'error':f'{xydm}===国内外标志错误'})
baseCore.close()
return jsonify({'success': 'success'})
def getDF(filePath, obsClient):
response = obsClient.getObject('zzsn', filePath, loadStreamInMemory=True)
df = pd.read_excel(response['body']['buffer'])
return df
if __name__ == '__main__':
server = pywsgi.WSGIServer(('0.0.0.0',5000),app)
server.serve_forever()
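# Usage sketch (assumptions: the service runs locally on port 5000; the file path, codes and names below are placeholders, not real data):
# import requests
# requests.get('http://127.0.0.1:5000/enterprise/readExcel', params={'filePath': 'dir/enterprise_list.xlsx'})
# requests.get('http://127.0.0.1:5000/enterprise/getInfo',
#              params={'socialCode': '91110000XXXXXXXXXX', 'securitiesCode': '600000',
#                      'name': '某企业', 'englishName': 'Example Co.', 'place': '1'})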
......@@ -124,9 +124,6 @@ def doJob():
except Exception as e:
log.error(f'第{page}页==={title}===失败')
time.sleep(2)
df = pd.DataFrame(np.array(info_list))
df.columns = ['证券代码', '证券简称', '公告标题', '发布时间', '公告网址', '来源', '来源网址']
df.to_excel('./市场板块/深圳交易所基金公告_2.xlsx', index=False)
if __name__ == '__main__':
......
import datetime
import os
import time
import uuid
from urllib.parse import unquote, urljoin
import pymongo
import requests
from bs4 import BeautifulSoup
from fitz import fitz
from obs import ObsClient
from retry import retry
from base import BaseCore
from requests.packages.urllib3 import disable_warnings
disable_warnings()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsFundAnncmnt']
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
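# e.g. convert_size(1536000) -> '1.46 MB' (divides by 1024 until the value drops below 1024 or units run out)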
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
if response.status_code != 200:
raise requests.exceptions.RequestException(f'unexpected status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
retData['content'] += page.get_text()
except:
self.log.error(f'文件解析失败')
return retData
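# PDFs whose extracted text contains the message checked below are typically XFA/placeholder documents with no real text layer; they are marked as handled (state=True) but returned before any OBS upload.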
if 'If this message is not eventually replaced by the proper contents of the document, your PDF' in retData[
'content']:
retData['state'] = True
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
log.info(f'{file_name}===obs上传成功')
return retData
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
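# e.g. urljoin('https://example.com/announcements/', 'doc.pdf') -> 'https://example.com/announcements/doc.pdf' (illustrative URL)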
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
def getCodeList():
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
code_list = []
url = 'https://api.sgx.com/securities/v1.1?params=nc%2Cn%2Ctype%2Cls%2Cm%2Csc%2Cbl%2Csip%2Cex%2Cej%2Cclo%2Ccr%2Ccur%2Cel%2Cr%2Ci%2Ccc%2Cig%2Clf'
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
data_json = req.json()['data']['prices']
for data_ in data_json:
type = data_['type']
if type == 'reits':
TradingCode = data_['nc']
TradingName = data_['n']
code_list.append([TradingCode, TradingName])
req.close()
return code_list
def getData(code, name, obsOperate, data):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
path_list = []
content = []
url = data['url']
title = data['title'].split('::')[0].strip()
date = datetime.datetime.strptime(data['submission_date'], '%Y%m%d')
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
soup = paserUrl(soup, url)
contentWithTag = soup.find('div', class_='announcement')
dd_list = contentWithTag.find('dl', class_='announcement-attachment-list').find_all('dd')
num = 1
for dd in dd_list:
try:
href = dd.find('a').get('href')
except:
continue
if not href:
continue
file_title = title + '.pdf'
retData = obsOperate.uptoOBS(href, code, file_title)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{title}===公告下载obs失败')
continue
if 'If this message is not eventually replaced by the proper contents of the document, your PDF' not in retData['content']:
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
path_list.append(full_path)
content.append(retData['content'])
num += 1
if len(path_list) == 1:
dic_news = {
'code': code, # 代码
'name': name, # 简称
'title': title, # 名称
'path': path_list[0], # obs路径
'href': url, # 原文链接
'content': content[0], # pdf解析内容
'date': date, # 时间
'strDate': str(date)[:10], # 时间 字符串
'exchange': '新加坡交易所' # 交易所
}
elif len(path_list) > 1:
dic_news = {
'code': code, # 代码
'name': name, # 简称
'title': title, # 名称
'path': path_list[0], # obs路径
'href': url, # 原文链接
'content': content[0], # pdf解析内容
'date': date, # 时间
'strDate': str(date)[:10], # 时间 字符串
'exchange': '新加坡交易所', # 交易所
'pathList': path_list # 附件集合
}
else:
dic_news = {}
req.close()
return dic_news
def getDataJson(code):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Authorizationtoken': '3ZvSQB8eN/rzx9RVgRBlUMr1Q8vf/mzVVGAAJEo67btKmiM25Tl8WAs5w3dYfM+5mQkSGylxOTCevcn7LwxJSjs5nFi4Pm6KHZjZ1XNgvJPOJ4XF9xO3NmXFvf395xd1',
}
now = datetime.datetime.now().strftime('%Y%m%d')
url = f'https://api.sgx.com/announcements/v1.1/securitycode?periodstart=20191204_000000&periodend={now}_235959&cat=ANNC&value={code}&exactsearch=true&pagestart=0&pagesize=250'
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
data_json = req.json()['data']
req.close()
return data_json
def doJob(obsOperate):
try:
code_list = getCodeList()
except:
log.error(f'代码列表获取失败')
return
for codes in code_list:
code = codes[0]
name = codes[1]
log.info(f'{code}===开始采集')
try:
data_json = getDataJson(code)
except Exception as e:
log.error(f'{code}===信息列表获取失败')
continue
for data_ in data_json:
is_insert = db_storage.find_one({'code': code, 'href': data_['url'], 'exchange': '新加坡交易所'})
if is_insert:
log.info(f'{code}==={data_["url"]}===已采集')
time.sleep(3)
continue
dic_info = getData(code, name, obsOperate, data_)
if dic_info:
db_storage.insert_one(dic_info)
log.info(f'{code}==={data_["url"]}===采集成功')
time.sleep(3)
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import datetime
import json
import os
import re
import time
import uuid
from urllib.parse import unquote
import pymongo
import redis
from bs4 import BeautifulSoup
from fitz import fitz
from obs import ObsClient
from retry import retry
from base import BaseCore
import requests
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsFundAnncmnt']
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
r = baseCore.r
log = baseCore.getLogger()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
# 'cookie': 'OptanonAlertBoxClosed=2023-08-29T09:50:42.503Z; AMCV_DD0356406298B0640A495CB8%40AdobeOrg=179643557%7CMCIDTS%7C19599%7CMCMID%7C90834671594036426831047706481131374722%7CvVersion%7C5.5.0; sclang=zh-CN; OptanonConsent=isGpcEnabled=0&datestamp=Wed+Aug+30+2023+13%3A52%3A11+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202303.2.0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0004%3A1%2CC0002%3A1&geolocation=CN%3BHA&AwaitingReconsent=false;',
'Content-Type': 'application/x-www-form-urlencoded'
}
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
# 'cookie': 'OptanonAlertBoxClosed=2023-08-29T09:50:42.503Z; AMCV_DD0356406298B0640A495CB8%40AdobeOrg=179643557%7CMCIDTS%7C19599%7CMCMID%7C90834671594036426831047706481131374722%7CvVersion%7C5.5.0; sclang=zh-CN; OptanonConsent=isGpcEnabled=0&datestamp=Wed+Aug+30+2023+13%3A52%3A11+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202303.2.0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0004%3A1%2CC0002%3A1&geolocation=CN%3BHA&AwaitingReconsent=false;',
'Content-Type': 'application/x-www-form-urlencoded'
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
if response.status_code != 200:
raise requests.exceptions.RequestException(f'unexpected status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
retData['content'] += page.get_text()
except:
self.log.error(f'文件解析失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
@retry(tries=3,delay=30)
def getToken():
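# The URL popped from redis is expected to carry three query parameters (illustrative shape: ...?token=<token>&...&callback=jQuery...&...&_=<timestamp>); the regexes below extract them and percent-encode '+' and '/' inside the token.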
tokenUrl = r.spop('cookie:HKEXurl')
tokenUrl = tokenUrl.decode('utf-8')
token = re.findall('token=(.*?)&',tokenUrl)[0].replace('+','%2b').replace('/','%2f')
callback = re.findall('callback=(.*?)&',tokenUrl)[0]
time_ = re.findall('_=(\d+)',tokenUrl)[0]
return token,callback,time_
def getID(code):
url = 'https://www1.hkexnews.hk/search/titlesearch.xhtml?lang=zh'
data_post = {
'current_page': '1',
'stock_market': 'HKEX',
'rdo_SelectSortBy': 'DateTime',
'txt_stock_code': f'{code}',
'rdo_SelectDocType': '',
'sel_DocTypePrior2006': '-1',
'sel_DocTypeAfter2006': '',
'sel_tier_1': '-2',
'sel_tier_2': '-2',
'sel_tier_2_group': '-2',
'IsFromNewList': False,
'txtKeyWord': '',
'sel_DateOfReleaseFrom_d': '01',
'sel_DateOfReleaseFrom_m': '04',
'sel_DateOfReleaseFrom_y': '1999',
'sel_DateOfReleaseTo_d': '04',
'sel_DateOfReleaseTo_m': '12',
'sel_DateOfReleaseTo_y': '2023',
}
req = requests.post(url, headers=headers, data=data_post)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
id = soup.find('form', attrs={'id': 'j_idt5'}).find('input', attrs={'id': 'stockId'}).get('value')
total = soup.find('div', class_='component-loadmore-leftPart__container').text
total = int(re.findall('共有\s+(\d+)\s+紀錄', total)[0])
req.close()
return id, total
def getCodeList():
token,callback,time_ = getToken()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Accept': '*/*',
'Referer': 'https://www.hkex.com.hk/'
}
code_list = []
url = f'https://www1.hkex.com.hk/hkexwidget/data/getreitfilter?lang=chi&token={token}&sort=5&order=0&qid=1701506928884&callback={callback}&_={time_}'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall('\((.*?)\)', req.text)[0]
data_json = json.loads(data_json)['data']['stocklist']
req.close()
for data_ in data_json:
code = data_['sym'].rjust(5, '0')
name = baseCore.hant_2_hans(data_['nm'])
ric = data_['ric']
code_list.append([code, name,ric])
req.close()
return code_list
def getJson(id,page):
date = datetime.datetime.today().strftime('%Y%m%d')
url = f'https://www1.hkexnews.hk/search/titleSearchServlet.do?sortDir=0&sortByOptions=DateTime&category=0&market=SEHK&stockId={id}&documentType=-1&fromDate=19990401&toDate={date}&title=&searchType=0&t1code=-2&t2Gcode=-2&t2code=-2&rowRange={page}&lang=zh'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = json.loads(req.json()['result'])
req.close()
return data_json
def doJob(obsOperate):
code_list = getCodeList()
for codes in code_list:
code = codes[0]
name = codes[1]
ric = codes[2]
id, total = getID(code)
num = 1
log.info(f'开始采集==={name}==={code}===共{total}条数据')
for page in range(100, total + 100, 100):
data_json = getJson(id,page)
for data_ in data_json:
title = baseCore.hant_2_hans(data_['TITLE'])
date = data_['DATE_TIME']
date = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M')
href = 'https://www1.hkexnews.hk' + data_['FILE_LINK']
file_title = title + '.pdf'
is_insert = db_storage.find_one({'code': code, 'date': date,'href': href,'exchange':'香港交易所'})
if is_insert:
log.info(f'{code}==={title}===已采集')
num += 1
continue
retData = obsOperate.uptoOBS(href, ric, file_title)
time.sleep(2)
if not retData['state']:
log.error(f'{code}==={title}===公告下载obs失败')
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str(date)[:10])
num += 1
dic_info = {
'code': code, # 代码
'name': name, # 基金名称
'title': title, # 题目
'path': full_path, # 文件osb位置
'href': href, # 原文链接
'content': retData['content'], # pdf解析内容
'date': date, # 时间(datetime 类型)
'strDate': str(date)[:10], # 时间(字符串类型)
'exchange': '香港交易所', # 交易所
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={title}===采集成功')
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import datetime
import re
import time
import numpy as np
import pandas as pd
import pymongo
import requests
import os
import json
import uuid
from urllib.parse import unquote
from fitz import fitz
from kafka import KafkaProducer
from obs import ObsClient
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsFundAnncmnt']
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
headers_ = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'www.sse.com.cn',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
if response.status_code != 200:
raise requests.exceptions.RequestException(f'unexpected status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
retData['content'] += page.get_text()
except:
self.log.error(f'文件解析失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
# 获取json数据
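# The SSE endpoint returns JSONP such as jsonpCallback42283({...}); the regex below strips the callback wrapper so the payload can be parsed with json.loads.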
@retry(tries=5, delay=15)
def getJson(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall('\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# 获取总页数
def getTotal():
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# 获取pdf文件的基本信息
def getDataList(page):
info_list = []
url = f'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback42283&sqlId=REITS_BULLETIN&isPagination=true&fundCode=&startDate=&endDate=&pageHelp.pageSize=25&pageHelp.cacheSize=1&pageHelp.pageNo={page}&pageHelp.beginPage={page}&pageHelp.endPage=5&_={int(time.time())}'
data_json = getJson(url)['result']
for data in data_json:
name = data['fundExtAbbr']
title = data['title']
pub_time = data['sseDate']
code = data['securityCode']
href = 'http://www.sse.com.cn' + data['url'].replace('\\', '')
info_list.append([title, pub_time, href, name, code])
return info_list
def doJob(obsOperate):
total = getTotal()
log.info(f'共{total}页')
num = 0
for page in range(1, total + 1):
log.info(f'开始采集第{page}页')
try:
info_list = getDataList(page)
except Exception as e:
log.error(f'第{page}页数据获取失败==={e}')
time.sleep(5)
continue
for info in info_list:
title = info[0]
pub_time = info[1]
href = info[2]
name = info[3]
code = info[4]
is_insert = db_storage.find_one({'code': code, 'href': href, 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{title}===已采集')
time.sleep(2)
continue
file_title = title + '.pdf'
retData = obsOperate.uptoOBS(href, f'{code}.SH', file_title)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{title}===公告下载obs失败')
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, pub_time)
dic_news = {
'code': code, # 代码
'name': name, # 简称
'title': title, # 名称
'path': full_path, # obs路径
'href': href, # 原文链接
# 'content':content,
'content':retData['content'], # pdf解析内容
'date': datetime.datetime.strptime(pub_time, '%Y-%m-%d'), # 时间
'strDate': pub_time[:10], # 时间 字符串
'exchange': '上海证券交易所' # 交易所
}
# print(dic_news)
num += 1
try:
db_storage.insert_one(dic_news)
log.info(f'{title}===采集成功')
except:
log.error(f'{title}===入库失败')
time.sleep(4)
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import datetime
import re
import time
import numpy as np
import pandas as pd
import pymongo
import requests
import os
import json
import uuid
from urllib.parse import unquote
from fitz import fitz
from kafka import KafkaProducer
from obs import ObsClient
from retry import retry
from base import BaseCore
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsFundAnncmnt']
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/disc/announcement/annList'
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
if response.status_code != 200:
raise requests.exceptions.RequestException(f'unexpected status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
retData['content'] += page.get_text()
except:
self.log.error(f'文件解析失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
# 获取代码列表
def getCodeList():
code_list = []
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
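# selectJjlb is the URL-encoded fund category '基础设施基金' (infrastructure funds), i.e. the REITs product list.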
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_list = req.json()[0]['data']
for data_ in data_list:
code = re.findall('<u>(.*?)</u>', data_['sys_key'])[0]
code_list.append(code)
req.close()
return code_list
# 获取总页数
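# e.g. announceCount=120 with 50 records per page -> 3 pages; announceCount=100 -> 2 pages (ceiling division below).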
def getPageSize(id):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": 1}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post)
req.encoding = req.apparent_encoding
total = int(req.json()['announceCount'])
if total % 50 == 0:
pageSize = int(total / 50)
else:
pageSize = int(total / 50) + 1
req.close()
return pageSize
# 获取json数据
def getDataList(id, page):
data_post = {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"], "pageSize": 50,
"pageNum": page}
data_post = json.dumps(data_post)
ip = baseCore.get_proxy()
req = requests.post(url, headers=headers, data=data_post)
req.encoding = req.apparent_encoding
data_list = req.json()['data']
req.close()
return data_list
def doJob(obsOperate):
code_list = getCodeList()
for code in code_list:
pageSize = getPageSize(code)
log.info(f'{code}===共{pageSize}页')
for page in range(1, pageSize + 1):
log.info(f'开始采集第{page}页')
try:
data_list = getDataList(code, page)
except:
log.error(f'第{page}页数据获取失败')
time.sleep(5)
continue
num = 1
for data in data_list:
title = data['title']
name = data['secName'][0]
file_title = title + '.pdf'
pub_time = data['publishTime']
year = pub_time[:4]
href = 'http://www.szse.cn/api/disc/info/download?id=' + data['id']
is_insert = db_storage.find_one({'code': code, 'href': href, 'exchange': '深圳证券交易所'})
if is_insert:
log.info(f'{title}===已采集')
time.sleep(2)
break
retData = obsOperate.uptoOBS(href, '', file_title)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{code}==={title}===公告下载obs失败')
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, pub_time)
num += 1
dic_news = {
'code': code, # 代码
'name': name, # 简称
'title': title, # 名称
'path': full_path, # obs路径
'href': href, # 原文链接
'content': retData['content'], # pdf解析内容
'date': datetime.datetime.strptime(pub_time, '%Y-%m-%d %H:%M:%S'), # 时间
'strDate': pub_time, # 时间 字符串
'exchange': '深圳证券交易所' # 交易所
}
try:
db_storage.insert_one(dic_news)
log.info(f'{title}===采集成功')
except:
log.error(f'{title}===入库失败')
time.sleep(2)
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import os
import re
import time
import uuid
from urllib.parse import urljoin, unquote
from fitz import fitz
from obs import ObsClient
from retry import retry
import BaseCore
import requests
from bs4 import BeautifulSoup
from requests.packages.urllib3 import disable_warnings
disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
enMonth = {
'January': '01',
'February': '02',
'March': '03',
'April': '04',
'May': '05',
'June': '06',
'July': '07',
'August': '08',
'September': '09',
'October': '10',
'November': '11',
'December': '12'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
}
webname = 'Nareit官网'
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.reit.com/',
'Sec-Ch-Ua': '"Microsoft Edge";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, full_path):
sel_sql = '''select id from clb_sys_attachment where full_path=%s '''
self.cursor_.execute(sel_sql, (full_path,))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(full_path)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
if response.status_code != 200:
raise requests.exceptions.RequestException(f'unexpected status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_title, publishDate):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
for page in doc.pages():
retData['content'] += page.get_text()
except:
self.log.error(f'文件解析失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['state'] = True
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
def paserUrl(html, listurl):
# 获取所有的<a>标签和<img>标签
if isinstance(html, str):
html = BeautifulSoup(html, 'html.parser')
links = html.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return html
@retry(tries=5, delay=10)
def getSoup(url):
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
req.close()
return soup
@retry(tries=5, delay=10)
def getImg(url):
req = requests.get(url, headers=headers, verify=False)
content = req.content
return content
def getContentA(url):
soup = getSoup(url)
soup = paserUrl(soup, url)
contentWithTag = soup.find('div', class_='node__content')
try:
scripts = contentWithTag.find_all('script')
for script in scripts:
script.decompose()
except:
pass
try:
styles = contentWithTag.find_all('style')
for style in styles:
style.decompose()
except:
pass
img_list = contentWithTag.find_all('img')
for img in img_list:
src = img.get('src')
img_title = img.get('alt') + '.jpg'
img_title = img_title.replace('/', '-')
content = getImg(src)
with open(f'./img/{img_title}', 'wb') as f:
f.write(content)
time.sleep(3)
def getContentB(url):
pass
@retry(tries=5, delay=10)
def getList():
url = 'https://www.reit.com/data-research/research/nareit-research'
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
li_list = soup.find('div', class_='paragraph--text-block__inner').find_all('li')
req.close()
return li_list
def doJob():
li_list = getList()
num = 1
for li in li_list:
# log.info(f'开始采集')
title = li.find('a').text.strip()
summary = li.text.replace(title, '').replace('(PDF)', '').strip()
href = 'https://www.reit.com' + li.find('a').get('href')
if '.pdf' in href:
getContentB(href)
else:
getContentA(href)
time.sleep(10)
class Third_party():
def __init__(self):
pass
def doJob(self,obsOperate):
id_list = []
origin = 'Nareit官网'
url = 'https://www.reit.com/data-research/research/third-party-research'
soup = getSoup(url)
li_list = soup.find('div', class_='field--name-field-text').find_all('li')
num = 2
# for li in li_list:
li = li_list[1]
title = li.find('a').text.strip()
href = li.find('a').get('href')
date = li.text.split(title)[0]
# try:
year = re.findall('\d+', date)[0]
# except:
# continue
month = enMonth[date.split(year)[0].strip()]
publishDate = year + '-' + month + '-' + '01'
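# e.g. 'November 2023' -> '2023-11-01'; the day is fixed to the 1st since only month and year are parsed from the listing.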
summary = li.text.split(title)[1].strip()
# file_title = title + ".pdf"
soup_ = getSoup(href)
soup_ = paserUrl(soup_,href)
contentWithTag = soup_.find('div',class_='content')
a_list = contentWithTag.find_all('a')
for a in a_list:
file_title = a.get('title')
fj_href = a.get('href')
retData = obsOperate.uptoOBS(fj_href, '', file_title, publishDate)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{title}===研报下载obs失败')
continue
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, publishDate)
num += 1
id_list.append(att_id)
content = contentWithTag.text.strip()
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
id = '1729021644139057153' + str(int(time.time()))
lang = baseCore.detect_language(retData['content'])
dic_news = {
'id': id,
'subjectId': '1729021644139057153',
'checkStatus': 1,
'deleteFlag': 0,
'topNum': 0,
# 'content': retData['content'],
'content':content,
'contentWithTag': str(contentWithTag),
'createDate': time_now,
'lang': lang,
'origin': origin,
'publishDate': publishDate,
'sourceAddress': href,
'title': title,
'summary': summary,
'attachmentIds': id_list,
'sid': '1730477904990486529',
}
try:
baseCore.sendkafka(dic_news, 'research_center_fourth')
baseCore.r.sadd('REITs::' + webname, href)
log.info(f'{title}===采集成功')
num += 1
except Exception as e:
log.error(f'{title}===发送kafka失败==={e}')
if __name__ == '__main__':
obsOperate = obsOperate(cursor_,cnx_,log)
Third_party().doJob(obsOperate)
baseCore.close()
\ No newline at end of file
import datetime
import time
import pandas as pd
import pymongo
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from retry import retry
from requests.packages.urllib3 import disable_warnings
import BaseCore
disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'RETIsProdQuot']
db_storage_PO = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsProdOverview']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
}
@retry(tries=5, delay=20)
def getCodeList():
code_list = []
url = 'https://api.sgx.com/securities/v1.1?excludetypes=bonds&params=nc%2Cadjusted-vwap%2Cbond_accrued_interest%2Cbond_clean_price%2Cbond_dirty_price%2Cbond_date%2Cb%2Cbv%2Cp%2Cc%2Cchange_vs_pc%2Cchange_vs_pc_percentage%2Ccx%2Ccn%2Cdp%2Cdpc%2Cdu%2Ced%2Cfn%2Ch%2Ciiv%2Ciopv%2Clt%2Cl%2Co%2Cp_%2Cpv%2Cptd%2Cs%2Csv%2Ctrading_time%2Cv_%2Cv%2Cvl%2Cvwap%2Cvwap-currency'
req = requests.get(url, headers=headers, verify=False)
req.encoding = req.apparent_encoding
data_json = req.json()['data']['prices']
for data in data_json:
if data['type'] == 'reits':
code = data['nc']
code_list.append(code)
req.close()
return code_list
def getDataJson(code):
url = f'https://api.sgx.com/securities/v1.1/charts/historic/reits/code/{code}/1w'
req = requests.get(url,headers=headers,verify=False)
req.encoding = req.apparent_encoding
data_json = req.json()['data']['historic']
req.close()
return data_json
def doJob():
code_list = getCodeList()
for code in code_list:
data_json = getDataJson(code)
for data_ in data_json:
date = datetime.datetime.strptime(data_['trading_time'], '%Y%m%d_%H%M%S')
country = db_storage_PO.find_one({'code':code,'exchange':'新加坡交易所'})['country']
is_insert = db_storage.find_one({'code':code,'date':date})
if is_insert:
log.info(f'{code}===已采集')
continue
name = data_['n']
opening = data_['o']
max = data_['h']
min = data_['l']
closed = data_['lt']
ytdClosed = data_['pv']
volume = float(data_['vl'])*1000
amount = data_['v']
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'opening': float(opening), # 开盘价
'max': float(max), # 最高价
'min': float(min), # 最低价
'closed': float(closed), # 收盘价
'ytdClosed': float(ytdClosed), # 前收价
'volume': float(volume), # 交易量
'amount': float(amount), # 交易金额
'totalValue': '', # 市价总值
'negoValue': '', # 流通总值
'toRate': '', # 换手率
'date': date, # 时间
'strDate': str(date)[:10],
'country': country, # 国家
'exchange': '新加坡交易所', # 交易所
'currency':'SGD'# 币种
}
try:
db_storage.insert_one(dic_info)
log.info(f'{code}==={name}==={date}===采集成功')
except:
log.error(f'{code}==={name}==={date}===保存失败')
time.sleep(2)
def task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每天执行一次
scheduler.add_job(doJob, 'cron', hour='18', minute=0, max_instances=2 )
try:
scheduler.start()
except Exception as e:
log.error(f'定时采集异常==={e}')
pass
if __name__ == '__main__':
doJob()
#task()
\ No newline at end of file
import datetime
import json
import re
import time
import pymongo
import requests
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept':'*/*',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'RETIsProdQuot']
def getList():
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'Accept': '*/*',
'Referer': 'https://www.hkex.com.hk/'
}
url = f'https://www1.hkex.com.hk/hkexwidget/data/getreitfilter?lang=chi&token=evLtsLsBNAUVTPxtGqVeGyhVsyUra3j5V4IkrZKhkiaD97FD%2bFaZmy%2f2JkHt6s0m&sort=5&order=0&qid={int(time.time())*1000}&callback=jQuery3510399252964830082_1702532830368&'
req = requests.get(url,headers=headers)
req.encoding = req.apparent_encoding
data_json = re.findall('\((.*?)\)', req.text)[0]
data_json = json.loads(data_json)['data']['stocklist']
req.close()
return data_json
def getDataJson(ric):
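# Each element of 'datalist' is expected to be a list shaped as [timestamp_ms, open, high, low, close, volume, amount]; doJob below indexes it in exactly that order.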
url = f'https://www1.hkex.com.hk/hkexwidget/data/getchartdata2?hchart=1&span=6&int=6&ric={ric}&token=evLtsLsBNAUVTPxtGqVeG1W4%2fDlQ5x7gUilJ3XOCHQYqDv29Qh%2f5bHDJA2BUz3YL&qid={int(time.time())*1000}&callback=jQuery35105943072391434097_1702533112873'
req = requests.get(url,headers=headers)
req.encoding = req.apparent_encoding
data_list = json.loads(re.findall('\((.*?)\)',req.text)[0])['data']['datalist']
return data_list
# https://www.hkex.com.hk/Market-Data/Securities-Prices/Real-Estate-Investment-Trusts?sc_lang=zh-HK
def doJob():
try:
info_json = getList()
except:
log.error(f'列表信息获取失败')
return
for info in info_json:
code = info['sym']
name = baseCore.hant_2_hans(info['nm'])
ric = info['ric']
currency = info['ccy']
try:
data_list = getDataJson(ric)
except:
log.error(f'{code}==={name}===数据获取失败')
continue
data_list = data_list[-11:]
# for i in range(len(data_list)):
for data in data_list:
date = datetime.datetime.fromtimestamp(int(data[0])/1000)
opening = data[1]
if not opening:
continue
is_insert = db_storage.find_one({'code':code,'strDate':str(date)[:10],'exchange':'香港交易所'})
if is_insert:
log.info(f'{code}==={name}===已采集')
continue
# if i == 1:
# ytdClosed = 0
# else:
if date.isoweekday() == 1:
yesterday = date - datetime.timedelta(days=3)
else:
yesterday = date - datetime.timedelta(days=1)
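# If the previous calendar day has no stored close (e.g. a public holiday), keep stepping back one day until a record is found.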
while True:
try:
ytdClosed = db_storage.find_one({'code':code,'strDate':f'{str(yesterday)[:10]}','exchange':'香港交易所'})['closed']
break
except:
yesterday = yesterday - datetime.timedelta(days=1)
max = data[2]
min = data[3]
closed = data[4]
volume = data[5]
amount = data[6]
dic_info = {
'code': code, # 代码
'shortName': name, # 简称
'opening': float(opening), # 开盘价
'max': float(max), # 最高价
'min': float(min), # 最低价
'closed': float(closed), # 收盘价
'ytdClosed': float(ytdClosed), # 前收价
'volume': float(volume), # 交易量
'amount': float(amount), # 交易金额
'totalValue': '', # 市价总值
'negoValue': '', # 流通总值
'toRate': '', # 换手率
'date': date, # 时间
'strDate': str(date)[:10],
'country': '中国', # 国家
'exchange': '香港交易所', # 交易所
'currency':currency# 币种
}
try:
db_storage.insert_one(dic_info)
log.info(f'{code}==={name}==={date}===采集成功')
except:
log.error(f'{code}==={name}==={date}===入库失败')
time.sleep(1)
if __name__ == '__main__':
doJob()
baseCore.close()
\ No newline at end of file
import re
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pymongo
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'RETIsProdQuot']
log = baseCore.getLogger()
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
# 获取基金代码与上市时间
@retry(tries=3, delay=3)
def getData():
data_list = []
# ip = baseCore.get_proxy()
url = 'https://reits.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=reits_fund_list&PAGENO=1&PAGESIZE=10'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data']
for data_ in data_json:
jjjcurl = re.findall('<u>(.*?)</u>', data_['jjjcurl'])[0].lstrip().strip()
sys_key = data_['sys_key'].lstrip().strip()
ssrq = data_['ssrq'].lstrip().strip()
# 基金简称 基金代码 上市时间
data = [jjjcurl, sys_key, ssrq]
data_list.append(data)
req.close()
return data_list
# 获取基金交易信息
@retry(tries=3, delay=20)
def getDataList(code, start_date, end_date):
ip = baseCore.get_proxy()
archiveDate = datetime.today().strftime('%Y-%m') + '-01'
url = f'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1815_stock_snapshot&TABKEY=tab2&txtDMorJC={code}&txtBeginDate={str(start_date)[:10]}&txtEndDate={str(end_date)[:10]}&archiveDate={archiveDate}'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = req.json()[0]['data'][::-1]
req.close()
for data_ in data_json:
jyrq = data_['jyrq']
zqdm = data_['zqdm']
zqjc = data_['zqjc']
qss = data_['qss'].replace(',', '')
ks = data_['ks'].replace(',', '')
zg = data_['zg'].replace(',', '')
zd = data_['zd'].replace(',', '')
ss = data_['ss'].replace(',', '')
cjgs = data_['cjgs'].replace(',', '')
cjje = data_['cjje'].replace(',', '')
jyrq = datetime.strptime(jyrq, '%Y-%m-%d')
is_insert = db_storage.find_one({'code': zqdm, 'date': jyrq, 'exchange': '深圳证券交易所'})
if is_insert:
log.info(f'{code}==={jyrq}===已采集')
continue
dic_info = {
'code': zqdm, # 代码
'shortName': zqjc, # 简称
'opening': float(ks), # 开盘价
'max': float(zg), # 最高价
'min': float(zd), # 最低价
'closed': float(ss), # 收盘价
'ytdClosed': float(qss), # 前收价
'volume': float(cjgs), # 交易量
'amount': float(cjje), # 交易金额
'totalValue': '', # 市价总值
'negoValue': '', # 流通总值
'toRate': '', # 换手率
'date': jyrq, # 时间
'strDate' : str(jyrq)[:10], # 字符串 时间
'country': '中国', # 国家
'exchange': '深圳证券交易所', # 交易所
"currency": "CNY" # 币种
}
db_storage.insert_one(dic_info)
log.info(f'{code}==={jyrq}===采集成功')
time.sleep(3)
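# 用法示意:getDataList(code, start_date, end_date) 按日期正序遍历区间内行情,
# 已入库的交易日跳过,其余组装成 dic_info 写入 MongoDB 的 RETIsProdQuot 集合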
def doJob():
try:
data_list = getData()
except Exception as e:
log.error(f'基金列表获取失败==={e}')
return
log.info('开始采集')
for data in data_list:
name = data[0]
code = data[1]
log.info(f'{code}==={name}===开始采集')
# start_date = data[2]
# start_date = datetime.strptime(start_date, "%Y-%m-%d")
current_date = datetime.now()
start_date = current_date + timedelta(days=-5)
# end_date = start_date + timedelta(days=5)
# while end_date != current_date:
# time.sleep(1)
try:
# getDataList(code, start_date, end_date)
getDataList(code,start_date,current_date)
except Exception as e:
log.error(f'{code}==={start_date}-{current_date}===采集失败==={e}')
# start_date = end_date + timedelta(days=1)
# end_date = start_date + timedelta(days=5)
# if end_date > current_date:
# end_date = current_date
def task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每天执行一次
scheduler.add_job(doJob, 'cron', hour='18', minute=0, max_instances=2 )
try:
scheduler.start()
except Exception as e:
log.error('定时采集异常', e)
pass
if __name__ == '__main__':
# doJob()
task()
import hashlib
import json
import os
import re
import time
import uuid
import pymongo
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from obs import ObsClient
from retry import retry
from base import BaseCore
from urllib.parse import unquote
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsProjDynamics']
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'static.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
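        # 示例:convert_size(1536) -> "1.50 KB",convert_size(1048576) -> "1.00 MB"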
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
        if response.status_code != 200:
            raise requests.exceptions.HTTPError(f'status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
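    # 下载远程文件并以 uuid 重命名后上传到 OBS 桶 zzsn 的 PolicyDocument/ 目录;任一步失败时返回的 retData['state'] 保持 False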
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
def md5_encrypt(text):
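    # 对文本做 MD5 摘要,用于为每个原始权益人生成 sponsorMD5 字段;例如 md5_encrypt('abc') 返回 '900150983cd24fb0d6963f7d28e17f72'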
# 创建 MD5 对象
md5 = hashlib.md5()
# 更新 MD5 对象内容
md5.update(text.encode('utf-8'))
# 获取加密结果
encrypted_text = md5.hexdigest()
return encrypted_text
# 获取json数据
def getJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
    data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
return data_json
# 获取总页数
def getTotal():
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage=1&_={int(time.time())}'
data_json = getJson(url)
total = int(data_json['pageHelp']['pageCount'])
return total
# 获取基金id列表
def getInfoList(page):
info_list = []
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback51800&isPagination=true&bond_type=4&sqlId=COMMON_SSE_ZCZZQXMLB&pageHelp.pageSize=25&status=&begin=&end=&pageHelp.cacheSize=1&pageHelp.pageNo=1&pageHelp.beginPage={page}&_={int(time.time())}'
data_json = getJson(url)
data_json = data_json['result']
for data in data_json:
id = data['BOND_NUM']
type = data['REITS_TYPE']
if type == '0':
info_list.append([id, '首次发售'])
elif type == '1':
info_list.append([id, '扩募发售'])
else:
info_list.append([id, '-'])
return info_list
# 获取项目基本信息
def getBaseInfo(id, type):
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback72929&isPagination=false&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX&_={int(time.time()) * 1000}'
try:
data_ = getJson(url)['result'][0]
except:
log.error(f'{id}===项目基本信息获取失败')
return {}
if data_['AUDIT_STATUS'] == '0':
audit_status = '已申报'
elif data_['AUDIT_STATUS'] == '1':
audit_status = '已受理'
elif data_['AUDIT_STATUS'] == '2':
audit_status = '已反馈'
elif data_['AUDIT_STATUS'] == '3':
audit_status = '已接收反馈意见'
elif data_['AUDIT_STATUS'] == '4':
audit_status = '通过'
elif data_['AUDIT_STATUS'] == '5':
audit_status = '未通过'
elif data_['AUDIT_STATUS'] == '8':
audit_status = '终止'
elif data_['AUDIT_STATUS'] == '901':
audit_status = '承销商/管理人超期中止'
elif data_['AUDIT_STATUS'] == '9':
audit_status = '中止'
elif data_['AUDIT_STATUS'] == '10':
audit_status = '已回复交易所意见'
elif data_['AUDIT_STATUS'] == '111':
audit_status = '提交注册'
elif data_['AUDIT_STATUS'] == '12':
audit_status = '注册生效'
else:
audit_status = '-'
if data_['BOND_TYPE'] == '4':
bond_type = '基础设施公募REITs'
else:
bond_type = '其它'
strUpdateDate = data_['PUBLISH_DATE']
if strUpdateDate == '-':
updateDate = ''
else:
updateDate = datetime.strptime(data_['PUBLISH_DATE'], '%Y-%m-%d')
strAccDate = data_['ACCEPT_DATE']
if strAccDate == '-':
accDate = ''
else:
accDate = datetime.strptime(data_['ACCEPT_DATE'], '%Y-%m-%d')
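    # LIST1(原始权益人)可能包含多个主体,按逗号或分号拆分后逐一计算 MD5,便于按单个权益人检索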
sponsors = []
sponsorMD5 = []
    if '，' in data_['LIST1']:
        for sponsor in data_['LIST1'].split('，'):
            sponsors.append(sponsor)
elif ',' in data_['LIST1']:
for sponsor in data_['LIST1'].split(','):
sponsors.append(sponsor)
elif ';' in data_['LIST1']:
for sponsor in data_['LIST1'].split(';'):
sponsors.append(sponsor)
else:
sponsors.append(data_['LIST1'])
for sponsor in sponsors:
sponsorMD5.append(md5_encrypt(sponsor))
data = {
'name': data_['AUDIT_NAME'],
'variety': bond_type,
'sponsor': data_['LIST1'],
'sponsors':sponsors,
'sponsorMD5':sponsorMD5,
'caretaker': data_['PRIORITY_MANAGER'],
'planName': data_['PRIORITY_NAME'],
'planCaretaker': data_['LIST2'],
'letterNum': data_['REG_APRV_WEN_HAO'],
'status': audit_status,
'updateDate': updateDate,
'strUpdateDate': strUpdateDate,
'accDate': accDate,
'strAccDate': strAccDate,
'type': type,
}
return data
# 获取信息披露文件及备查文件
def getFile(obsOperate, id, mongoId):
data = {
'fundContracts': {},
'fundCustodianAgt': {},
'prospectus': {},
'legalOpinion': {},
}
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback56354&isPagination=false&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX_XXPLWJ_ZGSMS&_={int(time.time()) * 1000}'
try:
data_json = getJson(url)['result']
except:
log.error(f'{id}===信息披露文件及备查文件获取失败')
return 0
for data_ in data_json:
type = data_['FILE_TYPE']
paths = data_['FILE_PATH'].split('|')
versions = data_['FILE_VERSION'].split('|')
times = data_['FILE_TIME'].split('|')
names = data_['FILE_TITLE'].split('|')
data_ = {
'declaration': '',
'declarationTime': '',
'strDeclarationTime': '',
'fdbkResp': '',
'fdbkRespTime': '',
'strFdbkRespTime': '',
'cover': '',
'coverTime': '',
'strCoverTime': '',
}
for i in range(len(paths)):
path = paths[i]
version = versions[i]
time_ = times[i]
file_name = names[i]
path = 'http://static.sse.com.cn/bond' + path
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
if int(version) == 1:
data_['declaration'] = full_path
data_['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data_['strDeclarationTime'] = time_
elif int(version) == 2:
data_['fdbkResp'] = full_path
data_['fdbkRespTime'] = datetime.strptime(time_, '%Y-%m-%d')
data_['strFdbkRespTime'] = time_
elif int(version) == 3:
data_['cover'] = full_path
data_['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data_['strCoverTime'] = time_
if type == '91':
data_['title'] = '基金合同'
data['fundContracts'] = data_
elif type == '92':
data_['title'] = '基金托管协议'
data['fundCustodianAgt'] = data_
elif type == '93':
data_['title'] = '招募说明书'
data['prospectus'] = data_
elif type == '94':
data_['title'] = '基金法律意见书'
data['legalOpinion'] = data_
log.info(f'{id}===信息披露文件及备查文件采集成功')
return data
# 获取反馈意见及回复
def getFeedback(obsOperate, id, mongoId):
data = []
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback27109&isPagination=true&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX_FKXX_ALL&_={int(time.time()) * 1000}'
try:
data_json = getJson(url)['result']
except:
log.error(f'{id}===反馈意见及回复获取失败')
return 0
for data_ in data_json:
num = data_['NUM']
path = data_['FILE_PATH']
path = 'http://static.sse.com.cn/bond' + path
title = data_['FILE_TITLE']
updateTime = data_['UPD_TIME']
category = os.path.splitext(path)[1]
if category not in title:
title = title + category
retData = obsOperate.uptoOBS(path, mongoId, title)
time.sleep(2)
if retData['state']:
pass
else:
            log.error(f'{id}===反馈意见及回复文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', title, num, updateTime)
data_ = {
'num': num,
'title': title,
'updateTime': datetime.strptime(updateTime, '%Y-%m-%d'),
'strUpdateTime': updateTime,
'filePath': full_path,
}
data.append(data_)
log.info(f'{id}===反馈意见及回复采集成功')
return data
def updateFilePath(obsOperate,paths,versions,times,names,mongoData,mongoId,type):
data = {}
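    # FILE_VERSION 约定:1 -> declaration,2 -> fdbkResp,3 -> cover;文件时间与库内记录一致时复用已有 obs 地址,否则重新上传并登记附件表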
for i in range(len(paths)):
path = paths[i]
version = versions[i]
time_ = times[i]
file_name = names[i]
path = 'http://static.sse.com.cn/bond' + path
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
if int(version) == 1:
try:
if time_ == mongoData['file'][f'{type}']['strDeclarationTime'] :
full_path = mongoData['file'][f'{type}']['declaration']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
except:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
data['declaration'] = full_path
data['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['strDeclarationTime'] = time_
elif int(version) == 2:
try:
if time_ == mongoData['file'][f'{type}']['strFdbkRespTime'] :
full_path = mongoData['file'][f'{type}']['fdbkResp']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
except:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
data['fdbkResp'] = full_path
data['fdbkRespTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['strFdbkRespTime'] = time_
elif int(version) == 3:
try:
if time_ == mongoData['file'][f'{type}']['strCoverTime']:
full_path = mongoData['file'][f'{type}']['cover']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
except:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, i, time_)
data['cover'] = full_path
data['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['strCoverTime'] = time_
return data
def updateFile(obsOperate, id, mongoId, mongoData):
data = {
'fundContracts': {},
'fundCustodianAgt': {},
'prospectus': {},
'legalOpinion': {},
}
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback56354&isPagination=false&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX_XXPLWJ_ZGSMS&_={int(time.time()) * 1000}'
try:
data_json = getJson(url)['result']
except:
log.error(f'{id}===信息披露文件及备查文件获取失败')
return 0
for data_ in data_json:
type = data_['FILE_TYPE']
paths = data_['FILE_PATH'].split('|')
versions = data_['FILE_VERSION'].split('|')
times = data_['FILE_TIME'].split('|')
names = data_['FILE_TITLE'].split('|')
if type == '91':
data_2 = updateFilePath(obsOperate,paths,versions,times,names,mongoData,mongoId,'fundContracts')
data_['title'] = '基金合同'
data['fundContracts'] = data_2
elif type == '92':
data_2 = updateFilePath(obsOperate,paths,versions,times,names,mongoData,mongoId,'fundCustodianAgt')
data_['title'] = '基金托管协议'
data['fundCustodianAgt'] = data_2
elif type == '93':
data_2 = updateFilePath(obsOperate,paths,versions,times,names,mongoData,mongoId,'prospectus')
data_['title'] = '招募说明书'
data['prospectus'] = data_2
elif type == '94':
data_2 = updateFilePath(obsOperate,paths,versions,times,names,mongoData,mongoId,'legalOpinion')
data_['title'] = '基金法律意见书'
data['legalOpinion'] = data_2
log.info(f'{id}===信息披露文件及备查文件采集成功')
return data
def updateFeedback(obsOperate, id, mongoId, mongoData):
data = []
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback27109&isPagination=true&audit_id={id}&sqlId=COMMON_SSE_ZCZZQXMXXXX_FKXX_ALL&_={int(time.time()) * 1000}'
try:
data_json = getJson(url)['result']
except:
log.error(f'{id}===反馈意见及回复获取失败')
return 0
for data_ in data_json:
num = data_['NUM']
path = data_['FILE_PATH']
path = 'http://static.sse.com.cn/bond' + path
title = data_['FILE_TITLE']
updateTime = data_['UPD_TIME']
category = os.path.splitext(path)[1]
if category not in title:
title = title + category
flg = True
for feedback_ in mongoData['feedback']:
if updateTime == feedback_['strUpdateTime'] and title == feedback_['title']:
data.append(feedback_)
flg = False
break
if flg:
retData = obsOperate.uptoOBS(path, mongoId, title)
time.sleep(2)
if retData['state']:
pass
else:
                log.error(f'{id}===反馈意见及回复文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', title, num, updateTime)
data_ = {
'num': num,
'title': title,
'updateTime': datetime.strptime(updateTime, '%Y-%m-%d'),
'strUpdateTime': updateTime,
'filePath': full_path,
}
data.append(data_)
log.info(f'{id}===反馈意见及回复采集成功')
return data
def doJob(obsOperate):
log.info('采集开始')
try:
total = getTotal()
except:
log.error(f'总页数获取失败')
return
for page in range(1, total + 1):
log.info(f'开始采集第{page}页')
try:
info_list = getInfoList(page)
except:
log.error(f'第{page}页数据获取失败')
continue
for info in info_list:
id = info[0]
type = info[1]
baseInfo = getBaseInfo(id, type)
is_insert = db_storage.find_one({'baseInfo.name': f'{baseInfo["name"]}', 'exchange': '上海证券交易所'})
if is_insert:
mongoId = is_insert['_id']
updateDate = is_insert['baseInfo']['strUpdateDate']
if updateDate == baseInfo['strUpdateDate']:
log.info(f'{baseInfo["name"]}===已采集,不需要更新')
time.sleep(5)
continue
else:
log.info(f'{id}===已采集,需要更新,正在更新中')
createDate = is_insert['createDate']
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
file = updateFile(obsOperate, id, str(mongoId), is_insert)
feedback = updateFeedback(obsOperate, id, str(mongoId), is_insert)
dic_info = {
'baseInfo': baseInfo,
'file': file,
'feedback': feedback,
'exchange': '上海证券交易所',
'updateDate': now,
'createDate': createDate,
}
try:
db_storage.update_one({'_id': mongoId}, {'$set': dic_info})
log.info(f'{id}===更新成功')
except:
log.error(f'{id}===更新失败')
else:
dic_info = {
'baseInfo': baseInfo,
'exchange': '上海证券交易所'
}
try:
db_storage.insert_one(dic_info)
log.info(f'{baseInfo["name"]}===初次入库成功')
except:
log.error(f'{baseInfo["name"]}===初次入库失败')
continue
mongoId = db_storage.find_one({'baseInfo.name': f'{baseInfo["name"]}', 'exchange': '上海证券交易所'})['_id']
file = getFile(obsOperate, id, str(mongoId))
feedback = getFeedback(obsOperate, id, str(mongoId))
if not baseInfo or file == 0 or feedback == 0:
db_storage.delete_one({'_id': mongoId})
log.error(f'{id}===数据采集不完整,初次入库数据已删除')
continue
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
dic_info = {
'baseInfo': baseInfo,
'file': file,
'feedback': feedback,
'exchange': '上海证券交易所',
'updateDate':now, # 修改日期
'createDate':now # 创建日期
}
try:
db_storage.update_one({'_id': mongoId}, {'$set': dic_info})
log.info(f'{id}===保存成功')
except:
db_storage.delete_one({'_id': mongoId})
log.error(f'{id}===二次入库失败,已删除')
time.sleep(3)
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import hashlib
from datetime import datetime
import re
import time
import uuid
from urllib.parse import unquote
import numpy as np
import pandas as pd
import pymongo
import requests
import os
import json
from obs import ObsClient
from retry import retry
from base import BaseCore
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
baseCore = BaseCore.BaseCore()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsProjDynamics']
log = baseCore.getLogger()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json;charset=utf-8',
'Host': 'reits.szse.cn',
'Pragma': 'no-cache',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Requested-With': 'XMLHttpRequest',
}
class obsOperate():
def __init__(self, cursor_, cnx_, log):
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'reportdocs.static.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://reits.szse.cn/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
self.cursor_ = cursor_
self.cnx_ = cnx_
self.log = log
def secrchATT(self, item_id, file_name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, file_name, num, pub_time):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, path, 'zzsn', pub_time)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.log.info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, file_name, type_id, order_by)
id = selects[0]
return id, full_path
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
@retry(tries=5, delay=10)
def getRes(self, file_href):
response = requests.get(file_href, headers=self.headers)
        if response.status_code != 200:
            raise requests.exceptions.HTTPError(f'status code: {response.status_code}')
return response
@retry(tries=5, delay=10)
def sendOBS(self, file_name, response):
result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
return result
def uptoOBS(self, file_href, item_id, file_name):
category = os.path.splitext(file_href)[1]
retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
'create_time': '', 'page_size': '', 'content': ''}
try:
response = self.getRes(file_href)
except:
self.log.error('文件获取失败')
return retData
file_size = int(response.headers.get('Content-Length'))
file_name = str(self.getuuid()) + category
try:
result = self.sendOBS(file_name, response)
except:
self.log.error(f'obs上传失败')
return retData
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
except Exception as e:
print(f'error:{e}')
return retData
return retData
def md5_encrypt(text):
# 创建 MD5 对象
md5 = hashlib.md5()
# 更新 MD5 对象内容
md5.update(text.encode('utf-8'))
# 获取加密结果
encrypted_text = md5.hexdigest()
return encrypted_text
@retry(tries=5, delay=10)
def getPageSize():
# ip = baseCore.get_proxy()
url = 'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex=0&pageSize=10'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
total = int(req.json()['totalSize'])
if total % 10 == 0:
pageSize = int(total / 10)
else:
pageSize = int(total / 10) + 1
req.close()
return pageSize
@retry(tries=5, delay=10)
def getDataJson(page):
# ip = baseCore.get_proxy()
url = f'http://reits.szse.cn/api/reits/projectrends/query?biztypsb=21&bizType=2&pageIndex={page}&pageSize=10'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = req.json()['data']
req.close()
return data_json
@retry(tries=5, delay=10)
def getDataJson_(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
data_json = req.json()['data']
req.close()
return data_json
def getBaseInfo(data):
cmpnm = data['cmpnm']
specialPlanName = data['specialPlanName']
issueTargetName = data['issueTargetName']
primitiveInterestsor = data['primitiveInterestsor']
acctfm = data['acctfm']
sprinst = data['sprinst']
lawfm = data['lawfm']
biztypsbName = data['biztypsbName']
prjst = data['prjst']
updtdt = data['updtdt']
acptdt = data['acptdt']
sponsors = []
sponsorMD5 = []
    if '，' in primitiveInterestsor:
        for sponsor in primitiveInterestsor.split('，'):
            sponsors.append(sponsor)
elif ',' in primitiveInterestsor:
for sponsor in primitiveInterestsor.split(','):
sponsors.append(sponsor)
elif ';' in primitiveInterestsor:
for sponsor in primitiveInterestsor.split(';'):
sponsors.append(sponsor)
else:
sponsors.append(primitiveInterestsor)
for sponsor in sponsors:
sponsorMD5.append(md5_encrypt(sponsor))
baseInfo = {
'name': cmpnm,
'variety': issueTargetName,
'sponsor': primitiveInterestsor,
'sponsors':sponsors,
'sponsorMD5':sponsorMD5,
'caretaker': acctfm,
'planName': specialPlanName,
'planCaretaker': sprinst,
'letterNum': '',
'status': prjst,
'updateDate': datetime.strptime(updtdt, '%Y-%m-%d'),
'strUpdateDate': updtdt,
'accDate': datetime.strptime(acptdt, '%Y-%m-%d'),
'strAccDate': acptdt,
'type': biztypsbName,
}
log.info(f'{cmpnm}===项目基本信息采集成功')
return baseInfo
def getFile(obsOperate, data_json, mongoId):
data = {
'fundContracts': {'title': '基金合同', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'fundCustodianAgt': {'title': '基金托管协议', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'prospectus': {'title': '基金招募说明书', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'legalOpinion': {'title': '基金法律意见书', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
}
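    # disclosureMaterials 中 type=1 的附件写入 declaration 相关字段,type=3 写入 cover 相关字段;matnm 决定归入 fundContracts/fundCustodianAgt/prospectus/legalOpinion 哪一组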
num = 1
for data_json_ in data_json['disclosureMaterials']:
type = int(data_json_['type'])
path = 'http://reportdocs.static.szse.cn' + data_json_['dfpth']
version = data_json_['matnm']
time_ = data_json_['ddt']
file_name = data_json_['dfnm']
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
num += 1
if version == '基金招募说明书':
if type == 1:
data['prospectus']['declaration'] = full_path
data['prospectus']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['prospectus']['strDeclarationTime'] = time_
elif type == 3:
data['prospectus']['cover'] = full_path
data['prospectus']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['prospectus']['strCoverTime'] = time_
elif version == '基金合同':
if type == 1:
data['fundContracts']['declaration'] = full_path
data['fundContracts']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundContracts']['strDeclarationTime'] = time_
elif type == 3:
data['fundContracts']['cover'] = full_path
data['fundContracts']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundContracts']['strCoverTime'] = time_
elif version == '基金托管协议':
if type == 1:
data['fundCustodianAgt']['declaration'] = full_path
data['fundCustodianAgt']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundCustodianAgt']['strDeclarationTime'] = time_
elif type == 3:
data['fundCustodianAgt']['cover'] = full_path
data['fundCustodianAgt']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundCustodianAgt']['strCoverTime'] = time_
elif version == '基金法律意见书':
if type == 1:
data['legalOpinion']['declaration'] = full_path
data['legalOpinion']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
                data['legalOpinion']['strDeclarationTime'] = time_
elif type == 3:
data['legalOpinion']['cover'] = full_path
data['legalOpinion']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
                data['legalOpinion']['strCoverTime'] = time_
log.info(f'{mongoId}===信息披露文件及备查文件采集成功')
return data
def getFeedback(obsOperate, data, mongoId):
feedback = []
data_json = data['enquiryResponseAttachment']
num = 1
for data_json_ in data_json:
path = 'http://reportdocs.static.szse.cn' + data_json_['dfpth']
time_ = data_json_['ddt']
file_name = data_json_['dfnm']
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===反馈意见及回复文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
data_ = {
'num': num,
'title': file_name,
'updateTime': datetime.strptime(time_, '%Y-%m-%d'),
'strUpdateTime': time_,
'filePath': full_path,
}
feedback.append(data_)
log.info(f'{mongoId}===反馈意见及回复采集成功')
return feedback
def updateFile(obsOperate, data_json, mongoId, mongoData):
data = {
'fundContracts': {'title': '基金合同', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'fundCustodianAgt': {'title': '基金托管协议', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'prospectus': {'title': '基金招募说明书', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
'legalOpinion': {'title': '基金法律意见书', 'fdbkResp': '', 'fdbkRespTime': '', 'strFdbkRespTime': ''},
}
num = 1
for data_json_ in data_json['disclosureMaterials']:
type = int(data_json_['type'])
path = 'http://reportdocs.static.szse.cn' + data_json_['dfpth']
version = data_json_['matnm']
time_ = data_json_['ddt']
file_name = data_json_['dfnm']
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
if version == '基金招募说明书':
if datetime.strptime(time_, '%Y-%m-%d') == mongoData['file']['prospectus']['declarationTime']:
if type == 1:
data['prospectus']['declaration'] = mongoData['file']['prospectus']['declaration']
data['prospectus']['declarationTime'] = mongoData['file']['prospectus']['declarationTime']
data['prospectus']['strDeclarationTime'] = mongoData['file']['prospectus']['strDeclarationTime']
elif type == 3:
data['prospectus']['cover'] = mongoData['file']['prospectus']['cover']
data['prospectus']['coverTime'] = mongoData['file']['prospectus']['coverTime']
data['prospectus']['strCoverTime'] = mongoData['file']['prospectus']['strCoverTime']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
if type == 1:
data['prospectus']['declaration'] = full_path
data['prospectus']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['prospectus']['strDeclarationTime'] = time_
elif type == 3:
data['prospectus']['cover'] = full_path
data['prospectus']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['prospectus']['strCoverTime'] = time_
elif version == '基金合同':
if datetime.strptime(time_, '%Y-%m-%d') == mongoData['file']['fundContracts']['declarationTime']:
if type == 1:
data['fundContracts']['declaration'] = mongoData['file']['fundContracts']['declaration']
data['fundContracts']['declarationTime'] = mongoData['file']['fundContracts']['declarationTime']
data['fundContracts']['strDeclarationTime'] = mongoData['file']['fundContracts'][
'strDeclarationTime']
elif type == 3:
data['fundContracts']['cover'] = mongoData['file']['fundContracts']['cover']
data['fundContracts']['coverTime'] = mongoData['file']['fundContracts']['coverTime']
data['fundContracts']['strCoverTime'] = mongoData['file']['fundContracts']['strCoverTime']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
if type == 1:
data['fundContracts']['declaration'] = full_path
data['fundContracts']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundContracts']['strDeclarationTime'] = time_
elif type == 3:
data['fundContracts']['cover'] = full_path
data['fundContracts']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundContracts']['strCoverTime'] = time_
elif version == '基金托管协议':
if datetime.strptime(time_, '%Y-%m-%d') == mongoData['file']['fundCustodianAgt']['declarationTime']:
if type == 1:
                    data['fundCustodianAgt']['declaration'] = mongoData['file']['fundCustodianAgt']['declaration']
data['fundCustodianAgt']['declarationTime'] = mongoData['file']['fundCustodianAgt'][
'declarationTime']
data['fundCustodianAgt']['strDeclarationTime'] = mongoData['file']['fundCustodianAgt'][
'strDeclarationTime']
elif type == 3:
data['fundCustodianAgt']['cover'] = mongoData['file']['fundCustodianAgt']['cover']
data['fundCustodianAgt']['coverTime'] = mongoData['file']['fundCustodianAgt']['coverTime']
data['fundCustodianAgt']['strCoverTime'] = mongoData['file']['fundCustodianAgt']['strCoverTime']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
if type == 1:
data['fundCustodianAgt']['declaration'] = full_path
data['fundCustodianAgt']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundCustodianAgt']['strDeclarationTime'] = time_
elif type == 3:
data['fundCustodianAgt']['cover'] = full_path
data['fundCustodianAgt']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
data['fundCustodianAgt']['strCoverTime'] = time_
elif version == '基金法律意见书':
if datetime.strptime(time_, '%Y-%m-%d') == mongoData['file']['legalOpinion']['declarationTime']:
if type == 1:
data['legalOpinion']['declaration'] = mongoData['file']['legalOpinion']['declaration']
data['legalOpinion']['declarationTime'] = mongoData['file']['legalOpinion']['declarationTime']
                    data['legalOpinion']['strDeclarationTime'] = mongoData['file']['legalOpinion']['strDeclarationTime']
elif type == 3:
                    data['legalOpinion']['cover'] = mongoData['file']['legalOpinion']['cover']
                    data['legalOpinion']['coverTime'] = mongoData['file']['legalOpinion']['coverTime']
                    data['legalOpinion']['strCoverTime'] = mongoData['file']['legalOpinion']['strCoverTime']
else:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===信息披露文件及备查文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
if type == 1:
data['legalOpinion']['declaration'] = full_path
data['legalOpinion']['declarationTime'] = datetime.strptime(time_, '%Y-%m-%d')
                    data['legalOpinion']['strDeclarationTime'] = time_
elif type == 3:
data['legalOpinion']['cover'] = full_path
data['legalOpinion']['coverTime'] = datetime.strptime(time_, '%Y-%m-%d')
                    data['legalOpinion']['strCoverTime'] = time_
num += 1
log.info(f'{mongoId}===信息披露文件及备查文件采集成功')
return data
def updateFeedback(obsOperate, data, mongoId, mongoData):
feedback = []
data_json = data['enquiryResponseAttachment']
num = 1
for data_json_ in data_json:
flg = True
path = 'http://reportdocs.static.szse.cn' + data_json_['dfpth']
time_ = data_json_['ddt']
file_name = data_json_['dfnm']
category = os.path.splitext(path)[1]
if category not in file_name:
file_name = file_name + category
for feedback_ in mongoData['feedback']:
if time_ == feedback_['strUpdateTime'] and file_name == feedback_['title']:
feedback_['num'] = num
feedback.append(feedback_)
flg = False
num += 1
break
if flg:
retData = obsOperate.uptoOBS(path, mongoId, file_name)
time.sleep(2)
if retData['state']:
pass
else:
log.error(f'{id}===反馈意见及回复文件下载obs失败')
return 0
att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_name, num, time_)
data_ = {
'num': num,
'title': file_name,
'updateTime': datetime.strptime(time_, '%Y-%m-%d'),
'strUpdateTime': time_,
'filePath': full_path,
}
feedback.append(data_)
log.info(f'{mongoId}===反馈意见及回复采集成功')
return feedback
def getInfo(obsOperate, id):
url = f'http://reits.szse.cn/api/reits/projectrends/details?id={id}'
try:
data = getDataJson_(url)
except:
log.error(f'{id}===信息总列表获取失败')
return {}
baseInfo = getBaseInfo(data)
is_insert = db_storage.find_one({'baseInfo.name': f"{baseInfo['name']}", 'exchange': '深圳证券交易所'})
if is_insert:
mongoId = is_insert['_id']
strUpdateDate = is_insert['baseInfo']['strUpdateDate']
if strUpdateDate == baseInfo['strUpdateDate']:
log.info(f'{id}===已采集,不需要更新')
return
else:
log.info(f'{id}===已采集,需要更新,正在更新中')
createDate = is_insert['createDate']
file = updateFile(obsOperate, data, str(mongoId), is_insert)
feedback = updateFeedback(obsOperate, data, str(mongoId), is_insert)
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
dic_info = {
'baseInfo': baseInfo,
'file': file,
'feedback': feedback,
'exchange': '深圳证券交易所',
'updateDate': now,
'createDate': createDate,
}
try:
db_storage.update_one({'_id': mongoId}, {'$set': dic_info})
log.info(f'{id}===更新成功')
except:
log.error(f'{id}===更新失败')
else:
dic_info_ = {
'baseInfo': baseInfo,
'exchange': '深圳证券交易所'
}
try:
db_storage.insert_one(dic_info_)
            log.info(f'{id}===初次保存成功')
except:
log.error(f'{id}===初次入库失败')
return
mongoId = db_storage.find_one({'baseInfo.name': f"{baseInfo['name']}", 'exchange': '深圳证券交易所'})['_id']
file = getFile(obsOperate, data, str(mongoId))
feedback = getFeedback(obsOperate, data, str(mongoId))
if not baseInfo or file == 0 or feedback == 0:
db_storage.delete_one({'_id': mongoId})
log.error(f'{id}===数据采集不完整,初次入库数据已删除')
return
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
dic_info = {
'baseInfo': baseInfo,
'file': file,
'feedback': feedback,
'exchange': '深圳证券交易所',
'updateDate': now, # 修改日期
'createDate': now # 创建日期
}
try:
db_storage.update_one({'_id': mongoId}, {'$set': dic_info})
log.info(f'{id}===保存成功')
except:
db_storage.delete_one({'_id': mongoId})
log.error(f'{id}===二次入库失败,已删除')
def doJob(obsOperate):
pageSize = getPageSize()
log.info(f'共{pageSize}页')
for page in range(pageSize):
log.info(f'开始采集第{page + 1}页')
try:
data_json = getDataJson(page)
except Exception as e:
log.error(f'第{page + 1}页数据获取失败==={e}')
continue
for data_ in data_json:
id = data_['prjid']
getInfo(obsOperate, id)
time.sleep(3)
if __name__ == '__main__':
obsOperate = obsOperate(cursor_, cnx_, log)
doJob(obsOperate)
baseCore.close()
import json
import re
import time
import calendar
import pymongo
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from apscheduler.schedulers.blocking import BlockingScheduler
from retry import retry
import BaseCore
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsTxnStat']
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Host': 'query.sse.com.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.sse.com.cn/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}
@retry(tries=5, delay=20)
def getJson(url):
# ip = baseCore.get_proxy()
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
    data_json = re.findall(r'\((.*)\)', req.text)[0]
data_json = json.loads(data_json)
req.close()
return data_json
# 2021-06-26
# 每日概况
def getDayData():
# start_date = datetime(2021, 6, 21)
start_date = datetime.today() - timedelta(days=5)
end_date = datetime.today() - timedelta(days=1)
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
for date in date_range:
date_ = date.strftime('%Y-%m-%d')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback89728&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_DAY_L&TRADE_DATE={date_}&FUND_TYPE=01&_={int(time.time())}'
try:
data_json = getJson(url)['result']
if len(data_json) == 0:
continue
data_json = data_json[0]
except Exception as e:
log.error(f'{date}===连接失败==={e}')
time.sleep(3)
continue
is_insert = db_storage.find_one({'strDate': str(date)[:10], 'exchange': '上海证券交易所'})
if is_insert:
log.info(f'{date}===已采集')
time.sleep(3)
continue
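        # 量额字段统一乘以 10000 换算后入库(接口原始值推测以万为单位)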
dic_info = {
'number': int(data_json['LIST_NUM']), # 挂牌数
'volume': float(data_json['TRADE_VOL']) * 10000, # 成交量
'amount': float(data_json['TRADE_AMT']) * 10000, # 成交金额
'totalValue': float(data_json['TOTAL_VALUE']) * 10000, # 市价总额
'negoValue': float(data_json['NEGO_VALUE']) * 10000, # 流通市值
'toRate': float(data_json['TO_RATE']), # 换手率
'date': date,
'strDate':str(date)[:10],
'country': '中国',
'exchange': '上海证券交易所',
'currency': 'CNY', # 币种
}
try:
db_storage.insert_one(dic_info)
log.info(f'{date}===采集成功')
except Exception as e:
log.error(f'{date}===数据存储失败==={e}')
time.sleep(3)
# 每周概况
def getWeekData(writer):
data_list = []
start_date = datetime(2021, 6, 21)
end_date = datetime.today()
date_range = [start_date + timedelta(days=x) for x in range(0, (end_date - start_date).days + 1, 7)]
for date_1 in date_range:
date_2 = (date_1 + timedelta(days=6)).strftime('%Y-%m-%d')
date_1 = date_1.strftime('%Y-%m-%d')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback65413&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_WEEK_L&START_DATE={date_1}&END_DATE={date_2}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], f'{date_1}至{date_2}']
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': f'{date_1}至{date_2}',
'类别': '每周概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date_1}至{date_2}===采集完成')
data_list.append(data)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每周概况', index=False)
# 月度概况
def getMonthData(writer):
data_list = []
start_date = datetime.strptime('2021-06-01', '%Y-%m-%d')
current_date = datetime.now()
while start_date <= current_date:
year = start_date.year
month = start_date.month
date = start_date.strftime('%Y-%m')
url = f'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback76435&sqlId=COMMON_SSE_REITS_HQXX_CJTJ_MONTH_L&TRADE_DATE={date}&FUND_TYPE=01&_={int(time.time())}'
data_json = getJson(url)['result']
for data_ in data_json:
data = [data_['LIST_NUM'], data_['TRADE_VOL'], data_['TRADE_AMT'], data_['TOTAL_VALUE'],
data_['NEGO_VALUE'], data_['TO_RATE'], date]
dic_info = {
'挂牌数': data_['LIST_NUM'],
'成交量(亿份)': data_['TRADE_VOL'],
'成交金额(亿元)': data_['TRADE_AMT'],
'市价总额(亿元)': data_['TOTAL_VALUE'],
'流通市值(亿元)': data_['NEGO_VALUE'],
'换手率(%)': data_['TO_RATE'],
'日期': date,
'类别': '月度概况'
}
db_storage.insert_one(dic_info)
log.info(f'{date}===采集完成')
data_list.append(data)
if month == 12:
start_date = start_date.replace(year=year + 1, month=1)
else:
start_date = start_date.replace(month=month + 1)
time.sleep(1)
df = pd.DataFrame(np.array(data_list))
df.columns = ['挂牌数', '成交量(亿份)', '成交金额(亿元)', '市价总额(亿元)', '流通市值(亿元)', '换手率(%)', '日期']
df.to_excel(writer, sheet_name='每月概况', index=False)
def task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每天执行一次
scheduler.add_job(getDayData, 'cron', hour='8', minute=0, max_instances=2 )
try:
scheduler.start()
except Exception as e:
log.error('定时采集异常', e)
pass
if __name__ == '__main__':
task()
baseCore.close()
import time
from datetime import datetime
from datetime import timedelta
from decimal import Decimal
import pymongo
import requests
from retry import retry
from apscheduler.schedulers.blocking import BlockingScheduler
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsTxnStat']
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/market/fund/dealSurvey/daily/index.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest'
}
@retry(tries=5, delay=10)
def getJson(url):
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
return req.json()[0]['data']
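# getJsonB:调用深交所 1803_sczm 接口,取“基础设施基金”对应行的证券数量(zqsl)、市价总值(sjzz)与流通市值(ltsz),后两者乘以 10000 换算单位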
@retry(tries=5, delay=10)
def getJsonB(date):
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
# }
url = f'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1803_sczm&TABKEY=tab1&txtQueryDate={date}'
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
datas = req.json()[0]['data']
for data in datas:
if '基础设施基金' in data['lbmc']:
try:
amount = db_storage.find_one({'exchange': '深圳证券交易所', 'strDate': date})['amount']
except:
log.error(f'{date}===无成交金额')
break
number = int(data['zqsl'])
totalValue = float(Decimal(data['sjzz']) * Decimal('10000'))
negoValue = float(Decimal(data['ltsz']) * Decimal('10000'))
return number,totalValue,negoValue
def doJob():
# start_date = datetime(2021, 6, 21)
start_date = datetime.today() - timedelta(days=5)
end_date = datetime.today() - timedelta(days=1)
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
for date in date_range:
date_ = date.strftime('%Y-%m-%d')
is_insert = db_storage.find_one({'strDate': str(date)[:10], 'exchange': '深圳证券交易所'})
if is_insert:
log.info(f'{date_}===已采集')
time.sleep(3)
continue
url = f'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=scsj_jjrdgk&TABKEY=tab1&txtQueryDate={date_}&tjzqlb=D'
data_json = getJson(url)
for data in data_json:
if '基础设施基金' in data['lbmc']:
volume = data['cjl'].replace(',', '')
amount = data['cjje'].replace(',', '')
number,totalValue,negoValue = getJsonB(date_)
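                # 接口未直接提供换手率,下面按 成交金额 / 市价总值 * 100 计算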
toRate = float(Decimal(f'{amount}') / Decimal(f'{totalValue}') * Decimal('100'))
dic_info = {
'number': number, # 挂牌数
'volume': float(volume), # 成交量
'amount': float(amount), # 成交金额
'totalValue': totalValue, # 市价总额
'negoValue': negoValue, # 流通市值
'toRate': toRate, # 换手率
'date': date,
'strDate': str(date)[:10],
'country': '中国',
'exchange': '深圳证券交易所',
'currency': 'CNY', # 币种
}
try:
db_storage.insert_one(dic_info)
log.info(f'{date_}===数据采集成功')
except:
log.error(f'{date_}===数据入库失败')
time.sleep(3)
def task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每天执行一次
scheduler.add_job(doJob, 'cron', hour='8', minute=0, max_instances=2 )
try:
scheduler.start()
except Exception as e:
log.error('定时采集异常', e)
pass
if __name__ == '__main__':
task()
# number,totalValue,negoValue = getJsonB('2023-12-27')
# print(number,totalValue,negoValue)
baseCore.close()
import datetime
import pymongo
import requests
from fitz import fitz
import BaseCore
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
'REITsProjDynamics']
datas = db_storage.find({'exchange':'上海证券交易所'})
for data in datas:
mongoId = data['_id']
for a in data['file'].keys():
if data['file'][f'{a}']:
path_1 = data['file'][f'{a}']['declaration']
path_2 = data['file'][f'{a}']['fdbkResp']
path_3 = data['file'][f'{a}']['cover']
if path_1:
# sql = f"select id from clb_sys_attachment where full_path='{path_1}' and type_id=15"
# cursor_.execute(sql)
# id = cursor_.fetchone()
# print(id)
sql = f"update clb_sys_attachment set item_id='{mongoId}' where full_path='{path_1}' and type_id=15"
cursor_.execute(sql)
cnx_.commit()
if path_2:
# sql = f"select id from clb_sys_attachment where full_path='{path_2}'and type_id=15"
# cursor_.execute(sql)
# id = cursor_.fetchone()
# print(id)
sql = f"update clb_sys_attachment set item_id='{mongoId}' where full_path='{path_2}' and type_id=15"
cursor_.execute(sql)
cnx_.commit()
if path_3:
# sql = f"select id from clb_sys_attachment where full_path='{path_3}'and type_id=15"
# cursor_.execute(sql)
# id = cursor_.fetchone()
# print(id)
sql = f"update clb_sys_attachment set item_id='{mongoId}' where full_path='{path_3}' and type_id=15"
cursor_.execute(sql)
cnx_.commit()
import os
import sys
import logbook
def logFormate(record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(fileLogFlag=True, stdOutFlag=True):
pid = os.getpid()
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + f"{pid}.log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = logFormate
logger.handlers.append(logStd)
return logger
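# 使用示例:logger = getLogger();logger.info('started')
# 日志写入脚本所在目录下 logs/<脚本名><pid>.log,并同时彩色输出到屏幕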
......@@ -5,7 +5,8 @@ import pandas as pd
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
# sys.path.append('D:\\kkwork\\zzsn_spider\\base\\smart')
sys.path.append(r'F:\zzsn\zzsn_spider\base\smart')
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
......
import json
import time
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import sys
sys.path.append(r'D:\zzsn_spider\base')
import BaseCore
from retry import retry
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
taskType = '企业基本信息/雅虎财经'
baseCore = BaseCore.BaseCore()
cursor = baseCore.cursor
cnx = baseCore.cnx
r = baseCore.r
log = baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# 保存基本信息
def saveBaseInfo(info, xydm, gpdm,id_):
url_ = info['base_info']['公司网站']
add_ = info['base_info']['地址']
company_dict = [id_,xydm, info['base_info']['英文名'], info['base_info']['电话'], url_, info['base_info']['公司简介'],
info['base_info']['行业'], add_, gpdm]
return company_dict
# 获取请求响应
@retry(tries=5, delay=3)
def getRes(url):
response = requests.get(url, headers=headers, verify=False)
    if response.status_code != 200:
        raise requests.exceptions.HTTPError(f'status code: {response.status_code}')
return response
# 根据股票代码 获取企业基本信息
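# 先做代码规范化:港股代码为5位时去掉首位字符,以 .N 或 .O 结尾时去掉后缀,再访问雅虎财经 profile 页面解析基本信息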
def getInfo(gpdm, start):
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm_ = str(gpdm)[1:]
else:
gpdm_ = gpdm
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm_ = gpdm[:-2]
else:
gpdm_ = gpdm
retData = {}
url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time.sleep(3)
try:
response = getRes(url)
except:
log.error(f"{gpdm}------访问基本信息页面失败")
return retData,'访问基本信息页面失败'
if 'lookup' in response.url:
log.error(f"{gpdm}------股票代码未查询到信息:{response.status_code}")
return retData,'股票代码未查询到信息'
if url != response.url:
log.error(f'{gpdm}------请求失败')
return retData,'请求失败'
state = 1
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
if page.text == '' or 'Invalid Date data is not available' in page.text:
state = 0
log.error(f'{gpdm}---没有基本信息')
return retData,'没有基本信息'
try:
name = page.find('h3', {'class': 'Fz(m) Mb(10px)'}).text.lstrip().strip()
except:
log.error(f'{gpdm}------其它错误原因')
return retData,'其它错误原因'
try:
com_info = page.find('div', {'class': 'Mb(25px)'})
except:
com_info = ''
try:
com_phone = com_info.find_all('p')[0].find('a').text.lstrip().strip()
except:
com_phone = ''
try:
com_url = com_info.find_all('p')[0].find('a', {'target': '_blank'}).text.lstrip().strip()
except:
com_url = ''
try:
com_address = ''
com_addressTag = com_info.find_all('p')[0]
a_list = com_addressTag.select('a')
for a in a_list:
a.decompose()
com_addressTag = str(com_addressTag).replace('<br/>', '</p><p>')
com_addressTag = BeautifulSoup(com_addressTag, 'html.parser')
p_list = com_addressTag.select('p')
for p in p_list:
com_address += p.text.lstrip().strip() + ' '
com_address = com_address.lstrip().strip()
except:
com_address = ''
try:
com_bumen = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[0].text.lstrip().strip()
except:
com_bumen = ''
try:
com_hangye = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[1].text.lstrip().strip()
except:
com_hangye = ''
try:
com_people = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[2].text.lstrip().strip()
except:
com_people = ''
try:
com_jianjie = page.find('p', {'class': 'Mt(15px) Lh(1.6)'}).text.lstrip().strip()
except:
com_jianjie = ''
dic_com_info = {
'英文名': name,
'股票代码': gpdm,
'地址': com_address,
'电话': com_phone,
'公司网站': com_url,
'部门': com_bumen,
'行业': com_hangye,
'员工人数': com_people,
'公司简介': com_jianjie
}
retData['base_info'] = dic_com_info
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
response.close()
return retData,'成功'
# 采集工作
def beginWork():
dic_list = []
error_list1 = []
error_list2 = []
error_list3 = []
writer_ = pd.ExcelWriter(r'D:\zzsn_spider\comData\未采集到企业基本信息_50001-55000.xlsx')
writer = pd.ExcelWriter(r'D:\zzsn_spider\comData\企业基本信息_50001-55000.xlsx')
df = pd.read_excel(r'D:\zzsn_spider\comData\雅虎财经上市企业信息采集50001-55000_20231215.xlsx',sheet_name='yahoostock')
# xydm_list = df['信用代码']
gpdm_list = df['symbol']
id_list = df['id']
for i in range(len(gpdm_list)):
gpdm = gpdm_list[i]
id_ = id_list[i]
if not gpdm or gpdm == '':
continue
info,exc = getInfo(gpdm, time.time())
if info:
dic = saveBaseInfo(info, '', gpdm,id_)
dic_list.append(dic)
else:
if exc == '股票代码未查询到信息':
error_list1.append([id_,gpdm])
elif exc == '没有基本信息':
error_list2.append([id_,gpdm])
elif exc == '其它错误原因':
error_list3.append([id_,gpdm])
df_ = pd.DataFrame(np.array(dic_list))
df_.columns = ['id','信用代码','英文名','电话','官网','简介','行业','地址','股票代码']
df_.to_excel(writer, index=False)
writer.save()
df_1 = pd.DataFrame(np.array(error_list1))
df_2 = pd.DataFrame(np.array(error_list2))
df_3 = pd.DataFrame(np.array(error_list3))
df_1.columns = ['id','股票代码']
df_2.columns = ['id','股票代码']
df_3.columns = ['id','股票代码']
df_1.to_excel(writer_, index=False,sheet_name='股票代码未查询到信息')
df_2.to_excel(writer_, index=False,sheet_name='没有基本信息')
df_3.to_excel(writer_, index=False,sheet_name='其它错误原因')
writer_.save()
# 释放资源
baseCore.close()
if __name__ == '__main__':
beginWork()
import json
import time
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
# sys.path.append(r'F:\zzsn\zzsn_spider\base')
# import BaseCore
from retry import retry
from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
taskType = '企业基本信息/雅虎财经'
baseCore = BaseCore.BaseCore()
r = baseCore.r
log = baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
@retry(tries=5, delay=3)
def getRes(url):
response = requests.get(url, headers=headers, verify=False)
if response.status_code != 200:
raise
return response
# 根据股票代码 获取企业基本信息 高管信息
def getInfo(xydm, gpdm, start):
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm_ = str(gpdm)[1:]
else:
gpdm_ = gpdm
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm_ = gpdm[:-2]
else:
gpdm_ = gpdm
retData = {}
url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
log.info(url)
time.sleep(3)
try:
response = getRes(url)
except:
log.error(f"{gpdm}------访问基本信息页面失败")
state = -1
exeception = '访问基本信息页面失败'
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, url, exeception)
r.lpush('BaseInfoEnterprise:gwqy_socialCode', '')
return state, retData
if url != response.url:
log.error(f'{gpdm}------请求失败')
state = -1
# r.lpush('BaseInfoEnterprise:gwqy_socialCode_gg', xydm)
return state, retData
state = 1
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
# 高管信息
retPeople = []
try:
list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:]
except:
list_people = []
for one_people in list_people:
try:
p_name = one_people.find_all('td')[0].text
except:
p_name = ''
continue
try:
p_zhiwu = one_people.find_all('td')[1].text
except:
p_zhiwu = ''
try:
p_money = one_people.find_all('td')[2].text
except:
p_money = ''
try:
p_xingshi = one_people.find_all('td')[3].text
except:
p_xingshi = ''
try:
p_year = one_people.find_all('td')[4].text
except:
p_year = ''
if (p_zhiwu == "N/A"):
p_zhiwu = ""
if (p_money == "N/A"):
p_money = ""
if (p_xingshi == "N/A"):
p_xingshi = ""
if (p_year == "N/A"):
p_year = ""
dic_main_people = {
'股票代码': gpdm,
'信用代码': xydm,
'姓名': p_name,
'职务': p_zhiwu,
'薪资': p_money,
'行使': p_xingshi,
'出生年份': p_year
}
retPeople.append(dic_main_people)
retData['people_info'] = retPeople
log.info(f"获取高管信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
response.close()
return state, retData
@retry(tries=3, delay=2)
def sendPost(json_updata):
response = requests.post('http://114.115.236.206:9988/datapull/sync/executive', data=json_updata,
timeout=300, verify=False)
if (response.status_code == 200):
retJson = json.loads(response.content.decode('utf-8'))
if (retJson['success'] or retJson['success'] == 'true'):
pass
else:
raise
else:
raise
# 保存高管信息
def savePeopleInfo(info, xydm, start):
# 高管信息调用接口
list_people = info['people_info']
list_one_info = []
for i in range(0, len(list_people)):
dic_json = {
"socialCreditCode": list_people[i]['信用代码'],
"name": list_people[i]['姓名'],
"sex": '',
"education": '',
"position": list_people[i]['职务'],
"salary": list_people[i]['薪资'],
"birthYear": list_people[i]['出生年份'],
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": '',
"sort": str(i + 1)
}
list_one_info.append(dic_json)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
log.info("没有高管")
pass
else:
try:
sendPost(json_updata)
except:
log.error(f"保存高管接口失败")
exception = '保存高管接口失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
return state
state = 1
log.info(f"保存高管信息--{xydm},耗时{baseCore.getTimeCost(start, time.time())}")
return state
# 采集工作
def beginWork():
while True:
social_code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode_gg')
# social_code = 'ZZSN231114182705007'
if social_code == 'None' or not social_code:
time.sleep(20)
break
# 数据库中获取基本信息
data = baseCore.getInfomation(social_code)
gpdm = data[3]
xydm = data[2]
# 获取该企业对应项目的采集次数
start_time = time.time()
try:
state, retData = getInfo(xydm, gpdm, start_time)
if state == 1:
state = savePeopleInfo(retData, xydm, start_time)
time.sleep(1)
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', '')
else:
pass
except Exception as e:
log.error(f'{xydm}===错误原因:{e}')
pass
# 释放资源
baseCore.close()
if __name__ == '__main__':
beginWork()
import json
import time
import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# sys.path.append(r'F:\zzsn\zzsn_spider\base')
# import BaseCore
from retry import retry
from base import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
taskType = '企业基本信息/雅虎财经'
baseCore = BaseCore.BaseCore()
r = baseCore.r
log = baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# 发送kafka
@retry(delay=5)
def sendKafka(company_dict):
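# NOTE: a new producer is created per message and get(timeout=10) blocks for the broker ack, so a failed send raises and @retry(delay=5) keeps retrying (the retry package defaults to unlimited attempts when tries is not set).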
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
# kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
kafka_result = producer.send("enterpriseInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
kafka_result.get(timeout=10)
# 保存基本信息
def saveBaseInfo(info, code, start):
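# `code` is assumed to be the pipe-joined Excel row pulled from Redis; empty cells arrive as empty fields, so the spreadsheet URL/address are only used as fallbacks when the scraped values are empty.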
yname = code.split('|')[1] # 原名
cname = code.split('|')[2] # 中文名
oname = code.split('|')[4] # 曾用名
shortname = code.split('|')[5] # 简称
url = code.split('|')[7] # 网址
add = code.split('|')[8] # 地址
country = code.split('|')[16] # 国家
gpdm = code.split('|')[17] # 股票代码
gpjc = code.split('|')[18] # 股票简称
category = code.split('|')[19] # 股票类型
jys = code.split('|')[20] # 交易所
ipotime = code.split('|')[21] # 上市时间
# 基本信息发送到kafka
url_ = info['base_info']['公司网站']
add_ = info['base_info']['地址']
if url_ == '' and url != '':
url_ = url
if add_ == '' and add != '':
add_ = add
company_dict = {
'originalName': yname, # 企业原名称
'name': cname, # 企业中文名称
'shortName': shortname, # 企业简称
'officialPhone': info['base_info']['电话'], # 电话
'officialUrl': url_, # 官网
'briefInfo': info['base_info']['公司简介'], # 简介
'industry': info['base_info']['行业'], # 所属行业
'englishName': info['base_info']['英文名'], # 英文名
'address': add_, # 地址
'beforeName': oname, # 曾用名
'ynDomestic': 0, # 是否国内(1-是;0-否)
'countryName': country,
'securitiesCode': gpdm, # 股票代码
'securitiesShortName': gpjc, # 股票代码简称
'listingDate': ipotime, # 上市时间
'category': category, # 股票类型
'exchange': jys, # 交易所
'status': 0, # 状态
}
sendKafka(company_dict)
log.info(
f"保存基本信息--{info['base_info']['英文名']}--{gpdm}---耗时{baseCore.getTimeCost(start, time.time())}")
# 获取请求响应
@retry(tries=5, delay=3)
def getRes(url):
response = requests.get(url, headers=headers, verify=False)
if response.status_code != 200:
raise
return response
# 根据股票代码 获取企业基本信息
def getInfo(code, gpdm, start):
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm_ = str(gpdm)[1:]
else:
gpdm_ = gpdm
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm_ = gpdm[:-2]
else:
gpdm_ = gpdm
retData = {}
url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time.sleep(3)
try:
response = getRes(url)
except:
log.error(f"{gpdm}------访问基本信息页面失败")
exeception = '访问基本信息页面失败'
state = -1
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, state, takeTime, url, exeception)
r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
return state, retData, exeception
if 'lookup' in response.url:
log.error(f"{gpdm}------股票代码未查询到信息:{response.status_code}")
exeception = '股票代码未查询到信息'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog('', taskType, 0, takeTime, url, exeception)
return state, retData, exeception
if url != response.url:
log.error(f'{gpdm}------请求失败')
exeception = '请求失败'
state = -1
r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
return state, retData, exeception
state = 1
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
if page.text == '':
state = 0
exeception = '无基本信息'
return state, retData, exeception
try:
try:
name = page.find('h3', {'class': 'Fz(m) Mb(10px)'}).text.lstrip().strip()
try:
com_info = page.find('div', {'class': 'Mb(25px)'})
except:
com_info = ''
try:
com_phone = com_info.find_all('p')[0].find('a').text.lstrip().strip()
except:
com_phone = ''
try:
com_url = com_info.find_all('p')[0].find('a', {'target': '_blank'}).text.lstrip().strip()
except:
com_url = ''
try:
com_address = ''
com_addressTag = com_info.find_all('p')[0]
a_list = com_addressTag.select('a')
for a in a_list:
a.decompose()
com_addressTag = str(com_addressTag).replace('<br/>', '</p><p>')
com_addressTag = BeautifulSoup(com_addressTag, 'html.parser')
p_list = com_addressTag.select('p')
for p in p_list:
com_address += p.text.lstrip().strip() + ' '
com_address = com_address.lstrip().strip()
except:
com_address = ''
try:
com_bumen = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[0].text.lstrip().strip()
except:
com_bumen = ''
try:
com_hangye = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[1].text.lstrip().strip()
except:
com_hangye = ''
try:
com_people = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[2].text.lstrip().strip()
except:
com_people = ''
try:
com_jianjie = page.find('p', {'class': 'Mt(15px) Lh(1.6)'}).text.lstrip().strip()
except:
com_jianjie = ''
except:
name = page.find('h3', {'class': 'Mb(5px) Mend(40px)'}).text.lstrip().strip()
try:
com_phone = page.find('span',class_='D(b) Lh(21px) Mb(20px) C($linkColor)').text.strip().lstrip()
except:
com_phone = ''
except:
state = 0
exeception = '其它错误原因'
return state, retData, exeception
dic_com_info = {
'英文名': name,
'股票代码': gpdm,
'地址': com_address,
'电话': com_phone,
'公司网站': com_url,
'部门': com_bumen,
'行业': com_hangye,
'员工人数': com_people,
'公司简介': com_jianjie
}
retData['base_info'] = dic_com_info
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
response.close()
return state, retData, ''
# 采集工作
def beginWork():
data_false = []
data_true = []
while True:
code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode')
# 标志某次新增企业全部采集完毕,需要反馈采集情况
if code == 'end':
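# 'end' acts as a batch sentinel: once it is popped, every record queued before it has been processed, so the success/failure workbook is written, mailed via baseCore.sendEmail(), and the counters are reset.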
nowtime = baseCore.getNowTime(1).replace('-', '')[:10]
# 将采集情况保存至本地
writer = pd.ExcelWriter(f'./企业基本信息采集情况_{nowtime}.xlsx')
# 采集失败列表
if data_false:
df_f = pd.DataFrame(np.array(data_false))
df_f.columns = ['企业名称', '股票代码', '失败原因']
df_f.to_excel(writer, sheet_name='采集失败', index=False)
# 采集成功列表
if data_true:
df_t = pd.DataFrame(np.array(data_true))
df_t.columns = ['企业名称', '股票代码']
df_t.to_excel(writer, sheet_name='采集成功', index=False)
if data_true or data_false:
writer.save()
# 发送邮件
baseCore.sendEmail()
# 采集成功与失败列表置为空
data_false = []
data_true = []
continue
if not code or code == 'None':
time.sleep(20)
continue
# 数据库中获取基本信息
ename = code.split('|')[3].lstrip().strip() # 英文名
url = code.split('|')[7].lstrip().strip() # 网址
if url and url[-1] == '/':
url = url[:-1]
add = code.split('|')[8].lstrip().strip() # 地址
gpdm = code.split('|')[17].lstrip().strip() # 股票代码
log.info(f'==={gpdm}===开始采集基本信息')
start_time = time.time()
try:
state, retData, exeception = getInfo(code, gpdm, start_time)
# 基本信息采集成功 进行数据入库,否则不入库
# state 1采集成功 0采集不到基本信息 -1页面访问失败
if state == 1:
# 企业基本信息入库
try:
ename_ = retData['base_info']['英文名']
if ename_ != ename:
data_false.append([ename, gpdm, '采集到企业名称与所给企业名称不同'])
log.error(f'{gpdm}===采集失败')
else:
url_ = retData['base_info']['公司网站']
if url not in url_ and url_ != '' and url != '':
data_false.append([ename, gpdm, '采集到企业网址与所给企业网址不同'])
log.error(f'{gpdm}===采集失败')
else:
add_ = retData['base_info']['地址']
if add not in add_ and add_ != '' and add != '':
data_false.append([ename, gpdm, '采集到企业地址与所给企业地址不同'])
log.error(f'{gpdm}===采集失败')
else:
saveBaseInfo(retData, code, start_time)
time.sleep(3)
data_true.append([ename, gpdm])
except Exception as e:
r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
log.error(f'{ename}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败'
log.error(e)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog('', taskType, state, takeTime, '', exception)
time.sleep(3)
elif state == 0:
data_false.append([ename, gpdm, exeception])
time.sleep(3)
except Exception as e:
data_false.append([ename, gpdm, '其它错误原因'])
log.error(f'{gpdm}===信息采集错误')
log.error(e)
time.sleep(3)
# 释放资源
baseCore.close()
if __name__ == '__main__':
#beginWork()
url = 'https://finance.yahoo.com/quote/501057.SS/profile?p=501057.SS'
req = requests.get(url,headers=headers,verify=False)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
print(page.text)
# -*- coding: utf-8 -*-
# 雅虎财经企业动态获取
import json
import os
import signal
import time
import pymysql
from kafka import KafkaProducer
from selenium.webdriver.common.by import By
import sys
from retry import retry
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import traceback
from base import BaseCore
from base.smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = baseCore.r
taskType = '企业动态/雅虎财经'
smart = smart_extractor.SmartExtractor('cn')
last_url = ''
# 发送kafka
@retry(tries=3, delay=5)
def sendKafka(dic_news):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
# 保存MySQL数据库
@retry(tries=3, delay=5)
def insertMysql(list_info):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) values(%s,%s,%s,%s,%s,%s,%s,now())'''
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# 查重
def selectUrl(news_url, xydm):
# with cnx.cursor() as cursor:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
cursor.execute(sel_sql, (news_url, xydm))
selects = cursor.fetchall()
return selects
# 获取最后一条动态url
def getLastUrl(driver):
news_div = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
news_lis = news_div.find_elements(By.XPATH, "./ul/li")
last = len(news_lis)
try:
url = news_lis[last - 1].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a").get_attribute(
"href").lstrip().strip().replace("'", "''")
except:
url = news_lis[last - 1].find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a").get_attribute(
"href").lstrip().strip().replace("'", "''")
return url
# 拖动滑动条显示更多信息
def scroll(driver,xydm, name, gpdm):
last_url_ = ''
while True:
js = "var q=document.documentElement.scrollTop=100000"
driver.execute_script(js)
time.sleep(1)
try:
last_url = getLastUrl(driver)
except Exception as e:
log.error(f"{name}--{gpdm}--获取不到最后一条链接")
break
# todo:增量时 需打开注释
try:
selects = selectUrl(last_url_, xydm)
except:
break
if selects:
break
if last_url_ == last_url:
break
last_url_ = last_url
time.sleep(1)
# 采集失败的企业 重新放入redis
def rePutIntoR(item):
r.rpush('NewsEnterprise:gwqy_socialCode', item)
# 获取资讯详情
@retry(tries=5,delay=5)
def getZx(driver,xydm, url, title, origin):
start_time_content = time.time()
driver.get(url)
try:
clickButton = driver.find_element(By.CLASS_NAME, "collapse-button")
clickButton.click()
time.sleep(0.5)
driver.execute_script("arguments[0].remove()",clickButton)
time.sleep(0.5)
except Exception as e:
pass
time.sleep(0.5)
authorElement = driver.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
timeElement = driver.find_element(By.CLASS_NAME, "caas-attr-time-style").find_element(By.TAG_NAME,
"time")
contentWithTag = driver.find_element(By.CLASS_NAME, "caas-body").get_attribute('outerHTML')
author = authorElement.text.lstrip().strip().replace("'", "''")
pub_time = timeElement.get_attribute("datetime").lstrip().strip().replace("'", "''").replace("T", " ")
pub_time = pub_time[0:19]
#if pub_time < '2023-09-01':
# return '超过截止日期'
content = driver.find_element(By.CLASS_NAME, "caas-body").text
if len(content) < 400:
exception = ''
return exception
contentWithTag = contentWithTag.replace("'", "''")
# 动态信息列表
list_info = [
xydm,
url,
f'雅虎财经-{origin}',
'2',
pub_time,
title,
contentWithTag[0:500]
]
log.info(f"文章耗时,耗时{baseCore.getTimeCost(start_time_content, time.time())}")
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# todo:插入一条数据,并传入kafka
dic_news = {
'attachmentIds': '',
'author': author,
'content': content,
'contentWithTag': contentWithTag,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'en',
'origin': origin,
'publishDate': pub_time,
'sid': '1714853151160340481',
'sourceAddress': url, # 原文链接
'summary': '',
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
sendKafka(dic_news)
try:
insertMysql(list_info)
except Exception as e1:
log.error("保存数据库失败")
exception = '数据库传输失败'
return exception
# 传输成功,写入日志中
exception = ''
return exception
# return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
log.error(dic_result)
exception = 'Kafka操作失败'
return exception
except Exception as e:
log.info(f'传输失败:{social_code}----{url}')
exception = '数据id获取失败'
return exception
if __name__ == "__main__":
StartTime = time.time()
path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
driver = baseCore.buildDriver(path, headless=True)
cnx = baseCore.cnx
cursor = baseCore.cursor
ErrorNume = 0
while True:
if ErrorNume >= 15:
driver.quit()
break
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
# social_code = 'ZZSN22080900000046'
# 判断 如果Redis中已经没有数据,则等待
if not social_code:
log.info('============已没有数据============等待===============')
time.sleep(1800)
continue
if social_code == None:
time.sleep(1800)
continue
try:
data = baseCore.getInfomation(social_code)
except:
rePutIntoR(social_code)
driver.quit()
break
name = data[1]
enname = data[5]
gpdm = data[3]
xydm = data[2]
# 获取该企业对应项目的采集次数
count = data[18]
start_time = time.time()
if (gpdm == ''):
log.error(f"{name}--股票代码为空 跳过")
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
continue
try:
url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
url = f"https://finance.yahoo.com/quote/{gpdm}/?p={gpdm}"
driver.get(url)
if '用户将无法从中国大陆使用 Yahoo 的产品与服务' in driver.page_source:
log.error('代理失效')
time.sleep(5)
#driver.quit()
#driver = baseCore.buildDriver(path, headless=True)
rePutIntoR(social_code)
ErrorNume += 1
continue
if 'https://consent.yahoo.com/v2/collectConsent' in driver.current_url:
log.error('页面跳转,出现弹窗')
time.sleep(5)
#driver.quit()
#driver = baseCore.buildDriver(path, headless=True)
rePutIntoR(social_code)
ErrorNume += 1
continue
if 'lookup' in driver.current_url:
log.error(f"{name}--{gpdm}--股票代码错误")
exception = '股票代码错误'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, driver.current_url, exception)
continue
try:
WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'quoteNewsStream-0-Stream')))
div_flg = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
if "We're sorry we weren't able to find anything about this topic." in div_flg.text:
log.error(f"{driver.current_url}")
log.error(f"{name}--{gpdm}--没找到新闻元素")
exception = '没找到新闻元素'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
continue
except Exception as e:
log.error(f"{name}--{gpdm}--页面打开失败")
time.sleep(5)
#driver.quit()
#driver = baseCore.buildDriver(path, headless=True)
rePutIntoR(social_code)
ErrorNume += 1
continue
try:
scroll(driver,xydm, name, gpdm)
except Exception as e:
print(e)
log.error(f"{name}--{gpdm}--拖拽出现问题")
news_div = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
news_lis = news_div.find_elements(By.XPATH, "./ul/li")
log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")
# 标识符 判断脚本是否断开连接
flag = 0
news_info_list = []
# 获取咨询url、title、origin并放入列表news_info_list中
for i in range(0, len(news_lis)):
try:
try:
continue
else:
log.error(f"{name}--{gpdm}--{i}----与网站断开连接")
time.sleep(5)
driver.quit()
driver = baseCore.buildDriver(path, headless=True)
rePutIntoR(social_code)
ErrorNume += 1
flag = 1
break
news_url = a_ele.get_attribute("href").lstrip().strip().replace("'", "''")
if (news_url.startswith("https://finance.yahoo.com")):
pass
title = a_ele.text.lstrip().strip().replace("'", "''")
try:
origin = news_lis[i].find_element(By.XPATH, './div/div/div[2]/div/span[1]').text
except:
origin = news_lis[i].find_element(By.XPATH, './div/div/div/div/span[1]').text
if origin == '':
log.error('来源获取失败')
continue
news_info_list.append([news_url,title,origin])
else:
continue
if flag == 1:
continue
for i in range(len(news_info_list)):
news_url = news_info_list[i][0]
title = news_info_list[i][1]
origin = news_info_list[i][2]
# 判断url是否已经存在
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
cursor.execute(sel_sql, (news_url, xydm))
selects = cursor.fetchall()
if selects:
log.error(f"{name}--{gpdm}--网址已经存在----{news_url}")
log.info(f"{name}--{gpdm}--网址已经存在----{news_url}")
exception = '网址已存在'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
# 增量使用
break
# 全量使用
#continue
try:
exception = getZx(driver,xydm, news_url, title, origin)
except:
log.error('获取正文失败')
driver.quit()
driver = baseCore.buildDriver(path, headless=True)
exception = '获取正文失败'
#if exception == '超过截止日期':
# log.info(f'{name}--{gpdm}--九月一日前数据以采集完毕')
# break
if exception == '':
state = 1
else:
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
log.info(f"{name}--{gpdm}--{i}----{news_url}")
log.info(f"{name}--{gpdm}--企业整体,耗时{baseCore.getTimeCost(start_time, time.time())}")
# 信息采集完成后将该企业的采集次数更新
runType = 'NewsRunCount'
count += 1
baseCore.updateRun(social_code, runType, count)
ErrorNume = 0
except Exception as e:
rePutIntoR(xydm)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
log.info(f"-------{name}--{gpdm}---'远程主机强迫关闭了一个现有的连接。'--------")
log.info('===========连接已被关闭========等待重新连接===========')
driver.quit()
driver = baseCore.buildDriver(path, headless=True)
time.sleep(5)
ErrorNume += 1
continue
EndTime = time.time()
if EndTime - StartTime >= 6 * 60 * 60 :
driver.quit()
log.info('程序执行超过6小时,重启')
break
cursor.close()
cnx.close()
# 释放资源
import json
import time
import requests
import sys
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# sys.path.append(r'F:\zzsn\zzsn_spider\base')
# import BaseCore
from base import BaseCore
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
taskType = '企业基本信息/雅虎财经'
baseCore = BaseCore.BaseCore()
r = baseCore.r
log = baseCore.getLogger()
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# 根据股票代码 获取企业基本信息 高管信息
def getInfo(enname, gpdm, xydm, start):
if 'HK' in str(gpdm):
tmp_g = str(gpdm).split('.')[0]
if len(tmp_g) == 5:
gpdm_ = str(gpdm)[1:]
else:
gpdm_ = gpdm
elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
gpdm_ = gpdm[:-2]
else:
gpdm_ = gpdm
retData = {}
retData['base_info'] = {
'公司名称': enname,
'英文名': enname,
'信用代码': xydm,
}
retData['people_info'] = []
url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'
time.sleep(3)
for i in range(0, 3):
try:
response = requests.get(url, headers=headers, verify=False)
time.sleep(1)
if (response.status_code == 200):
break
else:
log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")
except:
continue
try:
if 'lookup' in response.url:
log.error(f"{gpdm}------股票代码错误:{response.status_code}")
exeception = '股票代码错误'
state = 1
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, 0, takeTime, url, exeception)
return [state, retData]
elif response.status_code != 200:
log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
exeception = '获取基本信息接口返回失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gwqy_social_code', xydm)
return [state, retData]
except:
log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
exeception = '获取基本信息接口返回失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
baseCore.rePutIntoR('BaseInfoEnterpriseFbs:gwqy_social_code', xydm)
return [state, retData]
state = 1
soup = BeautifulSoup(response.content, 'html.parser')
page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
name = page.find('h3', {'class': 'Fz(m) Mb(10px)'}).text
try:
com_info = page.find('div', {'class': 'Mb(25px)'})
except:
com_info = ''
try:
com_phone = com_info.find_all('p')[0].find('a').text
except:
com_phone = ''
try:
com_url = com_info.find_all('p')[0].find('a', {'target': '_blank'}).text
except:
com_url = ''
try:
com_address = com_info.find_all('p')[0].text.replace(com_phone, '').replace(com_url, '')
except:
com_address = ''
try:
com_bumen = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[0].text
except:
com_bumen = ''
try:
com_hangye = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[1].text
except:
com_hangye = ''
try:
com_people = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})[2].text
except:
com_people = ''
try:
com_jianjie = page.find('p', {'class': 'Mt(15px) Lh(1.6)'}).text
except:
com_jianjie = ''
dic_com_info = {
'公司名称': name,
'英文名': name,
'信用代码': xydm,
'股票代码': gpdm,
'地址': com_address,
'电话': com_phone,
'公司网站': com_url,
'部门': com_bumen,
'行业': com_hangye,
'员工人数': com_people,
'公司简介': com_jianjie
}
retData['base_info'] = dic_com_info
# 高管信息
retPeople = []
try:
list_people = page.find('table', {'class': 'W(100%)'}).find_all('tr')[1:]
except:
list_people = []
for one_people in list_people:
try:
p_name = one_people.find_all('td')[0].text
except:
p_name = ''
continue
try:
p_zhiwu = one_people.find_all('td')[1].text
except:
p_zhiwu = ''
try:
p_money = one_people.find_all('td')[2].text
except:
p_money = ''
try:
p_xingshi = one_people.find_all('td')[3].text
except:
p_xingshi = ''
try:
p_year = one_people.find_all('td')[4].text
except:
p_year = ''
if (p_zhiwu == "N/A"):
p_zhiwu = ""
if (p_money == "N/A"):
p_money = ""
if (p_xingshi == "N/A"):
p_xingshi = ""
if (p_year == "N/A"):
p_year = ""
dic_main_people = {
'公司名称': name,
'股票代码': gpdm,
'信用代码': xydm,
'姓名': p_name,
'职务': p_zhiwu,
'薪资': p_money,
'行使': p_xingshi,
'出生年份': p_year
}
retPeople.append(dic_main_people)
retData['people_info'] = retPeople
log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
response.close()
return [state, retData]
# 保存基本信息
def saveBaseInfo(info, start):
# 基本信息发送到kafka
try:
company_dict = {
'name': info['base_info']['公司名称'], # 企业名称
'shortName': '', # 企业简称
'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
'officialPhone': info['base_info']['电话'], # 电话
'officialUrl': info['base_info']['公司网站'], # 官网
'briefInfo': info['base_info']['公司简介'], # 简介
'industry': info['base_info']['行业'], # 所属行业
'englishName': info['base_info']['英文名'], # 英文名
'address': info['base_info']['地址'], # 地址
'status': 0, # 状态
}
except:
company_dict = {
'name': info['base_info']['公司名称'], # 企业名称
'socialCreditCode': info['base_info']['信用代码'], # 统一社会信用代码
'englishName': info['base_info']['英文名'], # 英文名
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
kafka_result.get(timeout=10)
log.info(f"保存基本信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
log.info(f"保存基本信息--{company_dict['name']},耗时{baseCore.getTimeCost(start, time.time())}")
# 保存高管信息
def savePeopleInfo(info, start):
# 高管信息调用接口
list_people = info['people_info']
list_one_info = []
for i in range(0, len(list_people)):
dic_json = {
"socialCreditCode": list_people[i]['信用代码'],
"name": list_people[i]['姓名'],
"sex": '',
"education": '',
"position": list_people[i]['职务'],
"salary": list_people[i]['薪资'],
"birthYear": list_people[i]['出生年份'],
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": '',
"sort": str(i + 1)
}
list_one_info.append(dic_json)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
log.info("没有高管")
pass
else:
for i in range(0, 3):
response = requests.post('http://114.115.236.206:9988/datapull/sync/executive', data=json_updata,
timeout=300, verify=False)
if (response.status_code == 200):
retJson = json.loads(response.content.decode('utf-8'))
if (retJson['success'] or retJson['success'] == 'true'):
break
if (response.status_code == 200):
retJson = json.loads(response.content.decode('utf-8'))
if (retJson['success'] or retJson['success'] == 'true'):
pass
else:
log.error(f"保存高管接口失败---{retJson}")
exception = '保存高管接口失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception)
return state
else:
log.error(f"保存高管接口失败---{response.status_code}")
exception = '保存高管接口失败'
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(dic_json['socialCreditCode'], taskType, state, takeTime, '', exception)
return state
state = 1
log.info(f"保存高管信息--{info['base_info']['信用代码']},耗时{baseCore.getTimeCost(start, time.time())}")
return state
# 采集工作
def beginWork():
while True:
social_code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode')
# social_code = 'ZZSN230824151229535'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
# 数据库中获取基本信息
data = baseCore.getInfomation(social_code)
enname = data[5]
gpdm = data[3]
xydm = data[2]
# 获取该企业对应项目的采集次数
count = data[13]
start_time = time.time()
# 股票代码为空跳过
if gpdm == '':
info = {"base_info": {'公司名称': enname, '英文名': enname, '信用代码': xydm, }}
log.error(f'{xydm}....股票代码为空')
try:
saveBaseInfo(info, start_time)
except:
log.error(f'{enname}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
else:
try:
retData = getInfo(enname, gpdm, xydm, start_time)
# 基本信息采集成功 进行数据入库,否则不入库
if retData[0] == 1:
# 企业基本信息入库
try:
saveBaseInfo(retData[1], start_time)
time.sleep(1)
except:
log.error(f'{enname}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
# 企业高管信息入库
state = savePeopleInfo(retData[1], start_time)
time.sleep(1)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if state == 1:
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', '')
else:
pass
else:
pass
except Exception as e:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
info = {"base_info": {'公司名称': enname,'英文名': enname,'信用代码': xydm, }}
try:
saveBaseInfo(info, start_time)
log.info(f'{enname}.....股票代码出错只保存基本信息')
except:
log.error(f'{enname}....企业基本信息Kafka操作失败')
exception = 'Kafka操作失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
ee = e.__traceback__.tb_lineno
log.error(f'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(xydm, taskType, state, takeTime, '', f'数据采集失败,原因:{ee}行 {e}')
# 企业数据采集完成,采集次数加一
count += 1
runType = 'BaseInfoRunCount'
baseCore.updateRun(social_code, runType, count)
# 释放资源
baseCore.close()
if __name__ == '__main__':
beginWork()
from kafka import KafkaConsumer
import time
import redis
import hashlib
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import random
from kafka import KafkaProducer
# r = redis.Redis(host="localhost", port=6379)
# 将数据转换成hash值,用来对文章url进行去重,实现增量爬虫
# 使用模拟浏览器来获取cookie值
def get_cookie():
# executable_path = r"F:\spider\117\chromedriver.exe"
# opt = webdriver.ChromeOptions()
# #opt.add_argument('--headless')
#
# browser = webdriver.Chrome(chrome_options=opt, executable_path=executable_path)
path = r"F:\spider\117\chromedriver.exe"
service = Service(path)
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
browser = webdriver.Chrome(options=chrome_options, service=service)
browser.get("https://weibo.com/")
# 等待界面出现再获取cookie
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"app\"]")))
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
r.set("cookies", str(cookies), ex=600)
# r.set("cookies", str(cookies), ex=600)
# print(cookies)
return cookies
# 代码主程序,通过给出的用户url来获取用户发布的文章
}
s = requests.session()
# cookies_str = r.get("cookies")
cookies_str = get_cookie()
# if cookies_str == None:
# get_cookie()
# cookies = json.loads('{' + re.findall("{(.*?)}", str(r.get("cookies")).replace("\'", "\""))[0] + '}')
# else:
cookies = json.loads('{' + re.findall("{(.*?)}", str(cookies_str).replace("\'", "\""))[0] + '}')
s.cookies.update(cookies)
# print(uid)
# if add_uid(uid): # 若uid已存在于redis中,只爬取该作者第一页文章
# num_page = 2
# else: # 若uid不存在redis中,爬取1000页
# num_page = 1000
num_page = 10
# 爬取程序入口
for page in range(1, num_page): # 对后面进行无限翻页,直到无内容显示后跳出
title_one_con = list_news_[0]['url_title'] # 文章标题
url_one_con = "https://weibo.com/ttarticle/p/show?id=" + list_news_[0]['page_id'] # 文章链接URL
# if add_url(url_one_con): # 若url已存在,则返回TRUE,跳出本次循环
# continue
for num_res in range(0, 3): # url若访问失败可以最多访问3次
try:
# df_con = pd.DataFrame(list_all_info)
# df_con.to_excel(f'{uid}.xlsx')
# for one_news_info in list_all_info: # 将每一个文章数据转换为json格式,把json文件用kafka发送出去
# for num_pro in range(0, 3):
# try:
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
# kafka_result = producer.send("crawlerInfo",
# json.dumps(one_news_info, ensure_ascii=False).encode('utf8'))
# print(kafka_result.get(timeout=10))
# # time.sleep(1)
# # print(json.dumps(one_news_info, ensure_ascii=False))
# break
# except:
# time.sleep(5)
# print('发送kafka失败!正在重新发送!')
# continue
# print(list_all_info[0]['title'])
return
if __name__ == "__main__":
# r = redis.Redis(host="localhost",port=6379)
# consume()
get_content_by_user_uid('https://weibo.com/u/1689572847','1571698920447193090')
import datetime
import io
import json
import os
import random
import re
import time
import uuid
import pandas as pd
import pymysql
import redis
import requests
from bs4 import BeautifulSoup
from minio import Minio
from retry import retry
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cursor_ = baseCore.cursor
cnx_ = baseCore.cnx
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project', charset='utf8mb4')
client = Minio('114.115.215.96:9089', access_key='zzsn@9988@!', secret_key='zzsn@9988@!0519', secure=False)
create_by = 'LiuLiYuan'
# 数据入库,返回主键id传到kafka中
def tableUpdate(year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time, come, page_size, cnx):
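# Insert the attachment row, then read it back by full_path to return the auto-increment primary key for the caller.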
with cnx.cursor() as cursor:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time, come, page_size)
# print(values)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
querySql = '''select id from clb_sys_attachment where full_path = %s''' # and stock_code = "01786.HK"
cursor.execute(querySql, full_path)
selects = cursor.fetchone()
pdf_id = selects[0]
# cnx.close()
# print("更新完成:{}".format(pdf_id))
return pdf_id
# redis去重
def add_check_id(uid, mid):
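# SADD returns 0 when the member already exists in the set, so a 0 result means this mid has been collected before.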
r = redis.Redis(host="114.116.90.53", port=6379, password='zzsn9988', db=3)
res = r.sadd(f'weibo:{uid}', mid)  # 注意是 保存set的方式
if res == 0: # 若返回0,说明插入不成功,表示有重复
return True
else:
return False
# 登录账号并获取cookie
def get_cookie(driver):
path = 'F:/spider/117/chromedriver-win64/chromedriver.exe'
driver.get("https://weibo.com/")
while True:
try:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"app\"]")))
break
except:
driver.quit()
driver = baseCore.buildDriver(path, False)
driver.get("https://weibo.com/")
driver.find_element(By.CLASS_NAME, 'LoginCard_btn_Jp_u1').click()
while True:
flg = False
for cookie in driver.get_cookies():
if 'SSOLoginState' == cookie['name']:
flg = True
if flg:
break
else:
time.sleep(5)
cookie_list = driver.get_cookies()
cookies = {}
# 获取cookie中的name和value,转化成requests可以使用的形式
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
return cookies, driver
# 重新获取cookie
def get_cookie_again(driver):
driver.refresh()
cookies = {}
cookie_list = driver.get_cookies()
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
return cookies
# 获取请求
@retry(tries=5, delay=5)
def getRes(session, url_one_con, headers):
res_one_con = session.get(url_one_con, headers=headers) # 对具体文章页面进行请求,获得文章内容
if res_one_con.status_code != 200:
raise
return res_one_con
# 获取博主文章的总数
def getTotal(session, uid, headers):
url_total = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page=1&feature=0"
req_total = session.get(url_total, headers=headers).json()
total = req_total['data']['total']
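# The code assumes the mymblog API returns 20 posts per page and rounds the total up to a page count.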
if total % 20 == 0:
num_page = total // 20
else:
num_page = total // 20 + 1
return num_page
# 日期格式解析
def trs_date(date_str):
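# e.g. 'Fri Oct 13 10:30:00 +0800 2023' -> '2023-10-13 10:30:00'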
# 使用datetime.strptime解析日期字符串
date_obj = datetime.datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
# 转化为年月日时间格式
formatted_date = date_obj.strftime('%Y-%m-%d %H:%M:%S')
return formatted_date
# 获取展开的所有内容
def getLongContent(session, mblogid):
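# Posts shown truncated ('>展开<') expose their full text via the longtext endpoint; topic and short-link markup is rebuilt from topic_struct / url_struct.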
mblogurl = f'https://weibo.com/ajax/statuses/longtext?id={mblogid}'
mblogreq = session.get(mblogurl)
mblogreq.encoding = mblogreq.apparent_encoding
mblogreqdata = mblogreq.json()['data']
content = mblogreqdata['longTextContent']
topic_structs = mblogreqdata['topic_struct']
for topic_struct in topic_structs:
topic_title = topic_struct['topic_title']
topic_content = f'''<a href="//s.weibo.com/weibo?q=#{topic_title}#" target="_blank">#{topic_title}#</a>'''
content = content.replace(f'#{topic_title}#', topic_content)
url_structs = mblogreqdata['url_struct']
for url_struct in url_structs:
short_url = url_struct['short_url']
content = content.replace(short_url, '')
title = url_struct['url_title']
long_url = url_struct['long_url']
title_content = f'''<a target="_blank" href="{long_url}"><img class="icon-link" src="https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_web_default.png"/>{title}</a>'''
content += title_content
return content
# 下载展示图片
def getPic(session, pic_infos, mid, uid, origin, year):
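# Stream each image's 'large' URL into MinIO under img/微博/{mid}/ and register it in clb_sys_attachment; the attachment ids are collected into pic_list.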
pic_list = []
for pic_info in pic_infos:
pic_url = pic_info['large']['url']
category = os.path.splitext(pic_url)[1]
img_name = f'{uuid.uuid1()}{category}'
if 'jpg' in category or 'jpeg' in category:
content_type = 'image/jpeg'
elif 'png' in category:
content_type = 'image/png'
elif 'gif' in category:
content_type = 'image/gif'
else:
content_type = 'application/octet-stream'
try:
req = session.get(pic_url)
req.encoding = req.apparent_encoding
res_content = io.BytesIO(req.content)
size = res_content.getbuffer().nbytes
# 将文件用数据流下载到服务器
result = client.put_object('jcxm', f'img/微博/{mid}/{img_name}', res_content, size, content_type=content_type)
group_name = 'jcxm'
path = f'img/微博/{mid}/{img_name}'
full_path = path
file_size = size
order_by = 1
status = 1
create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
img_id = tableUpdate(year, img_name, 15, mid, group_name, path, full_path,
category, file_size, order_by, status, create_by, create_time, origin, 0, cnx)
pic_list.append(img_id)
except:
log.error(f"{mid}...{uid}下载失败")
return pic_list, False
return pic_list, True
# 下载跳转类图片
def getPage(session, page_info, long_url, mid, uid, origin, year):
page_list = []
object_type = page_info['object_type']
if object_type == 'video':
long_url = page_info['media_info']['h5_url']
video_id, flg = getVedio(session, page_info, mid, year, origin)
if flg:
page_list.append([video_id, long_url, object_type])
return page_list, True
else:
return page_list, False
else:
url = page_info['page_pic']
category = os.path.splitext(url)[1]
img_name = f'{uuid.uuid1()}{category}'
if 'jpg' in category or 'jpeg' in category:
content_type = 'image/jpeg'
elif 'png' in category:
content_type = 'image/png'
elif 'gif' in category:
content_type = 'image/gif'
else:
content_type = 'application/octet-stream'
try:
req = session.get(url)
req.encoding = req.apparent_encoding
res_content = io.BytesIO(req.content)
size = res_content.getbuffer().nbytes
# 将文件用数据流下载到服务器
result = client.put_object('jcxm', f'img/微博/{mid}/{img_name}', res_content, size, content_type=content_type)
group_name = 'jcxm'
path = f'img/微博/{mid}/{img_name}'
full_path = path
file_size = size
order_by = 1
status = 1
create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
img_id = tableUpdate(year, img_name, 15, mid, group_name, path, full_path,
category, file_size, order_by, status, create_by, create_time, origin, 0, cnx)
page_list.append([img_id, long_url, object_type])
except:
log.error(f"{mid}...{uid}下载失败")
return page_list, False
return page_list, True
# 获取转发内容
def getForward(session, mid, uid, origin, year, one_con):
forwardUser = one_con['user']['screen_name']
content = one_con['text']
if '>展开<' in content:
mblogid = one_con['mblogid']
content = getLongContent(session, mblogid)
else:
contentTag = BeautifulSoup(content, 'lxml')
a_list = contentTag.find_all('a')
for a in a_list:
href = a.get('href')
if '@' in a.text:
a['href'] = 'https://weibo.com' + href
elif '#' in a.text:
a['href'] = 'https:' + href
else:
continue
content = str(contentTag)
# 获取图片链接
# 图片需要下载
pic_list = []
try:
pic_infos = one_con['pic_infos']
pic_list, flg = getPic(session, pic_infos, mid, uid, origin, year)
if not flg:
return False
except:
pass
page_list = []
try:
page_info = one_con['page_info']
long_url = one_con['url_struct'][0]['long_url']
page_list, flg = getPage(session, page_info, long_url, mid, uid, origin, year)
if not flg:
return False
except:
pass
return True
# 下载视频 视频链接可下载
def getVedio(session, page_info, mid, year, origin):
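# Prefer the 720p mp4 and fall back to HD then SD; the stream is uploaded to MinIO and recorded in the attachment table, returning the attachment id.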
title = page_info['kol_title']
try:
href = page_info['mp4_720p_mp4']
if not href:
href = page_info['mp4_hd_url']
if not href:
href = page_info['mp4_sd_url']
if not href:
raise
except:
log.error(f'{mid}==={title}===视频链接获取失败')
return '', False
try:
req = session.get(href)
res_content = io.BytesIO(req.content)
size = res_content.getbuffer().nbytes
# 将文件用数据流下载到服务器
result = client.put_object('jcxm', f'img/微博/{mid}/{title}.mp4', res_content, size, content_type='video/mp4')
except:
log.error(f'{mid}==={title}===上传minio失败')
return '', False
group_name = 'jcxm'
path = f'video/微博/{mid}/{title}.mp4'
full_path = path
file_size = size
order_by = 1
status = 1
create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
try:
video_id = tableUpdate(year, title, 16, mid, group_name, path, full_path,
'mp4', file_size, order_by, status, create_by, create_time, origin, 0, cnx)
except:
log.error(f'{mid}==={title}===上传附件表失败')
return '', False
return video_id, True
# 代码主程序,通过给出的用户url来获取用户发布的文章
def get_content_by_user_uid(url, sid):
path = 'F:/spider/117/chromedriver-win64/chromedriver.exe'
driver = baseCore.buildDriver(path, False)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}
session = requests.session()
cookies_str, driver = get_cookie(driver)
cookies = json.loads('{' + re.findall("{(.*?)}", str(cookies_str).replace("\'", "\""))[0] + '}')
session.cookies.update(cookies)
# 获取到统一格式的名称,用来查询微博uid
if url[-1] == "/":
url = url[:-1]
if "?" not in url:
url_get_uid = "https://weibo.com/ajax/profile/info?custom=" + url.split('/')[-1]
else:
if "%" not in url:
url_get_uid = "https://weibo.com/ajax/profile/info?custom=" + url.split('/')[-1].split('?')[0]
else:
url_get_uid = "https://weibo.com/ajax/profile/info?screen_name=" + url.split('/')[-1].split('?')[0]
try:
res_get_uid_json = session.get(url_get_uid, headers=headers).json()
weibo_name = res_get_uid_json['data']['user']['screen_name'] # 微博号名称
uid = res_get_uid_json['data']['user']['id'] # 微博uid
origin = "微博-" + weibo_name
except:
print(f"{url}:uid获取失败")
return
num_page = getTotal(session, uid, headers)
log.info(f'开始采集共{num_page}页')
# 爬取程序入口
start_time = time.time()
for page in range(1, num_page):
page_flg = True
log.info(f'开始采集第{page}页')
try:
url_all_con = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page={page}&feature=0" # 使用uid找到每个微博的所有文章
res_all_con_json = session.get(url_all_con, headers=headers).json()
list_all_con = res_all_con_json['data']['list'] # 每页微博文章为json类,取出需要的数据
except Exception as e:
log.error("{}的{}页获取失败".format(weibo_name, page), e, sep='===')
continue
for one_con in list_all_con:
equ_source = one_con['source'].replace('\n', '') # 信息发布方式
like = int(one_con['attitudes_count']) # 点赞数
commentNum = int(one_con['comments_count']) # 评论数
collection = int(one_con['reposts_count']) # 转发数
mid = one_con['mid'] # 文章id
if add_check_id(uid, mid):
log.info(f'{uid}==={mid}===已采集')
continue
publishDate = trs_date(one_con['created_at']) # 发布时间
year = publishDate[:4]
if publishDate < '2023-08-01':
page_flg = False
break
# 获取微博文字内容
content = one_con['text']
if '>展开<' in content:
mblogid = one_con['mblogid']
content = getLongContent(session, mblogid)
else:
contentWithTag = BeautifulSoup(content, 'lxml')
a_list = contentWithTag.find_all('a')
for a in a_list:
href = a.get('href')
if '@' in a.text:
a['href'] = 'https://weibo.com' + href
elif '#' in a.text:
a['href'] = 'https:' + href
else:
continue
content = contentWithTag.text
contentWithTag = str(contentWithTag)
try:
title = re.findall('【(.*?)】',content)[0]
except:
title = content
# 获取图片链接
# 图片需要下载
pic_list = []
try:
pic_infos = one_con['pic_infos']
pic_list, flg = getPic(session, pic_infos, mid, uid, origin, year)
if not flg:
continue
except:
pass
page_list = []
try:
page_info = one_con['page_info']
long_url = one_con['url_struct'][0]['long_url']
page_list, flg = getPage(session, page_info, long_url, mid, uid, origin, year)
if not flg:
continue
except:
pass
forward = {}
try:
retweeted_status = one_con['retweeted_status']
flg = getForward(session, mid, uid, origin, year, retweeted_status)
if not flg:
continue
except:
pass
log.info(
f'{uid}==={mid}==={equ_source}==={like}==={commentNum}==={collection}==={publishDate}==={content}==={pic_list}==={page_list}')
time.sleep(random.uniform(3, 5))
log.info("{}的{}页获取成功".format(weibo_name, page))
end_time = time.time()
if end_time - start_time > 1800:
cookies_str = get_cookie_again(driver)
cookies = json.loads('{' + re.findall("{(.*?)}", str(cookies_str).replace("\'", "\""))[0] + '}')
session.cookies.update(cookies)
start_time = time.time()
log.info(f'已重新获取cookie')
if not page_flg:
break
if __name__ == "__main__":
# get_content_by_user_uid('https://weibo.com/u/1689572847', '1571698920447193090')
baseCore.close()