提交 222110f7 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

......@@ -11,7 +11,6 @@ from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages import urllib3
from retry import retry
from base import BaseCore
urllib3.disable_warnings()
......@@ -20,6 +19,7 @@ log = baseCore.getLogger()
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
r = baseCore.r
URL = 'https://www.nasdaq.com/'
session = requests.session()
session.mount('https://', HTTPAdapter(pool_connections=20, pool_maxsize=100))
......@@ -65,6 +65,7 @@ def add_date(com_code, date_list):
# 数据发送端口
def sendData(start_time, social_code, gpdm, dic_info):
data = json.dumps(dic_info)
# print(data)
url_baocun = 'http://114.115.236.206:8088/sync/finance/nsdk'
for nnn in range(0, 3):
try:
......@@ -86,7 +87,7 @@ def getUnit(gpdm):
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'lxml')
unit = soup.find('div', class_='financials__note').text.split(' ')[1].lstrip().strip()
unit = f'(千){unit}'
unit = f'{unit}(千)'
req.close()
return unit
......@@ -104,9 +105,11 @@ def getlist(table, tableName):
value = re.sub(r"[^\d+-]", "", value)
else:
value = '-'
date = years[f'value{i}'].split('/')[2] + '-' + years[f'value{i}'].split('/')[0] + '-' + \
years[f'value{i}'].split('/')[1]
list.append({f'{tableName}': name, 'value': value, 'date': date, })
date_ = years[f'value{i}']
if date_ :
date = date_.split('/')[2] + '-' + date_.split('/')[0] + '-' + \
date_.split('/')[1]
list.append({f'{tableName}': name, 'value': value, 'date': date, })
return list
......@@ -162,6 +165,7 @@ def getYear(start_time, session, social_code, gpdm):
# 判断该报告期是否已采过
panduan = check_date(social_code, date + '-year')
if panduan:
log.info(f'{social_code}=={gpdm}=={date}年度数据采集过')
continue
xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
......@@ -177,6 +181,7 @@ def getYear(start_time, session, social_code, gpdm):
"ynFirst": ynFirst,
}
sendData(start_time, social_code, gpdm, dic_info)
log.info(f'{social_code}=={gpdm}=={date}年度财务数据采集成功')
date_list.append(date + '-year')
else:
log.error(f'找不到{social_code}=={gpdm}年度财务数据')
......@@ -184,6 +189,7 @@ def getYear(start_time, session, social_code, gpdm):
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无年度财务数据')
except:
log.error(f'{social_code}===年度财务数据访问失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===年度财务数据访问失败')
......@@ -217,6 +223,7 @@ def getQuarter(start_time, session, social_code, gpdm):
# 判断该报告期是否已采过
panduan = check_date(social_code, date + '-quarter')
if panduan:
log.info(f'{social_code}=={gpdm}=={date}季度数据采集过')
continue
xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
......@@ -236,13 +243,15 @@ def getQuarter(start_time, session, social_code, gpdm):
if panduan_flag:
dic_info['dateFlag'] = 'year'
sendData(start_time, social_code, gpdm, dic_info)
log.info(f'{social_code}=={gpdm}=={date}季度财务数据采集成功')
date_list.append(date + '-quarter')
else:
log.error(f'{social_code}=={gpdm}无季度财务数据')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无季度财务数据')
except:
log.error(f'{social_code}===季度财务数据访问失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===季度财务数据访问失败')
......@@ -250,36 +259,52 @@ def getQuarter(start_time, session, social_code, gpdm):
return date_list
def FinanceFromNasdaq():
    """Queue the code of every eligible Nasdaq company in redis.

    Reads the ``xydm`` column of ``mgzqyjwyh_list`` for rows with ``state=2``
    and ``exchange='Nasdaq'`` and appends each value to the redis list
    ``FinanceFromNasdaq:nasdaqfinance_socialCode``.
    """
    queue_key = 'FinanceFromNasdaq:nasdaqfinance_socialCode'
    cursor.execute("select xydm from mgzqyjwyh_list where state=2 and exchange='Nasdaq';")
    # Read every row before pushing so the DB fetch is complete first.
    codes = [row[0] for row in cursor.fetchall()]
    for code in codes:
        r.rpush(queue_key, code)
    print('redis放入成功')
def getInfomation(social_code):
    """Fetch the ``mgzqyjwyh_list`` row of one company.

    Args:
        social_code: Value matched against the ``xydm`` column.

    Returns:
        The first row with ``state=2`` and a matching ``xydm`` as returned by
        ``cursor.fetchone()``, or ``None`` when no row matches.
    """
    # Bind the code as a query parameter instead of formatting it into the
    # SQL text, so quotes in the value cannot break or alter the statement.
    sql = "select * from mgzqyjwyh_list where state=2 and xydm=%s;"
    cursor.execute(sql, (social_code,))
    return cursor.fetchone()
def doJob():
    """Collect yearly and quarterly financial data for queued companies, forever.

    Company codes are popped from the redis list
    ``FinanceFromNasdaq:nasdaqfinance_socialCode``. For each code the company
    row is loaded with ``getInfomation``, yearly and quarterly data are
    collected with ``getYear`` / ``getQuarter``, and the report periods they
    return are stored with ``add_date``. When the queue is empty the loop
    sleeps for ten minutes and polls again, so this function never returns
    normally.

    The database cursor and connection are deliberately NOT closed here:
    every iteration queries the database through ``getInfomation``.
    """
    # Initial request to the site home page before any data request
    # (presumably to pick up cookies on the shared session - TODO confirm).
    session.get(URL, headers=headers)
    while True:
        social_code = baseCore.redicPullData('FinanceFromNasdaq:nasdaqfinance_socialCode')
        if not social_code:
            log.info('============已没有数据============等待===============')
            time.sleep(600)
            continue
        data_enterprise = getInfomation(social_code)
        if data_enterprise is None:
            # The queued code has no matching row (with state=2); skip it
            # instead of crashing the whole loop on ``None[3]``.
            log.error(f'{social_code}===未查询到企业信息')
            continue
        start_time = time.time()
        # Row layout comes from ``select *``: index 3 is used as the stock
        # code and index 6 as the social code (verify against the table).
        gpdm = data_enterprise[3]
        social_code = data_enterprise[6]
        # Collect yearly data, then remember the collected periods in redis.
        date_list_year = getYear(start_time, session, social_code, gpdm)
        add_date(social_code, date_list_year)
        # Collect quarterly data, then remember the collected periods in redis.
        date_list_quarter = getQuarter(start_time, session, social_code, gpdm)
        add_date(social_code, date_list_quarter)
        timeCost = baseCore.getTimeCost(start_time, time.time())
        state = 1
        baseCore.recordLog(social_code, taskType, state, timeCost, '', '')
        log.info(f'{social_code}=={gpdm}==耗时{timeCost}')
if __name__ == '__main__':
    try:
        # Collect financial data for the company codes queued in redis.
        doJob()
        # To (re)fill the redis queue with company codes, run
        # FinanceFromNasdaq() instead of doJob().
    finally:
        # Release the database resources even when doJob() is interrupted or
        # raises; without the finally these lines were never reached.
        cursor.close()
        cnx.close()
\ No newline at end of file
# created by virtualenv automatically
*
import gc
from flask import Flask, render_template, request, current_app
import configparser
from controller.Main import Main # 导入全部蓝图变量
import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
from datetime import datetime
from dao.Conn import ConnMySql
import sys
import io
# Clear login state.
def clearLoginStateIn24H():
    """Run ``ConnMySql.userClearLoginStateIn24H`` and print a timestamped notice."""
    ConnMySql().userClearLoginStateIn24H()
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"清除登录状态-{stamp}")
app = Flask(__name__)  # Create the Flask application object.
app.register_blueprint(Main)  # Register the Main blueprint on the application.

# Largest accepted upload: 16 MiB.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

# Application settings from the INI file are flattened into app.config under
# "<section>.<key>", e.g. db.port=3306.
cfg = configparser.ConfigParser()
cfg.optionxform = str  # Keep the case of option names as written in the file.
cfg.read("static/conf/sys.ini", encoding='utf-8')
for section_name in cfg.sections():
    for option, raw_value in cfg.items(section_name):
        app.config[f'{section_name}.{option}'] = raw_value

# A few values need a non-string type.
app.config['db.port'] = int(app.config['db.port'])
app.config['sys.useProxy'] = app.config['sys.useProxy'] != "0"
app.config['sys.proxyid'] = 0  # id of the proxy currently in use
app.config['sys.userid'] = 0  # id of the account currently in use

# Re-wrap stdout so everything printed is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
if __name__ == '__main__':
    import os

    # webbrowser.open("0.0.0.0:5000")
    # The server listens on every interface, and the Werkzeug debugger lets a
    # client run arbitrary code, so debug mode is opt-in (FLASK_DEBUG=1)
    # instead of always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5201, debug=debug_enabled)  # Entry point.
    # To start the scheduled job that clears abnormal login state every half hour:
    # sched = BlockingScheduler()
    # sched.add_job(clearLoginStateIn24H, 'interval', seconds=1800, id='task-clearLoginStateIn24H')
    # sched.start()
import gc
from flask import Blueprint, request, current_app, make_response, send_file # 导入蓝图
import datetime
import re
import os
import logging
import sys
import io
import tempfile
import openpyxl
import string
import json
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium import webdriver
from selenium.webdriver.common.by import By
from util import UtilDate
from service.Service02 import Service02
Main = Blueprint('Main', __name__) # 初始化一个蓝图,而不是Flask对象
# Accept a request, read the JSON parameters from its body and scrape accordingly.
# {"from":"1900-01-01","last":<number of most recent days>, "orgs":["full name of org 1","full name of org 2",...]}
@Main.route('/Main/getData', methods=["POST"])
def getData():
    """Handle ``POST /Main/getData``.

    The JSON body must contain ``from``, ``last`` and ``orgs``. When ``from``
    is an empty string the start date is derived from ``last``: an empty
    ``last`` gives an offset of 0 days, otherwise the offset is
    ``-(int(last) - 1)`` days, applied through ``UtilDate.dateAdd`` to an
    empty base date.

    Returns:
        Whatever ``Service02().getData(dateFrom, orgs)`` returns.
    """
    print("POST /Main/getData")
    paras = request.get_json(force=True)
    dateFrom, lastDays, orgs = paras['from'], paras['last'], paras['orgs']
    if dateFrom == "":
        day_offset = 0 if lastDays == "" else -(int(lastDays) - 1)
        dateFrom = UtilDate.dateAdd("", "d", day_offset)
    # URL noted by the original author:
    # https://wenshu.court.gov.cn/website/wenshu/181029CR4M5A62CH/index.html
    return Service02().getData(dateFrom, orgs)
class ProxyDao:
    """Placeholder data-access class; it has no behaviour yet."""

    def t(self):
        """Do nothing and return ``None``."""
        return None
# 基本信息
from util import UtilDate
from util import UtilNumber
class BaseInfo:
    """Basic information of one scraped record; all fields are strings."""

    info_title: str    # title
    key_word: str      # keyword
    info_bianhao: str  # case number
    info_address: str  # court of jurisdiction
    info_time: str     # publication date, yyyy-mm-dd
    info_id: str       # case id
    info_yuanyou: str  # grounds of the judgment
    info_content: str  # body text

    def isAfter(self, sDate: str) -> bool:
        """Tell whether this record's date is on or after ``sDate``.

        Dates are compared as strings. An empty ``sDate`` always yields
        ``False``.
        """
        return sDate != "" and self.info_time >= sDate

    def toString(self):
        """Return title, keyword, case number, court, date and id joined by tabs."""
        columns = (
            self.info_title,
            self.key_word,
            self.info_bianhao,
            self.info_address,
            self.info_time,
            self.info_id,
        )
        return "\t".join(columns)
home = C:\Program Files\Python
implementation = CPython
version_info = 3.8.0.final.0
virtualenv = 20.13.0
include-system-site-packages = true
base-prefix = C:\Program Files\Python
base-exec-prefix = C:\Program Files\Python
base-executable = C:\Program Files\Python\python.exe
#系统配置
[sys]
#文字识别Url,用于识别裁判文书网的验证码
ocrUrl=http://114.116.49.86:8013/wzsb_app?withCrLf=false
#登录模式,0-无需登录,1-账号登录(需要口令、短信、验证码相应的选择器不能为空),2-cookie登录
loginMode=1
#是否使用代理,0-不用,1-使用,需登录的一般不适用代理
useProxy=1
#验证码识别,0-不识别,1-识别,暂采用固定的方法识别验证码,后续扩展为不同的识别模式
verifiCode=0
#登录Url ?open=login
loginUrl=https://wenshu.court.gov.cn/website/wenshu/181010CARHS5BS3C/index.html
#正常Url,登录后可能会自动跳转到正常Url
mainUrl=https://wenshu.court.gov.cn
#登录-用户
loginUser=#root > div > form > div > div:nth-child(1) > div > div > div > input
#登录-口令
loginPasswd=#root > div > form > div > div:nth-child(2) > div > div > div > input
#裁判文书网的图形验证码在单独的页面,输入正确后返回到登录界面
#登录-图形验证码输入框,不为空时则需要识别验证码
loginCaptchaInput=body > div > div.card-body > div > form > div.captcha > input
#登录-图形验证码图片
loginCaptchaImage=#Image1
#登录-图形验证码确认按钮
loginCaptchaButton=body > div > div.card-body > div > form > div.warnbtn > input
#登录-短信验证码,可能和图形验证码同时需要,暂未处理
loginSMSCode=
#主界面登录按钮
loginButton=#loginLi > a
#登录界面确认登录按钮
loginOk=#root > div > form > div > div.login-button-container > span
#数据库配置
[db]
host=114.115.159.144
port=3306
user=caiji
passwd=zzsn9988
db=caiji
charset=utf8
#css选择器配置
[css]
#搜索-文本框
searchInput=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-middle > input
#搜索-按钮
searchButton=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-rightBtn.search-click
#列表-日期倒排按钮
listDateSort=#_view_1545184311000 > div.LM_tool.clearfix > div:nth-child(2) > a
#列表-案件数量
listCount=#_view_1545184311000 > div.LM_con.clearfix > div.fr.con_right > span
#列表-案件名称
listTitle=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
#列表-编号
listBianhao=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.ah
#列表-法院
listAddress=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.slfyName
#列表-审结日期
listTime=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.cprq
#列表-案由
listYuanyou=#_view_1545184311000 > div:nth-child(?) > div.list_reason > p
#下一页按钮
listNextPage=#_view_1545184311000 > div.left_7_3 > a:last-child
#正文-链接,一般和title相同
contLink=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
#正文-正文
contContent=#_view_1541573883000 > div > div.PDF_box > div.PDF_pox
# 验证码识别,暂只处理裁判文书网的验证码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import WebDriver
import requests
from flask import current_app
from pathlib import Path
import tempfile
import uuid
import hashlib
import os
import json
# selecter: CSS selector of the captcha image
def getCaptchaMode1(browser: WebDriver, selecter: str):
    """Screenshot a captcha element and have the OCR service read it.

    The element matched by ``selecter`` is saved as a PNG under
    ``./Temp_file`` and posted to the URL configured as ``sys.ocrUrl``. The
    ``resultData`` field of the JSON answer is returned.

    Args:
        browser: Selenium driver showing the page with the captcha.
        selecter: CSS selector of the captcha image element.

    Returns:
        The recognised text, or ``""`` when any step fails (the error is
        printed, not raised).
    """
    ret = ""
    out_path = "./Temp_file"
    path_name = None
    try:
        Path(out_path).mkdir(parents=True, exist_ok=True)
        # A random file name keeps concurrent captchas from overwriting each other.
        path_name = os.path.join(out_path, str(uuid.uuid4())) + ".png"
        print(path_name)
        # Save the captcha image by taking a screenshot of the element.
        img = browser.find_element(By.CSS_SELECTOR, selecter)
        img.screenshot(path_name)
        ocrUrl = current_app.config['sys.ocrUrl']
        # Call the text-recognition service; the with-block closes the file
        # even when the request raises.
        with open(path_name, "rb") as file:
            response = requests.post(ocrUrl, files={"multiRequest": file})
        # Answer shape: {"code":200,"logs":null,"message":"success","resultData":"2rVK"}
        oRet = json.loads(response.text)
        ret = oRet["resultData"]
        print(ret)
    except Exception as err:
        print('getCaptchaMode1 error:', err)
    finally:
        # Always drop the temporary screenshot, also on failure, so the
        # directory does not fill up with stale images.
        if path_name is not None and os.path.exists(path_name):
            try:
                os.remove(path_name)
            except OSError as err:
                print('getCaptchaMode1 error:', err)
    return ret
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
# Convert a date written as yyyy年m月d日 into yyyy-mm-dd.
def convertDate(sDate:str):
    """Return ``sDate`` (e.g. ``2023年5月6日``) as a zero-padded ``yyyy-mm-dd`` string.

    Raises:
        ValueError: if the text is not a valid date after the markers are replaced.
    """
    # 年 and 月 become dashes, 日 is dropped.
    dashed = sDate.translate(str.maketrans("年月", "--", "日"))
    return datetime.strptime(dashed, '%Y-%m-%d').strftime('%Y-%m-%d')
# Shift a date by an offset; ymd selects the unit: y=year, m=month, d=day.
def dateAdd(sDate:str, ymd:str="d", diff:int=1):
    """Return ``sDate`` shifted by ``diff`` units as a ``yyyy-mm-dd`` string.

    Args:
        sDate: Start date as ``yyyy-mm-dd``; an empty string means today.
        ymd: Unit of the shift: ``"y"``, ``"m"`` or ``"d"``. Any other value
            leaves the date unchanged.
        diff: Number of units to add; negative values move backwards.
    """
    fmt = '%Y-%m-%d'
    if sDate == "":
        sDate = datetime.now().strftime(fmt)
    start = datetime.strptime(sDate, fmt)
    if ymd == "y":
        shifted = start + relativedelta(years=diff)
    elif ymd == "m":
        shifted = start + relativedelta(months=diff)
    elif ymd == "d":
        shifted = start + timedelta(days=diff)
    else:
        shifted = start
    return shifted.strftime(fmt)
\ No newline at end of file
#数值处理类
# Convert an amount given as text into a number; the text may contain
# markers such as 万, 亿, 元 or 人民币.
def convertMoney(sMoney:str):
    """Return the amount in ``sMoney`` as a float.

    The markers ``人民币`` and ``元`` are dropped. The unit markers ``万``
    (ten thousand) and ``亿`` (hundred million) scale the number, so ``"5万元"``
    yields ``50000.0``; both may be combined (``万亿``). ASCII and full-width
    thousands separators are ignored.

    Raises:
        ValueError: if what remains is not a valid number.
    """
    text = sMoney.replace("人民币", "").replace("元", "")
    text = text.replace(",", "").replace(",", "")
    multiplier = 1
    # Apply the unit instead of only deleting it: dropping 万/亿 without
    # scaling made "5万" and "5亿" both come out as 5.0.
    for unit, factor in (("万", 10_000), ("亿", 100_000_000)):
        if unit in text:
            multiplier *= factor
            text = text.replace(unit, "")
    return float(text) * multiplier
from selenium.webdriver.chrome.webdriver import WebDriver
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
# IP proxy pool
class UtilProxy:
    """One proxy entry: id, address, port and credentials."""

    id: int
    ip: str
    port: str
    name: str
    password: str

    # Switch the IP proxy.
    def alterIP(self, browser:WebDriver):
        """Not implemented yet; does nothing and returns ``None``."""
        return None
# 账号信息
from util import UtilDate
from util import UtilNumber
class LoginInfo:
    """Account record: declares the fields only, no behaviour."""

    id: int           # record id
    user_group: str   # account group
    user_name: str    # login name
    user_passwd: str  # login password
# 代理IP信息
from util import UtilDate
from util import UtilNumber
class ProxyInfo:
    """Proxy record: declares the fields only, no behaviour."""

    id: int           # record id
    ip: str           # proxy address
    port: str         # proxy port, kept as text
    user_name: str    # proxy user name
    user_passwd: str  # proxy password
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论