Merge remote-tracking branch 'origin/master'

222110f7 · 薛凌堃 · ac853d92 · ce6193f3 · 222110f7 · 222110f7
--- a/comData/caiwushuju/nasdaq_caiwu.py
+++ b/comData/caiwushuju/nasdaq_caiwu.py
@@ -11,7 +11,6 @@ from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.packages import urllib3
 from retry import retry
-
 from base import BaseCore

 urllib3.disable_warnings()
@@ -20,6 +19,7 @@ log = baseCore.getLogger()
 cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
                      charset='utf8mb4')
 cursor = cnx.cursor()
+r = baseCore.r
 URL = 'https://www.nasdaq.com/'
 session = requests.session()
 session.mount('https://', HTTPAdapter(pool_connections=20, pool_maxsize=100))
@@ -65,6 +65,7 @@ def add_date(com_code, date_list):
 # 数据发送端口
 def sendData(start_time, social_code, gpdm, dic_info):
    data = json.dumps(dic_info)
+    # print(data)
    url_baocun = 'http://114.115.236.206:8088/sync/finance/nsdk'
    for nnn in range(0, 3):
        try:
@@ -86,7 +87,7 @@ def getUnit(gpdm):
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'lxml')
    unit = soup.find('div', class_='financials__note').text.split(' ')[1].lstrip().strip()
-    unit = f'(千){unit}'
+    unit = f'{unit}(千)'
    req.close()
    return unit

@@ -104,9 +105,11 @@ def getlist(table, tableName):
                value = re.sub(r"[^\d+-]", "", value)
            else:
                value = '-'
-            date = years[f'value{i}'].split('/')[2] + '-' + years[f'value{i}'].split('/')[0] + '-' + \
-                   years[f'value{i}'].split('/')[1]
-            list.append({f'{tableName}': name, 'value': value, 'date': date, })
+            date_ = years[f'value{i}']
+            if date_ :
+                date = date_.split('/')[2] + '-' + date_.split('/')[0] + '-' + \
+                       date_.split('/')[1]
+                list.append({f'{tableName}': name, 'value': value, 'date': date, })
    return list


@@ -162,6 +165,7 @@ def getYear(start_time, session, social_code, gpdm):
                # 判断该报告期是否已采过
                panduan = check_date(social_code, date + '-year')
                if panduan:
+                    log.info(f'{social_code}=={gpdm}=={date}年度数据采集过')
                    continue
                xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
                zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
@@ -177,6 +181,7 @@ def getYear(start_time, session, social_code, gpdm):
                    "ynFirst": ynFirst,
                }
                sendData(start_time, social_code, gpdm, dic_info)
+                log.info(f'{social_code}=={gpdm}=={date}年度财务数据采集成功')
                date_list.append(date + '-year')
        else:
            log.error(f'找不到{social_code}=={gpdm}年度财务数据')
@@ -184,6 +189,7 @@ def getYear(start_time, session, social_code, gpdm):
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无年度财务数据')
    except:
+        log.error(f'{social_code}===年度财务数据访问失败')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===年度财务数据访问失败')
@@ -217,6 +223,7 @@ def getQuarter(start_time, session, social_code, gpdm):
                # 判断该报告期是否已采过
                panduan = check_date(social_code, date + '-quarter')
                if panduan:
+                    log.info(f'{social_code}=={gpdm}=={date}季度数据采集过')
                    continue
                xjll_list_f = reviseData([item for item in final_list if 'xjll' in item], unit, 'xjll')
                zcfz_list_f = reviseData([item for item in final_list if 'zcfz' in item], unit, 'zcfz')
@@ -236,13 +243,15 @@ def getQuarter(start_time, session, social_code, gpdm):
                if panduan_flag:
                    dic_info['dateFlag'] = 'year'
                sendData(start_time, social_code, gpdm, dic_info)
+                log.info(f'{social_code}=={gpdm}=={date}季度财务数据采集成功')
                date_list.append(date + '-quarter')
-
        else:
+            log.error(f'{social_code}=={gpdm}无季度财务数据')
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===无季度财务数据')
    except:
+        log.error(f'{social_code}===季度财务数据访问失败')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===季度财务数据访问失败')
@@ -250,36 +259,52 @@ def getQuarter(start_time, session, social_code, gpdm):
    return date_list


+def FinanceFromNasdaq():
+    """Push the xydm code of every Nasdaq row with state=2 onto the redis list.
+
+    Reads from the module-level ``cursor`` and writes to the module-level
+    redis client ``r``; prints a confirmation once every code has been pushed.
+    """
+    queue_key = 'FinanceFromNasdaq:nasdaqfinance_socialCode'
+    cursor.execute("select xydm from mgzqyjwyh_list where state=2 and exchange='Nasdaq';")
+    # Pull the first (only) column out of every row before touching redis
+    codes = [row[0] for row in cursor.fetchall()]
+    for code in codes:
+        r.rpush(queue_key, code)
+    print('redis放入成功')
+
+def getInfomation(social_code):
+    """Fetch the mgzqyjwyh_list row with state=2 for one company.
+
+    :param social_code: value matched against the ``xydm`` column
+    :return: whatever ``cursor.fetchone()`` yields for the query, i.e. the
+        first matching row, or None when no row matches
+    """
+    # Bind the code as a query parameter rather than formatting it into the
+    # SQL text, so quotes inside the value cannot change the statement.
+    sql = "select * from mgzqyjwyh_list where state=2 and xydm=%s;"
+    cursor.execute(sql, (social_code,))
+    return cursor.fetchone()
+
 def doJob():
-    # while True:
-    # social_code = baseCore.redicPullData('')
-    # datas_enterprise = baseCore.getInfomation(social_code)
+    """Endless worker loop that collects annual and quarterly finance data.
+
+    Each iteration pulls one social code through ``baseCore.redicPullData``,
+    looks up its row with ``getInfomation``, runs ``getYear`` and
+    ``getQuarter``, stores the returned report dates with ``add_date`` and
+    records the elapsed time. When nothing is pulled it sleeps 600 seconds
+    and retries. The function never returns normally.
+    """
     session.get(URL, headers=headers)
-    # sql = "select * from mgzqyjwyh_list where state=2 and exchange='Nasdaq';"
-    # cursor.execute(sql)
-    # datas_enterprise = cursor.fetchall()
-    # for data_enterprise in datas_enterprise:
-    start_time = time.time()
-    #     gpdm = data_enterprise[3]
-    #     social_code = data_enterprise[6]
-    social_code = 'ZD0CN0012309000172'
-    gpdm = 'NTES'
-    # 采集年度数据
-    date_list_year = getYear(start_time, session, social_code, gpdm)
-    # 保存年度数据到redis
-    add_date(social_code, date_list_year)
-    # 采集季度数据
-    date_list_quarter = getQuarter(start_time, session, social_code, gpdm)
-    # 保存季度数据到redis
-    add_date(social_code, date_list_quarter)
-    timeCost = baseCore.getTimeCost(start_time, time.time())
-    state = 1
-    baseCore.recordLog(social_code, taskType, state, timeCost, '', '')
-    log.info(f'{social_code}=={gpdm}==耗时{timeCost}')
-    # break
-    cursor.close()
-    cnx.close()
+    while True:
+        social_code = baseCore.redicPullData('FinanceFromNasdaq:nasdaqfinance_socialCode')
+        if not social_code:
+            # Nothing pulled: wait ten minutes before polling again
+            log.info('============已没有数据============等待===============')
+            time.sleep(600)
+            continue
+        data_enterprise = getInfomation(social_code)
+        if data_enterprise is None:
+            # fetchone() found no matching row; skip this code instead of
+            # letting the subscript below raise and kill the whole loop
+            log.error(f'{social_code}===未查询到企业信息')
+            continue
+        start_time = time.time()
+        gpdm = data_enterprise[3]
+        social_code = data_enterprise[6]
+        # Collect annual data
+        date_list_year = getYear(start_time, session, social_code, gpdm)
+        # Save the collected annual report dates to redis
+        add_date(social_code, date_list_year)
+        # Collect quarterly data
+        date_list_quarter = getQuarter(start_time, session, social_code, gpdm)
+        # Save the collected quarterly report dates to redis
+        add_date(social_code, date_list_quarter)
+        timeCost = baseCore.getTimeCost(start_time, time.time())
+        state = 1
+        baseCore.recordLog(social_code, taskType, state, timeCost, '', '')
+        log.info(f'{social_code}=={gpdm}==耗时{timeCost}')


 if __name__ == '__main__':
-    doJob()
+    # Finance data collection. To (re)fill the redis queue with company
+    # codes, run FinanceFromNasdaq() instead of doJob().
+    try:
+        # doJob() loops forever, so the cleanup below can only run when it is
+        # interrupted or raises; finally makes that cleanup actually happen.
+        doJob()
+    finally:
+        cursor.close()
+        cnx.close()
\ No newline at end of file
--- a/pyWenShu/.gitignore
+++ b/pyWenShu/.gitignore
+# created by virtualenv automatically
+*
--- a/pyWenShu/App.py
+++ b/pyWenShu/App.py
+import gc
+from flask import Flask, render_template, request, current_app
+import configparser
+from controller.Main import Main  # 导入全部蓝图变量
+import datetime
+from apscheduler.schedulers.blocking import BlockingScheduler
+from datetime import datetime
+from dao.Conn import ConnMySql
+import sys
+import io
+
+
+# Clear login state
+def clearLoginStateIn24H():
+    """Call ``ConnMySql.userClearLoginStateIn24H`` and print a timestamped note."""
+    ConnMySql().userClearLoginStateIn24H()
+    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"清除登录状态-{stamp}")
+
+
+app = Flask(__name__)  # create the Flask application object
+app.register_blueprint(Main)  # register the Main blueprint on the app
+
+# Maximum upload size: 16 MiB
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
+
+# Application settings are copied from sys.ini into app.config using the key
+# "<section>.<option>", e.g. db.port=3306.
+cfg = configparser.ConfigParser()
+cfg.optionxform = str  # keep option names case-sensitive as written in the file
+# NOTE: the path is relative to the current working directory, and
+# ConfigParser.read() silently skips files it cannot open; in that case the
+# lookups below raise KeyError.
+cfg.read("static/conf/sys.ini", encoding='utf-8')
+sections = cfg.sections()
+for section in sections:
+    items = cfg.items(section)
+    for key, val in items:
+        app.config[section + '.' + key] = val
+
+# A few values need conversion from their string form
+app.config['db.port'] = int(app.config['db.port'])
+# Only the exact string "0" disables the proxy; any other value enables it
+if app.config['sys.useProxy'] == "0":
+    app.config['sys.useProxy'] = False
+else:
+    app.config['sys.useProxy'] = True
+
+app.config['sys.proxyid'] = 0 # id of the proxy currently in use
+app.config['sys.userid'] = 0 # id of the account currently in use
+
+# Re-wrap stdout so printed text is always encoded as UTF-8
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+if __name__ == '__main__':
+    import os
+
+    # webbrowser.open("0.0.0.0:5000")
+    # The server listens on every interface, and the interactive debugger that
+    # debug mode enables lets any client run arbitrary code. Debug mode is
+    # therefore opt-in: set FLASK_DEBUG=1 to turn it on for local development.
+    debug = os.environ.get('FLASK_DEBUG') == '1'
+    app.run(host='0.0.0.0', port=5201, debug=debug)  # entry point
+    # Disabled scheduled task: clear abnormal login state every half hour
+    # sched = BlockingScheduler()
+    # sched.add_job(clearLoginStateIn24H, 'interval', seconds=1800, id='task-clearLoginStateIn24H')
+    # sched.start()
--- a/pyWenShu/controller/Main.py
+++ b/pyWenShu/controller/Main.py
+import gc
+from flask import Blueprint, request, current_app, make_response, send_file  # 导入蓝图
+import datetime
+import re
+import os
+import logging
+import sys
+import io
+import tempfile
+import openpyxl
+import string
+import json
+
+from selenium.webdriver.common.proxy import Proxy, ProxyType
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+from util import UtilDate
+from service.Service02 import Service02
+
+Main = Blueprint('Main', __name__)  # 初始化一个蓝图，而不是Flask对象
+
+
+# Accept a request, read the JSON parameters from its body and start crawling.
+# {"from":"1900-01-01","last":<most recent x days>, "orgs":["org 1 full name","org 2 full name",...]}
+@Main.route('/Main/getData', methods=["POST"])
+def getData():
+    """Resolve the start date from the request body and delegate to Service02.
+
+    The body must contain the keys ``from``, ``last`` and ``orgs``. When
+    ``from`` is the empty string the start date is derived from ``last``.
+    Returns whatever ``Service02.getData`` returns.
+    """
+    print("POST /Main/getData")
+    body = request.get_json(force=True)
+    dateFrom, lastDays, orgs = body['from'], body['last'], body['orgs']
+    if dateFrom == "":
+        # "last" = N starts N-1 days before today; an empty "last" means today
+        offset = 0 if lastDays == "" else 1 - int(lastDays)
+        dateFrom = UtilDate.dateAdd("", "d", offset)
+
+    # "https://wenshu.court.gov.cn/website/wenshu/181029CR4M5A62CH/index.html"
+    return Service02().getData(dateFrom, orgs)
--- a/pyWenShu/controller/__init__.py
+++ b/pyWenShu/controller/__init__.py
--- a/pyWenShu/dao/Conn.py
+++ b/pyWenShu/dao/Conn.py
--- a/pyWenShu/dao/ProxyDao.py
+++ b/pyWenShu/dao/ProxyDao.py
+
+# Placeholder class: it defines no data access, only the stub method below.
+class ProxyDao():
+
+    # Stub method: does nothing and returns None.
+    def t(self):
+        pass
--- a/pyWenShu/dao/__init__.py
+++ b/pyWenShu/dao/__init__.py
--- a/pyWenShu/entity/BaseInfo.py
+++ b/pyWenShu/entity/BaseInfo.py
+# 基本信息
+from util import UtilDate
+from util import UtilNumber
+
+
+class BaseInfo:
+    """Basic information of one case record.
+
+    The fields are only annotated here; callers assign them on the instance.
+    """
+
+    info_title: str  # title
+    key_word: str  # keywords
+    info_bianhao: str  # case number
+    info_address: str  # court with jurisdiction
+    info_time: str  # publication date, formatted yyyy-mm-dd
+    info_id: str  # case ID
+    info_yuanyou: str  # grounds of the judgment
+    info_content: str  # body text
+
+    def isAfter(self, sDate: str) -> bool:
+        """Return True when this record's date is on or after ``sDate``.
+
+        The dates are compared as plain strings. An empty ``sDate`` always
+        gives False and ``info_time`` is not read in that case.
+        """
+        return sDate != "" and self.info_time >= sDate
+
+    def toString(self):
+        """Return title, keywords, case number, court, date and ID joined by tabs."""
+        columns = (
+            self.info_title,
+            self.key_word,
+            self.info_bianhao,
+            self.info_address,
+            self.info_time,
+            self.info_id,
+        )
+        return "\t".join(columns)
--- a/pyWenShu/entity/__init__.py
+++ b/pyWenShu/entity/__init__.py
--- a/pyWenShu/pyvenv.cfg
+++ b/pyWenShu/pyvenv.cfg
+home = C:\Program Files\Python
+implementation = CPython
+version_info = 3.8.0.final.0
+virtualenv = 20.13.0
+include-system-site-packages = true
+base-prefix = C:\Program Files\Python
+base-exec-prefix = C:\Program Files\Python
+base-executable = C:\Program Files\Python\python.exe
--- a/pyWenShu/service/Service02.py
+++ b/pyWenShu/service/Service02.py
--- a/pyWenShu/service/__init__.py
+++ b/pyWenShu/service/__init__.py
--- a/pyWenShu/static/conf/sys.ini
+++ b/pyWenShu/static/conf/sys.ini
+#系统配置
+[sys]
+#文字识别Url，用于识别裁判文书网的验证码
+ocrUrl=http://114.116.49.86:8013/wzsb_app?withCrLf=false
+#登录模式，0-无需登录，1-账号登录(需要口令、短信、验证码相应的选择器不能为空)，2-cookie登录
+loginMode=1
+#是否使用代理，0-不用，1-使用，需登录的一般不适用代理
+useProxy=1
+#验证码识别，0-不识别，1-识别，暂采用固定的方法识别验证码，后续扩展为不同的识别模式
+verifiCode=0
+#登录Url ?open=login
+loginUrl=https://wenshu.court.gov.cn/website/wenshu/181010CARHS5BS3C/index.html
+#正常Url，登录后可能会自动跳转到正常Url
+mainUrl=https://wenshu.court.gov.cn
+#登录-用户
+loginUser=#root > div > form > div > div:nth-child(1) > div > div > div > input
+#登录-口令
+loginPasswd=#root > div > form > div > div:nth-child(2) > div > div > div > input
+#裁判文书网的图形验证码在单独的页面，输入正确后返回到登录界面
+#登录-图形验证码输入框，不为空时则需要识别验证码
+loginCaptchaInput=body > div > div.card-body > div > form > div.captcha > input
+#登录-图形验证码图片
+loginCaptchaImage=#Image1
+#登录-图形验证码确认按钮
+loginCaptchaButton=body > div > div.card-body > div > form > div.warnbtn > input
+#登录-短信验证码，可能和图形验证码同时需要，暂未处理
+loginSMSCode=
+#主界面登录按钮
+loginButton=#loginLi > a
+#登录界面确认登录按钮
+loginOk=#root > div > form > div > div.login-button-container > span
+#数据库配置
+[db]
+host=114.115.159.144
+port=3306
+user=caiji
+passwd=zzsn9988
+db=caiji
+charset=utf8
+
+#css选择器配置
+[css]
+#搜索-文本框
+searchInput=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-middle > input
+#搜索-按钮
+searchButton=#_view_1540966814000 > div > div.search-wrapper.clearfix > div.search-rightBtn.search-click
+#列表-日期倒排按钮
+listDateSort=#_view_1545184311000 > div.LM_tool.clearfix > div:nth-child(2) > a
+#列表-案件数量
+listCount=#_view_1545184311000 > div.LM_con.clearfix > div.fr.con_right > span
+#列表-案件名称
+listTitle=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
+#列表-编号
+listBianhao=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.ah
+#列表-法院
+listAddress=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.slfyName
+#列表-审结日期
+listTime=#_view_1545184311000 > div:nth-child(?) > div.list_subtitle > span.cprq
+#列表-案由
+listYuanyou=#_view_1545184311000 > div:nth-child(?) > div.list_reason > p
+#下一页按钮
+listNextPage=#_view_1545184311000 > div.left_7_3 > a:last-child
+#正文-链接，一般和title相同
+contLink=#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a
+#正文-正文
+contContent=#_view_1541573883000 > div > div.PDF_box > div.PDF_pox
+
+
+
+
+
--- a/pyWenShu/util/UtilBrowser.py
+++ b/pyWenShu/util/UtilBrowser.py
--- a/pyWenShu/util/UtilCaptcha.py
+++ b/pyWenShu/util/UtilCaptcha.py
+# 验证码识别，暂只处理裁判文书网的验证码
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.webdriver import WebDriver
+
+import requests
+from flask import current_app
+from pathlib import Path
+import tempfile
+import uuid
+import hashlib
+import os
+import json
+
+
+# selecter: CSS selector of the captcha image
+def getCaptchaMode1(browser: WebDriver,selecter: str):
+    """Screenshot the captcha element and return the text the OCR service reads.
+
+    The element matched by ``selecter`` is saved as a PNG under ./Temp_file and
+    posted as the ``multiRequest`` file field to the URL stored in
+    ``current_app.config['sys.ocrUrl']``. The JSON reply is expected to look
+    like {"code":200,"logs":null,"message":"success","resultData":"2rVK"}.
+
+    :param browser: driver holding the page that shows the captcha
+    :param selecter: CSS selector of the captcha image element
+    :return: the ``resultData`` value of the reply, or "" when any step fails
+        (the error is printed, not raised)
+    """
+    ret = ""
+    out_path = "./Temp_file"
+    # A random file name keeps concurrent calls from overwriting each other
+    path_name = os.path.join(out_path, str(uuid.uuid4())) + ".png"
+    try:
+        Path(out_path).mkdir(parents=True, exist_ok=True)
+        print(path_name)
+        # Save the captcha image
+        img = browser.find_element(By.CSS_SELECTOR, selecter)
+        img.screenshot(path_name)
+        ocrUrl = current_app.config['sys.ocrUrl']
+        # Call the text-recognition service; the with-block closes the file
+        # even when the request raises
+        with open(path_name, "rb") as file:
+            response = requests.post(ocrUrl, files={"multiRequest": file})
+        oRet = json.loads(response.text)
+        ret = oRet["resultData"]
+        print(ret)
+    except Exception as err:
+        # Best effort: report the problem and fall through to return ""
+        print('getCaptchaMode1 error:', err)
+    finally:
+        # Remove the temporary image on success and on failure alike; it may
+        # not exist if the screenshot itself failed
+        try:
+            os.remove(path_name)
+        except OSError:
+            pass
+    return ret
+
--- a/pyWenShu/util/UtilDate.py
+++ b/pyWenShu/util/UtilDate.py
+from datetime import datetime,timedelta
+from dateutil.relativedelta import relativedelta
+
+# Convert a date written as yyyy年m月d日 into yyyy-mm-dd
+def convertDate(sDate:str):
+    """Return ``sDate`` (e.g. ``2023年1月5日``) as a zero-padded yyyy-mm-dd string.
+
+    Raises ValueError (from ``datetime.strptime``) when the text is not a
+    valid date after the 年/月/日 markers are replaced.
+    """
+    dashed = sDate.replace("年", "-").replace("月", "-").replace("日", "")
+    # Parsing and re-formatting validates the date and pads month and day
+    return datetime.strptime(dashed, '%Y-%m-%d').strftime('%Y-%m-%d')
+
+# Shift a date by an offset; ymd selects the unit: y=year, m=month, d=day
+def dateAdd(sDate:str,ymd:str="d",diff:int=1):
+    """Return ``sDate`` moved by ``diff`` units, formatted as yyyy-mm-dd.
+
+    :param sDate: start date as yyyy-mm-dd; the empty string means today
+    :param ymd: unit of the offset, "y", "m" or "d"; any other value leaves
+        the date unchanged
+    :param diff: signed number of units to add
+    """
+    fmt = '%Y-%m-%d'
+    if sDate == "":
+        sDate = datetime.now().strftime(fmt)
+
+    shifted = datetime.strptime(sDate, fmt)
+    # Adding a negative relativedelta equals subtracting its positive
+    # counterpart, so one expression per unit covers both directions
+    if ymd == "y":
+        shifted = shifted + relativedelta(years=diff)
+    elif ymd == "m":
+        shifted = shifted + relativedelta(months=diff)
+    elif ymd == "d":
+        shifted = shifted + timedelta(days=diff)
+    return shifted.strftime(fmt)
\ No newline at end of file
--- a/pyWenShu/util/UtilNumber.py
+++ b/pyWenShu/util/UtilNumber.py
+#数值处理类
+
+# Convert an amount given as text into a number; the text may contain
+# markers such as 万, 亿, 人民币 or 元
+def convertMoney(sMoney:str):
+    """Strip the markers 万, 亿, 人民币 and 元 from ``sMoney`` and return a float.
+
+    Raises ValueError when the remaining text is not a valid float literal.
+    """
+    # NOTE(review): 万 and 亿 are removed without scaling the number, so
+    # "5万元" yields 5.0 - confirm that callers expect this.
+    for marker in ("万", "亿", "人民币", "元"):
+        sMoney = sMoney.replace(marker, "")
+    return float(sMoney)
--- a/pyWenShu/util/UtilProxy.py
+++ b/pyWenShu/util/UtilProxy.py
+from selenium.webdriver.chrome.webdriver import WebDriver
+from seleniumwire import webdriver
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.by import By
+
+# IP proxy pool
+class UtilProxy:
+    # Fields are only annotated; no values are assigned in this class
+    id:int
+    ip:str
+    port:str
+    name:str
+    password:str
+
+    # Switch the IP proxy (not implemented: the body is a no-op)
+    def alterIP(self,browser:WebDriver):
+        pass
+
--- a/pyWenShu/util/__init__.py
+++ b/pyWenShu/util/__init__.py
--- a/pyWenShu/vo/LoginInfo.py
+++ b/pyWenShu/vo/LoginInfo.py
+# 账号信息
+from util import UtilDate
+from util import UtilNumber
+
+
+# Holder for one login account; fields are annotated only, never assigned here
+class LoginInfo:
+    id: int  # NOTE(review): original comment said "title", apparently copied from another class; presumably the record id
+    user_group: str
+    user_name: str
+    user_passwd: str
--- a/pyWenShu/vo/ProxyInfo.py
+++ b/pyWenShu/vo/ProxyInfo.py
+# 代理IP信息
+from util import UtilDate
+from util import UtilNumber
+
+
+# Holder for one proxy IP entry; fields are annotated only, never assigned here
+class ProxyInfo:
+    id: int  # NOTE(review): original comment said "title", apparently copied from another class; presumably the record id
+    ip: str
+    port: str
+    user_name: str
+    user_passwd: str
+