# 裁判文书抓取
from datetime import datetime, timedelta
import json
import time
from flask import current_app as app
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from typing import List
import io
import sys

from dao.Conn import ConnMySql
from util import UtilBrowser
from util import UtilCaptcha

from entity.BaseInfo import BaseInfo
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import jsonpickle

from util.UtilCaptcha import getCaptchaMode1
from vo.LoginInfo import LoginInfo


class Service02:
    """Selenium-driven scraper for court judgment documents.

    ``getData`` is the entry point: for each organisation it opens a browser
    (``openBrowser``), walks the result list (``getData1``) and returns the
    collected entries as JSON text.
    """
    # Active Selenium driver; assigned in openBrowser.
    browser: WebDriver
    url = ""
    # Cut-off date string; set by getData and passed to BaseInfo.isAfter in getData1.
    dateFrom = ""

    # Account record returned by ConnMySql.userGetFree in openBrowser.
    loginInfo: LoginInfo
    # When the URL is passed on the browser command line, Chrome keeps its
    # default start tab as well, so there are 2 tabs. When the page is opened
    # through driver.get it loads in the default tab, so there is only 1 tab.
    tab1=1 # index of the home / result-list tab; would be 0 when opened via driver.get
    tab2=2 # index of the judgment-document tab; would be 1 when opened via driver.get

    # Documents collected so far (BaseInfo objects appended by getData1).
    # NOTE: this is a class-level list, shared by every instance unless an
    # instance rebinds the attribute.
    baseInfo = []

    nRetry = 100  # retry count, currently unused
    # Result buffer of {"org": ..., "baseInfo": ...} entries filled by getData.
    # NOTE: also a class-level list (see baseInfo).
    lstRet = []

    # 主过程
    def getData(self, sDateFrom: str, orgs: List[str]):
        # 循环抓取数据，直到指定日期前的数据都抓取完成，基本信息总是抓取。
        self.dateFrom = sDateFrom
        # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
        print("getData...",flush=True)
        for org in orgs:
            ok1 = 0
            print(org, flush=True)
            # 循环采集一个单位的数据，直到全部完成
            while True:
                # 打开浏览器，查找单位并切换到单位详情，失败则更换代理IP后重来
                while True:
                    if self.openBrowser(org):
                        break
                    else:
                        self.quitBrowser()
                if ok1 == 0:
                    # 采集未完成，继续采集
                    ok1 = self.getData1()
                if ok1 == 1:
                    # 采集已完成，退出并继续下一单位
                    break
                conn = ConnMySql()
                if self.loginInfo is not None:
                    conn.userClearLoginStateByID(self.loginInfo.id)
                conn.close()
            # 保存数据到缓冲区
            o = {"org": org, "baseInfo": self.baseInfo}
            self.lstRet.append(o)
        # 全部单位数据采集完成，退出并返回数据给调用者
        retData = jsonpickle.encode(self.lstRet, unpicklable=False)
        print(json.loads(retData))
        return retData

    # 打开浏览器，查找单位并转到单位信息页面
    def openBrowser(self, org: str) -> bool:
        ret = False
        print("openBrowser...", flush=True)
        conn = ConnMySql()
        self.loginInfo = conn.userGetFree("wenshu", app.config['sys.userid'])
        if self.loginInfo is None:
            app.config['sys.userid'] = 0
            self.loginInfo = conn.userGetFree("wenshu", app.config['sys.userid'])
        app.config['sys.userid'] = self.loginInfo.id
        #conn.userSetLoginStateByID(self.loginInfo.id)
        conn.close()

        self.browser = UtilBrowser.newChrome(app.config['sys.mainUrl'], False , app.config['sys.useProxy'])
        #打开浏览器及裁判文书主页后，删除可能存在的多余的窗口页签。暂未使用
        # if len(self.browser.window_handles)>1:
        #     self.browser.switch_to.window(self.browser.window_handles[0])
        #     self.browser.close()
        #     self.browser.switch_to.window(self.browser.window_handles[0])

        loginMode = app.config['sys.loginMode']
        # 登录，后续放到单独的过程中统一处理
        if loginMode == "0":
            pass
        elif loginMode == "1":
            # 需要账号登录，账号存在数据库中，轮换使用
            objLogin = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginButton'], 30, 1)
            if objLogin != None:
                objLogin.click()
                #可能随机出现独立的图形验证码输入界面，4个大小写字母和数字组成
                nTry = 0
                hasPass = False
                objLoginCaptchaButton0 = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
                if objLoginCaptchaButton0 is not None:
                    # 尝试5次，若验证通过，则继续，否则退出，切换账号重新尝试
                    while nTry < 10:
                        nTry = nTry + 1
                        objLoginCaptchaButton = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
                        # 获取验证码图片Url
                        # url = self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginCaptchaImage']).get_attribute('src')
                        strCaptcha = UtilCaptcha.getCaptchaMode1(self.browser, app.config['sys.loginCaptchaImage'])
                        self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginCaptchaInput']).send_keys(strCaptcha)
                        objLoginCaptchaButton.click()
                        time.sleep(1)
                        objLoginCaptchaButton = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginCaptchaButton'], 10, 1)
                        if objLoginCaptchaButton is None:
                            hasPass = True
                            break
                if hasPass:
                    nTry = 0
                else:
                    nTry = 4
                iframe = None  # 登录信息
                while nTry < 3:
                    iframe = UtilBrowser.waitElement(self.browser, By.CSS_SELECTOR, "#contentIframe", 20, 1)
                    # WebDriverWait(driver=driver, timeout=20, poll_frequency=1, ignored_exceptions=None).until(expected_conditions.presence_of_element_located((By.ID,'contentIframe')))
                    if iframe is None:
                        self.browser.refresh()
                    else:
                        break
                    nTry = nTry + 1
                if iframe is not None:
                    self.browser.switch_to.frame(iframe)
                    objUser = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginUser'])
                    if objUser is not None:
                        objUser.send_keys(self.loginInfo.user_name)
                        objPass = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginPasswd'])
                        objPass.send_keys(self.loginInfo.user_passwd)
                        objLogin = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['sys.loginOk'])
                        # 登录时需要短信验证码，暂未处理
                        if app.config['sys.loginSMSCode'] != "":
                            objSMS = UtilBrowser.hasElement(self.browser, By.CSS_SELECTOR,
                                                            app.config['sys.loginSMSCode'])
                            if objSMS is not None:
                                self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginSMSCode']).send_keys("")
                        objLogin.click()
                        time.sleep(5)
                        self.browser.refresh()
                        #ret = True  # 此处应确认出现特定元素后才返回True
            # 若填写列口令则自动登录，否则等待人工登录
            # self.browser.find_element(By.CSS_SELECTOR, app.config['sys.loginButton']).click()
        elif loginMode == "2":
            # cookie登录，暂未处理
            pass

        # self.browser.get(app.config['sys.mainUrl'])
        # 搜索框填写单位名称并单击提交
        objSearchInput = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.searchInput'])
        #登录失败则页面不会有搜索框
        if objSearchInput is None:
            ret = False
        else:
            objSearchInput.send_keys(org)
            time.sleep(2)
            # 模拟单击搜索按钮
            objSearchButton = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.searchButton'])
            objSearchButton.click()
            time.sleep(5)
            self.browser.refresh()
            # 设置为按日期倒排序
            objDateSort = UtilBrowser.getElement(self.browser, By.CSS_SELECTOR, app.config['css.listDateSort'])
            if objDateSort is not None:
                objDateSort.click()
            time.sleep(5)
            ret = True
        return ret

    # 退出浏览器
    def quitBrowser(self):
        try:
            self.browser.quit()
            self.browser = None
        except:
            pass

    # 裁判文书信息，重新查找某单位的文书时，和上次打开浏览器相比可能增加了新的裁判文书，所以每次都从首页开始，无需从上次中断的页开始
    def getData1(self) -> int:
        ret = 0
        print("getData1...", flush=True)
        # 以下在列表页
        selector_title = app.config['css.listTitle']  # "#_view_1545184311000 > div:nth-child(?) > div.list_title.clearfix > h4 > a"
        # 以下在详情页，需打开裁判文书，裁判文书在新页签打开
        baseInfo1: BaseInfo
        # 裁判文书数量
        s = self.getAttr(By.CSS_SELECTOR, app.config['css.listCount'], "textContent")
        n = self.toInt(s)
        if n == 0:
            # 若无数据则退出，且不在继续采集本类数据
            return 1
        pageNo = 0
        while True:
            # 循环采集多页数据
            pageNo = pageNo + 1
            for i in range(1, 6):  # 每页5条数据
                print(f"----文书数量：{n}，每页文书个数：5，当前页号：{pageNo}，当前序号：{i}", flush=True)
                # 不满一页时遇到不存在的行则退出
                # 文书列表从nth-child(3)开始
                if not UtilBrowser.hasElement(self.browser, By.CSS_SELECTOR,
                                              selector_title.replace("?", str(i + 2), 1)):
                    break
                baseInfo1 = BaseInfo()
                baseInfo1.info_title = self.getAttr(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1), "textContent")
                baseInfo1.info_bianhao = self.getAttr(By.CSS_SELECTOR,
                                                      app.config['css.listBianhao'].replace("?", str(i + 2), 1),
                                                      "textContent")
                baseInfo1.info_address = self.getAttr(By.CSS_SELECTOR,
                                                      app.config['css.listAddress'].replace("?", str(i + 2), 1),
                                                      "textContent")
                baseInfo1.info_time = self.getAttr(By.CSS_SELECTOR,
                                                   app.config['css.listTime'].replace("?", str(i + 2), 1),
                                                   "textContent")
                baseInfo1.info_yuanyou = self.getAttr(By.CSS_SELECTOR,
                                                      app.config['css.listYuanyou'].replace("?", str(i + 2), 1),
                                                      "textContent")
                # https://wenshu.court.gov.cn/website/wenshu/181107ANFZ0BXSK4/index.html?docId=OUD3Tm7EvEQVkiexnBa5S3nnG9zDkQyxiWoR8jr7QJJtFc9Y6vX89Z/dgBYosE2gstL9HQn+C934OzwMvqVgk+DtAz+qRVZWr9dI7ybeiFnaPaFBceYmelTK0+qydxfd
                link = self.getAttr(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1),
                                                           "href")
                pos = link.index("=") + 1
                baseInfo1.info_id = link[pos:]

                if baseInfo1.isAfter(self.dateFrom):
                    # 当前数据条目在指定日期之后，则如果缓冲区没有的话追加到缓冲区，已经存在的，则忽略
                    exist = False
                    for e in self.baseInfo:
                        # 法律文书可抓取到原始的文书ID，可直接用ID查找是否已经抓取
                        if e.info_id == baseInfo1.info_id:  # e.toString == baseInfo1.toString()
                            exist = True
                            break
                    if exist == False:
                        # 获取裁判文书正文，未抓取的才抓取正文
                        # 模拟单击正文链接
                        self.browser.find_element(By.CSS_SELECTOR, selector_title.replace("?", str(i + 2), 1)).click()
                        # 单击后会自动在新标签页打开正文链接，并且为活动页签
                        t1 = datetime.now()
                        while True:
                            if len(self.browser.window_handles) > self.tab2:
                                break
                            t2 = datetime.now()
                            if (t2 - t1).seconds > 60:
                                break
                            time.sleep(1)
                        if len(self.browser.window_handles) > self.tab2:
                            self.browser.switch_to.window(self.browser.window_handles[self.tab2])
                            baseInfo1.info_content = self.getAttr(By.CSS_SELECTOR, app.config['css.contContent'],
                                                                  "textContent")
                            time.sleep(5)
                            # 关闭文书正文页签，回到文书列表页签
                            self.browser.close()
                            self.browser.switch_to.window(self.browser.window_handles[self.tab1])
                            print("--------当前文书长度：", len(baseInfo1.info_content), flush=True)
                            self.baseInfo.append(baseInfo1)
                else:
                    # 当前数据条目在指定日期之后
                    ret = 1
                    break
            # 如果有下一页，则继续，否则数据采集完成
            if self.getAttr(By.CSS_SELECTOR, app.config['css.listNextPage'], "class").find("disabled"):
                # 存在disabled则无下一页
                ret = 1
            else:
                self.browser.find_element(By.CSS_SELECTOR, app.config['css.listNextPage']).click()
            time.sleep(5)
            # 出现验证码窗口或IP锁定界面则退出重新切换IP采集
            if self.hasCaptcha() or self.hasBlock():
                break
            if ret == 1:
                break
        return ret

    def toInt(self, s) -> int:
        ret = 0
        try:
            ret = int(s)
        except:
            pass
        return ret

    # 返回页面元素指定属性的值，如class
    def getAttr(self, by: str, selector: str, attr: str) -> str:
        ret = ""
        try:
            if attr == "text":
                ret = self.browser.find_element(by, selector).text
            else:
                ret = self.browser.find_element(by, selector).get_attribute(attr)
        except:
            pass
        return ret

    # 判断是否出现了验证码。
    def hasCaptcha(self) -> bool:
        ret = False
        # we: WebElement  #
        # wes = self.browser.find_elements(By.TAG_NAME, "div")
        # for we in wes:
        #     if we.get_attribute("class").find("geetest_box"):  #
        #         if we.get_attribute("style").find("display: block;"):
        #             ret = True
        return ret

    # 判断是否出现了IP锁定。
    def hasBlock(self) -> bool:
        ret = False
        # if self.getAttr(By.CSS_SELECTOR, "body > div > p", "text").find("夹带攻击行为"):  # p.prom 您的地址（1.2.3.4）访问疑似夹带攻击行为，请稍后重试，或注册/登录
        #     print("*******夹带攻击行为*******")
        #     ret = True
        return ret
