import csv
import time

import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.common import StaleElementReferenceException

from base import BaseCore
from requests.packages import urllib3
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Silence urllib3 warnings for the whole process.
urllib3.disable_warnings()
# Project helper object; this file uses it to obtain the logger below and
# calls its close() in the __main__ guard.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Redis connection (db 6) holding the lists this script pops keywords from
# and pushes hrefs, results and failures to.
# NOTE(review): host and password are hard-coded in source — consider loading
# them from configuration or the environment instead.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# Browser-like request headers sent with the requests.get call in getData.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # 'Cookie': 'pkulaw_v6_sessionid=v1z41wppegb5phyqattpozp4; agency=sclx.pkulaw.com; referer=; Hm_lvt_25f0770f77e5e05b70c050b7d0f2f4a8=1707209811; Hm_lpvt_25f0770f77e5e05b70c050b7d0f2f4a8=1707209811; xCloseNew=7',
    'Host': 'sclx.pkulaw.com',
    'Origin': 'https://sclx.pkulaw.com',
    'Pragma': 'no-cache',
    'Referer': 'https://sclx.pkulaw.com/law',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# TODO: use a simulated browser
def create_driver():
    """Start a Microsoft Edge WebDriver session and return it.

    The msedgedriver binary is taken from a fixed local path, and the
    browser is asked to open maximized via the ``ms:edgeOptions`` capability.

    NOTE(review): the ``executable_path`` / ``capabilities`` keyword arguments
    are not accepted by every Selenium release — confirm the pinned version.
    """
    # "--start-maximized" makes the browser window open maximized.
    edge_options = {"extensions": [], "args": ["--start-maximized"]}
    capabilities = {"browserName": "MicrosoftEdge"}
    capabilities["ms:edgeOptions"] = edge_options
    return webdriver.Edge(
        executable_path=r'D:\soft\msedgedriver.exe',
        capabilities=capabilities,
    )


@retry(tries=2, delay=5)
def getHref(Keywords, driver):
    """Search the site for *Keywords* and return the history-tab URL.

    The search page is opened in *driver*, the keyword is typed into the
    search box and submitted, and once the result panel is present its
    ``HistoryAssociation`` tab is scrolled into view and clicked. The value
    returned is the site root followed by that tab's ``url`` attribute.

    A ``StaleElementReferenceException`` raised while working with the tab is
    handled by locating the tab again from the driver and clicking it once
    more. Any other exception propagates, so the ``retry`` decorator runs the
    whole flow again (two tries, five seconds apart).
    """
    # NOTE: an earlier, now disabled variant POSTed the search form to
    # https://sclx.pkulaw.com/law/chl with requests and parsed the response
    # with BeautifulSoup instead of driving a browser.
    site_root = 'https://sclx.pkulaw.com'
    history_tab_xpath = ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']"
    scroll_script = "arguments[0].scrollIntoView();"

    # Submit the keyword through the search form.
    driver.get('https://sclx.pkulaw.com/law')
    search_box = driver.find_element(By.ID, 'txtSearch')
    search_box.send_keys(Keywords)
    time.sleep(0.5)
    search_button = driver.find_element(By.CLASS_NAME, 'btn-search')
    search_button.click()

    # Wait for the result panel and bring it into the viewport.
    wait = WebDriverWait(driver, 30)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "accompanying-wrap")))
    result_panel = driver.find_element(By.CLASS_NAME, 'accompanying-wrap')
    driver.execute_script(scroll_script, result_panel)
    time.sleep(2)

    try:
        history_tab = result_panel.find_element(By.XPATH, history_tab_xpath)
        time.sleep(1)
        driver.execute_script(scroll_script, history_tab)
        time.sleep(1)
        history_tab.click()
        href = site_root + history_tab.get_attribute("url")
        # The history container must be present after the click; a missing
        # container raises here and triggers the retry.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-tab-col")))
        tab_column = driver.find_element(By.CLASS_NAME, 'a-tab-col')
        tab_column.find_element(By.XPATH, './/div[@name="HistoryAssociation"]')
    except StaleElementReferenceException:
        # The tab reference went stale: locate it again and interact once more.
        history_tab = driver.find_element(By.XPATH, history_tab_xpath)
        history_tab.click()
        href = site_root + history_tab.get_attribute("url")
    return href


@retry(tries=3, delay=5)
def getData(href, Keywords):
    """Download the history page at *href* and flatten it into one string.

    Every ``<li>`` in the response contributes one ``theme_publishDate``
    entry; the entries are appended to *Keywords*, separated by commas.

    Args:
        href: URL of the history listing to fetch.
        Keywords: Search keyword the listing belongs to; it becomes the
            first field of the returned string.

    Returns:
        ``"<Keywords>,<theme>_<date>,..."`` when at least one entry was
        parsed. Otherwise ``None``, after pushing *Keywords* onto the
        ``ShenjisclxError:`` Redis list.

    Raises:
        requests.RequestException: On network failure or timeout.
        AttributeError: When an ``<li>`` has no ``span.time`` element.
        Both are re-raised only after the ``retry`` decorator has used up
        its three tries.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever;
    # the resulting exception is handled by the retry decorator.
    req = requests.get(href, headers=headers, timeout=30)
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')

    entries = []
    for li in soup.find_all('li'):
        publishDate = li.find('span', class_='time').text.strip()
        # The theme is optional: fall back to an empty string when absent.
        theme_tag = li.find('div', class_='theme')
        theme = theme_tag.text.strip() if theme_tag is not None else ''
        entries.append(f'{theme}_{publishDate}')

    term = ','.join([Keywords, *entries])
    log.info(term)
    # Decide on the parsed entries rather than on ',' / '_' appearing in the
    # combined string, which a keyword containing both would satisfy by itself.
    if not entries:
        r.rpush('ShenjisclxError:', Keywords)
        return None
    return term


def doJob():
    """Process every keyword queued in the ``Shenjisclx:`` Redis list.

    For each keyword the history-tab URL is resolved with ``getHref`` and the
    listing is fetched with ``getData``. Outcomes are pushed to Redis lists:

    * ``ShenjisclxHref:``     -- ``keyword|href`` for every resolved URL
    * ``ShenjisclxHrefNull:`` -- ``keyword|href`` when the URL is empty
    * ``ShenjisclxReault:``   -- the flattened result string (the key is
      spelled as in the original; other readers may depend on it)
    * ``ShenjisclxError:``    -- keywords whose processing raised

    The loop ends when the queue yields no keyword. The browser is always
    shut down on exit, including after an error or Ctrl+C.
    """
    # NOTE: results go to Redis only; an earlier CSV export
    # (./output.csv) was already disabled.
    driver = create_driver()
    try:
        driver.maximize_window()
        while True:
            try:
                raw = r.lpop('Shenjisclx:')
                Keywords = raw.decode() if raw else ''
            except Exception as e:
                # As before, a failed read ends the run, but it is now logged.
                log.info(f'failed to read the next keyword from Redis: {e}')
                Keywords = ''
            if not Keywords:
                break

            try:
                href = getHref(Keywords, driver)
                if href:
                    r.rpush('ShenjisclxHref:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====找到=== {href}')
                    term = getData(href, Keywords)
                else:
                    term = Keywords + ','
                    r.rpush('ShenjisclxHrefNull:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====未找到')
                if term:
                    r.rpush('ShenjisclxReault:', term)
            except Exception as e:
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the loop instead of being swallowed.
                r.rpush('ShenjisclxError:', Keywords)
                log.info(f'{Keywords}====error=== {e}')
                continue
            time.sleep(2)
    finally:
        # Do not leave the browser/driver process running.
        driver.quit()

if __name__ == '__main__':
    # Always call baseCore.close(), even when doJob() raises.
    try:
        doJob()
    finally:
        baseCore.close()
