import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
import json
import re
from openpyxl import Workbook
import pandas as pd
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver

class Chaoxing(object):
    """Selenium-driven scraper for journal search results on qikan.chaoxing.com.

    ``pageList`` walks the paginated search result table, ``paserDetail``
    opens every article page and extracts its metadata and body, and
    ``writerToExcel`` appends each record to an ``.xlsx`` file.
    """

    # NOTE(review): this is a captured browser session cookie (credential
    # material) committed to source; it is only sent by ``resGetHtml``.
    # Consider loading it from configuration instead.
    # The value is assembled from adjacent literals so that no line break ends
    # up inside the header value.
    DEFAULT_COOKIE = (
        '__dxca=d1027170-19cf-4c12-850d-a2491d365f21; '
        'cookiecheck=true; '
        'AID_dsr=7209; '
        'msign_dsr=1693016311917; '
        'search_uuid=93063aab%2d552d%2d44a5%2d97dd%2d69291e006407; '
        'mqs=19e3b526c24d63961c594850f09cc91fb2684007674e38588d7c2e084ce6404e6d58f013518d47b7966204b865bf74394b6aea37199a56bd226a4f6e20ecf6dd7a56bfaeb2a1db23e4446bf79392c19a3b7ac3c5ffa4f00ee4b101eb614dfa827ff7dd70f5c2e05994258c957fef2435; '
        'qkindustry=; '
        'lv=0; '
        'chaoxinguser=1; '
        'uname=""; '
        '_uid=294788899; '
        'uf=f9866f9a46b70622f1364c77ecd4d7131ab59cdd60bf892e7a4fb73fc6375849f0d3025d42acdc547f3affbfde451bc49b0594e13f4b452fbdd6b93a43158491db9a01fd759e1b9870b8e6462cc1afdfe506c5241298ab1b; '
        '_d=1693016401839; '
        'UID=294788899; '
        'vc=F998B685897E257FBE4CBDEB36BC4781; '
        'vc2=CEE00B2BE090389C97B44000FEFC16C6; '
        'vc3=H4qn7owTnyWvR0ubBUMWGf4zX3U0pgoj59Bk4URCwnrBZc1M4ywPJxorV%2B2PhJeMN6sb2DBo7XuPQ%2BEpdtQuXWg1XLj8Z2ZYbFY0X2fYHunmK9tjFteI8BN1V0nXUCUOxAIkpIBcwaPx3D%2BXqRilmQRaYTS66L7i2VoD9GfiQjQ%3D312e1dbb9c6d3a7eea186e0579ee47b6; '
        'cx_p_token=17e6b9b56f8636b05c074933cc668ced; '
        'xxtenc=ec455945739f206ee2b2e416e997970e; '
        'DSSTASH_LOG=C_0-UN_0-US_294788899-T_1693016401841; '
        'duxiu=userName%5fdsr%2c%3dzhizhentest%2c%21userid%5fdsr%2c%3d21498%2c%21char%5fdsr%2c%3d%2c%21metaType%2c%3d260%2c%21dsr%5ffrom%2c%3d0%2c%21logo%5fdsr%2c%3dlogo0408%2ejpg%2c%21logosmall%5fdsr%2c%3dsmall0408%2ejpg%2c%21title%5fdsr%2c%3d%u8d85%u661f%u53d1%u73b0%2c%21url%5fdsr%2c%3d%2c%21compcode%5fdsr%2c%3d%2c%21province%5fdsr%2c%3d%u5176%u5b83%2c%21readDom%2c%3d0%2c%21isdomain%2c%3d17153%2c%21showcol%2c%3d0%2c%21hu%2c%3d0%2c%21uscol%2c%3d0%2c%21isfirst%2c%3d0%2c%21istest%2c%3d1%2c%21cdb%2c%3d0%2c%21og%2c%3d0%2c%21ogvalue%2c%3d0%2c%21testornot%2c%3d1%2c%21remind%2c%3d0%2c%21datecount%2c%3d2315%2c%21userIPType%2c%3d2%2c%21lt%2c%3d0%2c%21ttt%2c%3dfxlogin%2echaoxing%2c%21enc%5fdsr%2c%3dFCCD81D85DEC4A17B4F4F17DFD4F5515; '
        'historySearchWord=%25E5%2586%25B6%25E9%2587%2591%252C6%252C0831%253B%25E4%25BA%25A4%25E4%25BA%2592%25E6%2596%25B0%25E9%2597%25BB%252C1%252C0826; '
        'JSESSIONID=E6662D3AE489E4BFB69B272D6216A15E.fx4210'
    )

    # Output workbook used when neither the constructor argument nor a
    # module-level ``filename`` is available.
    DEFAULT_FILENAME = '超星期刊.xlsx'

    # Seconds to wait after ``driver.get`` before reading ``page_source``.
    PAGE_LOAD_WAIT = 5
    # Seconds to pause before each detail page is requested.
    ROW_DELAY = 5

    def __init__(self, filename=None):
        """Store the request cookie and start a Chrome session.

        Args:
            filename: Optional path of the ``.xlsx`` file records are appended
                to. When omitted, ``writerToExcel`` falls back to a
                module-level ``filename`` if one exists, then to
                ``DEFAULT_FILENAME``.
        """
        self.cookie = self.DEFAULT_COOKIE
        self.filename = filename
        self.driver = self.webdriver()

    @staticmethod
    def _compact(text):
        """Return *text* with every run of whitespace removed."""
        return re.sub(r'\s+', '', text)

    def _output_path(self):
        """Return the workbook path ``writerToExcel`` should read and write."""
        configured = getattr(self, 'filename', None)
        if configured:
            return configured
        # Backward compatibility: the script entry point publishes the path as
        # a module-level ``filename``.
        return globals().get('filename', self.DEFAULT_FILENAME)

    def _restart_driver(self):
        """Shut the current browser session down and start a fresh one."""
        try:
            # quit() ends the whole session; close() would only close the
            # window and leave the driver process behind.
            self.driver.quit()
        except Exception:
            # Best effort: the session may already be dead, which is usually
            # why a restart was requested in the first place.
            pass
        self.driver = self.webdriver()

    def _fetch_page_source(self, url):
        """Load *url* in the browser and return its HTML, or ``None`` on failure.

        On any error the browser is restarted so later pages can still be
        fetched; the caller is expected to skip the failed page.
        """
        try:
            self.driver.get(url)
            time.sleep(self.PAGE_LOAD_WAIT)
            return self.driver.page_source
        except Exception as e:
            print(f'页面加载失败，重启浏览器：{url} {e}')
            self._restart_driver()
            return None

    def resGetHtml(self, url):
        """Fetch *url* with ``requests`` using browser-like headers.

        TLS verification is disabled and the timeout is 10 seconds.

        Returns:
            The response body as text, or ``''`` when the request fails.
        """
        header = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cache-Control':'no-cache',
            'Connection':'keep-alive',
            'Cookie': self.cookie,
            'Host':'qikan.chaoxing.com',
            'Pragma':'no-cache',
            'Referer':'https://qikan.chaoxing.com/searchjour?sw=%E5%86%B6%E9%87%91&topsearch=0&size=50',
            'Sec-Fetch-Dest':'document',
            'Sec-Fetch-Mode':'navigate',
            'Sec-Fetch-Site':'same-origin',
            'Sec-Fetch-User':'?1',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
            'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            'sec-ch-ua-mobile':'?0',
            'sec-ch-ua-platform':'"Windows"'
        }
        try:
            response = requests.get(url, headers=header, verify=False, timeout=10)
            print(f"请求返回的code码{response.status_code}")
            html = response.text
            print(html)
        except Exception as e:
            # Deliberately best effort: report the failure and hand back an
            # empty page instead of raising.
            print(f'请求失败：{url} {e}')
            html = ''
        return html

    def webdriver(self):
        """Create and return a configured Chrome WebDriver.

        The browser binary, chromedriver and profile directory are fixed
        local Windows paths.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--start-maximized")
        # Reuse a local Chrome profile directory.
        profile_path = r'C:\Users\WIN10\AppData\Local\Google\Chrome\User Data\Default'
        chrome_options.add_argument(f'--user-data-dir={profile_path}')
        chrome_options.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
        executable_path = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
        try:
            driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
        except TypeError:
            # Selenium releases that no longer accept ``executable_path``
            # take the driver location through a Service object instead.
            from selenium.webdriver.chrome.service import Service
            driver = webdriver.Chrome(options=chrome_options, service=Service(executable_path))
        return driver

    def paserUrl(self, html, listurl):
        """Parse *html* and make ``<a href>`` / ``<img src>`` URLs absolute.

        Args:
            html: Markup to parse.
            listurl: Base URL that relative addresses are resolved against.

        Returns:
            The ``BeautifulSoup`` tree with the rewritten attributes.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all(['a', 'img']):
            # A tag carrying both attributes only has its href rewritten.
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return soup

    def rmTagattr(self, html, url):
        """Return prettified *html* with every attribute except ``src`` removed.

        URLs are made absolute against *url* first. Because only ``src`` is
        kept, links lose their ``href`` along with all styling attributes.
        """
        soup = self.paserUrl(html, url)
        for tag in soup.find_all(True):
            tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
        return soup.prettify()

    def pageList(self, first_page=1, last_page=99):
        """Crawl the search result pages and process every listed article.

        Args:
            first_page: First result page to fetch (1-based).
            last_page: Last result page to fetch, inclusive.
        """
        for i in range(first_page, last_page + 1):
            url = f'https://qikan.chaoxing.com/searchjour?sw=%E5%86%B6%E9%87%91&nosim=1&size=50&x=0_7209&pages={i}'
            print(f'采集第{i}页数据')
            html = self._fetch_page_source(url)
            if html is None:
                # The page could not be loaded; there is nothing to parse.
                continue
            doc = pq(self.paserUrl(html, url).prettify())
            for row in doc('table[class="listTable"]>tbody>tr').items():
                turl = row("td:nth-child(2)>a").attr('href')
                if not turl:
                    # Rows without an article link cannot be opened.
                    continue
                # Throttle requests to the detail pages.
                time.sleep(self.ROW_DELAY)
                yearqi = self._compact(row("td:nth-child(6)").text())
                doc_type = self._compact(row("td:nth-child(7)").text())
                yincount = self._compact(row("td:nth-child(8)").text())
                readcount = self._compact(row("td:nth-child(9)").text())
                detailmsg = {
                    'turl': turl,
                    'yearqi': yearqi,
                    'type': doc_type,
                    'yincount': yincount,
                    'readcount': readcount,
                }
                print(f"解析详情页面地址：{turl}")
                self.paserDetail(detailmsg)

    def paserDetail(self, detailmsg):
        """Open an article page, extend *detailmsg* with its fields and save it.

        Args:
            detailmsg: Record built from the list page; must contain ``'turl'``.
                It is updated in place with the detail-page fields.

        The record is skipped (nothing is written) when the page cannot be
        loaded.
        """
        durl = detailmsg['turl']
        html = self._fetch_page_source(durl)
        if html is None:
            return

        ddoc = pq(self.paserUrl(html, durl).prettify())
        title = ddoc('h1[class="F_titel"]').text()
        # Drop the superscript markers before reading the author line.
        ddoc('p[class="F_name"]>sup').empty()
        author = ddoc('p[class="F_name"]').text()
        # Superscripts in the first row's second cell are cleared as well.
        # NOTE(review): the text of that cell (presumably the author
        # affiliation) has never been exported; add a field here if it is
        # wanted.
        ddoc.find('tr:nth-child(1)>td:nth-child(2)>sup').empty()
        source = ddoc.find('tr:nth-child(2)>td:nth-child(2)').text()
        classnum = ddoc.find('tr:nth-child(3)>td:nth-child(2)').text()
        classnav = ddoc.find('tr:nth-child(4)>td:nth-child(2)').text()
        keyword = ddoc.find('tr:nth-child(5)>td:nth-child(2)').text()
        jijin = ddoc.find('tr:nth-child(6)>td:nth-child(2)').text()
        summ = ddoc.find('tr:nth-child(7)>td:nth-child(2)').text()
        pdfurl = ddoc.find('a[class="pdfdown"]').attr('href')
        contentTag = ddoc.find('div[id="FtextCon"]')
        content = contentTag.text()
        contentWithTag = self.rmTagattr(str(contentTag), durl)

        detailmsg['title'] = title
        detailmsg['author'] = author
        detailmsg['source'] = source
        detailmsg['classnum'] = classnum
        detailmsg['classnav'] = classnav
        detailmsg['keyword'] = keyword
        detailmsg['jijin'] = jijin
        detailmsg['summ'] = summ
        detailmsg['pdfurl'] = pdfurl
        detailmsg['content'] = content
        detailmsg['contentWithTag'] = contentWithTag
        print(f"详情数据入口{pdfurl}")
        self.writerToExcel([detailmsg])

    def writerToExcel(self, detailList):
        """Append the records in *detailList* to the output workbook.

        The existing sheet is read, the new rows are added at the end and the
        whole sheet is written back, so every record is persisted as soon as
        it has been scraped. A missing workbook is created.

        Args:
            detailList: List of record dicts, one per row.
        """
        path = self._output_path()
        new_data = pd.DataFrame(data=detailList)
        try:
            existing_data = pd.read_excel(path)
        except FileNotFoundError:
            existing_data = None
        if existing_data is None or existing_data.empty:
            # Nothing stored yet (e.g. a freshly created workbook).
            combined_data = new_data
        else:
            # DataFrame.append no longer exists in pandas 2.x.
            combined_data = pd.concat([existing_data, new_data], ignore_index=True)
        combined_data.to_excel(path, index=False)
        print('保存成功！！')




if __name__ == '__main__':
    # Kept as a module-level name: Chaoxing.writerToExcel looks the output
    # path up under ``filename`` in this module's globals.
    filename = '超星期刊.xlsx'
    # Start every run from a fresh, empty workbook.
    workbook = Workbook()
    workbook.save(filename)
    chaoxing = Chaoxing()
    try:
        chaoxing.pageList()
    finally:
        # Always end the browser session so no Chrome/chromedriver process
        # outlives the script, even when the crawl aborts with an error.
        chaoxing.driver.quit()






