# -*- coding: utf-8 -*-
import time
import urllib

import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import wget
from openpyxl import Workbook
import pandas as pd

def createDriver():
    """Create a Chrome WebDriver using the locally configured driver and browser.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and should call
        ``quit()`` when finished.
    """
    # Machine-specific locations of the chromedriver executable and Chrome binary.
    chrome_driver = r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
    service = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
    # Optional proxy configuration:
    # proxy = "127.0.0.1:8080"  # proxy host and port
    # chrome_options.add_argument('--proxy-server=http://' + proxy)
    # ``options=`` is the supported keyword alongside ``service=``; the legacy
    # ``chrome_options=`` alias is rejected by newer Selenium 4 releases.
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def _safe_filename(title):
    """Turn a page title into a string usable as a Windows file name."""
    # Collapse runs of whitespace (including newlines) into single spaces.
    cleaned = ' '.join(title.split())
    # Replace characters Windows forbids in file names, plus control characters.
    cleaned = ''.join(
        '_' if ch in '<>:"/\\|?*' or ord(ch) < 32 else ch for ch in cleaned
    )
    # Cap the length, then drop trailing/leading spaces and dots, which
    # Windows does not accept at the end of a file name.
    cleaned = cleaned[:150].strip(' .')
    return cleaned or 'untitled'


def listPage():
    """Crawl the BIS "Russia" search results and record every hit.

    Six result pages are visited (``start`` = 0, 20, ..., 100). For each
    result the detail page is fetched with ``detail``, an attached PDF (if
    any) is saved under ``D:/cis/`` via ``download_file``, and one row is
    appended to the Excel file through ``writerToExcel``. Rows are written
    one at a time so results gathered before a failure are kept.

    The browser created here is always closed, even when crawling fails.
    """
    driver = createDriver()
    try:
        for page in range(0, 6):
            start = page * 20
            list_url = f'https://www.bis.doc.gov/index.php/smart-search?searchword=Russia&searchphrase=all&start={start}'
            driver.get(list_url)
            # Resolve relative links first so detail URLs are absolute.
            soup = paserUrl(driver.page_source, list_url)
            doc = pq(str(soup.prettify()))
            titles = doc('dl[class="search-results"]>dt')
            dates = doc('dl[class="search-results"]>dd[class="result-created"]')

            # zip() stops at the shorter sequence, so a page with unequal
            # numbers of <dt>/<dd> entries no longer raises IndexError.
            # NOTE(review): pairing is positional; if the counts differ the
            # dates may not line up with their titles.
            for tt, dd in zip(titles, dates):
                ttdoc = pq(tt)
                dddoc = pq(dd)
                title = ttdoc('a').text()
                date = dddoc('dd[class="result-created"]').text()
                detail_url = ttdoc('a').attr('href')
                if detail_url:
                    pdfurl, content = detail(driver, detail_url)
                else:
                    # No link to follow; still record the title and date.
                    pdfurl, content = '', ''
                if pdfurl:
                    # Titles can contain characters that are illegal in
                    # Windows file names, so sanitise before building the path.
                    pdfpath = "D:/cis/" + _safe_filename(title) + ".pdf"
                    download_file(pdfurl, pdfpath)
                else:
                    pdfpath = ''
                detailmsg = {
                    "title": title,
                    "date": date,
                    "url": detail_url,
                    "content": content,
                    "pdfurl": pdfurl,
                    "pdfpath": pdfpath,
                }
                writerToExcel([detailmsg])
    finally:
        driver.quit()

def detail(driver,url):
    """Load a detail page and extract its PDF link and body text.

    The page is requested up to five times; an attempt counts as failed when
    loading or parsing raises, or when the resulting HTML is empty.

    Args:
        driver: browser driver exposing ``get(url)`` and ``page_source``.
        url: address of the detail page.

    Returns:
        A ``(pdfurl, content)`` tuple. ``pdfurl`` is the ``href`` of the link
        inside ``div.docman_download`` and ``content`` is the text of
        ``div.item-page``. Each is ``''`` when it is missing, and both are
        ``''`` when the page could not be loaded at all.
    """
    max_attempts = 5
    html = ''
    for attempt in range(1, max_attempts + 1):
        try:
            driver.get(url)
            # Fixed pause after navigation, before reading the page source.
            time.sleep(3)
            # Make link targets absolute so the extracted PDF URL is usable.
            soup = paserUrl(driver.page_source, url)
            html = str(soup.prettify())
        except Exception as e:
            print(f'detail: attempt {attempt}/{max_attempts} failed for {url}: {e}')
            html = ''
        if html:
            break
    if not html:
        # Every attempt failed: report "nothing found" instead of trying to
        # parse an empty document.
        return '', ''
    # The HTML already has absolute links, so it is parsed directly.
    docc = pq(html.encode('utf-8'))
    try:
        # attr() yields None when no element matches; normalise that to ''.
        pdfurl = docc('div[class="docman_download"]>a').attr('href') or ''
    except Exception as e:
        print(f'detail: could not read the PDF link on {url}: {e}')
        pdfurl = ''
    try:
        content = docc('div[class="item-page"]').text() or ''
    except Exception as e:
        print(f'detail: could not read the page text on {url}: {e}')
        content = ''
    return pdfurl, content
# Append data to the Excel file
def writerToExcel(detailList, path=None):
    """Append rows to an existing ``.xlsx`` file.

    The file is read in full, the new rows are added after the existing
    ones, and the combined table is written back without an index column.

    Args:
        detailList: list of dicts, one per row; the dict keys become the
            column names.
        path: workbook to update. When omitted, the module-level ``filename``
            variable is used, which keeps existing callers working.
    """
    if not detailList:
        # Nothing to add; skip the read/rewrite round trip.
        return
    target = filename if path is None else path
    # Read the existing workbook.
    existing_data = pd.read_excel(target)
    # Build a frame from the new rows.
    new_data = pd.DataFrame(data=detailList)
    if existing_data.columns.empty:
        # A freshly created workbook has no columns yet, so the new rows
        # define the whole table.
        combined_data = new_data
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    # Write the combined table back to the workbook.
    combined_data.to_excel(target, index=False)

def download_file(url, save_path):
    """Download ``url`` through the local proxy and save it to ``save_path``.

    Up to five attempts are made, five seconds apart. A response with an
    HTTP error status counts as a failed attempt, so an error page is never
    written to disk in place of the document. When every attempt fails the
    URL is printed.

    Args:
        url: address of the file to download.
        save_path: path the response body is written to.

    Returns:
        ``save_path``, whether or not the download succeeded.
    """
    # Browser-like request headers, built once for all attempts.
    # NOTE(review): the Cookie value looks like a captured browser session and
    # has probably expired - confirm whether it is still needed.
    header = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'no-cache',
        'Connection':'keep-alive',
        'Cookie':'b099bcecf0be876536bb9d4826b25ba8=e2horegllcbddejsiveijp7of0; cookiesession1=678A3E12247313FBD6F74569925F4EFD; _ga=GA1.1.840765784.1693040959; __cf_bm=4DTZeDEU67Xjr5nt9OsbE1g1UTdVuOGdQlhj4KD5U2I-1693190695-0-AW81rfvAFUnclDkFVJYqD8+RWrC8FngMzW0dJ+bVHA+JwmPUVpc9/ogA0jhXrKLFYWun2BoK0R/hqWgGZAw/I1Y=; referrer_site=https%3A%2F%2Fwww.bis.doc.gov%2Findex.php%2Fsmart-search%3Fsearchword%3DRussia%26searchphrase%3Dall; csrf_token=a0d03e256a36d037708a809220564f407dee78bc; _ga_TPRT7QB30Y=GS1.1.1693190696.4.1.1693190720.0.0.0',
        'Host':'www.bis.doc.gov',
        'Pragma':'no-cache',
        'Referer':'https://www.bis.doc.gov/index.php/documents/product-guidance/3300-russia-medical-related-license-application-guidance-fpd-final-incorp-occ-and-3f-cmts-clean-071323',
        'Sec-Fetch-Dest':'document',
        'Sec-Fetch-Mode':'navigate',
        'Sec-Fetch-Site':'same-origin',
        'Sec-Fetch-User':'?1',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"'
    }
    proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            # NOTE(review): verify=False turns off TLS certificate checks;
            # presumably required by the local proxy - confirm.
            response = requests.get(url, proxies=proxy, headers=header, verify=False,timeout=10)
            # Treat 4xx/5xx answers as failures instead of saving their body.
            response.raise_for_status()
            with open(save_path, 'wb') as file:
                file.write(response.content)
            break
        except (requests.RequestException, OSError) as e:
            print(e)
            if attempt < max_attempts:
                # Back off before the next attempt.
                time.sleep(5)
    else:
        # Every attempt failed; report the URL that could not be fetched.
        print(url)
    return save_path

def download_file3(url, save_path):
    """Download ``url`` to ``save_path`` using ``wget.download``.

    No custom request headers are sent. The header dict and
    ``urllib.request.Request`` object that used to be built here were never
    passed to the download call, so they have been removed as dead code.

    Args:
        url: address of the file to download.
        save_path: output location handed to ``wget.download``.
    """
    wget.download(url,save_path)

# Convert relative URLs in the HTML into absolute URLs

def paserUrl(html,listurl):
    """Parse ``html`` and rewrite its link targets as absolute URLs.

    Every ``<a>`` and ``<img>`` tag is visited. A tag's ``href`` is resolved
    against ``listurl``; a tag without ``href`` has its ``src`` resolved
    instead. Tags with neither attribute are left untouched.

    Args:
        html: markup to parse.
        listurl: URL of the page the markup came from, used as the base.

    Returns:
        The parsed ``BeautifulSoup`` tree with the rewritten attributes.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(['a', 'img']):
        # ``href`` takes precedence; at most one attribute is rewritten per tag.
        attr = next((name for name in ('href', 'src') if name in tag.attrs), None)
        if attr is None:
            continue
        tag[attr] = urljoin(listurl, tag[attr])
    return soup


if __name__ == '__main__':
    # Start from a freshly saved, empty workbook; writerToExcel reads the
    # module-level ``filename`` and appends rows to that file.
    filename = 'cis.xlsx'
    Workbook().save(filename)
    listPage()
    # Manual check of a single detail page:
    # driver = createDriver()
    # url = 'https://www.bis.doc.gov/index.php/policy-guidance/deemed-exports/deemed-exports-faqs/faq/116-what-areas-are-considered-russia-for-purposes-of-these-sanctions'
    # detail(driver, url)




