import pandas as pd
import requests
from bs4 import BeautifulSoup
from langid import langid
from openpyxl import Workbook
from requests.packages import urllib3
from smart_extractor import SmartExtractor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def detect_language(html):
    """Return the language label langid assigns to the text inside ``html``.

    The input is parsed with BeautifulSoup's ``html.parser`` and only the
    extracted text is classified.
    """
    plain_text = BeautifulSoup(html, "html.parser").get_text()
    # classify() yields a (language, score) pair; only the language is used.
    return langid.classify(plain_text)[0]

def webDriver(url,
              chrome_driver='C:/Users/WIN10/DataspellProjects/crawlerProjectDemo/tmpcrawler/cmd100/chromedriver.exe',
              chrome_binary='D:/crawler/baidu_crawler/tool/Google/Chrome/Application/chrome.exe',
              timeout=10):
    """Load ``url`` in Chrome through Selenium and return the page source.

    Args:
        url: Address of the page to open.
        chrome_driver: Path to the chromedriver executable.
        chrome_binary: Path to the Chrome browser executable.
        timeout: Seconds to wait for the ``<body>`` element to be present.

    Returns:
        The page source as a string, or ``''`` when loading the page or
        waiting for the body fails.

    Raises:
        Any exception raised while starting the browser; only failures that
        happen after the driver exists are caught here.
    """
    service = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = chrome_binary
    # Selenium 4 takes the options object through ``options=``; the legacy
    # ``chrome_options=`` keyword is deprecated and gone in newer releases.
    driver = webdriver.Chrome(service=service, options=chrome_options)
    html = ''
    try:
        driver.get(url)
        # Wait until the page body is present before reading the source.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        html = driver.page_source
    except Exception as e:
        # Best effort: report the cause and fall back to the empty string.
        print('请求失败', e)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()

    return html

def extractorMsg(url,title):
    """Extract the article body of ``url``, returning plain and tagged text.

    The language is detected once from ``title``. Extraction is first tried
    by letting SmartExtractor fetch the URL itself; if that raises, the page
    is rendered with ``webDriver`` and extraction is retried on that HTML.

    Args:
        url: Address of the article page.
        title: Article title, used only to detect the language.

    Returns:
        A ``(content, contentWithTag)`` tuple taken from the article's
        ``cleaned_text`` and ``text`` attributes, or ``('', '')`` when the
        language cannot be detected or both attempts fail.
    """
    try:
        # Detect the language once; it does not depend on how the page is
        # fetched, and a failure here makes the browser fallback pointless.
        lang = detect_language(title)
    except Exception as e:
        print('抽取失败！！', e)
        return '', ''

    try:
        article = SmartExtractor(lang).extract_by_url(url=url)
        return article.cleaned_text, article.text
    except Exception:
        # Direct extraction failed; retry on browser-rendered HTML.
        try:
            raw_html = webDriver(url)
            article = SmartExtractor(lang).extract_by_html(raw_html)
            return article.cleaned_text, article.text
        except Exception as e:
            print('抽取失败！！', e)
            return '', ''

def readExcel():
    """Load ``./xls/sid_google.xlsx`` and return its rows as a list of dicts.

    The sheet is read with ``dtype=str``; each dict maps every column name
    to that row's value for the column.
    """
    sheet = pd.read_excel('./xls/sid_google.xlsx', dtype=str)
    headers = sheet.columns
    # One dict per row, keyed by column name.
    return [
        {header: record[header] for header in headers}
        for _, record in sheet.iterrows()
    ]

def extratorMsg(detailmsgList):
    """Re-extract the content of every record and append it to the workbook.

    For each record, ``extractorMsg`` is called with its ``detailurl`` and
    ``title``. When extraction yields no content, the record's existing
    ``content`` / ``content_with_tag`` values are kept. The record is updated
    in place, its ``id`` is printed, and it is written out immediately via
    ``writerToExcel``.
    """
    for record in detailmsgList:
        body, tagged_body = extractorMsg(record['detailurl'], record['title'])
        if body is None or body == '':
            # Nothing extracted: keep what the source sheet already had.
            body = record['content']
            tagged_body = record['content_with_tag']
        record['content'] = body
        record['content_with_tag'] = tagged_body
        # Progress indicator.
        print(record['id'])
        # Persist one record at a time.
        writerToExcel([record])


# Append data to the Excel file
def writerToExcel(detailList, path=None):
    """Append rows to an existing xlsx file by rewriting the whole file.

    Args:
        detailList: List of dicts, one per row to append.
        path: Workbook to update. When omitted, the module-level ``filename``
            is used, so it must be defined before calling without ``path``.

    The existing sheet is read with ``dtype=str``, the new rows are added
    after it, and the combined table is written back without the index.
    """
    target = filename if path is None else path
    # Read what the workbook already contains.
    existing_data = pd.read_excel(target, dtype=str)
    new_data = pd.DataFrame(data=detailList)
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    # Write the combined rows back to the same file.
    combined_data.to_excel(target, index=False)


if __name__ == '__main__':
    # Output workbook path; writerToExcel reads this module-level name.
    filename='google.xlsx'
    # Start from a fresh, empty workbook that rows are later appended to.
    Workbook().save(filename)
    # Load the source rows, then extract and persist them one by one.
    extratorMsg(readExcel())