#coding=utf-8
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser

from smart_extractor import SmartExtractor
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote, unquote
from pyquery import PyQuery as pq

class JrttnewsSpider(object):

    def __init__(self,searchkw,wordsCode,sid):
        """Load ``config.ini`` and set up the logger, Redis client and work queues.

        Args:
            searchkw: search keyword; used in the search URL and stored with
                every crawled item.
            wordsCode: code appended to the Redis de-duplication key and
                written to the database rows.
            sid: identifier written to the database rows and Kafka messages.
        """
        cfg = configparser.ConfigParser()
        cfg.read('config.ini')
        self.config = cfg

        self.logger = BaseCore().getLogger()
        self.url = 'https://www.sogou.com/'

        # Redis holds the set of already-processed article URLs.
        redis_settings = {
            'host': cfg.get('redis', 'host'),
            'port': cfg.get('redis', 'port'),
            'password': cfg.get('redis', 'pass'),
            'db': 0,
        }
        self.r = redis.Redis(**redis_settings)

        self.page_num = 1
        self.kafka_bootstrap_servers = cfg.get('kafka', 'bootstrap_servers')

        # detailList carries list-page results to the detail stage.
        self.qtitle, self.qurl, self.detailList = Queue(), Queue(), Queue()
        self.searchkw, self.wordsCode, self.sid = searchkw, wordsCode, sid

    #将列表数据插入到表中 baidu_search_result
    def itemInsertToTable(self,items):
        try:
            itemdata=[]
            conx,cursorM=self.connMysql()
            for item in items:
                nowtime=self.getNowDate()
                data=(self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime)
                itemdata.append(data)

            sql ="INSERT into baidu_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursorM.executemany(sql, itemdata)
            self.logger.info("数据插入数据库成功！")
            # 定义插入数据的SQL语句
            # 执行插入操作
            conx.commit()
        except Exception as e:
            self.logger.info("数据插入数据库失败！")
        finally:
            self.closeSql(conx,cursorM)

    def connMysql(self):
        """Open a MySQL connection using the ``[mysql]`` section of the config.

        Returns:
            A ``(connection, cursor)`` tuple; the caller is responsible for
            closing both (see ``closeSql``).
        """
        # (pymysql keyword, config option) pairs, read in the original order.
        option_map = (
            ('host', 'host'),
            ('user', 'username'),
            ('password', 'password'),
            ('database', 'database'),
        )
        settings = {kwarg: self.config.get('mysql', option) for kwarg, option in option_map}
        connection = pymysql.connect(**settings)
        return connection, connection.cursor()

    def closeSql(self,conx,cursorM):
        # 关闭游标和连接
        cursorM.close()
        conx.close()

    # 解析页面
    def parse_page(self):
        """Parse the page currently loaded in ``self.driver``.

        Returns:
            ``(flag, lists)``: ``flag`` is the first ``<a id="sogou_next">``
            element and ``lists`` the items from ``xpath_paser``. When that
            element cannot be found, ``flag`` is ``''`` and ``lists`` is
            emptied.

        NOTE(review): ``self.driver`` is not assigned by any method visible in
        this class - confirm where it is expected to be set.
        """
        self.logger.info('解析今日头条列表页')
        # Drop the <em> highlight markers so text() nodes are not split.
        page_source = self.driver.page_source.replace('<em>', '').replace('</em>', '')
        tree = etree.HTML(page_source)
        lists = self.xpath_paser(tree)
        try:
            flag = tree.xpath('//a[@id="sogou_next"]')[0]
        except Exception:
            flag, lists = '', []
        return flag, lists

    def getRealUrl(self,url):
        try:
            header={
                "accept":"*/*",
                "connection":"Keep-Alive",
                "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
            }
            # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
            url=f"https://www.sogou.com{url}"
            res = requests.get(url,headers=header)
            text=res.text
            # 定义正则表达式
            pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
            # 在给定的字符串中寻找匹配的URL
            urls = re.findall(pattern, text)
            uri=''
            if len(urls)>1:
                uri=urls[0]
        except Exception as e:
            self.logger.info("链接转换异常！")
        return uri

    def xpath_paser(self,html):
        lists=[]
        itemTag=html.xpath('//div[@class="vrwrap"]')
        for itemTag in itemTag:
            try:
                title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
            except Exception as e:
                title=''
            try:
                detailUrl=itemTag.xpath('.//h3[@class="vr-title"]/a/@href')[0]
                detailUrl=self.getRealUrl(detailUrl)
            except Exception as e:
                detailUrl=''
            try:
                sourceTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[1]/text()')[0]
            except Exception as e:
                sourceTag=''
            try:
                publishTag=itemTag.xpath('.//p[@class="news-from text-lightgray"]/span[2]/text()')[0]
                publishTag=str(publishTag)
                publishtime=self.paserTime(publishTag)
                publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
            except Exception as e:
                publishTag=''
            detailmsg={
                'title':title,
                'detailUrl':detailUrl,
                'sourceTag':sourceTag,
                'publishTag':publishTag
            }
            lists.append(detailmsg)
        self.logger.info(f'列表获取信息的条数{len(lists)}')
        return lists

        #获取当前时间

    def getNowDate(self):
        # 获取当前时间
        current_time = datetime.datetime.now()
        # 将时间转换为字符串
        currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
        return currentdate

    #智能抽取
    def paserDetail(self,detailhtml,detailurl):
        """Run ``GeneralNewsExtractor`` over an article page.

        Args:
            detailhtml: HTML source of the article page.
            detailurl: URL of the page, passed to the extractor as ``host``.

        Returns:
            Whatever ``extract(..., with_body_html=True)`` returns, or ``{}``
            when extraction fails.
        """
        try:
            extractor = GeneralNewsExtractor()
            return extractor.extract(detailhtml,host=detailurl,with_body_html=True)
        except Exception:
            # Best-effort extraction; unlike a bare ``except:`` this does not
            # swallow KeyboardInterrupt/SystemExit.
            return {}
    #解析时间
    def paserTime(self,publishtime):
        timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
        current_datetime = datetime.datetime.now()
        publishtime=publishtime.strip()
        print(publishtime)
        try:
            if '年前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(days=365 * day)
                publishtime = current_datetime - delta
            elif '月前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(months= day)
                publishtime = current_datetime - delta
            elif '周前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(weeks= day)
                publishtime = current_datetime - delta
            elif '天前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(days= day)
                publishtime = current_datetime - delta
            elif '前天' in publishtime:
                delta = datetime.timedelta(days= 2)
                publishtime = current_datetime - delta
            elif '昨天' in publishtime:
                current_datetime = datetime.datetime.now()
                delta = datetime.timedelta(days= 1)
                publishtime = current_datetime - delta
            elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
                delta = datetime.timedelta(hours= 5)
                publishtime = current_datetime - delta
            elif '年' in publishtime and '月' in publishtime :
                time_format = '%Y年%m月%d日'
                publishtime = datetime.datetime.strptime(publishtime, time_format)
            elif '月' in publishtime and '日' in publishtime :
                current_year = current_datetime.year
                time_format = '%Y年%m月%d日'
                publishtime=str(current_year)+'年'+publishtime
                publishtime = datetime.datetime.strptime(publishtime, time_format)
            elif '-' in publishtime:
                time_format = '%Y-%m-%d'
                publishtime = datetime.datetime.strptime(publishtime, time_format)
        except Exception as e:
            print('时间解析异常！！')
        return publishtime


    def reqHtml(self,url):
        """Fetch ``url`` with browser-like headers and return the body as text.

        The request uses a 10 second timeout and skips TLS certificate
        verification. The response is decoded as UTF-8.

        Returns:
            The response text, or ``''`` if anything goes wrong.
        """
        # Headers captured from a desktop Chrome session on so.toutiao.com.
        request_headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cache-Control':'no-cache',
            'Connection':'keep-alive',
            'Cookie':'tt_webid=7283314732298225163; _ga=GA1.1.1730036912.1695778874; _tea_utm_cache_4916=undefined; _S_DPR=1; _S_IPAD=0; s_v_web_id=verify_ln12yyu3_qeLMwQ8s_Offy_4w8b_9kv1_hMDj7V2H2wuE; msToken=7l75aR51vcmcW4LxtvP1cUt2trK37XA-oZdZRTD2Are065KuEBsofVz7vcQ7kFRXkKXY-I0ydJEkpNrx1_XWuurUFWTyIxMuf8Xg5dg-; _ga_QEHZPBE5HH=GS1.1.1695778874.1.1.1695778928.0.0.0; ttwid=1%7C13mqlyEtsSnqRlDNgTCNya74xNS4Azg1-cqxvZ2aJQs%7C1695778929%7C6462d58bd323e4560a0f5db0c443e767a3716878843c0f9a1dec190be930fa37; _S_WIN_WH=1366_353',
            'Host':'so.toutiao.com',
            'Pragma':'no-cache',
            'Referer':'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword=%E6%B5%99%E6%B1%9F%E5%9B%BD%E6%9C%89%E8%B5%84%E6%9C%AC%E8%BF%90%E8%90%A5%E5%85%AC%E5%8F%B8&pd=information&action_type=pagination&page_num=1&search_id=202309270941439BB9AFF54062FE7CAC13&from=news&cur_tab_title=news',
            'Sec-Fetch-Dest':'document',
            'Sec-Fetch-Mode':'navigate',
            'Sec-Fetch-Site':'same-origin',
            'Sec-Fetch-User':'?1',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
            'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            'sec-ch-ua-mobile':'?0',
            'sec-ch-ua-platform':'"Windows"'
        }
        try:
            response = requests.get(url, headers=request_headers, verify=False, timeout=10)
            response.encoding = 'utf-8'
            return response.text
        except Exception:
            return ''

    def get_realurl(self,tmpurl):
        """Extract and percent-decode the ``url=`` value of a jump link.

        Args:
            tmpurl: link containing ``url=<encoded target>&aid``; may be
                ``None`` when the anchor had no ``href``.

        Returns:
            The decoded target URL, or ``''`` when ``tmpurl`` is not a string
            or does not contain the expected pattern.
        """
        # Explicit guard replaces the former bare ``except:`` around re.search.
        if not isinstance(tmpurl, str):
            return ''
        match = re.search(r'url=(.+?)&aid', tmpurl)
        if match is None:
            return ''
        return unquote(match.group(1))


    def getFormatedate(self,timestamp):
        date = datetime.datetime.fromtimestamp(timestamp)
        formatted_date = date.strftime('%Y-%m-%d')
        return formatted_date

    # 获取每一页数据, 开趴.
    def get_page_html(self):
        #设置采集列表页面和页数
        totalnum=3
        keyword=self.searchkw
        # keyword='浙江国有资本运营公司'
        for pagenum in range(0,totalnum):

            url=f'https://so.toutiao.com/search?dvpf=pc&source=pagination&keyword={keyword}&pd=information&action_type=pagination&page_num={pagenum}&from=news&cur_tab_title=news'
            lhtml=self.reqHtml(url)
            soup = BeautifulSoup(lhtml, 'html.parser')
            result_contents=soup.select('div[class="s-result-list"]')
            for lists in result_contents:
                doc=pq(str(lists))
                listcontent=doc.find('div[class="result-content"]')
                for litag in listcontent:
                    try:
                        lidoc=pq(litag)
                        ahref=lidoc.find('a[class="text-ellipsis text-underline-hover"]').attr('href')
                        durl=self.get_realurl(ahref)
                        title=lidoc.find('a[class="text-ellipsis text-underline-hover"]').text().replace('\n','')
                        source=lidoc.find('div[class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]>span:nth-child(1)').text().replace('\n','')
                        publishdate=lidoc.find('div[class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]>span:last-child').text().replace('\n','')
                        publishdate=self.paserTime(publishdate)
                        if isinstance(publishdate, str):
                            pubdate=publishdate
                        else:
                            pubdate=publishdate.strftime("%Y-%m-%d %H:%M:%S")
                        is_member = self.r.sismember('pysouhunews_'+self.wordsCode, durl)
                        if is_member:
                            continue
                        detailmsg={
                            'title':title,
                            'detailUrl':durl,
                            'sourceTag':source,
                            'publishTag':pubdate
                        }
                        self.detailList.put(detailmsg)
                    except Exception as e:
                        print(e)
                        continue

    # 获取详情页
    def get_detail_html(self):
        # 获取当前窗口的句柄
        # current_window = self.driver.current_window_handle
        while True:
            if self.detailList.qsize() != 0:
                try:
                    detailmsg=self.detailList.get()
                    title = detailmsg['title']
                    detailUrl = detailmsg['detailUrl']
                    print("%s:%s\n" % (title, detailUrl))
                    bdetail=self.getDetailmsg(detailmsg)
                    processitem=self.getProcessitem(bdetail)
                    try:
                        self.sendkafka(processitem)
                        self.r.sadd('pysouhunews_'+self.wordsCode, processitem['sourceAddress'])
                    except Exception as e:
                        self.logger.info("放入kafka失败！")
                    #插入数据库
                    try:
                        items=[]
                        items.append(bdetail)
                        self.itemInsertToTable(items)
                    except Exception as e:
                        self.logger.info("插入数据库失败！")
                    # 关闭当前新窗口
                    # self.driver.close()
                    time.sleep(1)
                except Exception as e:
                    time.sleep(3)
                    self.logger.info("详情页解析异常！"+detailUrl)
            else:
                break
                # time.sleep(5)

    #解析详情
    def getDetailmsg(self,detailmsg):
        """Build the full article record for one list-page item.

        Args:
            detailmsg: dict with the keys ``title``, ``detailUrl``,
                ``sourceTag`` and ``publishTag`` (a string).

        Returns:
            A dict with the keys ``title``, ``source``, ``detailurl``,
            ``content``, ``contentHtml``, ``publishtime``, ``currentdate`` and
            ``kword``. ``content`` and ``contentHtml`` are empty strings when
            extraction or tag clean-up fails.
        """
        try:
            detailurl = detailmsg['detailUrl']
            content, tagged = self.extractorMsg(detailurl, detailmsg['title'])
            tagged = self.rmTagattr(tagged, detailurl)
        except Exception:
            content = tagged = ''

        stamp = self.getNowDate()
        keyword = self.searchkw
        # Concatenating with '' raises TypeError if publishTag is not a string.
        published = detailmsg['publishTag'] + ''
        return {
            'title': detailmsg['title'],
            'source': detailmsg['sourceTag'],
            'detailurl': detailurl,
            'content': content,
            'contentHtml': tagged,
            'publishtime': published,
            'currentdate': stamp,
            'kword': keyword,
        }

    def webDriver(self,url):
        """Load ``url`` in a throw-away Chrome instance and return its HTML.

        The driver and browser binary paths come from the ``[selenium]``
        section of the config. The browser is always quit before returning.

        Returns:
            The page source, or ``''`` if loading the page failed.
        """
        chrome_driver =self.config.get('selenium', 'chrome_driver')
        path = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location =self.config.get('selenium', 'binary_location')
        # ``options=`` is the supported keyword; the deprecated
        # ``chrome_options=`` alias is rejected by newer Selenium 4 releases.
        driver = webdriver.Chrome(service=path,options=chrome_options)
        html=''
        try:
            driver.get(url)
            # Give the page a moment to finish loading.
            time.sleep(2)
            html=driver.page_source
        except Exception as e:
            self.logger.info('请求失败')
        finally:
            driver.quit()

        return html

    def createDriver(self):
        """Create a Chrome WebDriver configured from the ``[selenium]`` section.

        The ``enable-automation`` switch is excluded from the launch options.

        Returns:
            A new driver; the caller owns it and must call ``quit()``.
        """
        chrome_driver =self.config.get('selenium', 'chrome_driver')
        path =  Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.binary_location =self.config.get('selenium', 'binary_location')
        # ``options=`` is the supported keyword; the deprecated
        # ``chrome_options=`` alias is rejected by newer Selenium 4 releases.
        driver = webdriver.Chrome(service=path,options=chrome_options)
        return driver

    def extractorMsg(self,url,title):
        """Open an article in Chrome and extract its text and HTML body.

        The ``div.article-content > article`` element is tried first. If it
        yields no text, ``SmartExtractor`` is run on the whole page.

        Args:
            url: article URL to load.
            title: article title, used only to detect the language for
                ``SmartExtractor``.

        Returns:
            ``(content, contentWithTag)``; both are ``''`` when the page could
            not be loaded or nothing could be extracted.
        """
        content=''
        contentWithTag=''
        lang=self.detect_language(title)
        sm=SmartExtractor(lang)
        driver=None
        try:
            driver=self.createDriver()
            driver.get(url)
            # Wait up to 10 seconds for the <article> element to appear.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "article")))
            raw_html=driver.page_source
            if raw_html:
                try:
                    soup=BeautifulSoup(raw_html,'html.parser')
                    tdoc=soup.select('div[class="article-content"]>article')[0]
                    content=tdoc.text
                    contentWithTag=str(tdoc)
                except Exception as e:
                    self.logger.info("定位解析失败！")
                if not content:
                    # Targeted selector found nothing: fall back to the
                    # generic extractor.
                    article=sm.extract_by_html(raw_html)
                    content=article.cleaned_text
                    contentWithTag=article.text
        except Exception as e:
            self.logger.info("抽取解析失败！")
        finally:
            # Always release the browser; otherwise every article leaves a
            # Chrome process behind.
            if driver is not None:
                driver.quit()

        return content,contentWithTag

    def detect_language(self,html):
        """Return the language code ``langid`` assigns to the text of ``html``.

        Markup is stripped with BeautifulSoup before classification; the
        confidence score returned by ``langid`` is discarded.
        """
        plain_text = BeautifulSoup(html, "html.parser").get_text()
        language, _score = langid.classify(plain_text)
        return language

    def rmTagattr(self,html,url):
        # 使用BeautifulSoup解析网页内容
        # soup = BeautifulSoup(html, 'html.parser')
        soup = self.paserUrl(html,url)
        # 遍历所有标签，并去掉属性
        for tag in soup.find_all(True):
            if tag.name == 'img':
                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
            elif tag.name !='img':
                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
            else:
                tag.attrs = {key: value for key, value in tag.attrs.items()}
        # 打印去掉属性后的网页内容
        # print(soup.prettify())
        html=soup.prettify()
        return html

    # 将html中的相对地址转换成绝对地址
    def paserUrl(self,html,listurl):
        """Make the links of ``<a>`` and ``<img>`` tags absolute.

        Args:
            html: markup to parse.
            listurl: base URL the relative addresses are resolved against.

        Returns:
            The parsed ``BeautifulSoup`` document with rewritten links. For
            each tag only one attribute is rewritten: ``href`` if present,
            otherwise ``src``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for node in soup.find_all(['a', 'img']):
            target = next((name for name in ('href', 'src') if name in node.attrs), None)
            if target is not None:
                node[target] = urljoin(listurl, node[target])
        return soup

    def getProcessitem(self,bdetail):
        """Convert an article record into the message published to Kafka.

        Args:
            bdetail: dict with the keys ``title``, ``content``,
                ``contentHtml``, ``source``, ``publishtime`` and ``detailurl``.

        Returns:
            A dict with the keys ``sid``, ``source``, ``title``, ``content``,
            ``contentWithtag``, ``origin``, ``publishDate``, ``sourceAddress``
            and ``createDate``.

        Raises:
            ValueError: if ``content`` is the empty string. Previously this
                case failed with an accidental ``UnboundLocalError``; callers
                that catch ``Exception`` behave as before.
        """
        if bdetail['content'] == '':
            raise ValueError('article content is empty')
        return {
            "sid":self.sid,
            "source":"5",
            "title":bdetail['title'],
            "content":bdetail['content'],
            "contentWithtag":bdetail['contentHtml'],
            "origin":bdetail['source'],
            "publishDate":bdetail['publishtime'],
            "sourceAddress":bdetail['detailurl'],
            "createDate":self.getNowDate()
        }

    def sendkafka(self,processitem):
        try:
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
            content=processitem['content']
            publishDate=str(processitem['publishDate'])
            title=processitem['title']
            if title =='':
                return
            if content=='':
                return
            if publishDate=='':
                return
            kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
            self.logger.info("数据发送kafka成功")
            self.logger.info(kafka_result.get(timeout=10))
        except Exception as e:
            self.logger.info('发送kafka异常')
        finally:
            producer.close()

    def run(self):
        """Run the crawl: collect list-page results, then process each article.

        The two stages run sequentially in the calling thread.
        ``get_page_html`` fills ``self.detailList`` and ``get_detail_html``
        drains it. The previous body only referenced ``self.get_page_html``
        without calling it, so nothing was executed.
        """
        self.get_page_html()
        self.get_detail_html()


if __name__ == '__main__':
    # The spider cannot be built without its three constructor arguments, so
    # take them from the command line instead of calling JrttnewsSpider()
    # bare (which raised TypeError).
    if len(sys.argv) != 4:
        sys.exit(f'usage: {sys.argv[0]} <searchkw> <wordsCode> <sid>')
    zhuce = JrttnewsSpider(*sys.argv[1:4])
    zhuce.run()