#coding=utf-8
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
import csv
import threading
import time
from lxml import etree
from queue import Queue
import re,sys
import datetime
import redis
from kafka import KafkaProducer
import json
from baseCore import BaseCore
import configparser

from smart_extractor import SmartExtractor
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote, unquote
from pyquery import PyQuery as pq

class JrttnewsSpider(object):

    def __init__(self,searchkw,wordsCode,sid):
        """Set up configuration, logging, Redis, work queues and the browser.

        Args:
            searchkw: Keyword to search for on Toutiao.
            wordsCode: Keyword-group code; used as the suffix of the Redis
                de-duplication set name and stored with every result.
            sid: Identifier stored with every result row / Kafka message.
        """
        # Connection settings come from config.ini in the working directory.
        self.config = configparser.ConfigParser()
        self.config.read('config.ini')
        baseCore=BaseCore()
        self.logger=baseCore.getLogger()
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot truncate or corrupt the query string.
        encoded_kw = quote(str(searchkw), safe='')
        self.url = f'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={encoded_kw}#'
        self.r = redis.Redis(host=self.config.get('redis', 'host'),
                             port=self.config.get('redis', 'port'),
                             password=self.config.get('redis', 'pass'), db=0)
        # Number of the list page currently being crawled (1-based).
        self.page_num = 1
        self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
        self.qtitle = Queue()
        self.qurl = Queue()
        # List-page entries waiting to have their detail page fetched.
        self.detailList = Queue()
        self.searchkw = searchkw
        self.wordsCode = wordsCode
        self.sid = sid
        # Long-lived browser used for the search-result list pages.
        self.driver=self.createDriver()

    #将列表数据插入到表中 meta_search_result
    def itemInsertToTable(self,items):
        """Bulk-insert crawled items into the ``meta_search_result`` table.

        Args:
            items: Iterable of dicts providing the keys ``title``,
                ``detailurl``, ``source``, ``publishtime``, ``content``,
                ``contentHtml`` and ``kword``.

        This is best effort: any failure is logged and swallowed, nothing is
        raised to the caller.
        """
        # Pre-bind so the cleanup below is safe even when connecting fails.
        conx=None
        cursorM=None
        try:
            itemdata=[]
            for item in items:
                nowtime=self.getNowDate()
                itemdata.append((self.sid,self.wordsCode,item['title'],item['detailurl'],item['source'],item['publishtime'],item['content'],item['contentHtml'],'1',item['kword'],nowtime))
            if not itemdata:
                # Nothing to write; do not open a connection for no reason.
                return

            conx,cursorM=self.connMysql()
            sql ="INSERT into meta_search_result (sid,wordsCode,title,detailurl,origin,publishdate,content,content_with_tag,state,keyword,create_time) VALUES (%s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursorM.executemany(sql, itemdata)
            conx.commit()
            # Report success only once the transaction is actually committed.
            self.logger.info("数据插入数据库成功！")
        except Exception as e:
            self.logger.info(f"数据插入数据库失败！{e}")
        finally:
            if conx is not None and cursorM is not None:
                self.closeSql(conx,cursorM)

    def connMysql(self):
        """Open a MySQL connection from the ``[mysql]`` section of the config.

        Returns:
            tuple: ``(connection, cursor)``. The caller owns both objects and
            must close them.
        """
        read = self.config.get
        connection = pymysql.connect(
            host=read('mysql', 'host'),
            user=read('mysql', 'username'),
            password=read('mysql', 'password'),
            database=read('mysql', 'database'),
        )
        return connection, connection.cursor()

    def closeSql(self,conx,cursorM):
        """Close the cursor first, then the connection it belongs to."""
        for handle in (cursorM, conx):
            handle.close()

    # 解析页面
    def parse_page(self):
        """Parse the list page currently loaded in ``self.driver``.

        Returns:
            tuple: ``(flag, lists)`` where ``flag`` is the first element
            matching the pagination-button XPath and ``lists`` is the output
            of ``xpath_paser``. When no such button is found the result is
            ``('', [])``.
        """
        self.logger.info('解析今日头条列表页')
        page = self.driver.page_source
        # Drop the keyword-highlight markers so titles come out as one text node.
        for marker in ('<em>', '</em>'):
            page = page.replace(marker, '')
        html = etree.HTML(page)
        lists = self.xpath_paser(html)
        try:
            buttons = html.xpath('//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]')
            flag = buttons[0]
        except Exception as e:
            # NOTE(review): a page without the button also discards the
            # entries already parsed from it -- confirm this is intended.
            return '', []
        return flag, lists


    def xpath_paser(self,html):
        lists=[]
        itemTags=html.xpath('//div[@class="cs-view cs-view-block cs-card-content"]')
        for itemTag in itemTags:
            html_str = etree.tostring(itemTag)
            try:
                title=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/text()')[0]
            except Exception as e:
                title=''
            if title=='':
                continue
            try:
                detailUrl=itemTag.xpath('.//a[@class="text-ellipsis text-underline-hover"]/@href')[0]
                id=self.get_reitemid(detailUrl)
                detailUrl=f'https://www.toutiao.com/article/{id}/?&source=m_redirect'
            except Exception as e:
                detailUrl=''
            try:
                sourceTag=itemTag.xpath('.//span[@class="d-flex align-items-center text-ellipsis margin-right-4"]//text()')[0]
            except Exception as e:
                sourceTag=''
            try:
                publishTag=itemTag.xpath('.//div[@class="cs-view cs-view-flex align-items-center flex-row cs-source-content"]/span[@class="text-ellipsis"]/text()')[0]
                publishTag=str(publishTag)
                publishtime=self.paserTime(publishTag)
                publishTag=publishtime.strftime("%Y-%m-%d %H:%M:%S")
            except Exception as e:
                publishTag=''
            detailmsg={
                'title':title,
                'detailUrl':detailUrl,
                'sourceTag':sourceTag,
                'publishTag':publishTag
            }
            lists.append(detailmsg)
        self.logger.info(f'列表获取信息的条数{len(lists)}')
        return lists

        #获取当前时间

    def getNowDate(self):
        # 获取当前时间
        current_time = datetime.datetime.now()
        # 将时间转换为字符串
        currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
        return currentdate

    #智能抽取
    def paserDetail(self,detailhtml,detailurl):
        """Run GeneralNewsExtractor over a detail page.

        Args:
            detailhtml: Raw HTML of the article page.
            detailurl: URL of the page, passed to the extractor as ``host``.

        Returns:
            The extractor's result, or ``{}`` when extraction fails.
        """
        try:
            extractor = GeneralNewsExtractor()
            return extractor.extract(detailhtml,host=detailurl,with_body_html=True)
        except Exception:
            # Best effort: a page that cannot be parsed yields an empty result.
            # (Was a bare ``except:``, which also swallowed KeyboardInterrupt.)
            return {}
    #解析时间
    def paserTime(self,publishtime):
        timeType=['年前','月前','周前','前天','昨天','天前','今天','小时前','分钟前']
        current_datetime = datetime.datetime.now()
        publishtime=publishtime.strip()
        print(publishtime)
        try:
            if '年前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(days=365 * day)
                publishtime = current_datetime - delta
            elif '月前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(months= day)
                publishtime = current_datetime - delta
            elif '周前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(weeks= day)
                publishtime = current_datetime - delta
            elif '天前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day=int(numbers[0])
                delta = datetime.timedelta(days= day)
                publishtime = current_datetime - delta
            elif '前天' in publishtime:
                delta = datetime.timedelta(days= 2)
                publishtime = current_datetime - delta
            elif '昨天' in publishtime:
                current_datetime = datetime.datetime.now()
                delta = datetime.timedelta(days= 1)
                publishtime = current_datetime - delta
            elif '今天' in publishtime or'小时前' in publishtime or '分钟前' in publishtime :
                delta = datetime.timedelta(hours= 5)
                publishtime = current_datetime - delta
            elif '年' in publishtime and '月' in publishtime :
                time_format = '%Y年%m月%d日'
                publishtime = datetime.datetime.strptime(publishtime, time_format)
            elif '月' in publishtime and '日' in publishtime :
                current_year = current_datetime.year
                time_format = '%Y年%m月%d日'
                publishtime=str(current_year)+'年'+publishtime
                publishtime = datetime.datetime.strptime(publishtime, time_format)
            elif '-' in publishtime:
                time_format = '%Y-%m-%d'
                publishtime = datetime.datetime.strptime(publishtime, time_format)
        except Exception as e:
            print('时间解析异常！！')
        return publishtime


    def reqHtml(self,url):
        """GET ``url`` with the Toutiao mobile-app request headers.

        Args:
            url: Address to fetch.

        Returns:
            str: Response body decoded as UTF-8, or ``''`` on any failure.
        """
        # NOTE(review): the cookie, device ids and signature headers below are
        # hard-coded captured values and TLS verification is disabled --
        # confirm both are acceptable for this deployment.
        request_headers={
            'Host':'search5-search-hl.toutiaoapi.com',
            'Connection':'keep-alive',
            'Cookie':'store-region=cn-ha; store-region-src=did; install_id=715108030093040; ttreq=1$142a2ea8b4ded7e0dc5e4085a9e18b099c71e711; passport_csrf_token=5bd0ecb22a060f1f0c0932c735a2a13d; passport_csrf_token_default=5bd0ecb22a060f1f0c0932c735a2a13d; odin_tt=ea866ee07058546f522aa5a30b4982a8ee5e25749fc2c9f4fa9745619a3ce73aed8deda6ec675e308c99ac3721f4621c; WIN_WH=360_592; PIXIEL_RATIO=3; FRM=new',
            'tt-tick-click':'1697767278469',
            'tt-enable-js-ext':'true',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'User-Agent':'Mozilla/5.0 (Linux; Android 7.1.2; VTR-AL00 Build/N2G47H; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/81.0.4044.117 Mobile Safari/537.36 JsSdk/2 NewsArticle/9.5.1 NetType/wifi',
            'Upgrade-Insecure-Requests':'1',
            'tt-flow-type':'1',
            'X-SS-REQ-TICKET':'1697767278718',
            'x-vc-bdturing-sdk-version':'3.5.0.cn',
            'x-tt-dt':'AAAUR64HZMJ24BRNLGOI3WYF36IOAE3WDWW5RTNUGW5XMR3S24FLN5ILRURRCSIDIS5ENZ43J2K5OOQJJUJUVTDAZNEDCUCFLASISSXB4O5NU5B2MD3ISSH7UFPFIDIBW76O6PUCQY3GZ34CZF5J4YI',
            'passport-sdk-version':'40650',
            'sdk-version':'2',
            'x-tt-request-tag':'n=1;n=1;s=-1;p=0',
            'x-tt-store-region':'cn-ha',
            'x-tt-store-region-src':'did',
            'X-SS-DP':'13',
            'x-tt-trace-id':'00-4ad0ca4309cdb953b3dd550a32f4000d-4ad0ca4309cdb953-01',
            'Referer':'http://nativeapp.toutiao.com/',
            'Accept-Encoding':'gzip, deflate',
            'X-Argus':'KHZwJccO8RFUIqNJ/EX0ymUiPivSYbrByFwa1el1x3fzCwmdtvtfOOTb6JZDclXHMsIXUO7lZRbs9Pyo84vnORF7UT3f4hOCiiLjT+SCM8eETtX9pAmz+6sKw0h19OhZMAT5MhGyvvX2NObiv2Avnujt19g5oof8KWbgVMc4jJO1r+Di99IZ4MDJJm6OyLAC5U2eVV5KnplpMO5AtCt7+pCJqha+bNc3rFneU9S1njW155hdZ2wVoc6vg1HNt5VZcK3Qtj/3sKhV+gUXflyLw+rl',
            'X-Gorgon':'8404a094000036a0eaee8b8c2540649078f55860dd5db65b55c5',
            'X-Helios':'l2NKLtVosKrLBwQtg/ThpLjDrQIigxP6jwYKhbI2NAK7fK2k',
            'X-Khronos':'1697767278',
            'X-Ladon':'WOfdV44iiU/2w5WK+XsuKpHuhNVBuJPnL2o+9tZGMcAS37rn',
            'X-Medusa':'bN8xZVizpAlbrWv+igssAKZnSEOd4poN/EvhkHRQPxmsXRwrXqOAPJs+fYDNdsfxcJEB34kVAFTm4hB+TxeLRRqwbKS8cfNVLcTHB26WUBYdz36455NMvA/5DIzOifTKJuxKP2E2i8BUX2JwkzEeIYzVRdzdEXDseszRvfWZTUpL1rnkS1+IoN8xQn7cmUEcKl2tn1zfECP5tp4x3v3c+tHVmiPGp9QhuhHT41C5DeavAgJANCuWd4gXV3PCyAtzGyv7AyxutplIntLWJRJySTh8POrHZbbG3x/CGPLkDdGC7pnXNc3aay3IilI4O85yktkYxbfU5yKWDz1zfds3bIIXH9tTE82U7pclJlBSYuPlGbOmqSSAIQHu+psHMDY+aH+hHOP8vvQww/vpxdPQLqhA4kLlJxc84ykQDieibhKDuCP711BdsOC2eLqsB1lsohv09kZp/ELW8W5AsWBK+jj1wlggPehpCaIlcAlGXk4Cm/X3flMO3u+VyrLSVfL1t6hYvPglcYR0Wg6yPgXOoJQHwXAeAkE3tsyZwK2tDOxa7LjnF3O+Pf1KRZohSN1sCxV3wQwUDw3wY/TqV+Rb/Cz2fcZXQfVTPDf8ZbhE/+UY/3quismg//wIDjOgT63lvt6gLPuJvjiuhmnHHIewJEb7YIG7InjTEJ2RSCkbqsTKj4U4cyhVG3oc66OywOuXAwglWmtmnZVlksbxEAHnGNfZqYHw+lRAdH5deKKAbFJmaTaIjahFGvkUbNpo7dSHetGae6IuqxNdbXI+P4WglVLCvjaTKJ6KQF/lbryau/lu3H/jmYQ5j4d+RmGrwxR9AoZhgShRddLvbG1n5og65drzo8PMijkUSbtr63LlCzoCcxW77pTSmrwDj5olu753p3kWZ3iq',
            'x-common-params-v2':'ab_feature=102749%2C94563&ab_group=94567%2C102752&ab_version=668776%2C7284540%2C668774%2C7284539%2C662176%2C7284532%2C668779%2C7284545%2C662099%2C7284498%2C660830%2C7284548%2C1859937%2C668775%2C4413279%2C7284550%2C7339127%2C7379048%2C6378074%2C7032529%2C7208866%2C7308647%2C6154373&ac=wifi&aid=13&app_name=news_article&cdid=7e885618-b3d3-4522-a8a4-fb6ad143c4a9&channel=huawei_13_64&client_vid=6784077%2C3383553%2C2827921%2C3194525&device_brand=HUAWEI&device_id=55223597885&device_platform=android&device_type=VTR-AL00&dpi=480&dq_param=1&host_abi=arm64-v8a&iid=715108030093040&immerse_pool_type=-2&language=zh&manifest_version_code=9510&os=android&os_api=25&os_version=7.1.2&plugin=0&resolution=1080*1776&rom_version=25&ssmix=a&update_version_code=95108&version_code=951&version_name=9.5.1',
        }
        try:
            response=requests.get(url,headers=request_headers,verify=False,timeout=10)
            response.encoding='utf-8'
            return response.text
        except Exception as e:
            return ''

    def get_realurl(self,tmpurl):
        """Extract the percent-decoded ``url=`` target from a redirect link.

        Args:
            tmpurl: Redirect URL containing ``url=<encoded target>&aid``.

        Returns:
            str: The decoded target, or ``''`` when the pattern is absent or
            ``tmpurl`` is not a string.
        """
        try:
            match = re.search(r'url=(.+?)&aid', tmpurl)
        except TypeError:
            # Non-string input (e.g. None) cannot contain a URL.
            return ''
        return unquote(match.group(1)) if match else ''

    def get_reitemid(self,tmpurl):
        """Extract the numeric article id from a ``.../a<digits>/`` link.

        The URL is percent-decoded first, so ids embedded in an encoded
        redirect parameter are found as well.

        Args:
            tmpurl: Link taken from a search-result card.

        Returns:
            str: The id digits, or ``''`` when no id is present or ``tmpurl``
            is not a string.
        """
        try:
            # Raw string: ``\d`` in a normal literal is an invalid escape.
            match = re.search(r'com/a(\d+?)/', unquote(tmpurl))
        except (TypeError, AttributeError):
            return ''
        return match.group(1) if match else ''


    def getFormatedate(self,timestamp):
        date = datetime.datetime.fromtimestamp(timestamp)
        formatted_date = date.strftime('%Y-%m-%d')
        return formatted_date

    # 获取每一页数据, 开趴.
    def get_page_html(self):
        """Crawl the search-result list pages and queue unseen articles.

        Opens the search URL, switches to the news ("资讯") tab, parses the
        first page and then follows the next-page button, stopping once
        ``self.page_num`` reaches 5. Every entry whose ``detailUrl`` is not
        yet in the Redis set ``pyjrttnews_<wordsCode>`` is put on
        ``self.detailList``.

        Raises:
            Exceptions from the initial page load, the wait for the result
            list and the tab click are not handled here and propagate.
        """
        next_button_xpath = '//a[@class="cs-view cs-view-inline-block cs-button cs-button-mb cs-button-default text-darker text-m radius-m text-center text-nowrap"]'
        max_pages = 5

        def queue_unseen(details):
            # Queue every entry whose own URL has not been processed before.
            for detail in details:
                if self.r.sismember('pyjrttnews_'+self.wordsCode, detail['detailUrl']):
                    continue
                self.detailList.put(detail)

        self.driver.get(self.url)
        wait = WebDriverWait(self.driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "s-result-list")))
        # Switch from the default tab to the news tab.
        self.driver.find_element('xpath', '//div[@class="cs-view pad-bottom-6 cs-view-flex align-items-center flex-row nav_7Dk46Y"]/div[1]/a[text()="资讯"]').click()
        time.sleep(2)
        self.logger.info("开始抓取首页...")
        try:
            flag, lists = self.parse_page()
        except Exception as e:
            time.sleep(5)
            return
        if len(lists)<1:
            return
        queue_unseen(lists)

        try:
            html = etree.HTML(self.driver.page_source)
            hasnext = html.xpath(next_button_xpath + '//text()')[0].strip()
        except Exception as e:
            # The button may have disappeared since parse_page looked at the
            # page; treat that as "no further pages" instead of crashing.
            hasnext = ''

        while '下一页' in hasnext:
            try:
                if self.page_num==max_pages:
                    break
                self.page_num = self.page_num + 1
                self.logger.info("开始抓取第%s页..." % self.page_num)
                try:
                    self.driver.find_element(By.XPATH, next_button_xpath).click()
                except Exception as e:
                    time.sleep(5)
                    continue
                time.sleep(5)
                flag, lists = self.parse_page()
                if len(lists)<1:
                    break
                # Each entry is checked against its own URL; the previous code
                # reused the last URL from the first page for every entry.
                queue_unseen(lists)
                try:
                    html = etree.HTML(self.driver.page_source)
                    # NOTE(review): this selector differs from the button
                    # XPath used for the first page; if it never matches, the
                    # loop ends after the second page -- confirm against the
                    # live page markup.
                    hasnext = html.xpath('//div[@id="page"]//a[last()]//text()')[0].strip()
                except Exception as e:
                    hasnext=''
            except Exception as e:
                time.sleep(5)
                break
        self.logger.info("抓取完毕")

    # 获取详情页
    def get_detail_html(self):
        # 获取当前窗口的句柄
        # current_window = self.driver.current_window_handle
        while True:
            if self.detailList.qsize() != 0:
                try:
                    detailmsg=self.detailList.get()
                    title = detailmsg['title']
                    detailUrl = detailmsg['detailUrl']
                    self.logger.info(f"解析详情页标题{title},对应地址{detailUrl}")
                    bdetail=self.getDetailmsg(detailmsg)
                    self.logger.info(f"解析详情页标题{title},获取的内容长度:{len(bdetail['content'])}")
                    processitem=self.getProcessitem(bdetail)
                    try:
                        self.sendkafka(processitem)
                        self.r.sadd('pyjrttnews_'+self.wordsCode, processitem['sourceAddress'])
                    except Exception as e:
                        self.logger.info("放入kafka失败！")
                    #插入数据库
                    try:
                        items=[]
                        items.append(bdetail)
                        self.itemInsertToTable(items)
                    except Exception as e:
                        self.logger.info("插入数据库失败！")
                    # 关闭当前新窗口
                    # self.driver.close()
                    time.sleep(1)
                except Exception as e:
                    time.sleep(3)
                    self.logger.info("详情页解析异常！"+detailUrl)
            else:
                break
                # time.sleep(5)

    def detailHtml(self,url):
        """GET ``url`` with desktop-Chrome request headers for www.toutiao.com.

        Args:
            url: Address of the article page.

        Returns:
            str: Response body decoded as UTF-8, or ``''`` on any failure.
        """
        # NOTE(review): the cookie below is a hard-coded captured session and
        # TLS verification is disabled -- confirm both are acceptable.
        browser_headers={
            'Host':'www.toutiao.com',
            'Connection':'keep-alive',
            'Cache-Control':'max-age=0',
            'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'sec-ch-ua-mobile':'?0',
            'sec-ch-ua-platform':'"Windows"',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Sec-Fetch-Site':'none',
            'Sec-Fetch-Mode':'navigate',
            'Sec-Fetch-User':'?1',
            'Sec-Fetch-Dest':'document',
            'Accept-Encoding':'gzip, deflate, br',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Cookie':'__ac_signature=_02B4Z6wo00f01SuLBAAAAIDBq4n-Qe9gRVkrrwCAAC.IVicycONgbbW4Hp1evBuF5zqe.dnjQEugXgwVyd-cpabxdL3lcaGCSJLRLtoGOCkqQ0IbU0NO3fW-0TKePlzULR8k5X1DEdqOUpR012; tt_webid=7289060865588020770; s_v_web_id=verify_lnn7ht77_cdKDYvaS_cmN9_4yyG_9pi3_XnOkAu8OxJTm; _ga=GA1.1.630929221.1697116750; local_city_cache=%E5%8C%97%E4%BA%AC; csrftoken=5c57e5aabc31ec2be5f6e40f904c90e8; _S_DPR=1; _S_IPAD=0; _S_WIN_WH=1366_619; msToken=3FJcx4aa5PuT2voafEIRXy1cMYZxEt9zwG9eMGR8srNIU5JajUVx2Ll5tyXJ1_-4Bcm7AfxV2Poyu72Xs2sn8ddk2xZ_a9HTzBFbIzkH; tt_scid=6uvTSx1f1NpCUjG--J4lV.Zb84w.xGWpBCJ.Xv4l-CWWm.CEODd3KFqFKpTFh-Gzdcc9; _ga_QEHZPBE5HH=GS1.1.1697771671.3.1.1697773330.0.0.0; ttwid=1%7CLK33CgR861OZhj9FEIYSUw5rJFx_KIgSNeQ_Wh6AoEM%7C1697773332%7C7d366a710996bbdf03cc49708454c67f8c50ba7864dbb17b544a76a1e06e7c7c'
        }
        try:
            response=requests.get(url,headers=browser_headers,verify=False,timeout=10)
            response.encoding='utf-8'
            return response.text
        except Exception as e:
            return ''
    #解析详情
    def getDetailmsg(self,detailmsg):
        """Build the full article record for one list-page entry.

        Args:
            detailmsg: Dict with the keys ``title``, ``detailUrl``,
                ``sourceTag`` and ``publishTag``. It is not modified.

        Returns:
            dict: Keys ``title``, ``source``, ``detailurl``, ``content``,
            ``contentHtml``, ``publishtime``, ``currentdate`` and ``kword``.
            ``content`` and ``contentHtml`` are ``''`` when extraction fails.

        Raises:
            KeyError: If a required key is missing from ``detailmsg``.
        """
        # Read the required fields up front so a missing key surfaces as a
        # KeyError rather than as an unbound local further down.
        detailurl=detailmsg['detailUrl']
        title=detailmsg['title']
        try:
            content,contentWithTag=self.extractorMsg(detailurl,title)
            contentWithTag=self.rmTagattr(contentWithTag,detailurl)
        except Exception as e:
            self.logger.info(f"详情页内容抽取失败！{e}")
            content=''
            contentWithTag=''

        return {
            'title':title,
            'source':detailmsg['sourceTag'],
            'detailurl':detailurl,
            'content':content,
            'contentHtml':contentWithTag,
            # Already formatted as a string by the list-page parser.
            'publishtime':detailmsg['publishTag'],
            'currentdate':self.getNowDate(),
            'kword':self.searchkw
        }

    def webDriver(self,url):
        """Load ``url`` in a throw-away Chrome instance and return its HTML.

        Args:
            url: Address to open.

        Returns:
            str: Page source after a fixed 2-second wait, or ``''`` when the
            request fails. The browser is always quit before returning.
        """
        chrome_driver =self.config.get('selenium', 'chrome_driver')
        path = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location =self.config.get('selenium', 'binary_location')
        # ``options=`` is the supported keyword; the older ``chrome_options=``
        # is deprecated in Selenium 4 and rejected by recent releases.
        driver = webdriver.Chrome(service=path,options=chrome_options)
        html=''
        try:
            driver.get(url)
            # Give the page time to finish loading.
            time.sleep(2)
            html=driver.page_source
        except Exception as e:
            self.logger.info('请求失败')
        finally:
            driver.quit()

        return html

    def createDriver(self):
        """Start a Chrome WebDriver configured from the ``[selenium]`` config.

        Returns:
            The new driver. The caller owns it and must call ``quit()``.
        """
        chrome_driver =self.config.get('selenium', 'chrome_driver')
        path =  Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        # Drop the "enable-automation" switch from Chrome's startup flags.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.binary_location =self.config.get('selenium', 'binary_location')
        # ``options=`` is the supported keyword; the older ``chrome_options=``
        # is deprecated in Selenium 4 and rejected by recent releases.
        driver = webdriver.Chrome(service=path,options=chrome_options)
        return driver

    def extractorMsg(self,url,title):
        """Fetch an article page in a browser and extract its body.

        The page is rendered in a dedicated Chrome instance. The first
        ``<article>`` element is used when it yields text; otherwise
        ``SmartExtractor`` is applied to the whole page.

        Args:
            url: Article URL.
            title: Article title; only used to detect the language handed to
                ``SmartExtractor``.

        Returns:
            tuple: ``(content, contentWithTag)`` -- plain text and HTML of
            the body. Both are ``''`` when the page cannot be loaded or
            nothing can be extracted.
        """
        content=''
        contentWithTag=''
        lang=self.detect_language(title)
        sm=SmartExtractor(lang)
        try:
            raw_html=''
            driver=None
            try:
                driver=self.createDriver()
                driver.get(url)
                # Wait up to 10 seconds for the article header to appear.
                wait = WebDriverWait(driver, 10)
                wait.until(EC.presence_of_element_located((By.CLASS_NAME, "article-meta")))
                raw_html=driver.page_source
            except Exception as e:
                raw_html=''
            finally:
                # Always shut the per-article browser down; otherwise every
                # processed article leaves a Chrome process behind.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception:
                        pass
            if raw_html:
                try:
                    soup = BeautifulSoup(raw_html, 'html.parser')
                    tdoc = soup.select('article')[0]
                    content=tdoc.text
                    contentWithTag=str(tdoc)
                except Exception as e:
                    self.logger.info("定位解析失败！")
                if content:
                    return  content,contentWithTag
                # No usable <article> element: fall back to generic extraction.
                article=sm.extract_by_html(raw_html)
                content=article.cleaned_text
                contentWithTag=article.text
        except Exception as e:
            self.logger.info("抽取解析失败！")

        return content,contentWithTag

    def detect_language(self,html):
        """Return the language code langid assigns to the text of ``html``.

        Markup is stripped with BeautifulSoup before classification.
        """
        plain_text = BeautifulSoup(html, "html.parser").get_text()
        language, _confidence = langid.classify(plain_text)
        return language

    def rmTagattr(self,html,url):
        """Strip every attribute except ``src`` from all tags in ``html``.

        Relative ``href``/``src`` values are first made absolute against
        ``url`` by ``paserUrl``.

        Returns:
            str: The cleaned document as produced by ``soup.prettify()``.
        """
        soup = self.paserUrl(html,url)
        # NOTE(review): ``href`` is removed as well, so links lose their
        # target even though paserUrl just resolved it -- confirm intended.
        for tag in soup.find_all(True):
            tag.attrs = {name: val for name, val in tag.attrs.items() if name == 'src'}
        return soup.prettify()

    # 将html中的相对地址转换成绝对地址
    def paserUrl(self,html,listurl):
        """Parse ``html`` and resolve link targets against ``listurl``.

        For every ``<a>`` and ``<img>`` tag the ``href`` attribute is made
        absolute when present; otherwise the ``src`` attribute is.

        Returns:
            The BeautifulSoup document with the rewritten attributes.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for node in soup.find_all(['a', 'img']):
            # Only the first attribute found is rewritten (href wins over src).
            for attr in ('href', 'src'):
                if attr in node.attrs:
                    node[attr] = urljoin(listurl, node[attr])
                    break

        return soup

    def getProcessitem(self,bdetail):
        """Convert an article record into the message published to Kafka.

        Args:
            bdetail: Dict with the keys ``title``, ``content``,
                ``contentHtml``, ``source``, ``publishtime`` and
                ``detailurl``.

        Returns:
            dict: Message with the keys ``sid``, ``source``, ``title``,
            ``content``, ``contentWithTag``, ``origin``, ``publishDate``,
            ``sourceAddress`` and ``createDate``.

        Raises:
            ValueError: If ``bdetail['content']`` is empty. The previous code
                failed on this case too, but with an accidental
                ``UnboundLocalError``.
        """
        if bdetail['content']=='':
            raise ValueError('article content is empty; nothing to publish')
        return {
            "sid":self.sid,
            "source":"21",
            "title":bdetail['title'],
            "content":bdetail['content'],
            "contentWithTag":bdetail['contentHtml'],
            "origin":bdetail['source'],
            "publishDate":bdetail['publishtime'],
            "sourceAddress":bdetail['detailurl'],
            "createDate":self.getNowDate()
        }

    def sendkafka(self,processitem):
        """Publish one item to the ``crawlerInfo`` Kafka topic.

        Items with an empty title, content or publish date are skipped
        without contacting Kafka. This is best effort: every failure is
        logged and swallowed, nothing is raised.

        Args:
            processitem: Dict with at least the keys ``title``, ``content``
                and ``publishDate``; it is sent as UTF-8 encoded JSON.
        """
        # Pre-bind so the cleanup below is safe when the producer cannot be
        # created (or is never needed).
        producer=None
        try:
            if processitem['title']=='' or processitem['content']=='' or str(processitem['publishDate'])=='':
                return
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
            kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
            # Block for the broker acknowledgement before reporting success.
            metadata = kafka_result.get(timeout=10)
            self.logger.info("数据发送kafka成功")
            self.logger.info(metadata)
        except Exception as e:
            self.logger.info(f'发送kafka异常{e}')
        finally:
            if producer is not None:
                producer.close()

    def run(self):
        """Crawl the search-result list pages.

        Runs ``get_page_html``, which fills ``self.detailList``. Processing
        of the queued detail pages (``get_detail_html``) is not started here.
        """
        # The original line referenced the method without calling it, so
        # run() did nothing at all.
        self.get_page_html()


if __name__ == '__main__':
    # Usage: python <script> <searchkw> <wordsCode> <sid>
    # The spider cannot be built without these three values.
    if len(sys.argv) != 4:
        sys.exit(f'usage: {sys.argv[0]} <searchkw> <wordsCode> <sid>')
    zhuce = JrttnewsSpider(*sys.argv[1:4])
    try:
        zhuce.run()
    finally:
        # Shut the list-page browser down even when the crawl fails.
        zhuce.driver.quit()