# -*- coding: utf-8 -*-
import json
import re
import threading
import time
import uuid
from urllib.parse import urljoin

import fitz
import redis
import requests
from bs4 import BeautifulSoup
from obs import ObsClient
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Project helper object; it supplies the logger and the connection used below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# NOTE(review): baseCore is instantiated a second time here, rebinding the name
# after `log` was taken from the first instance — confirm whether the duplicate
# construction is intentional.
baseCore = BaseCore.BaseCore()

# Connection object exposed by baseCore and a cursor opened on it
# (presumably a database connection — neither is used in the visible code).
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()

# Module-level lock; it is not acquired anywhere in the visible code.
lock = threading.Lock()


# Redis connection pool (db 6) from which main() builds its client.
# NOTE(review): host and password are hard-coded; consider moving them to configuration.
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)

class EsMethod(object):
    """Small wrapper around an Elasticsearch client bound to one default index."""

    def __init__(self):
        # Create the Elasticsearch client, supplying the account credentials.
        hosts = ['http://114.116.19.92:9700']
        credentials = ('elastic', 'zzsn9988')
        self.es = Elasticsearch(hosts, http_auth=credentials, timeout=300)
        self.index_name = 'researchreportdata'

    def queryatt(self, index_name, id):
        """Search ``index_name`` for documents whose ``id`` field matches ``id``.

        Only the document id, the total hit count and the ``title``,
        ``socialCreditCode`` and ``sourceAddress`` source fields are requested.

        Returns:
            Whatever ``self.es.search`` returns for the query.
        """
        # Response fields to keep; everything else is filtered out by the server.
        wanted_fields = [
            'hits.hits._id',
            'hits.total.value',
            'hits.hits._source.title',
            'hits.hits._source.socialCreditCode',
            'hits.hits._source.sourceAddress',
        ]
        match_by_id = {"query": {"match": {"id": id}}}
        return self.es.search(
            index=index_name,
            doc_type='_doc',
            filter_path=wanted_fields,
            body=match_by_id,
        )

    def updateaunn(self, index_name, id, content, contentWithTag):
        """Partially update document ``id`` with new ``content`` / ``contentWithTag``.

        The result returned by ``self.es.update`` is written to the log.
        """
        partial_doc = {
            'doc': {
                'content': content,
                'contentWithTag': contentWithTag,
            },
        }
        outcome = self.es.update(index=index_name, id=id, body=partial_doc)
        log.info('更新结果:%s' % outcome)

def paserUrl(html,listurl):
    """Walk every ``<a>`` and ``<img>`` tag of a parsed document and return it.

    The relative-to-absolute URL rewriting this helper was written for is
    currently disabled, so the document is returned unmodified; the tag count
    and each tag are printed as debugging output.

    Args:
        html: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup tree).
        listurl: Base URL; unused while the rewriting is disabled.

    Returns:
        The same ``html`` object that was passed in.
    """
    # Collect all anchor and image tags.
    tags = html.find_all(['a', 'img'])
    print(len(tags))
    for tag in tags:
        print(tag)
        if 'href' in tag.attrs:
            # href rewriting via urljoin(listurl, ...) is disabled.
            continue
        if 'src' in tag.attrs:
            # src rewriting via urljoin(listurl, ...) is disabled.
            continue
    return html

def get_news(news_url,sourceAddress,id):
    """Download a page and store its text and markup on an Elasticsearch document.

    The page at ``news_url`` is fetched with browser-like headers, parsed with
    BeautifulSoup (``lxml`` parser), and the stripped text plus the full markup
    are written to the document via the module-level ``esMethod`` instance
    (created by the script entry point).

    Args:
        news_url: URL of the page to download.
        sourceAddress: Currently unused; kept so existing callers keep working.
        id: Identifier of the Elasticsearch document to update (passed as ``str``).

    Returns:
        None. When the server does not answer with HTTP 200 the failure is
        logged and the document is left untouched.

    Raises:
        requests.RequestException: On connection errors or when the request
            exceeds the timeout.
    """
    header = {
        'Host': 'www.sec.gov',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': '_gid=GA1.2.385814648.1694135927; _ga_300V1CHKH1=GS1.1.1694135927.6.1.1694136598.0.0.0; _ga=GA1.1.733439486.1693211261; _4c_=%7B%22_4c_s_%22%3A%22dZJbj9owEIX%2FCvJDngj4EowTKaqqVKq20vbe7SMK9pBYC3HkGLwU8d9rQ%2Bh2V61fEn9z5vjInhPyLXSoIDzPCOMcYyHwFD3CcUDFCVmt4ueACqRqlinOcMprxtOsZos0ZwpSIYQUQi0WFDCaoqfgtcQ4F0vKCRX0PEWqu3lYUDDopnupE5xSHnS6d6MwpGEsx8Ez4%2BKmJYTzK4nam2WN%2Flm3%2FmZ1Kyxyxl9KIwnS3r4%2B9b9S2Y%2FSE5JGQTie5DMiZjjdDCGH%2BxVIJuI19NaovXQrd%2ByjzMN6MqjHUFBw0BJWXivXXvopfqYt6KZ1EeOLi4rZEAl%2FXnfK%2BNdtI%2F3TlrOoXVvjB4idVWvNDiaELAI24UXRz0tHDGthA9ZeZK1z%2FVDM59772QBy1pjDXDY6XetufjVLQTW1fSPNrq%2B7Y%2Fnh832yq51sy8HV1g2p165NNnoL3X5XJt9c7aBMKrPvnD2G%2FV1VJruj8R3YEp7kdq8gqaXTpisbcKNryDRoF29rzDCCMItXll7Zg45UTb5XXwP%2F%2BBf5Un26H9H7t6sfd%2B%2FCZslYxvJM8Fl8XkpIGEt0vr5umHlKaR5WFqbMuS0qBM9wXOfz%2BTc%3D%22%7D'
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url=news_url, headers=header, verify=False, timeout=30)
    # Same total pause as before (2 s + 2 s); presumably throttles requests to
    # the remote site. It now also applies after a failed request.
    time.sleep(4)
    if response.status_code != 200:
        # Request failed: log it and skip the update. Previously the empty
        # placeholder was dereferenced, raising AttributeError, and the log
        # call had arguments but no format placeholders.
        log.info('请求失败:%s %s', response.status_code, response.text)
        return
    page = BeautifulSoup(response.text, 'lxml')
    content = page.text.strip()
    esMethod.updateaunn(esMethod.index_name, str(id), content, str(page))

def main(esMethod):
    """Process one queued document id from Redis.

    Pops an id from the ``NianbaoUS:id`` Redis list, looks the document up in
    Elasticsearch and hands its ``sourceAddress`` to ``get_news`` so the
    document content gets refreshed.

    Args:
        esMethod: ``EsMethod`` instance used for the lookup.

    Returns:
        False when the Redis list is empty (nothing left to do), True otherwise —
        including when the popped id could not be processed and was skipped.
    """
    redis_conn = redis.Redis(connection_pool=pool)
    raw_id = redis_conn.lpop('NianbaoUS:id')
    if not raw_id:
        log.info('已无数据')
        return False

    doc_id = raw_id.decode()
    found = esMethod.queryatt(index_name=esMethod.index_name, id=doc_id)
    try:
        source = found['hits']['hits'][0]['_source']
    except (KeyError, IndexError):
        # filter_path can drop the 'hits' list entirely when nothing matches,
        # so both a missing key and an empty list mean "no such document".
        log.info(f'No Elasticsearch document found for id {doc_id}, skipped')
        return True

    title = source.get('title')
    social_code = source.get('socialCreditCode')
    log.info(f'====={title}=={social_code}===正在更新===')

    sourceAddress = source.get('sourceAddress')
    if not sourceAddress:
        log.info(f'Document {doc_id} has no sourceAddress, skipped')
        return True

    get_news(sourceAddress, sourceAddress, doc_id)
    return True


def run_threads(num_threads,esMethod):
    """Run ``main(esMethod)`` once in each of ``num_threads`` threads and wait for all.

    Args:
        num_threads: Number of worker threads to start.
        esMethod: Object forwarded to ``main`` as its only argument.
    """
    def _spawn():
        # Create a worker and start it right away, as each iteration must.
        worker = threading.Thread(target=main, args=(esMethod,))
        worker.start()
        return worker

    workers = [_spawn() for _ in range(num_threads)]

    # Block until every worker has finished.
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    # Build the Elasticsearch wrapper once instead of on every iteration;
    # get_news() reads this module-level name, so it must stay `esMethod`.
    esMethod = EsMethod()
    # Process queued ids one at a time until main() reports the queue is empty.
    # A multi-threaded variant is available via run_threads(num_threads, esMethod).
    while main(esMethod):
        pass