import json
import time

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# sys.path.append(r'F:\zzsn\zzsn_spider\base')
# import BaseCore
from retry import retry

from base import BaseCore

# Silence urllib3's InsecureRequestWarning: the HTTP requests in this file are
# issued with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Task label passed to baseCore.recordLog() when a collection outcome is recorded.
taskType = '企业基本信息/雅虎财经'

# Shared project helper; its logger and `r` attribute are reused by every
# function in this module.
baseCore = BaseCore.BaseCore()
# Used below via r.lpush(...) to push records back onto the work queue.
# NOTE(review): presumably a Redis client -- confirm in BaseCore.
r = baseCore.r
log = baseCore.getLogger()
# Browser-like headers sent with the HTTP requests made in this file.
# NOTE(review): 'accept-encoding' advertises 'br'; decoding Brotli responses
# presumably needs a Brotli package installed alongside requests -- confirm.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}


# Send a record to Kafka
@retry(delay=5)
def sendKafka(company_dict):
    """Publish ``company_dict`` as UTF-8 JSON to the ``enterpriseInfo`` topic.

    The call blocks until the broker acknowledges the message (up to 10
    seconds).  Any exception propagates to the ``@retry`` decorator, which
    waits 5 seconds and calls this function again; no ``tries`` limit is
    given, so it keeps retrying until a send succeeds.

    A fresh producer is created for every attempt, so it is always closed
    afterwards -- otherwise each call (and each retry) would leave the
    producer's connections and background thread behind.

    :param company_dict: JSON-serialisable mapping describing one company.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
    try:
        payload = json.dumps(company_dict, ensure_ascii=False).encode('utf8')
        # kafka_result = producer.send("regionInfo", payload)
        kafka_result = producer.send("enterpriseInfo", payload)
        # Wait for the broker acknowledgement so failures surface here.
        kafka_result.get(timeout=10)
    finally:
        # Bounded close so an unreachable broker cannot block the retry loop.
        producer.close(timeout=10)


# Save the basic company information
def saveBaseInfo(info, code, start):
    """Build the company payload and publish it through ``sendKafka``.

    :param info: mapping whose ``'base_info'`` entry holds the scraped fields.
    :param code: ``'|'``-separated record taken from the work queue; the
        positions read below supply the names, listing data and the
        fallback website/address.
    :param start: start timestamp used only for the elapsed-time log line.
    """
    parts = code.split('|')
    yname = parts[1]  # original name
    cname = parts[2]  # Chinese name
    oname = parts[4]  # former name
    shortname = parts[5]  # short name
    url = parts[7]  # website
    add = parts[8]  # address
    country = parts[16]  # country
    gpdm = parts[17]  # stock code
    gpjc = parts[18]  # stock short name
    category = parts[19]  # stock type
    jys = parts[20]  # exchange
    ipotime = parts[21]  # listing date

    scraped = info['base_info']
    # Prefer the scraped website/address; fall back to the values carried in
    # the queue record only when the scraped one is empty.
    site = scraped['公司网站']
    location = scraped['地址']
    if site == '' and url != '':
        site = url
    if location == '' and add != '':
        location = add

    # Payload sent to Kafka
    company_dict = {
        'originalName': yname,  # original company name
        'name': cname,  # Chinese company name
        'shortName': shortname,  # company short name
        'officialPhone': scraped['电话'],  # phone
        'officialUrl': site,  # official website
        'briefInfo': scraped['公司简介'],  # description
        'industry': scraped['行业'],  # industry
        'englishName': scraped['英文名'],  # English name
        'address': location,  # address
        'beforeName': oname,  # former name
        'ynDomestic': 0,  # domestic flag (1 = yes, 0 = no)
        'countryName': country,
        'securitiesCode': gpdm,  # stock code
        'securitiesShortName': gpjc,  # stock short name
        'listingDate': ipotime,  # listing date
        'category': category,  # stock type
        'exchange': jys,  # exchange
        'status': 0,  # status
    }
    sendKafka(company_dict)

    english_name = scraped['英文名']
    elapsed = baseCore.getTimeCost(start, time.time())
    log.info(f"保存基本信息--{english_name}--{gpdm}---耗时{elapsed}")


# Fetch a URL and return the response
@retry(tries=5, delay=3)
def getRes(url, timeout=30):
    """GET ``url`` with the module-level ``headers`` and return the response.

    Certificate verification is disabled (``verify=False``).  Any status
    other than 200 is treated as a failure.  Failures (including timeouts)
    raise, which lets ``@retry`` try again: up to 5 attempts, 3 seconds
    apart, after which the last exception propagates to the caller.

    :param url: address to fetch.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the worker indefinitely.
    :return: the ``requests.Response`` with status code 200.
    :raises requests.HTTPError: when the status code is not 200.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    if response.status_code != 200:
        status = response.status_code
        # Release the connection before the retry opens a new one.
        response.close()
        raise requests.HTTPError(f'unexpected HTTP status {status} for {url}', response=response)
    return response


# Fetch a company's basic information by stock code
def _toYahooSymbol(gpdm):
    """Convert a stock code into the symbol used in the Yahoo Finance URL."""
    symbol = str(gpdm)
    if 'HK' in symbol:
        # Codes whose part before the first '.' has 5 characters lose their
        # first character.
        if len(symbol.split('.')[0]) == 5:
            return symbol[1:]
        return symbol
    if symbol.endswith(('.N', '.O')):
        # Drop the two-character '.N' / '.O' suffix.
        return symbol[:-2]
    return symbol


def _tagText(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' if it is missing."""
    return tag.text.strip() if tag is not None else ''


def _extractAddress(paragraph):
    """Return the address lines of ``paragraph`` joined by spaces ('' on failure)."""
    try:
        # Links (phone / website) are removed so only the address text remains.
        # This mutates the parsed tree, so callers read the links first.
        for link in paragraph.select('a'):
            link.decompose()
        # Turn each <br/> into a paragraph break so every address line
        # becomes its own <p> element.
        fragment = BeautifulSoup(str(paragraph).replace('<br/>', '</p><p>'), 'html.parser')
        return ' '.join(p.text.strip() for p in fragment.select('p')).strip()
    except Exception:
        # Best effort: an unparsable address must not discard the other fields.
        return ''


def _parseProfile(page, gpdm):
    """Extract the profile fields from the profile container ``page``.

    Every field defaults to ''.  Two page layouts are recognised by their
    company-name heading; for the second one only the name and phone are
    read.  Raises ``AttributeError`` when neither heading is present.
    """
    address = phone = website = sector = industry = employees = summary = ''

    heading = page.find('h3', {'class': 'Fz(m) Mb(10px)'})
    if heading is not None:
        name = heading.text.strip()
        com_info = page.find('div', {'class': 'Mb(25px)'})
        paragraphs = com_info.find_all('p') if com_info is not None else []
        if paragraphs:
            contact = paragraphs[0]
            # Read the links before _extractAddress removes them from the tree.
            phone = _tagText(contact.find('a'))
            website = _tagText(contact.find('a', {'target': '_blank'}))
            address = _extractAddress(contact)
        if len(paragraphs) > 1:
            # The bold spans are read positionally:
            # department, industry, employee count.
            labels = [_tagText(span) for span in paragraphs[1].find_all('span', {'class': 'Fw(600)'})]
            sector, industry, employees = (labels + ['', '', ''])[:3]
        summary = _tagText(page.find('p', {'class': 'Mt(15px) Lh(1.6)'}))
    else:
        # Alternative layout: a missing heading raises AttributeError here and
        # is reported by the caller as a parse failure.
        name = page.find('h3', {'class': 'Mb(5px) Mend(40px)'}).text.strip()
        phone = _tagText(page.find('span', class_='D(b) Lh(21px) Mb(20px) C($linkColor)'))

    return {
        '英文名': name,
        '股票代码': gpdm,
        '地址': address,
        '电话': phone,
        '公司网站': website,
        '部门': sector,
        '行业': industry,
        '员工人数': employees,
        '公司简介': summary
    }


def getInfo(code, gpdm, start):
    """Scrape the Yahoo Finance profile page for one stock code.

    :param code: raw queue record; pushed back onto the
        ``BaseInfoEnterprise:gwqy_socialCode`` list when the page could not
        be fetched so that it is retried later.
    :param gpdm: stock code of the company.
    :param start: start timestamp used for elapsed-time logging.
    :return: ``(state, retData, exeception)`` where ``state`` is 1 on
        success, 0 when no usable information was found and -1 when the page
        could not be accessed; ``retData['base_info']`` holds the scraped
        fields on success; ``exeception`` is the failure reason ('' on
        success).
    """
    gpdm_ = _toYahooSymbol(gpdm)
    retData = {}
    url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'

    # Fixed pause before every request (presumably rate limiting).
    time.sleep(3)
    try:
        response = getRes(url)
    except Exception:
        log.error(f"{gpdm}------访问基本信息页面失败")
        exeception = '访问基本信息页面失败'
        state = -1
        takeTime = baseCore.getTimeCost(start, time.time())
        baseCore.recordLog('', taskType, state, takeTime, url, exeception)
        r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
        return state, retData, exeception

    try:
        # A final URL containing 'lookup' is treated as "unknown stock code".
        if 'lookup' in response.url:
            log.error(f"{gpdm}------股票代码未查询到信息：{response.status_code}")
            exeception = '股票代码未查询到信息'
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog('', taskType, 0, takeTime, url, exeception)
            return state, retData, exeception

        # Any other change of the final URL is treated as a failed request
        # and the record is queued again.
        if url != response.url:
            log.error(f'{gpdm}------请求失败')
            exeception = '请求失败'
            state = -1
            r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
            return state, retData, exeception

        soup = BeautifulSoup(response.content, 'html.parser')
        page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
        # A missing container is handled like an empty one.
        if page is None or page.text == '':
            return 0, retData, '无基本信息'

        try:
            dic_com_info = _parseProfile(page, gpdm)
        except Exception:
            return 0, retData, '其它错误原因'

        retData['base_info'] = dic_com_info
        log.info(f"获取基本信息--{gpdm}，耗时{baseCore.getTimeCost(start, time.time())}")
        return 1, retData, ''
    finally:
        # Release the connection on every exit path, not only on success.
        response.close()


# Collection worker
def _writeCollectionReport(data_true, data_false):
    """Write the success/failure lists of one batch to an Excel workbook.

    Callers must pass at least one non-empty list.  The ``with`` block saves
    and closes the workbook on exit.
    """
    nowtime = baseCore.getNowTime(1).replace('-', '')[:10]
    with pd.ExcelWriter(f'./企业基本信息采集情况_{nowtime}.xlsx') as writer:
        # Failed collections
        if data_false:
            df_f = pd.DataFrame(data_false, columns=['企业名称', '股票代码', '失败原因'])
            df_f.to_excel(writer, sheet_name='采集失败', index=False)
        # Successful collections
        if data_true:
            df_t = pd.DataFrame(data_true, columns=['企业名称', '股票代码'])
            df_t.to_excel(writer, sheet_name='采集成功', index=False)


def _mismatchReason(ename, url, add, base_info):
    """Return why the scraped data contradicts the queue record, or None.

    The name must match exactly.  Website and address are only compared when
    both the expected and the scraped value are non-empty, and then the
    expected value must be contained in the scraped one.
    """
    if base_info['英文名'] != ename:
        return '采集到企业名称与所给企业名称不同'
    url_ = base_info['公司网站']
    if url not in url_ and url_ != '' and url != '':
        return '采集到企业网址与所给企业网址不同'
    add_ = base_info['地址']
    if add not in add_ and add_ != '' and add != '':
        return '采集到企业地址与所给企业地址不同'
    return None


def beginWork():
    """Run the collection loop: pull records, scrape, validate and publish.

    Records are pulled from ``BaseInfoEnterprise:gwqy_socialCode``.  The
    special record ``'end'`` marks the end of a batch: the collected
    success/failure lists are written to an Excel report, an e-mail is sent
    through ``baseCore.sendEmail()`` and the lists are reset.  An empty pull
    makes the worker sleep 20 seconds.  The loop never ends on its own;
    ``baseCore.close()`` runs when it is interrupted or fails.
    """
    data_false = []
    data_true = []
    try:
        while True:
            code = baseCore.redicPullData('BaseInfoEnterprise:gwqy_socialCode')
            # 'end' marks that a batch of newly added companies is finished
            # and the collection result has to be reported.
            if code == 'end':
                if data_true or data_false:
                    _writeCollectionReport(data_true, data_false)
                    # Send the notification e-mail
                    baseCore.sendEmail()
                # Reset the success and failure lists for the next batch.
                data_false = []
                data_true = []
                continue
            if not code or code == 'None':
                time.sleep(20)
                continue

            # Reference values carried by the queue record
            parts = code.split('|')
            try:
                ename = parts[3].strip()  # English name
                url = parts[7].strip()  # website
                add = parts[8].strip()  # address
                gpdm = parts[17].strip()  # stock code
            except IndexError:
                # A malformed record must not bring the whole worker down.
                log.error(f'队列记录格式错误，已跳过：{code}')
                continue
            # endswith() is safe for an empty website, unlike url[-1].
            if url.endswith('/'):
                url = url[:-1]

            log.info(f'==={gpdm}===开始采集基本信息')
            start_time = time.time()
            try:
                state, retData, exeception = getInfo(code, gpdm, start_time)
                # Only successfully scraped data is published.
                # state: 1 success, 0 no basic information, -1 page access failed
                if state == 1:
                    try:
                        reason = _mismatchReason(ename, url, add, retData['base_info'])
                        if reason is not None:
                            data_false.append([ename, gpdm, reason])
                            log.error(f'{gpdm}===采集失败')
                        else:
                            saveBaseInfo(retData, code, start_time)
                            time.sleep(3)
                            data_true.append([ename, gpdm])
                    except Exception as e:
                        # Queue the record again so it is retried later.
                        r.lpush('BaseInfoEnterprise:gwqy_socialCode', code)
                        log.error(f'{ename}....企业基本信息Kafka操作失败')
                        exception = 'Kafka操作失败'
                        log.error(e)
                        state = 0
                        takeTime = baseCore.getTimeCost(start_time, time.time())
                        baseCore.recordLog('', taskType, state, takeTime, '', exception)
                        time.sleep(3)
                elif state == 0:
                    data_false.append([ename, gpdm, exeception])
                    time.sleep(3)
            except Exception as e:
                data_false.append([ename, gpdm, '其它错误原因'])
                log.error(f'{gpdm}===信息采集错误')
                log.error(e)
                time.sleep(3)
    finally:
        # Release resources; reached only when the loop is interrupted or fails.
        baseCore.close()


if __name__ == '__main__':
    # beginWork()
    # Ad-hoc probe: fetch one hard-coded profile page and print the text of
    # its profile container.  The worker loop above is currently disabled.
    url = 'https://finance.yahoo.com/quote/501057.SS/profile?p=501057.SS'
    req = requests.get(url, verify=False, headers=headers)
    # Decode the body with the encoding detected from its content.
    req.encoding = req.apparent_encoding
    soup = BeautifulSoup(req.text, 'html.parser')
    page = soup.find('div', attrs={'id': 'Col1-0-Profile-Proxy'})
    print(page.text)

