# -*- coding: utf-8 -*-
import json
import re
import time
import datetime

import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

import urllib3
from selenium.webdriver.support.wait import WebDriverWait
# Handle to the '天眼查登录信息' collection of the ZZSN MongoDB database.
# It is not referenced anywhere else in the visible part of this file.
# NOTE(review): host, user name and password are hard-coded in source --
# consider moving them to configuration / environment variables.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '天眼查登录信息']
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
# Make the project-local BaseCore module importable (machine-specific absolute path).
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Shared project helper; this file uses it for logging, redis access (baseCore.r),
# database connections and task bookkeeping.
baseCore = BaseCore.BaseCore()
# NOTE(review): the trailing underscores are crossed on purpose or by accident --
# the local `cnx_`/`cursor_` come from baseCore.cnx/cursor, while the local
# `cnx`/`cursor` come from baseCore.cnx_/cursor_. Which database each one
# points at cannot be told from this file; confirm against BaseCore.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor

cnx = baseCore.cnx_
cursor = baseCore.cursor_
log = baseCore.getLogger()

from classtool import Token, File, Tag
# Project helpers from classtool. Only `file` is used by live code in this
# file (spreadsheet create/append/delete); `token` and `tag` appear only in
# commented-out code here.
token = Token()
file = File()
tag = Tag()
from selenium import webdriver
from selenium.webdriver.common.by import By
def create_driver():
    """Start and return a Microsoft Edge WebDriver session.

    The driver binary is taken from a fixed local path and the browser is
    asked to start with a maximized window.
    """
    # "--start-maximized" makes the browser window open maximized.
    edge_arguments = ["--start-maximized"]
    edge_capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {"extensions": [], "args": edge_arguments},
    }
    return webdriver.Edge(
        executable_path=r'D:\soft\msedgedriver.exe',
        capabilities=edge_capabilities,
    )

# Send data to Kafka
def sendkafka(post_data):
    """Publish ``post_data`` as UTF-8 JSON to the ``enterpriseInfo`` Kafka topic.

    A producer is created for the call, the send is awaited for up to ten
    seconds, and the producer is always closed afterwards so its network
    resources are not leaked across calls.

    Failures are not raised to the caller: they are recorded through
    ``baseCore.recordLog`` and the logger instead. The failure path reads the
    module globals ``start_time``, ``social_code``, ``taskType`` and
    ``com_name`` that the main loop sets.

    :param post_data: JSON-serialisable object to publish.
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
        payload = json.dumps(post_data, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("enterpriseInfo", payload)
        # Block until the broker acknowledges (or the timeout elapses).
        print(kafka_result.get(timeout=10))
    except Exception as e:
        # Best-effort delivery: keep the crawler running, but leave a trace
        # of what actually went wrong.
        exception = 'kafka传输失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        log.info(f"{com_name}--{social_code}--kafka传输失败")
        log.info(f"{com_name}--{social_code}--kafka error detail: {e!r}")
    finally:
        # The producer may not exist if its constructor raised.
        if producer is not None:
            producer.close()

# Check login state
def checklogin(key):
    """Open the Tianyancha search page for ``key`` and return it parsed.

    The module-level ``driver`` loads the search URL (the current Unix time
    is used as ``sessionNo``), the function waits two seconds, and the page
    source is parsed with BeautifulSoup's ``html.parser``.

    :param key: search keyword placed in the URL as-is.
    :return: BeautifulSoup object for the loaded page.
    """
    session_no = int(time.time())
    search_url = f'https://www.tianyancha.com/search?key={key}&sessionNo={session_no}'
    driver.get(search_url)
    time.sleep(2)

    # TODO: detect the not-logged-in state here and return an empty value
    # for it; at the moment the parsed page is returned unconditionally.
    return BeautifulSoup(driver.page_source, 'html.parser')

# Collection preparation
def redaytowork(com_name, social_code, file_name):
    """Search for one enterprise and, if it is found, trigger collection.

    The search uses ``social_code`` when it is non-empty and ``com_name``
    otherwise. When the result page cannot be read, or collection raises,
    the current queue entry (module global ``company_field``) is pushed back
    onto the redis list so it is retried later. When the page reports zero
    results, the enterprise is appended to the '需处理企业' sheet instead.

    Also reads the module global ``start_time`` for the success log line.

    :param com_name: enterprise name from the queue entry.
    :param social_code: unified social credit code (may be empty).
    :param file_name: spreadsheet path used for bookkeeping.
    :return: 1 when ``spiderwork`` reported success, otherwise 0.
    """
    log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
    count = 0
    # Prefer the credit code as the search key; fall back to the name.
    soup = checklogin(social_code if social_code else com_name)
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
        time.sleep(20)
        return count

    try:
        # find() returns None for a missing node, so a page without the
        # result counter surfaces here as AttributeError.
        searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text.strip()
    except AttributeError:
        log.info("登录失效===重新放入redis")
        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
        log.info('=====已重新放入redis,cookies已封号======')
        time.sleep(20)
        return count

    if searchinfo == '0':
        log.info('=====搜索不到该企业====')
        # TODO: enterprises that cannot be found are reported in a sheet.
        file.appenddata(file_name, '需处理企业', [com_name, social_code])
        return count

    # Start collecting.
    try:
        if spiderwork(soup, com_name, file_name):
            count += 1
            log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
    except Exception as e:
        # Deliberately broad: any failure while scraping means "retry later".
        # The cause is logged so repeated re-queues can be diagnosed.
        log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
        log.info(f'====={social_code}=====exception detail: {e!r}')
        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
        log.info('=====已重新放入redis,cookies已封号======')
    return count

def ifbeforename(company_url):
    """Load a company detail page and return the former name shown on it.

    The page is opened with the module-level ``driver`` and parsed after a
    two-second wait. A name is only returned when the business-info table
    contains both the history tag span and the copy-text span; the copy-text
    content is then stripped of UI noise ('展开', '复制', spaces, '…',
    newlines) and cut at the first full-width opening parenthesis.

    :param company_url: URL of the company detail page.
    :return: the cleaned name, or '' when the table or either span is missing.
    """
    driver.get(company_url)
    time.sleep(2)
    com_soup = BeautifulSoup(driver.page_source, 'html.parser')

    businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
    if not businessinfo:
        return ''

    # The history tag's text itself is not used; its presence is what marks
    # the entry as carrying a former name.
    history_tag = businessinfo.find('span', class_='index_history-gray-tags__o8mkl')
    copy_text = businessinfo.find('span', class_='index_copy-text__ri7W6')
    if history_tag is None or copy_text is None:
        return ''

    value = copy_text.text
    for noise in ('展开', ' ', '…', '\n', '复制'):
        value = value.replace(noise, '')
    return value.split('（')[0]

# Parse time
def paserTime(publishtime):
    """Convert a relative or Chinese-formatted date string to a datetime.

    Recognised forms, checked in this order:

    * ``N年前`` / ``N月前`` / ``N周前`` / ``N天前`` -- N years (365 days each),
      months (30 days each), weeks or days before now;
    * ``前天`` / ``昨天`` -- two days / one day before now;
    * ``今天``, ``N小时前``, ``N分钟前``, ``N小时M分钟前`` -- that many hours and
      minutes before now (``今天`` without numbers means "now");
    * ``YYYY年MM月DD日`` -- absolute date;
    * ``MM月DD日`` -- absolute date in the current year.

    :param publishtime: text to parse; surrounding whitespace is ignored.
    :return: a naive ``datetime.datetime`` on success. When the text is not
        recognised or cannot be parsed, the stripped input string is returned
        unchanged, so callers must check the type before formatting.
    """
    current_datetime = datetime.datetime.now()
    publishtime = publishtime.strip()

    def first_number(text):
        # Raises IndexError when the text holds no digits; handled below.
        return int(re.findall(r'\d+', text)[0])

    try:
        if '年前' in publishtime:
            return current_datetime - datetime.timedelta(days=365 * first_number(publishtime))
        if '月前' in publishtime:
            # timedelta has no "months" unit; approximate a month as 30 days,
            # in line with the 365-day year used above.
            return current_datetime - datetime.timedelta(days=30 * first_number(publishtime))
        if '周前' in publishtime:
            return current_datetime - datetime.timedelta(weeks=first_number(publishtime))
        if '天前' in publishtime:
            return current_datetime - datetime.timedelta(days=first_number(publishtime))
        if '前天' in publishtime:
            return current_datetime - datetime.timedelta(days=2)
        if '昨天' in publishtime:
            return current_datetime - datetime.timedelta(days=1)
        if '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
            hour_match = re.search(r'(\d+)\s*小时', publishtime)
            minute_match = re.search(r'(\d+)\s*分钟', publishtime)
            if hour_match is None and minute_match is None and '今天' not in publishtime:
                # e.g. a non-numeric amount such as "半小时前": treat as
                # unparseable rather than silently returning "now".
                raise ValueError(publishtime)
            hours = int(hour_match.group(1)) if hour_match else 0
            minutes = int(minute_match.group(1)) if minute_match else 0
            return current_datetime - datetime.timedelta(hours=hours, minutes=minutes)
        if '年' in publishtime and '月' in publishtime:
            return datetime.datetime.strptime(publishtime, '%Y年%m月%d日')
        if '月' in publishtime and '日' in publishtime:
            return datetime.datetime.strptime(f'{current_datetime.year}年{publishtime}', '%Y年%m月%d日')
    except (IndexError, ValueError, OverflowError):
        print('时间解析异常！！')
    return publishtime

# Collect basic information and business-registration information
def spiderinfo(company_url, receptname, file_name):
    """Read a company's "last updated" time and post it to the check service.

    The detail page is loaded with the module-level ``driver``. The refresh
    text is used as-is when it contains a ``YYYY-MM-DD`` date; otherwise it is
    converted with ``paserTime``. The resulting record is POSTed as JSON.
    On HTTP 200 the module global ``data`` is appended to the
    '获取基本信息成功企业' sheet; on any other status, or on a network error,
    the current queue entry (module global ``company_field``) is pushed back
    onto the redis list.

    Also reads the module global ``social_code``.

    :param company_url: detail page URL; the part after ``company/`` is sent
        as ``qccId``.
    :param receptname: enterprise name to report.
    :param file_name: spreadsheet path used for bookkeeping.
    :return: None in every case.
    """
    qccid = company_url.split('company/')[1]
    log.info(f'====={qccid}=====')
    driver.get(company_url)
    com_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # TODO: the update time is not available through a plain HTTP request;
    # it has to be read from the browser-rendered page.
    try:
        # find() returns None for a missing node -> AttributeError.
        sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
    except AttributeError:
        log.info(f'天眼查无该企业{social_code}')
        return

    if re.search(r'\d{4}-\d{2}-\d{2}', sourceUpdateTime_):
        sourceUpdateTime = sourceUpdateTime_
    else:
        parsed_time = paserTime(sourceUpdateTime_)
        if not isinstance(parsed_time, datetime.datetime):
            # paserTime hands the text back unchanged when it cannot parse it.
            log.info(f'====={social_code}=====cannot parse update time: {sourceUpdateTime_!r}')
            return
        sourceUpdateTime = parsed_time.strftime("%Y-%m-%d %H:%M:%S")

    aa_dict = {
        'name': receptname,  # enterprise name
        'shortName': None,  # enterprise short name
        'socialCreditCode': social_code,  # unified social credit code
        'sourceUpdateTime': sourceUpdateTime,
        'qccId': qccid
    }
    print(aa_dict)

    header = {
        'Content-Type': 'application/json',
    }
    post_url = 'http://114.115.236.206:8088/enterprise/check/judge'
    try:
        # A timeout keeps a stalled service from blocking the crawler forever.
        req = requests.post(post_url, data=json.dumps(aa_dict), headers=header, timeout=30)
        delivered = req.status_code == 200
    except requests.RequestException as e:
        log.info(f'====={social_code}=====request error: {e!r}')
        req = None
        delivered = False

    if delivered:
        # NOTE(review): `data` is the module global assigned in the main loop
        # (a database row there), not a value built in this function --
        # confirm this is what the sheet is meant to receive.
        file.appenddata(file_name, '获取基本信息成功企业', data)
        print(req.text)
    else:
        log.info(f'====={social_code}=====发送数据失败，重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)

def remove_parentheses(text):
    """Return ``text`` without parentheses and without space characters.

    Both full-width (``（ ）``) and ASCII (``( )``) parentheses are dropped;
    only the bracket characters are removed, their contents are kept.
    """
    # One character class covers the full-width and the ASCII brackets.
    without_brackets = re.sub(r'[（）()]', '', text)
    return ''.join(without_brackets.split(' '))

# Decide whether the searched name matches a result
def spiderwork(soup, receptname, file_name):
    """Pick the matching company from a search result page and collect it.

    Each result card is compared with ``receptname``: an exact match, an empty
    ``receptname`` (first card wins), or a match after converting the found
    name with ``baseCore.hant_2_hans`` and stripping parentheses/spaces on
    both sides. If no card matches, the first card is treated as a possible
    former-name hit and checked with ``ifbeforename``.

    When nothing matches, the module globals ``com_name`` and ``social_code``
    are appended to the '需处理企业' sheet.

    :param soup: parsed search result page.
    :param receptname: enterprise name received from the queue (may be empty).
    :param file_name: spreadsheet path used for bookkeeping.
    :return: True when ``spiderinfo`` was called for a company (its outcome is
        not inspected), False otherwise.
    """
    try:
        result_cards = soup.find_all('div', class_='index_search-box__7YVh6')
    except:
        log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
        log.info('=====已重新放入redis,cookies已封号======')
        return False

    detail_url = ''
    for card in result_cards:
        name_box = card.find('div', class_='index_name__qEdWi')
        found_name = name_box.find('span').text
        log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{found_name}')
        # No name to compare against, or an exact match: take this card.
        if not receptname or found_name == receptname:
            detail_url = name_box.find('a')['href']
            break
        # Otherwise compare the normalised forms of both names.
        simplified_name = remove_parentheses(baseCore.hant_2_hans(found_name))
        if remove_parentheses(receptname) == simplified_name:
            log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{simplified_name}')
            detail_url = name_box.find('a')['href']
            break

    if detail_url:
        # Collect basic and business-registration information.
        spiderinfo(detail_url, receptname, file_name)
        return True

    # No direct match: check whether the received name is a former name of
    # the first result.
    candidate_name = ''
    for node in result_cards[0].find_all():
        if node.has_attr('class') and 'index_name' in node['class'][0]:
            candidate_name = node.text
            detail_url = node.find('a')['href']
            break

    if candidate_name:
        log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{candidate_name}')
        if ifbeforename(detail_url) == receptname:
            spiderinfo(detail_url, receptname, file_name)
            return True

    # No enterprise with the same name was found.
    file.appenddata(file_name, '需处理企业', [com_name, social_code])
    time.sleep(2)
    return False

if __name__ == '__main__':
    # Main crawl loop. Queue entries have the form "<social_code>|<name>" and
    # are pulled from the redis list 'BaseInfoEnterpriseUptime:gnqy_socialCode'.
    #
    # The names assigned here (taskType, driver, file_name, start_time,
    # company_field, social_code, com_name, data) are module globals that the
    # functions above read, so this block must stay at module level.
    taskType = '基本信息/天眼查'
    driver = create_driver()
    try:
        driver.get('https://www.tianyancha.com/')
        while True:
            file_name = './data/国内企业基本信息采集情况.xlsx'
            file.createFile(file_name)
            start_time = time.time()
            # Fetch the next enterprise to process.
            company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')

            if company_field == 'end':
                # This round is finished: mail the report, then start the next round.
                baseCore.sendEmail(file_name)
                time.sleep(20)
                file.deleteFile(file_name)
                continue

            if company_field == '' or company_field is None:
                # Nothing queued: drop the report file and poll until data arrives.
                file.deleteFile(file_name)
                while True:
                    log.info('--------已没有数据---------')
                    time.sleep(30)
                    if not baseCore.check_mysql_conn(cnx_):
                        cnx_ = baseCore.cnx
                        cursor_ = cnx_.cursor()
                        log.info('===11数据库重新连接成功===')
                    company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
                    if company_field:
                        log.info("-----已添加数据------")
                        # Put the entry back; the outer loop pulls it again.
                        baseCore.r.lpush('BaseInfoEnterpriseUptime:gnqy_socialCode', company_field)
                        break
                continue

            fields = company_field.split('|')
            social_code = fields[0]
            # Tolerate entries without a '|' separator: search by code only.
            com_name = fields[1].replace(' ', '') if len(fields) > 1 else ''

            if 'ZZSN' in social_code and 'ZD' in social_code:
                continue

            # TODO: look up the Tianyancha id.
            data = baseCore.getInfomation(social_code)
            if data:
                tycid = data[11]
            else:
                # Parameterized query: social_code comes from the queue and
                # must not be spliced into the SQL text. Uses the same %s
                # placeholder style as the INSERT below.
                sql = "SELECT * FROM sys_base_enterprise WHERE social_credit_code = %s"
                cursor.execute(sql, (social_code,))
                data = cursor.fetchone()
                if not data:
                    # The enterprise is not in the base table, so there is no
                    # row to copy from. Record it for manual handling instead
                    # of crashing on data[3].
                    log.info(f'====={social_code}=====sys_base_enterprise has no row for this enterprise, skipped=====')
                    file.appenddata(file_name, '需处理企业', [com_name, social_code])
                    continue
                com_name_c = data[3]
                xydm = data[1]
                # Write the enterprise into EnterpriseInfo.
                insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
                cursor_.execute(insert, (com_name_c, xydm))
                cnx_.commit()
                tycid = ''

            if not tycid:
                # No known Tianyancha id: go through the search page.
                redaytowork(com_name, social_code, file_name)
            else:
                company_url = f'https://www.tianyancha.com/company/{tycid}'
                spiderinfo(company_url, com_name, file_name)
            time.sleep(10)
    finally:
        # The loop only ends through an exception or interrupt; release the
        # browser and the BaseCore resources in that case.
        try:
            driver.quit()
        finally:
            baseCore.close()