# -*- coding: utf-8 -*-
import redis
import time
from urllib.parse import quote
import pymongo
import requests
from bson.objectid import ObjectId
import json
from pyquery import PyQuery as pq
import urllib3
import hashlib
from kafka import KafkaProducer
import pandas as pd
import zhconv
from base.BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Look up a company's Qichacha (QCC) id by company name or credit code.
def find_id_by_name(name):
    """Return the QCC ``KeyNo`` of the first search hit for *name*.

    Sends an advanced-search request to the QCC mini-program endpoint using
    the module-level ``token`` and ``headers`` (the ``Qcc-Timestamp`` header
    is overwritten in place on every call).

    Args:
        name: Company name or credit code used as the search key.

    Returns:
        The ``KeyNo`` of the first result, or ``''`` when the search returns
        no results, the first result has an empty name, or every request
        attempt failed.

    Raises:
        KeyError, TypeError: If a response was received but does not have the
            expected ``result -> Result`` structure.
            NOTE(review): presumably this is what an expired token looks
            like — confirm against a real error payload.
    """
    urllib3.disable_warnings()

    qcc_key = name
    # Millisecond-style timestamp (second precision, padded with zeros).
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t
    url = f"https://xcx.qcc.com/mp-weixin/forwardApp/v3/base/advancedSearch?token={token}&t={t}&pageIndex=1&needGroup=yes&insuredCntStart=&insuredCntEnd=&startDateBegin=&startDateEnd=&registCapiBegin=&registCapiEnd=&countyCode=&province=&sortField=&isSortAsc=&searchKey={quote(qcc_key)}&searchIndex=default&industryV3="

    # Up to five attempts; resp_dict stays None if none of them succeeds so
    # the code below never touches an unbound name.
    resp_dict = None
    for _ in range(5):
        try:
            # A timeout keeps a hung connection from blocking the crawl
            # forever; it surfaces as a RequestException and is retried.
            resp_dict = requests.get(url=url, headers=headers, verify=False, timeout=30).json()
            break
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not valid JSON.
            print('重试')
            time.sleep(5)
    # Throttle between API calls.
    time.sleep(2)

    KeyNo = ''
    if resp_dict is not None and resp_dict['result']['Result']:
        result_dict = resp_dict['result']['Result'][0]
        # The API wraps the matched text in <em> tags; strip them before
        # checking whether the hit actually has a name.
        Name = result_dict['Name'].replace('<em>', '').replace('</em>', '').strip()
        if Name != '':
            KeyNo = result_dict['KeyNo']

    print("{}，企业代码为:{}".format(qcc_key, KeyNo))
    return KeyNo


# Check whether a string contains at least one digit character.
def str_have_num(str_num):
    """Return True if any character of *str_num* is a digit.

    Uses ``str.isdigit`` per character, so Unicode digits (e.g. full-width
    digits) count as well. An empty string yields False.

    Args:
        str_num: The string to inspect.

    Returns:
        bool: True when at least one character satisfies ``isdigit()``.
    """
    # any() stops at the first digit instead of scanning the whole string.
    return any(ch.isdigit() for ch in str_num)


# Fetch a company's official website from QCC by its QCC id.
def info_by_id(com_id,com_name):
    """Return the website recorded for the company with QCC id *com_id*.

    Requests the company detail endpoint using the module-level ``token``
    and ``headers`` (the ``Qcc-Timestamp`` header is overwritten in place).

    Args:
        com_id: QCC id (``KeyNo``) of the company.
        com_name: Company name; only used in the progress messages.

    Returns:
        ``companyExtendInfo.WebSite`` when present and not None; otherwise
        the ``Url`` of the first entry in ``ContactInfo.WebSite``; otherwise
        ``''``. Also ``''`` when the response carries no company record.

    Raises:
        requests.RequestException, ValueError: Network or JSON decoding
            failures of the detail request are not caught here.
    """
    t = str(int(time.time()) * 1000)
    headers['Qcc-Timestamp'] = t

    url = "https://xcx.qcc.com/mp-weixin/forwardApp/v1/ent/detail?token={}&t={}&unique={}".format(token, t, com_id)
    resp_dict = requests.get(url=url, headers=headers, verify=False).json()
    # Throttle between API calls.
    time.sleep(2)

    try:
        result_dict = resp_dict['result']['Company']
    except (KeyError, TypeError):
        # No company record: report the failure and stop here rather than
        # carrying on with an unbound result and claiming success.
        print(com_name + ":获取失败")
        return ''

    # Preferred source: the extended-info website field.
    try:
        WebSite = result_dict['companyExtendInfo']['WebSite']
    except (KeyError, TypeError):
        WebSite = None
    # Fallback: first URL listed under the contact info.
    if WebSite is None:
        try:
            WebSite = result_dict['ContactInfo']['WebSite'][0]['Url']
        except (KeyError, TypeError, IndexError):
            WebSite = ''
    print(com_name + "：爬取完成")
    return WebSite


# Request headers shared by find_id_by_name and info_by_id. Both functions
# overwrite 'Qcc-Timestamp' in place before each request, so the empty value
# below is only a placeholder.
# NOTE(review): the values appear to imitate the WeChat mini-program client
# on Windows — confirm against a fresh traffic capture.
headers = {
    'Host': 'xcx.qcc.com',
    'Connection': 'keep-alive',
    'Qcc-Platform': 'mp-weixin',
    'Qcc-Timestamp': '',
    'Qcc-Version': '1.0.0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
    'content-type': 'application/json',
    'Referer': 'https://servicewechat.com/wx395200814fcd7599/166/page-frame.html',
    'Accept-Encoding': 'gzip, deflate, br,'
}

# TODO: the token must be re-captured from client traffic and updated here
# roughly every two hours.
token = '1dcc61d85177733298e5827653706f1a'  # update from a fresh capture about every two hours
start = time.time()
# Messages for companies whose QCC id could not be resolved.
list_weicha = []
# Output workbook that receives the collected websites.
filename = 'data/内蒙古市属国有企业_官网.xlsx'
# Input workbook listing the companies to crawl; every column is read as str.
df_all = pd.read_excel('data/内蒙古市属国有企业.xls',dtype=str)
list_all_info = []
# NOTE(review): the loop starts at row index 162 — presumably resuming an
# earlier, interrupted run; confirm before reusing on another input file.
for num_df in range(162,len(df_all)):
    # Credit code of the company
    id_code = str(df_all['本企业代码'][num_df])
    # Company name
    com_name = str(df_all['企业名称'][num_df])
    # Row number as given in the sheet
    line = str(df_all['行次'][num_df])
    dic_com = {
        'line': line,
        'social_code': id_code,
        # Store the company name here (previously the credit code was
        # written under this key by mistake).
        'com_name': com_name,
        'website':''
    }

    # The search is keyed on the credit code, not the name.
    company_id = find_id_by_name(id_code)

    if company_id == "":
        print(com_name + "：企业ID获取失败")
        list_weicha.append(com_name + "：企业ID获取失败")
        continue

    WebSite = info_by_id(company_id,com_name)
    dic_com['website'] = WebSite
    log.info(f'---{num_df}-------{com_name}----------耗时{baseCore.getTimeCost(start,time.time())}')
    list_all_info.append(dic_com)
    # The full result list is written out after every company.
    # NOTE(review): presumably so progress survives an interruption —
    # confirm against BaseCore.writerToExcel.
    baseCore.writerToExcel(list_all_info,filename)








