"""
"""
import json
import requests,time,pymysql 
import jieba

from base.BaseCore import BaseCore
from base.smart import smart_extractor

# Touch jieba at import time; the string literal reads "jieba must be loaded".
# NOTE(review): jieba.cut appears to return a lazy generator, so this call on
# its own may not actually load the dictionary - confirm whether
# jieba.initialize() was intended.
jieba.cut("必须加载jieba")
# Initialise the extractor, configured for Chinese word segmentation.
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
# Shared MySQL connection and cursor used by every function in this module.
# NOTE(review): host, user and password are hard-coded in source; consider
# loading them from configuration or the environment.
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor= cnx.cursor()
# Number of news items requested per API page (sent as the `ps` query parameter).
pageSize = 10
# Default HTTP headers sent with every request to capi.tianyancha.com.
# beinWork overwrites the 'User-Agent' entry with a random value before each
# request, so the value below is only the initial one.
# NOTE(review): the Cookie is a hard-coded captured browser session; it
# presumably expires - confirm how it is meant to be refreshed.
headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Cookie':'jsid=SEO-BAIDU-ALL-SY-000001; TYCID=77e997401d5f11ee9e91d5a0fd3c0b89; ssuid=6450041974; _ga=GA1.2.858826166.1688800641; _gid=GA1.2.2142449376.1689575510; tyc-user-info-save-time=1689764135027; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22309757777%22%2C%22first_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzQ1Y2IxMDI1N2QtMGNmZWUwNTMyN2Y2NzMtMjYwMzFkNTEtMTMyNzEwNC0xODkzNDVjYjEwMzc1YiIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwOTc1Nzc3NyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22309757777%22%7D%2C%22%24device_id%22%3A%22189345cb10257d-0cfee05327f673-26031d51-1327104-189345cb10375b%22%7D; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1689752829,1689821665,1689831487,1689845884; searchSessionId=1689845917.81838207; HWWAFSESID=146bb1d25b1515339d3; HWWAFSESTIME=1689858023324; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1689859758',
        'Host': 'capi.tianyancha.com',
        'Origin': 'https://www.tianyancha.com',
        'Referer': 'https://www.tianyancha.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.51'
}

def _news_page_url(tyc_code, page_no):
    """Build the news-list API URL for page ``page_no`` of company ``tyc_code``."""
    return f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={page_no}&emotion=-100&event=-100'


def _get_with_retry(url, attempts=3, timeout=30):
    """GET ``url`` through a proxy, retrying when the request raises.

    Every attempt asks ``baseCore`` for a proxy and a random User-Agent, so a
    retry does not reuse the proxy that just failed.

    :param url: address to request.
    :param attempts: maximum number of tries.
    :param timeout: seconds passed to ``requests.get`` so that a stalled
        proxy cannot block the job forever.
    :return: the response of the first attempt that did not raise, or
        ``None`` when every attempt raised.
    """
    for m in range(attempts):
        try:
            ip = baseCore.get_proxy()
            headers['User-Agent'] = baseCore.getRandomUserAgent()
            return requests.get(url=url, headers=headers, proxies=ip, verify=False, timeout=timeout)
        except Exception as e:
            log.error(f"request请求异常----{m}-----{e}")
    return None


def _publish_time(info_page):
    """Format the item's ``rtm`` value (divided by 1000) as local time.

    Falls back to ``baseCore.getNowTime(1)`` when ``rtm`` is missing or
    cannot be converted.
    """
    try:
        time_struct = time.localtime(int(info_page['rtm'] / 1000))
        return time.strftime("%Y-%m-%d %H-%M-%S", time_struct)
    except Exception:
        return baseCore.getNowTime(1)


def beinWork(tyc_code,social_code):
    """Collect every news item of one company and store the new articles.

    The first API page is requested to learn the total number of items, then
    all pages are walked. For each item: skip it if the link is already stored
    for this company, otherwise extract the article body and insert a row into
    ``brpa_source_article``. Items whose body cannot be extracted are recorded
    in ``dt_err``.

    :param tyc_code: company id sent to the API as the ``id`` parameter.
    :param social_code: value stored in the ``social_credit_code`` / ``xydm``
        columns and used for the duplicate check.
    :return: dict with keys ``state`` (False only when the total could not be
        obtained), ``total``, ``okCount`` (rows inserted), ``errorCount``
        (failed items; a failed page counts as ``pageSize`` items) and
        ``repetCount`` (items already stored).
    """
    time.sleep(3)
    retData={'state':False,'total':0,'okCount':0,'errorCount':0,'repetCount':0}

    response = _get_with_retry(_news_page_url(tyc_code, 1))
    # response is None when all attempts raised; treat it like a bad status.
    if response is None or response.status_code != 200:
        log.error(f"{tyc_code}-----获取总数接口失败")
        return retData
    try:
        # int() also rejects a null total instead of failing later on `> 0`.
        total = int(json.loads(response.content.decode('utf-8'))['data']['total'])
    except Exception:
        log.error(f"{tyc_code}-----获取总数失败")
        return retData
    if total <= 0:
        log.error(f"{tyc_code}--------总数为0")
        retData['state']=True
        return retData
    # Ceiling division: a partially filled last page still counts as a page.
    totalPage = (total + pageSize - 1) // pageSize
    log.info(f"{tyc_code}-------总数：{total}----总页数:{totalPage}")

    retData['total']=total
    okCount = 0
    errorCount = 0
    repetCount = 0
    sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
    insert_sql = '''insert into brpa_source_article(social_credit_code,title,summary,content,publish_date,source_address,origin,author,type,lang) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    # Parameterised so that quotes in titles/abstracts cannot break the statement.
    insert_err_sql = "insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values(%s,%s,%s,%s,%s,%s,now(),1,%s,%s)"

    for num in range(1, totalPage+1):
        time.sleep(3)
        log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
        start_page = time.time()
        response_page = _get_with_retry(_news_page_url(tyc_code, num))
        if response_page is None or response_page.status_code != 200:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            errorCount = errorCount + pageSize
            continue
        try:
            json_page = json.loads(response_page.content.decode('utf-8'))
            # A null item list is treated as an empty page.
            info_list_page = json_page['data']['items'] or []
        except Exception:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            errorCount = errorCount + pageSize
            continue

        for pageIndex, info_page in enumerate(info_list_page, start=1):
            try:
                title = info_page['title']
                source = info_page['website']
                link = info_page['uri']
            except (KeyError, TypeError) as e:
                # One malformed item must not abort the whole company.
                log.error(f"{tyc_code}--{num}页--第{pageIndex}条---缺少字段:{e}")
                errorCount = errorCount + 1
                continue

            cursor.execute(sel_sql, (link, social_code))
            if cursor.fetchall():
                log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
                repetCount = repetCount + 1
                continue

            time_format = _publish_time(info_page)
            try:
                # Smart extraction of the article body from its URL.
                contentText = smart.extract_by_url(link).text
            except Exception:
                contentText = ''
            if contentText == '':
                log.error(f'获取正文失败：--------{tyc_code}--------{num}--------{link}')
                errorCount = errorCount + 1
                try:
                    cursor.execute(insert_err_sql, (social_code, source, link, title, time_format,
                                                    info_page.get('abstracts'), num, pageIndex))
                    cnx.commit()
                except Exception as e:
                    # Recording the failure is best effort, but no longer silent.
                    cnx.rollback()
                    log.error(f'记录失败信息出错:{social_code}----{link}----{e}')
                continue

            try:
                list_info = (
                    social_code,
                    title,
                    info_page['abstracts'],  # summary
                    contentText,  # article body
                    time_format,  # publish time
                    link,
                    '天眼查',
                    source,
                    '2',
                    'zh',
                )
                cursor.execute(insert_sql, list_info)
                cnx.commit()
            except Exception as e:
                cnx.rollback()
                errorCount = errorCount + 1
                log.error(f'传输失败:{social_code}----{link}----{e}')
                continue
            # Count the item only once the row is really committed.
            okCount = okCount + 1
            log.info(f'{social_code}----{link}:新增一条')
        log.info(f"获取分页数据--{tyc_code}----分页{num}，耗时{baseCore.getTimeCost(start_page, time.time())}")

    retData['state'] = True
    retData['okCount'] = okCount
    retData['errorCount'] = errorCount
    retData['repetCount'] = repetCount
    return  retData

def doJob():
    """Process queued companies from ``ssqy_tyc`` until none are left.

    Repeatedly takes the row with ``state=1`` and the oldest ``date_time``,
    marks it ``state=2`` while it is being worked on, runs ``beinWork`` and
    then writes the counters back with ``state=3`` when ``beinWork`` reports
    success or ``state=4`` otherwise. An exception raised by ``beinWork`` is
    logged and recorded as ``state=4`` so the row is not left at ``state=2``.
    The cursor, the connection and ``baseCore`` are always closed on exit.
    """
    try:
        while True:
            cursor.execute("select id,xydm,tycid from ssqy_tyc where state=1  order by date_time asc  limit 1")
            data = cursor.fetchone()
            if not data:
                log.info("没有数据了，结束脚本")
                break
            row_id, xydm, tycid = data[0], data[1], data[2]
            log.info(f"{row_id}---{xydm}----{tycid}----开始处理")
            start_time = time.time()
            cursor.execute("update ssqy_tyc set state=2,date_time=now() where id=%s", (row_id,))
            cnx.commit()

            # Collect the company's news; a crash is recorded as a failed run.
            try:
                retData = beinWork(tycid, xydm)
            except Exception as e:
                log.error(f"{row_id}---{xydm}----{tycid}----处理异常:{e}")
                retData = {'state': False, 'total': 0, 'okCount': 0, 'errorCount': 0, 'repetCount': 0}
            total = retData['total']
            okCount = retData['okCount']
            errorCount = retData['errorCount']
            repetCount = retData['repetCount']
            stateNum = 3 if retData['state'] else 4

            cursor.execute(
                "update ssqy_tyc set state=%s,total=%s,okCount=%s,errorCount=%s,repetCount=%s ,date_time=now() where id=%s",
                (stateNum, total, okCount, errorCount, repetCount, row_id),
            )
            cnx.commit()
            log.info(f"{row_id}---{xydm}----{tycid}----结束处理，耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{okCount}----失败数:{errorCount}--重复数:{repetCount}")
    finally:
        # Release resources even when the loop is interrupted by an error.
        try:
            cursor.close()
            cnx.close()
        finally:
            baseCore.close()


# Script entry point: start the collection loop only when run directly.


if __name__ == "__main__":
    doJob()


