#   国外智库-联合国数据采集
#   需要cookie
import datetime
import json
import time

import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
import BaseCore
from requests.packages import urllib3
# Shared project helper object. In this script it supplies the logger, the SQL
# cursor/connection (cursor_ / cnx_), uptoOBS(), detect_language() and close().
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# The listing request is sent with verify=False, so silence urllib3's warnings.
urllib3.disable_warnings()

# MongoDB collection used to skip URLs that were already collected and to
# record newly collected items.
# NOTE(review): the connection credentials are hard-coded here — consider
# loading them from configuration instead of source code.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
    '国外智库']
# Values handed to baseCore.uptoOBS() in doJob().
pathType = 'PolicyDocuments/'
taskType = '国外智库-联合国'
create_by = 'LiuLiYuan'
# Browser-like request headers, including a captured cookie (the file header
# notes that a cookie is required); passed to baseCore.uptoOBS() in doJob().
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Cookie': '_gid=GA1.2.1329908685.1707104609; fpestid=QfG9QjcFlrW1XTiPH6x-5MvkjPA_-fWKmLMvSfmLZ2zCFckq2esUENyoUmljfZpglj3cPA; _ga_331595331=GS1.1.1707104608.1.1.1707105418.0.0.0; _ga=GA1.1.847295956.1707104609; _ga_4057246821=GS1.1.1707104608.1.1.1707105418.0.0.0; _ga_SN6PPP7BP5=GS1.1.1707104608.1.1.1707105522.0.0.0',
    'Pragma': 'no-cache',
    'Referer': 'https://unctad.org/official-documents-search?f%5B0%5D=document_type%3A262&f%5B1%5D=product%3A655',
    'Sec-Ch-Ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
    'X-Requested-With': 'XMLHttpRequest',
}

@retry(tries=2, delay=5)
def sendKafka(dic):
    """Publish one record to the "research_center_fourth" Kafka topic.

    Args:
        dic: JSON-serialisable mapping; its "sourceAddress" entry is used in
            the success log line.

    Raises:
        Exception: whatever the Kafka client raises when the producer cannot
            be created or the broker does not acknowledge the message in time.
            The @retry decorator makes one further attempt before the
            exception reaches the caller.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
    try:
        kafka_result = producer.send("research_center_fourth",
                                     json.dumps(dic, ensure_ascii=False).encode('utf8'))
        # send() only queues the message; wait for the acknowledgement so that a
        # delivery failure raises here instead of being reported as a success.
        kafka_result.get(timeout=10)
    finally:
        # A new producer is created on every call, so always close it.
        producer.close()
    log.info(f'{dic["sourceAddress"]}传输成功')

def secrchATT(item_id, retData, type_id, order_by):
    """Look up an attachment row by item id, path, type id and order.

    Args:
        item_id: value matched against the item_id column.
        retData: mapping whose 'path' entry is matched against the path column.
        type_id: value matched against the type_id column.
        order_by: value matched against the order_by column.

    Returns:
        The result of cursor.fetchone() for the query: a row whose first
        element is the attachment id (presumably None when nothing matches,
        per DB-API fetchone semantics — confirm against the driver in use).
    """
    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    baseCore.cursor_.execute(query, params)
    return baseCore.cursor_.fetchone()

# Insert a row into the attachment table and return the attachment id
def tableUpdate(retData, file_name, num, publishDate,origin):
    """Insert an attachment record into clb_sys_attachment and return its id.

    Args:
        retData: mapping describing the uploaded file; the keys read here are
            item_id, type_id, group_name, path, full_path, category,
            file_size, status, create_by and create_time.
        file_name: attachment name; '.pdf' is appended before it is stored.
        num: value stored in the order_by column.
        publishDate: value stored in the publish_time column.
        origin: value stored in the source column.

    Returns:
        The id of the row found by secrchATT() after the insert, or None when
        that lookup returns no row.

    Raises:
        IndexError: if full_path does not contain the OBS bucket URL prefix.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    create_time = retData['create_time']
    order_by = num
    # The object key is the part of the URL that follows the bucket prefix.
    object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]

    Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

    values = (
        file_name+'.pdf', type_id, item_id, group_name, path, full_path, category, file_size, order_by,
        status, create_by,
        create_time, object_key, 'zzsn', publishDate,origin)

    baseCore.cursor_.execute(Upsql, values)  # insert
    baseCore.cnx_.commit()  # commit
    baseCore.getLogger().info("更新完成:{}".format(Upsql))
    # Read the row back to obtain its id. The caller tests the result for
    # truthiness, so report a failed lookup as None instead of raising
    # TypeError on None[0].
    row = secrchATT(item_id, retData, type_id, order_by)
    if not row:
        return None
    return row[0]

def save_data(dic_news):
    """Insert a summary of a collected article into the MongoDB collection.

    Args:
        dic_news: mapping with the keys attachmentIds, sourceAddress,
            createDate, contentWithTag, publishDate and title. Only the first
            100 characters of contentWithTag are stored.
    """
    record = {}
    # Keys are filled in a fixed order so the stored document keeps the same
    # field order on every insert.
    record['附件id'] = dic_news['attachmentIds']
    record['网址'] = dic_news['sourceAddress']
    record['tid'] = ''
    record['来源'] = '联合国'
    record['创建时间'] = dic_news['createDate']
    record['带标签内容'] = dic_news['contentWithTag'][:100]
    record['发布时间'] = dic_news['publishDate']
    record['标题'] = dic_news['title']
    db_storage.insert_one(record)


def getSoup(page, timeout=60):
    """Fetch one page of the UNCTAD document search and parse its HTML fragment.

    The endpoint answers with a JSON list of AJAX commands; the HTML of the
    result list is the 'data' of the command whose 'method' is 'replaceWith'.

    Args:
        page: page index inserted into the request URL.
        timeout: seconds to wait for the HTTP response before requests raises
            a timeout error, so a stalled connection cannot hang the job.

    Returns:
        A BeautifulSoup document built from the result HTML; it is empty when
        the response contains no 'replaceWith' command.
    """
    url = f'https://unctad.org/views/ajax?f%5B0%5D=document_type%3A262&f%5B1%5D=product%3A655&_wrapper_format=drupal_ajax&view_name=unctad_official_documents_search&view_display_id=page_1&view_args=official_document%2Fall%2Fall&view_path=%2Fofficial-documents-search&view_base_path=official-documents-search&view_dom_id=c4190b9016520db531f854876f3c0fdd696c03fb35fb4431f64fc25c90716fcb&pager_element=0&f%5B0%5D=product%3A655&f%5B1%5D=product%3A655&page={page}&_drupal_ajax=1&ajax_page_state%5Btheme%5D=newyork_b5&ajax_page_state%5Btheme_token%5D=&ajax_page_state%5Blibraries%5D=bootstrap_barrio%2Fbreadcrumb%2Cbootstrap_barrio%2Fform%2Cbootstrap_barrio%2Fglobal-styling%2Cbootstrap_barrio%2Flinks%2Cbootstrap_barrio%2Fmessages_light%2Cckeditor_accordion%2Faccordion.frontend%2Cfacets%2Fdrupal.facets.checkbox-widget%2Cfacets%2Fdrupal.facets.dropdown-widget%2Cfacets%2Fdrupal.facets.general%2Cfacets%2Fdrupal.facets.views-ajax%2Cgoogle_analytics%2Fgoogle_analytics%2Cnewyork_b5%2Fglobal-styling%2Csystem%2Fbase%2Cviews%2Fviews.ajax%2Cviews%2Fviews.module'
    # Use the module-level `headers` rather than a duplicated local copy.
    req = requests.get(url, headers=headers, verify=False, timeout=timeout)
    req.encoding = req.apparent_encoding
    datasJson = req.json()
    infoTag = ''
    for dataJson in datasJson:
        if dataJson.get('command') in ('settings', 'scrollTop'):
            continue
        # Not every command carries a 'method' key, so read it with .get()
        # to avoid a KeyError on unrelated commands.
        if dataJson.get('method') == 'replaceWith':
            infoTag = dataJson['data']
            break
    soup = BeautifulSoup(infoTag, 'lxml')
    return soup


def getTotal():
    """Return the total number of search results reported by the listing.

    The count is the text after the last 'of' in the 'view-header' element.

    Returns:
        The total as an int.

    Raises:
        ValueError: if the header element is missing or its trailing text is
            not a number.
    """
    # doJob() iterates page indices starting at 0, so index 0 is the one page
    # that exists for any non-empty result set. Index 1 is the second page and
    # has no rows when everything fits on the first.
    soup = getSoup(0)
    headerTag = soup.find('div', class_='view-header')
    if headerTag is None:
        raise ValueError('view-header element not found in the listing response')
    # Tolerate thousands separators such as "1,234".
    total = headerTag.text.split('of')[-1].strip().replace(',', '')
    return int(total)

@retry(tries=2, delay=5)
def translate(title, contentWithTag):
    """Send a title and HTML content to the translation service.

    Args:
        title: title text to translate.
        contentWithTag: HTML string to translate.

    Returns:
        A tuple (titleRaw, contentWithTagRaw): titleRaw is the plain text of
        the returned title, and contentWithTagRaw is the returned content
        parsed into a BeautifulSoup object (callers read .text from it or
        convert it with str()).

    Raises:
        RuntimeError: if the service reports status 'failed'. The @retry
            decorator makes one further attempt before the error propagates.
    """
    postHeaders = {
        'Content-Type': 'application/json',
    }
    payload = json.dumps({
        'title': title,
        'contentWithTag': contentWithTag
    })
    req = requests.post('http://117.78.23.14:5000/translate', data=payload, headers=postHeaders)
    dataJson = req.json()
    if dataJson['status'] == 'failed':
        # A bare `raise` with no active exception only works by accidentally
        # producing RuntimeError; raise it explicitly with a message instead.
        raise RuntimeError('translation service returned status "failed"')
    titleRaw = BeautifulSoup(dataJson['title'], 'html.parser').text
    contentWithTagRaw = BeautifulSoup(dataJson['contentWithTag'], 'html.parser')
    return titleRaw, contentWithTagRaw


def doJob():
    """Crawl the UNCTAD document listing and publish documents not yet collected.

    For each listing row dated 2023-01-20 or later whose URL is not yet in
    MongoDB, the row content is translated and the linked file is passed to
    baseCore.uptoOBS(). When that succeeds, an attachment row is written and
    the record is sent to Kafka; after a successful send it is stored in
    MongoDB. Failures of the Kafka send or the MongoDB insert are logged and
    the crawl continues with the next row.

    NOTE(review): `num` starts at 1 and is incremented before its first use,
    so the first attachment gets order_by 2 — confirm this is intended.
    """
    num = 1
    total = getTotal()
    # Ceiling division in integer arithmetic: the listing is paged by 10.
    pageSize = (total + 9) // 10
    earliest = datetime.datetime(2023, 1, 20)
    for page in range(pageSize):
        soup = getSoup(page)
        viewContent = soup.find('div', class_='view-content')
        if viewContent is None:
            # getSoup() returns an empty document when the response carried no
            # result HTML; skip the page instead of failing on None.
            log.error(f'page {page}===view-content not found')
            continue
        for div in viewContent.find_all('div', class_='views-row'):
            start_time = time.time()
            # The first nested div holds the title link.
            titleDiv = div.find_all('div')[0]
            link = titleDiv.find('a')
            title = link.text
            href = link.get('href')
            publishDate_ = div.find('div', class_='views-field-field-publisheddate').text.strip()
            publishedAt = datetime.datetime.strptime(publishDate_, "%d %b %Y")
            if publishedAt < earliest:
                continue
            publishDate = publishedAt.strftime("%Y-%m-%d %H:%M:%S")
            if db_storage.find_one({'网址': href}):
                log.info(f'{href}===已采集')
                continue
            # Remove the title block and the Z3988 span (presumably COinS
            # citation metadata) so only the body of the row remains.
            titleDiv.extract()
            metaSpan = div.find('span', class_='Z3988')
            if metaSpan is not None:
                metaSpan.extract()
            content = div.text.strip()
            contentWithTag = str(div)
            titleRaw, contentWithTagRaw = translate(title, contentWithTag)
            contentRaw = contentWithTagRaw.text
            contentWithTagRaw = str(contentWithTagRaw)
            retData = baseCore.uptoOBS(href, title, 15, '', pathType, taskType, start_time, create_by, headers)
            num += 1
            if not retData['state']:
                continue
            att_id = tableUpdate(retData, title, num, publishDate, '联合国')
            if not att_id:
                continue
            id_list = [att_id]
            now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            lang = baseCore.detect_language(content)
            dic = {
                'id': f'1620244462491893761{int(time.time())}',
                'subjectId': '1620244462491893761',
                'checkStatus': 1,
                'deleteFlag': 0,
                'topNum': 0,
                'content': content,
                'contentRaw': contentRaw,
                'contentWithTag': contentWithTag,
                'contentWithTagRaw': contentWithTagRaw,
                'createDate': now,
                'labels': [
                    {'labelMark': 'organization', 'relationId': '1619903243781824514', 'relationName': '联合国'}],
                'lang': lang,
                'origin': '联合国',
                'publishDate': publishDate,
                'sourceAddress': href,
                'title': title,
                'titleRaw': titleRaw,
                'updateDate': now,
                'attachmentIds': id_list
            }
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit can still stop the crawl.
            try:
                sendKafka(dic)
            except Exception:
                log.error(f'{href}===kafka发送失败')
                continue
            # Record the URL as collected only after the Kafka send succeeded.
            try:
                save_data(dic)
            except Exception:
                log.error(f'{href}===数据库保存失败')


if __name__ == '__main__':
    # Always call baseCore.close(), even when the crawl aborts with an exception.
    try:
        doJob()
    finally:
        baseCore.close()
