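# Find documents in the Elasticsearch `policy` index that share the same
# link (`sourceAddress`) so that the duplicates can be removed.
# The removal step itself is still commented out; the script only logs candidates.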
import json
import threading
import time
import uuid

import redis
import requests
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Module-level setup: shared logger, database handle, lock and Redis pool.
# Everything below runs at import time.
baseCore = BaseCore.BaseCore()
# The logger is taken from the first BaseCore instance.
log = baseCore.getLogger()
# NOTE(review): BaseCore is instantiated a second time here and the name is
# rebound, so `cnx_` below comes from this second instance while `log` came
# from the first. Looks like an accidental duplicate - confirm before removing,
# since the constructor's side effects are not visible from this file.
baseCore = BaseCore.BaseCore()

# Connection object exposed by BaseCore and a cursor on it (presumably a SQL
# database connection - verify against BaseCore). Neither is referenced
# elsewhere in the visible part of this file.
cnx_ = baseCore.cnx_
cursor_ = cnx_.cursor()

# Module-wide lock; not acquired anywhere in the visible part of this file.
lock = threading.Lock()

# Redis connection pool (db 6); not used in the visible part of this file.
# NOTE(review): host and password are hard-coded in source - consider moving
# them to configuration or environment variables.
pool = redis.ConnectionPool(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
class EsMethod(object):
    """Small wrapper around an Elasticsearch client bound to the ``policy`` index."""

    def __init__(self):
        # Create the Elasticsearch client and supply the account credentials.
        # NOTE(review): the endpoint and credentials are hard-coded in source;
        # consider loading them from configuration instead.
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'policy'

    def queryatt(self, index_name, pnum):
        """Return documents grouped by duplicated ``sourceAddress``.

        Runs a ``terms`` aggregation on ``sourceAddress.keyword`` that keeps
        only values shared by at least two documents (up to 1000 buckets) and
        attaches up to 10 documents per bucket through a ``top_hits``
        sub-aggregation.

        Args:
            index_name: Name of the index to search.
            pnum: Accepted for interface compatibility; the query does not use
                it, so every call issues the same aggregation.

        Returns:
            The raw search response. Duplicate groups are found under
            ``response["aggregations"]["duplicate_titles"]["buckets"]``.
        """
        body = {
            # No top-level hits are needed; only the aggregation matters.
            "size": 0,
            "aggs": {
                "duplicate_titles": {
                    "terms": {
                        "field": "sourceAddress.keyword",
                        "min_doc_count": 2,
                        "size": 1000,
                    },
                    "aggs": {
                        "duplicate_docs": {
                            "top_hits": {
                                "_source": {
                                    # `attachmentIds` must be requested here:
                                    # the consumer of this query reads it from
                                    # every hit to decide whether the document
                                    # has attachments.
                                    "includes": [
                                        "id",
                                        "title",
                                        "subjectId",
                                        "sourceAddress",
                                        "createDate",
                                        "attachmentIds",
                                    ],
                                },
                                "size": 10,
                            },
                        },
                    },
                },
            },
        }

        return self.es.search(index=index_name, doc_type='_doc', body=body)

def main(page, p, esMethod):
    """Log the ids of duplicate-link documents that carry no attachments.

    Fetches the duplicate groups through ``esMethod.queryatt`` and, for every
    document in every group, logs its ``_id`` when its ``attachmentIds`` is
    empty or null. Nothing is deleted here.

    Args:
        page: Unused by this function; kept so existing callers keep working.
        p: Forwarded to ``esMethod.queryatt`` as ``pnum``.
        esMethod: Object exposing ``index_name`` and ``queryatt(index_name, pnum)``.

    Returns:
        None.
    """
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']

    if total == 0:
        log.info('++++已没有数据+++++')
        return

    buckets = result["aggregations"]["duplicate_titles"]["buckets"]
    for bucket in buckets:
        for info in bucket["duplicate_docs"]["hits"]["hits"]:
            source = info.get('_source') or {}
            if 'attachmentIds' not in source:
                # Without the field we cannot tell whether the document has
                # attachments (it may simply not have been requested in the
                # query), so do not flag it as a removal candidate.
                log.info(f'skip {info["_id"]}: attachmentIds missing from _source')
                continue
            # Truthiness covers both an empty list and an explicit null.
            if not source['attachmentIds']:
                log.info(f'==={info["_id"]}===')

    # TODO: deleting the flagged documents is not enabled yet. When it is,
    # collect the flagged ids in a list first and delete each one from the
    # "policy" index.



def run_threads(num_threads,esMethod,j):
    """Run ``main`` concurrently in ``num_threads`` threads and wait for all of them.

    Worker number ``k`` (counting from 0) is called as
    ``main(j + k + 1, j + k * 200, esMethod)``.

    Args:
        num_threads: Number of worker threads to start.
        esMethod: Object handed unchanged to every ``main`` call.
        j: Base offset used to derive each worker's ``page`` and ``p`` values.
    """
    workers = []

    for k in range(num_threads):
        worker = threading.Thread(
            target=main,
            args=(j + k + 1, j + k * 200, esMethod),
        )
        workers.append(worker)
        # Each worker is started as soon as it is created.
        worker.start()

    # Block until every worker has finished.
    for worker in workers:
        worker.join()



if __name__ == "__main__":
    # Script entry point: run five batches, each fanning out to `num_threads`
    # worker threads, and log how long every batch took.

    # The client wrapper does not depend on the batch, so build it once and
    # share it instead of opening a new Elasticsearch client per batch.
    esMethod = EsMethod()
    num_threads = 5

    j = 0
    for i in range(5):
        start = time.time()
        run_threads(num_threads, esMethod, j)
        # Advance the base offset handed to the workers of the next batch.
        j += 1000

        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')