import json
import random
import time
from urllib.parse import urljoin
import datetime
import pymongo
from kafka import KafkaProducer
from tqdm import tqdm
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
from lxml import etree
from BaseCore import BaseCore

# Shared project helper; this module uses its logger and getTimeCost(), and
# re-exports its `cnx` / `cursor` attributes below.
baseCore = BaseCore()
log = baseCore.getLogger()
# Every request in this file is made with verify=False, so silence the
# resulting InsecureRequestWarning noise.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
cnx = baseCore.cnx
cursor = baseCore.cursor
# MongoDB collection used as the "already collected" ledger: save_data() inserts
# one document per published article and the crawler looks URLs up in it.
# NOTE(review): host, user name and password are hard-coded in source control —
# consider moving them to configuration / environment variables.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji[
    '国务院_国资委_copy1']
# Browser-like request headers sent with every HTTP request to www.sasac.gov.cn.
# NOTE(review): the Cookie and Referer values look like a capture from a single
# browser session — confirm the site still accepts them.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; SF_cookie_1=98184645; Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; zh_choose=n; wdcid=30ffdae06d11dbde; wdlast=1690189470; wdses=13ee59561f2fb725',
    'Host': 'www.sasac.gov.cn',
    'Pragma': 'no-cache',
    'Referer': 'https://www.baidu.com/link?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP&wd=&eqid=d507a037000987780000000364be37d4',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

def paserUrl(html, listurl):
    """Rewrite relative link targets in a parsed document to absolute URLs.

    Args:
        html: An already-parsed document object exposing ``find_all`` (the
            callers in this file pass a BeautifulSoup tree). It is modified
            in place.
        listurl: URL of the page the document came from; used as the base
            for resolving relative addresses.

    Returns:
        The same ``html`` object that was passed in.
    """
    # Only <a> and <img> elements are touched.
    for tag in html.find_all(['a', 'img']):
        # ``href`` takes precedence: when a tag carries it, ``src`` is left
        # alone, which is why the scan stops at the first attribute found.
        for attr_name in ('href', 'src'):
            if attr_name in tag.attrs:
                tag[attr_name] = urljoin(listurl, tag[attr_name])
                break
    return html


def save_data(dic_news):
    """Insert a trimmed bookkeeping record for one article into ``db_storage``.

    Args:
        dic_news: Article dict as assembled by the crawler. The keys
            ``attachmentIds``, ``sourceAddress``, ``labels`` (non-empty list of
            dicts with ``relationId`` / ``relationName``), ``createDate`` and
            ``contentWithTag`` are read.

    Raises:
        KeyError / IndexError: if one of the keys above is missing or
            ``labels`` is empty.
    """
    record = {}
    record['附件id'] = dic_news['attachmentIds']
    # The stored source URL is the field the crawler later queries to decide
    # whether an article was already collected.
    record['网址'] = dic_news['sourceAddress']
    first_label = dic_news['labels'][0]
    record['tid'] = first_label['relationId']
    record['来源'] = first_label['relationName']
    record['创建时间'] = dic_news['createDate']
    # Only the first 100 characters of the tagged content are kept.
    record['带标签内容'] = dic_news['contentWithTag'][:100]
    db_storage.insert_one(record)


def sendKafka(dic_news):
    """Publish one article record to the ``policy`` Kafka topic.

    The record is serialised as UTF-8 JSON (non-ASCII characters are kept
    as-is) and the call blocks for up to 10 seconds waiting for the broker
    acknowledgement. Failures are logged, not raised.

    Args:
        dic_news: JSON-serialisable dict describing the article.

    Returns:
        bool: ``True`` when the broker acknowledged the message, ``False``
        when connecting, serialising or sending failed.
    """
    producer = None
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("policy",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Block until the broker confirms (or the wait times out and raises).
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True

    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.error(dic_result)
        return False

    finally:
        # A producer is created per call, so it must be closed per call;
        # otherwise every message leaves a background sender thread and its
        # sockets behind.
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as close_err:
                log.error(f'Kafka producer close failed: {close_err!r}')

# SASAC (国资委) internal departments (内设机构)
def gzw_nsjg():
    """Crawl the article lists of the SASAC internal departments.

    The department names are read from the index page
    ``http://www.sasac.gov.cn/n2588020/index.html``. For every department that
    has an entry in the tables below, each list page is walked, every article
    whose URL is not yet recorded in ``db_storage`` is downloaded and parsed,
    sent to Kafka via ``sendKafka`` and recorded via ``save_data``.

    Failures of a single list page or a single article are logged and
    skipped; a failure to load the department index page propagates to the
    caller.
    """
    request_timeout = 30  # seconds; without a timeout a stalled server blocks the crawl forever
    site_root = 'http://www.sasac.gov.cn/'

    # Department name -> [URL of the first numbered list page, number of list pages].
    # NOTE(review): the 社会责任局 URL ends in "_.html" rather than "_1.html", so the
    # page-number substitution in get_page_nsjg never matches for it — confirm
    # against the live site.
    href_list = {
        '办公厅（党委办公厅）': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590818/n2590820/index_2642999_1.html', 9],
        '综合研究局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591482/n2591484/index_2656923_1.html', 5],
        '政策法规局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590860/n2590862/index_2644230_1.html', 21],
        '规划发展局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590902/n2590904/index_2646556_1.html', 9],
        '财务监管与运行评价局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2590944/n2590946/index_2647546_1.html', 9],
        '产权管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591020/n2591022/index_2648251_1.html', 7],
        '企业改革局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591064/n2591066/index_2648748_1.html', 15],
        '考核分配局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591106/n2591108/index_2649149_1.html', 6],
        '资本运营与收益管理局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591192/n2591194/index_2649585_1.html', 3],
        '科技创新局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591148/n2591150/index_2650085_1.html', 14],
        '社会责任局': ['http://www.sasac.gov.cn/n2588020/n2588072/n23746822/n23746853/index_23747054_.html', 10],
        '综合监督局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591284/n2591286/index.html', 1],
        '监督追责局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591266/n2591268/index_2654822_1.html', 2],
        '企业领导人员管理一局（董事会工作局）': [
            'http://www.sasac.gov.cn/n2588020/n2588072/n2591302/n2591304/index_2657539_1.html', 4],
        '企业领导人员管理二局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591344/n2591346/index_2657636_1.html', 4],
        '党建工作局（党委组织部、党委统战部）': [
            'http://www.sasac.gov.cn/n2588020/n2588072/n2591386/n2591388/index_2656630_1.html', 14],
        '宣传工作局（党委宣传部）': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591426/n2591428/index_2656835_1.html',
                         21],
        '国际合作局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591548/n2591550/index_2657011_1.html', 28],
        '人事局': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591586/n2591588/index_2656275_1.html', 7],
        '行业协会商会党建工作局（行业协会商会工作局）': [
            'http://www.sasac.gov.cn/n2588020/n2588072/n2591626/n2591628/index_2656076_1.html', 4],
        '机关服务管理局（离退休干部管理局）': [
            'http://www.sasac.gov.cn/n2588020/n2588072/n2591644/n2591646/index_2655780_1.html', 9],
        '机关党委': ['http://www.sasac.gov.cn/n2588020/n2588072/n2591684/n2591686/index_2655222_1.html', 33],
        '党委巡视工作办公室、国资委巡视组': [
            'http://www.sasac.gov.cn/n2588020/n2588072/n2591770/n2591772/index_2655029_1.html', 8],
        '中央纪委国家监委驻国资委纪检监察组': ['http://www.sasac.gov.cn/n2588020/n2877928/n2878219/index_2879099_1.html', 18]}

    # Department name -> relationId placed in the outgoing record's label.
    tids = {'办公厅（党委办公厅）': 1643, '综合研究局': 1644, '政策法规局': 1645, '规划发展局': 1646, '财务监管与运行评价局': 1647, '产权管理局': 1648,
            '企业改革局': 1649, '考核分配局': 1650, '资本运营与收益管理局': 1651, '科技创新局': 1652, '社会责任局': 2064, '综合监督局': 1653,
            '监督追责局': 1654,
            '企业领导人员管理一局（董事会工作局）': 1655, '企业领导人员管理二局': 1656, '党建工作局（党委组织部、党委统战部）': 1657, '宣传工作局（党委宣传部）': 1658,
            '国际合作局': 1659, '人事局': 1660, '行业协会商会党建工作局（行业协会商会工作局）': 1661, '机关服务管理局（离退休干部管理局）': 1662, '机关党委': 1663,
            '党委巡视工作办公室、国资委巡视组': 1664, '中央纪委国家监委驻国资委纪检监察组': 1874}

    def download(url, *, force_utf8=False):
        """GET ``url`` and return its body as text; the response is always closed."""
        resp = requests.get(url=url, headers=headers, verify=False, timeout=request_timeout)
        try:
            if force_utf8:
                # Decode the raw bytes directly. This yields the same text as
                # the former text.encode("ISO-8859-1").decode("utf-8") round
                # trip, but does not fail when requests already picked UTF-8.
                return resp.content.decode('utf-8')
            resp.encoding = resp.apparent_encoding
            return resp.text
        finally:
            resp.close()

    def make_soup(url):
        """Download an article-type page and parse it with html.parser."""
        return BeautifulSoup(download(url), 'html.parser')

    def already_collected(url):
        """Return True (and log the skip) when ``url`` is already recorded in db_storage."""
        if db_storage.find_one({'网址': url}):
            log.info('已采集----------跳过')
            return True
        return False

    def parse_header(text):
        """Split the title block text into (title, publishDate, pub_source).

        Raises IndexError when the '发布时间：' marker is absent.
        """
        publishDate = text.split('发布时间：')[1].strip()
        if '文章来源：' in text:
            title_part = text.split('文章来源：')[0]
            pub_source = text.split('文章来源：')[1].split('发布时间：')[0].strip()
        else:
            title_part = text.split('发布时间：')[0]
            # No source marker on the page: use an empty source instead of
            # leaving the variable unset or inheriting the previous article's.
            pub_source = ''
        title = title_part.replace('\n', '').replace('\r', '').strip()
        return title, publishDate, pub_source

    def parse_article(soup_):
        """Return (div_content, title, publishDate, pub_source) for an article page.

        Raises AttributeError / IndexError when the page does not have the
        expected 'zsy_content' / 'zsy_cotitle' layout.
        """
        div_content = soup_.find('div', attrs={'class': 'zsy_content'})
        pub_result = div_content.find('div', attrs={'class': 'zsy_cotitle'})
        return (div_content,) + parse_header(str(pub_result.text))

    def resolve_articles(url):
        """Turn one list-page link into zero or more parsed articles.

        Three page layouts are tried in turn: a plain article, a stub whose
        title block contains a ``location.href="..."`` redirect, and a topic
        page holding a 'yaoqiu_list' of further article links.

        Returns a list of (final_url, div_content, title, publishDate, pub_source).
        """
        page_soup = make_soup(url)
        try:
            return [(url,) + parse_article(page_soup)]
        except (AttributeError, IndexError):
            pass  # not a plain article; try the other layouts below

        # Layout 2: redirect stub.
        try:
            stub = page_soup.find('div', attrs={'class': 'zsy_cotitle'})
            target = str(stub.text).split('location.href="')[1].split('";')[0].strip()
        except (AttributeError, IndexError):
            target = None
        if target is not None:
            # The record is stored under the redirect target, so that is the
            # URL that has to be checked to avoid re-sending it on every run.
            if already_collected(target):
                return []
            page_soup = make_soup(target)
            try:
                return [(target,) + parse_article(page_soup)]
            except (AttributeError, IndexError):
                pass  # fall through and look for a topic list on the target page

        # Layout 3: topic page linking to several articles.
        yaoqiu_list = page_soup.find('div', attrs={'class': 'yaoqiu_list'})
        if yaoqiu_list is None:
            log.error(f'{url}===页面结构无法识别，跳过')
            return []
        articles = []
        for li_ in yaoqiu_list.find_all('li'):
            sub_url = None
            try:
                href_ = li_.find('a').get('href')
                sub_url = href_.replace('../../../', site_root)
                if already_collected(sub_url):
                    continue
                # Every linked article is kept, not only the last one of the loop.
                articles.append((sub_url,) + parse_article(make_soup(sub_url)))
            except Exception as exc:
                log.error(f'{sub_url or url}===专题子页面解析失败: {exc!r}')
        return articles

    def publish(ting_type, relationId, real_href, div_content, title, publishDate, pub_source):
        """Build the outgoing record for one article, send it and record it.

        Returns True when the article was sent and recorded, False when it was
        skipped.
        """
        if 'location.href' in title:
            return False
        if '404 Ba' in str(div_content):
            return False
        contentWithTag = div_content.find('div', class_='zsy_comain')
        if contentWithTag is None:
            log.error(f'{real_href}===获取正文失败')
            return False
        qr_box = contentWithTag.find('div', id='qr_container')
        if qr_box is not None:
            qr_box.decompose()
        # Drop <style> tags from the body.
        for styleTag in contentWithTag.find_all('style'):
            styleTag.extract()
        content = contentWithTag.text
        if content == '':
            log.error(f'{real_href}===获取正文失败')
            return False
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_news = {
            'attachmentIds': [],
            'author': '',
            'content': content,
            'contentWithTag': str(contentWithTag),
            'createDate': time_now,
            'deleteFlag': 0,
            'id': '',
            'labels': [{'relationId': relationId, 'relationName': ting_type, 'labelMark': "policy"}],
            'origin': pub_source,
            'organ': '',
            'topicClassification': '',
            'issuedNumber': '',
            'publishDate': publishDate,
            'writtenDate': None,
            'sid': '1697458829758697473',
            'sourceAddress': real_href,
            'summary': '',
            'title': title
        }
        # Only an explicit False counts as failure; a None return (no status
        # reported) is treated as success. A failed send must not be recorded
        # as collected, otherwise the article would never be retried.
        if sendKafka(dic_news) is False:
            log.error(f'{real_href}===Kafka发送失败，未入库，待下次重试')
            return False
        save_data(dic_news)
        log.info(f'{ting_type}-----{title}----发送成功', )
        return True

    # Fetch and process the list pages of one department.
    def get_page_nsjg(href, ting_type, relationId, page):
        """Walk ``page`` list pages starting at ``href`` and publish new articles.

        ``ting_type`` is the department name and ``relationId`` its id; both
        go into the outgoing record's label.
        """
        start_time = time.time()
        num = 0
        for pageNo in range(1, page + 1):
            # Numbered pages are reached by bumping the "_<n>.html" suffix; the
            # final iteration fetches the section's index.html instead.
            if pageNo != 1:
                href = href.replace(f'_{pageNo - 1}.html', f'_{pageNo}.html')
            if pageNo == page:
                tag = href.split('/')[-1]
                href = href.replace(tag, 'index.html')

            soup = None
            for attempt in (1, 2):  # one retry, as before
                try:
                    soup = paserUrl(BeautifulSoup(download(href, force_utf8=True), 'html.parser'), href)
                    break
                except Exception as exc:
                    log.error(f'{href}===列表页获取失败(第{attempt}次): {exc!r}')
            if soup is None:
                continue

            # Preferred container is ul.ld-tjywList; pages without it fall back
            # to every <li> in the document.
            ul = soup.find('ul', attrs={'class': 'ld-tjywList'})
            li_list = ul.find_all('li') if ul is not None else soup.find_all('li')

            for li in li_list:
                anchor = li.find('a')
                real_href = anchor.get('href') if anchor is not None else None
                if not real_href:
                    continue
                if already_collected(real_href):
                    continue
                try:
                    articles = resolve_articles(real_href)
                except Exception as exc:
                    log.error(f'{real_href}===采集失败: {exc!r}')
                    continue
                for article in articles:
                    try:
                        if publish(ting_type, relationId, *article):
                            num += 1
                    except Exception as exc:
                        log.error(f'{article[0]}===处理失败: {exc!r}')
        end_time = time.time()
        print(f'抓取{num}条数据，共耗时{end_time - start_time}')

    # Look up the list URL / page count of a department and crawl it.
    def get_page_nsjg_list(href, institution, tid):
        """Crawl ``institution`` using its entry in ``href_list``.

        ``href`` (the link scraped from the index page) is accepted but not
        used; the list URL comes from ``href_list``.
        """
        href_, page = href_list[institution][0], href_list[institution][1]
        get_page_nsjg(href_, institution, tid, page)

    # Entry point
    def gzw_nsjg_start():
        """Read the department names from the index page and crawl each one."""
        url = 'http://www.sasac.gov.cn/n2588020/index.html'
        tree = etree.HTML(download(url, force_utf8=True))
        all_institution = []
        institution = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/text()')[0].replace('\n', '').replace('\r',
                                                                                                                '')
        institution_href = tree.xpath('/html/body/div[4]/div[2]/div/dl[1]/dt/a/@href')[0].replace('../', site_root)
        all_institution.append([institution, institution_href])
        for dd in tree.xpath('/html/body/div[4]/div[2]/div/dl[2]/dd'):
            names = dd.xpath('./a/text()')
            links = dd.xpath('./a/@href')
            if not names or not links:
                continue  # <dd> without a link: nothing to crawl
            all_institution.append([names[0].replace('\n', '').replace('\r', ''),
                                    links[0].replace('../', site_root)])

        for institution, href in all_institution:
            # A department that was added or renamed on the site must not
            # abort the crawl of all the others.
            if institution not in tids or institution not in href_list:
                log.error(f'{institution}===未配置的厅局，跳过')
                continue
            log.info(f'\n================厅局类别==={institution}========================')
            get_page_nsjg_list(href, institution, tids[institution])

    gzw_nsjg_start()


if __name__ == '__main__':
    # Run a single crawl; any error that escapes gzw_nsjg() is logged rather
    # than raised.
    try:
        gzw_nsjg()
    except Exception as exc:
        log.error(exc)
    # Disabled: sleep until the next midnight (for a daily loop).
    #current_time = datetime.datetime.now()
    #midnight_time = current_time.replace(hour=0, minute=0, second=0, microsecond=0) + datetime.timedelta(days=1)
    #sleep_seconds = (midnight_time - current_time).total_seconds()
    #time.sleep(sleep_seconds)
    # Disabled: create an ExcelWriter object.
    # writer = pd.ExcelWriter('国务院厅局.xlsx')
