import json
import sys

import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# Extend the import search path before importing BaseCore; the directory
# appended here is presumably where the BaseCore module lives (it is imported
# on the very next line) -- TODO confirm for non-Windows deployments.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

# Shared BaseCore instance and the logger obtained from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Browser-like request headers passed to the requests.get calls in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/113.0.0.0 Safari/537.36'
    ),
}

# "国资要闻" news list on www.sasac.gov.cn -- crawler entry point and helpers.
_LIST_URL = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
_SITE_ROOT = 'http://www.sasac.gov.cn/'
_INFO_CODE = 'IN-20240129-0002'
# Redis set holding the article URLs that were already pushed to Kafka.
_DEDUP_KEY = _INFO_CODE + '-test'
_KAFKA_SERVERS = ['114.115.159.144:9092']
_KAFKA_TOPIC = 'crawlerInfo'
# Seconds to wait for an HTTP response / a Kafka acknowledgement. Without a
# timeout a single stalled connection would block the crawler forever.
_REQUEST_TIMEOUT = 30
_KAFKA_ACK_TIMEOUT = 30


def _fetch_soup(url, verify=True):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    The response is decoded with the encoding detected from its content
    (``apparent_encoding``). Any exception raised by ``requests`` propagates.
    """
    res = requests.get(url=url, headers=headers, verify=verify,
                       timeout=_REQUEST_TIMEOUT)
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, 'html.parser')


def _absolute_href(href):
    """Turn a list-page link into an absolute URL (links containing 'http' are kept)."""
    if 'http' in href:
        return href
    return _SITE_ROOT + str(href).replace('../../', '')


def _extract_body(i_soup, css_class):
    """Return ``(html, text)`` of the first element with *css_class*.

    Raises:
        ValueError: if the page has no such element.
    """
    body = i_soup.find(class_=css_class)
    if body is None:
        raise ValueError(f'article body "{css_class}" not found')
    return str(body), str(body.text)


def _parse_article(i_soup, title):
    """Extract the article fields from a detail page.

    Three page layouts are recognised, tried in this order: ``zsy_cotitle``,
    ``lyshijian`` and ``pages-date``. When *title* is empty, the first two
    layouts recover it from the page header.

    Returns:
        ``(title, pub_source, pub_time, content, content_with_tag)``.

    Raises:
        Exception: (ValueError, AttributeError, IndexError, ...) when the page
        matches none of the layouts or lacks an expected element; the caller
        treats any exception as "skip this article".
    """
    header = i_soup.find(class_='zsy_cotitle')
    if header is not None:
        meta = header.find('p')
        meta_text = meta.text
        pub_source = meta_text.split('发布时间：')[0].replace('文章来源：', '').strip()
        # Stripped like in the other two layouts so no stray whitespace is published.
        pub_time = meta_text.split('发布时间：')[1].strip()
        if title == '':
            # The header holds the title plus the meta paragraph; drop the
            # paragraph so only the title text remains.
            meta.decompose()
            title = header.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
        # Remove each widget independently: a missing first widget must not
        # prevent the second one from being removed.
        for widget_id in ('div_div', 'qr_container'):
            widget = i_soup.find('div', id=widget_id)
            if widget is not None:
                widget.decompose()
        content_with_tag, content = _extract_body(i_soup, 'zsy_comain')
        content = content.replace('扫一扫在手机打开当前页', '')
        return title, pub_source, pub_time, content, content_with_tag

    header = i_soup.find(class_='lyshijian')
    if header is not None:
        spans = header.find_all('span')
        try:
            pub_source = str(spans[0]).split('文章来源：')[1].split('</span>')[0].strip()
            pub_time = str(spans[1]).split('发布时间：')[1].split('</span>')[0].strip()
        except IndexError:
            # No source span: the first span carries the publish time instead.
            pub_time = str(spans[0]).split('发布时间：')[1].split('</span>')[0].strip()
            pub_source = ''
        if title == '':
            header.find('p').decompose()
            title = header.text.strip()
        content_with_tag, content = _extract_body(i_soup, 'articlecontent')
        return title, pub_source, pub_time, content, content_with_tag

    header = i_soup.find(class_='pages-date')
    if header is None:
        raise ValueError('unrecognised article layout')
    pub_source = header.find('span').text.replace('来源：', '').strip()
    pub_time = header.text.split('来源')[0].strip()
    content_with_tag, content = _extract_body(i_soup, 'pages_content')
    return title, pub_source, pub_time, content, content_with_tag


def _build_record(li, page):
    """Build the Kafka record for one ``<li>`` of a list page.

    Returns None when the item must be skipped: it is empty, has no usable
    link, was already collected (per the Redis set), cannot be downloaded or
    parsed, or ends up without a title. Uses the module-level Redis client ``r``.
    """
    # A bs4 Tag is always truthy; its len() is the number of child nodes.
    if len(li) == 0:
        return None
    a = li.find('a')
    if a is None or not a.has_attr('href'):
        # Previously this raised outside any try block and aborted the crawl.
        return None
    href = _absolute_href(a['href'])
    try:
        if r.sismember(_DEDUP_KEY, href):
            return None
    except Exception as exc:
        log.info(f'redis dedup check failed--{href}--{exc!r}')
        return None
    title = a.get('title', '')
    try:
        i_soup = _fetch_soup(href, verify=False)
        title, pub_source, pub_time, content, content_with_tag = _parse_article(i_soup, title)
    except Exception as exc:
        log.info(f'article skipped----{page}--{href}--{exc!r}')
        return None
    if title == '':
        log.info(f'title为空----{page}--{title}--{href}')
        return None
    return {
        'id': '',
        'sid': '1751810519211053058',
        'title': title,
        'organ': pub_source,
        'origin': '国务院国有资产监督管理委员会',
        'source': 16,
        'content': content,
        'contentWithTag': content_with_tag,
        'publishDate': pub_time,
        'sourceAddress': href,
    }


def gzyw():
    """Crawl every page of the news list and publish new articles to Kafka.

    The first list page is fetched to read the pagination id and the total
    page count. Each article link not yet present in the Redis set
    ``IN-20240129-0002-test`` is downloaded, parsed and sent as UTF-8 JSON to
    the ``crawlerInfo`` topic; its URL is added to the Redis set only after
    Kafka acknowledged the message. Failures of a single page or article are
    logged and skipped.

    Relies on module-level names: ``r`` (Redis client), ``log`` and ``headers``.

    Raises:
        Exception: if the first list page cannot be fetched or its pagination
        cell cannot be parsed (nothing can be crawled in that case).
    """
    index_soup = _fetch_soup(_LIST_URL)
    # pages = soup.find('td',id='pag_4278129')
    pager = index_soup.find('td', class_='pages')
    pages_tag = pager['id'].split('pag_')[1]
    total_pages = int(str(pager).split(f'maxPageNum{pages_tag}=')[1].split('";')[0])

    producer = None  # created lazily, then reused for every article
    try:
        for page in range(1, total_pages + 1):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = _LIST_URL
            else:
                # Older pages count downwards, e.g.
                # http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
                url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{total_pages + 1 - page}.html'
            try:
                soup = _fetch_soup(url)
            except Exception as exc:
                log.info(f'list page skipped----{page}--{url}--{exc!r}')
                continue

            container = soup.find('span', id=f'comp_{pages_tag}')
            # Fall back to every <li> on the page when the list container is absent.
            scope = container if container is not None else soup
            for li in scope.find_all('li'):
                record = _build_record(li, page)
                if record is None:
                    continue
                href = record['sourceAddress']
                log.info(f'{page}--{record["title"]}--{href}')
                try:
                    if producer is None:
                        producer = KafkaProducer(bootstrap_servers=_KAFKA_SERVERS)
                    payload = json.dumps(record, ensure_ascii=False).encode('utf8')
                    # send() is asynchronous: wait for the acknowledgement so a
                    # failed delivery is not recorded as collected in Redis.
                    producer.send(_KAFKA_TOPIC, payload).get(timeout=_KAFKA_ACK_TIMEOUT)
                    r.sadd(_DEDUP_KEY, href)
                    log.info('发送kafka成功！')
                except Exception as e:
                    log.info(e)
    finally:
        if producer is not None:
            producer.close()

if __name__ == "__main__":
    # gzyw() looks up the module-level name `r` for its Redis calls
    # (sismember / sadd), so the client must be bound before the crawl starts.
    # NOTE(review): connection details are hard-coded here; consider moving
    # them to configuration.
    r = redis.Redis(
        host='114.115.236.206',
        port=6379,
        password='clbzzsn',
        db=5,
    )
    gzyw()