from datetime import datetime
from urllib.parse import urljoin

import redis
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
import json
# Silence urllib3's InsecureRequestWarning: reqHtml() sends its requests
# with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def reqHtml(url):
    """Fetch *url* with a browser-like header set and return the body text.

    The request is sent without TLS certificate verification and with a
    10-second timeout. The HTTP status code is printed but not acted on,
    so the body of an error response is returned unchanged.

    Args:
        url: Address to request.

    Returns:
        The response body as text, or an empty string when the request
        itself failed (connection error, timeout, invalid URL, ...).
    """
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': '__jsluid_s=d344baee4a1e027b745a48855ff6539d',
        'Host': 'www.miit.gov.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.miit.gov.cn/zwgk/zcjd/index.html',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    try:
        response = requests.get(url, headers=header, verify=False, timeout=10)
    except requests.RequestException as e:
        # Callers treat '' as "no page"; report the cause instead of
        # discarding it so failed fetches can be diagnosed.
        print(f'url:{url}  request failed: {e!r}')
        return ''
    code = response.status_code
    print(f'url:{url}  信息的采集状态码{code}')
    return response.text
# Convert relative URLs in the HTML into absolute URLs
def paserUrl(html, listurl):
    """Parse *html* and rewrite link and image addresses as absolute URLs.

    Every ``<a>`` and ``<img>`` tag is visited. A tag that has an ``href``
    attribute gets it resolved against *listurl*; a tag without ``href``
    but with ``src`` gets ``src`` resolved instead.

    Args:
        html: Markup to parse.
        listurl: Base URL that relative addresses are joined to.

    Returns:
        The BeautifulSoup tree with the attributes rewritten in place.
    """
    tree = BeautifulSoup(html, 'html.parser')
    for tag in tree.find_all(['a', 'img']):
        # href takes precedence: when present, src is left untouched.
        attr = 'href' if 'href' in tag.attrs else 'src'
        if attr in tag.attrs:
            tag[attr] = urljoin(listurl, tag[attr])
    return tree


def page_list(first_page=1, last_page=28):
    """Walk the paginated list API and crawl every entry not yet collected.

    For each page the JSON API is requested, the HTML fragment found under
    ``data.html`` is parsed, and every ``<li>`` is turned into a
    title/url/publishdate record. Records whose URL is already a member of
    the Redis set ``IN-20230829-0199-test`` are skipped; the rest are
    handed to ``detail``.

    Relies on the module-level Redis client ``r``.

    Args:
        first_page: First page number to request (inclusive).
        last_page: Last page number to request (inclusive).
    """
    list_page = 'https://www.miit.gov.cn/zwgk/zcjd/index.html'
    # '[i]' is a placeholder that is substituted with the page number.
    api_template = (
        'https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit'
        '?webId=8d828e408d90447786ddbe128d495e9e'
        '&pageId=1b56e5adc362428299dfc3eb444fe23a'
        '&parseType=buildstatic&pageType=column&tagId=右侧内容'
        '&tplSetId=209741b2109044b5b7695700b2bec37e'
        '&paramJson={"pageNo":[i],"pageSize":"24"}'
    )
    for i in range(first_page, last_page + 1):
        print(f"采集到第{i}页！！")
        page_url = api_template.replace('[i]', str(i))
        raw = reqHtml(page_url)
        # reqHtml returns '' on failure and the API may answer with an
        # unexpected payload; skip that page rather than abort the crawl.
        try:
            fragment = json.loads(raw)['data']['html']
        except (ValueError, KeyError, TypeError) as e:
            print(f'page {i} skipped, unexpected response: {e!r}')
            continue
        if not fragment:
            print(f'page {i} skipped, empty html fragment')
            continue
        soup = paserUrl(fragment, list_page)
        doc = pq(str(soup.prettify()))
        for item in doc('li'):
            ldoc = pq(item)
            title = ldoc('a').attr('title')
            url = ldoc('a').attr('href')
            if not url:
                # A list item without a link has nothing to crawl.
                continue
            try:
                if r.sismember('IN-20230829-0199-test', url):
                    print(f'信息已采集入库{title}')
                    continue
            except redis.RedisError as e:
                # Without a de-duplication answer the item is skipped, but
                # the reason is reported.
                print(f'redis lookup failed for {url}: {e!r}')
                continue
            publishdate = ldoc('span').text()

            dmsg = {
                'title': title,
                'url': url,
                'publishdate': publishdate,
            }
            print(f'列表信息： title:{title}  url:{url} time:{publishdate}')
            detail(dmsg)

def detail(dmsg):
    """Fetch one article page, extract its body, and pass it to sendTokafka.

    The article body is taken from the first ``div`` whose id is
    ``con_con``. When that element has no text, its markup is used as the
    plain content as well.

    Args:
        dmsg: Mapping with the keys ``'url'``, ``'title'`` and
            ``'publishdate'``, as built by ``page_list``.

    Returns:
        None. Any exception is caught and printed so that one bad article
        does not stop the caller's loop.
    """
    try:
        durl = dmsg['url']
        title = dmsg['title']
        publishTime = dmsg['publishdate']
        html = reqHtml(durl)
        if not html:
            # reqHtml signals a failed request with an empty string.
            print(f'detail page empty, skipped: {durl}')
            return
        soup = paserUrl(html, durl)
        containers = soup.select('div[id="con_con"]')
        if not containers:
            print(f'no div#con_con found, skipped: {durl}')
            return
        con = containers[0]
        contentWithTag = con.prettify()
        ddata = {
            'title': title,
            'publishTime': publishTime,
            'sourceAddress': durl,
            # Fall back to the tagged markup when the element has no text.
            'content': con.text or contentWithTag,
            'contentWithTag': contentWithTag,
            'origin': '中华人民共和国工业和信息化部-政务公开-政策解读',
        }
        sendTokafka(ddata)
    except Exception as e:
        # Per-article boundary: report and let the crawl continue.
        print(f'detail failed: {e!r}')

def sendTokafka(ddata):
    """Publish one article record to Kafka and mark its URL as collected.

    The record is JSON-encoded (UTF-8, non-ASCII kept as-is) and sent to
    the ``crawlerInfo`` topic. Only after the broker has acknowledged the
    record is the source URL added to the Redis set
    ``IN-20230829-0199-test``, so an undelivered article is retried on the
    next run instead of being skipped.

    Relies on the module-level Redis client ``r``.

    Args:
        ddata: Mapping with the keys ``'title'``, ``'content'``,
            ``'contentWithTag'``, ``'publishTime'``, ``'sourceAddress'``
            and ``'origin'``. ``'publishTime'`` is expected in
            ``YYYY-MM-DD`` form; an empty value becomes ``'1900-01-01'``.

    Raises:
        ValueError: If ``'publishTime'`` is non-empty but not in
            ``YYYY-MM-DD`` form.
    """
    title = ddata['title']
    content = ddata['content']
    contentWithTag = ddata['contentWithTag']
    # Surrounding whitespace would make strptime reject a valid date.
    publishTime = (ddata['publishTime'] or '').strip()
    if publishTime:
        publishDate = str(datetime.strptime(publishTime, '%Y-%m-%d'))
    else:
        publishDate = '1900-01-01'
    sourceAddress = ddata['sourceAddress']
    origin = ddata['origin']
    sid = '1696452056436424706'
    info_code = 'IN-20230829-0199'
    aa_dict = {
        'content': content,
        'contentWithTag': contentWithTag,
        'id': '',
        'sid': sid,
        'origin': origin,
        'publishDate': publishDate,
        'sourceAddress': sourceAddress,
        'title': title,
        'source': 'python定制采集',
        'type': '',
    }
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        payload = json.dumps(aa_dict, ensure_ascii=False).encode('utf8')
        # send() only queues the record and returns a future. Wait on it so
        # a delivery failure raises here, before the URL is marked as done.
        producer.send("crawlerInfo", payload).get(timeout=30)
        r.sadd(info_code + '-test', sourceAddress)
        print('发送kafka结束')
    except Exception as e:
        print(e)
        print('发送kafka异常！')
    finally:
        producer.close()

if __name__ == '__main__':
    # `r` has to be a module-level name: page_list() and sendTokafka() read
    # it as a global for URL de-duplication.
    # NOTE(review): host and password are hard-coded in source — consider
    # loading them from configuration or the environment.
    r = redis.Redis(host='114.115.236.206', port=6379,password='clbzzsn', db=5)
    page_list()
    print('采集结束===')