import random
import time
from tqdm import tqdm
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
import urllib3
from base.BaseCore import BaseCore
# Project helper object; the logger, database handles and proxy used by this
# script are all obtained from it.
baseCore = BaseCore()
log = baseCore.getLogger()

# Silence urllib3's InsecureRequestWarning for the remainder of the run.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Database connection and cursor exposed by BaseCore (not referenced again in
# the visible part of this script).
cnx, cursor = baseCore.cnx, baseCore.cursor

# Browser-like request headers used for every request to www.sasac.gov.cn.
# Long values are split with implicit string concatenation purely for
# readability; the resulting strings are unchanged.
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': (
        'Hm_lvt_fa835457efbc11dfb88752e70521d23b=1690184499; '
        'Hm_lpvt_fa835457efbc11dfb88752e70521d23b=1690184499; '
        'SF_cookie_1=98184645; '
        'Hm_lvt_2b5618a441c142a90e1a75f4b226c252=1690189470; '
        'Hm_lpvt_2b5618a441c142a90e1a75f4b226c252=1690189470; '
        'zh_choose=n; '
        'wdcid=30ffdae06d11dbde; '
        'wdlast=1690189470; '
        'wdses=13ee59561f2fb725'
    ),
    'Host': 'www.sasac.gov.cn',
    'Pragma': 'no-cache',
    'Referer': (
        'https://www.baidu.com/link'
        '?url=CcQEFfXAeQsxu1IlLlxj8WHugAcJ7sBjOBqvZYDfN7WE6OZpSUM4prK6DiADOqTP'
        '&wd=&eqid=d507a037000987780000000364be37d4'
    ),
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/114.0.0.0 Safari/537.36'
    ),
}
# Create the ExcelWriter; the loop further down writes one sheet per department into it.
writer = pd.ExcelWriter('国务院厅局.xlsx')
url = 'http://www.sasac.gov.cn/n2588020/index.html'
ip = baseCore.get_proxy()
# `headers` must be passed by keyword: the second positional parameter of
# requests.get is `params`, so passing it positionally URL-encodes every header
# into the query string instead of sending it as an HTTP header.
res = requests.get(url, headers=headers, proxies=ip)
soup = BeautifulSoup(res.content, 'html.parser')
time.sleep(2)
# Department list: every <dd> inside the 'l-jgkk-right column' div of the index page.
list_type = soup.find('div', class_='l-jgkk-right column').find_all('dd')
# Collects articles whose detail page could not be scraped.
list_error = []
def _to_absolute_url(href):
    """Return *href* as-is if it contains 'http', else root it at www.sasac.gov.cn."""
    if 'http' in href:
        return href
    return 'http://www.sasac.gov.cn/' + href.replace('../', '')


def _fetch_soup(page_url, request_headers):
    """Download *page_url*, parse it with html.parser, then pause two seconds."""
    # Headers go in by keyword: the second positional parameter of requests.get
    # is `params`, which would put them in the query string instead.
    response = requests.get(page_url, headers=request_headers)
    parsed = BeautifulSoup(response.content, 'html.parser')
    time.sleep(2)
    return parsed


# Links containing 'http' are followed verbatim and may point at another host,
# so drop the fixed 'Host' entry and let requests derive it from each URL.
page_headers = {key: value for key, value in headers.items() if key != 'Host'}

# NOTE(review): only the first two departments and the first two articles of
# each are processed ([:2]) - this looks like a debugging limit; confirm.
for dept_item in tqdm(list_type[:2]):
    list_news = []
    dept_link = dept_item.find('a')
    if dept_link is None or not dept_link.has_attr('href'):
        # A <dd> without a usable link cannot be followed; skip it instead of
        # aborting the whole run.
        continue
    href_type = dept_link['href']
    ting_type = dept_link.text
    print(f'\n================厅局类别==={ting_type}========================')
    url_type = _to_absolute_url(href_type)
    i_soup = _fetch_soup(url_type, page_headers)

    # Article list of this department. A page without the expected container
    # yields an empty sheet rather than an AttributeError that would lose the
    # sheets collected so far.
    news_container = i_soup.find('div', class_='tjywBottom')
    if news_container is None:
        print(f'No article list found on {url_type}; writing an empty sheet')
        news_list = []
    else:
        news_list = news_container.find_all('li')

    for news in tqdm(news_list[:2]):
        news_link = news.find('a')
        if news_link is None or not news_link.has_attr('href'):
            # List items without a link are not articles.
            continue
        news_url = _to_absolute_url(news_link['href'])
        news_title = news_link.text.split('[')[0]
        print(f'\n----正在采集: {news_title}-------')
        # The publication date is read from the item's <span> with the square
        # brackets removed; fall back to '' when the span is absent.
        date_span = news.find('span')
        if date_span is None:
            pub_time = ''
        else:
            pub_time = date_span.text.replace('[', '').replace(']', '')

        # Article details.
        # TODO: convert relative paths to absolute paths
        ii_soup = _fetch_soup(news_url, page_headers)
        news_info = ii_soup.find('div', class_='zsy_cotitle')
        if news_info is None:
            # Detail page lacks the expected header block; record it as not scraped.
            list_error.append({
                '标题': news_title,
                '原文链接': news_url,
                '厅局类别': ting_type,
            })
            continue

        try:
            # The source name sits between the '文章来源：' and '发布时间' markers.
            pub_source = news_info.find('p').text.split('文章来源：')[1].split('发布时间')[0]
        except (AttributeError, IndexError):
            # No <p> element, or the source marker is missing.
            pub_source = ''

        body_tag = ii_soup.find('div', class_='zsy_comain')
        if body_tag is None:
            content = ''
        else:
            # Strip the "scan to open on mobile" widget text from the body.
            content = body_tag.text.replace('扫一扫在手机打开当前页', '').strip()

        list_news.append({
            '标题': news_title,
            '发布时间': pub_time,
            '来源': pub_source,
            '内容': content,
            '原文链接': news_url,
        })

    df = pd.DataFrame(list_news)
    # Write each department's articles into its own sheet.
    df.to_excel(writer, sheet_name=ting_type, index=False)

    print(f'=============当前sheet页{ting_type}---数据总数：{len(df)}================')
    time.sleep(1)
# Flush the workbook to disk. ExcelWriter.save() was deprecated in pandas 1.5
# and removed in 2.0; close() is the supported call and also exists in older
# pandas releases.
writer.close()
# Store the articles that could not be scraped in a separate workbook.
df_error = pd.DataFrame(list_error)
df_error.to_excel('未采到文章.xlsx', index=False)




