import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object. This file reads its `headers` and `db_storage`
# attributes and calls its `paserUrl`, `sendKafka` and `save_data` methods.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core object. This file calls its `getLogger`, `uptoOBS` and
# `tableUpdate` methods; `log` is the logger used for all crawl messages below.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Shaanxi
_SHANXI_BASE_URL = 'https://sxgz.shaanxi.gov.cn/'
_SHANXI_LIST_URL = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
_SHANXI_TIMEOUT = 60
# Lower-case markers compared against the lower-cased link, so '.PDF', '.Docx',
# '.xls', '.xlsx', '.pptx' ... are all recognised.
_SHANXI_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', '.ppt', '.zip', '.rar')


def _shanxi_fetch_html(url):
    """Download *url* and return its decoded text, closing the response on every path."""
    res = requests.get(url=url, headers=baseTool.headers, timeout=_SHANXI_TIMEOUT)
    try:
        res.encoding = res.apparent_encoding
        return res.text
    finally:
        res.close()


def _shanxi_is_attachment(file_href):
    """Return True when *file_href* contains a known attachment extension (any letter case)."""
    lowered = file_href.lower()
    return any(marker in lowered for marker in _SHANXI_ATTACHMENT_MARKERS)


def _shanxi_collect_attachments(content_soup, num, publish_date):
    """Upload the attachments linked in *content_soup*, repoint their links, and return their ids."""
    id_list = []
    for link in content_soup.find_all('a'):
        file_href = link.get('href')
        if not file_href or not _shanxi_is_attachment(file_href):
            continue
        file_name = link.text.strip()
        # Make sure the stored file name carries the extension taken from the link.
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1680', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '陕西省国资委', file_name, num, publish_date)
        id_list.append(att_id)
        # Point the link in the article body at the uploaded copy.
        link['href'] = 'http://obs.ciglobal.cn/' + str(full_path)
    return id_list


def _shanxi_build_news(href, num):
    """Fetch the article at *href* and build its Kafka payload; return None when it has no content."""
    res_text = _shanxi_fetch_html(href)
    i_soup = baseTool.paserUrl(res_text, href)
    title = i_soup.find(class_='m-gk-title').text
    span_list = i_soup.find(class_='ftitle').find_all('span')
    # The first span holds the source and the third the publish time.
    origin = str(span_list[0]).split('<span>')[1].split('</span>')[0]
    raw_date = str(span_list[2]).split('<span>')[1].split('</span>')[0]
    publishDate = time.strftime("%Y-%m-%d %H:%M:%S",
                                time.strptime(raw_date, "%Y/%m/%d %H:%M:%S"))

    content_tag = i_soup.find(class_='scroll_cont')
    if content_tag is None:
        # Without this check str(None) would be parsed and stored as the text "None".
        log.info(f'-----{href}----{title}----内容为空-----')
        return None
    content_soup = BeautifulSoup(str(content_tag), 'html.parser')
    # Drop the id='ztl' node from the body when it is present; its absence is not an error.
    div_tag = content_soup.find(id='ztl')
    if div_tag is not None:
        div_tag.extract()

    id_list = _shanxi_collect_attachments(content_soup, num, publishDate)
    contentWithTag = str(content_soup.prettify())
    content = content_soup.text
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return None

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields sent to Kafka.
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': str(content),
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1680", 'relationName': "陕西省国资委", 'labelMark': "policy"}],
        'origin': origin,
        'organ': "",
        'topicClassification': "",
        'issuedNumber': "",
        'publishDate': publishDate,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }


def shanxi():
    """Crawl the sxgz.shaanxi.gov.cn list page and forward every new article.

    For each ``<li>`` link inside the ``scroll_cont`` container of the list
    page, the article is skipped when its URL is already present in
    ``baseTool.db_storage`` (key ``'网址'``). Otherwise the article page is
    fetched and parsed, its attachments are uploaded through ``baseCore``,
    and the resulting record is sent with ``baseTool.sendKafka``; on success
    it is also stored with ``baseTool.save_data``.

    The crawl is best-effort: a failure on one article is logged and the loop
    moves on, and a failure on the list page is logged and ends the crawl.
    The function takes no arguments, returns nothing, and always logs the
    number of saved articles and the elapsed time at the end.
    """
    num = 0
    count = 0
    start_time = time.time()
    try:
        list_soup = BeautifulSoup(_shanxi_fetch_html(_SHANXI_LIST_URL), 'html.parser')
        result = list_soup.find(class_='scroll_cont')
        if result is None:
            log.error(f'shanxi: list container "scroll_cont" not found at {_SHANXI_LIST_URL}')
            li_list = []
        else:
            li_list = result.find_all('li')
        for li in li_list:
            link = li.find('a')
            # An item without a usable link must not abort the whole crawl.
            if link is None or not link.get('href'):
                continue
            href = link['href']
            if 'http' not in str(href):
                href = _SHANXI_BASE_URL + href
            if baseTool.db_storage.find_one({'网址': href}):
                num += 1
                continue
            try:
                dic_news = _shanxi_build_news(href, num)
                if dic_news is None:
                    continue
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                    num += 1
                    count += 1
            except Exception as e:
                # Keep crawling the remaining articles, but leave a trace of the failure.
                log.error(f'shanxi: failed to process {href}: {e!r}')
    except Exception as e:
        log.error(f'shanxi: crawl of {_SHANXI_LIST_URL} aborted: {e!r}')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    shanxi()