import datetime
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper (its module is not shown in this file). This script
# uses its `headers`, `db_storage`, `paserUrl`, `sendKafka` and `save_data`
# members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper (its module is not shown in this file). This script
# uses its `getLogger`, `uptoOBS` and `tableUpdate` members.
from BaseCore import BaseCore
baseCore = BaseCore()
# Shared logger used for progress and summary messages.
log = baseCore.getLogger()

# Shanghai
_SH_SITE_ROOT = 'https://www.gzw.sh.gov.cn'
_SH_LIST_DIR = _SH_SITE_ROOT + '/shgzw_flfg_zcfg_gfxwj'
# Seconds to wait on a request before giving up, so a stalled server cannot
# hang the crawl forever.
_SH_REQUEST_TIMEOUT = 30
# Matches "1." style numbering used to spot short trailing attachment lines.
_SH_NUMBERED_LINE_RE = re.compile(r'\d+\.')
# Substrings (compared case-insensitively) that mark a link as a downloadable
# file; '.doc' also covers '.docx' and '.xls' also covers '.xlsx'.
_SH_ATTACHMENT_MARKERS = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt')
# Prefix put in front of the stored path when attachment links are rewritten.
_SH_ATTACHMENT_HOST = 'http://zzsn.luyuen.com/'


def _sh_fetch(url):
    """Download *url* and return the response body as text."""
    # NOTE(review): TLS certificate verification is disabled here; confirm
    # this is still required for the target site.
    return requests.get(url=url, headers=baseTool.headers, verify=False,
                        timeout=_SH_REQUEST_TIMEOUT).text


def _sh_number_after_hu(text):
    """Return '沪…号' built from the text after the first '沪', or None if absent."""
    parts = text.split('沪')
    if len(parts) < 2:
        return None
    return '沪' + parts[1].split('号')[0].strip() + '号'


def _sh_raw_issued_number(doc_href):
    """Look for the issued number in the meta tags, then the list, then the paragraphs."""
    found = _sh_number_after_hu(str(doc_href('.detail_03')('meta')))
    if found is None:
        found = _sh_number_after_hu(doc_href('.detail_03 ul').text())
    if found is None:
        # Last resort: everything before the first '号' in the paragraph text.
        found = doc_href('.detail_03 p').text().split('号')[0].strip() + '号'
    return found


def _sh_normalize_issued_number(pub_result):
    """Return the issued number if it has a recognised bracket pair, else ''."""
    if '﹝' in pub_result and '﹞' in pub_result:
        pub_hao = pub_result.replace('﹝', '〔').replace('﹞', '〕')
    elif any(left in pub_result and right in pub_result
             for left, right in (('〔', '〕'), ('【', '】'), ('[', ']'))):
        pub_hao = pub_result
    else:
        pub_hao = ''
    # Anything longer than 20 characters is treated as a false match.
    return pub_hao if len(pub_hao) <= 20 else ''


def _sh_upload_attachments(content, num, pub_time):
    """Upload files linked from the first <ul> of *content*; return their ids.

    Each successfully uploaded link has its href rewritten in place to point
    at the stored copy.
    """
    id_list = []
    fu_jian_soup = content.find('ul')
    anchors = fu_jian_soup.find_all('a') if fu_jian_soup is not None else []
    for a in anchors:
        fu_jian_href = a.get('href')
        if not fu_jian_href:
            # An anchor without a target cannot be an attachment.
            continue
        lowered = fu_jian_href.lower()
        if not any(marker in lowered for marker in _SH_ATTACHMENT_MARKERS):
            continue
        file_name = a.text
        category = os.path.splitext(fu_jian_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(fu_jian_href, '1671', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '上海市国资委', file_name, num, pub_time)
        id_list.append(att_id)
        # Write the returned address back into the parsed document.
        a['href'] = _SH_ATTACHMENT_HOST + str(full_path)
    return id_list


def _sh_publish_article(title, pub_time, href, num):
    """Fetch one article, send it to Kafka and store it; return True if stored.

    Raises whatever the fetch/parse steps raise (for example when the page
    lacks the expected source line); the caller logs and skips the article.
    """
    href_text = _sh_fetch(href)
    doc_href = pq(href_text)
    doc_href_ = BeautifulSoup(href_text, 'html.parser')
    # Convert relative paths to absolute ones.
    doc_href_ = baseTool.paserUrl(doc_href_, href)
    info_list = doc_href_.find_all('span', style='text-align: center;margin-left: 42%;')
    pub_source = info_list[1].find('b').text.split('信息来源：')[1]
    content = doc_href_.find('div', attrs={'class': 'detail_03'})
    if content is None:
        # find() returns None (not a string) when the body container is missing.
        log.info(f'{href}-----{title}----内容为空')
        return False

    # Remove short numbered lines (attachment listings) from the article tail.
    for p in content.find_all('p')[-22:]:
        p_text = p.text
        if len(p_text) <= 50 and _SH_NUMBERED_LINE_RE.search(p_text):
            p.extract()

    pub_hao = _sh_normalize_issued_number(_sh_raw_issued_number(doc_href))
    id_list = _sh_upload_attachments(content, num, pub_time)

    log.info(title)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # Fields passed to Kafka.
    dic_news = {
        'attachmentIds': id_list,
        'author': '',
        'content': content.text,
        'contentWithTag': str(content),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1671", 'relationName': "上海市国资委", 'labelMark': "policy"}],
        'origin': pub_source,
        'organ': '',
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'publishDate': pub_time,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    return True


def shang_hai():
    """Crawl the Shanghai SASAC normative-document list and publish new articles.

    Walks list pages 1-6, skips articles whose URL is already present in
    ``baseTool.db_storage``, and for every new article uploads its
    attachments, sends the record to Kafka and saves it. A failure on one
    page or one article is logged and skipped so the rest of the crawl
    continues. Returns nothing; a summary line is logged at the end.
    """
    start = time.time()
    num = 0
    count = 0
    for page in range(1, 7):
        if page == 1:
            url = f'{_SH_LIST_DIR}/index.html'
        else:
            url = f'{_SH_LIST_DIR}/index_{page}.html'
        try:
            doc_resp = pq(_sh_fetch(url))
            doc_items = list(doc_resp('.gqzc_list_right ul li').items())
        except Exception as e:
            # Best-effort crawl: report the broken page and move on.
            log.error(f'{url} list page failed: {e!r}')
            continue

        for doc_item in doc_items:
            href = ''
            try:
                link = doc_item('a')
                title = link.attr('title')
                href = link.attr('href')
                if not title or not href:
                    log.info(f'{url} list item without title or link skipped')
                    continue
                title = title.strip()
                pub_time = doc_item('span').text() + ' 00:00:00'
                # Resolves relative links and leaves absolute ones untouched.
                href = urljoin(url, href)
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                if _sh_publish_article(title, pub_time, href, num):
                    num += 1
                    count += 1
            except Exception as e:
                # Best-effort crawl: one bad article must not stop the others.
                log.error(f'{href} article failed: {e!r}')
    end = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end - start}')

# Run the Shanghai crawl only when this file is executed as a script.
if __name__ == "__main__":
    shang_hai()