import os

import openpyxl
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from urllib.parse import urljoin

import BaseCore
# Shared project helper: supplies the logger as well as the upload and Excel
# utilities (uptoOBS, tableUpdate, check_excel_file, writerToExcel) used below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Relative data directory.
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
filepath = "data/"

class Policy():
    """HTTP, Selenium and HTML clean-up helpers shared by the crawler functions."""

    # Seconds to wait for a server before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Tags that count as content even though they carry no text.
    _KEEP_TAGS = ("img", "video", "br")

    # Lower-cased extensions that mark a link as a downloadable attachment.
    # '.doc', '.xls' and '.ppt' also match '.docx', '.xlsx' and '.pptx'.
    _ATTACHMENT_EXTS = ('.pdf', '.docx', '.doc', '.xls', '.zip', '.rar', '.ppt')

    def getrequest_soup(self, headers, url, timeout=REQUEST_TIMEOUT):
        """GET ``url`` and return the response body parsed with 'html.parser'."""
        req = requests.get(headers=headers, url=url, timeout=timeout)
        result = BeautifulSoup(req.content, 'html.parser')
        return result

    def getrequest_json(self, headers, url, timeout=REQUEST_TIMEOUT):
        """GET ``url`` and return the decoded JSON body."""
        req = requests.get(headers=headers, url=url, timeout=timeout)
        result = req.json()
        return result

    def requestPost(self, headers, url, payload, timeout=REQUEST_TIMEOUT):
        """POST ``payload`` as form data to ``url`` and return the decoded JSON body."""
        req = requests.post(headers=headers, url=url, data=payload, timeout=timeout)
        data_json = req.json()
        return data_json

    def createDriver(self,
                     chrome_driver=r'D:\cmd100\chromedriver.exe',
                     chrome_binary=r'D:\Google\Chrome\Application\chrome.exe'):
        """Start and return a Chrome WebDriver.

        :param chrome_driver: path of the chromedriver executable.
        :param chrome_binary: path of the Chrome browser executable.
        :return: a new ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
        """
        service = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = chrome_binary
        # To route traffic through a proxy, add for example:
        # chrome_options.add_argument('--proxy-server=http://127.0.0.1:8080')
        # Selenium 4 takes the options object via ``options=``; the legacy
        # ``chrome_options=`` keyword is deprecated/removed there.
        driver = webdriver.Chrome(service=service, options=chrome_options)
        return driver

    def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
        """Remove the first ``i`` ``tag`` elements whose attribute equals the given value.

        ``soup`` is modified in place.
        """
        matches = soup.find_all(tag, {attribute_to_delete: value_to_delete})
        for match in matches[:i]:
            match.decompose()

    def deletek(self, soup):
        """Remove blank tags (e.g. ``<p></p>``) from ``soup`` in place.

        A tag is a candidate when its text is empty (and it is not itself an
        img/video/br) or when its text is exactly one space. Candidates that
        contain an img, video or br descendant are kept.
        """
        def is_blank(tag):
            text = tag.get_text()
            return (len(text) == 0 and tag.name not in self._KEEP_TAGS) or text == ' '

        for candidate in soup.find_all(is_blank):
            if any(child.name in self._KEEP_TAGS for child in candidate.descendants):
                continue
            candidate.decompose()

    def paserUrl(self, html, listurl):
        """Rewrite relative ``href``/``src`` values of <a> and <img> tags as absolute URLs.

        :param html: markup string or an already parsed BeautifulSoup node.
        :param listurl: base URL the relative links are resolved against.
        :return: the parsed (and modified) node.
        """
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')

        for link in html.find_all(['a', 'img']):
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html

    def attuributefile(self, file_name, file_href, num, publishDate):
        """Hand an attachment link to ``baseCore`` for upload and registration.

        Per the original comment the attachment is downloaded and uploaded to
        the file server; the actual work happens in ``baseCore.uptoOBS`` and
        ``baseCore.tableUpdate``.

        :return: ``(att_id, full_path)`` on success. ``('', '')`` when the link
            is not a recognised attachment or the upload reports failure, so
            callers can always unpack two values.
        """
        if not isinstance(file_href, str):
            return '', ''
        href_lower = file_href.lower()
        if not any(ext in href_lower for ext in self._ATTACHMENT_EXTS):
            return '', ''

        # Make sure the stored name carries the extension found in the link.
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '9999', file_name)
        if not retData['state']:
            return '', ''
        att_id, full_path = baseCore.tableUpdate(retData, 'RETIs文件', file_name, num, publishDate)
        return att_id, full_path

    def downloadfile(self, file_name, file_href, path, timeout=REQUEST_TIMEOUT):
        """Download ``file_href`` and write the raw bytes to ``path``.

        ``file_name`` is accepted for interface compatibility but not used.
        """
        response = requests.get(file_href, timeout=timeout)

        with open(path, "wb") as file:
            file.write(response.content)



# Module-level helper instance shared by the crawler functions below.
policy = Policy()
# National Development and Reform Commission (NDRC) https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=
def reform():
    """Collect REITs documents from the NDRC search API and write them to Excel.

    The first result page (20 hits) of the search API is fetched. For every
    hit the article page is downloaded, the body, document number and source
    are extracted, and the record is appended to ``DataList``. All records
    are written to the sheet "国家发展和改革委员会" of
    ``../data/REITs专题数据.xlsx`` once, after the loop.

    A failure on one article is logged and does not stop the run.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'fwfx.ndrc.gov.cn',
        'Origin': 'https://www.ndrc.gov.cn',
        'Referer': 'https://www.ndrc.gov.cn/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    # Headers for the article pages; identical for every hit, so built once.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_6c8165462fd93121348afe212168341f=1699338341; yfx_c_g_u_id_10005970=_ck23110714254113251712738304141; http_waf_cookie=05e8486c-c47f-4927291823a10f5e24ceed45b1eaa3eb7354; SF_cookie_3=21321202; Hm_lpvt_6c8165462fd93121348afe212168341f=1699422316; yfx_f_l_v_t_10005970=f_t_1699338341317__r_t_1699412780356__v_t_1699422316031__r_c_1',
        'Host': 'www.ndrc.gov.cn',
        'Referer': 'https://www.ndrc.gov.cn/xxgk/wjk/index.html?tab=all&qt=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    url = 'https://fwfx.ndrc.gov.cn/api/query?qt=REITs&tab=all&page=1&pageSize=20&siteCode=bm04000fgk&key=CAB549A94CF659904A7D6B0E8FC8A7E9&startDateStr=&endDateStr=&timeOption=0&sort=dateDesc'
    file_name = '../data/REITs专题数据.xlsx'
    sheet_name = "国家发展和改革委员会"

    result = policy.getrequest_json(headers, url)
    data_list = result['data']['resultList']
    DataList = []
    # Number of records actually collected; doubles as the serial number.
    num = 0
    for info in data_list:
        newsUrl = info.get('url', '')
        try:
            publishDate = info['docDate']
            title = info['title']
            summary = info['summary']
            newssoup = policy.getrequest_soup(header, newsUrl)

            # The body lives in one of two container variants.
            containers = (newssoup.select('div[class="article_con article_con_notitle"]')
                          or newssoup.select('div[class="article_con article_con_title"]'))
            if not containers:
                log.info(f"no article body found, skipped: {newsUrl}")
                continue
            contentWithTag = containers[0]

            # The document number, when present, is the first <span> directly
            # under the container and contains the '〔' bracket. It is read
            # before the clean-up below modifies the tree.
            pubHao = ''
            pubhao_spans = newssoup.select('div[class="article_con article_con_notitle"]>span')
            if pubhao_spans and '〔' in pubhao_spans[0].text:
                pubHao = pubhao_spans[0].text

            policy.deletep(contentWithTag, 3, 'div', 'style', 'text-align: center;')
            policy.deletek(contentWithTag)
            content = contentWithTag.text

            source = ''
            source_spans = newssoup.select('div[class="ly laiyuantext"]>span')
            if source_spans:
                source = source_spans[0].text

            num += 1
            DataList.append({
                '序号': num,
                '标题': title,
                '时间': publishDate,
                '来源': source,
                '原文链接': newsUrl,
                '发文字号': pubHao,
                '摘要': summary,
                '正文': content,
                '附件名称': '',
                '附件链接': '',
            })
        except Exception as e:
            # Best effort: one broken article must not abort the whole crawl.
            log.info(f"error！！！{newsUrl}")
            log.info(f"{type(e).__name__}: {e}")

    # Write once after the loop instead of re-writing the growing list for
    # every article.
    if DataList:
        try:
            if not baseCore.check_excel_file(file_name):
                wb = openpyxl.Workbook()
                wb.save(file_name)
                log.info("Excel文件已创建")
            baseCore.writerToExcel(DataList, file_name, sheet_name)
        except Exception as e:
            log.info(f"failed to write {file_name}: {type(e).__name__}: {e}")
    log.info(f'=============处理结束，以采集{num}条数据=================')

# Securities and futures (CSRC laws and regulations database) https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp
def zhengquanqihuo():
    """Crawl REITs-related documents from the CSRC search endpoint and print them.

    The search endpoint is queried page by page. Each hit's detail page is
    rendered with Selenium, the body is taken from the
    'law_text mainBody catalog' div, and the record is printed.
    """
    # Content-Length is deliberately not set: requests derives it from the
    # encoded form body, whose length changes with the page number.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'JSESSIONID=D18F2DF64366325AC0A50E09AA98EE84',
        'Host': 'neris.csrc.gov.cn',
        'Origin': 'https://neris.csrc.gov.cn',
        'Referer': 'https://neris.csrc.gov.cn/falvfagui/multipleFindController/indexJsp',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    url = 'https://neris.csrc.gov.cn/falvfagui/multipleFindController/solrSearch'
    payload = {
        'pageNo': '1',
        'secFutrsLawName': '',
        'body': '"REITs"',
        'lawPubOrgName': '',
        'titleQry': '',
        'keyQry': 'REITs、',
        'fileno': '',
        'pubDate_from': '',
        'pubDate_thru': '',
        'nbr': '1',
        'isLike': '0'
    }
    result = policy.requestPost(headers, url, payload)
    pageUtil = result['pageUtil']
    total = pageUtil['rowCount']
    page_size = pageUtil['pageSize']
    if not total or not page_size:
        return
    # Ceiling division so that a trailing partial page is not dropped.
    Max_page = (int(total) + int(page_size) - 1) // int(page_size)

    # One browser for the whole crawl, always closed at the end; starting a
    # new Chrome per document and never quitting it leaks processes.
    browser = policy.createDriver()
    try:
        for page in range(0, Max_page):
            payload_page = dict(payload, pageNo=page + 1)
            data_page = policy.requestPost(headers, url, payload_page)
            info_list = data_page['pageUtil']['pageList']
            for info in info_list:
                title = info['secFutrsLawName']
                pubHao = info['fileno']
                source = info['lawPubOrgName']
                publish_ = datetime.strptime(info['secFutrsLawVersion'], "%Y%m%d")
                publishDate = datetime.strftime(publish_, "%Y-%m-%d")
                secFutrsLawId = info['secFutrsLawId']
                newsUrl = f'https://neris.csrc.gov.cn/falvfagui/rdqsHeader/mainbody?navbarId=3&secFutrsLawId={secFutrsLawId}&body=REITs'
                browser.get(newsUrl)
                # Give the page a moment to render before reading its source.
                time.sleep(1)
                newssoup = BeautifulSoup(browser.page_source, 'html.parser')
                contentWithTag = newssoup.find('div', class_='law_text mainBody catalog')
                if contentWithTag is None:
                    log.info(f"error！！！{newsUrl}")
                    continue
                content = contentWithTag.text
                print(content)
                dic_info = {
                    'title': title,
                    'publishDate': publishDate,
                    'source': source,
                    'pub_hao': pubHao,
                    'contentWithTag': contentWithTag,
                    'content': content
                }
                print(dic_info)
    finally:
        browser.quit()

# Shenzhen Stock Exchange http://www.szse.cn/lawrules/index.html


# Shanghai Stock Exchange http://www.sse.com.cn/home/search/index.shtml?webswd=REITs
def sse():
    """Crawl REITs search hits from the Shanghai Stock Exchange site.

    For every result page the first hit is fetched, its links are made
    absolute, the document number is read from the first centred paragraph,
    and every attachment link is handed to ``policy.attuributefile``.

    :return: list with one record dict per processed article. The function
        previously returned nothing, so existing callers are unaffected.
    """
    def search_url(page):
        # The search API is paged through the zero-based ``page`` parameter.
        return f'http://query.sse.com.cn/search/getESSearchDoc.do?page={page}&limit=10&publishTimeEnd=&publishTimeStart=&orderByDirection=DESC&orderByKey=score&searchMode=fuzzy&spaceId=3&keyword=REITs&siteName=sse&keywordPosition=title%2Cpaper_content&channelId=10001&channelCode=8640%2C8641%2C8642%2C8643%2C8644%2C8645%2C8646%2C8647%2C8648%2C8649%2C8650%2C8651%2C8652%2C8653%2C8654%2C8655%2C8656%2C8657%2C8658%2C8659%2C8660%2C8661%2C8685%2C9348%2C12632%2C12768%2C12769%2C12770%2C12771%2C12772%2C12773%2C12774%2C12775%2C12776%2C12777%2C12778%2C12779%2C12780%2C12781%2C12782%2C12783%2C12784%2C12785%2C12786%2C12787%2C12788%2C12789%2C12790%2C12791%2C12792%2C12793%2C12794%2C12795%2C12796%2C12797%2C12798%2C12799%2C12800%2C12801%2C12802%2C12803%2C12804%2C12805%2C12806%2C12807%2C12808%2C12809%2C12810%2C12811%2C12812%2C13061%2C13282%2C13283%2C13284%2C13285%2C13286%2C13287%2C13288%2C13289%2C13294%2C13364%2C13365%2C13366%2C13367%2C14595%2C14596%2C14597%2C14598%2C14599%2C14600%2C14601%2C14602%2C14603%2C14604%2C14605%2C14606&trackId=50619067167713018335655119683810&_=1699508921761'

    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; ba17301551dcbaf9_gdp_session_id=878c2669-93f0-43bd-91c1-cc30ca7136ef; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:28%2C%22VISIT%22:2%2C%22PAGE%22:2%2C%22CUSTOM%22:17%2C%22VIEW_CLICK%22:10}',
        'Host': 'query.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    # Headers for the article pages; identical for every hit, so built once.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; ba17301551dcbaf9_gdp_session_id=878c2669-93f0-43bd-91c1-cc30ca7136ef; gdp_user_id=gioenc-9a36dgc8%2C6b5d%2C5265%2Ccdc5%2C2ea193d9g222; ba17301551dcbaf9_gdp_session_id_878c2669-93f0-43bd-91c1-cc30ca7136ef=true; VISITED_MENU=%5B%228307%22%5D; seecookie=REITs; home-search-scroll=; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:33%2C%22VISIT%22:2%2C%22PAGE%22:3%2C%22CUSTOM%22:18%2C%22VIEW_CLICK%22:13}',
        'Host': 'www.sse.com.cn',
        'Referer': 'http://www.sse.com.cn/home/search/index.shtml?webswd=REITs',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }

    collected = []
    result = policy.getrequest_json(headers, search_url(0))
    total_page = result['data']['totalPage']
    for page in range(0, int(total_page)):
        data = policy.getrequest_json(headers, search_url(page))
        newslist = data['data']['knowledgeList']
        # NOTE(review): only the first hit of each page is processed; this
        # looks like a debugging limit — confirm before widening it.
        for news in newslist[:1]:
            title = news['title']
            publishDate = news['createTime']
            newsUrl = 'http://www.sse.com.cn' + news['extend'][4]['value']
            summary = news['rtfContent']
            source = news['spaceName']

            newssoup = policy.getrequest_soup(header, newsUrl)
            content_ = newssoup.find('div', class_='allZoom')
            if content_ is None:
                # Page layout differs from the expected one: nothing to parse.
                log.info(f"error！！！{newsUrl}")
                continue
            # Turn relative links into absolute ones.
            contentWithTag = policy.paserUrl(content_, newsUrl)

            # The document number is the first centred paragraph, and only
            # when it contains the '〔' bracket.
            pubHao = ''
            pubhao_tag = contentWithTag.find('p', style='text-align: center;')
            if pubhao_tag is not None:
                candidate = pubhao_tag.text.strip(' ')
                if '〔' in candidate:
                    pubHao = candidate

            content = contentWithTag.text
            id_list = []
            # Running attachment index; advanced only for stored attachments.
            num = 1
            for fujian in contentWithTag.find_all('a'):
                file_href = fujian.get('href')
                if not file_href:
                    continue
                file_name = fujian.text.strip(' ')
                try:
                    uploaded = policy.attuributefile(file_name, file_href, num, publishDate)
                except Exception as e:
                    log.info(f"error！！！{file_href}")
                    log.info(f"{type(e).__name__}: {e}")
                    continue
                # Nothing comes back for links that are not attachments.
                if not uploaded:
                    continue
                att_id, full_path = uploaded
                if att_id and full_path:
                    id_list.append(att_id)
                    num += 1

            # One record per article, whether or not it has attachments.
            collected.append({
                'attachmentIds': id_list,
                'title': title,
                'summary': summary,
                'publishDate': publishDate,
                'source': source,
                'pub_hao': pubHao,
                'contentWithTag': contentWithTag,
                'content': content
            })
    return collected

# Beijing Municipal People's Government https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs

def beijing():
    """Crawl REITs policy search hits from the Beijing municipal government site.

    The search endpoint is queried page by page. For hits labelled
    '政策解读' the article page is fetched to read the body and the source.
    The ``formatRows`` metadata of each hit is merged into the record, and
    entries of the '相关附件' column are handed to ``policy.attuributefile``.

    :return: list with one record dict per hit. The function previously
        returned nothing, so existing callers are unaffected.
    """
    url = 'https://www.beijing.gov.cn/so/ss/query/s'
    payload = {
        'siteCode': '1100000088',
        'tab': 'zcfg',
        'qt': 'REITs',
        'sort': 'relevance',
        'keyPlace': '0',
        'locationCode': '110000000000',
        'page': '1',
        'pageSize': '20',
        'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
    }
    # Content-Length is deliberately not set: requests derives it from the
    # encoded form body, whose length changes with the page number.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
        'Host': 'www.beijing.gov.cn',
        'Origin': 'https://www.beijing.gov.cn',
        'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    # Article pages are fetched with a body-less GET, so the form
    # content-type of the search POST must not be sent along.
    detail_headers = {k: v for k, v in headers.items() if k != 'Content-Type'}

    result = policy.requestPost(headers, url, payload)
    total = result['totalHits']
    page_size = result['currentHits']
    if not total or not page_size:
        return []
    # Ceiling division so that a trailing partial page is not dropped.
    Max_page = (int(total) + int(page_size) - 1) // int(page_size)

    collected = []
    for page in range(0, Max_page):
        payload_page = dict(payload, page=page + 1)
        data = policy.requestPost(headers, url, payload_page)
        info_list = data['resultDocs']
        for info_ in info_list:
            info = info_['data']
            title = info['titleO']
            titleLabel = info['titleLabel']['value']
            publishDate = info['docDate']
            newsUrl = info['url']

            # Body and source are only scraped for policy-interpretation
            # articles; other hits keep the empty defaults.
            source = ''
            content = ''
            if titleLabel == '政策解读':
                newssoup = policy.getrequest_soup(detail_headers, newsUrl)
                contentWithTag = newssoup.find('div', id='mainText')
                if contentWithTag is not None:
                    content = contentWithTag.text
                source_spans = newssoup.select('p[class="fl"]>span')
                if len(source_spans) > 1:
                    source = source_spans[1].text.replace('来源：', '')

            attachment_hrefs = []
            # The record exists before the metadata columns are merged in.
            dic_info = {
                'title': title,
                'publishDate': publishDate,
                'source': source,
                'newsUrl': newsUrl,
                'content': content,
                'file_href': attachment_hrefs
            }

            formatRows = info['formatRows']
            num = 1
            for row in formatRows:
                for col in row['col']:
                    name = col['text']
                    value = col['value']
                    if name == '相关附件' and isinstance(value, dict):
                        # Presumably maps attachment URL -> display name, as
                        # the original keys()/values() usage implies — TODO
                        # confirm against a live response.
                        for file_href, file_name in value.items():
                            # Upload the attachment.
                            policy.attuributefile(file_name, file_href, num, publishDate)
                            attachment_hrefs.append(file_href)
                            num += 1
                        continue
                    # Other columns: keep the first entry of the value.
                    dic_info[name] = value[0] if value else ''

            collected.append(dic_info)
    return collected

            # print(dic_info)
        # break



if __name__ == "__main__":
    # Script entry point: only the NDRC crawler is run.
    reform()
# zhengquanqihuo()