import os

import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
import downPdf
from BaseCore import BaseCore
from datetime import datetime
# Shared project helper; this file uses it for logging (getLogger), directory
# creation (mkPath) and the database handles (cnx / cursor) below.
baseCore = BaseCore()
log =baseCore.getLogger()
# Browser-like request headers (the user-agent and sec-ch-ua values describe
# Chrome 106 on Windows) sent with the HTTP requests made by this script.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    # Disabled cookie header, kept for reference only; it is not sent.
    # 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# Database connection and cursor exposed by BaseCore; job_2 uses them to query
# and insert rows in the `usvsrussia` table.
# NOTE(review): presumably a MySQL connection (pymysql is imported above) --
# confirm against BaseCore.
cnx = baseCore.cnx
cursor = baseCore.cursor

def downFile(url,path,pdf_name):
    """Download ``url`` and store it as ``<pdf_name>.pdf`` inside ``path``.

    The target directory is created through ``baseCore.mkPath`` before the
    request is made. The body is streamed to disk in 1 KiB chunks.

    Args:
        url: Address of the document to download.
        path: Directory the file is written to.
        pdf_name: File name without extension; ``'.pdf'`` is appended.

    Returns:
        The file name that was written (including the ``.pdf`` suffix) on
        success, or ``False`` if anything went wrong. Failures are logged,
        never raised.
    """
    target = None
    file_opened = False
    try:
        baseCore.mkPath(path)
        # proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
        pdf_name = pdf_name +'.pdf'
        target = os.path.join(path, pdf_name)
        # stream=True makes iter_content read from the socket chunk by chunk
        # instead of buffering the whole body first; the context manager
        # releases the connection afterwards.
        # NOTE: verify=False (TLS certificate checks disabled) is kept from the
        # original behaviour.
        with requests.get(url, headers=headers, verify=False, timeout=10, stream=True) as response:
            # Without this check an HTTP error page would be saved as a ".pdf"
            # and reported as a successful download.
            response.raise_for_status()
            with open(target, "wb") as pyFile:
                file_opened = True
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        pyFile.write(chunk)
    except Exception as e:
        log.error(f"出错了----------{e}")
        # Do not leave a truncated file behind if the transfer broke mid-way.
        # Only remove it when this call opened (and thereby truncated) it.
        if file_opened:
            try:
                os.remove(target)
            except OSError:
                pass
        return False
    return pdf_name

def _derive_pdf_title(title):
    """Return the short document designation taken from a listing title.

    The text before the first ``"О`` is used; when the title contains
    ``-рп`` the text up to and including that marker wins instead. If neither
    marker is present the whole stripped title is returned, so callers always
    get a value.
    """
    stripped = title.strip()
    pdftitle = stripped
    if '"О' in stripped:
        pdftitle = stripped.split('"О')[0]
    if '-рп' in stripped:
        pdftitle = stripped.split('-рп')[0] + '-рп'
    return pdftitle


def job_2(start_page=68, end_page=200):
    """Collect presidential documents from publication.pravo.gov.ru.

    Walks the listing pages of the "president" block (30 entries per page).
    For every entry whose PDF URL is not yet present in the ``usvsrussia``
    table it downloads the PDF and inserts one row describing it.

    Args:
        start_page: First listing page index to fetch (inclusive).
        end_page: Listing page index to stop at (exclusive).

    Returns:
        None. Results are written to disk and to the database.
    """
    log.info('----开始采集---俄罗斯国家杂志----')
    # path = 'D:chrome/chromedriver.exe'
    # driverContent = baseCore.buildDriver(path, headless=False)
    site_root = 'http://publication.pravo.gov.ru'
    save_root = r'D:\美国VS俄罗斯制裁'
    # Parameterised statements: the driver escapes every value, so quotes in
    # titles/names and the backslashes of the Windows path are stored intact.
    select_sql = "select * from usvsrussia where url = %s "
    insert_sql = (
        "insert into  usvsrussia (website,url,title,pub_time,state,pdf_name,pdf_path,create_time) "
        "values ('总统令文件',%s,%s,%s,0,%s,%s,now() )"
    )
    for i in range(start_page, end_page):
        if i == 1:
            page_url = site_root + '/documents/block/president'
        else:
            page_url = f'{site_root}/documents/block/president?index={i}&pageSize=30'
        try:
            # headers must be passed by keyword: the second positional
            # argument of requests.get is `params` (the query string).
            req = requests.get(page_url, headers=headers, timeout=10)
        except requests.RequestException as e:
            log.error(f"listing page {i} could not be fetched: {e}")
            continue
        soup = BeautifulSoup(req.content,'html.parser')
        container = soup.find('div',class_='documents-container')
        if container is None:
            # Unexpected page layout or an error page; skip instead of crashing.
            log.error(f"listing page {i} has no documents container, skipped")
            continue
        for web in container.find_all('div',class_='documents-table-row'):
            title = web.find_all('a')[1].text
            pdftitle = _derive_pdf_title(title)
            pdfUrl = site_root + web.find('div',class_='notforprint pt-2').find('a')['href']
            # pdfTitle = aa.find('a')['title']
            print(pdfUrl)
            cursor.execute(select_sql, (pdfUrl,))
            if cursor.fetchone():
                log.info("已采集，跳过")
                continue
            info_divs = web.find('div',class_='infoindocumentlist').find_all('div')
            date_string = info_divs[1].find('span',class_='info-data').text
            # Convert the site's dd.mm.yyyy date to ISO yyyy-mm-dd.
            pub_time = datetime.strptime(date_string.strip(), "%d.%m.%Y").strftime("%Y-%m-%d")
            print(pub_time)
            pdf_name = info_divs[0].find('span',class_='info-data').text
            # Download the PDF into a sub-directory derived from the title.
            path = os.path.join(save_root, downPdf.getPath(pdftitle))
            if not downFile(pdfUrl,path,pdf_name):
                # Do not record a failed download: a stored row would make the
                # next run treat this document as already collected.
                log.error(f"download failed, record not stored: {pdfUrl}")
                continue

            cursor.execute(insert_sql, (pdfUrl, pdftitle, pub_time, pdf_name, path))
            cnx.commit()
        # break

# Script entry point: the collection job runs as soon as this module is
# executed (note: it also runs if the module is imported).
job_2()



