提交 0c9bb0cd 作者: 丁双波

下载pdf文件

上级 8be2a4ec
#下载pdf文件
import os
from datetime import time
import pymysql
import requests
import urllib3
from pymysql.converters import escape_string
from base.BaseCore import BaseCore
# Silence urllib3's InsecureRequestWarning: downFile() sends its requests with
# verify=False, which would otherwise emit this warning on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Request headers sent with every download; they imitate a desktop Chrome 106
# navigation request.
headers = {
    # Content negotiation.
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    # Cookie header is currently disabled; kept for reference.
    # 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
    # Client hints.
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    # Fetch metadata.
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/106.0.0.0 Safari/537.36'
    ),
}
# Shared project helper and its logger, used by every function below.
baseCore = BaseCore()
log = baseCore.getLogger()

# Module-level MySQL connection and cursor used by the __main__ loop.
# Each connection setting can be overridden through an environment variable;
# the literal defaults keep the previous behaviour when none is set.
# NOTE(review): the fallback credentials are still committed to source control
# -- move them to a secret store and drop the defaults once deployments set
# the environment variables.
cnx = pymysql.connect(
    host=os.environ.get('CAIJI_DB_HOST', '114.115.159.144'),
    user=os.environ.get('CAIJI_DB_USER', 'caiji'),
    password=os.environ.get('CAIJI_DB_PASSWORD', 'zzsn9988'),
    db=os.environ.get('CAIJI_DB_NAME', 'caiji'),
    charset='utf8mb4',
)
cursor = cnx.cursor()
def get_file_name(headers):
    """Choose the local file name for a downloaded response.

    Scans the parameters of the ``Content-Disposition`` header for
    ``filename=`` and returns its value with surrounding whitespace and
    quotes removed and any directory components dropped. When the header is
    missing or yields no usable name, a generated ``<seq>.pdf`` name built
    from ``baseCore.getNextSeq()`` is returned instead.

    Args:
        headers: Mapping of response header names to values, e.g.
            ``requests.Response.headers``.

    Returns:
        The file name (no directory part) to store the response body under.
    """
    disposition = headers.get('Content-Disposition') or ''
    filename = ''
    # The first segment is the disposition type ("attachment"/"inline");
    # every later segment is a key=value parameter, in no guaranteed order.
    for segment in disposition.split(';')[1:]:
        key, sep, value = segment.partition('=')
        if not sep or key.strip().lower() != 'filename':
            continue
        # Servers usually quote the value; the quotes are not part of the name.
        candidate = value.strip().strip('"\'')
        # The name comes from the remote server: keep only the last path
        # component so it cannot escape the target download directory.
        candidate = os.path.basename(candidate.replace('\\', '/'))
        if candidate not in ('', '.', '..'):
            filename = candidate
            break
    if not filename:
        return baseCore.getNextSeq() + ".pdf"
    return filename
def downFile(url,path):
    """Download ``url`` into directory ``path`` and return the saved file name.

    The request goes through the local proxy at 127.0.0.1:1080 with the
    module-level ``headers`` and with certificate verification disabled. The
    file name is taken from the response headers via ``get_file_name``.

    Args:
        url: Address of the file to download.
        path: Target directory; created through ``baseCore.mkPath`` first.

    Returns:
        The name of the written file on success, or ``False`` when anything
        fails (network error, HTTP error status, file-system error). The
        failure is logged, not raised.
    """
    try:
        baseCore.mkPath(path)
        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
        # stream=True lets iter_content() pull the body chunk by chunk instead
        # of buffering the whole file in memory; the with-block releases the
        # connection even when writing fails.
        with requests.get(url, proxies=proxy, headers=headers, verify=False,
                          timeout=10, stream=True) as response:
            # Treat 4xx/5xx as a failure so an error page is never stored as
            # a PDF and reported as a successful download.
            response.raise_for_status()
            fileName = get_file_name(response.headers)
            with open(os.path.join(path, fileName), "wb") as pyFile:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        pyFile.write(chunk)
    except Exception as e:
        log.error(f"出错了----------{e}")
        return False
    return fileName
if __name__ == '__main__':
    # Root directory that all downloads are stored under.
    base_path = r'D:\美国VS俄罗斯制裁'
    try:
        # Process pending rows (state=0) one at a time until none are left.
        while True:
            selectSql = "select id,url,website,ftype,stype,ttype from usvsrussia where state=0 order by id asc limit 1"
            cursor.execute(selectSql)
            data = cursor.fetchone()
            if not data:
                log.info("数据处理完毕,程序退出")
                break
            row_id, url, website, ftype, stype, ttype = data
            log.info(f"开始处理{url}----")
            # Build the target directory from the non-empty category columns.
            path = base_path
            for part in (website, ftype, stype, ttype):
                if part:
                    path = os.path.join(path, part)
            fileName = downFile(url, path)
            if fileName:
                # fileName originates from a remote server's response header,
                # so it is passed as a bound parameter rather than spliced
                # into the SQL text.
                cursor.execute(
                    "update usvsrussia set state=1,pdf_name=%s,pdf_path=%s where id=%s",
                    (fileName, path, row_id),
                )
                log.info(f"开始处理{url}----处理ok")
            else:
                # state=2 marks the row as failed so it is not selected again.
                cursor.execute("update usvsrussia set state=2 where id=%s", (row_id,))
                log.info(f"开始处理{url}----处理error")
            cnx.commit()
        url = 'https://ofac.treasury.gov/media/931946/download?inline'
        log.info(f"{url}----开始下载")
        # downFile() requires a target directory; the root download directory
        # is used here. NOTE(review): confirm this is the intended location.
        downFile(url, base_path)
        log.info(f"{url}----开始下载,下载完成")
    finally:
        # Release resources even when the loop or the download raises.
        baseCore.close()
        cursor.close()
        cnx.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论