提交 0c9bb0cd 作者: 丁双波

下载pdf文件

上级 8be2a4ec
#下载pdf文件
import os
from datetime import time
import pymysql
import requests
import urllib3
from pymysql.converters import escape_string
from base.BaseCore import BaseCore
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Browser-like HTTP request headers; downFile() passes this dict as the
# `headers=` argument of its requests.get() call.
# NOTE(review): 'accept-encoding' advertises 'br'; confirm the runtime can
# decode Brotli responses, otherwise saved files may be unreadable.
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# Disabled: a captured browser cookie, kept only for reference.
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# Module-level shared state, created at import time and used by the
# functions and the __main__ block below: the project helper object, its
# logger, and one MySQL connection with a single cursor.
# NOTE(review): the database host, user and password are hard-coded in
# source; consider loading them from configuration or the environment.
baseCore = BaseCore()
log =baseCore.getLogger()
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
def get_file_name(headers):
    """Pick a local file name for a downloaded response.

    Looks for a ``filename=`` parameter in the ``Content-Disposition``
    header. When none is found (or it is empty), falls back to
    ``baseCore.getNextSeq() + ".pdf"``.

    Args:
        headers: Mapping of response headers (anything with ``.get``).

    Returns:
        A bare file name with no directory components.
    """
    filename = ''
    disposition = headers.get('Content-Disposition') or ''
    # The first ';'-separated token is the disposition type (attachment /
    # inline); the parameters follow in any order, so scan all of them
    # instead of assuming ``filename`` is always the second token.
    for part in disposition.split(';')[1:]:
        key, sep, value = part.strip().partition('=')
        if sep and key.strip().lower() == 'filename':
            # Servers usually quote the value; the quotes are not part of
            # the name and are illegal in Windows file names.
            value = value.strip().strip('"\'')
            # Keep only the final path component so a hostile header such
            # as "../../x.pdf" cannot write outside the target directory.
            filename = os.path.basename(value.replace('\\', '/'))
            break
    if not filename:
        return baseCore.getNextSeq() + ".pdf"
    return filename
def downFile(url,path):
    """Download ``url`` into directory ``path`` through the local proxy.

    The target directory is created via ``baseCore.mkPath`` and the file
    name is chosen by ``get_file_name`` from the response headers.

    Args:
        url: Address of the file to fetch.
        path: Directory the file is written into.

    Returns:
        The file name written on success, or ``False`` when any step fails
        (the error is logged, not raised).
    """
    try:
        baseCore.mkPath(path)
        proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
        # stream=True lets iter_content() write chunk by chunk instead of
        # buffering the whole body in memory first; the ``with`` releases
        # the connection even if writing fails.
        with requests.get(url, proxies=proxy, headers=headers, verify=False,
                          timeout=10, stream=True) as response:
            # Treat 4xx/5xx as failures so an HTML error page is never
            # saved to disk as if it were the requested document.
            response.raise_for_status()
            fileName = get_file_name(response.headers)
            with open(os.path.join(path, fileName), "wb") as pyFile:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        pyFile.write(chunk)
    except Exception as e:
        log.error(f"出错了----------{e}")
        return False
    return fileName
if __name__ == '__main__':
    # Root directory; each download goes into a sub-directory built from the
    # row's website / ftype / stype / ttype values (empty ones are skipped).
    base_path = r'D:\美国VS俄罗斯制裁'
    try:
        # Process pending rows (state=0) one at a time until none remain.
        # Each row ends up as state=1 (downloaded) or state=2 (failed), so
        # the loop always makes progress.
        while True:
            selectSql = "select id,url,website,ftype,stype,ttype from usvsrussia where state=0 order by id asc limit 1"
            cursor.execute(selectSql)
            data = cursor.fetchone()
            if not data:
                log.info("数据处理完毕,程序退出")
                break
            row_id, url, website, ftype, stype, ttype = data
            log.info(f"开始处理{url}----")
            path = base_path
            for part in (website, ftype, stype, ttype):
                if part:
                    path = os.path.join(path, part)
            fileName = downFile(url, path)
            if fileName:
                # fileName originates from a remote response header, so it
                # is bound as a parameter rather than spliced into the SQL.
                updateSql = "update usvsrussia set state=1,pdf_name=%s,pdf_path=%s where id=%s"
                params = (fileName, path, row_id)
                log.info(f"开始处理{url}----处理ok")
            else:
                updateSql = "update usvsrussia set state=2 where id=%s"
                params = (row_id,)
                log.info(f"开始处理{url}----处理error")
            cursor.execute(updateSql, params)
            cnx.commit()
        url = 'https://ofac.treasury.gov/media/931946/download?inline'
        log.info(f"{url}----开始下载")
        # NOTE(review): this call previously omitted the required ``path``
        # argument and raised TypeError; the root directory is assumed to be
        # the intended target — confirm, or drop this ad-hoc download.
        downFile(url, base_path)
        log.info(f"{url}----开始下载,下载完成")
    finally:
        # Release helper and database resources even when a step above fails.
        baseCore.close()
        cursor.close()
        cnx.close()
\ No newline at end of file
...@@ -790,13 +790,206 @@ def job4(): ...@@ -790,13 +790,206 @@ def job4():
cursor.execute(insertSql) cursor.execute(insertSql)
cnx.commit() cnx.commit()
driverContent.close() driverContent.close()
def _job5_collect_section(driverContent, xpath, ftype, stype, ttype):
    """Store every not-yet-seen link of one list section in ``usvsrussia``.

    Args:
        driverContent: Browser driver with the page already loaded.
        xpath: XPath selecting the ``li`` elements of the section.
        ftype, stype, ttype: Category values written with each row and used
            together with the URL for the duplicate check.
    """
    log.info(f"开始采集栏目---{stype}---")
    for liEle in driverContent.find_elements(By.XPATH, xpath):
        aEle = liEle.find_element(By.TAG_NAME, 'a')  # the <a> tag
        text = liEle.text
        href = aEle.get_attribute('href')
        # Text between the first '(' and ')' of the item is stored as pub_time.
        pub_time = baseCore.getSubStr(text, '(', ')')
        # Values are bound as parameters: they come from a web page and may
        # contain quotes. The URL is compared exactly as it is stored; the
        # earlier query appended a trailing space, which only matches under
        # space-padding collations.
        # NOTE(review): a None href/pub_time is now bound as SQL NULL rather
        # than the text 'None' — confirm the columns allow NULL.
        cursor.execute(
            "select count(1) from usvsrussia "
            "where ftype=%s and stype=%s and ttype=%s and url=%s",
            (ftype, stype, ttype, href))
        if cursor.fetchone()[0] > 0:
            log.info("已采集,跳过")
            continue
        cursor.execute(
            "insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) "
            "values (%s,%s,%s,%s,%s,%s,%s,0)",
            ('美国财政部外国资产控制办公室', ftype, stype, ttype, href, text, pub_time))
        cnx.commit()


def job5():
    """Collect the links of the non-English translations page.

    Opens the page in a browser driver, walks six list sections and inserts
    one ``usvsrussia`` row (state=0) per link not already recorded.
    """
    log.info("开始采集----第二层数据采集")
    path = r'E:\chromedriver_win32\115\chromedriver.exe'
    driverContent = baseCore.buildDriver(path, headless=False)
    try:
        url='https://ofac.treasury.gov/sanctions-programs-and-country-information/non-english-translations-of-advisories-and-other-documents#ru_food_security'
        driverContent.get(url)
        ftype = "Russian Harmful Foreign Activities Sanctions"
        stype = 'Non-English Translations of Advisories and Other Documents'
        # (ttype, XPath of the section's list items), in page order.
        sections = (
            ('TRANSLATIONS OF OFAC FOOD SECURITY FACT SHEET: RUSSIA SANCTIONS AND AGRICULTURAL TRADE',
             '//*[@id="ru_food_security"]/ul/li'),
            ('TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS FACT SHEET',
             '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[1]/li'),
            ('TRANSLATIONS OF NORTH KOREAN INFORMATION TECHNOLOGY WORKERS ADVISORY',
             '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[2]/li'),
            ('TRANSLATIONS OF GLOBAL SHIPPING ADVISORY',
             '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[3]/li'),
            ('Translations of North Korean Shipping Advisories',
             '//*[@id="node-27626"]/div/div/div[3]/div/div/ul[4]/li'),
            ('TRANSLATIONS OF NORTH KOREAN CYBER ADVISORY',
             '//*[@id="node-27626"]/div/div/div[3]/div/div/div[8]/div/ul/li'),
        )
        for ttype, xpath in sections:
            _job5_collect_section(driverContent, xpath, ftype, stype, ttype)
    finally:
        # Close the browser window even when scraping or a query fails.
        driverContent.close()
def job6():
    """Collect the links of the "Interpretative Rulings on OFAC Policy" table.

    Opens the page in a browser driver and, for every table row, inserts one
    ``usvsrussia`` row (state=0, empty ttype) unless the same
    ftype/stype/url combination is already recorded.
    """
    log.info("开始采集----第二层数据采集")
    path = r'E:\chromedriver_win32\115\chromedriver.exe'
    driverContent = baseCore.buildDriver(path, headless=False)
    try:
        url='https://ofac.treasury.gov/sanctions-programs-and-country-information/iran-sanctions/interpretative-rulings-on-ofac-policy'
        driverContent.get(url)
        ftype = "Russian Harmful Foreign Activities Sanctions"
        stype = 'Interpretative Rulings on OFAC Policy'
        log.info(f"开始采集栏目---{stype}---")
        rowEles = driverContent.find_elements(By.XPATH, '//*[@id="node-11996"]/div/div/div[3]/div/div/table/tbody/tr')
        for rowEle in rowEles:
            aEle = rowEle.find_element(By.TAG_NAME, 'a')  # the <a> tag
            text = aEle.text
            href = aEle.get_attribute('href')
            # The row's <th> cell text is stored as the publication time.
            pub_time = rowEle.find_element(By.TAG_NAME, 'th').text
            # Values are bound as parameters: they come from a web page and
            # may contain quotes. The URL is compared exactly as it is
            # stored; the earlier query appended a trailing space, which
            # only matches under space-padding collations.
            # NOTE(review): a None href is now bound as SQL NULL rather than
            # the text 'None' — confirm the column allows NULL.
            cursor.execute(
                "select count(1) from usvsrussia where ftype=%s and stype=%s and url=%s",
                (ftype, stype, href))
            if cursor.fetchone()[0] > 0:
                log.info("已采集,跳过")
                continue
            cursor.execute(
                "insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) "
                "values (%s,%s,%s,'',%s,%s,%s,0)",
                ('美国财政部外国资产控制办公室', ftype, stype, href, text, pub_time))
            cnx.commit()
    finally:
        # Close the browser window even when scraping or a query fails.
        driverContent.close()
if __name__ == '__main__':
    log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
    # Only job6 is enabled in this revision; the other jobs are kept
    # commented out so they can be re-enabled individually.
    #job1()
    #job2()
    #job3()
    #job4()
    #job5()
    job6()
    baseCore.close()
    cursor.close()
    cnx.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论