提交 22a5f6f6 作者: 丁双波

美国俄罗斯数据采集

上级 eaa6815d
......@@ -369,7 +369,7 @@ class BaseCore:
if beginStr=='':
pass
else:
begin=str.find(beginStr)
begin=str.rfind(beginStr)
if begin==-1:
begin=0
str=str[begin:]
......@@ -425,11 +425,18 @@ class BaseCore:
IP = socket.gethostbyname(socket.gethostname())
return IP
def mkPath(self, path):
    """Create the directory ``path`` (including missing parents) if needed.

    Nothing happens when ``path`` already exists, whether it is a directory
    or a file.

    :param path: directory path to create.
    :raises OSError: propagated from ``os.makedirs`` when the directory
        cannot be created (for example an empty path or missing permissions).
    """
    if not os.path.exists(path):
        # exist_ok=True closes the window between the existence check above
        # and the creation: a directory created concurrently by another
        # process no longer raises FileExistsError.
        os.makedirs(path, exist_ok=True)
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
......@@ -442,7 +449,7 @@ class BaseCore:
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
......@@ -578,3 +585,4 @@ class BaseCore:
#OFAC:美国财政部外国资产控制办公室 (OFAC),数量在200左右,四个类型里的所有带黑点、PDF文件都要。https://ofac.treasury.gov/
# 美国对俄罗斯相关制裁
# 俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions
# 乌克兰/俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions
# 2017年制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions
# 马格尼茨基制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions
import os
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql.converters import escape_string
from selenium.webdriver.common.by import By
from base.BaseCore import BaseCore
# Shared project helper; this file uses it for logging (getLogger), building
# the Selenium driver (buildDriver) and substring extraction (getSubStr).
baseCore = BaseCore()
log =baseCore.getLogger()
# Browser-like HTTP request headers.
# NOTE(review): nothing in the visible part of this file reads `headers`;
# confirm whether it is still needed.
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'cache-control': 'max-age=0',
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': "Windows",
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# MySQL connection and cursor shared by job1()..job4(); every collected row
# is written to the `usvsrussia` table. The connection is opened at import time.
# NOTE(review): host, user and password are hard-coded in source; consider
# loading them from configuration or environment variables instead.
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor = cnx.cursor()
# Path of the chromedriver binary handed to baseCore.buildDriver by every job.
_CHROMEDRIVER_PATH = r'E:\chromedriver_win32\115\chromedriver.exe'
# Value stored in the `website` column of every collected row.
_WEBSITE_NAME = '美国财政部外国资产控制办公室'

# Ways of deriving `pub_time` from a list item.
_TIME_NONE = 'none'            # the section carries no date -> ''
_TIME_PAREN = 'paren'          # baseCore.getSubStr(<li> text, '(', ')')
_TIME_REMAINDER = 'remainder'  # <li> text with the link text removed


def _save_record(ftype, stype, ttype, href, title, pub_time):
    """Insert one row into ``usvsrussia`` unless it is already stored.

    A row counts as already stored when one with the same ``ftype``,
    ``stype`` and ``url`` exists. Both statements are parameterized, so
    quotes in any value cannot break the SQL. The duplicate check compares
    the exact URL; the earlier string-built query appended a stray space
    to it.

    :return: True when a row was inserted, False when it was skipped.
    """
    cursor.execute(
        "select count(1) from usvsrussia where ftype=%s and stype=%s and url=%s",
        (ftype, stype, href),
    )
    if cursor.fetchone()[0] > 0:
        log.info("已采集,跳过")
        return False
    cursor.execute(
        "insert into usvsrussia "
        "(website,ftype,stype,ttype,url,title,pub_time,state) "
        "values (%s,%s,%s,%s,%s,%s,%s,0)",
        (_WEBSITE_NAME, ftype, stype, ttype, href, title, pub_time),
    )
    # Commit per row so an interruption keeps everything collected so far.
    cnx.commit()
    return True


def _collect_section(driver, ftype, stype, xpath, time_mode=_TIME_PAREN,
                     title_from_link=False, ttype='', log_suffix=''):
    """Store every ``<li>`` matched by ``xpath`` as one row.

    :param driver: Selenium driver with the target page already loaded.
    :param ftype: sanctions programme name (``ftype`` column).
    :param stype: page section name (``stype`` column).
    :param xpath: XPath selecting the ``<li>`` elements of the section.
    :param time_mode: one of the ``_TIME_*`` constants; selects how
        ``pub_time`` is derived from the list item.
    :param title_from_link: use the link text as title instead of the
        whole list-item text.
    :param ttype: value for the ``ttype`` column.
    :param log_suffix: text appended to the section-start log line.
    """
    log.info(f"开始采集栏目---{stype}{log_suffix}")
    for item in driver.find_elements(By.XPATH, xpath):
        # find_elements returns an empty list instead of raising, so a
        # list item without a link is skipped rather than aborting the job.
        anchors = item.find_elements(By.TAG_NAME, 'a')
        if not anchors:
            log.info(f"栏目---{stype}---列表项缺少链接,跳过")
            continue
        anchor = anchors[0]
        item_text = item.text
        link_text = anchor.text
        # get_attribute returns None for an anchor without href; store ''.
        href = anchor.get_attribute('href') or ''

        if time_mode == _TIME_PAREN:
            pub_time = baseCore.getSubStr(item_text, '(', ')')
        elif time_mode == _TIME_REMAINDER:
            pub_time = item_text.replace(link_text, '')
        else:
            pub_time = ''

        title = link_text if title_from_link else item_text
        _save_record(ftype, stype, ttype, href, title, pub_time)


def _scrape_page(url, ftype, sections, fixed_records=()):
    """Open ``url`` in a non-headless Chrome and collect its sections.

    :param url: page to load.
    :param ftype: sanctions programme name stored with every row.
    :param sections: iterable of keyword-argument dicts for
        ``_collect_section`` (``stype`` and ``xpath`` are required).
    :param fixed_records: iterable of ``(stype, href, title, pub_time)``
        rows stored as-is before the sections are scraped.
    """
    driver = baseCore.buildDriver(_CHROMEDRIVER_PATH, headless=False)
    try:
        driver.get(url)
        for stype, href, title, pub_time in fixed_records:
            _save_record(ftype, stype, '', href, title, pub_time)
        for section in sections:
            _collect_section(driver, ftype, **section)
    finally:
        # quit() ends the whole WebDriver session even when scraping fails;
        # close() on the success path alone left the browser running.
        driver.quit()


def job1():
    """Collect the "Russian Harmful Foreign Activities Sanctions" page."""
    log.info("开始采集----俄罗斯有害外国活动制裁")
    # Sections handled separately (not collected here): INTERPRETIVE GUIDANCE,
    # FREQUENTLY ASKED QUESTIONS, APPLYING FOR A SPECIFIC OFAC LICENSE,
    # Code of Federal Regulations.
    sections = [
        dict(stype='IMPORTANT ADVISORIES AND INFORMATION',
             xpath='//*[@id="node-35986"]/div/ul[1]/li',
             time_mode=_TIME_REMAINDER, title_from_link=True),
        dict(stype='Price Cap Policies',
             xpath='//*[@id="node-35986"]/div/ul[2]/li'),
        dict(stype='RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES',
             xpath='//*[@id="directives"]/ul/li'),
        dict(stype='GUIDANCE ON OFAC LICENSING POLICY',
             xpath='//*[@id="node-35986"]/div/ul[6]/li',
             time_mode=_TIME_NONE),
        dict(stype='GENERAL LICENSES',
             xpath='//*[@id="node-35986"]/div/ul[7]/li'),
        dict(stype='Executive Orders',
             xpath='//*[@id="node-35986"]/div/ul[8]/li'),
        dict(stype='Determinations',
             xpath='//*[@id="node-35986"]/div/ul[9]/li'),
        dict(stype='Statutes',
             xpath='//*[@id="node-35986"]/div/ul[10]/li',
             time_mode=_TIME_NONE),
        dict(stype='Federal Register Notices',
             xpath='//*[@id="node-35986"]/div/ul[12]/li',
             time_mode=_TIME_NONE),
    ]
    _scrape_page(
        'https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions',
        "Russian Harmful Foreign Activities Sanctions",
        sections,
    )


def job2():
    """Collect the "Ukraine-/Russia-related Sanctions" page."""
    log.info("开始采集----乌克兰-俄罗斯有害外国活动制裁")
    # Not collected here: ADDITIONAL UKRAINE-/RUSSIA-RELATED SANCTIONS
    # INFORMATION, FREQUENTLY ASKED QUESTIONS.
    sections = [
        dict(stype='IMPORTANT ADVISORIES',
             xpath='//*[@id="node-6416"]/div/ul[1]/li'),
        # NOTE(review): same XPath as IMPORTANT ADVISORIES above; this looks
        # like a copy-paste slip. Confirm the intended <ul> index on the page.
        dict(stype='SANCTIONS BROCHURES',
             xpath='//*[@id="node-6416"]/div/ul[1]/li',
             time_mode=_TIME_NONE, title_from_link=True),
        dict(stype='SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST',
             xpath='//*[@id="directives"]/ul[1]/li'),
        # Archived directives share the SSI stype and are told apart by ttype.
        dict(stype='SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST',
             xpath='//*[@id="directives"]/ul[2]/li',
             time_mode=_TIME_NONE, ttype='Archived Directives',
             log_suffix='---Archived Directives'),
        dict(stype='INTERPRETIVE GUIDANCE',
             xpath='//*[@id="node-6416"]/div/ul[5]/li',
             log_suffix='---'),
        dict(stype='GUIDANCE ON OFAC LICENSING POLICY',
             xpath='//*[@id="node-6416"]/div/ul[7]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
        dict(stype='GENERAL LICENSES',
             xpath='//*[@id="node-6416"]/div/ul[8]/li',
             log_suffix='---'),
        dict(stype='Executive Orders',
             xpath='//*[@id="node-6416"]/div/ul[9]/li',
             log_suffix='---'),
        dict(stype='Determinations',
             xpath='//*[@id="node-6416"]/div/ul[10]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
        dict(stype='Statutes',
             xpath='//*[@id="node-6416"]/div/ul[11]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
        dict(stype='Federal Register Notices',
             xpath='//*[@id="node-6416"]/div/ul[13]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
    ]
    _scrape_page(
        'https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions',
        "Ukraine-/Russia-related Sanctions",
        sections,
    )


def job3():
    """Collect the CAATSA (2017 sanctions act) page."""
    log.info("开始采集----2017年制裁")
    # The act itself is stored as a fixed row; it is not scraped from a list.
    fixed_records = [
        ('Countering Americas Adversaries Through Sanctions Act-Related Sanctions',
         "https://congress.gov/115/plaws/publ44/PLAW-115publ44.pdf",
         "Countering America’s Adversaries Through Sanctions Act” (Public Law 115-44) (CAATSA)",
         'August 2, 2017'),
    ]
    sections = [
        dict(stype='Other Documents Related to the Implementation of Section 105',
             xpath='//*[@id="node-7161"]/div/ul[2]/li',
             log_suffix='---'),
        dict(stype='Ukraine-/Russia-related Directives',
             xpath='//*[@id="node-7161"]/div/ul[4]/li',
             log_suffix='---'),
        dict(stype='ADDITIONAL CAATSA GUIDANCE AND INFORMATION',
             xpath='//*[@id="node-7161"]/div/ul[6]/li',
             log_suffix='---'),
        dict(stype='Executive Orders',
             xpath='//*[@id="node-7161"]/div/ul[8]/li',
             log_suffix='---'),
        dict(stype='Statutes',
             xpath='//*[@id="node-7161"]/div/ul[9]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
    ]
    _scrape_page(
        'https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions',
        "Countering America's Adversaries Through Sanctions Act of 2017 (CAATSA)",
        sections,
        fixed_records=fixed_records,
    )


def job4():
    """Collect the "Magnitsky Sanctions" page."""
    log.info("开始采集----马格尼茨基制裁")
    sections = [
        dict(stype='INTERPRETIVE GUIDANCE',
             xpath='//*[@id="node-6306"]/div/ul[2]/li',
             log_suffix='---'),
        dict(stype='GUIDANCE ON OFAC LICENSING POLICY',
             xpath='//*[@id="node-6306"]/div/ul[4]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
        dict(stype='Statutes',
             xpath='//*[@id="node-6306"]/div/ul[5]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
        dict(stype='Federal Register Notices',
             xpath='//*[@id="node-6306"]/div/ul[7]/li',
             time_mode=_TIME_NONE, log_suffix='---'),
    ]
    _scrape_page(
        'https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions',
        "Magnitsky Sanctions",
        sections,
    )
if __name__ == '__main__':
    # Script entry point: run the four OFAC collection jobs in order.
    log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
    try:
        job1()
        job2()
        job3()
        job4()
    finally:
        # Release shared resources even when a job raises; the nested
        # try/finally keeps the DB cleanup running if baseCore.close() fails.
        try:
            baseCore.close()
        finally:
            cursor.close()
            cnx.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论