Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
22a5f6f6
提交
22a5f6f6
authored
8月 25, 2023
作者:
丁双波
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
美国俄罗斯数据采集
上级
eaa6815d
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
813 行增加
和
2 行删除
+813
-2
BaseCore.py
base/BaseCore.py
+10
-2
__init__.py
tmp/__init__.py
+0
-0
__init__.py
tmp/usVsRussia/__init__.py
+0
-0
ofac.py
tmp/usVsRussia/ofac.py
+803
-0
没有找到文件。
base/BaseCore.py
浏览文件 @
22a5f6f6
...
...
@@ -369,7 +369,7 @@ class BaseCore:
if
beginStr
==
''
:
pass
else
:
begin
=
str
.
find
(
beginStr
)
begin
=
str
.
r
find
(
beginStr
)
if
begin
==-
1
:
begin
=
0
str
=
str
[
begin
:]
...
...
@@ -425,11 +425,18 @@ class BaseCore:
IP
=
socket
.
gethostbyname
(
socket
.
gethostname
())
return
IP
def mkPath(self, path):
    """Create the directory *path* (and any missing parents) if nothing exists there.

    If *path* already exists -- as a directory or as any other kind of
    file -- nothing is done, exactly as before.

    :param path: directory path to create.
    :raises OSError: if the directory cannot be created.
    """
    if not os.path.exists(path):
        # exist_ok=True closes the race between the existence check above and
        # the creation: another process/thread creating the same directory in
        # between no longer makes this call fail with FileExistsError.
        os.makedirs(path, exist_ok=True)
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def
buildDriver
(
self
,
path
,
headless
=
True
):
service
=
Service
(
path
)
chrome_options
=
webdriver
.
ChromeOptions
()
if
headless
:
...
...
@@ -442,7 +449,7 @@ class BaseCore:
chrome_options
.
add_argument
(
'user-agent='
+
self
.
getRandomUserAgent
())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver
=
webdriver
.
Chrome
(
chrome_
options
=
chrome_options
,
service
=
service
)
driver
=
webdriver
.
Chrome
(
options
=
chrome_options
,
service
=
service
)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
...
...
@@ -578,3 +585,4 @@ class BaseCore:
tmp/__init__.py
0 → 100644
浏览文件 @
22a5f6f6
tmp/usVsRussia/__init__.py
0 → 100644
浏览文件 @
22a5f6f6
tmp/usVsRussia/ofac.py
0 → 100644
浏览文件 @
22a5f6f6
#OFAC:美国财政部外国资产控制办公室 (OFAC),数量在200左右,四个类型里的所有带黑点、PDF文件都要。https://ofac.treasury.gov/
# 美国对俄罗斯相关制裁
# 俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions
# 乌克兰/俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions
# 2017年制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions
# 马格尼茨基制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions
import
os
import
pandas
as
pd
import
pymysql
import
requests
from
bs4
import
BeautifulSoup
from
pymysql.converters
import
escape_string
from
selenium.webdriver.common.by
import
By
from
base.BaseCore
import
BaseCore
# Shared project helper object; this script uses it for logging and for
# building the Selenium driver.
baseCore = BaseCore()
log = baseCore.getLogger()

# Browser-like HTTP request headers (Chrome 106 on Windows).
# No 'cookie' header is sent; a captured browser cookie that used to be kept
# here as commented-out code has been dropped.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

# Connection to the database that holds the `usvsrussia` table.
# Each setting can be overridden through an environment variable; the defaults
# are the values that used to be hard-coded, so existing deployments keep
# working unchanged.
# TODO(review): the default credentials are still committed to the repository;
# move them to deployment configuration and rotate the password.
cnx = pymysql.connect(
    host=os.environ.get('OFAC_DB_HOST', '114.115.159.144'),
    user=os.environ.get('OFAC_DB_USER', 'caiji'),
    password=os.environ.get('OFAC_DB_PASSWORD', 'zzsn9988'),
    db=os.environ.get('OFAC_DB_NAME', 'caiji'),
    charset='utf8mb4',
)
cursor = cnx.cursor()
# Value stored in the `website` column of every collected record.
_WEBSITE = '美国财政部外国资产控制办公室'

# Location of the chromedriver binary handed to baseCore.buildDriver.
_CHROMEDRIVER_PATH = r'E:\chromedriver_win32\115\chromedriver.exe'

# How the pub_time column is derived for the items of one section.
_TIME_NONE = 0        # store an empty string
_TIME_IN_PARENS = 1   # result of baseCore.getSubStr(title, '(', ')')
_TIME_AFTER_LINK = 2  # the <li> text with the link text removed

# Parameterized statements: the driver quotes the values, so URLs, titles and
# dates that contain quotes can neither break nor inject into the SQL.
_SELECT_COUNT_SQL = (
    "select count(1) from usvsrussia where ftype=%s and stype=%s and url=%s"
)
_INSERT_SQL = (
    "insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) "
    "values (%s,%s,%s,%s,%s,%s,%s,0)"
)


def _saveRecord(ftype, stype, ttype, href, title, pubTime):
    """Insert one record into `usvsrussia` unless it was collected before.

    A record counts as already collected when a row with the same
    (ftype, stype, url) exists; ttype is deliberately not part of that key,
    matching the original duplicate check.

    :return: True when a new row was inserted and committed, False when the
        record was skipped as a duplicate.
    """
    # The original query compared url against "<href> " (with a trailing
    # space), which never matches under NO PAD collations; compare the exact
    # value that the insert stores instead.
    cursor.execute(_SELECT_COUNT_SQL, (ftype, stype, href))
    if cursor.fetchone()[0] > 0:
        log.info("已采集,跳过")
        return False
    cursor.execute(_INSERT_SQL, (_WEBSITE, ftype, stype, ttype, href, title, pubTime))
    cnx.commit()
    return True


def _collectSection(driver, ftype, stype, xpath, titleFromLink=False,
                    timeMode=_TIME_NONE, ttype='', logSuffix=''):
    """Store every linked <li> item of one section of the loaded page.

    :param driver: Selenium driver with the target page already loaded.
    :param ftype: first-level category (the sanctions programme).
    :param stype: second-level category (the section heading).
    :param xpath: XPath selecting the section's <li> elements.
    :param titleFromLink: use the <a> text as title instead of the whole <li> text.
    :param timeMode: one of the _TIME_* constants; selects how pub_time is derived.
    :param ttype: third-level category, empty for most sections.
    :param logSuffix: text appended to the section-start log line (kept so the
        log output stays identical to what each section logged before).
    """
    log.info(f"开始采集栏目---{stype}{logSuffix}")
    for liEle in driver.find_elements(By.XPATH, xpath):
        links = liEle.find_elements(By.TAG_NAME, 'a')
        if not links:
            # An item without a link has no document to collect; skip it
            # instead of aborting the whole job with NoSuchElementException.
            log.info("未找到链接,跳过")
            continue
        aEle = links[0]
        liText = liEle.text
        title = aEle.text if titleFromLink else liText
        href = aEle.get_attribute('href')
        if timeMode == _TIME_IN_PARENS:
            # Presumably the text between the parentheses -- confirm against
            # BaseCore.getSubStr.
            pubTime = baseCore.getSubStr(title, '(', ')')
        elif timeMode == _TIME_AFTER_LINK:
            pubTime = liText.replace(title, '')
        else:
            pubTime = ''
        _saveRecord(ftype, stype, ttype, href, title, pubTime)


def job1():
    """Collect the "Russian Harmful Foreign Activities Sanctions" page."""
    log.info("开始采集----俄罗斯有害外国活动制裁")
    url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions'
    ftype = "Russian Harmful Foreign Activities Sanctions"
    driverContent = baseCore.buildDriver(_CHROMEDRIVER_PATH, headless=False)
    try:
        driverContent.get(url)
        # Important advisories and information: the title is the link text and
        # the rest of the <li> text is stored as pub_time.
        _collectSection(driverContent, ftype, 'IMPORTANT ADVISORIES AND INFORMATION',
                        '//*[@id="node-35986"]/div/ul[1]/li',
                        titleFromLink=True, timeMode=_TIME_AFTER_LINK)
        _collectSection(driverContent, ftype, 'Price Cap Policies',
                        '//*[@id="node-35986"]/div/ul[2]/li',
                        timeMode=_TIME_IN_PARENS)
        # INTERPRETIVE GUIDANCE and FREQUENTLY ASKED QUESTIONS are handled
        # separately and are not collected here.
        _collectSection(driverContent, ftype,
                        'RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES',
                        '//*[@id="directives"]/ul/li',
                        timeMode=_TIME_IN_PARENS)
        # APPLYING FOR A SPECIFIC OFAC LICENSE is not collected.
        _collectSection(driverContent, ftype, 'GUIDANCE ON OFAC LICENSING POLICY',
                        '//*[@id="node-35986"]/div/ul[6]/li')
        _collectSection(driverContent, ftype, 'GENERAL LICENSES',
                        '//*[@id="node-35986"]/div/ul[7]/li',
                        timeMode=_TIME_IN_PARENS)
        _collectSection(driverContent, ftype, 'Executive Orders',
                        '//*[@id="node-35986"]/div/ul[8]/li',
                        timeMode=_TIME_IN_PARENS)
        _collectSection(driverContent, ftype, 'Determinations',
                        '//*[@id="node-35986"]/div/ul[9]/li',
                        timeMode=_TIME_IN_PARENS)
        _collectSection(driverContent, ftype, 'Statutes',
                        '//*[@id="node-35986"]/div/ul[10]/li')
        # Code of Federal Regulations is not collected.
        _collectSection(driverContent, ftype, 'Federal Register Notices',
                        '//*[@id="node-35986"]/div/ul[12]/li')
    finally:
        # quit() ends the whole driver session, and runs even when scraping
        # or a database call fails part-way through.
        driverContent.quit()


def job2():
    """Collect the "Ukraine-/Russia-related Sanctions" page."""
    log.info("开始采集----乌克兰-俄罗斯有害外国活动制裁")
    url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions'
    ftype = "Ukraine-/Russia-related Sanctions"
    driverContent = baseCore.buildDriver(_CHROMEDRIVER_PATH, headless=False)
    try:
        driverContent.get(url)
        _collectSection(driverContent, ftype, 'IMPORTANT ADVISORIES',
                        '//*[@id="node-6416"]/div/ul[1]/li',
                        timeMode=_TIME_IN_PARENS)
        # NOTE(review): this XPath is identical to the one used for
        # 'IMPORTANT ADVISORIES' above, so the same links are stored a second
        # time under 'SANCTIONS BROCHURES'. It looks like a copy-paste slip;
        # confirm the correct list index on the live page. Behaviour is kept
        # as-is until then.
        _collectSection(driverContent, ftype, 'SANCTIONS BROCHURES',
                        '//*[@id="node-6416"]/div/ul[1]/li',
                        titleFromLink=True)
        # ADDITIONAL UKRAINE-/RUSSIA-RELATED SANCTIONS INFORMATION and
        # FREQUENTLY ASKED QUESTIONS are not collected.
        _collectSection(driverContent, ftype,
                        'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST',
                        '//*[@id="directives"]/ul[1]/li',
                        timeMode=_TIME_IN_PARENS)
        # Archived directives share the SSI stype and are told apart by ttype.
        _collectSection(driverContent, ftype,
                        'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST',
                        '//*[@id="directives"]/ul[2]/li',
                        ttype='Archived Directives',
                        logSuffix='---Archived Directives')
        _collectSection(driverContent, ftype, 'INTERPRETIVE GUIDANCE',
                        '//*[@id="node-6416"]/div/ul[5]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'GUIDANCE ON OFAC LICENSING POLICY',
                        '//*[@id="node-6416"]/div/ul[7]/li',
                        logSuffix='---')
        _collectSection(driverContent, ftype, 'GENERAL LICENSES',
                        '//*[@id="node-6416"]/div/ul[8]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'Executive Orders',
                        '//*[@id="node-6416"]/div/ul[9]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'Determinations',
                        '//*[@id="node-6416"]/div/ul[10]/li',
                        logSuffix='---')
        _collectSection(driverContent, ftype, 'Statutes',
                        '//*[@id="node-6416"]/div/ul[11]/li',
                        logSuffix='---')
        _collectSection(driverContent, ftype, 'Federal Register Notices',
                        '//*[@id="node-6416"]/div/ul[13]/li',
                        logSuffix='---')
    finally:
        driverContent.quit()


def job3():
    """Collect the CAATSA (2017 sanctions act) page."""
    log.info("开始采集----2017年制裁")
    url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions'
    ftype = "Countering America's Adversaries Through Sanctions Act of 2017 (CAATSA)"
    driverContent = baseCore.buildDriver(_CHROMEDRIVER_PATH, headless=False)
    try:
        driverContent.get(url)
        # The act itself is recorded from fixed values rather than scraped.
        _saveRecord(
            ftype,
            'Countering Americas Adversaries Through Sanctions Act-Related Sanctions',
            '',
            "https://congress.gov/115/plaws/publ44/PLAW-115publ44.pdf",
            "Countering America’s Adversaries Through Sanctions Act” (Public Law 115-44) (CAATSA)",
            'August 2, 2017',
        )
        _collectSection(driverContent, ftype,
                        'Other Documents Related to the Implementation of Section 105',
                        '//*[@id="node-7161"]/div/ul[2]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'Ukraine-/Russia-related Directives',
                        '//*[@id="node-7161"]/div/ul[4]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype,
                        'ADDITIONAL CAATSA GUIDANCE AND INFORMATION',
                        '//*[@id="node-7161"]/div/ul[6]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'Executive Orders',
                        '//*[@id="node-7161"]/div/ul[8]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'Statutes',
                        '//*[@id="node-7161"]/div/ul[9]/li',
                        logSuffix='---')
    finally:
        driverContent.quit()


def job4():
    """Collect the "Magnitsky Sanctions" page."""
    log.info("开始采集----马格尼茨基制裁")
    url = 'https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions'
    ftype = "Magnitsky Sanctions"
    driverContent = baseCore.buildDriver(_CHROMEDRIVER_PATH, headless=False)
    try:
        driverContent.get(url)
        _collectSection(driverContent, ftype, 'INTERPRETIVE GUIDANCE',
                        '//*[@id="node-6306"]/div/ul[2]/li',
                        timeMode=_TIME_IN_PARENS, logSuffix='---')
        _collectSection(driverContent, ftype, 'GUIDANCE ON OFAC LICENSING POLICY',
                        '//*[@id="node-6306"]/div/ul[4]/li',
                        logSuffix='---')
        _collectSection(driverContent, ftype, 'Statutes',
                        '//*[@id="node-6306"]/div/ul[5]/li',
                        logSuffix='---')
        _collectSection(driverContent, ftype, 'Federal Register Notices',
                        '//*[@id="node-6306"]/div/ul[7]/li',
                        logSuffix='---')
    finally:
        driverContent.quit()
if __name__ == '__main__':
    # Run the four OFAC collection jobs in order. The cleanup runs even when
    # a job raises, so the database cursor and connection are always released.
    log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
    try:
        job1()
        job2()
        job3()
        job4()
    finally:
        try:
            baseCore.close()
        finally:
            # Close the database handles even if baseCore.close() fails.
            cursor.close()
            cnx.close()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论