Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
01e0d716
提交
01e0d716
authored
8月 25, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/master'
上级
32c31bd1
8be2a4ec
隐藏空白字符变更
内嵌
并排
正在显示
7 个修改的文件
包含
943 行增加
和
98 行删除
+943
-98
BaseCore.py
base/BaseCore.py
+10
-2
雅虎财经_企业基本信息_高管信息.py
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
+95
-96
googleSpider.py
google_comm/googleSpider.py
+34
-0
sougouSpider.py
sougou_comm/sougouSpider.py
+1
-0
__init__.py
tmp/__init__.py
+0
-0
__init__.py
tmp/usVsRussia/__init__.py
+0
-0
ofac.py
tmp/usVsRussia/ofac.py
+803
-0
没有找到文件。
base/BaseCore.py
浏览文件 @
01e0d716
...
...
@@ -369,7 +369,7 @@ class BaseCore:
if
beginStr
==
''
:
pass
else
:
begin
=
str
.
find
(
beginStr
)
begin
=
str
.
r
find
(
beginStr
)
if
begin
==-
1
:
begin
=
0
str
=
str
[
begin
:]
...
...
@@ -425,11 +425,18 @@ class BaseCore:
IP
=
socket
.
gethostbyname
(
socket
.
gethostname
())
return
IP
def
mkPath
(
self
,
path
):
folder
=
os
.
path
.
exists
(
path
)
if
not
folder
:
# 判断是否存在文件夹如果不存在则创建为文件夹
os
.
makedirs
(
path
)
# makedirs 创建文件时如果路径不存在会创建这个路径
else
:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def
buildDriver
(
self
,
path
,
headless
=
True
):
service
=
Service
(
path
)
chrome_options
=
webdriver
.
ChromeOptions
()
if
headless
:
...
...
@@ -442,7 +449,7 @@ class BaseCore:
chrome_options
.
add_argument
(
'user-agent='
+
self
.
getRandomUserAgent
())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver
=
webdriver
.
Chrome
(
chrome_
options
=
chrome_options
,
service
=
service
)
driver
=
webdriver
.
Chrome
(
options
=
chrome_options
,
service
=
service
)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
...
...
@@ -586,3 +593,4 @@ class BaseCore:
comData/yhcj/雅虎财经_企业基本信息_高管信息.py
浏览文件 @
01e0d716
impor
t
json
impor
t
json
import
json
import
time
import
numpy
as
np
import
pandas
as
pd
import
pymysql
import
requests
import
sys
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
NewsYahoo
import
news
from
base.BaseCore
import
BaseCore
sys
.
path
.
append
(
r'F:\zzsn\zzsn_spider\base'
)
import
BaseCore
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
taskType
=
'企业基本信息/雅虎财经'
baseCore
=
BaseCore
()
baseCore
=
BaseCore
.
BaseCore
()
r
=
baseCore
.
r
log
=
baseCore
.
getLogger
()
headers
=
{
...
...
@@ -38,7 +34,7 @@ headers = {
# 根据股票代码 获取企业基本信息 高管信息
def
getInfo
(
name
,
enname
,
gpdm
,
xydm
,
start
):
def
getInfo
(
enname
,
gpdm
,
xydm
,
start
):
if
'HK'
in
str
(
gpdm
):
tmp_g
=
str
(
gpdm
)
.
split
(
'.'
)[
0
]
if
len
(
tmp_g
)
==
5
:
...
...
@@ -49,17 +45,9 @@ def getInfo(name,enname,gpdm, xydm, start):
gpdm_
=
gpdm
retData
=
{}
retData
[
'base_info'
]
=
{
'公司名称'
:
name
,
'公司名称'
:
en
name
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'地址'
:
''
,
'电话'
:
''
,
'公司网站'
:
''
,
'部门'
:
''
,
'行业'
:
''
,
'员工人数'
:
''
,
'公司简介'
:
''
}
retData
[
'people_info'
]
=
[]
# https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
...
...
@@ -76,22 +64,36 @@ def getInfo(name,enname,gpdm, xydm, start):
log
.
error
(
f
"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}"
)
except
:
continue
if
(
response
.
status_code
==
200
):
pass
else
:
try
:
if
'lookup'
in
response
.
url
:
log
.
error
(
f
"{gpdm}------股票代码错误:{response.status_code}"
)
exeception
=
'股票代码错误'
state
=
1
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
0
,
takeTime
,
url
,
exeception
)
return
[
state
,
retData
]
elif
response
.
status_code
!=
200
:
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
exeception
=
'获取基本信息接口返回失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise:gwqy_socialCode'
,
xydm
)
return
[
state
,
retData
]
except
:
log
.
error
(
f
"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}"
)
exeception
=
'获取基本信息接口返回失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
url
,
exeception
)
rePutIntoR
(
''
)
return
[
state
,
retData
]
baseCore
.
rePutIntoR
(
'BaseInfoEnterprise:gwqy_socialCode'
,
xydm
)
return
[
state
,
retData
]
state
=
1
soup
=
BeautifulSoup
(
response
.
content
,
'html.parser'
)
page
=
soup
.
find
(
'div'
,
{
'id'
:
'Col1-0-Profile-Proxy'
})
name
=
page
.
find
(
'h3'
,{
'class'
:
'Fz(m) Mb(10px)'
})
.
text
name
=
page
.
find
(
'h3'
,
{
'class'
:
'Fz(m) Mb(10px)'
})
.
text
try
:
com_info
=
page
.
find
(
'div'
,
{
'class'
:
'Mb(25px)'
})
except
:
...
...
@@ -126,7 +128,7 @@ def getInfo(name,enname,gpdm, xydm, start):
com_jianjie
=
''
dic_com_info
=
{
'公司名称'
:
name
,
'英文名'
:
en
name
,
'英文名'
:
name
,
'信用代码'
:
xydm
,
'股票代码'
:
gpdm
,
'地址'
:
com_address
,
...
...
@@ -189,24 +191,31 @@ def getInfo(name,enname,gpdm, xydm, start):
retData
[
'people_info'
]
=
retPeople
log
.
info
(
f
"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}"
)
response
.
close
()
return
[
state
,
retData
]
return
[
state
,
retData
]
# 保存基本信息
def
saveBaseInfo
(
info
,
start
):
def
saveBaseInfo
(
info
,
start
):
# 基本信息发送到kafka
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
],
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
],
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
],
# 简介
'industry'
:
info
[
'base_info'
][
'行业'
],
# 所属行业
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
],
# 地址
'status'
:
0
,
# 状态
}
try
:
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'shortName'
:
''
,
# 企业简称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'officialPhone'
:
info
[
'base_info'
][
'电话'
],
# 电话
'officialUrl'
:
info
[
'base_info'
][
'公司网站'
],
# 官网
'briefInfo'
:
info
[
'base_info'
][
'公司简介'
],
# 简介
'industry'
:
info
[
'base_info'
][
'行业'
],
# 所属行业
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
'address'
:
info
[
'base_info'
][
'地址'
],
# 地址
'status'
:
0
,
# 状态
}
except
:
company_dict
=
{
'name'
:
info
[
'base_info'
][
'公司名称'
],
# 企业名称
'socialCreditCode'
:
info
[
'base_info'
][
'信用代码'
],
# 统一社会信用代码
'englishName'
:
info
[
'base_info'
][
'英文名'
],
# 英文名
}
# print(company_dict)
producer
=
KafkaProducer
(
bootstrap_servers
=
[
'114.115.159.144:9092'
],
api_version
=
(
2
,
0
,
2
))
kafka_result
=
producer
.
send
(
"regionInfo"
,
json
.
dumps
(
company_dict
,
ensure_ascii
=
False
)
.
encode
(
'utf8'
))
...
...
@@ -216,7 +225,7 @@ def saveBaseInfo(info,start):
# 保存高管信息
def
savePeopleInfo
(
info
,
start
):
def
savePeopleInfo
(
info
,
start
):
# 高管信息调用接口
list_people
=
info
[
'people_info'
]
list_one_info
=
[]
...
...
@@ -240,6 +249,7 @@ def savePeopleInfo(info,start):
json_updata
=
json
.
dumps
(
list_one_info
)
# print(json_updata)
if
json_updata
==
'[]'
:
log
.
info
(
"没有高管"
)
pass
else
:
for
i
in
range
(
0
,
3
):
...
...
@@ -274,18 +284,6 @@ def savePeopleInfo(info,start):
return
state
def
rePutIntoR
(
item
):
r
.
rpush
(
'BaseInfoEnterprise:gwqy_socialCode'
,
item
)
# def getInfomation(social_code):
# sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# cursor.execute(sql)
# data = cursor.fetchone()
# return data
# 采集工作
def
beginWork
():
while
True
:
...
...
@@ -298,65 +296,66 @@ def beginWork():
continue
# 数据库中获取基本信息
data
=
baseCore
.
getInfomation
(
social_code
)
name
=
data
[
1
]
enname
=
data
[
5
]
gpdm
=
data
[
3
]
gpdm
=
'0123'
xydm
=
data
[
2
]
# 获取该企业对应项目的采集次数
count
=
data
[
13
]
start_time
=
time
.
time
()
# 股票代码为空跳过
if
gpdm
is
None
:
log
.
error
(
f
"{name}--股票代码为空 跳过"
)
exception
=
'股票代码为空'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
continue
try
:
retData
=
getInfo
(
name
,
enname
,
gpdm
,
xydm
,
start_time
)
# 基本信息采集成功 进行数据入库,否则不入库
if
retData
[
0
]
==
1
:
# 企业基本信息入库
try
:
saveBaseInfo
(
retData
[
1
],
start_time
)
except
:
log
.
error
(
f
'{name}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
# 企业高管信息入库
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if
state
==
1
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
''
)
if
gpdm
==
''
:
info
=
{
"base_info"
:
{
'公司名称'
:
enname
,
'英文名'
:
enname
,
'信用代码'
:
xydm
,
}}
log
.
error
(
f
'{xydm}....股票代码为空'
)
try
:
saveBaseInfo
(
info
,
start_time
)
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
else
:
try
:
retData
=
getInfo
(
enname
,
gpdm
,
xydm
,
start_time
)
# 基本信息采集成功 进行数据入库,否则不入库
if
retData
[
0
]
==
1
:
# 企业基本信息入库
try
:
saveBaseInfo
(
retData
[
1
],
start_time
)
except
:
log
.
error
(
f
'{enname}....企业基本信息Kafka操作失败'
)
exception
=
'Kafka操作失败'
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
exception
)
# 企业高管信息入库
state
=
savePeopleInfo
(
retData
[
1
],
start_time
)
# 只有企业高管信息和企业基本信息都采集到,该企业才算采集成功
if
state
==
1
:
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
''
)
else
:
pass
else
:
pass
else
:
pass
except
Exception
as
e
:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
ee
=
e
.
__traceback__
.
tb_lineno
log
.
error
(
f
'{name}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
except
Exception
as
e
:
# 若出现尚未发现的错误,则保存错误信息以及出错位置
ee
=
e
.
__traceback__
.
tb_lineno
log
.
error
(
f
'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}'
)
state
=
0
takeTime
=
baseCore
.
getTimeCost
(
start_time
,
time
.
time
())
baseCore
.
recordLog
(
xydm
,
taskType
,
state
,
takeTime
,
''
,
f
'数据采集失败,原因:{ee}行 {e}'
)
# 企业数据采集完成,采集次数加一
count
+=
1
runType
=
'BaseInfoRunCount'
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
baseCore
.
updateRun
(
social_code
,
runType
,
count
)
# 释放资源
baseCore
.
close
()
if
__name__
==
'__main__'
:
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'root'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
charset
=
'utf8mb4'
)
cursor
=
cnx
.
cursor
()
beginWork
()
cursor
.
close
()
cnx
.
close
()
google_comm/googleSpider.py
浏览文件 @
01e0d716
from
urllib.parse
import
urljoin
import
langid
import
pymysql
...
...
@@ -407,12 +408,45 @@ class GoogleSpider(object):
else
:
break
# time.sleep(5)
def
rmTagattr
(
self
,
html
,
url
):
# 使用BeautifulSoup解析网页内容
# soup = BeautifulSoup(html, 'html.parser')
soup
=
self
.
paserUrl
(
html
,
url
)
# 遍历所有标签,并去掉属性
for
tag
in
soup
.
find_all
(
True
):
if
tag
.
name
==
'img'
:
tag
.
attrs
=
{
key
:
value
for
key
,
value
in
tag
.
attrs
.
items
()
if
key
==
'src'
}
elif
tag
.
name
!=
'img'
:
tag
.
attrs
=
{
key
:
value
for
key
,
value
in
tag
.
attrs
.
items
()
if
key
==
'src'
}
else
:
tag
.
attrs
=
{
key
:
value
for
key
,
value
in
tag
.
attrs
.
items
()}
# 打印去掉属性后的网页内容
# print(soup.prettify())
html
=
soup
.
prettify
()
return
html
# 将html中的相对地址转换成绝对地址
def
paserUrl
(
self
,
html
,
listurl
):
soup
=
BeautifulSoup
(
html
,
'html.parser'
)
# 获取所有的<a>标签和<img>标签
links
=
soup
.
find_all
([
'a'
,
'img'
])
# 遍历标签,将相对地址转换为绝对地址
for
link
in
links
:
if
'href'
in
link
.
attrs
:
link
[
'href'
]
=
urljoin
(
listurl
,
link
[
'href'
])
elif
'src'
in
link
.
attrs
:
link
[
'src'
]
=
urljoin
(
listurl
,
link
[
'src'
])
return
soup
#获取资讯内容信息
def
getDetailmsg
(
self
,
detailmsg
):
try
:
detailurl
=
detailmsg
[
'detailUrl'
]
title
=
detailmsg
[
'title'
]
content
,
contentWithTag
=
self
.
extractorMsg
(
detailurl
,
title
)
contentWithTag
=
self
.
rmTagattr
(
contentWithTag
)
except
Exception
as
e
:
content
=
''
contentWithTag
=
''
...
...
sougou_comm/sougouSpider.py
浏览文件 @
01e0d716
...
...
@@ -122,6 +122,7 @@ class SougouSpider(object):
"user-agent"
:
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}
# url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
url
=
f
"https://www.sogou.com{url}"
res
=
requests
.
get
(
url
,
headers
=
header
)
text
=
res
.
text
# 定义正则表达式
...
...
tmp/__init__.py
0 → 100644
浏览文件 @
01e0d716
tmp/usVsRussia/__init__.py
0 → 100644
浏览文件 @
01e0d716
tmp/usVsRussia/ofac.py
0 → 100644
浏览文件 @
01e0d716
#OFAC:美国财政部外国资产控制办公室 (OFAC),数量在200左右,四个类型里的所有带黑点、PDF文件都要。https://ofac.treasury.gov/
# 美国对俄罗斯相关制裁
# 俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions
# 乌克兰/俄罗斯有害外国活动制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions
# 2017年制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/countering-americas-adversaries-through-sanctions-act-related-sanctions
# 马格尼茨基制裁
# https://ofac.treasury.gov/sanctions-programs-and-country-information/the-magnitsky-sanctions
import
os
import
pandas
as
pd
import
pymysql
import
requests
from
bs4
import
BeautifulSoup
from
pymysql.converters
import
escape_string
from
selenium.webdriver.common.by
import
By
from
base.BaseCore
import
BaseCore
baseCore
=
BaseCore
()
log
=
baseCore
.
getLogger
()
headers
=
{
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
,
'accept-encoding'
:
'gzip, deflate, br'
,
'accept-language'
:
'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7'
,
'cache-control'
:
'max-age=0'
,
# 'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
'sec-ch-ua'
:
'"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"'
,
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-platform'
:
"Windows"
,
'sec-fetch-dest'
:
'document'
,
'sec-fetch-mode'
:
'navigate'
,
'sec-fetch-site'
:
'same-origin'
,
'sec-fetch-user'
:
'?1'
,
'upgrade-insecure-requests'
:
'1'
,
'user-agent'
:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# usvsrussia
cnx
=
pymysql
.
connect
(
host
=
'114.115.159.144'
,
user
=
'caiji'
,
password
=
'zzsn9988'
,
db
=
'caiji'
,
charset
=
'utf8mb4'
)
cursor
=
cnx
.
cursor
()
def
job1
():
log
.
info
(
"开始采集----俄罗斯有害外国活动制裁"
)
path
=
r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent
=
baseCore
.
buildDriver
(
path
,
headless
=
False
)
url
=
'https://ofac.treasury.gov/sanctions-programs-and-country-information/russian-harmful-foreign-activities-sanctions'
driverContent
.
get
(
url
)
ftype
=
"Russian Harmful Foreign Activities Sanctions"
# IMPORTANT ADVISORIES AND INFORMATION 重要建议和信息
stype
=
'IMPORTANT ADVISORIES AND INFORMATION'
log
.
info
(
f
"开始采集栏目---{stype}"
)
# //*[@id="node-35986"]/div/ul[1]/li
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[1]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
#a标签
text
=
aEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
liEle
.
text
.
replace
(
text
,
''
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
#log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#Price Cap Policies //*[@id="node-35986"]/div/ul[2]/li
stype
=
'Price Cap Policies'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[2]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# INTERPRETIVE GUIDANCE 解释指导
#INTERPRETIVE GUIDANCE 单独处理
#FREQUENTLY ASKED QUESTIONS 单独处理
#RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="directives"]/ul/li'
)
stype
=
'RUSSIAN HARMFUL FOREIGN ACTIVITIES SANCTIONS DIRECTIVES'
log
.
info
(
f
"开始采集栏目---{stype}"
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#APPLYING FOR A SPECIFIC OFAC LICENSE
#GUIDANCE ON OFAC LICENSING POLICY
stype
=
'GUIDANCE ON OFAC LICENSING POLICY'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[6]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#GENERAL LICENSES
stype
=
'GENERAL LICENSES'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[7]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#Executive Orders
stype
=
'Executive Orders'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[8]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#Determinations
stype
=
'Determinations'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[9]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#Statutes
stype
=
'Statutes'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[10]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#Code of Federal Regulations
#Federal Register Notices
stype
=
'Federal Register Notices'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-35986"]/div/ul[12]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{ftype}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
driverContent
.
close
()
def
job2
():
log
.
info
(
"开始采集----乌克兰-俄罗斯有害外国活动制裁"
)
path
=
r'E:\chromedriver_win32\115\chromedriver.exe'
driverContent
=
baseCore
.
buildDriver
(
path
,
headless
=
False
)
url
=
'https://ofac.treasury.gov/sanctions-programs-and-country-information/ukraine-russia-related-sanctions'
driverContent
.
get
(
url
)
ftype
=
"Ukraine-/Russia-related Sanctions"
# IMPORTANT ADVISORIES
stype
=
'IMPORTANT ADVISORIES'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[1]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# IMPORTANT ADVISORIES
stype
=
'SANCTIONS BROCHURES'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[1]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
aEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#ADDITIONAL UKRAINE-/RUSSIA-RELATED SANCTIONS INFORMATION
#FREQUENTLY ASKED QUESTIONS
#SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST
stype
=
'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST'
log
.
info
(
f
"开始采集栏目---{stype}"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="directives"]/ul[1]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# Archived Directives
stype
=
'SECTORAL SANCTIONS IDENTIFICATIONS (SSI) LIST'
log
.
info
(
f
"开始采集栏目---{stype}---Archived Directives"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="directives"]/ul[2]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','Archived Directives',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#INTERPRETIVE GUIDANCE
stype
=
'INTERPRETIVE GUIDANCE'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[5]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
#GUIDANCE ON OFAC LICENSING POLICY
stype
=
'GUIDANCE ON OFAC LICENSING POLICY'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[7]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
#time = baseCore.getSubStr(text, '(', ')')
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# GENERAL LICENSES
stype
=
'GENERAL LICENSES'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[8]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
#time = ''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# Executive Orders
stype
=
'Executive Orders'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[9]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
time
=
baseCore
.
getSubStr
(
text
,
'('
,
')'
)
# time = ''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# Determinations
stype
=
'Determinations'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[10]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
#time = baseCore.getSubStr(text, '(', ')')
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# Statutes
stype
=
'Statutes'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[11]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
# time = baseCore.getSubStr(text, '(', ')')
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
# Federal Register Notices
stype
=
'Federal Register Notices'
log
.
info
(
f
"开始采集栏目---{stype}---"
)
liEles
=
driverContent
.
find_elements
(
By
.
XPATH
,
'//*[@id="node-6416"]/div/ul[13]/li'
)
for
liEle
in
liEles
:
aEle
=
liEle
.
find_element
(
By
.
TAG_NAME
,
'a'
)
# a标签
text
=
liEle
.
text
href
=
aEle
.
get_attribute
(
'href'
)
# time = baseCore.getSubStr(text, '(', ')')
time
=
''
selectCountSql
=
f
"select count(1) from usvsrussia where ftype='{escape_string(ftype)}' and stype='{stype}' and url='{href} '"
cursor
.
execute
(
selectCountSql
)
count
=
cursor
.
fetchone
()[
0
]
if
count
>
0
:
log
.
info
(
"已采集,跳过"
)
continue
else
:
pass
insertSql
=
f
"insert into usvsrussia (website,ftype,stype,ttype,url,title,pub_time,state) values ("
\
f
"'美国财政部外国资产控制办公室','{escape_string(ftype)}','{stype}','',"
\
f
"'{href}','{escape_string(text)}','{time}',0)"
# log.info(insertSql)
cursor
.
execute
(
insertSql
)
cnx
.
commit
()
driverContent
.
close
()
def job3():
    """采集 OFAC "2017年制裁" (CAATSA) 页面各栏目的链接并写入 usvsrussia 表。

    Scrapes the CAATSA-related sanctions page of the U.S. Treasury OFAC
    site with a (non-headless) Chrome driver and inserts every link not
    already stored. Relies on module-level ``baseCore``, ``log``,
    ``cursor`` and ``cnx``.
    """
    log.info("开始采集----2017年制裁")
    path = r'E:\chromedriver_win32\115\chromedriver.exe'
    driverContent = baseCore.buildDriver(path, headless=False)

    ftype = "Countering America's Adversaries Through Sanctions Act of 2017 (CAATSA)"

    def _save_item(stype, href, title, pub_time):
        """Insert one (ftype, stype, url) record unless it already exists.

        Uses parameterized SQL so quotes in scraped text cannot break the
        statement (the previous code interpolated unescaped ``href`` and
        ``time`` directly into the query string).
        Returns True when a new row was inserted, False when skipped.
        """
        # FIX: the old duplicate check compared url='{href} ' with a stray
        # trailing space, so it never matched the stored value and every
        # run re-inserted the same rows.
        cursor.execute(
            "select count(1) from usvsrussia "
            "where ftype=%s and stype=%s and url=%s",
            (ftype, stype, href),
        )
        if cursor.fetchone()[0] > 0:
            log.info("已采集,跳过")
            return False
        cursor.execute(
            "insert into usvsrussia "
            "(website,ftype,stype,ttype,url,title,pub_time,state) "
            "values (%s,%s,%s,'',%s,%s,%s,0)",
            ('美国财政部外国资产控制办公室', ftype, stype, href, title, pub_time),
        )
        cnx.commit()
        return True

    def _collect_section(stype, li_xpath, with_time=True):
        """Save every <li><a> matched by ``li_xpath`` under column ``stype``.

        The publication date is the parenthesized part of the <li> text
        when ``with_time`` is True, otherwise left empty (some sections
        carry no date in their item text).
        """
        log.info(f"开始采集栏目---{stype}---")
        for liEle in driverContent.find_elements(By.XPATH, li_xpath):
            aEle = liEle.find_element(By.TAG_NAME, 'a')  # link element (a标签)
            text = liEle.text
            href = aEle.get_attribute('href')
            pub_time = baseCore.getSubStr(text, '(', ')') if with_time else ''
            _save_item(stype, href, text, pub_time)

    try:
        driverContent.get(
            'https://ofac.treasury.gov/sanctions-programs-and-country-information/'
            'countering-americas-adversaries-through-sanctions-act-related-sanctions'
        )

        # The act itself: a fixed, hand-maintained entry (not scraped).
        _save_item(
            'Countering Americas Adversaries Through Sanctions Act-Related Sanctions',
            'https://congress.gov/115/plaws/publ44/PLAW-115publ44.pdf',
            'Countering America’s Adversaries Through Sanctions Act” (Public Law 115-44) (CAATSA)',
            'August 2, 2017',
        )
        _collect_section('Other Documents Related to the Implementation of Section 105',
                         '//*[@id="node-7161"]/div/ul[2]/li')
        _collect_section('Ukraine-/Russia-related Directives',
                         '//*[@id="node-7161"]/div/ul[4]/li')
        _collect_section('ADDITIONAL CAATSA GUIDANCE AND INFORMATION',
                         '//*[@id="node-7161"]/div/ul[6]/li')
        _collect_section('Executive Orders',
                         '//*[@id="node-7161"]/div/ul[8]/li')
        _collect_section('Statutes',
                         '//*[@id="node-7161"]/div/ul[9]/li', with_time=False)
    finally:
        # Always release the browser, even when scraping raises
        # (previously a mid-run exception leaked the Chrome instance).
        driverContent.close()
def job4():
    """采集 OFAC "马格尼茨基制裁" (Magnitsky Sanctions) 页面各栏目的链接并写入 usvsrussia 表。

    Scrapes the Magnitsky Sanctions page of the U.S. Treasury OFAC site
    with a (non-headless) Chrome driver and inserts every link not
    already stored. Relies on module-level ``baseCore``, ``log``,
    ``cursor`` and ``cnx``.
    """
    log.info("开始采集----马格尼茨基制裁")
    path = r'E:\chromedriver_win32\115\chromedriver.exe'
    driverContent = baseCore.buildDriver(path, headless=False)

    ftype = "Magnitsky Sanctions"

    def _save_item(stype, href, title, pub_time):
        """Insert one (ftype, stype, url) record unless it already exists.

        Parameterized SQL replaces the old string interpolation, which
        left ``href`` and ``time`` unescaped. Returns True on insert.
        """
        # FIX: the old duplicate check compared url='{href} ' with a stray
        # trailing space, so it never matched the stored value and every
        # run re-inserted the same rows.
        cursor.execute(
            "select count(1) from usvsrussia "
            "where ftype=%s and stype=%s and url=%s",
            (ftype, stype, href),
        )
        if cursor.fetchone()[0] > 0:
            log.info("已采集,跳过")
            return False
        cursor.execute(
            "insert into usvsrussia "
            "(website,ftype,stype,ttype,url,title,pub_time,state) "
            "values (%s,%s,%s,'',%s,%s,%s,0)",
            ('美国财政部外国资产控制办公室', ftype, stype, href, title, pub_time),
        )
        cnx.commit()
        return True

    def _collect_section(stype, li_xpath, with_time=True):
        """Save every <li><a> matched by ``li_xpath`` under column ``stype``.

        The publication date is the parenthesized part of the <li> text
        when ``with_time`` is True, otherwise left empty.
        """
        log.info(f"开始采集栏目---{stype}---")
        for liEle in driverContent.find_elements(By.XPATH, li_xpath):
            aEle = liEle.find_element(By.TAG_NAME, 'a')  # link element (a标签)
            text = liEle.text
            href = aEle.get_attribute('href')
            pub_time = baseCore.getSubStr(text, '(', ')') if with_time else ''
            _save_item(stype, href, text, pub_time)

    try:
        driverContent.get(
            'https://ofac.treasury.gov/sanctions-programs-and-country-information/'
            'the-magnitsky-sanctions'
        )
        _collect_section('INTERPRETIVE GUIDANCE',
                         '//*[@id="node-6306"]/div/ul[2]/li')
        _collect_section('GUIDANCE ON OFAC LICENSING POLICY',
                         '//*[@id="node-6306"]/div/ul[4]/li', with_time=False)
        _collect_section('Statutes',
                         '//*[@id="node-6306"]/div/ul[5]/li', with_time=False)
        _collect_section('Federal Register Notices',
                         '//*[@id="node-6306"]/div/ul[7]/li', with_time=False)
    finally:
        # Always release the browser, even when scraping raises
        # (previously a mid-run exception leaked the Chrome instance).
        driverContent.close()
if __name__ == '__main__':
    # Entry point: run the four OFAC scraping jobs in sequence, then
    # release shared resources (baseCore helpers and the MySQL cursor /
    # connection) even if one of the jobs raises — previously an
    # exception in any job leaked the DB connection and driver helpers.
    log.info("美国财政部外国资产控制办公室 (OFAC)网站开始采集")
    try:
        job1()
        job2()
        job3()
        job4()
    finally:
        baseCore.close()
        cursor.close()
        cnx.close()
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论