Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
a45d37fb
提交
a45d37fb
authored
2月 21, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
国外智库
上级
4f59604c
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
204 行增加
和
0 行删除
+204
-0
BaseCore.py
gwzk/BaseCore.py
+0
-0
europa.py
gwzk/europa.py
+204
-0
没有找到文件。
gwzk/BaseCore.py
浏览文件 @
a45d37fb
差异被折叠。
点击展开。
gwzk/europa.py
0 → 100644
浏览文件 @
a45d37fb
"""
国外智库-欧盟 经合组织
"""
import
json
import
time
import
pymongo
from
bs4
import
BeautifulSoup
import
requests
from
datetime
import
datetime
from
kafka
import
KafkaProducer
from
retry
import
retry
import
BaseCore
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
db_storage
=
pymongo
.
MongoClient
(
'mongodb://114.115.221.202:27017'
,
username
=
'admin'
,
password
=
'ZZsn@9988'
)
.
ZZSN
[
'国外智库'
]
@retry(tries=2, delay=5)
def sendKafka(dic):
    """Send one article record to the Kafka topic ``research_center_fourth``.

    The record is serialized as UTF-8 JSON (non-ASCII preserved via
    ``ensure_ascii=False``).

    :param dic: article dict; must contain ``sourceAddress`` (used in the log line).
    :raises Exception: any kafka/serialization error — ``@retry`` retries once
        after a 5 second delay.
    """
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],
                             max_request_size=1024 * 1024 * 20)
    try:
        future = producer.send("research_center_fourth",
                               json.dumps(dic, ensure_ascii=False).encode('utf8'))
        # Block until the broker acknowledges. The original never waited on the
        # future nor flushed, so the short-lived producer could be torn down
        # before the message was actually delivered.
        future.get(timeout=30)
        log.info(f'{dic["sourceAddress"]}传输成功')
    finally:
        # The original leaked one producer (sockets + background thread) per call.
        producer.close()
def secrchATT(item_id, retData, type_id, order_by):
    """Look up an existing row in ``clb_sys_attachment`` matching the given
    item, path, type and ordering.

    :param item_id: business item id the attachment belongs to.
    :param retData: upload-result dict; only ``retData['path']`` is used.
    :param type_id: attachment type id.
    :param order_by: ordering index of the attachment.
    :returns: the first matching ``(id,)`` row, or ``None`` when absent.
    """
    sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    baseCore.cursor_.execute(sel_sql, params)
    return baseCore.cursor_.fetchone()
# Insert into the attachment table and return the new attachment id.
def tableUpdate(retData, file_name, num, publishDate, origin):
    """Insert one attachment row into ``clb_sys_attachment`` and return its id.

    :param retData: dict produced by ``baseCore.uptoOBS`` (item_id, type_id,
        group_name, path, full_path, category, file_size, status, create_by,
        page_size, create_time).
    :param file_name: attachment name without extension; '.pdf' is appended.
    :param num: ordering index stored in ``order_by``.
    :param publishDate: publish time stored on the row.
    :param origin: source/origin label stored on the row.
    :returns: id of the inserted row, or ``None`` when the post-insert lookup
        finds nothing.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']  # read for parity with retData; not stored below
    create_time = retData['create_time']
    order_by = num
    # Object key is the OBS path relative to the bucket endpoint.
    object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
    Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    values = (
        file_name + '.pdf',
        type_id,
        item_id,
        group_name,
        path,
        full_path,
        category,
        file_size,
        order_by,
        status,
        create_by,
        create_time,
        object_key,
        'zzsn',  # bucket_name
        publishDate,
        origin,
    )
    baseCore.cursor_.execute(Upsql, values)  # insert
    baseCore.cnx_.commit()  # commit
    baseCore.getLogger().info("更新完成:{}".format(Upsql))
    selects = secrchATT(item_id, retData, type_id, order_by)
    # Bug fix: the original did `id = selects[0]`, shadowing the builtin `id`
    # and raising TypeError when the lookup returned None. Return None instead;
    # callers already guard with `if att_id:`.
    if selects:
        return selects[0]
    return None
def save_data(dic_news):
    """Persist a crawl summary of one article to the MongoDB collection.

    Stores attachment ids, source URL, origin, timestamps, the first 100
    characters of the tagged content, and the title (keys are Chinese labels).

    :param dic_news: full article dict as assembled in ``doJob``.
    """
    aaa_dic = {
        '附件id': dic_news['attachmentIds'],
        '网址': dic_news['sourceAddress'],
        'tid': '',
        # fix: original used an f-string with no placeholders (ruff F541);
        # the runtime value is identical.
        '来源': "经济合作与发展组织",
        '创建时间': dic_news['createDate'],
        # Only a 100-char preview of the tagged content is kept in Mongo.
        '带标签内容': dic_news['contentWithTag'][:100],
        '发布时间': dic_news['publishDate'],
        '标题': dic_news['title'],
    }
    db_storage.insert_one(aaa_dic)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
    """Translate a title and HTML fragment via the internal translation service.

    :param title: source title (may contain HTML; tags are stripped from the result).
    :param contentWithTag: source HTML content.
    :returns: tuple ``(titleRaw, contentWithTagRaw)`` — plain-text translated
        title, and the translated HTML parsed into a BeautifulSoup document.
    :raises RuntimeError: when the service reports ``status == 'failed'``
        (``@retry`` retries once after 5 seconds).
    """
    headers = {
        'Content-Type': 'application/json',
    }
    dic_info = {
        'title': title,
        # 'summary': '<div>apple</div>',
        'contentWithTag': contentWithTag
    }
    payload = json.dumps(dic_info)
    # fix: added a timeout — without one a hung translation service blocks the
    # crawler indefinitely.
    req = requests.post('http://117.78.23.14:5001/translate', data=payload,
                        headers=headers, timeout=60)
    dataJson = req.json()
    if dataJson['status'] == 'failed':
        # fix: the original used a bare `raise` with no active exception, which
        # itself raises "RuntimeError: No active exception to re-raise". Raise
        # an explicit, descriptive error so retry/log output is meaningful.
        raise RuntimeError('translate service returned status=failed')
    titleRaw = BeautifulSoup(dataJson['title'], 'html.parser').text
    contentWithTagRaw = BeautifulSoup(dataJson['contentWithTag'], 'html.parser')
    return titleRaw, contentWithTagRaw
def doJob():
    """Crawl the OECD iLibrary 'policy responses' listing: for each article not
    yet in MongoDB, translate it, upload its PDF to OBS, and push the record
    to Kafka and MongoDB.

    Reads module-level globals ``pathType``, ``taskType`` and ``create_by``,
    which are assigned in the ``__main__`` guard before this is called.
    """
    # Attachment ordering counter, passed to tableUpdate as order_by.
    num = 1
    url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
    # Browser-mimicking request headers. NOTE(review): the Cookie value is a
    # captured session (cf_clearance etc.) and will expire — presumably needs
    # periodic refreshing for the crawl to keep working; confirm.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
        'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    req = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    # The second 'body-section' div on the page holds the article list.
    div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
    div_list = div_part.find_all('div', class_='row panel')
    for div in div_list:
        start_time = time.time()
        title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
        href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
        # De-duplicate against MongoDB by source URL (key '网址').
        is_href = db_storage.find_one({'网址': href})
        if is_href:
            log.info(f'{href}===已采集')  # already collected — skip
            continue
        pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
        # Define the site's raw date format (e.g. "21 Feb 2024").
        time_format = "%d %b %Y"
        # Convert to standard ISO "YYYY-MM-DD".
        standard_time = datetime.strptime(pubtime_, time_format).strftime("%Y-%m-%d")
        # Cutoff: only articles published after 2023-01-30. Lexicographic
        # comparison is valid for ISO-formatted dates. `break` (not `continue`)
        # assumes the listing is sorted newest-first — TODO confirm.
        if standard_time > '2023-01-30':
            pass
        else:
            break
        year = standard_time[:4]  # NOTE(review): assigned but never used below
        pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
        pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
        req_news = requests.get(url=href, headers=headers)
        soup_news = BeautifulSoup(req_news.content, 'html.parser')
        # print(title, standard_time, pdf_url, href)
        contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
        content = contentWithTag.get_text()
        # TODO: translation — skip the article entirely if it fails.
        try:
            titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
            log.info(f'{href}===翻译成功')  # translation succeeded
        except Exception as e:
            log.error(f'{href}===翻译失败==={e}')  # translation failed
            continue
        # Upload the PDF to OBS. The literal 15 is presumably the attachment
        # type id — TODO confirm against BaseCore.uptoOBS.
        retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by)
        num += 1
        id_list = []
        if retData['state']:
            att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
            if att_id:
                id_list.append(att_id)
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        lang = baseCore.detect_language(content)
        # Extract plain text first, then stringify the translated soup —
        # order matters: contentWithTagRaw is rebound from soup to str here.
        contentRaw = contentWithTagRaw.text
        contentWithTagRaw = str(contentWithTagRaw)
        # Record sent downstream; 'id' = fixed subjectId + unix-seconds suffix.
        dic = {
            'id': f'1620244462491893761{int(time.time())}',
            'subjectId': '1620244462491893761',
            'checkStatus': 1,
            'deleteFlag': 0,
            'topNum': 0,
            'content': content,
            'contentRaw': contentRaw,
            'contentWithTag': str(contentWithTag),
            'contentWithTagRaw': contentWithTagRaw,
            'createDate': now,
            'labels': [{
                'labelMark': 'organization',
                'relationId': '1619903523269271554',
                'relationName': '经济合作与发展组织'
            }],
            'lang': lang,
            'origin': '经济合作与发展组织',
            'publishDate': standard_time,
            'sourceAddress': href,
            'title': title,
            'titleRaw': titleRaw,
            'updateDate': now,
            'attachmentIds': id_list
        }
        sendKafka(dic)
        try:
            save_data(dic)
        except:
            log.error(f'{href}===数据库保存失败')  # database save failed
        # break
if __name__ == "__main__":
    # Globals read by doJob()/baseCore.uptoOBS:
    # OBS storage path prefix for uploaded PDFs.
    pathType = 'PolicyDocuments/'
    # Task label ('foreign think tank - OECD') used for tracking the crawl.
    taskType = '国外智库-经合组织'
    # Operator name recorded on created attachment rows.
    create_by = 'XueLingKun'
    doJob()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论