Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
fd395ea2
提交
fd395ea2
authored
1月 15, 2024
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
中科软 01/15
上级
14054899
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
222 行增加
和
0 行删除
+222
-0
baidufanyi.py
zkr/baidufanyi.py
+222
-0
esToMongodb.py
zkr/esToMongodb.py
+0
-0
推送.py
zkr/推送.py
+0
-0
翻译.py
zkr/翻译.py
+0
-0
没有找到文件。
zkr/baidufanyi.py
0 → 100644
浏览文件 @
fd395ea2
#coding:utf-8
# Baidu Translate: up to 1000 characters per request without login, 5000 characters when logged in
import
re
import
string
import
time
from
urllib.parse
import
quote
import
psutil
import
pymongo
from
bs4
import
BeautifulSoup
from
bson
import
ObjectId
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.webdriver.support.wait
import
WebDriverWait
# from selenium.webdriver.chrome.service import Service
from
selenium.webdriver.firefox.service
import
Service
from
selenium.webdriver.firefox.options
import
Options
from
selenium.webdriver.common.proxy
import
Proxy
,
ProxyType
from
func_timeout
import
func_set_timeout
from
base.BaseCore
import
BaseCore
# Module-wide BaseCore instance; the Translate class below uses it for
# proxy lookup (get_proxy) and language detection (detect_language).
baseCore = BaseCore()
# Selenium-driven client for the Baidu Translate web page (fanyi.baidu.com).
# It owns one headless Firefox session plus a MongoDB collection handle, and
# offers helpers to translate plain text and the text nodes of HTML fragments.
class Translate():
def __init__(self):
    """Start the browser session and open the MongoDB collection.

    The MongoDB endpoint and credentials default to the values that used to
    be hard-coded here, so existing deployments keep working, but each one
    can now be overridden through an environment variable:

    * ``ZKR_MONGO_URI``      - connection URI
    * ``ZKR_MONGO_USER``     - user name
    * ``ZKR_MONGO_PASSWORD`` - password
    """
    import os  # local import so this block does not depend on file-level changes

    # Page address that ``translate`` navigates to.
    self.url = "https://fanyi.baidu.com/#"
    self.header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
    }
    self.browser = self.createDriver()

    # SECURITY NOTE(review): the fallback values below are real credentials
    # committed to source control. Prefer supplying them through the
    # environment and rotating/removing the literals once callers are migrated.
    mongo_client = pymongo.MongoClient(
        os.environ.get('ZKR_MONGO_URI', 'mongodb://114.115.221.202:27017'),
        username=os.environ.get('ZKR_MONGO_USER', 'admin'),
        password=os.environ.get('ZKR_MONGO_PASSWORD', 'ZZsn@9988'),
    )
    self.db_storage = mongo_client.中科软['数据源_0106']
def close(self):
    """Shut down the Selenium browser session held by this instance."""
    driver = self.browser
    driver.quit()
def is_website_link(self, string):
    """Tell whether *string* has the shape of a bare website address.

    The whole string must match: an optional ``http``/``https`` scheme, an
    optional ``://``, a dotted host name, and an optional path made of
    letters, digits and ``-_.?=/``. Any ``word.word`` token satisfies this,
    so values such as ``"3.14"`` are also reported as links.

    Args:
        string: The text to test.

    Returns:
        True if the text matches the pattern, otherwise False.
    """
    link_shape = r"^(http|https)?(://)?[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+(/[a-zA-Z0-9-_.?=/]*)?$"
    return re.match(link_shape, string) is not None
def createDriver(self):
    """Create a headless Firefox WebDriver routed through a proxy.

    The proxy comes from ``baseCore.get_proxy()``; its ``'http'`` entry is
    parsed as ``<scheme>://<host>:<port>`` and applied to both the HTTP and
    the SSL proxy settings.

    The Firefox profile is attached through ``options.profile`` instead of
    the ``firefox_profile=`` constructor keyword: that keyword was deprecated
    in Selenium 4 and removed in 4.10, where passing it raises ``TypeError``.

    Returns:
        The started ``webdriver.Firefox`` instance.
    """
    proxy_ = baseCore.get_proxy()

    # Parse the proxy address once instead of re-splitting it per preference.
    host_and_port = proxy_['http'].split('://')[1].split(':')
    proxy_host = host_and_port[0]
    proxy_port = int(host_and_port[1])

    profile = webdriver.FirefoxProfile()
    profile.set_preference('network.proxy.type', 1)  # 1 = manual proxy configuration
    profile.set_preference('network.proxy.http', proxy_host)
    profile.set_preference('network.proxy.http_port', proxy_port)
    # NOTE(review): the SSL proxy reuses the 'http' entry, as before -
    # confirm that get_proxy() has no separate HTTPS endpoint to use here.
    profile.set_preference('network.proxy.ssl', proxy_host)
    profile.set_preference('network.proxy.ssl_port', proxy_port)
    profile.update_preferences()

    service = Service(r'F:\spider\firefox\geckodriver_1.exe')

    options = Options()
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    )
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--private')
    options.profile = profile

    browser = webdriver.Firefox(service=service, options=options)
    return browser
def kill_firefox(self):
    """Kill every running process whose name is exactly ``firefox.exe``.

    Processes that have already exited, cannot be accessed, or are zombies
    are skipped silently so one failure does not stop the sweep.
    """
    ignorable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)
    for process in psutil.process_iter():
        try:
            if process.name() != "firefox.exe":
                continue
            process.kill()
        except ignorable:
            continue
def translate(self, sentence, lang):
    """Translate *sentence* through the Baidu Translate page and return the text.

    The text is first typed into the page by ``get_input_language_type``;
    the page is then (re)loaded and the result paragraph is read. Whenever a
    step fails, the browser is discarded, a fresh one is created and the
    whole attempt starts over. Retries are unbounded, as before, but are now
    performed in a loop rather than by recursion, so a long outage can no
    longer end in ``RecursionError``.

    Args:
        sentence: Text to translate.
        lang: Source-language code; replaces the detected language whenever
            detection returned a value.

    Returns:
        The translated text with surrounding whitespace removed.
    """
    # FunctionTimedOut (raised by the @func_set_timeout decorator on
    # get_input_language_type) derives from BaseException, so it has to be
    # named explicitly. Catching it alongside Exception - instead of a bare
    # ``except:`` - lets KeyboardInterrupt and SystemExit stop the retry loop.
    from func_timeout import FunctionTimedOut

    recoverable = (Exception, FunctionTimedOut)
    result_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'

    while True:
        wait = WebDriverWait(self.browser, 20)
        try:
            word_type = self.get_input_language_type(sentence, wait)
        except recoverable:
            self.browser.quit()
            self.browser = self.createDriver()
            continue

        # A detected language is always overridden by the caller's value.
        if word_type:
            word_type = lang

        # NOTE(review): self.url as assigned in __init__ has no "{}"
        # placeholders, so format() returns it unchanged and the text only
        # reaches the page via send_keys above. Confirm whether a
        # "#<from>/<to>/<text>" URL was intended.
        url = self.url.format(word_type, 'zh', sentence)
        url = quote(url, safe='/:#')
        self.browser.set_page_load_timeout(10)
        try:
            self.browser.get(url)
            wait.until(EC.presence_of_element_located((By.XPATH, result_xpath)))
            result_ = self.browser.find_element(By.XPATH, result_xpath)
            return result_.text.strip()
        except recoverable:
            self.browser.quit()
            self.browser = self.createDriver()
@func_set_timeout(30)
def get_input_language_type(self, word, wait):
    """Type *word* into Baidu Translate and read the detected source language.

    The decorator limits the whole call to 30 seconds.

    Args:
        word: Text to enter into the translation input box.
        wait: A ``WebDriverWait`` bound to ``self.browser``.

    Returns:
        The ``data-lang`` attribute of the page's source-language element.
    """
    input_locator = (By.ID, "baidu_translate_input")
    language_locator = (
        By.XPATH,
        '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span',
    )

    driver = self.browser
    driver.get("https://fanyi.baidu.com/")

    # Wait for the input box, then enter the text.
    wait.until(EC.presence_of_element_located(input_locator))
    driver.find_element(*input_locator).send_keys(word)

    # Wait for the language indicator and read its language code.
    wait.until(EC.presence_of_element_located(language_locator))
    language_element = driver.find_element(*language_locator)
    return language_element.get_attribute("data-lang")
def is_punctuation(self, char):
    """Return True when *char* consists only of punctuation marks or spaces.

    The previous implementation tested ``char in <string>``, which is a
    substring test: the empty string counted as punctuation, while runs such
    as ``"..."`` did not. Each character is now checked individually, so
    every non-empty input accepted before is still accepted, punctuation
    runs are recognised, and ``""`` yields False.

    Args:
        char: A single character, or a short string, to classify.

    Returns:
        True if *char* is non-empty and every character in it is an ASCII
        punctuation mark, a space, or one of the CJK marks listed below.
    """
    marks = frozenset(
        string.punctuation
        + ' '
        + '、…《》“”。'
        # Full-width forms. The original literal repeated ASCII "( ) : ; !",
        # which string.punctuation already contains - presumably these
        # full-width variants were intended (TODO confirm with the author).
        + '(),:;!'
    )
    return bool(char) and all(ch in marks for ch in char)
def sentence_split_sentence(self, contentWithTag):
    """Split text into its non-empty lines, keeping each line's offsets.

    Args:
        contentWithTag: The text to split on newline characters.

    Returns:
        A list of ``(start, end, text)`` tuples, one per non-empty line.
        If no line is found and the input is at least 4 characters long,
        the whole input is returned as a single entry.
    """
    line_pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
    lines = [
        (found.start(), found.end(), found.group())
        for found in line_pattern.finditer(contentWithTag)
    ]
    if lines or len(contentWithTag) < 4:
        return lines
    return [(0, len(contentWithTag), contentWithTag)]
def jionstr(self, html):
    """Join the text nodes of *html* into chunks of at most 1000 characters.

    Nodes that are a lone newline, tab or space, that ``is_punctuation``
    accepts, or that start with ``https://``, ``http://`` or ``www.`` are
    skipped. The remaining nodes are concatenated until adding the next one
    would exceed 1000 characters; a single node longer than that becomes a
    chunk of its own.

    Fixes over the previous version: the last accumulated chunk is now
    returned (it used to be dropped, so short documents produced an empty
    list), and empty chunks are no longer emitted.

    Args:
        html: A parsed document offering ``find_all(text=True)``.

    Returns:
        The list of stripped, non-empty text chunks in document order.
    """
    paragraphs = []
    current_sentence = ''
    for tag in html.find_all(text=True):
        sentence = str(tag)
        if sentence in ('\n', '\t', ' '):
            continue
        if self.is_punctuation(sentence):
            continue
        if sentence.startswith(('https://', 'http://', 'www.')):
            continue
        # Keep growing the current chunk while it stays within 1000 characters.
        if len(current_sentence) + len(sentence) <= 1000:
            current_sentence += sentence
            continue
        finished = current_sentence.strip()
        if finished:
            paragraphs.append(finished)
        current_sentence = sentence

    # Flush whatever is still pending after the last node.
    remaining = current_sentence.strip()
    if remaining:
        paragraphs.append(remaining)
    return paragraphs
@func_set_timeout(300)
def gethtml(self, contentWithTag):
    """Translate the text of an HTML fragment into Chinese, keeping its markup.

    Steps:
      1. Parse the fragment and detect its language; Chinese input is
         returned unchanged.
      2. Collect every text node that is neither blank nor link-shaped and
         join their stripped texts, each followed by the delimiter "😊".
      3. Translate the joined text in pieces of at most 1000 characters,
         cutting after the last full stop or comma inside the window when
         there is one.
      4. Split the translation on the delimiter and write each segment back
         into its text node.

    The decorator limits the whole call to 300 seconds.

    Args:
        contentWithTag: HTML source to translate.

    Returns:
        The original string when the content is detected as Chinese;
        otherwise the prettified translated HTML followed by a source note.

    Raises:
        IndexError: If the translation comes back with fewer delimiter-
            separated segments than there are text nodes to fill.
    """
    html = BeautifulSoup(contentWithTag, 'html.parser')
    lang = baseCore.detect_language(html.text)
    if lang == 'zh':
        return contentWithTag

    def needs_translation(node):
        # Same filter for collecting and for writing back, so both agree.
        text = str(node).strip()
        return text != '' and not self.is_website_link(text)

    nodes = [node for node in html.find_all(text=True) if needs_translation(node)]
    sentence = ''.join(f'{str(node).strip()}😊' for node in nodes)

    # Characters after which a chunk may end. The original compared against
    # ',' twice; the full-width comma is presumably what the second test
    # was meant to be (TODO confirm).
    break_chars = '.。,,'
    result = ''
    while True:
        remainder = sentence.strip()
        if len(remainder) == 1 and self.is_punctuation(remainder):
            # A lone punctuation mark is copied through untranslated.
            result += sentence
            break
        if not remainder:
            # Nothing left to translate: do not send an empty query.
            break
        if len(sentence) <= 1000:
            result += self.translate(sentence, lang)
            time.sleep(2)
            break
        # Cut just after the last break character at index 1..999, or at
        # 1000 characters when the window contains none.
        cut = next(
            (pos + 1 for pos in range(999, 0, -1) if sentence[pos] in break_chars),
            1000,
        )
        result += self.translate(sentence[:cut].strip(), lang)
        sentence = sentence[cut:]

    sentences = result.split('😊')
    if len(sentences) < len(nodes):
        raise IndexError(
            'translated text has %d segments for %d text nodes; '
            'segment delimiters were lost in translation' % (len(sentences), len(nodes))
        )
    for num, node in enumerate(nodes):
        node.replace_with(sentences[num])

    # NOTE(review): the note credits Microsoft translation although this
    # class drives fanyi.baidu.com - confirm the wording with the owner.
    return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
zkr/esToMongodb.py
0 → 100644
浏览文件 @
fd395ea2
差异被折叠。
点击展开。
zkr/推送.py
0 → 100644
浏览文件 @
fd395ea2
差异被折叠。
点击展开。
zkr/翻译.py
0 → 100644
浏览文件 @
fd395ea2
差异被折叠。
点击展开。
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论