Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
王景浩
zzsn_spider
Commits
de21c2fe
提交
de21c2fe
authored
1月 12, 2024
作者:
LiuLiYuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fanyi 01/12
上级
b7d2cc8d
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
3 个修改的文件
包含
217 行增加
和
0 行删除
+217
-0
baidufanyi.py
百度翻译/baidufanyi.py
+217
-0
fanyi_test.py
百度翻译/fanyi_test.py
+0
-0
test.py
百度翻译/test.py
+0
-0
没有找到文件。
百度翻译/baidufanyi.py
0 → 100644
浏览文件 @
de21c2fe
#coding:utf-8
#coding:utf-8
# 百度翻译 不登录翻译1000字 登录翻译5000字
import
re
import
string
import
time
from
urllib.parse
import
quote
import
pymongo
from
bs4
import
BeautifulSoup
from
bson
import
ObjectId
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.webdriver.support.wait
import
WebDriverWait
# from selenium.webdriver.chrome.service import Service
from
selenium.webdriver.firefox.service
import
Service
from
selenium.webdriver.firefox.options
import
Options
from
base.BaseCore
import
BaseCore
# Shared BaseCore helper instance; in this file it is used for
# baseCore.detect_language() inside Translate.gethtml.
baseCore = BaseCore()
class Translate():
    """Translate text and HTML fragments through the Baidu Translate web page.

    A Selenium-driven Firefox session opens https://fanyi.baidu.com/, types the
    source text into the page and reads the rendered translation back.
    """

    # Page element whose text is returned as the translation.
    RESULT_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
    # Page element whose "data-lang" attribute is read as the input language.
    SOURCE_LANG_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span'
    # id of the text box the source text is typed into.
    INPUT_ID = "baidu_translate_input"
    # Default geckodriver executable used by createDriver().
    GECKODRIVER_PATH = r'F:\spider\firefox\geckodriver_1.exe'
    # Maximum number of characters sent to the page in one request.
    MAX_CHUNK_LENGTH = 1000
    # Characters after which a long text may be cut into chunks.
    _SPLIT_CHARS = ('.', '。', ',', ',')
    # Characters treated as punctuation by is_punctuation().
    _PUNCTUATION = string.punctuation + '、(…)《》“”:;! 。'

    def __init__(self):
        """Start the browser session and open the MongoDB collection handle."""
        # NOTE(review): this URL has no "{}" placeholders, so the
        # ``self.url.format(...)`` call in translate() leaves it unchanged.
        self.url = "https://fanyi.baidu.com/#"
        # Not referenced by any other method of this class.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
        }
        self.browser = self.createDriver()
        # NOTE(review): host and credentials are hard-coded in source; consider
        # loading them from configuration instead.
        self.db_storage = pymongo.MongoClient(
            'mongodb://114.115.221.202:27017',
            username='admin',
            password='ZZsn@9988',
        ).中科软['数据源_0106']

    def close(self):
        """Quit the browser session."""
        self.browser.quit()

    def createDriver(self, driver_path=None):
        """Create a Firefox WebDriver with an overridden user agent.

        Args:
            driver_path: Path to the geckodriver executable. Defaults to
                ``GECKODRIVER_PATH`` when omitted.

        Returns:
            The new ``webdriver.Firefox`` instance.
        """
        service = Service(driver_path or self.GECKODRIVER_PATH)
        options = Options()
        options.set_preference(
            "general.useragent.override",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        )
        return webdriver.Firefox(options=options, service=service)

    def translate(self, sentence, lang, max_retries=3):
        """Translate ``sentence`` and return the stripped text of the result element.

        On any failure the browser is restarted and the request is retried.

        Args:
            sentence: Text to translate.
            lang: Source language code of ``sentence``.
            max_retries: Number of additional attempts after the first failure.

        Returns:
            The translated text.

        Raises:
            RuntimeError: If every attempt failed; the last error is chained.
        """
        last_error = None
        for _ in range(max_retries + 1):
            try:
                return self._translate_once(sentence, lang)
            except Exception as exc:
                # Bounded retry instead of unbounded recursion, so a site that
                # keeps failing cannot spawn browsers until RecursionError.
                last_error = exc
                self._restart_browser()
        raise RuntimeError(
            f'translation failed after {max_retries + 1} attempts'
        ) from last_error

    def _restart_browser(self):
        """Replace the current browser session with a fresh one."""
        try:
            self.browser.quit()
        except Exception:
            # Best-effort cleanup: the session may already be dead.
            pass
        self.browser = self.createDriver()

    def _translate_once(self, sentence, lang):
        """Run a single translation attempt; any failure propagates."""
        wait = WebDriverWait(self.browser, 20)
        word_type = self.get_input_language_type(sentence, wait)
        if word_type:
            # Whenever the page reports a language, the caller's code is used.
            word_type = lang
        # NOTE(review): with the current self.url this format() is a no-op; the
        # page then relies on the text typed by get_input_language_type().
        url = quote(self.url.format(word_type, 'zh', sentence), safe='/:#')
        self.browser.set_page_load_timeout(10)
        self.browser.get(url)
        result_element = wait.until(
            EC.presence_of_element_located((By.XPATH, self.RESULT_XPATH))
        )
        return result_element.text.strip()

    def get_input_language_type(self, word, wait):
        """Type ``word`` into the page and return the language it reports.

        Args:
            word: Text to enter into the translation input box.
            wait: ``WebDriverWait`` bound to the current browser.

        Returns:
            The ``data-lang`` attribute of the source-language element.
        """
        self.browser.get("https://fanyi.baidu.com/")
        input_box = wait.until(
            EC.presence_of_element_located((By.ID, self.INPUT_ID))
        )
        input_box.send_keys(word)
        lang_element = wait.until(
            EC.presence_of_element_located((By.XPATH, self.SOURCE_LANG_XPATH))
        )
        return lang_element.get_attribute("data-lang")

    def is_punctuation(self, char):
        """Return True if ``char`` occurs in the punctuation string.

        This is a substring test, so the empty string also yields True.
        """
        return char in self._PUNCTUATION

    def sentence_split_sentence(self, contentWithTag):
        """Split text into its non-empty lines.

        Returns:
            A list of ``(start, end, text)`` tuples, one per run of
            non-newline characters. If no run is found and the input has at
            least four characters, the whole input is returned as one entry.
        """
        pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
        sentences = [
            (match.start(), match.end(), match.group())
            for match in pattern.finditer(contentWithTag)
        ]
        if not sentences and len(contentWithTag) >= 4:
            sentences.append((0, len(contentWithTag), contentWithTag))
        return sentences

    def jionstr(self, html):
        """Join the text nodes of ``html`` into paragraphs of limited length.

        Nodes that are a lone newline, tab or space, that match
        is_punctuation(), or that start with a URL prefix are skipped.

        Args:
            html: Parsed document exposing ``find_all(text=True)``.

        Returns:
            A list of stripped paragraphs. A paragraph is closed before adding
            a node would push it past ``MAX_CHUNK_LENGTH`` characters.
        """
        paragraphs = []
        current_sentence = ''
        for tag in html.find_all(text=True):
            sentence = str(tag)
            if sentence in ('\n', '\t', ' '):
                continue
            if self.is_punctuation(sentence):
                continue
            if sentence.startswith(('https://', 'http://', 'www.')):
                continue
            # Check whether the joined text would exceed the length limit.
            if len(current_sentence) + len(sentence) <= self.MAX_CHUNK_LENGTH:
                current_sentence += sentence
                continue
            if current_sentence.strip():
                paragraphs.append(current_sentence.strip())
            current_sentence = sentence
        # Flush the last accumulated paragraph, which was previously dropped.
        if current_sentence.strip():
            paragraphs.append(current_sentence.strip())
        return paragraphs

    @staticmethod
    def _chunk_end(text, limit):
        """Return the index at which the first chunk of ``text`` should end.

        The cut is placed right after the last split character found at
        positions ``limit - 1`` down to 1; without one it falls at ``limit``.
        """
        for position in range(limit - 1, 0, -1):
            if text[position] in Translate._SPLIT_CHARS:
                return position + 1
        return limit

    def gethtml(self, contentWithTag):
        """Translate the text nodes of an HTML fragment.

        Args:
            contentWithTag: HTML source to translate.

        Returns:
            ``contentWithTag`` unchanged when its language is detected as
            'zh' or it has no text; otherwise the prettified document with
            translated text nodes, followed by the source footer.

        Raises:
            RuntimeError: Propagated from translate() when a chunk cannot be
                translated.
        """
        html = BeautifulSoup(contentWithTag, 'html.parser')
        lang = baseCore.detect_language(html.text)
        if lang == 'zh':
            return contentWithTag

        # The same node list is used to build the text and to write the
        # translations back, so segments and nodes stay aligned.
        text_nodes = [
            node for node in html.find_all(text=True) if str(node).strip()
        ]
        if not text_nodes:
            return contentWithTag

        # '😊' marks the boundary between nodes inside the joined text.
        sentence = ''.join(f'{str(node).strip()}😊' for node in text_nodes)

        translated = []
        while len(sentence) > self.MAX_CHUNK_LENGTH:
            cut = self._chunk_end(sentence, self.MAX_CHUNK_LENGTH)
            chunk = sentence[:cut].strip()
            if chunk:
                translated.append(self.translate(chunk, lang))
            sentence = sentence[cut:]
        # The remainder always ends with a marker, so it is never empty.
        translated.append(self.translate(sentence, lang))
        time.sleep(2)

        sentences = ''.join(translated).split('😊')
        print(len(sentences))
        # zip() stops at the shorter side: nodes without a translated segment
        # keep their original text instead of raising IndexError.
        for node, text in zip(text_nodes, sentences):
            node.replace_with(text)
        # NOTE(review): the footer names Microsoft although Baidu is used, and
        # "<p/>" looks like a typo for "<p>"; kept as-is for consumers.
        return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
if __name__ == "__main__":
    # Manual smoke test: fetch one stored document and print the translation
    # of its 'richTextForeign' field.
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration instead.
    db_storage = pymongo.MongoClient(
        'mongodb://114.115.221.202:27017/',
        username='admin',
        password='ZZsn@9988',
    ).中科软['数据源_0504']
    test = Translate()
    try:
        data = db_storage.find_one({'_id': ObjectId('656f14e84d6d77428c713271')})
        if data is None:
            # find_one returned nothing; avoid a TypeError on subscripting None.
            print('document not found')
        else:
            result = test.gethtml(data['richTextForeign'])
            print(result)
    finally:
        # Always release the browser, even when the lookup or translation fails.
        test.close()
\ No newline at end of file
百度翻译/fanyi_test.py
0 → 100644
浏览文件 @
de21c2fe
差异被折叠。
点击展开。
百度翻译/test.py
deleted
100644 → 0
浏览文件 @
b7d2cc8d
差异被折叠。
点击展开。
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论