Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
540a0e68
提交
540a0e68
authored
1月 11, 2024
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
1/11
上级
176c0051
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
259 行增加
和
0 行删除
+259
-0
test.py
百度翻译/test.py
+259
-0
没有找到文件。
百度翻译/test.py
0 → 100644
浏览文件 @
540a0e68
# Baidu Translate: up to 1000 characters per request without login, 5000 characters when logged in
import
re
import
string
import
time
import
pymongo
from
bs4
import
BeautifulSoup
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.webdriver.support.wait
import
WebDriverWait
from
selenium.webdriver.chrome.service
import
Service
from
base.BaseCore
import
BaseCore
# Shared project helper instance; this file calls its get_proxy() and
# detect_language() methods.
baseCore = BaseCore()
class Translate():
    """Translate the text nodes of stored rich-text documents into Chinese.

    The class drives the Baidu Translate web page (https://fanyi.baidu.com)
    through a Selenium-controlled Chrome and reads its source documents from
    a MongoDB collection.
    """

    # Characters that never need translating on their own (ASCII punctuation,
    # the CJK marks listed by the original code, and the plain space).
    _PUNCTUATION = frozenset(string.punctuation + '、(…)《》“”:;! 。')

    # Marks after which a long text may be cut into separately translated pieces.
    # NOTE(review): the original compared against ',' twice; one of the two was
    # possibly meant to be a full-width comma - confirm against the real file.
    _SPLIT_MARKS = ('.', '。', ',')

    # Matches every non-empty run of characters between line breaks.
    _LINE_PATTERN = re.compile(r'[^\n]+(?=\n)|[^\n]+$')

    # Element holding the translated text on the result page.
    _RESULT_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'

    # Element whose "data-lang" attribute carries the detected input language.
    _LANG_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span'

    def __init__(self):
        """Set up the URL template, the request header and the MongoDB collection."""
        # Placeholders are filled with: source language, target language, text.
        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
        }
        # NOTE(review): host and credentials are hard-coded in source control;
        # they should be moved to configuration / environment variables.
        self.db_storage = pymongo.MongoClient(
            'mongodb://114.115.221.202:27017',
            username='admin',
            password='ZZsn@9988',
        ).中科软['数据源_0106']

    def createDriver(self):
        """Start a Chrome WebDriver that goes through a proxy from ``baseCore``.

        Returns:
            The newly created ``webdriver.Chrome`` instance.
        """
        chrome_driver = r'D:\cmd100\chromedriver.exe'
        service = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
        for argument in (
            '--disable-gpu',
            '--ignore-certificate-errors',
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
        ):
            chrome_options.add_argument(argument)
        proxy = baseCore.get_proxy()
        # Chrome expects "host:port", so the scheme prefix of the proxy URL is dropped.
        chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
        chrome_options.add_argument(
            'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        # ``options=`` replaces the deprecated ``chrome_options=`` keyword, which
        # recent Selenium 4 releases no longer accept together with ``service=``.
        return webdriver.Chrome(service=service, options=chrome_options)

    def translate(self, sentence, browser, lang, max_retries=5):
        """Translate ``sentence`` into Chinese through the Baidu Translate page.

        Args:
            sentence: Text to translate.
            browser: Selenium driver to use. It is quit and replaced by a new
                one from ``createDriver`` whenever an attempt fails.
            lang: Source-language code placed in the translation URL.
            max_retries: Number of additional attempts after the first failure.

        Returns:
            A ``(result, browser)`` tuple. ``browser`` is the driver the caller
            must keep using (it may differ from the one passed in). When the
            page reports no input language, ``sentence`` is returned unchanged.

        Raises:
            RuntimeError: If every attempt failed; the last error is chained.
        """
        attempts = max(1, max_retries + 1)
        for attempt in range(1, attempts + 1):
            wait = WebDriverWait(browser, 20)
            try:
                word_type = self.get_input_language_type(sentence, browser, wait)
                if not word_type:
                    # Nothing was detected: hand the text back untranslated
                    # instead of implicitly returning None, which would break
                    # the tuple unpacking done by callers.
                    return sentence, browser
                # The detected language only gates the request; the caller's
                # ``lang`` is always the one used as source language.
                url = self.url.format(lang, 'zh', sentence)
                browser.set_page_load_timeout(10)
                browser.get(url)
                result_ = wait.until(
                    EC.presence_of_element_located((By.XPATH, self._RESULT_XPATH)))
                result = result_.text.strip()
            except Exception as error:
                if attempt == attempts:
                    # Leave the current driver to the caller for cleanup.
                    raise RuntimeError(
                        f'translation failed after {attempts} attempts') from error
                browser.quit()
                print(f'翻译失败,重新翻译。当前句子为{sentence}')
                browser = self.createDriver()
            else:
                print(f'翻译后的句子:{result}')
                return result, browser

    def get_input_language_type(self, word, browser, wait):
        """Type ``word`` into the Baidu Translate input box and read the detected language.

        Args:
            word: Text typed into the input box.
            browser: Selenium driver used to open the page.
            wait: ``WebDriverWait`` bound to ``browser``.

        Returns:
            The value of the ``data-lang`` attribute of the language element.
        """
        browser.get("https://fanyi.baidu.com/")
        input_word = wait.until(
            EC.presence_of_element_located((By.ID, "baidu_translate_input")))
        input_word.send_keys(word)
        language_element = wait.until(
            EC.presence_of_element_located((By.XPATH, self._LANG_XPATH)))
        return language_element.get_attribute("data-lang")

    def is_punctuation(self, char):
        """Return True when ``char`` consists only of punctuation/space characters.

        Every character is checked individually, so runs such as ``'。。'`` are
        recognised as punctuation. An empty string yields True.
        """
        return all(symbol in self._PUNCTUATION for symbol in char)

    def sentence_split_sentence(self, contentWithTag):
        """Split ``contentWithTag`` into its non-empty lines.

        Returns:
            A list of ``(start, end, text)`` tuples, one per non-empty line.
            When no line is found and the input has at least 4 characters,
            the whole input is returned as a single entry.
        """
        sentences = [
            (match.start(), match.end(), match.group())
            for match in self._LINE_PATTERN.finditer(contentWithTag)
        ]
        if not sentences and len(contentWithTag) >= 4:
            sentences.append((0, len(contentWithTag), contentWithTag))
        return sentences

    def jionstr(self, html):
        """Concatenate the text nodes of ``html`` into chunks of at most 1000 characters.

        Whitespace-only and punctuation-only nodes are skipped. A node is
        never cut: when adding it would exceed the limit, the current chunk is
        closed and the node starts the next one.

        Args:
            html: Parsed document offering ``find_all(text=True)``.

        Returns:
            The list of stripped, non-empty chunks, including the last one.
        """
        paragraphs = []
        current_sentence = ''
        for tag in html.find_all(text=True):
            sentence = str(tag)
            if not sentence.strip() or self.is_punctuation(sentence):
                continue
            # Check whether the concatenated text would exceed 1000 characters.
            if len(current_sentence) + len(sentence) <= 1000:
                current_sentence += sentence
                continue
            if current_sentence.strip():
                paragraphs.append(current_sentence.strip())
            current_sentence = sentence
        # Flush the trailing chunk, which would otherwise be lost.
        if current_sentence.strip():
            paragraphs.append(current_sentence.strip())
        return paragraphs

    def _split_text(self, text, max_len):
        """Cut ``text`` into consecutive pieces of at most ``max_len`` characters.

        Each cut is made right after the last split mark found inside the
        current window (index 0 is not considered); without any mark the
        window is cut at ``max_len``. Joining the pieces gives ``text`` back.
        """
        if max_len < 1:
            raise ValueError('max_len must be at least 1')
        pieces = []
        rest = text
        while len(rest) > max_len:
            cut = max_len
            for index in range(max_len - 1, 0, -1):
                if rest[index] in self._SPLIT_MARKS:
                    cut = index + 1
                    break
            pieces.append(rest[:cut])
            rest = rest[cut:]
        if rest:
            pieces.append(rest)
        return pieces

    def gethtml(self, max_len=50):
        """Translate the text nodes of the selected documents and print the result.

        Up to 10 documents with ``postCode == '2'`` and a ``newsTime`` on
        2024-01-01 are read. Every text node of their ``richTextForeign`` HTML
        is replaced by its translation and the resulting HTML is printed.

        Args:
            max_len: Maximum number of characters sent per translation request.
                Longer text nodes are split into several requests. The file's
                header comment gives 1000 as the limit without login; the
                default keeps the value currently used by the code.
        """
        try:
            browser = self.createDriver()
        except Exception:
            # Starting the driver is retried once.
            browser = self.createDriver()
        try:
            datas = self.db_storage.find({
                'postCode': '2',
                'newsTime': {'$gte': '2024-01-01', '$lt': '2024-01-02'},
            }).limit(10)
            for data in datas:
                contentWithTag = data.get('richTextForeign')
                if not contentWithTag:
                    continue
                html = BeautifulSoup(contentWithTag, 'html.parser')
                lang = baseCore.detect_language(html.text)
                for tag in html.find_all(text=True):
                    sentence = str(tag)
                    # Skip nodes that carry nothing to translate.
                    if not sentence.strip() or self.is_punctuation(sentence):
                        continue
                    translated = []
                    pieces = self._split_text(sentence, max_len)
                    for number, piece in enumerate(pieces, start=1):
                        piece = piece.strip()
                        if self.is_punctuation(piece):
                            # Empty or punctuation-only pieces are kept as they are.
                            translated.append(piece)
                            continue
                        print(f'当前的段{number}:{piece}')
                        result, browser = self.translate(piece, browser, lang)
                        translated.append(result)
                    # Replace the node with the joined translations.
                    tag.replace_with(''.join(translated))
                    time.sleep(2)
                print(html.prettify())
        finally:
            # Always release the Chrome instance, even when translation fails.
            browser.quit()
# return html.prettify()
if __name__ == "__main__":
    # Script entry point: build the translator and run the document job.
    translator = Translate()
    translator.gethtml()
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论