提交 540a0e68 作者: 薛凌堃

1/11

上级 176c0051
# Baidu Translate: up to 1000 characters per request without login, 5000 characters when logged in.
import re
import string
import time
import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore
# Shared project helper instance; in this module it supplies proxies
# (get_proxy) and language detection (detect_language).
baseCore = BaseCore()
class Translate():
    """Translate the text nodes of stored HTML articles via the Baidu Translate web page.

    A Selenium-driven Chrome instance loads ``fanyi.baidu.com`` and the
    translated text is read back from the result pane. Articles are read
    from a MongoDB collection.
    """

    # Local Chrome installation used by createDriver.
    _CHROME_DRIVER_PATH = r'D:\cmd100\chromedriver.exe'
    _CHROME_BINARY_PATH = r'D:\Google\Chrome\Application\chrome.exe'

    # Page locators on fanyi.baidu.com: the translated-text paragraph and
    # the element whose ``data-lang`` attribute carries the source language.
    _RESULT_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
    _SOURCE_LANG_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span'

    # Characters treated as punctuation/separators by is_punctuation. The
    # escaped code points are the full-width forms of ( ) : ; ! , ? and the
    # ideographic space, which appear in Chinese/Japanese text.
    _PUNCTUATION = frozenset(
        string.punctuation
        + '、(…)《》“”:;! 。'
        + '\uff08\uff09\uff1a\uff1b\uff01\uff0c\uff1f\u3000'
    )

    # Characters at which a long text node may be cut before translation
    # (ASCII full stop/comma, ideographic full stop, full-width comma).
    _SPLIT_MARKS = ('.', '。', ',', '\uff0c')

    # Matches every maximal run of non-newline characters.
    _LINE_PATTERN = re.compile(r'[^\n]+(?=\n)|[^\n]+$')

    def __init__(self):
        """Set up the URL template, a request header and the MongoDB collection handle."""
        # Fragment layout: #<source language>/<target language>/<text>
        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
        # NOTE(review): the database address and credentials are hard-coded
        # in source; they should be moved to configuration/environment.
        self.db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软['数据源_0106']

    def createDriver(self):
        """Start a Chrome WebDriver routed through a proxy from ``baseCore.get_proxy()``.

        Returns:
            The new ``webdriver.Chrome`` instance. The caller owns it and is
            responsible for calling ``quit()``.
        """
        service = Service(self._CHROME_DRIVER_PATH)
        options = webdriver.ChromeOptions()
        options.binary_location = self._CHROME_BINARY_PATH
        for argument in (
            '--disable-gpu',
            '--ignore-certificate-errors',
            "--disable-blink-features=AutomationControlled",
            "--start-maximized",
        ):
            options.add_argument(argument)
        proxy = baseCore.get_proxy()
        # The proxy value looks like "<scheme>://host:port"; Chrome only
        # wants the "host:port" part.
        options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
        options.add_argument(
            'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        # options.add_argument('--headless')
        # `options=` replaces the `chrome_options=` keyword, which newer
        # Selenium 4 releases no longer accept.
        return webdriver.Chrome(service=service, options=options)

    def translate(self, sentence, browser, lang, max_retries=None):
        """Translate *sentence* into Chinese and return the result with the live browser.

        On any failure the current browser is quit, a fresh one is created
        (which also fetches a new proxy) and the translation is attempted
        again.

        Args:
            sentence: Text to translate.
            browser: An open WebDriver; it may be replaced during retries.
            lang: Source-language code to put in the URL. When empty, the
                language shown by the page for the typed text is used.
            max_retries: Maximum number of retries after a failure.
                ``None`` (the default) retries without limit.

        Returns:
            ``(translated_text, browser)``. Always continue with the
            returned browser, because the one passed in may have been quit.

        Raises:
            Exception: the last failure, once *max_retries* is exhausted.
        """
        from urllib.parse import quote

        failures = 0
        while True:
            wait = WebDriverWait(browser, 20)
            try:
                detected = self.get_input_language_type(sentence, browser, wait)
                # The caller's language wins; the page's own detection is
                # only a fallback.
                source_lang = lang or detected
                # The text travels in the URL fragment, so it must be
                # escaped: a raw '%', '#' or newline would corrupt it.
                url = self.url.format(source_lang, 'zh', quote(sentence, safe=''))
                browser.set_page_load_timeout(10)
                browser.get(url)
                element = wait.until(EC.presence_of_element_located(
                    (By.XPATH, self._RESULT_XPATH)))
                result = element.text.strip()
            except Exception:
                failures += 1
                browser.quit()
                if max_retries is not None and failures > max_retries:
                    raise
                print(f'翻译失败,重新翻译。当前句子为{sentence}')
                browser = self.createDriver()
                continue
            print(f'翻译后的句子:{result}')
            return result, browser

    def get_input_language_type(self, word, browser, wait):
        """Type *word* into the Baidu Translate input box and read the language the page shows.

        Args:
            word: Text to type into the input box.
            browser: Open WebDriver.
            wait: ``WebDriverWait`` bound to *browser*.

        Returns:
            The ``data-lang`` attribute of the source-language element.
        """
        browser.get("https://fanyi.baidu.com/")
        input_box = wait.until(
            EC.presence_of_element_located((By.ID, "baidu_translate_input")))
        input_box.send_keys(word)
        lang_element = wait.until(
            EC.presence_of_element_located((By.XPATH, self._SOURCE_LANG_XPATH)))
        return lang_element.get_attribute("data-lang")

    def is_punctuation(self, char):
        """Return True when every character of *char* is punctuation or a space.

        A single character behaves as a plain membership test. Longer
        strings such as ``", "`` or ``"..."`` are also recognised, so text
        nodes holding only separators are not sent for translation. The
        empty string yields True.
        """
        return all(ch in self._PUNCTUATION for ch in char)

    def sentence_split_sentence(self, contentWithTag):
        """Split *contentWithTag* into its non-empty lines.

        Returns:
            A list of ``(start, end, text)`` tuples, one per run of
            non-newline characters. If there is no such run and the input
            has at least 4 characters, the whole input is returned as a
            single entry.
        """
        sentences = [
            (match.start(), match.end(), match.group())
            for match in self._LINE_PATTERN.finditer(contentWithTag)
        ]
        if not sentences and len(contentWithTag) >= 4:
            sentences.append((0, len(contentWithTag), contentWithTag))
        return sentences

    def jionstr(self, html, max_len=1000):
        """Concatenate the text nodes of *html* into paragraphs of at most *max_len* characters.

        Args:
            html: Object exposing ``find_all(text=True)`` (a BeautifulSoup tree).
            max_len: Upper bound on accumulated length before a new
                paragraph is started.

        Returns:
            List of stripped, non-empty paragraphs, including the final
            partially filled one.
        """
        paragraphs = []
        current = ''
        for node in html.find_all(text=True):
            sentence = str(node)
            if sentence in ('\n', '\t', ' ') or self.is_punctuation(sentence):
                continue
            if len(current) + len(sentence) <= max_len:
                current += sentence
                continue
            # The current paragraph is full; close it and start a new one.
            if current.strip():
                paragraphs.append(current.strip())
            current = sentence
        # Flush the tail, otherwise the last paragraph would be lost.
        if current.strip():
            paragraphs.append(current.strip())
        return paragraphs

    def _split_text(self, text, limit):
        """Cut *text* into consecutive pieces of at most *limit* characters.

        Each piece ends just after the last split mark found inside the
        window. If the window has no mark, it is cut at *limit*. Joining
        the pieces reproduces *text* exactly.
        """
        if limit < 1:
            raise ValueError('limit must be at least 1')
        pieces = []
        rest = text
        while len(rest) > limit:
            window = rest[:limit]
            # Searching from index 1 avoids a piece that is only a leading mark.
            cut = max(window.rfind(mark, 1) for mark in self._SPLIT_MARKS) + 1
            if cut <= 0:
                cut = limit
            pieces.append(rest[:cut])
            rest = rest[cut:]
        if rest:
            pieces.append(rest)
        return pieces

    def gethtml(self, max_chars=50):
        """Translate the text nodes of up to 10 stored articles and print each resulting HTML.

        Articles are selected from MongoDB by ``postCode`` and ``newsTime``.
        Each text node is cut into pieces of at most *max_chars* characters
        and translated piece by piece. The node is then replaced with the
        concatenated translations.

        Args:
            max_chars: Maximum number of characters sent per translation
                request. The file header states the service limit as 1000
                characters without login.
        """
        try:
            browser = self.createDriver()
        except Exception:
            # Start-up can fail transiently; try exactly once more.
            browser = self.createDriver()
        try:
            datas = self.db_storage.find({'postCode': '2', 'newsTime': {'$gte': '2024-01-01', '$lt': '2024-01-02'}}).limit(10)
            for data in datas:
                contentWithTag = data.get('richTextForeign')
                if not contentWithTag:
                    continue
                html = BeautifulSoup(contentWithTag, 'html.parser')
                lang = baseCore.detect_language(html.text)
                for tag in html.find_all(text=True):
                    sentence = str(tag)
                    if sentence in ('\n', '\t', ' ') or self.is_punctuation(sentence):
                        continue
                    translated = []
                    for piece in self._split_text(sentence, max_chars):
                        piece = piece.strip()
                        if not piece:
                            continue
                        print(f'当前的段:{piece}')
                        result, browser = self.translate(piece, browser, lang)
                        translated.append(result)
                    if not translated:
                        # Whitespace-only node: nothing to translate.
                        continue
                    tag.replace_with(''.join(translated))
                    # Throttle requests to the translation page.
                    time.sleep(2)
                print(html.prettify())
        finally:
            # Always release the Chrome process, even when a step fails.
            browser.quit()
if __name__ == "__main__":
    # Script entry point: build the translator and run one
    # translate-and-print pass over the stored articles.
    translator = Translate()
    translator.gethtml()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论