# Commit de21c2fe, author: LiuLiYuan
# Commit message: fanyi 01/12
# Parent commit: b7d2cc8d
#coding:utf-8
# Baidu Translate: up to 1000 characters per request when not logged in, 5000 when logged in.
import re
import string
import time
from urllib.parse import quote
import pymongo
from bs4 import BeautifulSoup
from bson import ObjectId
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from base.BaseCore import BaseCore
# Shared project helper instance; in this file it is used by Translate.gethtml
# via baseCore.detect_language() to detect the language of the source text.
baseCore = BaseCore()
class Translate():
    """Translate foreign-language rich text to Chinese through the Baidu Translate web UI.

    A Firefox browser is driven with Selenium: the text is typed into the Baidu
    Translate page and the rendered translation is read back from the DOM.
    """

    # Maximum number of characters sent in one translation request.
    _CHUNK_LIMIT = 1000
    # Marker inserted between text nodes so that the translated string can be
    # split back into one piece per node.
    _DELIMITER = '😊'
    # Characters at which a long text may be cut. The original code compared
    # against ',' twice; the second comparison is assumed to have been the
    # full-width comma.
    _BREAK_CHARS = '.。,,'
    # Characters treated as punctuation by is_punctuation(). The ASCII
    # '(' ')' ':' ';' '!' of the original list are already part of
    # string.punctuation, so their full-width forms are added as well.
    _PUNCTUATION = frozenset(
        string.punctuation + '、' + '(' + '…' + ')' + '《' + '》' + '“' + '”' + ':' + ';' + '!' + ' ' + '。'
        + '()' + ':;!'
    )
    # Matches every maximal run of non-newline characters (i.e. each line).
    _LINE_PATTERN = re.compile(r'[^\n]+(?=\n)|[^\n]+$')

    def __init__(self):
        """Start the browser and open the MongoDB collection used for storage."""
        # Base URL; translate() appends "<source>/zh/<text>" after the '#'.
        self.url = "https://fanyi.baidu.com/#"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
        self.browser = self.createDriver()
        # NOTE(review): host and credentials are hard-coded in source; consider
        # loading them from configuration or the environment instead.
        self.db_storage = \
            pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软[
                '数据源_0106']

    def close(self):
        """Shut down the browser session."""
        self.browser.quit()

    def createDriver(self):
        """Create and return a Firefox WebDriver with an overridden user agent.

        A Chrome/chromedriver based setup used to live here as commented-out
        code; Firefox with geckodriver is the only active implementation.
        """
        service = Service(r'F:\spider\firefox\geckodriver_1.exe')
        options = Options()
        options.set_preference("general.useragent.override",
                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
        browser = webdriver.Firefox(options=options, service=service)
        return browser

    def _restart_browser(self):
        """Discard the current browser session and start a fresh one."""
        try:
            self.browser.quit()
        except Exception:
            # The session may already be dead; a failing quit() must not
            # prevent the replacement driver from being created.
            pass
        self.browser = self.createDriver()

    def translate(self, sentence, lang, max_retries=5):
        """Translate ``sentence`` into Chinese and return the translated text.

        Args:
            sentence: Text to translate.
            lang: Source language code. When falsy, the language detected by
                the Baidu page is used instead.
            max_retries: Number of additional attempts (each with a freshly
                started browser) after the first failure.

        Returns:
            The stripped text of the result element on the Baidu page.

        Raises:
            RuntimeError: If every attempt failed. The original code retried
                by unbounded recursion, which ended in RecursionError after
                launching a new browser for every level.
        """
        result_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
        last_error = None
        for _ in range(max_retries + 1):
            wait = WebDriverWait(self.browser, 20)
            try:
                # Loads the page and types the text; also yields the detected language.
                detected = self.get_input_language_type(sentence, wait)
                source_lang = lang or detected
                if not source_lang:
                    raise ValueError('source language could not be determined')
                # self.url has no format placeholders, so the former
                # self.url.format(...) call left the URL unchanged; build the
                # "#<source>/zh/<text>" fragment explicitly.
                url = quote(f'{self.url}{source_lang}/zh/{sentence}', safe='/:#')
                self.browser.set_page_load_timeout(10)
                self.browser.get(url)
                # until() returns the located element once it is present.
                element = wait.until(EC.presence_of_element_located((By.XPATH, result_xpath)))
                return element.text.strip()
            except Exception as exc:
                last_error = exc
                self._restart_browser()
        raise RuntimeError(
            f'translation failed after {max_retries + 1} attempts') from last_error

    def get_input_language_type(self, word, wait):
        """Type ``word`` into the Baidu Translate page and return the detected language.

        Args:
            word: Text to enter into the input box.
            wait: WebDriverWait bound to ``self.browser``.

        Returns:
            The value of the ``data-lang`` attribute of the language label.
        """
        lang_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span'
        self.browser.get("https://fanyi.baidu.com/")
        input_box = wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
        input_box.send_keys(word)
        lang_label = wait.until(EC.presence_of_element_located((By.XPATH, lang_xpath)))
        return lang_label.get_attribute("data-lang")

    def is_punctuation(self, char):
        """Return True if every character of ``char`` is punctuation or a space.

        The original implementation used a substring test against one long
        string, so multi-character input only matched by accident. Every input
        the original accepted (including the empty string) is still accepted.
        """
        return all(c in self._PUNCTUATION for c in char)

    def sentence_split_sentence(self, contentWithTag):
        """Split text into lines.

        Returns:
            A list of ``(start, end, text)`` tuples, one per non-empty line.
            If there is no such line but the input has at least 4 characters,
            the whole input is returned as a single entry.
        """
        sentences = [
            (match.start(), match.end(), match.group())
            for match in self._LINE_PATTERN.finditer(contentWithTag)
        ]
        if (not sentences) and (len(contentWithTag) >= 4):
            sentences.append((0, len(contentWithTag), contentWithTag))
        return sentences

    def jionstr(self, html):
        """Concatenate the text nodes of ``html`` into paragraphs of at most 1000 characters.

        Whitespace-only nodes, punctuation-only nodes and nodes starting with a
        URL prefix are skipped.

        Args:
            html: Parsed document providing ``find_all(text=True)``.

        Returns:
            List of stripped paragraph strings.
        """
        paragraphs = []
        current_sentence = ''
        for tag in html.find_all(text=True):
            sentence = str(tag)
            if sentence in ('\n', '\t', ' '):
                continue
            if self.is_punctuation(sentence):
                continue
            if sentence.startswith(('https://', 'http://', 'www.')):
                continue
            # Keep appending while the paragraph stays within the limit.
            if len(current_sentence) + len(sentence) <= self._CHUNK_LIMIT:
                current_sentence += sentence
            else:
                if current_sentence.strip():
                    paragraphs.append(current_sentence.strip())
                current_sentence = sentence
        # The original never flushed the last paragraph, so short documents
        # produced an empty list.
        if current_sentence.strip():
            paragraphs.append(current_sentence.strip())
        return paragraphs

    @staticmethod
    def _split_chunks(text, limit=1000):
        """Split ``text`` into pieces of at most ``limit`` characters.

        Each piece ends right after the last break character ('.', '。', ',',
        ',') found within the first ``limit`` characters (index 0 is not
        considered); if there is none, the text is cut hard at ``limit``.
        """
        chunks = []
        while len(text) > limit:
            cut = limit
            for pos in range(limit - 1, 0, -1):
                if text[pos] in Translate._BREAK_CHARS:
                    cut = pos + 1
                    break
            chunks.append(text[:cut])
            text = text[cut:]
        chunks.append(text)
        return chunks

    def gethtml(self, contentWithTag):
        """Translate every text node of an HTML fragment and return the new HTML.

        Args:
            contentWithTag: HTML source as a string.

        Returns:
            ``contentWithTag`` unchanged if the text is detected as Chinese or
            contains no text; otherwise the prettified HTML with text nodes
            replaced by their translations, followed by an attribution footer.
        """
        html = BeautifulSoup(contentWithTag, 'html.parser')
        lang = baseCore.detect_language(html.text)
        if lang == 'zh':
            return contentWithTag
        # Only nodes with visible text take part. The same list is used for
        # joining and for writing back, so translations cannot get out of step
        # with the nodes (the original skipped whitespace-only nodes when
        # joining but not when replacing).
        text_nodes = [node for node in html.find_all(text=True) if str(node).strip()]
        if not text_nodes:
            return contentWithTag
        sentence = ''.join(f'{str(node).strip()}{self._DELIMITER}' for node in text_nodes)
        chunks = self._split_chunks(sentence, self._CHUNK_LIMIT)
        last_index = len(chunks) - 1
        translated_parts = []
        for index, chunk in enumerate(chunks):
            # As before, intermediate chunks are stripped and the final one is sent as is.
            piece = chunk if index == last_index else chunk.strip()
            translated_parts.append(self.translate(piece, lang))
        time.sleep(2)
        sentences = ''.join(translated_parts).split(self._DELIMITER)
        print(len(sentences))
        # zip() stops at the shorter sequence: if the translator dropped a
        # delimiter, the remaining nodes keep their original text instead of
        # raising IndexError.
        for node, translated in zip(text_nodes, sentences):
            node.replace_with(translated)
        # NOTE(review): the footer credits Microsoft translation although this
        # class uses Baidu, and '<p/>' ... '</p>' is unbalanced markup; left
        # unchanged because downstream consumers may depend on the exact text.
        return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
if __name__ == "__main__":
    # Manual smoke test: fetch one stored document, translate its foreign
    # rich text and print the resulting HTML.
    test = Translate()
    try:
        db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
            '数据源_0504']
        data = db_storage.find_one({'_id': ObjectId('656f14e84d6d77428c713271')})
        if data is None:
            # find_one() returns None when nothing matches; fail with a clear
            # message instead of a TypeError on the subscript below.
            raise LookupError('document 656f14e84d6d77428c713271 not found')
        a = data['richTextForeign']
        result = test.gethtml(a)
        print(result)
    finally:
        # Always release the browser, even if the lookup or translation fails.
        test.close()
# End of file. (The lines that followed were GitLab web-page residue, not source code.)