提交 fd395ea2 作者: LiuLiYuan

中科软 01/15

上级 14054899
#coding:utf-8
# Baidu Translate: up to 1,000 characters per request when not logged in, 5,000 when logged in
import re
import string
import time
from urllib.parse import quote
import psutil
import pymongo
from bs4 import BeautifulSoup
from bson import ObjectId
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType
from func_timeout import func_set_timeout
from base.BaseCore import BaseCore
# Shared BaseCore helper; this file uses it to fetch a proxy (get_proxy) and to
# detect the language of a text (detect_language).
baseCore = BaseCore()
class Translate():
def __init__(self):
    """Prepare the translator: target URL, request header, browser and storage.

    Creates a Firefox driver through ``createDriver`` and opens a handle on
    the MongoDB collection kept in ``self.db_storage``.
    """
    # Address that translate() formats and then loads in the browser.
    self.url = "https://fanyi.baidu.com/#"
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
    self.header = {"User-Agent": user_agent}
    self.browser = self.createDriver()
    # NOTE(review): host, user name and password are hard-coded here; consider
    # moving them to configuration.
    mongo_client = pymongo.MongoClient(
        'mongodb://114.115.221.202:27017',
        username='admin',
        password='ZZsn@9988',
    )
    self.db_storage = mongo_client['中科软']['数据源_0106']
def close(self):
    """Shut down the browser session held by this translator."""
    driver = self.browser
    driver.quit()
def is_website_link(self, string):
    """Tell whether ``string`` consists solely of a URL-like token.

    The whole value must match: an optional ``http``/``https`` prefix, an
    optional ``://``, a dotted host, and an optional path/query tail.

    :param string: text to test.
    :return: ``True`` if the pattern matches the entire value, else ``False``.
    """
    url_pattern = r"^(http|https)?(://)?[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+(/[a-zA-Z0-9-_.?=/]*)?$"
    return re.match(url_pattern, string) is not None
def createDriver(self):
    """Build a headless Firefox driver routed through a proxy from ``baseCore``.

    The ``'http'`` entry of the proxy mapping is used for both the HTTP and
    the SSL proxy settings.

    :return: the new ``webdriver.Firefox`` instance.
    """
    proxy_ = baseCore.get_proxy()
    profile = webdriver.FirefoxProfile()
    profile.set_preference('network.proxy.type', 1)

    # The entry is read as "<scheme>://<host>:<port>"; split it once.
    address = proxy_['http'].split('://')[1].split(':')
    host = address[0]
    port = int(address[1])
    for key, value in (
        ('network.proxy.http', host),
        ('network.proxy.http_port', port),
        ('network.proxy.ssl', host),
        ('network.proxy.ssl_port', port),
    ):
        profile.set_preference(key, value)
    profile.update_preferences()

    service = Service(r'F:\spider\firefox\geckodriver_1.exe')
    options = Options()
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    )
    for flag in ('--headless', '--disable-gpu', '--private'):
        options.add_argument(flag)

    # NOTE(review): confirm the installed Selenium release still accepts the
    # ``firefox_profile`` keyword.
    return webdriver.Firefox(firefox_profile=profile, service=service, options=options)
def kill_firefox(self):
    """Kill every running process whose name is ``firefox.exe``.

    Best effort: processes that vanish, deny access or are zombies while
    being inspected are skipped silently.
    """
    ignorable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)
    for process in psutil.process_iter():
        try:
            if proc_is_firefox := (process.name() == "firefox.exe"):
                process.kill()
        except ignorable:
            continue
def translate(self, sentence, lang):
    """Translate ``sentence`` through the Baidu Translate web page.

    The text is first typed into the page by ``get_input_language_type``;
    the result paragraph is then read back from the page. Whenever a step
    fails, the browser is closed, a fresh driver is created and the whole
    attempt is repeated.

    Compared with the previous version, retries are done in a loop instead
    of by recursion (so a long outage no longer ends in ``RecursionError``),
    and ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed.

    :param sentence: text to translate.
    :param lang: source-language code supplied by the caller.
    :return: the stripped translation text, or ``None`` when the page reports
        no input language.
    """
    # FunctionTimedOut is what func_set_timeout raises on expiry; it derives
    # from BaseException, so ``except Exception`` alone would miss it.
    from func_timeout import FunctionTimedOut

    result_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'

    # NOTE(review): retries are unbounded, as before; consider a retry cap.
    while True:
        wait = WebDriverWait(self.browser, 20)
        try:
            word_type = self.get_input_language_type(sentence, wait)
        except (Exception, FunctionTimedOut):
            self.browser.quit()
            self.browser = self.createDriver()
            continue

        if not word_type:
            # Same outcome as before: nothing is translated in this case.
            return None

        # NOTE(review): self.url contains no "{}" placeholders, so format()
        # returns it unchanged — confirm whether the language pair and text
        # were meant to be embedded in the address.
        url = quote(self.url.format(lang, 'zh', sentence), safe='/:#')
        self.browser.set_page_load_timeout(10)
        try:
            self.browser.get(url)
            wait.until(EC.presence_of_element_located((By.XPATH, result_xpath)))
            return self.browser.find_element(By.XPATH, result_xpath).text.strip()
        except Exception:
            self.browser.quit()
            self.browser = self.createDriver()
@func_set_timeout(30)
def get_input_language_type(self, word, wait):
    """Type ``word`` into Baidu Translate and read the language the page reports.

    :param word: text to enter in the translation input box.
    :param wait: ``WebDriverWait`` used to wait for the page elements.
    :return: value of the ``data-lang`` attribute of the language label.
    """
    input_locator = (By.ID, "baidu_translate_input")
    language_locator = (By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')

    self.browser.get("https://fanyi.baidu.com/")
    wait.until(EC.presence_of_element_located(input_locator))
    self.browser.find_element(*input_locator).send_keys(word)

    wait.until(EC.presence_of_element_located(language_locator))
    language_label = self.browser.find_element(*language_locator)
    return language_label.get_attribute("data-lang")
def is_punctuation(self, char):
    """Check ``char`` against ASCII punctuation plus a few extra symbols.

    The check is a substring test on the combined symbol string, so any
    value that occurs in it (including the empty string) yields ``True``.

    :param char: text to look up.
    :return: ``True`` if ``char`` occurs in the symbol string.
    """
    extra_symbols = '、(…)《》“”:;! 。'
    return char in (string.punctuation + extra_symbols)
def sentence_split_sentence(self, contentWithTag):
    """Split text into its non-empty lines, keeping each line's position.

    :param contentWithTag: text to split on newline characters.
    :return: list of ``(start, end, text)`` tuples, one per non-empty line.
        If no line is found and the input has at least four characters, the
        whole input is returned as a single entry.
    """
    line_pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
    sentences = [
        (found.start(), found.end(), found.group())
        for found in line_pattern.finditer(contentWithTag)
    ]
    if sentences or len(contentWithTag) < 4:
        return sentences
    return [(0, len(contentWithTag), contentWithTag)]
def jionstr(self, html):
    """Join the text nodes of ``html`` into chunks of about 1000 characters.

    Nodes that are a lone newline, tab or space, that ``is_punctuation``
    accepts, or that start with ``https://``, ``http://`` or ``www.`` are
    skipped. Remaining nodes are concatenated; a new chunk is started when
    adding the next node would push the current one past 1000 characters.
    A single node longer than 1000 characters becomes a chunk of its own.

    Fixes over the previous version: the last chunk is now included in the
    result (it used to be dropped), and empty chunks are no longer emitted.

    :param html: parsed document exposing ``find_all(text=True)``.
    :return: list of stripped, non-empty text chunks in document order.
    """
    paragraphs = []
    current_sentence = ''
    for tag in html.find_all(text=True):
        sentence = str(tag)
        if sentence in ('\n', '\t', ' '):
            continue
        if self.is_punctuation(sentence):
            continue
        if sentence.startswith(('https://', 'http://', 'www.')):
            continue
        # Check whether the joined text would exceed 1000 characters.
        if len(current_sentence) + len(sentence) <= 1000:
            current_sentence += sentence
            continue
        finished = current_sentence.strip()
        if finished:
            paragraphs.append(finished)
        current_sentence = sentence
    # Flush whatever is still buffered after the last node.
    remainder = current_sentence.strip()
    if remainder:
        paragraphs.append(remainder)
    return paragraphs
@func_set_timeout(300)
def gethtml(self, contentWithTag):
    """Translate the text nodes of an HTML fragment and return the new markup.

    Text detected as ``'zh'`` by ``baseCore.detect_language`` is returned
    unchanged. Otherwise every text node that is non-blank and is not a bare
    link (per ``is_website_link``) is stripped, joined with a ``😊``
    separator, translated in pieces of at most 1000 characters (cut after
    the last ``.``, ``。`` or ``,`` inside that window when one exists), and
    the translated pieces are written back into the nodes in order.

    Fixes over the previous version: when the translated text contains fewer
    separators than expected, the remaining nodes are left untouched instead
    of raising ``IndexError``; nothing is sent to the translator when there
    is no text to translate; the node filter and backward scan are no longer
    duplicated, and the unused counter is gone.

    :param contentWithTag: HTML markup to translate.
    :return: the input itself for ``'zh'`` content, otherwise the prettified
        translated markup followed by a fixed source-attribution footer.
    """
    html = BeautifulSoup(contentWithTag, 'html.parser')
    lang = baseCore.detect_language(html.text)
    if lang == 'zh':
        return contentWithTag

    # Nodes that take part in translation; the same list is used for
    # building the request text and for writing the results back.
    nodes = [
        node for node in html.find_all(text=True)
        if node.strip() != '' and not self.is_website_link(str(node).strip())
    ]

    if nodes:
        sentence = ''.join(f'{str(node).strip()}😊' for node in nodes)
        split_marks = ('.', '。', ',')
        result = ''
        while True:
            remaining = sentence.strip()
            if len(remaining) == 1 and self.is_punctuation(remaining):
                # A lone punctuation mark is copied through untranslated.
                result += sentence
                break
            if len(sentence) <= 1000:
                # Translate the final piece.
                result += self.translate(sentence, lang)
                time.sleep(2)
                break
            # Cut after the last full stop / comma within the first 1000
            # characters (index 0 is not inspected); fall back to a hard cut.
            cut = 1000
            for position in range(999, 0, -1):
                if sentence[position] in split_marks:
                    cut = position + 1
                    break
            result += self.translate(sentence[:cut].strip(), lang)
            sentence = sentence[cut:]

        # zip() stops at the shorter side, so a translation that lost some
        # separators cannot index past the end of the pieces.
        for node, translated in zip(nodes, result.split('😊')):
            node.replace_with(translated)

    # NOTE(review): the footer credits Microsoft translation although this
    # class drives fanyi.baidu.com, and "<p/>" looks malformed — confirm the
    # intended text before changing it.
    return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论