fanyi 01/12

de21c2fe · LiuLiYuan · b7d2cc8d · de21c2fe · de21c2fe · b7d2cc8d
--- a/百度翻译/baidufanyi.py
+++ b/百度翻译/baidufanyi.py
+#coding:utf-8
+# Baidu Translate: translates up to 1000 characters when not logged in, 5000 when logged in
+import re
+import string
+import time
+from urllib.parse import quote
+
+import pymongo
+from bs4 import BeautifulSoup
+from bson import ObjectId
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+
+from selenium.webdriver.support.wait import WebDriverWait
+# from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.firefox.options import Options
+from base.BaseCore import BaseCore
+
+# Shared BaseCore helper; in this file it supplies detect_language() for Translate.gethtml
+# (and get_proxy() in the commented-out Chrome setup inside Translate.createDriver).
+baseCore = BaseCore()
+
+
+class Translate():
+    """Drive the Baidu Translate web page through Selenium/Firefox.
+
+    ``translate`` sends one string through the page and returns the text of
+    the result element.  ``gethtml`` translates every non-blank text node of
+    an HTML fragment, and the remaining methods are text-splitting helpers.
+    """
+
+    # Characters treated as punctuation by ``is_punctuation``.
+    _PUNCTUATION = string.punctuation + '、' + '（' + '…' + '）' + '《' + '》' + '“' + '”' + '：' + '；' + '！' + '　' + '。'
+    # Characters at which ``gethtml`` prefers to cut an over-long text.
+    _BREAK_CHARS = '.。,，'
+    # Element whose text is read as the translation result.
+    _RESULT_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
+    # Element whose ``data-lang`` attribute is read as the detected source language.
+    _LANG_XPATH = '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span'
+
+    def __init__(self):
+        """Start a browser and open the MongoDB collection handle."""
+        self.url = "https://fanyi.baidu.com/#"
+        self.header = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
+        self.browser = self.createDriver()
+        # NOTE(review): host and credentials are hard-coded here; consider moving them to configuration.
+        self.db_storage = \
+            pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软[
+                '数据源_0106']
+
+    def close(self):
+        """Quit the browser owned by this instance."""
+        self.browser.quit()
+
+    def createDriver(self):
+        """Create and return a Firefox WebDriver with an overridden user agent.
+
+        The geckodriver path is a fixed local Windows path.
+        """
+        # chrome_driver = r'F:\spider\117\chromedriver-win64\chromedriver.exe'
+        # path = Service(chrome_driver)
+        # chrome_options = webdriver.ChromeOptions()
+        # chrome_options.add_argument('--disable-gpu')
+        # chrome_options.add_argument('--ignore-certificate-errors')
+        # chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        # chrome_options.add_argument("--start-maximized")
+        # proxy = baseCore.get_proxy()
+        # chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
+        # chrome_options.add_argument(
+        #     'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+        #
+        # browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
+        service = Service(r'F:\spider\firefox\geckodriver_1.exe')
+        options = Options()
+        options.set_preference("general.useragent.override",
+                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+        browser = webdriver.Firefox(options=options, service=service)
+        return browser
+
+    def _restart_browser(self):
+        """Quit the current browser and replace it with a fresh one."""
+        self.browser.quit()
+        self.browser = self.createDriver()
+
+    def translate(self, sentence, lang, max_retries=None):
+        """Translate ``sentence`` through the web page and return the result text.
+
+        Whenever a step fails, the browser is restarted and the whole attempt
+        is repeated.  Retrying is done in a loop (not by recursion), so a long
+        outage no longer grows the call stack, and only ``Exception``
+        subclasses trigger a retry, so Ctrl-C still interrupts the program.
+
+        :param sentence: text to translate.
+        :param lang: source-language code passed to the URL formatting step.
+        :param max_retries: maximum number of restarts before giving up;
+            ``None`` (the default) retries indefinitely.
+        :return: stripped text of the result element, or ``None`` when the
+            page reports no detected language for the input.
+        :raises Exception: the last failure, once ``max_retries`` is exceeded.
+        """
+        failures = 0
+        while True:
+            wait = WebDriverWait(self.browser, 20)
+            try:
+                word_type = self.get_input_language_type(sentence, wait)
+            except Exception:
+                failures += 1
+                if max_retries is not None and failures > max_retries:
+                    raise
+                self._restart_browser()
+                continue
+
+            if not word_type:
+                return None
+
+            # The caller-supplied language always replaces the detected one.
+            # NOTE(review): self.url contains no "{}" placeholders, so format()
+            # returns it unchanged and the page loaded below is just
+            # "https://fanyi.baidu.com/#" -- confirm whether a
+            # "#{}/{}/{}" template was intended.
+            url = self.url.format(lang, 'zh', sentence)
+            url = quote(url, safe='/:#')
+            self.browser.set_page_load_timeout(10)
+            try:
+                self.browser.get(url)
+                # until() returns the element located by the condition.
+                result_ = wait.until(EC.presence_of_element_located((By.XPATH, self._RESULT_XPATH)))
+                return result_.text.strip()
+            except Exception:
+                failures += 1
+                if max_retries is not None and failures > max_retries:
+                    raise
+                self._restart_browser()
+
+    def get_input_language_type(self, word, wait):
+        """Type ``word`` into the page and return the ``data-lang`` attribute it shows.
+
+        :param word: text typed into the input box.
+        :param wait: ``WebDriverWait`` bound to the current browser.
+        :return: value of the ``data-lang`` attribute of the language element.
+        """
+        self.browser.get("https://fanyi.baidu.com/")
+        input_word = wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
+        input_word.send_keys(word)
+        lang_element = wait.until(EC.presence_of_element_located((By.XPATH, self._LANG_XPATH)))
+        return lang_element.get_attribute("data-lang")
+
+    def is_punctuation(self, char):
+        """Return True when ``char`` occurs in the punctuation string.
+
+        This is a substring test, so a multi-character argument matches only
+        if it appears contiguously in the punctuation string.
+        """
+        return char in self._PUNCTUATION
+
+    def sentence_split_sentence(self, contentWithTag):
+        """Split text into its non-empty lines.
+
+        :return: list of ``(start, end, text)`` tuples, one per run of
+            non-newline characters.  When there is no such run and the input
+            has at least 4 characters, the whole input is returned as a
+            single entry.
+        """
+        pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
+        sentences = [(found.start(), found.end(), found.group())
+                     for found in pattern.finditer(contentWithTag)]
+        if (not sentences) and (len(contentWithTag) >= 4):
+            sentences.append((0, len(contentWithTag), contentWithTag))
+        return sentences
+
+    def jionstr(self, html):
+        """Concatenate the text nodes of ``html`` into pieces of at most 1000 characters.
+
+        Nodes that are a lone newline/tab/space, that match
+        ``is_punctuation``, or that start with a URL prefix are skipped.
+
+        :param html: object exposing ``find_all(text=True)`` (a parsed document).
+        :return: list of stripped pieces, including the final partially
+            filled one (previously dropped); empty pieces are omitted.
+        """
+        paragraphs = []
+        current_sentence = ''
+        for tag in html.find_all(text=True):
+            sentence = str(tag)
+            if sentence in ('\n', '\t', ' '):
+                continue
+            if self.is_punctuation(sentence):
+                continue
+            if sentence.startswith(('https://', 'http://', 'www.')):
+                continue
+            # Start a new piece once appending would exceed 1000 characters.
+            if len(current_sentence) + len(sentence) <= 1000:
+                current_sentence += sentence
+            else:
+                if current_sentence.strip():
+                    paragraphs.append(current_sentence.strip())
+                current_sentence = sentence
+        # Flush the last piece, which the loop itself never appends.
+        if current_sentence.strip():
+            paragraphs.append(current_sentence.strip())
+        return paragraphs
+
+    def _chunk_end(self, sentence, limit=1000):
+        """Return where to cut the first chunk of a text longer than ``limit``.
+
+        The cut is placed just after the last break character found at
+        indexes ``limit - 1`` down to 1; without one the cut is at ``limit``.
+        """
+        for index in range(limit - 1, 0, -1):
+            if sentence[index] in self._BREAK_CHARS:
+                return index + 1
+        return limit
+
+    def gethtml(self, contentWithTag):
+        """Translate the text nodes of an HTML fragment and return the new HTML.
+
+        The stripped text of every non-blank text node is joined with a
+        '😊' delimiter, translated in chunks of at most 1000 characters, and
+        the translated pieces are written back into the same nodes in order.
+
+        :param contentWithTag: HTML source as a string.
+        :return: ``contentWithTag`` unchanged when the detected language is
+            ``'zh'`` or when there is no non-blank text; otherwise the
+            prettified translated HTML followed by a fixed footer.
+        :raises IndexError: when the translation comes back with fewer
+            delimiter-separated pieces than there are text nodes.
+        :raises TypeError: when ``translate`` returns ``None`` for a chunk.
+        """
+        html = BeautifulSoup(contentWithTag, 'html.parser')
+        lang = baseCore.detect_language(html.text)
+        if lang == 'zh':
+            return contentWithTag
+
+        # Only nodes with non-blank text take part; the same list is used for
+        # building the request and for writing back, which keeps both aligned.
+        text_nodes = [node for node in html.find_all(text=True) if str(node).strip()]
+        if not text_nodes:
+            return contentWithTag
+        sentence = ''.join(f'{str(node).strip()}😊' for node in text_nodes)
+
+        result = ''
+        while len(sentence) > 1000:
+            cut = self._chunk_end(sentence)
+            result += self.translate(sentence[:cut].strip(), lang)
+            sentence = sentence[cut:]
+        result += self.translate(sentence, lang)
+        time.sleep(2)
+
+        sentences = result.split('😊')
+        print(len(sentences))
+        for index, node in enumerate(text_nodes):
+            node.replace_with(sentences[index])
+        # NOTE(review): the footer credits Microsoft although this class drives
+        # fanyi.baidu.com, and "<p/>...</p>" is not well-formed -- confirm.
+        return str(html.prettify()) + '<p/><br>译文来源：微软自动翻译<br></p>'
+
+
+if __name__ == "__main__":
+    # Manual run: fetch one stored document, translate its 'richTextForeign'
+    # HTML and print the result.
+    test = Translate()
+    try:
+        # NOTE(review): host and credentials are hard-coded; consider moving them to configuration.
+        db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
+            '数据源_0504']
+        data = db_storage.find_one({'_id': ObjectId('656f14e84d6d77428c713271')})
+        a = data['richTextForeign']
+        result = test.gethtml(a)
+        print(result)
+    finally:
+        # Always release the browser, even when the lookup or translation fails.
+        test.close()
\ No newline at end of file
--- a/百度翻译/fanyi_test.py
+++ b/百度翻译/fanyi_test.py
--- a/百度翻译/test.py
+++ b/百度翻译/test.py