1/11

540a0e68 · 薛凌堃 · 176c0051 · 540a0e68
--- a/百度翻译/test.py
+++ b/百度翻译/test.py
+# Baidu Translate: 1000 characters can be translated without logging in, 5000 when logged in.
+import re
+import string
+import time
+
+import pymongo
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.chrome.service import Service
+from base.BaseCore import BaseCore
+# Shared project helper instance; this file uses it for proxy lookup
+# (get_proxy) and language detection (detect_language).
+baseCore = BaseCore()
+
+class Translate():
+    def __init__(self):
+        """Set up the translate URL template, request header and MongoDB collection.
+
+        The MongoDB connection settings default to the values that used to be
+        hard-coded here, but each one can be overridden with the environment
+        variables ``MONGO_URI``, ``MONGO_USERNAME`` and ``MONGO_PASSWORD`` so a
+        deployment does not have to rely on credentials stored in source.
+        """
+        import os
+
+        # Placeholders are filled by translate() as: source language / target language / text.
+        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
+        self.header = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
+
+        # SECURITY NOTE(review): the fallback host and credentials below are
+        # committed to the repository. They should be rotated and supplied
+        # through the environment only.
+        mongo_uri = os.environ.get('MONGO_URI', 'mongodb://114.115.221.202:27017')
+        mongo_user = os.environ.get('MONGO_USERNAME', 'admin')
+        mongo_password = os.environ.get('MONGO_PASSWORD', 'ZZsn@9988')
+        client = pymongo.MongoClient(mongo_uri, username=mongo_user, password=mongo_password)
+        # Collection that gethtml() reads its source documents from.
+        self.db_storage = client['中科软']['数据源_0106']
+
+    def createDriver(self):
+        """Start a Chrome WebDriver that goes through a proxy from ``baseCore.get_proxy()``.
+
+        The chromedriver and Chrome binary locations are fixed Windows paths.
+
+        Returns:
+            The newly started ``webdriver.Chrome`` instance.
+        """
+        chrome_driver = r'D:\cmd100\chromedriver.exe'
+        service = Service(chrome_driver)
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--start-maximized")
+        proxy = baseCore.get_proxy()
+        # Pass only the part after "scheme://"; [-1] also copes with a proxy
+        # value that carries no scheme instead of raising IndexError.
+        chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[-1])
+        chrome_options.add_argument(
+            'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+        # chrome_options.add_argument('--headless')
+
+        # Selenium 4 takes the options object through ``options=``; the legacy
+        # ``chrome_options=`` keyword is rejected by current releases.
+        return webdriver.Chrome(service=service, options=chrome_options)
+
+    def translate(self, sentence, browser, lang, max_retries=5):
+        """Translate ``sentence`` into Chinese through the Baidu Translate web page.
+
+        Each attempt first runs get_input_language_type() on the sentence and
+        then loads the translate URL built from ``lang``, ``'zh'`` and the
+        sentence. A failed attempt quits the browser and retries with a fresh
+        one from createDriver().
+
+        Args:
+            sentence: Text to translate.
+            browser: Selenium WebDriver to use for the first attempt.
+            lang: Source-language code placed in the translate URL.
+            max_retries: How many times to retry after the first failed attempt.
+
+        Returns:
+            A ``(result, browser)`` tuple. ``browser`` is the driver that is
+            still alive and must be used for subsequent calls. When the page
+            reports no language for the input, ``result`` is the unchanged
+            ``sentence``.
+
+        Raises:
+            RuntimeError: If every attempt failed. The last browser has been
+                quit by then.
+        """
+        from urllib.parse import quote
+
+        # Paragraph element that holds the translated text.
+        result_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
+        last_error = None
+        # A bounded loop replaces the former unbounded recursion, which kept
+        # spawning browsers until the interpreter hit its recursion limit.
+        for attempt in range(max_retries + 1):
+            wait = WebDriverWait(browser, 20)
+            try:
+                word_type = self.get_input_language_type(sentence, browser, wait)
+                if not word_type:
+                    # Nothing detected: hand the text back untranslated so the
+                    # caller can still unpack a (result, browser) tuple.
+                    return sentence, browser
+                # Percent-encode the text so characters such as '%', '/' or '#'
+                # cannot corrupt the URL fragment.
+                url = self.url.format(lang, 'zh', quote(sentence, safe=''))
+                browser.set_page_load_timeout(10)
+                browser.get(url)
+                wait.until(EC.presence_of_element_located((By.XPATH, result_xpath)))
+                result = browser.find_element(By.XPATH, result_xpath).text.strip()
+                print(f'翻译后的句子：{result}')
+                return result, browser
+            except Exception as exc:  # not a bare except: Ctrl-C must still abort
+                last_error = exc
+                browser.quit()
+                print(f'翻译失败，重新翻译。当前句子为{sentence}')
+                if attempt < max_retries:
+                    browser = self.createDriver()
+        raise RuntimeError(
+            f'translation failed after {max_retries + 1} attempts') from last_error
+
+    def get_input_language_type(self, word, browser, wait):
+        """Type ``word`` into the Baidu Translate home page and read the language code shown.
+
+        Args:
+            word: Text entered into the translate input box.
+            browser: Selenium WebDriver used to load the page.
+            wait: ``WebDriverWait`` used to wait for the page elements.
+
+        Returns:
+            The ``data-lang`` attribute of the language label element
+            (presumably the source language the page detected - confirm).
+        """
+        input_locator = (By.ID, "baidu_translate_input")
+        lang_locator = (By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')
+
+        browser.get("https://fanyi.baidu.com/")
+
+        # Enter the text once the input box is present.
+        wait.until(EC.presence_of_element_located(input_locator))
+        browser.find_element(*input_locator).send_keys(word)
+
+        # Read the language code from the label once it is present.
+        wait.until(EC.presence_of_element_located(lang_locator))
+        lang_label = browser.find_element(*lang_locator)
+        return lang_label.get_attribute("data-lang")
+
+    def is_punctuation(self, char):
+        """Return True when ``char`` consists only of punctuation characters.
+
+        The recognised set is ASCII ``string.punctuation`` plus common
+        full-width CJK marks and the ideographic space. Every character is
+        checked individually, so the result does not depend on the order in
+        which the marks appear. An empty string returns True.
+
+        Args:
+            char: A single character or a longer string.
+        """
+        punctuation = set(
+            string.punctuation
+            + '、' + '（' + '…' + '）' + '《' + '》' + '“' + '”' + '：' + '；' + '！'
+            + '\u3000'  # ideographic (full-width) space
+            + '。' + '，' + '？'
+        )
+        return all(ch in punctuation for ch in char)
+
+    def sentence_split_sentence(self, contentWithTag):
+        """Split text on newlines and return each non-empty line with its offsets.
+
+        Args:
+            contentWithTag: The text to split.
+
+        Returns:
+            A list of ``(start, end, text)`` tuples, one per non-empty line.
+            If no line is found and the input is at least 4 characters long,
+            the whole input is returned as a single entry.
+        """
+        line_re = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
+        spans = [(m.start(), m.end(), m.group()) for m in line_re.finditer(contentWithTag)]
+        if spans:
+            return spans
+        # Nothing matched: fall back to the whole input when it is long enough.
+        if len(contentWithTag) >= 4:
+            return [(0, len(contentWithTag), contentWithTag)]
+        return spans
+
+    def jionstr(self, html):
+        """Join the text nodes of a parsed document into chunks of about 1000 characters.
+
+        Text nodes that are a lone newline, tab or space, or that
+        is_punctuation() accepts, are skipped. Nodes are concatenated until
+        adding the next one would exceed 1000 characters, at which point the
+        current chunk is emitted. A single node longer than 1000 characters is
+        kept whole as its own chunk.
+
+        Args:
+            html: Parsed document exposing ``find_all(text=True)``, e.g. a
+                BeautifulSoup object.
+
+        Returns:
+            A list of stripped, non-empty text chunks in document order.
+        """
+        paragraphs = []
+        current_sentence = ''
+        for tag in html.find_all(text=True):
+            sentence = str(tag)
+            if sentence in ('\n', '\t', ' ') or self.is_punctuation(sentence):
+                continue
+            # Keep growing the current chunk while it stays within 1000 characters.
+            if len(current_sentence) + len(sentence) <= 1000:
+                current_sentence += sentence
+                continue
+            # Skip the flush when nothing has accumulated yet, so no empty
+            # chunk is emitted ahead of an oversized first node.
+            if current_sentence.strip():
+                paragraphs.append(current_sentence.strip())
+            current_sentence = sentence
+        # Flush the last chunk; without this the tail of the document is lost.
+        if current_sentence.strip():
+            paragraphs.append(current_sentence.strip())
+        return paragraphs
+
+
+
+    def _split_text(self, text, max_chars):
+        """Cut ``text`` into consecutive pieces of at most ``max_chars`` characters.
+
+        A piece ends just after the last '.', '。', ',' or '，' found inside
+        the window when there is one; otherwise it is cut at ``max_chars``.
+        Joining the pieces gives back ``text`` exactly.
+        """
+        break_chars = ('.', '。', ',', '，')
+        pieces = []
+        rest = text
+        while len(rest) > max_chars:
+            cut = max_chars  # hard cut when the window holds no break character
+            # Scan backwards from the last index of the window. Index 0 is not
+            # considered, so a break never yields a one-character piece.
+            for idx in range(max_chars - 1, 0, -1):
+                if rest[idx] in break_chars:
+                    cut = idx + 1
+                    break
+            pieces.append(rest[:cut])
+            rest = rest[cut:]
+        if rest:
+            pieces.append(rest)
+        return pieces
+
+    def gethtml(self, max_chars=50):
+        """Translate the text nodes of stored documents and print the translated HTML.
+
+        Reads up to 10 documents from ``self.db_storage`` (postCode '2',
+        newsTime on 2024-01-01), parses each ``richTextForeign`` value with
+        BeautifulSoup, replaces every text node by its translation and prints
+        the prettified result. Documents without ``richTextForeign`` content
+        are skipped.
+
+        Args:
+            max_chars: Maximum number of characters sent to translate() in one
+                call; longer text nodes are split with _split_text(). The
+                default keeps the previously hard-coded value of 50.
+                NOTE(review): earlier commented-out code used 1000 - confirm
+                which limit is intended.
+
+        Raises:
+            ValueError: If ``max_chars`` is smaller than 1.
+        """
+        if max_chars < 1:
+            raise ValueError('max_chars must be at least 1')
+
+        try:
+            browser = self.createDriver()
+        except Exception:
+            # Starting the driver can fail transiently; try once more.
+            browser = self.createDriver()
+
+        try:
+            datas = self.db_storage.find(
+                {'postCode': '2', 'newsTime': {'$gte': '2024-01-01', '$lt': '2024-01-02'}}).limit(10)
+            for data in datas:
+                contentWithTag = data.get('richTextForeign')
+                if not contentWithTag:
+                    continue
+                html = BeautifulSoup(contentWithTag, 'html.parser')
+                lang = baseCore.detect_language(html.text)
+
+                for tag in html.find_all(text=True):
+                    sentence = str(tag)
+                    # Nothing to translate in whitespace-only or punctuation-only nodes.
+                    if not sentence.strip() or self.is_punctuation(sentence):
+                        continue
+                    if len(sentence) > max_chars:
+                        print(len(sentence))
+                    translated = []
+                    # Every piece is translated, so no part of a long node is
+                    # dropped and no request exceeds max_chars.
+                    for number, piece in enumerate(self._split_text(sentence, max_chars), start=1):
+                        piece = piece.strip()
+                        if not piece:
+                            continue
+                        print(f'当前的段{number}：{piece}')
+                        # translate() may hand back a replacement browser.
+                        result, browser = self.translate(piece, browser, lang)
+                        translated.append(result)
+                    tag.replace_with(''.join(translated))
+                    # Pause between text nodes to avoid hammering the site.
+                    time.sleep(2)
+                print(html.prettify())
+        finally:
+            # Always release the Chrome process, even when translation fails.
+            browser.quit()
+
+if __name__ == "__main__":
+    # Script entry point: translate the stored documents and print the result.
+    Translate().gethtml()
+