中科软 01/15

fd395ea2 · LiuLiYuan · 14054899 · fd395ea2 · fd395ea2 · fd395ea2
--- a/zkr/baidufanyi.py
+++ b/zkr/baidufanyi.py
+#coding:utf-8
+# 百度翻译 不登录翻译1000字 登录翻译5000字
+import re
+import string
+import time
+from urllib.parse import quote
+
+import psutil
+import pymongo
+from bs4 import BeautifulSoup
+from bson import ObjectId
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+
+from selenium.webdriver.support.wait import WebDriverWait
+# from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.proxy import Proxy, ProxyType
+from func_timeout import func_set_timeout
+from base.BaseCore import BaseCore
+
+# Shared BaseCore helper instance; this module uses it for proxy lookup
+# (get_proxy) and for language detection (detect_language).
+baseCore = BaseCore()
+
+
+class Translate():
+    def __init__(self):
+        """Set up the URL template, the browser session and the MongoDB handle.
+
+        Creating an instance starts a Firefox session through
+        ``createDriver`` and builds a ``pymongo.MongoClient`` for the
+        configured server.
+        """
+        # Hash-route template that translate() fills with
+        # url.format(source_lang, target_lang, text). The three placeholders
+        # are required: without them str.format() returns the bare home page
+        # URL and neither the language pair nor the text reaches the page.
+        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
+        self.header = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
+        self.browser = self.createDriver()
+        # NOTE(review): host and credentials are hard-coded in source; consider
+        # loading them from configuration or the environment instead.
+        self.db_storage = \
+            pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软[
+                '数据源_0106']
+
+    def close(self):
+        """Shut down the browser session held by this instance."""
+        driver = self.browser
+        driver.quit()
+
+    def is_website_link(self,string):
+        """Return True when the whole of ``string`` looks like a website link.
+
+        The scheme (``http``/``https``) and the ``://`` separator are both
+        optional, so bare hosts such as ``www.example.com/path`` also match.
+        The pattern is anchored at both ends, so text that merely contains a
+        link does not match.
+
+        Note: the parameter name shadows the ``string`` module inside this
+        method only.
+        """
+        link_pattern = r"^(http|https)?(://)?[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+(/[a-zA-Z0-9-_.?=/]*)?$"
+        return re.match(link_pattern, string) is not None
+
+    def createDriver(self):
+        """Create a headless, private-mode Firefox session behind a proxy.
+
+        The proxy comes from ``baseCore.get_proxy()``; its ``'http'`` entry is
+        read as ``<scheme>://<host>:<port>`` and the same host/port pair is
+        applied to both the HTTP and the SSL proxy preferences.
+
+        Returns:
+            The newly started ``webdriver.Firefox`` instance.
+        """
+        proxy_ = baseCore.get_proxy()
+        profile = webdriver.FirefoxProfile()
+
+        # Split "<scheme>://<host>:<port>" once instead of once per preference.
+        proxy_address = proxy_['http'].split('://')[1].split(':')
+        proxy_host = proxy_address[0]
+        proxy_port = int(proxy_address[1])
+        proxy_preferences = (
+            ('network.proxy.type', 1),
+            ('network.proxy.http', proxy_host),
+            ('network.proxy.http_port', proxy_port),
+            ('network.proxy.ssl', proxy_host),
+            ('network.proxy.ssl_port', proxy_port),
+        )
+        for pref_name, pref_value in proxy_preferences:
+            profile.set_preference(pref_name, pref_value)
+        profile.update_preferences()
+
+        service = Service(r'F:\spider\firefox\geckodriver_1.exe')
+        options = Options()
+        # Present a desktop Chrome user agent instead of the Firefox default.
+        options.set_preference("general.useragent.override",
+                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+        for flag in ('--headless', '--disable-gpu', '--private'):
+            options.add_argument(flag)
+
+        # NOTE(review): recent Selenium 4 releases deprecate the
+        # ``firefox_profile`` keyword - confirm against the pinned version.
+        return webdriver.Firefox(firefox_profile=profile, service=service,options=options)
+
+    def kill_firefox(self):
+        """Kill every running process whose name is exactly ``firefox.exe``.
+
+        Best effort: processes that vanish, deny access or are zombies while
+        being inspected are skipped silently.
+        """
+        ignorable = (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess)
+        for process in psutil.process_iter():
+            try:
+                is_firefox = process.name() == "firefox.exe"
+                if is_firefox:
+                    process.kill()
+            except ignorable:
+                continue
+
+    def translate(self, sentence, lang, max_retries=None):
+        """Translate ``sentence`` into Chinese through the translation page.
+
+        Each attempt first types the text into the page to obtain a detected
+        language (``get_input_language_type``), then loads ``self.url``
+        formatted with ``(lang, 'zh', sentence)`` and reads the result
+        paragraph. The detected language only gates the request; the language
+        placed in the URL is always the caller-supplied ``lang``.
+
+        On any failure the browser is quit, a fresh one is created with
+        ``createDriver`` and the attempt is repeated. This used to be done by
+        self-recursion, which grew the call stack by one frame per failure;
+        it is now a loop.
+
+        Args:
+            sentence: Text to translate.
+            lang: Source language code to place in the URL.
+            max_retries: Maximum number of browser restarts before the last
+                error is re-raised. ``None`` (default) retries without limit,
+                matching the previous behaviour.
+
+        Returns:
+            The stripped translated text, or ``None`` when no input language
+            could be detected.
+        """
+        result_xpath = '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]'
+        restarts = 0
+        while True:
+            wait = WebDriverWait(self.browser, 20)
+            try:
+                word_type = self.get_input_language_type(sentence, wait)
+                if not word_type:
+                    return None
+                url = quote(self.url.format(lang, 'zh', sentence), safe='/:#')
+                self.browser.set_page_load_timeout(10)
+                self.browser.get(url)
+                wait.until(EC.presence_of_element_located((By.XPATH, result_xpath)))
+                return self.browser.find_element(By.XPATH, result_xpath).text.strip()
+            except (KeyboardInterrupt, SystemExit):
+                # Never turn an interpreter shutdown or Ctrl-C into a retry.
+                raise
+            except BaseException:
+                # Deliberately broad: per the func_timeout documentation its
+                # FunctionTimedOut derives from BaseException, so a narrower
+                # ``except Exception`` would stop retrying on timeouts.
+                if max_retries is not None and restarts >= max_retries:
+                    raise
+                restarts += 1
+                self.browser.quit()
+                self.browser = self.createDriver()
+
+    @func_set_timeout(30)
+    def get_input_language_type(self, word, wait):
+        """Type ``word`` into the translation page and read the detected language.
+
+        Args:
+            word: Text to send to the page's input box.
+            wait: Wait object used to block until the input box and the
+                language indicator are present.
+
+        Returns:
+            The ``data-lang`` attribute of the language indicator element.
+        """
+        input_locator = (By.ID, "baidu_translate_input")
+        language_locator = (By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')
+
+        self.browser.get("https://fanyi.baidu.com/")
+        wait.until(EC.presence_of_element_located(input_locator))
+        self.browser.find_element(*input_locator).send_keys(word)
+
+        wait.until(EC.presence_of_element_located(language_locator))
+        language_element = self.browser.find_element(*language_locator)
+        return language_element.get_attribute("data-lang")
+
+    def is_punctuation(self, char):
+        """Return True when ``char`` occurs in the known punctuation string.
+
+        The lookup string is ``string.punctuation`` followed by a fixed set
+        of CJK marks. The check is a substring test, so the empty string and
+        any run of characters that is contiguous in that lookup string also
+        return True.
+        """
+        extra_marks = ('、', '（', '…', '）', '《', '》', '“', '”', '：', '；', '！', '　', '。')
+        known_marks = string.punctuation + ''.join(extra_marks)
+        return char in known_marks
+
+    def sentence_split_sentence(self, contentWithTag):
+        """Split text into its non-empty lines, keeping their positions.
+
+        Returns:
+            A list of ``(start, end, text)`` tuples, one per run of
+            non-newline characters. When no such run exists and the input is
+            at least 4 characters long, a single tuple spanning the whole
+            input is returned instead.
+        """
+        line_pattern = re.compile(r'[^\n]+(?=\n)|[^\n]+$')
+        sentences = [
+            (found.start(), found.end(), found.group())
+            for found in line_pattern.finditer(contentWithTag)
+        ]
+        if not sentences and len(contentWithTag) >= 4:
+            sentences.append((0, len(contentWithTag), contentWithTag))
+        return sentences
+
+    def jionstr(self, html):
+        """Join the text nodes of ``html`` into paragraphs of at most ~1000 chars.
+
+        Nodes that are a lone newline, tab or space, nodes accepted by
+        ``is_punctuation`` and nodes starting with ``https://``, ``http://``
+        or ``www.`` are skipped. The remaining nodes are concatenated until
+        adding the next one would exceed 1000 characters; the text collected
+        so far is then emitted as a paragraph. A single node longer than 1000
+        characters is emitted as its own over-long paragraph.
+
+        Args:
+            html: Object exposing ``find_all(text=True)`` that yields text
+                nodes (e.g. a parsed BeautifulSoup document).
+
+        Returns:
+            List of stripped, non-empty paragraph strings.
+        """
+        paragraphs = []
+        current_sentence = ''
+        for tag in html.find_all(text=True):
+            sentence = str(tag)
+            if sentence in ('\n', '\t', ' '):
+                continue
+            if self.is_punctuation(sentence):
+                continue
+            if sentence.startswith(('https://', 'http://', 'www.')):
+                continue
+            # Keep accumulating while the joined text stays within 1000 chars.
+            if len(current_sentence) + len(sentence) <= 1000:
+                current_sentence += sentence
+                continue
+            # Do not emit an empty paragraph when the very first node is
+            # already longer than the limit.
+            if current_sentence.strip():
+                paragraphs.append(current_sentence.strip())
+            current_sentence = sentence
+        # Flush the remainder; it used to be dropped, which lost the tail of
+        # every document (and all of it when the total was <= 1000 chars).
+        if current_sentence.strip():
+            paragraphs.append(current_sentence.strip())
+        return paragraphs
+
+    @func_set_timeout(300)
+    def gethtml(self, contentWithTag):
+        """Translate the text nodes of an HTML fragment and return the new HTML.
+
+        The fragment is returned unchanged when ``baseCore.detect_language``
+        reports ``'zh'`` for its text. Otherwise every non-blank text node
+        that is not a bare website link is joined into one payload, each
+        followed by a '😊' marker. The payload is translated in chunks of at
+        most 1000 characters, preferably cut just after a '.', '。', ',' or
+        '，'. The translated payload is then split on the marker and written
+        back into the text nodes in document order.
+
+        Args:
+            contentWithTag: HTML source to translate.
+
+        Returns:
+            ``contentWithTag`` itself for Chinese input; otherwise the
+            prettified, translated HTML followed by a fixed attribution
+            footer.
+        """
+        html = BeautifulSoup(contentWithTag, 'html.parser')
+        lang = baseCore.detect_language(html.text)
+        if lang == 'zh':
+            return contentWithTag
+
+        # Stripped text of every node worth translating, in document order.
+        segments = []
+        for tag in html.find_all(text=True):
+            text = str(tag).strip()
+            if text == '' or self.is_website_link(text):
+                continue
+            segments.append(text)
+        sentence = ''.join(f'{segment}😊' for segment in segments)
+
+        def translate_chunk(chunk):
+            # translate() returns None when no input language is detected;
+            # keep the source text then, so the markers stay aligned and the
+            # concatenation below cannot fail on None.
+            translated = self.translate(chunk, lang)
+            return chunk if translated is None else translated
+
+        result = ''
+        # An empty payload (no translatable nodes) skips the browser entirely.
+        while sentence:
+            stripped = sentence.strip()
+            if len(stripped) == 1 and self.is_punctuation(stripped):
+                # A lone punctuation mark is copied through untranslated.
+                result += sentence
+                break
+            if len(sentence) <= 1000:
+                result += translate_chunk(sentence)
+                time.sleep(2)
+                break
+            # Cut right after the last '.', '。', ',' or '，' found at index
+            # 999 down to 1; fall back to a hard cut at 1000 characters.
+            cut = next(
+                (j + 1 for j in range(999, 0, -1) if sentence[j] in '.。,，'),
+                1000,
+            )
+            result += translate_chunk(sentence[:cut].strip())
+            sentence = sentence[cut:]
+
+        sentences = result.split('😊')
+        num = 0
+        for tag in html.find_all(text=True):
+            text = str(tag).strip()
+            if text == '' or self.is_website_link(text):
+                continue
+            if num >= len(sentences):
+                # The translator dropped or merged markers: leave the
+                # remaining nodes untouched instead of raising IndexError.
+                break
+            tag.replace_with(sentences[num])
+            num += 1
+        # NOTE(review): the footer credits Microsoft automatic translation
+        # although this class drives fanyi.baidu.com, and '<p/>...</p>' is
+        # unbalanced markup - confirm before changing, consumers may rely on it.
+        return str(html.prettify()) + '<p/><br>译文来源：微软自动翻译<br></p>'
--- a/zkr/esToMongodb.py
+++ b/zkr/esToMongodb.py
--- a/zkr/推送.py
+++ b/zkr/推送.py
--- a/zkr/翻译.py
+++ b/zkr/翻译.py