1/18

d9bf9b2f · 薛凌堃 · 654d0ce5 · d9bf9b2f · d9bf9b2f · d9bf9b2f
--- a/Translate/Edge_pyautogui.py
+++ b/Translate/Edge_pyautogui.py
+import pyautogui
+from retry import retry
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import time
+from bson import ObjectId
+import pymongo
+
+def get_active_window_title():
+    """Return the title of the currently active desktop window.
+
+    Returns:
+        The title of the window returned by ``pyautogui.getActiveWindow()``,
+        or ``None`` when that call yields no window.
+    """
+    window = pyautogui.getActiveWindow()
+    if not window:
+        # Check before touching ``.title``: with no active window the
+        # attribute access would raise AttributeError instead of returning None.
+        return None
+    print(f'当前活动窗口的标题是：{window.title}')
+    return window.title
+
+
+@retry(tries=3, delay=1)
+def Translate(_id, driver):
+    """Translate the local HTML page via the browser context menu and save it.
+
+    Loads the fixed ``aaa.html`` file in ``driver``, right-clicks the page
+    body, drives the native context menu with simulated key presses, scrolls
+    through the page in 300px steps, and writes the resulting page source to
+    ``<_id>.html`` on the desktop.
+
+    Args:
+        _id: Identifier used in log output and as the output file name.
+        driver: Selenium WebDriver instance controlling the browser.
+
+    Raises:
+        RuntimeError: If the first 500 characters of the body text are still
+            contained in the text captured before the translate attempt. The
+            ``retry`` decorator re-runs the function on this error.
+    """
+    # Imported locally because the module has no top-level bs4 import.
+    from bs4 import BeautifulSoup
+
+    # Upper bound on Alt+Tab presses so the recovery loop cannot spin forever
+    # when the browser window never becomes the active one.
+    max_switch_attempts = 20
+
+    driver.get('file:///C:/Users/EDY/Desktop/aaa.html')
+
+    # Body text before translating; used below to detect an unchanged page.
+    flag = driver.find_element(By.TAG_NAME, 'body').text
+    driver.maximize_window()
+    # Switch to the browser window.
+    driver.switch_to.window(driver.current_window_handle)
+    # Wait for a while so the page has finished loading.
+    time.sleep(5)
+    # Remember the browser window handle for the failure path below.
+    edge_handle = driver.current_window_handle
+
+    # Right-click the page body to open the context menu.
+    position_element = driver.find_element(By.TAG_NAME, 'body')
+    ActionChains(driver).context_click(position_element).perform()
+    time.sleep(1)
+    # NOTE(review): presumably the sixth context-menu entry is "translate";
+    # confirm against the installed browser version.
+    pyautogui.typewrite(['down'] * 6)
+    pyautogui.typewrite(["enter"])
+
+    # Scroll through the whole page step by step, pausing 1s per step.
+    new_height = driver.execute_script("return document.body.scrollHeight")
+    for i in range(0, new_height, 300):
+        driver.execute_script('window.scrollTo(0, %s)' % (i))
+        time.sleep(1)
+    time.sleep(2)
+
+    if driver.find_element(By.TAG_NAME, 'body').text[:500] in flag:
+        print(f'{_id}---翻译失败,重试')
+        # Simulate Alt+Tab until a window whose title contains
+        # 'Microsoft Edge' is active; a missing title counts as "not Edge".
+        for _ in range(max_switch_attempts):
+            if 'Microsoft Edge' in (get_active_window_title() or ''):
+                break
+            pyautogui.hotkey('alt', 'tab')
+            print('窗口切换操作')
+        # Switch back to the browser window and reload before retrying.
+        driver.switch_to.window(edge_handle)
+        driver.refresh()
+        # Explicit exception instead of a bare ``raise`` with no active
+        # exception (which only surfaced as RuntimeError by accident).
+        raise RuntimeError(f'{_id}: page text unchanged after translate attempt')
+
+    page_source = driver.page_source
+    contentWithTag = BeautifulSoup(page_source, 'html.parser')
+    with open(rf'C:\Users\EDY\Desktop\{_id}.html', 'w', encoding='utf-8') as f:
+        f.write(str(contentWithTag))
+
+
+if __name__ == "__main__":
+    # Script entry point: read up to 10 documents with postCode '2', write each
+    # document's richTextForeign to aaa.html, and run Translate on it.
+    driver = webdriver.Edge()
+    try:
+        # NOTE(review): host and credentials are hard-coded in source; they
+        # should come from configuration or the environment instead.
+        with pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988') as client:
+            db_storage = client.中科软['数据源_0106']
+            datas = db_storage.find({'postCode': '2'}).limit(10)
+            for data in datas:
+                now = time.time()
+                _id = str(data['_id'])
+                richTextForeign = data['richTextForeign']
+                with open(r'C:\Users\EDY\Desktop\aaa.html', 'w', encoding='utf-8') as f:
+                    f.write(str(richTextForeign))
+                try:
+                    Translate(_id, driver)
+                except Exception as e:
+                    # Best effort per document: report the reason and move on.
+                    # KeyboardInterrupt/SystemExit are deliberately not caught.
+                    print('翻译失败', e)
+                print(f'{_id}翻译用时--{time.time() - now}')
+    finally:
+        # Always release the browser session, even when the loop aborts.
+        driver.quit()
\ No newline at end of file
--- a/comData/BaseInfo_qcc/requestQCC.py
+++ b/comData/BaseInfo_qcc/requestQCC.py
@@ -48,7 +48,7 @@ if __name__ == "__main__":
    # soup = BeautifulSoup(page_source,'html.parser')
    # print(soup)
    browser.find_element(By.CLASS_NAME, 'nav-item').click()
-    time.sleep(20)
+    time.sleep(70)
    cookies = flushAndGetToken()
    cookies = json.dumps(cookies)
    insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"

--- a/comData/Tyc/CorePerson.py
+++ b/comData/Tyc/CorePerson.py
@@ -41,17 +41,18 @@ def doJob():
                baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
                continue
            id = data[0]
+            com_name = data[1]
            xydm = data[2]
            tycid = data[11]
            if tycid == None or tycid == '':
                try:
-                    retData = getTycIdByXYDM(xydm)
+                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # # todo:写入数据库
-                        # updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
-                        # cursor_.execute(updateSql)
-                        # cnx_.commit()
+                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
+                        cursor_.execute(updateSql)
+                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())

--- a/sougou_comm/config.ini
+++ b/sougou_comm/config.ini
@@ -16,6 +16,8 @@ topic=keyWordsInfo
 groupId=python_sougou

 [selenium]
-chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
-binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
+;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
+;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
+chrome_driver=D:\cmd100\chromedriver.exe
+binary_location=D:\Google\Chrome\Application\chrome.exe

--- a/sougou_comm/sougouSpider.py
+++ b/sougou_comm/sougouSpider.py
@@ -7,6 +7,7 @@ import urllib3
 from bs4 import BeautifulSoup
 from gne import GeneralNewsExtractor
 from langid import langid
+from retry import retry
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
@@ -144,7 +145,14 @@ class SougouSpider(object):
        itemTags=html.xpath('//div[@class="vrwrap"]')
        for itemTag in itemTags:
            try:
-                title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
+                elements=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')
+                title = ''.join(str(element.strip()) for element in elements if element.strip())
+                # title = ''
+                # for e in elements:
+                #     print(e)
+                #     title += e
+                print(title)
+
            except Exception as e:
                title=''
            try:
@@ -243,9 +251,10 @@ class SougouSpider(object):
            print('时间解析异常！！')
        return publishtime

+    @retry(tries=3, delay=3)
    # 获取每一页数据, 开趴.
    def get_page_html(self):
-        self.logger.info("进入搜狗首页...")
+        self.logger.info(f"{self.searchkw}...进入搜狗首页...")
        self.driver.get(self.url)
        self.driver.find_element(By.ID, 'query').send_keys(self.searchkw)
        self.driver.find_element(By.ID, 'stb').click()
@@ -280,7 +289,7 @@ class SougouSpider(object):
        timeFlag=False
        while hasnext == '下一页':
            try:
-                if self.page_num==2:
+                if self.page_num ==21:
                    break
                self.page_num = self.page_num + 1
                self.logger.info("开始抓取第%s页..." % self.page_num)
@@ -302,6 +311,7 @@ class SougouSpider(object):
                    #     if pubtime < needTime:
                    #         timeFlag = True
                    #         break
+                    durl = detail['detailUrl']
                    is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
                    if is_member:
                        continue
@@ -325,6 +335,8 @@ class SougouSpider(object):
    def getDetailmsg(self,detailmsg):
        try:
            detailurl=detailmsg['detailUrl']
+            if detailurl == '':
+                return ''
            title = detailmsg['title']
            content,contentWithTag=self.extractorMsg(detailurl,title)
            contentWithTag=self.rmTagattr(contentWithTag,detailurl)
@@ -350,6 +362,7 @@ class SougouSpider(object):
        }
        return detailmsg

+    @retry(tries=3, delay=2)
    def webDriver(self,url):
        chrome_driver =self.config.get('selenium', 'chrome_driver')
        path = Service(chrome_driver)
@@ -360,12 +373,12 @@ class SougouSpider(object):
        try:
            driver.get(url)
            # 等待页面加载完成
-            # wait = WebDriverWait(self.driver, 20)
-            # wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+            wait = WebDriverWait(self.driver, 20)
+            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)
            html=driver.page_source
        except Exception as e:
-            self.logger.info('请求失败')
+            self.logger.info(f'请求失败{e}')
        finally:
            driver.quit()

@@ -406,11 +419,12 @@ class SougouSpider(object):
        # current_window = self.driver.current_window_handle
        while True:
            if self.detailList.qsize() != 0:
-                try:
+
                detailmsg=self.detailList.get()
                title = detailmsg['title']
                detailUrl = detailmsg['detailUrl']
-                    print("%s:%s\n" % (title, detailUrl))
+                self.logger.info("%s:%s\n" % (title, detailUrl))
+                try:
                    # # js = "window.open('"+detailUrl+"')"
                    # # self.driver.execute_script(js)
                    # try:
@@ -423,19 +437,23 @@ class SougouSpider(object):
                    # response = self.driver.page_source
                    # bdetail=self.getDetailmsg(response,detailmsg)
                    bdetail=self.getDetailmsg(detailmsg)
+                    if not bdetail:
+                        continue
                    processitem=self.getProcessitem(bdetail)
                    try:
-                        # self.sendkafka(processitem)
+                        self.sendkafka(processitem)
                        self.r.sadd('pysougou_'+self.wordsCode, processitem['sourceAddress'])
-                    except Exception as e:
-                        self.logger.info("放入kafka失败！")
-                    #插入数据库
+                        # 插入数据库
                        try:
-                        items=[]
+                            items = []
                            items.append(bdetail)
                            self.itemInsertToTable(items)
                        except Exception as e:
-                        self.logger.info("插入数据库失败！")
+                            self.logger.info(f"插入数据库失败！{bdetail['kword']}===={detailUrl}")
+                        self.logger.info(f"放入kafka成功！{bdetail['kword']}===={detailUrl}")
+                    except Exception as e:
+                        self.logger.info(f"放入kafka失败！{bdetail['kword']}===={detailUrl}")
+
                    # 关闭当前新窗口
                    # self.driver.close()
                    time.sleep(1)

--- a/sougou_comm/sougoutaskJob_loc.py
+++ b/sougou_comm/sougoutaskJob_loc.py
@@ -218,12 +218,13 @@ if __name__ == '__main__':
    while True:
        try:
            codeList=[]
-            codeList.append('KW-20231013-0001')
+            # codeList.append('KW-20231013-0001')
+            codeList.append('KW-20240116-0001')
            for codeid in codeList:
                try:
-                    # keymsg=sougouTaskJob.getkeyFromredis(codeid)
-                    # kwList=sougouTaskJob.paserKeyMsg(keymsg)
-                    kwList=sougouTaskJob.lockwMsg()
+                    keymsg=sougouTaskJob.getkeyFromredis(codeid)
+                    kwList=sougouTaskJob.paserKeyMsg(keymsg)
+                    # kwList=sougouTaskJob.lockwMsg()
                    if len(kwList)<1:
                        continue
                    logger.info(f"需要搜索的关键词:{kwList}")
@@ -233,7 +234,7 @@ if __name__ == '__main__':
                    continue
                if kwList:
                    # 创建一个线程池，指定线程数量为4
-                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                        # 提交任务给线程池，每个任务处理一个数据
                        results = [executor.submit(sougouTaskJob.runLocSpider, data) for data in kwList]
                        # 获取任务的执行结果