提交 d9bf9b2f 作者: 薛凌堃

1/18

上级 654d0ce5
import pyautogui
from retry import retry
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bson import ObjectId
import pymongo
def get_active_window_title():
    """Return the title of the currently active window.

    The title is also printed for debugging when a window is found.

    Returns:
        The active window's ``title`` attribute, or ``None`` when
        ``pyautogui.getActiveWindow()`` yields no (falsy) window.
    """
    window = pyautogui.getActiveWindow()
    if not window:
        # Check before touching ``window.title``: the debug print used to
        # dereference the window first and crashed when none was active.
        return None
    print(f'当前活动窗口的标题是:{window.title}')
    return window.title
@retry(tries=3, delay=1)
def Translate(_id, driver):
    """Translate the local ``aaa.html`` page in the browser and save the result.

    The page is loaded from the desktop, the browser context menu is opened
    on ``<body>`` and driven with simulated key presses, the page is scrolled
    from top to bottom in 300px steps, and finally the page source is written
    to ``<_id>.html`` on the desktop.

    Args:
        _id: Identifier used in log output and as the output file name.
        driver: Selenium WebDriver instance controlling the browser
            (the script entry point passes ``webdriver.Edge()``).

    Raises:
        RuntimeError: When the first 500 characters of the body text are
            still contained in the text captured before translation, i.e.
            the page looks untranslated. Because of ``@retry`` the function
            is attempted again when it raises.
    """
    from bs4 import BeautifulSoup

    driver.get('file:///C:/Users/EDY/Desktop/aaa.html')
    # Snapshot of the body text before translation; compared against below.
    flag = driver.find_element(By.TAG_NAME, 'body').text
    driver.maximize_window()
    # Switch to the browser window.
    driver.switch_to.window(driver.current_window_handle)
    # Wait a while so the page is fully loaded.
    time.sleep(5)
    # Remember the browser window handle for the failure path.
    edge_handle = driver.current_window_handle

    # Right-click the body and pick an entry from the context menu.
    # NOTE(review): presumably the 6th entry is "Translate" - this depends on
    # the browser version/locale; confirm on the target machine.
    rightClick = ActionChains(driver)
    position_element = driver.find_element(By.TAG_NAME, 'body')
    rightClick.context_click(position_element).perform()
    time.sleep(1)
    pyautogui.typewrite(['down'] * 6)
    pyautogui.typewrite(["enter"])

    # Scroll through the whole page in 300px steps, pausing at each step.
    # The height is measured once; the original re-ran the same script on
    # every iteration and discarded the result.
    js = "return action=document.body.scrollHeight"
    new_height = driver.execute_script(js)
    for offset in range(0, new_height, 300):
        driver.execute_script(f'window.scrollTo(0, {offset})')
        time.sleep(1)
    time.sleep(2)

    if driver.find_element(By.TAG_NAME, 'body').text[:500] in flag:
        print(f'{_id}---翻译失败,重试')
        # Alt+Tab until the Edge window is in the foreground. A missing
        # active window (None title) is treated as "not Edge" instead of
        # raising TypeError on the ``in`` test.
        while 'Microsoft​ Edge' not in (get_active_window_title() or ''):
            pyautogui.hotkey('alt', 'tab')
            print('窗口切换操作')
        # Switch back to the browser window and reload before retrying.
        driver.switch_to.window(edge_handle)
        driver.refresh()
        # Explicit exception instead of a bare ``raise`` with no active
        # exception (which only worked by accidentally raising RuntimeError).
        raise RuntimeError(f'{_id}: page text unchanged, translation failed')

    page_source = driver.page_source
    contentWithTag = BeautifulSoup(page_source, 'html.parser')
    with open(rf'C:\Users\EDY\Desktop\{_id}.html', 'w', encoding='utf-8') as f:
        f.write(str(contentWithTag))
if __name__ == "__main__":
    # Script entry point: fetch up to 10 documents with postCode '2', dump
    # each document's ``richTextForeign`` HTML to the desktop as aaa.html and
    # run it through Translate(), printing the time spent per document.
    driver = webdriver.Edge()
    client = None
    try:
        # NOTE(review): host and credentials are hard-coded in source; they
        # should come from configuration or environment variables.
        client = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988')
        db_storage = client.中科软['数据源_0106']
        datas = db_storage.find({'postCode': '2'}).limit(10)
        for data in datas:
            now = time.time()
            _id = str(data['_id'])
            richTextForeign = data['richTextForeign']
            with open(r'C:\Users\EDY\Desktop\aaa.html', 'w', encoding='utf-8') as f:
                f.write(str(richTextForeign))
            try:
                Translate(_id, driver)
            except Exception as e:
                # Best effort per document: report and move on. Catching
                # Exception (not a bare except) lets Ctrl+C stop the run.
                print('翻译失败')
                print(f'{_id}: {e!r}')
            print(f'{_id}翻译用时--{time.time() - now}')
    finally:
        # Always release the database connection and the browser.
        if client is not None:
            client.close()
        driver.quit()
\ No newline at end of file
......@@ -48,7 +48,7 @@ if __name__ == "__main__":
# soup = BeautifulSoup(page_source,'html.parser')
# print(soup)
browser.find_element(By.CLASS_NAME, 'nav-item').click()
time.sleep(20)
time.sleep(70)
cookies = flushAndGetToken()
cookies = json.dumps(cookies)
insert = f"insert into QCC_token (cookies,create_time,fenghao_time,update_time) values ('{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),now())"
......
......@@ -41,17 +41,18 @@ def doJob():
baseCore.rePutIntoR('CorPersonEnterprise:gnqy_socialCode',social_code)
continue
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
retData = getTycIdByXYDM(com_name)
if retData['state']:
tycid = retData['tycData']['id']
# # todo:写入数据库
# updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
# cursor_.execute(updateSql)
# cnx_.commit()
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
......
......@@ -16,6 +16,8 @@ topic=keyWordsInfo
groupId=python_sougou
[selenium]
chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
;chrome_driver=C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe
;binary_location=D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe
chrome_driver=D:\cmd100\chromedriver.exe
binary_location=D:\Google\Chrome\Application\chrome.exe
......@@ -7,6 +7,7 @@ import urllib3
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
......@@ -144,7 +145,14 @@ class SougouSpider(object):
itemTags=html.xpath('//div[@class="vrwrap"]')
for itemTag in itemTags:
try:
title=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')[0]
elements=itemTag.xpath('.//h3[@class="vr-title"]/a/text()')
title = ''.join(str(element.strip()) for element in elements if element.strip())
# title = ''
# for e in elements:
# print(e)
# title += e
print(title)
except Exception as e:
title=''
try:
......@@ -243,9 +251,10 @@ class SougouSpider(object):
print('时间解析异常!!')
return publishtime
@retry(tries=3, delay=3)
# 获取每一页数据, 开趴.
def get_page_html(self):
self.logger.info("进入搜狗首页...")
self.logger.info(f"{self.searchkw}...进入搜狗首页...")
self.driver.get(self.url)
self.driver.find_element(By.ID, 'query').send_keys(self.searchkw)
self.driver.find_element(By.ID, 'stb').click()
......@@ -280,7 +289,7 @@ class SougouSpider(object):
timeFlag=False
while hasnext == '下一页':
try:
if self.page_num==2:
if self.page_num ==21:
break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
......@@ -302,6 +311,7 @@ class SougouSpider(object):
# if pubtime < needTime:
# timeFlag = True
# break
durl = detail['detailUrl']
is_member = self.r.sismember('pysougou_'+self.wordsCode, durl)
if is_member:
continue
......@@ -325,6 +335,8 @@ class SougouSpider(object):
def getDetailmsg(self,detailmsg):
try:
detailurl=detailmsg['detailUrl']
if detailurl == '':
return ''
title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag,detailurl)
......@@ -350,6 +362,7 @@ class SougouSpider(object):
}
return detailmsg
@retry(tries=3, delay=2)
def webDriver(self,url):
chrome_driver =self.config.get('selenium', 'chrome_driver')
path = Service(chrome_driver)
......@@ -360,12 +373,12 @@ class SougouSpider(object):
try:
driver.get(url)
# 等待页面加载完成
# wait = WebDriverWait(self.driver, 20)
# wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
wait = WebDriverWait(self.driver, 20)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
html=driver.page_source
except Exception as e:
self.logger.info('请求失败')
self.logger.info(f'请求失败{e}')
finally:
driver.quit()
......@@ -406,11 +419,12 @@ class SougouSpider(object):
# current_window = self.driver.current_window_handle
while True:
if self.detailList.qsize() != 0:
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
print("%s:%s\n" % (title, detailUrl))
self.logger.info("%s:%s\n" % (title, detailUrl))
try:
# # js = "window.open('"+detailUrl+"')"
# # self.driver.execute_script(js)
# try:
......@@ -423,19 +437,23 @@ class SougouSpider(object):
# response = self.driver.page_source
# bdetail=self.getDetailmsg(response,detailmsg)
bdetail=self.getDetailmsg(detailmsg)
if not bdetail:
continue
processitem=self.getProcessitem(bdetail)
try:
# self.sendkafka(processitem)
self.sendkafka(processitem)
self.r.sadd('pysougou_'+self.wordsCode, processitem['sourceAddress'])
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
# 插入数据库
try:
items=[]
items = []
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info("插入数据库失败!")
self.logger.info(f"插入数据库失败!{bdetail['kword']}===={detailUrl}")
self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
except Exception as e:
self.logger.info(f"放入kafka失败!{bdetail['kword']}===={detailUrl}")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
......
......@@ -218,12 +218,13 @@ if __name__ == '__main__':
while True:
try:
codeList=[]
codeList.append('KW-20231013-0001')
# codeList.append('KW-20231013-0001')
codeList.append('KW-20240116-0001')
for codeid in codeList:
try:
# keymsg=sougouTaskJob.getkeyFromredis(codeid)
# kwList=sougouTaskJob.paserKeyMsg(keymsg)
kwList=sougouTaskJob.lockwMsg()
keymsg=sougouTaskJob.getkeyFromredis(codeid)
kwList=sougouTaskJob.paserKeyMsg(keymsg)
# kwList=sougouTaskJob.lockwMsg()
if len(kwList)<1:
continue
logger.info(f"需要搜索的关键词:{kwList}")
......@@ -233,7 +234,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(sougouTaskJob.runLocSpider, data) for data in kwList]
# 获取任务的执行结果
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论