Commit 654d0ce5 by 薛凌堃

1/17

Parent 9fc0f9da
@@ -126,7 +126,7 @@ def delete_url(article_url):
     else:
         return False
-def uptoOBS(pdf_url, name_pdf, type_id, pathType):
+def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
     retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
                'full_path': '',
                'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
@@ -134,10 +134,17 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
     for i in range(0, 3):
         try:
             ip = baseCore.get_proxy()
-            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+            if header:
+                response = requests.get(pdf_url, headers=header, verify=False, timeout=20)
+            else:
+                response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
+            if response.status_code == 200:
+                pass
+            else:
+                return retData
             file_size = int(response.headers.get('Content-Length'))
             break
-        except:
+        except Exception as e:
             time.sleep(3)
             continue
     page_size = 0
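The hunk above threads an optional `header` argument through `uptoOBS` so callers that scraped a page behind cookies can reuse their own request headers, and it now bails out early on non-200 responses. A minimal sketch of that pattern; `fetch_pdf` and the default `headers` dict are illustrative stand-ins, not names from the commit:

```python
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the module-level default headers

def fetch_pdf(pdf_url, header=''):
    # prefer the caller-supplied header dict; fall back to the module default
    h = header if header else headers
    response = requests.get(pdf_url, headers=h, verify=False, timeout=20)
    if response.status_code != 200:
        return None  # the real code returns retData with state=False here
    return response.content
```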
@@ -175,7 +182,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
 # download the PDF file and upload it to the server
-def download(data, order_by):
+def download(data, order_by, header):
     url_pdf = data['url_pdf']
     name_pdf = data['title']
     if '.pdf' not in name_pdf:
@@ -251,7 +258,7 @@ def download(data, order_by):
     url_pdf = 'https://' + url_pdf
     # upload the file to OBS
-    retData = uptoOBS(url_pdf, name_pdf, 4, pathType)
+    retData = uptoOBS(url_pdf, name_pdf, 4, pathType, header)
     if retData['state']:
         pass
     else:
@@ -495,7 +502,7 @@ def Mob():
         'sid': '1662008807781212161',  # info source id
     }
     order_by = 1
-    download(dic_post,order_by)
+    download(dic_post,order_by, '')
     order_by += 1
@@ -553,7 +560,7 @@ def yidong_guanxiangtai():
         'sid': '1662008276140597250',  # info source id
     }
     order_by = 1
-    download(dic_post, order_by)
+    download(dic_post, order_by, '')
     order_by += 1
     # print(page,dic_post)
     # url = 'http://114.115.155.139:5002/report_download'
@@ -659,7 +666,7 @@ def getnews(browser):
         'sid': '1662008524476948481',  # info source id
     }
     order_by = 1
-    download(dic_post, order_by)
+    download(dic_post, order_by, '')
     order_by += 1
     # print(page,dic_post)
     # url = 'http://114.115.155.139:5002/report_download'
@@ -774,7 +781,7 @@ def ke36():
         'sid': '1662008421217378306',  # info source id
     }
     order_by = 1
-    # download(dic_post, order_by)
+    download(dic_post, order_by, '')
     order_by += 1
     # print(page,dic_post)
     # url = 'http://114.115.155.139:5002/report_download'
@@ -824,7 +831,7 @@ def qianyanzhishiku():
         'sid': '1662008620631367682',  # info source id
     }
     order_by = 1
-    download(dic_post, order_by)
+    download(dic_post, order_by, '')
     order_by += 1
     # print(page,dic_post)
     # url = 'http://114.115.155.139:5002/report_download'
@@ -904,6 +911,12 @@ def shijiejingjiluntan():
         url = f'https://cn.weforum.org/publications/?page={i}'
         browser.get(url)  # jump to the target page
         time.sleep(5)
+        # dump the cookies held by the browser session
+        cookie_list = browser.get_cookies()
+        cookies = {}
+        for cookie in cookie_list:
+            cookies[cookie['name']] = cookie['value']
+        # print(cookies)
         wait = WebDriverWait(browser, 30)
         wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
         page_source = browser.page_source  # grab the page source
@@ -930,13 +943,58 @@ def shijiejingjiluntan():
             info_href = tag.find('a').get('href')
+            header = {
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Accept-Language': 'zh-CN,zh;q=0.9',
+                'Cache-Control': 'max-age=0',
+                'Connection': 'keep-alive',
+                # 'Cookie': '_vwo_uuid_v2=D73B6BDECC63FD6C529BF86681F8CD58B|c28a87d1a3cc7168448309232b08e8b6; _vwo_uuid=D73B6BDECC63FD6C529BF86681F8CD58B; CookieConsent={stamp:%27Ri/OIoTKx+9q/VGLC27KMJW8eBEK5YnHm+js/rEyWz9novAMhCvuFQ==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27explicit%27%2Cver:1%2Cutc:1702452209760%2Cregion:%27cn%27}; _gcl_au=1.1.1801193883.1702452210; mpDistinctId=abc6849e-26de-4c82-b884-28fa56155633; _parsely_session={%22sid%22:7%2C%22surl%22:%22https://cn.weforum.org/publications/%22%2C%22sref%22:%22%22%2C%22sts%22:1705367422607%2C%22slts%22:1703642202973}; _parsely_visitor={%22id%22:%22pid=7df1e8df-edda-4b0b-abf2-7342814bd9f7%22%2C%22session_count%22:7%2C%22last_session_ts%22:1705367422607}; _gid=GA1.2.1315176656.1705367430; _vis_opt_s=4%7C; _vis_opt_test_cookie=1; _web_session=c2F5TXRsb2NWK0dhL0Q2YXIzdHY2Wkx6eTk2ZlYyRHRDM20wYVVFV3R0VkxxT09DWHJJaHpEeWNzZ0RqM2dDQlpmWWdjUjlQblgzUHBoT0Q4bGdBdmFkUjg2MC9WTURTSmpUNm1NSnZHQ2hpZHVLbWpOSHEra01ucnNOWlMyNWFTa2NpTWNpSHIxbGcwRzZXbXhGTmFRPT0tLUwvcmovcHovSFB3Y0FxNTkrdkUzWEE9PQ%3D%3D--1f41b0e467ab72bcb2632361eedb0ac2c73bbcb1; _ga_1RV0X04XBG=GS1.1.1705367429.7.1.1705369590.0.0.0; _ga_2K5FR2KRN5=GS1.1.1705367430.7.1.1705369590.48.0.0; mp_6232aeb08818ee1161204a011ed8ad16_mixpanel=%7B%22distinct_id%22%3A%20%22abc6849e-26de-4c82-b884-28fa56155633%22%2C%22%24device_id%22%3A%20%2218c620f58ad7028-0ac659fa5c3fcc-26001951-e1000-18c620f58ad7028%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%2C%22%24user_id%22%3A%20%22abc6849e-26de-4c82-b884-28fa56155633%22%2C%22platform%22%3A%20%22Public%20Site%22%7D; _ga=GA1.2.1351018774.1702452201; _ga_4DKG1LX6QK=GS1.1.1705367445.7.1.1705369592.57.0.0; _vwo_sn=2921458; _vwo_ds=3%3Aa_0%2Ct_0%3A0%241702452201%3A26.98833337%3A%3A26_0%3A4_0%2C3_0%3A3',
+                'Host': 'cn.weforum.org',
+                # 'If-None-Match': 'W/"f45caaa1faa8a197c0da29d6c90fa0e8"',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'same-origin',
+                'Sec-Fetch-User': '?1',
+                'Upgrade-Insecure-Requests': '1',
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                'sec-ch-ua-mobile': '?0',
+                'sec-ch-ua-platform': '"Windows"'
+            }
+            cookies = json.dumps(cookies)
+            cookies_ = json.loads(cookies)
+            s = requests.session()
+            s.cookies.update(cookies_)
+            response = s.get(url=info_href, headers=header, verify=False)
+            # jar = requests.cookies.RequestsCookieJar()  # build a RequestsCookieJar object first
+            # for i in cookie_list:
+            #     # copy each cookie name/value collected on the Selenium side into the jar
+            #     # domain and path are optional; they keep same-name cookies in different
+            #     # scopes from overwriting each other
+            #     jar.set(i['name'], i['value'], domain=i['domain'], path=i['path'])
+            #
+            # session = requests.session()  # access the site with a requests session
+            # session.cookies.update(jar)   # attach the configured jar to the session
+            # req = requests.Request(method='GET', url=url, headers=header)
+            # rpe = session.send(session.prepare_request(req),
+            #                    verify=False,  # verify=False skips SSL certificate checks
+            #                    timeout=10)
-            res_info = requests.get(info_href)
-            soup_info = BeautifulSoup(res_info.content,'html.parser')
+            # rpe = requests.get(url=info_href, headers=header)
+            # res_info = session.get(url=info_href, headers=header)
+            if response.status_code == 200:
+                pass
+            else:
+                response = s.get(url=info_href, headers=header)
+            soup_info = BeautifulSoup(response.content,'html.parser')
             info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
             dic_post = {
                 'title': info_title,  # report title
                 'url_pdf': info_pdf,  # report PDF link
@@ -953,7 +1011,7 @@ def shijiejingjiluntan():
                 'sid': '1662008019231088642',  # info source id
             }
             order_by = 1
-            download(dic_post, order_by)
+            download(dic_post, order_by, header)
             order_by += 1
             # print(page,dic_post)
             # url = 'http://114.115.155.139:5002/report_download'
......
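The commented-out block in the hunk above sketches the RequestsCookieJar hand-off; the active code does the same transfer via a dict. As a hedged reference, the whole Selenium-to-requests cookie hand-off can be wrapped in one helper (`session_from_driver` is an illustrative name, not from the commit):

```python
import requests

def session_from_driver(driver):
    """Copy the cookies a Selenium WebDriver holds into a requests.Session."""
    session = requests.Session()
    for cookie in driver.get_cookies():
        # domain/path keep same-name cookies from different scopes
        # from overwriting each other
        session.cookies.set(cookie['name'], cookie['value'],
                            domain=cookie.get('domain'),
                            path=cookie.get('path', '/'))
    return session
```

Note that the `json.dumps`/`json.loads` round-trip in the committed code is a no-op on a plain name/value dict; updating the session's jar from the dict directly would behave the same.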
@@ -56,7 +56,7 @@ if __name__=="__main__":
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
     # adjustable
-    time.sleep(20)
+    time.sleep(40)
     s = requests.session()
     # obtain the token and cookies
......
@@ -28,9 +28,9 @@ r = baseCore.r
 def resHtml(token,url,cookies):
     try:
         ip = baseCore.get_proxy()
-        s=requests.session()
+        # s=requests.session()
         cookie_jar = requests.utils.cookiejar_from_dict(cookies, cookiejar=None, overwrite=True)
+        s = requests.session()
         s.cookies = cookie_jar
         # json_search = s.get(url, headers=headers, proxies=ip, verify=False).json()
         json_search = s.get(url, headers=headers, proxies=ip,verify=False).json()
@@ -142,7 +142,8 @@ def updateCookieToken(token,cookies):
     cnx_.commit()
 # fetch a token
 def getToken():
-    cursor_.execute(f"select token,cookies from weixin_tokenCookies where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
+    # cursor_.execute(f"select token,cookies from weixin_tokenCookies where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
+    cursor_.execute(f"select token,cookies from weixin_tokenCookies where user_name = 'wahaha2'")
     row = cursor_.fetchall()
     cnx_.commit()
     if row:
......
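For reference, the session wiring that the `resHtml` hunk above reorders (create the jar from a dict, then attach it to a fresh session) amounts to this minimal sketch with dummy cookie values:

```python
import requests

cookies = {'uuid': 'abc', 'data_ticket': 'xyz'}  # dummy values
jar = requests.utils.cookiejar_from_dict(cookies, cookiejar=None, overwrite=True)
s = requests.session()
s.cookies = jar
print(s.cookies.get('data_ticket'))  # -> 'xyz'
```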
@@ -215,8 +215,8 @@ def spider_zhuanli(com_name, social_code, tycid):
 def runSpider():
     # use the social credit code pulled from Redis to look up the matching basic info in the database
-    # social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
-    social_code = '91360400794798498A'
+    social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
+    # social_code = '91360400794798498A'
     # if Redis has no data left, wait
     if social_code == None:
         # time.sleep(20)
@@ -331,6 +331,6 @@ if __name__ == '__main__':
     while True:
         start = time.time()
-        num_threads = 1
+        num_threads = 5
         run_threads(num_threads)
         log.info(f'5线程 总耗时{time.time()-start}秒')
@@ -3,6 +3,7 @@ from urllib.parse import urljoin
 import langid
 import pymysql
 from gne import GeneralNewsExtractor
+from retry import retry
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
@@ -251,10 +252,10 @@ class GoogleSpider(object):
             print('时间解析异常!!')
         return publishtime
+    @retry(tries=3, delay=3)
     # fetch each page of data and start crawling
     def get_page_html(self):
-        self.logger.info("进入google首页..." )
+        self.logger.info(f"{self.searchkw}...进入google首页...")
         self.driver.get(self.url)
         # wait for the page to finish loading
         time.sleep(3)
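The `@retry(tries=3, delay=3)` decorator added above comes from the `retry` package imported earlier in this file: it re-invokes the wrapped function up to three times, sleeping three seconds between attempts, and re-raises the last exception if every attempt fails. A self-contained sketch of that behavior, with a dummy flaky function:

```python
from retry import retry

attempts = {'n': 0}

@retry(tries=3, delay=1)
def flaky():
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise RuntimeError('transient failure')  # first two calls fail
    return 'ok'

print(flaky())  # succeeds on the third attempt
```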
@@ -280,9 +281,9 @@ class GoogleSpider(object):
             time.sleep(2)
             self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
         except Exception as e:
-            self.logger.info(f'--点击按钮失效----{e}')
+            self.logger.info(f'--{self.searchkw}--点击按钮失效----{e}')
             return
-        self.logger.info("开始抓取首页..." + self.searchkw )
+        self.logger.info(f"{self.searchkw}...开始抓取首页...")
         time.sleep(5)
         flag, lists = self.parse_page()
         if len(lists)<1:
@@ -291,19 +292,23 @@ class GoogleSpider(object):
             durl=detail['detailUrl']
             is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
             if is_member:
+                self.logger.info(f"{self.searchkw}已存在{detail['title']}")
                 continue
             self.detailList.put(detail)
         response = self.driver.page_source
         html = etree.HTML(response)
-        hasnext = html.xpath('//table[@class="AaVjTc"]//td[last()]//text()')[0]
-        hasnext = hasnext.strip()
+        try:
+            hasnext = html.xpath('//table[@class="AaVjTc"]//td[last()]//text()')[0]
+            hasnext = hasnext.strip()
+        except:
+            hasnext = ''
         timeFlag = False
         while hasnext == '下一页':
-            if self.page_num==5:
-                break
+            # if self.page_num==5:
+            #     break
             self.page_num = self.page_num + 1
-            self.logger.info("开始抓取第%s页..." % self.page_num)
+            self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
             try:
                 self.driver.find_element(By.XPATH, '//a[@id="pnnext"]').click()
             except Exception as e:
@@ -311,17 +316,19 @@ class GoogleSpider(object):
             time.sleep(5)
             flag, lists = self.parse_page()
             for detail in lists:
+                durl = detail['detailUrl']
                 is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
                 if is_member:
+                    self.logger.info(f"{self.searchkw}已存在{detail['title']}")
                     continue
                 publishTag=detail['publishTag']
-                if publishTag:
-                    pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
-                    needDate='2022-01-01 00:00:00'
-                    needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
-                    if pubtime < needTime:
-                        timeFlag = True
-                        break
+                # if publishTag:
+                #     pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
+                #     needDate='2022-01-01 00:00:00'
+                #     needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
+                #     if pubtime < needTime:
+                #         timeFlag = True
+                #         break
                 self.detailList.put(detail)
             if timeFlag:
                 break
@@ -333,7 +340,7 @@ class GoogleSpider(object):
                 self.logger.info(hasnext)
             except Exception as e:
                 hasnext=''
-        self.logger.info("抓取完毕")
+        self.logger.info(f"{self.searchkw}...列表抓取完毕")
     def getRequest(self,url):
         html=''
@@ -361,11 +368,12 @@ class GoogleSpider(object):
     def get_detail_html(self):
         while True:
             if self.detailList.qsize() != 0:
+                try:
                     detailmsg=self.detailList.get()
                     title = detailmsg['title']
                     detailUrl = detailmsg['detailUrl']
-                    self.logger.info("%s:%s\n" % (title, detailUrl))
+                    self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
                     try:
                         # try:
                         #     self.driver.get(detailUrl)
                         # except Exception as e:
@@ -398,15 +406,17 @@ class GoogleSpider(object):
                     try:
                         self.sendkafka(processitem)
                         self.r.sadd('pygoogle_'+self.wordsCode, processitem['sourceAddress'])
-                    except Exception as e:
-                        self.logger.info("放入kafka失败!")
-                    # insert into the database
-                    try:
-                        items=[]
-                        items.append(bdetail)
-                        self.itemInsertToTable(items)
-                    except Exception as e:
-                        self.logger.info(f"插入数据库失败==={e}")
+                        # insert into the database
+                        try:
+                            items = []
+                            items.append(bdetail)
+                            self.itemInsertToTable(items)
+                        except Exception as e:
+                            self.logger.info(f"插入数据库失败!{bdetail['kword']}===={detailUrl}")
+                        self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
+                    except Exception as e:
+                        self.logger.info(f"放入kafka失败!{bdetail['kword']}===={detailUrl}")
                     # close the current new window
                     # self.driver.close()
                     time.sleep(1)
@@ -508,10 +518,11 @@ class GoogleSpider(object):
             if publishDate=='':
                 return
             kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
-            self.logger.info("数据发送kafka成功")
+            # self.logger.info("数据发送kafka成功")
             self.logger.info(kafka_result.get(timeout=10))
         except Exception as e:
-            self.logger.info('发送kafka异常')
+            pass
+            # self.logger.info('发送kafka异常')
         finally:
             producer.close()
......
@@ -26,7 +26,7 @@ baseCore = BaseCore()
 class Translate():
     def __init__(self):
-        self.url = "https://fanyi.baidu.com/#"
+        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
         self.header = {
             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
         self.browser = self.createDriver()
@@ -53,7 +53,7 @@ class Translate():
         profile.set_preference('network.proxy.ssl',proxy_['http'].split('://')[1].split(':')[0])
         profile.set_preference('network.proxy.ssl_port',int(proxy_['http'].split('://')[1].split(':')[1]))
         profile.update_preferences()
-        service = Service(r'F:\spider\firefox\geckodriver_1.exe')
+        service = Service(r'D:\soft\geckodriver.exe')
         options = Options()
         options.set_preference("general.useragent.override",
                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
......
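Both copies of the Translate class switch `self.url` to a template, `https://fanyi.baidu.com/#{}/{}/{}`, which deep-links a source language, target language, and sentence in the URL fragment. A sketch of how `translate()` builds the final URL; `quote` with `safe='/:#'` escapes only the sentence while leaving the scheme and fragment separators intact:

```python
from urllib.parse import quote

url_tpl = "https://fanyi.baidu.com/#{}/{}/{}"
url = quote(url_tpl.format('en', 'zh', 'hello world'), safe='/:#')
print(url)  # https://fanyi.baidu.com/#en/zh/hello%20world
```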
 #coding:utf-8
@@ -20,19 +20,19 @@ from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.common.proxy import Proxy, ProxyType
 from func_timeout import func_set_timeout
 from base.BaseCore import BaseCore
 baseCore = BaseCore()
 class Translate():
     def __init__(self):
-        self.url = "https://fanyi.baidu.com/#"
+        self.url = "https://fanyi.baidu.com/#{}/{}/{}"
+        # self.url = "https://fanyi.baidu.com/#"
         self.header = {
             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
         self.browser = self.createDriver()
         self.db_storage = \
             pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软[
-                '数据源_0106']
+                '数据源_0504']
     def close(self):
         self.browser.quit()
@@ -53,7 +53,7 @@ class Translate():
         profile.set_preference('network.proxy.ssl',proxy_['http'].split('://')[1].split(':')[0])
         profile.set_preference('network.proxy.ssl_port',int(proxy_['http'].split('://')[1].split(':')[1]))
         profile.update_preferences()
-        service = Service(r'F:\spider\firefox\geckodriver_1.exe')
+        service = Service(r'D:\soft\geckodriver.exe')
         options = Options()
         options.set_preference("general.useragent.override",
                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
@@ -71,6 +71,11 @@ class Translate():
     def translate(self, sentence, lang):
         sentence_ = sentence
         wait = WebDriverWait(self.browser, 20)
+        # if lang == 'jp':
+        #     url = self.url.format('jp', 'zh', sentence_)
+        #     url = quote(url, safe='/:#')
+        #     self.browser.set_page_load_timeout(10)
+        # else:
         try:
             word_type = self.get_input_language_type(sentence_, wait)
         except:
@@ -87,6 +92,7 @@ class Translate():
         url = self.url.format(word_type, 'zh', sentence_)
         url = quote(url, safe='/:#')
         self.browser.set_page_load_timeout(10)
+
         try:
             self.browser.get(url)
             wait.until(EC.presence_of_element_located(
@@ -95,7 +101,7 @@ class Translate():
                 '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')
             result = result_.text.strip()
             return result
-        except:
+        except Exception as e:
             self.browser.quit()
             self.browser = self.createDriver()
             result = self.translate(sentence_, lang)
@@ -107,6 +113,7 @@ class Translate():
         wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
         input_word = self.browser.find_element(By.ID, "baidu_translate_input")
         input_word.send_keys(word)
+
         wait.until(EC.presence_of_element_located(
             (By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')))
         word_type = self.browser.find_element(By.XPATH,
@@ -131,25 +138,6 @@ class Translate():
         sentences.append((0, len(contentWithTag), contentWithTag))
         return sentences
-    def jionstr(self, html):
-        paragraphs = []
-        current_sentence = ''
-        for tag in html.find_all(text=True):
-            sentence = str(tag)
-            if sentence == '\n' or sentence == '\t' or sentence == ' ':
-                continue
-            if self.is_punctuation(sentence):
-                continue
-            if sentence.startswith('https://') or sentence.startswith('http://') or sentence.startswith('www.'):
-                continue
-            # check whether the concatenated sentence would exceed 1000 characters
-            if len(current_sentence) + len(sentence) <= 1000:
-                current_sentence += sentence
-            else:
-                paragraphs.append(current_sentence.strip())
-                current_sentence = sentence
-        return paragraphs
     def gethtml(self, contentWithTag):
         tag_list = []
         html = BeautifulSoup(contentWithTag, 'html.parser')
@@ -169,14 +157,15 @@ class Translate():
                 continue
             sentence += f'{tag}😊'
             num += 1
-        #print(num)
+        # print(num)
         # if len(sentence) == 1:
         #     continue
         # if sentence == '\n' or sentence == '\t' or sentence == ' ':
         #     continue
         # if self.is_punctuation(sentence):
         #     continue
-        #print(sentence)
+        print(sentence)
+        # before_ =
         result = ''
         while True:
             if len(sentence.strip()) == 1 and self.is_punctuation(sentence.strip()):
@@ -213,7 +202,7 @@ class Translate():
                 result += self.translate(sentence, lang)
                 time.sleep(2)
                 break
-        #print(result)
+        print(result)
         sentences = result.split('😊')
         #print(len(sentences))
         num = 0
@@ -224,17 +213,20 @@ class Translate():
                 continue
             #print(num,tag)
             sentence = sentences[num]
+            print(tag)
+            print(sentence)
             tag.replace_with(sentence)
             num += 1
         return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
-# if __name__ == "__main__":
-#     test = Translate()
-#     db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
-#         '数据源_0504']
-#     data = db_storage.find_one({'_id': ObjectId('656f14e84d6d77428c713271')})
-#     a = data['richTextForeign']
-#     result = test.gethtml(a)
-#     print(result)
-#     test.close()
\ No newline at end of file
+if __name__ == "__main__":
+    test = Translate()
+    db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
+        '数据源_0106']
+    data = db_storage.find_one({'_id': ObjectId('65a4a2dafcaa02e755b786b1')})
+    a = data['richTextForeign']
+    # a = """<h4><strong>The Small Print:</strong><br> </h4> \n <h4><sup>1 </sup>Based on performance estimated with measurements on 12th Gen Intel Core i9-12900HX with RTX 3080Ti against Intel Core i9-11980HK with RTX 3080, Intel Core i9-12900HK with RTX 3080Ti, AMD Ryzen 9 6900HX with RTX 3060, AMD Ryzen 9 6900HS with Radeon 6700S, Intel Core i7-12700H with RTX 3050Ti and Apple M1 Max MacBook Pro with 32 core integrated GPU. Best available compilers selected for all processors. Binaries compiled with ICC for Intel/AMD, binaries compiled with Xcode 13.1 for Apple. The metric used is the geometric mean of C/C++ integer benchmarks in SPEC*int_rate_base2017 2021.2 LLVM (1-copy) and SPEC*int_rate_base2017 2021.2 LLVM (n-copy). See <a href=\"http://www.intel.com/PerformanceIndex\">www.intel.com/PerformanceIndex</a> for additional workload and configuration details. Results may vary. Other names and brands may be claimed as the property of others.</h4> \n <h4><sup>2</sup> Subject to 6 GHz band availability, operating system support, and router compatibility. Details at <a href=\"http://www.intel.com/PerformanceIndex\">www.intel.com/PerformanceIndex</a> (connectivity)</h4> \n <h4>Performance varies by use, configuration and other factors. Learn more at <a href=\"https://edc.intel.com/content/www/us/en/products/performance/benchmarks/overview/\">www.intel.com/PerformanceIndex</a>.</h4> \n <h4>Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates. See configuration disclosure for details.</h4> \n \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n</div>"""
+    result = test.gethtml(a)
+    print(result)
+    test.close()
\ No newline at end of file
 # -*- coding: utf-8 -*-
@@ -12,9 +12,6 @@ from kafka import KafkaProducer
 from kafka import KafkaConsumer
 import json
 import itertools
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
 from baiduSpider import BaiduSpider
 import concurrent.futures
@@ -43,7 +40,7 @@ class BaiduTaskJob(object):
             bootstrap_servers=[bootstrap_servers],
             value_deserializer=lambda m: json.loads(m.decode('utf-8')))
         try:
-            for record in tqdm(consumer, desc="Consuming messages"):
+            for record in tqdm(consumer):
                 try:
                     logger.info("value:",record.value)
                     keymsg=record.value
@@ -131,7 +128,7 @@ class BaiduTaskJob(object):
                 end_time = time.time()
                 if int(end_time - start_time) > 10:
                     logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
-            logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
+            logger.info(f"获取到关键词组:{keymsg['wordsName']}---{wordsCode}")
             keymsglist=self.getkeywords(keyword)
             for kw in keymsglist:
                 kwmsg={
@@ -169,25 +166,6 @@ class BaiduTaskJob(object):
         # finally:
         #     baiduSpider.driver.quit()
         #     logger.info("关键词采集结束!"+searchkw)
-    def createDriver(self):
-        chrome_driver = r'D:\cmd100\chromedriver.exe'
-        path = Service(chrome_driver)
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
-        chrome_options.add_argument('--disable-gpu')
-        chrome_options.add_argument('--ignore-certificate-errors')
-        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-        chrome_options.add_argument("--start-maximized")
-        proxy = baseCore.get_proxy()
-        chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
-        chrome_options.add_argument(
-            'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
-        # chrome_options.add_argument('--headless')
-        browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
-        return browser
     def runSpider(self,kwmsg):
         searchkw=kwmsg['kw']
         wordsCode=kwmsg['wordsCode']
@@ -197,11 +175,10 @@ class BaiduTaskJob(object):
             baiduSpider.get_page_html()
         except Exception as e:
             try:
-                baiduSpider.driver.quit()
-                baiduSpider.driver=self.createDriver()
                 baiduSpider.get_page_html()
             except Exception as e:
                 logger.info('百度搜索异常'+searchkw)
+                logger.info(e)
         finally:
             baiduSpider.driver.quit()
         if baiduSpider.detailList.qsize() != 0:
@@ -233,7 +210,7 @@ if __name__ == '__main__':
                 continue
             if kwList:
                 # create a thread pool with a fixed number of workers
-                with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+                with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                     # submit the tasks to the pool, one item per task
                     results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                     # collect the results of the tasks
......
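For context, the `max_workers` value tuned above (3 to 2 here, and 1 to 3 in the second task file below) controls the fan-out of the standard `concurrent.futures` pattern these scripts use. A minimal sketch with a dummy worker standing in for `runSpider`:

```python
import concurrent.futures

def work(item):
    return item * 2  # dummy stand-in for runSpider

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # one submitted task per input item
    futures = [executor.submit(work, n) for n in range(4)]
    # drain results as the tasks finish
    for f in concurrent.futures.as_completed(futures):
        print(f.result())
```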
 # -*- coding: utf-8 -*-
@@ -121,7 +121,15 @@ class BaiduTaskJob(object):
         kwList=[]
         if searchEngines:
             if '3' in searchEngines:
-                keyword=keymsg['keyWord']
+                start_time = time.time()
+                keyword = keymsg['keyWord']
+                wordsName = keymsg['wordsName']
+                first = wordsName
+                if wordsName == first:
+                    end_time = time.time()
+                    if int(end_time - start_time) > 10:
+                        logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
+                logger.info(f"获取到关键词组:{keymsg['wordsName']}---{wordsCode}")
                 keymsglist=self.getkeywords(keyword)
                 for kw in keymsglist:
                     kwmsg={
@@ -166,6 +174,8 @@ class BaiduTaskJob(object):
             baiduSpider.get_page_html()
         except Exception as e:
             logger.info('百度搜索异常'+searchkw)
+            logger.error(e)
+
         finally:
             baiduSpider.driver.quit()
         if baiduSpider.detailList.qsize() != 0:
@@ -177,6 +187,22 @@ class BaiduTaskJob(object):
             baiduSpider.driver.quit()
             logger.info("关键词采集结束!"+searchkw)
 import random
+def PutWords(codeList, r):
+    for item in codeList:
+        r.lpush("BaiduSearch:WordsCode", item)
+    logger.info('数据加载完毕')
+
+# fetch and remove one element from the Redis list
+def redicPullData(key, r):
+    try:
+        r.ping()
+    except:
+        r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+    item = r.lpop(key)
+    return item.decode() if item else None
+
 if __name__ == '__main__':
     baiduTaskJob=BaiduTaskJob()
     baseCore=BaseCore()
@@ -185,12 +211,48 @@ if __name__ == '__main__':
     # keymsglist=baiduTaskJob.getkeywords(ss)
     # print(keymsglist)
     # create the Redis connection
+    # r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
+    # codeList = [
+    #     'KW-20220809-0004',
+    #     'KW-20220524-0004',
+    #     'KW-20220809-0005',
+    #     'KW-20220824-0001',
+    #     'KW-20220809-0002',
+    #     'KW-20220809-0003',
+    #     'KW-20220826-0001',
+    #     'KW-20220602-0003',
+    #     'KW-20220602-0002',
+    #     'KW-20220113-0007',
+    #     'KW-20220113-0006',
+    #     'KW-20220108-0004',
+    #     'KW-20220113-0004'
+    # ]
+    # PutWords(codeList, r)
+    print('---------------')
     while True:
         try:
-            codeList=[]
-            codeList.append('KW-20230925-0002')
+            # codeid = redicPullData("BaiduSearch:WordsCode", r)
+            # if codeid:
+            #     pass
+            # else:
+            #     PutWords(codeList, r)
+            # #codeList.append('KW-20220108-0004')
+            # logger.info(f'开始采集{codeid}')
+            codeList = [
+                'KW-20220809-0004',
+                'KW-20220524-0004',
+                'KW-20220809-0005',
+                'KW-20220824-0001',
+                'KW-20220809-0002',
+                'KW-20220809-0003',
+                'KW-20220826-0001',
+                'KW-20220602-0003',
+                'KW-20220602-0002',
+                'KW-20220113-0007',
+                'KW-20220113-0006',
+                'KW-20220108-0004',
+                'KW-20220113-0004'
+            ]
             for codeid in codeList:
                 try:
                     # keymsg=baiduTaskJob.getkafka()
@@ -207,7 +269,7 @@ if __name__ == '__main__':
                     continue
                 if kwList:
                     # create a thread pool with a fixed number of workers
-                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                         # submit the tasks to the pool, one item per task
                         results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
                         # collect the results of the tasks
......
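The `PutWords`/`redicPullData` helpers added above implement a simple Redis work queue: producers `lpush` word codes onto a list and consumers `lpop` them one at a time, reconnecting if the connection has gone stale. A hedged round-trip sketch (host and credentials are the values hard-coded in the hunk; treat them as deployment-specific):

```python
import redis

r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
r.lpush("BaiduSearch:WordsCode", 'KW-20220809-0004')  # producer side
item = r.lpop("BaiduSearch:WordsCode")                # consumer side
print(item.decode() if item else None)  # -> 'KW-20220809-0004'
```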