Commit 654d0ce5 Author: 薛凌堃

1/17

Parent 9fc0f9da
......@@ -126,7 +126,7 @@ def delete_url(article_url):
else:
return False
def uptoOBS(pdf_url, name_pdf, type_id, pathType):
def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
retData = {'state': False, 'type_id': type_id, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
......@@ -134,10 +134,17 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if header:
response = requests.get(pdf_url, headers=header, verify=False, timeout=20)
else:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code == 200:
pass
else:
return retData
file_size = int(response.headers.get('Content-Length'))
break
except:
except Exception as e:
time.sleep(3)
continue
page_size = 0
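
The change above threads an optional per-site `header` argument from download() into uptoOBS(): when the caller passes a non-empty dict it is used for the PDF request, otherwise the module-level `headers` still applies. A minimal sketch of that fallback pattern, assuming a stand-in default header dict and URL:

import requests

DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the module-level headers

def fetch_pdf(pdf_url, header=None, timeout=20):
    # Prefer the caller-supplied headers, fall back to the default set
    effective_headers = header if header else DEFAULT_HEADERS
    resp = requests.get(pdf_url, headers=effective_headers, verify=False, timeout=timeout)
    resp.raise_for_status()
    return resp.content
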
......@@ -175,7 +182,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType):
# 下载pdf文件,上传至服务器
def download(data, order_by):
def download(data, order_by, header):
url_pdf = data['url_pdf']
name_pdf = data['title']
if '.pdf' not in name_pdf:
......@@ -251,7 +258,7 @@ def download(data, order_by):
url_pdf = 'https://' + url_pdf
# 文件上传到obs
retData = uptoOBS(url_pdf, name_pdf, 4, pathType)
retData = uptoOBS(url_pdf, name_pdf, 4, pathType, header)
if retData['state']:
pass
else:
......@@ -495,7 +502,7 @@ def Mob():
'sid': '1662008807781212161', # 信息源id
}
order_by = 1
download(dic_post,order_by)
download(dic_post,order_by, '')
order_by += 1
......@@ -553,7 +560,7 @@ def yidong_guanxiangtai():
'sid': '1662008276140597250', # 信息源id
}
order_by = 1
download(dic_post, order_by)
download(dic_post, order_by, '')
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -659,7 +666,7 @@ def getnews(browser):
'sid': '1662008524476948481', # 信息源id
}
order_by = 1
download(dic_post, order_by)
download(dic_post, order_by, '')
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -774,7 +781,7 @@ def ke36():
'sid': '1662008421217378306', # 信息源id
}
order_by = 1
# download(dic_post, order_by)
download(dic_post, order_by, '')
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -824,7 +831,7 @@ def qianyanzhishiku():
'sid': '1662008620631367682', # 信息源id
}
order_by = 1
download(dic_post, order_by)
download(dic_post, order_by, '')
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......@@ -904,6 +911,12 @@ def shijiejingjiluntan():
url = f'https://cn.weforum.org/publications/?page={i}'
browser.get(url) # 跳到指定页面
time.sleep(5)
# collect the cookies from the Selenium browser session into a dict
cookie_list = browser.get_cookies()
cookies = {}
for cookie in cookie_list:
cookies[cookie['name']] = cookie['value']
# print(cookies)
wait = WebDriverWait(browser, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "wef-184hs11")))
page_source = browser.page_source # 获取页面信息
......@@ -930,13 +943,58 @@ def shijiejingjiluntan():
info_href = tag.find('a').get('href')
res_info = requests.get(info_href)
soup_info = BeautifulSoup(res_info.content,'html.parser')
header ={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Cookie': '_vwo_uuid_v2=D73B6BDECC63FD6C529BF86681F8CD58B|c28a87d1a3cc7168448309232b08e8b6; _vwo_uuid=D73B6BDECC63FD6C529BF86681F8CD58B; CookieConsent={stamp:%27Ri/OIoTKx+9q/VGLC27KMJW8eBEK5YnHm+js/rEyWz9novAMhCvuFQ==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27explicit%27%2Cver:1%2Cutc:1702452209760%2Cregion:%27cn%27}; _gcl_au=1.1.1801193883.1702452210; mpDistinctId=abc6849e-26de-4c82-b884-28fa56155633; _parsely_session={%22sid%22:7%2C%22surl%22:%22https://cn.weforum.org/publications/%22%2C%22sref%22:%22%22%2C%22sts%22:1705367422607%2C%22slts%22:1703642202973}; _parsely_visitor={%22id%22:%22pid=7df1e8df-edda-4b0b-abf2-7342814bd9f7%22%2C%22session_count%22:7%2C%22last_session_ts%22:1705367422607}; _gid=GA1.2.1315176656.1705367430; _vis_opt_s=4%7C; _vis_opt_test_cookie=1; _web_session=c2F5TXRsb2NWK0dhL0Q2YXIzdHY2Wkx6eTk2ZlYyRHRDM20wYVVFV3R0VkxxT09DWHJJaHpEeWNzZ0RqM2dDQlpmWWdjUjlQblgzUHBoT0Q4bGdBdmFkUjg2MC9WTURTSmpUNm1NSnZHQ2hpZHVLbWpOSHEra01ucnNOWlMyNWFTa2NpTWNpSHIxbGcwRzZXbXhGTmFRPT0tLUwvcmovcHovSFB3Y0FxNTkrdkUzWEE9PQ%3D%3D--1f41b0e467ab72bcb2632361eedb0ac2c73bbcb1; _ga_1RV0X04XBG=GS1.1.1705367429.7.1.1705369590.0.0.0; _ga_2K5FR2KRN5=GS1.1.1705367430.7.1.1705369590.48.0.0; mp_6232aeb08818ee1161204a011ed8ad16_mixpanel=%7B%22distinct_id%22%3A%20%22abc6849e-26de-4c82-b884-28fa56155633%22%2C%22%24device_id%22%3A%20%2218c620f58ad7028-0ac659fa5c3fcc-26001951-e1000-18c620f58ad7028%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%2C%22%24user_id%22%3A%20%22abc6849e-26de-4c82-b884-28fa56155633%22%2C%22platform%22%3A%20%22Public%20Site%22%7D; _ga=GA1.2.1351018774.1702452201; _ga_4DKG1LX6QK=GS1.1.1705367445.7.1.1705369592.57.0.0; _vwo_sn=2921458; _vwo_ds=3%3Aa_0%2Ct_0%3A0%241702452201%3A26.98833337%3A%3A26_0%3A4_0%2C3_0%3A3',
'Host': 'cn.weforum.org',
# 'If-None-Match': 'W/"f45caaa1faa8a197c0da29d6c90fa0e8"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
cookies = json.dumps(cookies)
cookies_ = json.loads(cookies)
s = requests.session()
s.cookies.update(cookies_)
response = s.get(url=info_href, headers=header, verify=False)
# jar = requests.cookies.RequestsCookieJar() # build a RequestsCookieJar object first
# for i in cookie_list:
# # pass the name and value of every cookie grabbed on the selenium side into the RequestsCookieJar
# # domain and path are optional; they are added so that, when cookies share a name but differ in scope, a later one does not overwrite an earlier one
# jar.set(i['name'], i['value'], domain=i['domain'], path=i['path'])
#
# session = requests.session() # access the site through a requests session
# session.cookies.update(jar) # attach the prepared RequestsCookieJar to the requests session
# req = requests.Request(method='GET', url=url, headers=header)
# rpe = session.send(session.prepare_request(req),
# verify=False, # verify=False to skip SSL certificate verification
# timeout=10)
# rpe = requests.get(url=info_href, headers=header)
# res_info = session.get(url=info_href, headers=header)
if response.status_code == 200:
pass
else:
response = s.get(url=info_href, headers=header)
soup_info = BeautifulSoup(response.content,'html.parser')
info_content = soup_info.find('div',{'class':'small-12 medium-8 columns'}).text.strip()
dic_post = {
'title': info_title, # 报告名称
'url_pdf': info_pdf, # 报告链接
......@@ -953,7 +1011,7 @@ def shijiejingjiluntan():
'sid': '1662008019231088642', # 信息源id
}
order_by = 1
download(dic_post, order_by)
download(dic_post, order_by, header)
order_by += 1
# print(page,dic_post)
# url = 'http://114.115.155.139:5002/report_download'
......
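
The weforum section above copies the cookies that Selenium has accumulated into a requests session before fetching each detail page, so the plain HTTP request carries the same anti-bot/session state as the browser. A minimal standalone sketch of that handoff, assuming a local chromedriver and using the publications page as a placeholder URL:

import requests
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://cn.weforum.org/publications/')

session = requests.session()
for cookie in browser.get_cookies():
    # carry name/value plus domain and path so same-named cookies keep their scope
    session.cookies.set(cookie['name'], cookie['value'],
                        domain=cookie.get('domain'), path=cookie.get('path', '/'))

resp = session.get('https://cn.weforum.org/publications/', verify=False, timeout=20)
print(resp.status_code)
browser.quit()
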
......@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(20)
time.sleep(40)
s = requests.session()
#获取到token和cookies
......
......@@ -28,9 +28,9 @@ r = baseCore.r
def resHtml(token,url,cookies):
try:
ip = baseCore.get_proxy()
s=requests.session()
# s=requests.session()
cookie_jar = requests.utils.cookiejar_from_dict(cookies, cookiejar=None, overwrite=True)
s = requests.session()
s.cookies = cookie_jar
# json_search = s.get(url, headers=headers, proxies=ip, verify=False).json()
json_search = s.get(url, headers=headers, proxies=ip,verify=False).json()
......@@ -142,7 +142,8 @@ def updateCookieToken(token,cookies):
cnx_.commit()
#获取token
def getToken():
cursor_.execute(f"select token,cookies from weixin_tokenCookies where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
# cursor_.execute(f"select token,cookies from weixin_tokenCookies where fenghao_time < DATE_SUB(NOW(), INTERVAL 2 HOUR) order by update_time asc limit 1")
cursor_.execute(f"select token,cookies from weixin_tokenCookies where user_name = 'wahaha2'")
row = cursor_.fetchall()
cnx_.commit()
if row:
......
......@@ -215,8 +215,8 @@ def spider_zhuanli(com_name, social_code, tycid):
def runSpider():
# 根据从Redis中拿到的社会信用代码, 在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
social_code = '91360400794798498A'
social_code = baseCore.redicPullData('ZhuanLi:gnshSocial_code')
# social_code = '91360400794798498A'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
# time.sleep(20)
......@@ -331,6 +331,6 @@ if __name__ == '__main__':
while True:
start = time.time()
num_threads = 1
num_threads = 5
run_threads(num_threads)
log.info(f'5线程 总耗时{time.time()-start}秒')
......@@ -3,6 +3,7 @@ from urllib.parse import urljoin
import langid
import pymysql
from gne import GeneralNewsExtractor
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
......@@ -251,10 +252,10 @@ class GoogleSpider(object):
print('时间解析异常!!')
return publishtime
@retry(tries=3, delay=3)
# 获取每一页数据, 开趴.
def get_page_html(self):
self.logger.info("进入google首页..." )
self.logger.info(f"{self.searchkw}...进入google首页...")
self.driver.get(self.url)
# 等待页面加载完成
time.sleep(3)
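
The @retry(tries=3, delay=3) decorator added above comes from the third-party retry package: it re-invokes the wrapped method whenever it raises, up to three attempts spaced three seconds apart, and re-raises the last failure. A minimal illustration of the same decorator on a throwaway function:

from retry import retry

@retry(tries=3, delay=3)
def flaky_call():
    # any exception triggers another attempt; after the third failure it propagates
    raise ConnectionError('simulated transient failure')
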
......@@ -280,9 +281,9 @@ class GoogleSpider(object):
time.sleep(2)
self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
except Exception as e:
self.logger.info(f'--点击按钮失效----{e}')
self.logger.info(f'--{self.searchkw}--点击按钮失效----{e}')
return
self.logger.info("开始抓取首页..." + self.searchkw )
self.logger.info(f"{self.searchkw}...开始抓取首页...")
time.sleep(5)
flag, lists = self.parse_page()
if len(lists)<1:
......@@ -291,19 +292,23 @@ class GoogleSpider(object):
durl=detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}")
continue
self.detailList.put(detail)
response = self.driver.page_source
html = etree.HTML(response)
hasnext = html.xpath('//table[@class="AaVjTc"]//td[last()]//text()')[0]
hasnext = hasnext.strip()
try:
hasnext = html.xpath('//table[@class="AaVjTc"]//td[last()]//text()')[0]
hasnext = hasnext.strip()
except:
hasnext = ''
timeFlag = False
while hasnext == '下一页':
if self.page_num==5:
break
# if self.page_num==5:
# break
self.page_num = self.page_num + 1
self.logger.info("开始抓取第%s页..." % self.page_num)
self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
try:
self.driver.find_element(By.XPATH, '//a[@id="pnnext"]').click()
except Exception as e:
......@@ -311,17 +316,19 @@ class GoogleSpider(object):
time.sleep(5)
flag, lists = self.parse_page()
for detail in lists:
durl = detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl)
if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}")
continue
publishTag=detail['publishTag']
if publishTag:
pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
needDate='2022-01-01 00:00:00'
needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
if pubtime < needTime:
timeFlag = True
break
# if publishTag:
# pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
# needDate='2022-01-01 00:00:00'
# needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
# if pubtime < needTime:
# timeFlag = True
# break
self.detailList.put(detail)
if timeFlag:
break
......@@ -333,7 +340,7 @@ class GoogleSpider(object):
self.logger.info(hasnext)
except Exception as e:
hasnext=''
self.logger.info("抓取完毕")
self.logger.info(f"{self.searchkw}...列表抓取完毕")
def getRequest(self,url):
html=''
......@@ -361,11 +368,12 @@ class GoogleSpider(object):
def get_detail_html(self):
while True:
if self.detailList.qsize() != 0:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
try:
detailmsg=self.detailList.get()
title = detailmsg['title']
detailUrl = detailmsg['detailUrl']
self.logger.info("%s:%s\n" % (title, detailUrl))
# try:
# self.driver.get(detailUrl)
# except Exception as e:
......@@ -398,15 +406,17 @@ class GoogleSpider(object):
try:
self.sendkafka(processitem)
self.r.sadd('pygoogle_'+self.wordsCode, processitem['sourceAddress'])
# 插入数据库
try:
items = []
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info(f"插入数据库失败!{bdetail['kword']}===={detailUrl}")
self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
except Exception as e:
self.logger.info("放入kafka失败!")
#插入数据库
try:
items=[]
items.append(bdetail)
self.itemInsertToTable(items)
except Exception as e:
self.logger.info(f"插入数据库失败==={e}")
self.logger.info(f"放入kafka失败!{bdetail['kword']}===={detailUrl}")
# 关闭当前新窗口
# self.driver.close()
time.sleep(1)
......@@ -508,10 +518,11 @@ class GoogleSpider(object):
if publishDate=='':
return
kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
self.logger.info("数据发送kafka成功")
# self.logger.info("数据发送kafka成功")
self.logger.info(kafka_result.get(timeout=10))
except Exception as e:
self.logger.info('发送kafka异常')
pass
# self.logger.info('发送kafka异常')
finally:
producer.close()
......
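
In sendkafka() above, the fixed success message is replaced by logging kafka_result.get(timeout=10): with kafka-python, producer.send() returns a future, and get() blocks until the broker acknowledges the record (or the timeout expires), returning metadata such as topic, partition and offset. A small sketch of that call pattern, with a placeholder broker address:

import json
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
future = producer.send('crawlerInfo', json.dumps({'title': 'demo'}, ensure_ascii=False).encode('utf8'))
metadata = future.get(timeout=10)  # raises a KafkaError on failure instead of failing silently
print(metadata.topic, metadata.partition, metadata.offset)
producer.close()
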
......@@ -26,7 +26,7 @@ baseCore = BaseCore()
class Translate():
def __init__(self):
self.url = "https://fanyi.baidu.com/#"
self.url = "https://fanyi.baidu.com/#{}/{}/{}"
self.header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
self.browser = self.createDriver()
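
The url template now carries the source language, target language and text in the URL fragment; elsewhere in this commit it is filled in with format() and percent-encoded with quote(..., safe='/:#') so the '#' and '/' separators survive. A small sketch of that URL construction with sample values:

from urllib.parse import quote

url_template = "https://fanyi.baidu.com/#{}/{}/{}"
url = quote(url_template.format('en', 'zh', 'hello world'), safe='/:#')
print(url)  # https://fanyi.baidu.com/#en/zh/hello%20world
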
......@@ -53,7 +53,7 @@ class Translate():
profile.set_preference('network.proxy.ssl',proxy_['http'].split('://')[1].split(':')[0])
profile.set_preference('network.proxy.ssl_port',int(proxy_['http'].split('://')[1].split(':')[1]))
profile.update_preferences()
service = Service(r'F:\spider\firefox\geckodriver_1.exe')
service = Service(r'D:\soft\geckodriver.exe')
options = Options()
options.set_preference("general.useragent.override",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
......
#coding:utf-8
#coding:utf-8
......@@ -20,19 +20,19 @@ from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType
from func_timeout import func_set_timeout
from base.BaseCore import BaseCore
baseCore = BaseCore()
class Translate():
def __init__(self):
self.url = "https://fanyi.baidu.com/#"
self.url = "https://fanyi.baidu.com/#{}/{}/{}"
# self.url = "https://fanyi.baidu.com/#"
self.header = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}
self.browser = self.createDriver()
self.db_storage = \
pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').中科软[
'数据源_0106']
'数据源_0504']
def close(self):
self.browser.quit()
......@@ -53,7 +53,7 @@ class Translate():
profile.set_preference('network.proxy.ssl',proxy_['http'].split('://')[1].split(':')[0])
profile.set_preference('network.proxy.ssl_port',int(proxy_['http'].split('://')[1].split(':')[1]))
profile.update_preferences()
service = Service(r'F:\spider\firefox\geckodriver_1.exe')
service = Service(r'D:\soft\geckodriver.exe')
options = Options()
options.set_preference("general.useragent.override",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
......@@ -71,6 +71,11 @@ class Translate():
def translate(self, sentence, lang):
sentence_ = sentence
wait = WebDriverWait(self.browser, 20)
# if lang == 'jp':
# url = self.url.format('jp', 'zh', sentence_)
# url = quote(url, safe='/:#')
# self.browser.set_page_load_timeout(10)
# else:
try:
word_type = self.get_input_language_type(sentence_, wait)
except:
......@@ -87,19 +92,20 @@ class Translate():
url = self.url.format(word_type, 'zh', sentence_)
url = quote(url, safe='/:#')
self.browser.set_page_load_timeout(10)
try:
self.browser.get(url)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')))
result_ = self.browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')
result = result_.text.strip()
return result
except:
self.browser.quit()
self.browser = self.createDriver()
result = self.translate(sentence_, lang)
return result
try:
self.browser.get(url)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')))
result_ = self.browser.find_element(By.XPATH,
'//*[@id="main-outer"]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p[2]')
result = result_.text.strip()
return result
except Exception as e:
self.browser.quit()
self.browser = self.createDriver()
result = self.translate(sentence_, lang)
return result
@func_set_timeout(90)
def get_input_language_type(self, word, wait):
......@@ -107,6 +113,7 @@ class Translate():
wait.until(EC.presence_of_element_located((By.ID, "baidu_translate_input")))
input_word = self.browser.find_element(By.ID, "baidu_translate_input")
input_word.send_keys(word)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//*[@id="main-outer"]/div/div/div[1]/div[1]/div[1]/a[1]/span/span')))
word_type = self.browser.find_element(By.XPATH,
......@@ -131,25 +138,6 @@ class Translate():
sentences.append((0, len(contentWithTag), contentWithTag))
return sentences
def jionstr(self, html):
paragraphs = []
current_sentence = ''
for tag in html.find_all(text=True):
sentence = str(tag)
if sentence == '\n' or sentence == '\t' or sentence == ' ':
continue
if self.is_punctuation(sentence):
continue
if sentence.startswith('https://') or sentence.startswith('http://') or sentence.startswith('www.'):
continue
# 检查拼接后的句子长度是否超过1000字
if len(current_sentence) + len(sentence) <= 1000:
current_sentence += sentence
else:
paragraphs.append(current_sentence.strip())
current_sentence = sentence
return paragraphs
def gethtml(self, contentWithTag):
tag_list = []
html = BeautifulSoup(contentWithTag, 'html.parser')
......@@ -169,14 +157,15 @@ class Translate():
continue
sentence += f'{tag}😊'
num += 1
#print(num)
# print(num)
# if len(sentence) == 1:
# continue
# if sentence == '\n' or sentence == '\t' or sentence == ' ':
# continue
# if self.is_punctuation(sentence):
# continue
#print(sentence)
print(sentence)
# before_ =
result = ''
while True:
if len(sentence.strip()) == 1 and self.is_punctuation(sentence.strip()):
......@@ -213,7 +202,7 @@ class Translate():
result += self.translate(sentence, lang)
time.sleep(2)
break
#print(result)
print(result)
sentences = result.split('😊')
#print(len(sentences))
num = 0
......@@ -224,17 +213,20 @@ class Translate():
continue
#print(num,tag)
sentence = sentences[num]
print(tag)
print(sentence)
tag.replace_with(sentence)
num += 1
return str(html.prettify()) + '<p/><br>译文来源:微软自动翻译<br></p>'
# if __name__ == "__main__":
# test = Translate()
# db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
# '数据源_0504']
# data = db_storage.find_one({'_id': ObjectId('656f14e84d6d77428c713271')})
# a = data['richTextForeign']
# result = test.gethtml(a)
# print(result)
# test.close()
\ No newline at end of file
if __name__ == "__main__":
test = Translate()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软[
'数据源_0106']
data = db_storage.find_one({'_id': ObjectId('65a4a2dafcaa02e755b786b1')})
a = data['richTextForeign']
# a = """<h4><strong>The Small Print:</strong><br> </h4> \n <h4><sup>1 </sup>Based on performance estimated with measurements on 12th Gen Intel Core i9-12900HX with RTX 3080Ti against Intel Core i9-11980HK with RTX 3080, Intel Core i9-12900HK with RTX 3080Ti, AMD Ryzen 9 6900HX with RTX 3060, AMD Ryzen 9 6900HS with Radeon 6700S, Intel Core i7-12700H with RTX 3050Ti and Apple M1 Max MacBook Pro with 32 core integrated GPU. Best available compilers selected for all processors. Binaries compiled with ICC for Intel/AMD, binaries compiled with Xcode 13.1 for Apple. The metric used is the geometric mean of C/C++ integer benchmarks in SPEC*int_rate_base2017 2021.2 LLVM (1-copy) and SPEC*int_rate_base2017 2021.2 LLVM (n-copy). See <a href=\"http://www.intel.com/PerformanceIndex\">www.intel.com/PerformanceIndex</a> for additional workload and configuration details. Results may vary. Other names and brands may be claimed as the property of others.</h4> \n <h4><sup>2</sup> Subject to 6 GHz band availability, operating system support, and router compatibility. Details at <a href=\"http://www.intel.com/PerformanceIndex\">www.intel.com/PerformanceIndex</a> (connectivity)</h4> \n <h4>Performance varies by use, configuration and other factors. Learn more at <a href=\"https://edc.intel.com/content/www/us/en/products/performance/benchmarks/overview/\">www.intel.com/PerformanceIndex</a>.</h4> \n <h4>Performance results are based on testing as of dates shown in configurations and may not reflect all publicly available updates. See configuration disclosure for details.</h4> \n \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n </div> \n</div>"""
result = test.gethtml(a)
print(result)
test.close()
\ No newline at end of file
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -12,9 +12,6 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider
import concurrent.futures
......@@ -43,7 +40,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in tqdm(consumer, desc="Consuming messages"):
for record in tqdm(consumer):
try:
logger.info("value:",record.value)
keymsg=record.value
......@@ -131,7 +128,7 @@ class BaiduTaskJob(object):
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
logger.info(f"获取到关键词组:{keymsg['wordsName']}---{wordsCode}")
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
......@@ -169,25 +166,6 @@ class BaiduTaskJob(object):
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
......@@ -197,11 +175,10 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html()
except Exception as e:
try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html()
except Exception as e:
logger.info('百度搜索异常'+searchkw)
logger.info(e)
finally:
baiduSpider.driver.quit()
if baiduSpider.detailList.qsize() != 0:
......@@ -233,7 +210,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -121,7 +121,15 @@ class BaiduTaskJob(object):
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{keymsg['wordsName']}---{wordsCode}")
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
......@@ -166,6 +174,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html()
except Exception as e:
logger.info('百度搜索异常'+searchkw)
logger.error(e)
finally:
baiduSpider.driver.quit()
if baiduSpider.detailList.qsize() != 0:
......@@ -177,6 +187,22 @@ class BaiduTaskJob(object):
baiduSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
import random
def PutWords(codeList, r):
for item in codeList:
r.lpush("BaiduSearch:WordsCode", item)
logger.info('数据加载完毕')
# 从Redis的List中获取并移除一个元素
def redicPullData(key, r):
try:
r.ping()
except:
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = r.lpop(key)
return item.decode() if item else None
if __name__ == '__main__':
baiduTaskJob=BaiduTaskJob()
baseCore=BaseCore()
......@@ -185,12 +211,48 @@ if __name__ == '__main__':
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# codeList = [
# 'KW-20220809-0004',
# 'KW-20220524-0004',
# 'KW-20220809-0005',
# 'KW-20220824-0001',
# 'KW-20220809-0002',
# 'KW-20220809-0003',
# 'KW-20220826-0001',
# 'KW-20220602-0003',
# 'KW-20220602-0002',
# 'KW-20220113-0007',
# 'KW-20220113-0006',
# 'KW-20220108-0004',
# 'KW-20220113-0004'
# ]
# PutWords(codeList, r)
print('---------------')
while True:
try:
codeList=[]
codeList.append('KW-20230925-0002')
# codeid = redicPullData("BaiduSearch:WordsCode", r)
# if codeid:
# pass
# else:
# PutWords(codeList, r)
# #codeList.append('KW-20220108-0004')
# logger.info(f'开始采集{codeid}')
codeList = [
'KW-20220809-0004',
'KW-20220524-0004',
'KW-20220809-0005',
'KW-20220824-0001',
'KW-20220809-0002',
'KW-20220809-0003',
'KW-20220826-0001',
'KW-20220602-0003',
'KW-20220602-0002',
'KW-20220113-0007',
'KW-20220113-0006',
'KW-20220108-0004',
'KW-20220113-0004'
]
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
......@@ -207,7 +269,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
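
The PutWords/redicPullData helpers added in this file treat a Redis list as a simple work queue: codes are loaded with lpush and consumed one at a time with lpop, reconnecting when the connection has gone stale. A usage sketch, assuming those two helpers are in scope and using a placeholder local Redis instead of the production host:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=6)  # placeholder connection
PutWords(['KW-20220809-0004', 'KW-20220524-0004'], r)
while True:
    codeid = redicPullData('BaiduSearch:WordsCode', r)
    if codeid is None:
        break
    print('crawling', codeid)
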