Commit 9d49a0cd by XveLingKun

Google search (谷歌搜索)

Parent 252c04d3
import datetime
import os
import random
import redis
import sys
import time
import logbook
@@ -211,12 +213,18 @@ class BaseCore:
        try:
            self.__cursor_proxy.close()
            self.__cnx_proxy.close()
            self.cursor_.close()
            self.cnx_.close()
        except:
            pass

    def __init__(self):
        self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
        self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
                                           charset='utf8mb4')
        self.__cursor_proxy = self.__cnx_proxy.cursor()
        self.cnx_ = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project',
                                    charset='utf8mb4')
        self.cursor_ = self.cnx_.cursor()
        pass

    # Measure elapsed time
@@ -347,4 +355,42 @@ class BaseCore:
            ip_list.append(proxy)
        return ip_list
\ No newline at end of file
    # Pop and remove one element from a Redis list
    def redicPullData(self, key):
        try:
            self.r.ping()
        except:
            self.r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
        item = self.r.lpop(key)
        return item.decode() if item else None
    def getSidName(self, sid):
        # Parameterized query instead of interpolating sid into the SQL string
        sqlSelect = "SELECT words_name FROM `key_words` WHERE id = %s"
        self.cursor_.execute(sqlSelect, (sid,))
        data = self.cursor_.fetchone()[0]
        return data
    # Get the PID of the current script process
    def getPID(self):
        PID = os.getpid()
        return PID

    def getUniqueCode(self, abbr, serverId, threadId):
        while True:
            timeCode = self.r.blpop(['timeCode:google'], 2)
            if timeCode:
                timeCode = timeCode[1]
                timeCode = timeCode.decode('utf-8')
                break
            else:
                time.sleep(2)
        pid = str(self.getPID())
        if len(pid) < 4:
            pid = pid.zfill(4)
        elif len(pid) > 4:
            pid = pid[0:4]
        uniqueCode = abbr + str(datetime.datetime.now().strftime('%Y%m%d'))[2:] + serverId + pid + str(threadId) + str(timeCode)
        return uniqueCode
\ No newline at end of file
@@ -2,7 +2,7 @@ from urllib.parse import urljoin
import langid
import pymysql
from gne import GeneralNewsExtractor
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
@@ -15,11 +15,12 @@ import threading
import time
from lxml import etree
from queue import Queue
import re, sys
import datetime
import redis
from kafka import KafkaProducer
import json
import uuid
from baseCore import BaseCore
import configparser
@@ -31,14 +32,15 @@ import requests
# Extract plain text from HTML
from bs4 import BeautifulSoup
class GoogleSpider(object):

    def __init__(self, threadId, searchkw, wordsCode, sid, item, bangdan_name):
        # Create a ConfigParser object
        self.config = configparser.ConfigParser()
        # Read the config file
        self.config.read('config.ini')
        self.baseCore = BaseCore()
        self.logger = self.baseCore.getLogger()
        # self.url = f'https://www.google.com/search?q={searchkw}&tbm=nws&source=lnms&sa=X&ved=2ahUKEwicke6y37OAAxWJGIgKHQWAASUQ0pQJegQIDRAB&biw=1366&bih=372&dpr=1'
        # self.url = f'https://www.google.com.hk/search?q={searchkw}&sca_esv=555819424&tbs=sbd:1&tbm=nws&ei=CezVZPaGCaqC4-EPqZi_oAk&start=90&sa=N&ved=2ahUKEwi2r_qGk9SAAxUqwTgGHSnMD5QQ8tMDegQIAhAU&biw=1366&bih=619&dpr=1'
        self.url = f'https://www.google.com.hk'
@@ -46,12 +48,15 @@ class GoogleSpider(object):
                            port=self.config.get('redis', 'port'),
                            password=self.config.get('redis', 'pass'), db=0)
        self.page_num = 1
        chrome_driver = self.config.get('selenium', 'chrome_driver')
        self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
        path = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
        chrome_options.add_argument(rf'user-data-dir=D:\seleniumTmp\baidu{uuid.uuid1()}')
        chrome_options.add_argument("--disable-component-update")
        chrome_options.add_argument("--disable-extensions")
        self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
        # driver = webdriver.Chrome(chrome_options=chrome_options)
        self.qtitle = Queue()
        self.qurl = Queue()
@@ -59,33 +64,41 @@ class GoogleSpider(object):
        self.searchkw = searchkw
        self.wordsCode = wordsCode
        self.sid = sid
        self.threadId = threadId
        self.item = item
        self.bangdan_name = bangdan_name
    def createDriver(self):
        chrome_driver = self.config.get('selenium', 'chrome_driver')
        path = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
        # Set a proxy
        # proxy = "127.0.0.1:8080"  # proxy address and port
        # chrome_options.add_argument('--proxy-server=http://' + proxy)
        self.driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
    # Insert the scraped items into the result table
    def itemInsertToTable(self, items):
        itemdata = []
        conx, cursorM = self.connMysql()
        companyinfo = self.item
        social_code = str(companyinfo.split('|')[0])
        ch_name = companyinfo.split('|')[1]
        en_name = companyinfo.split('|')[2]
        rank = self.bangdan_name + '|' + str(companyinfo.split('|')[3])
        for item in items:
            nowtime = self.getNowDate()
            data = (social_code, en_name, ch_name, rank, item['title'], item['content'], item['detailurl'], item['publishtime'], item['source'], nowtime)
            itemdata.append(data)
        sql = "INSERT into Company_layoff (企业信用代码,企业英文名称,企业中文名称,所在榜单排名,标题,内容,链接,发布时间,来源,创建时间) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        cursorM.executemany(sql, itemdata)
        self.logger.info("数据插入数据库成功!")
        # Commit the insert defined above
        conx.commit()
        self.closeSql(conx, cursorM)
    def connMysql(self):
        # Create the MySQL connection
@@ -95,62 +108,63 @@ class GoogleSpider(object):
                               database=self.config.get('mysql', 'database'))
        # Create a cursor
        cursorM = conx.cursor()
        return conx, cursorM

    def closeSql(self, conx, cursorM):
        # Close the cursor and the connection
        cursorM.close()
        conx.close()
    # Parse the search results page
    def parse_page(self):
        try:
            response = self.driver.page_source
            html = etree.HTML(response)
            lists = self.xpath_paser(html)
            flag = html.xpath('//tr[@jsname="TeSSVd"]//a[last()]//@class')[0]
        except:
            lists = []
            flag = ''
        return flag, lists

    def xpath_paser(self, html):
        lists = []
        itemTag = html.xpath('//div[@class="SoaBEf"]')
        for itemTag in itemTag:
            try:
                title = itemTag.xpath('.//div[@class="n0jPhd ynAwRc MBeuO nDgy9d"]/text()')[0]
                title = str(title)
            except Exception as e:
                title = ''
            try:
                detailUrl = itemTag.xpath('.//a[@class="WlydOe"]/@href')[0]
                detailUrl = str(detailUrl)
            except Exception as e:
                detailUrl = ''
            try:
                sourceTag = itemTag.xpath('.//div[@class="MgUUmf NUnG9d"]//text()')[0]
                sourceTag = str(sourceTag)
            except Exception as e:
                print(e)
                sourceTag = ''
            try:
                publishTag = itemTag.xpath('.//div[@class="OSrXXb rbYSKb LfVVr"]/span/text()')[0]
                publishTag = str(publishTag)
                publishtime = self.paserTime(publishTag)
                publishTag = publishtime.strftime("%Y-%m-%d %H:%M:%S")
            except Exception as e:
                publishTag = ''
            detailmsg = {
                'title': title,
                'detailUrl': detailUrl,
                'sourceTag': sourceTag,
                'publishTag': publishTag
            }
            lists.append(detailmsg)
        return lists
    # Get the current time
    def getNowDate(self):
        # Get the current time
@@ -159,13 +173,13 @@ class GoogleSpider(object):
        currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
        return currentdate
    def webDriver(self, url):
        chrome_driver = self.config.get('selenium', 'chrome_driver')
        path = Service(chrome_driver)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.binary_location = self.config.get('selenium', 'binary_location')
        driver = webdriver.Chrome(service=path, chrome_options=chrome_options)
        html = ''
        try:
            driver.get(url)
            # Wait for the page to finish loading
@@ -173,7 +187,7 @@ class GoogleSpider(object):
            driver.refresh()
            wait = WebDriverWait(driver, 20)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            html = driver.page_source
        except Exception as e:
            self.logger.info('请求失败')
        finally:
@@ -182,77 +196,81 @@ class GoogleSpider(object):
        return html
    def extractorMsg(self, url, title):
        content = ''
        contentWithTag = ''
        lang = ''
        try:
            lang = self.detect_language(title)
            sm = SmartExtractor(lang)
            article = sm.extract_by_url(url=url)
            content = article.cleaned_text
            contentWithTag = article.text
        except Exception as e:
            try:
                raw_html = self.webDriver(url)
                sm = SmartExtractor(lang)
                article = sm.extract_by_html(raw_html)
                content = article.cleaned_text
                contentWithTag = article.text
            except Exception as e:
                print('抽取失败!!')
        return content, contentWithTag
    def paserTime(self, publishtime):
        timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
        current_datetime = datetime.datetime.now()
        publishtime = publishtime.strip()
        print(publishtime)
        try:
            if '年前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day = int(numbers[0])
                delta = datetime.timedelta(days=365 * day)
                publishtime = current_datetime - delta
            elif '月前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day = int(numbers[0])
                delta = datetime.timedelta(days=30 * day)
                publishtime = current_datetime - delta
            elif '周前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day = int(numbers[0])
                delta = datetime.timedelta(weeks=day)
                publishtime = current_datetime - delta
            elif '天前' in publishtime:
                numbers = re.findall(r'\d+', publishtime)
                day = int(numbers[0])
                delta = datetime.timedelta(days=day)
                publishtime = current_datetime - delta
            elif '前天' in publishtime:
                delta = datetime.timedelta(days=2)
                publishtime = current_datetime - delta
            elif '昨天' in publishtime:
                current_datetime = datetime.datetime.now()
                delta = datetime.timedelta(days=1)
                publishtime = current_datetime - delta
            elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
                delta = datetime.timedelta(hours=5)
                publishtime = current_datetime - delta
            elif '年' in publishtime and '月' in publishtime:
                time_format = '%Y年%m月%d日'
                publishtime = datetime.datetime.strptime(publishtime, time_format)
            elif '月' in publishtime and '日' in publishtime:
                current_year = current_datetime.year
                time_format = '%Y年%m月%d日'
                publishtime = str(current_year) + '年' + publishtime
                publishtime = datetime.datetime.strptime(publishtime, time_format)
        except Exception as e:
            print('时间解析异常!!')
        return publishtime
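    # Illustrative example only (not part of the original commit): assuming the current time
    # is 2024-05-16 12:00:00, paserTime('3天前') returns 2024-05-13 12:00:00, while
    # paserTime('5月10日') is first expanded to '2024年5月10日' and then parsed with the
    # '%Y年%m月%d日' format string.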
    @retry(tries=3, delay=3)
    def get_buket_news(self):
        self.driver.find_element('xpath', '//div[contains(@class, "YmvwI") and contains(text(), "新闻")]').click()

    @retry(tries=3, delay=3)
    # Fetch every results page and crawl it.
    def get_page_html(self):
        self.logger.info(f"{self.searchkw}...进入google首页...")
@@ -266,33 +284,44 @@ class GoogleSpider(object):
        search_input.send_keys(self.searchkw)
        search_input.submit()
        try:
            time.sleep(3)
            wait = WebDriverWait(self.driver, 20)
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "crJ18e")))
            time.sleep(3)
            for i in range(3):
                try:
                    self.get_buket_news()
                    break
                except Exception as e:
                    self.logger.info('点击新闻按钮失效')
                    self.driver.refresh()
                    time.sleep(3)
                    if i < 2:  # retry on the first two failures, give up after the third
                        continue
                    else:
                        return
            time.sleep(3)
            self.driver.find_element('xpath', '//div[@id="hdtb-tls"]').click()
            time.sleep(2)
            # self.driver.find_element('xpath', '//div[@class="hdtb-mn-hd"]/div[text()="按相关性排序"]').click()
            self.driver.find_element('xpath',
                                     '//*[@id="tn_1"]/span[3]/g-popup/div[1]/div/div/div[text()="按相关性排序"]').click()
            time.sleep(2)
            # self.driver.find_element('xpath', '//div[@class="YpcDnf OSrXXb HG1dvd"]/a[text()="按日期排序"]').click()
            self.driver.find_element('xpath', '//*[@id="lb"]/div/g-menu/g-menu-item[2]/div/a[text()="按日期排序"]').click()
        except Exception as e:
            self.logger.info(f'--{self.searchkw}--点击按钮失效')
            return
self.logger.info(f"{self.searchkw}...开始抓取首页...") self.logger.info(f"{self.searchkw}...开始抓取首页...")
time.sleep(5) time.sleep(5)
flag, lists = self.parse_page() flag, lists = self.parse_page()
if len(lists)<1: if len(lists) < 1:
time.sleep(6) time.sleep(6)
repeatCounts = 0 repeatCounts = 0
for detail in lists: for detail in lists:
durl=detail['detailUrl'] durl = detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl) is_member = self.r.sismember('pygoogle_' + self.wordsCode, durl)
if is_member: if is_member:
repeatCounts += 1 repeatCounts += 1
if repeatCounts / len(lists) > 0.5: if repeatCounts / len(lists) > 0.5:
...@@ -310,8 +339,8 @@ class GoogleSpider(object): ...@@ -310,8 +339,8 @@ class GoogleSpider(object):
hasnext = '' hasnext = ''
timeFlag = False timeFlag = False
while hasnext == '下一页': while hasnext == '下一页':
if self.page_num==5: # if self.page_num == 5:
break # break
self.page_num = self.page_num + 1 self.page_num = self.page_num + 1
self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...") self.logger.info(f"{self.searchkw}...开始抓取第{self.page_num}页...")
try: try:
...@@ -323,7 +352,7 @@ class GoogleSpider(object): ...@@ -323,7 +352,7 @@ class GoogleSpider(object):
repeated_counts = 0 repeated_counts = 0
for detail in lists: for detail in lists:
durl = detail['detailUrl'] durl = detail['detailUrl']
is_member = self.r.sismember('pygoogle_'+self.wordsCode, durl) is_member = self.r.sismember('pygoogle_' + self.wordsCode, durl)
if is_member: if is_member:
self.logger.info(f"{self.searchkw}已存在{detail['title']}") self.logger.info(f"{self.searchkw}已存在{detail['title']}")
repeated_counts += 1 repeated_counts += 1
...@@ -331,14 +360,14 @@ class GoogleSpider(object): ...@@ -331,14 +360,14 @@ class GoogleSpider(object):
self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环") self.logger.info(f"{self.searchkw}第{self.page_num}页已存在过多,跳出循环")
return return
continue continue
publishTag=detail['publishTag'] publishTag = detail['publishTag']
# if publishTag: if publishTag:
# pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S") pubtime = datetime.datetime.strptime(publishTag, "%Y-%m-%d %H:%M:%S")
# needDate='2022-01-01 00:00:00' needDate = '2022-01-01 00:00:00'
# needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S") needTime = datetime.datetime.strptime(needDate, "%Y-%m-%d %H:%M:%S")
# if pubtime < needTime: if pubtime < needTime:
# timeFlag = True timeFlag = True
# break break
self.detailList.put(detail) self.detailList.put(detail)
if timeFlag: if timeFlag:
break break
...@@ -349,37 +378,74 @@ class GoogleSpider(object): ...@@ -349,37 +378,74 @@ class GoogleSpider(object):
hasnext = hasnext.strip() hasnext = hasnext.strip()
self.logger.info(hasnext) self.logger.info(hasnext)
except Exception as e: except Exception as e:
hasnext='' hasnext = ''
self.logger.info(f"{self.searchkw}...列表抓取完毕") self.logger.info(f"{self.searchkw}...列表抓取完毕")
    def getRequest(self, url):
        html = ''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        }
        try:
            print(url)
            res = requests.get(url=url, timeout=30)
            if res.status_code == 200:
                res.encoding = res.apparent_encoding  # use the auto-detected encoding
                html = res.text
            else:
                html = ''
            if html == '':
                for i in range(1, 3):
                    time.sleep(1)
                    html = self.getRequest(url)
        except Exception as e:
            print(e)
        return html
    def sendMonitor(self, processitem):
        self.logger.info(processitem['uniqueCode'])
        sidName = self.baseCore.getSidName(processitem['sid'])
        monitor = {
            "title": processitem['title'],  # title
            "sourceAddress": processitem['sourceAddress'],  # original URL
            "uniqueCode": processitem['uniqueCode'],  # unique code: crawler type + 6-digit date + server id + thread id + custom number
            "operateType": "DATA_CRAWLER",  # operation type, hardcoded
            "handlerBody": {
                "success": True,  # success flag, hardcoded
                "handlerStatus": "CRAWLED"  # handler status, hardcoded
            },
            "source": {
                "sourceId": processitem['sid'],  # source id
                "sourceName": sidName,  # source name
                "sourceType": 4,  # source type, see the sourceType enum
            },
            "processTime": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # processing time, yyyy-MM-dd HH:mm:ss
            "server": {
                "serverIp": "94.74.96.195",  # server IP
                "serverHostName": "数据采集服务",  # server name
                "processId": self.baseCore.getPID()  # process id
            }
        }
        producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers], max_request_size=1024 * 1024 * 20,
                                 api_version=(2, 7, 0))
        try:
            kafka_result = producer.send("crawlerInfo", json.dumps(monitor, ensure_ascii=False).encode('utf8'))
            self.logger.info('监控数据发送Kafka成功')
        except Exception as e:
            monitor = json.dumps(monitor, ensure_ascii=False)
            monitorDic = {
                'lifecycle_data_crawler': monitor
            }
            self.baseCore.r.xadd('data_lifecycle_log_data_crawler-redis', monitorDic, id='*')
            self.logger.info('数据监控发送Kafka失败,已放置Redis中')
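    # Sketch, not from the original commit: when the Kafka send fails, the monitor record is
    # parked in the Redis stream 'data_lifecycle_log_data_crawler-redis'. A separate consumer
    # could later drain it with something like
    # r.xread({'data_lifecycle_log_data_crawler-redis': '0'}) (redis-py Stream API) and replay
    # the 'lifecycle_data_crawler' payloads into Kafka.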
    # Fetch and parse the detail pages
    def get_detail_html(self):
        while True:
            if self.detailList.qsize() != 0:
                detailmsg = self.detailList.get()
                title = detailmsg['title']
                detailUrl = detailmsg['detailUrl']
                self.logger.info("%s:%s开始解析详情数据\n" % (title, detailUrl))
@@ -392,12 +458,12 @@ class GoogleSpider(object):
                    # self.driver.get(detailUrl)
                    # response = self.driver.page_source
                    try:
                        bdetail = self.getDetailmsg(detailmsg)
                        # 'content':content,
                        # 'contentHtml':contentWithTag,
                        content = bdetail['content']
                        contentHtml = bdetail['contentHtml']
                        if len(content) < 100:
                            continue
                        soup = BeautifulSoup(contentHtml, "html.parser")
                        # Find all elements that carry a class attribute
@@ -405,57 +471,62 @@ class GoogleSpider(object):
                        # Strip the class attribute from each element
                        for element in elements_with_class:
                            del element.attrs["class"]
                        contentHtml = str(soup)
                        bdetail['content'] = content
                        bdetail['contentHtml'] = contentHtml
                    except Exception as e:
                        self.logger.info('详情解析失败')
                        continue
                    processitem = self.getProcessitem(bdetail)
                    # uniqueCode = self.baseCore.getUniqueCode('GG', '195', self.threadId)
                    # processitem['uniqueCode'] = uniqueCode
                    try:
                        # flg = self.sendkafka(processitem)
                        flg = True
                        if flg:
                            self.r.sadd('pygoogle_' + self.wordsCode, processitem['sourceAddress'])
                            # insert into the database
                            try:
                                items = []
                                items.append(bdetail)
                                self.itemInsertToTable(items)
                            except Exception as e:
                                self.logger.info(f"插入数据库失败!{bdetail['kword']}===={e}")
                            # self.logger.info(f"放入kafka成功!{bdetail['kword']}===={detailUrl}")
                            # self.sendMonitor(processitem)
                    except Exception as e:
                        self.logger.info(f"{e}{bdetail['kword']}===={detailUrl}")
                    # close the current window
                    # self.driver.close()
                    time.sleep(1)
                except Exception as e:
                    time.sleep(5)
                    self.logger.info("详情页解析异常!" + detailUrl)
            else:
                break
            # time.sleep(5)
    def rmTagattr(self, html, url):
        # Parse the page with BeautifulSoup
        # soup = BeautifulSoup(html, 'html.parser')
        soup = self.paserUrl(html, url)
        # Walk all tags and strip their attributes
        for tag in soup.find_all(True):
            if tag.name == 'img':
                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
            elif tag.name != 'img':
                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
            else:
                tag.attrs = {key: value for key, value in tag.attrs.items()}
        # print the cleaned HTML
        # print(soup.prettify())
        html = soup.prettify()
        return html

    # Convert relative URLs in the HTML into absolute URLs
    def paserUrl(self, html, listurl):
        soup = BeautifulSoup(html, 'html.parser')
        # Collect all <a> and <img> tags
        links = soup.find_all(['a', 'img'])
@@ -468,73 +539,76 @@ class GoogleSpider(object):
        return soup
    # Build the article detail record
    def getDetailmsg(self, detailmsg):
        try:
            detailurl = detailmsg['detailUrl']
            title = detailmsg['title']
            content, contentWithTag = self.extractorMsg(detailurl, title)
            contentWithTag = self.rmTagattr(contentWithTag, detailurl)
        except Exception as e:
            content = ''
            contentWithTag = ''
        currentdate = self.getNowDate()
        kword = self.searchkw
        publishDate = detailmsg['publishTag']
        publishDate = publishDate + ''
        # publishtime=self.paserTime(publishtime)
        # publishDate=publishtime.strftime("%Y-%m-%d %H:%M:%S")
        detailmsg = {
            'title': detailmsg['title'],
            'source': detailmsg['sourceTag'],
            'detailurl': detailurl,
            'content': content,
            'contentHtml': contentWithTag,
            'publishtime': publishDate,
            'currentdate': currentdate,
            'kword': kword
        }
        return detailmsg
    def getProcessitem(self, bdetail):
        nowDate = self.getNowDate()
        content = bdetail['content']
        if content != '':
            processitem = {
                "sid": self.sid,
                "source": "4",
                "title": bdetail['title'],
                "content": bdetail['content'],
                "contentWithtag": bdetail['contentHtml'],
                "origin": bdetail['source'],
                "publishDate": bdetail['publishtime'],
                "sourceAddress": bdetail['detailurl'],
                "createDate": nowDate
            }
        return processitem
    def sendkafka(self, processitem):
        try:
            producer = KafkaProducer(bootstrap_servers=[self.kafka_bootstrap_servers])
            content = processitem['content']
            publishDate = str(processitem['publishDate'])
            title = processitem['title']
            if title == '':
                return
            if content == '':
                return
            if publishDate == '':
                return
            kafka_result = producer.send("crawlerInfo", json.dumps(processitem, ensure_ascii=False).encode('utf8'))
            # self.logger.info("数据发送kafka成功")
            self.logger.info(kafka_result.get(timeout=10))
            flg = True
        except Exception as e:
            flg = False
            pass
            # self.logger.info('发送kafka异常')
        finally:
            producer.close()
        return flg
    def run(self):
        # Fetch URLs page by page
@@ -545,38 +619,37 @@ class GoogleSpider(object):
        t = threading.Thread(target=self.get_detail_html)
        t.start()

    def detect_language(self, html):
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()
        # Detect the language of the text with langid
        lang, confidence = langid.classify(text)
        return lang
if __name__ == '__main__':
    searchkw = 'kw'
    wordsCode = 'wordsCode'
    sid = 'sid'
    # Placeholder arguments (reusing the example company record that appears elsewhere in this
    # commit) so the smoke test matches the new
    # (threadId, searchkw, wordsCode, sid, item, bangdan_name) constructor signature.
    zhuce = GoogleSpider(1, searchkw, wordsCode, sid, 'ZZSN22080900000001|沃尔玛|WMT|1', '2023年世界500强')
    # zhuce.run()
    url = 'https://vostok.today/46962-fesco-i-rzhd-rasshirjat-propusknuju-sposobnost-vladivostokskogo-morskogo-torgovogo-porta.html'
    zhuce.driver.get(url)
    time.sleep(20)
    html = zhuce.driver.page_source
    print(html)
    lang = zhuce.detect_language(html)
    print(lang)
    print('++++++++++++++++++')
    sm = SmartExtractor(lang)
    article = sm.extract_by_html(html)
    # article=sm.extract_by_url(url)
    content = article.cleaned_text
    text = article.text
    print(content)
    print(text)
    # raw_html = article.raw_html
    # html=zhuce.getRequest(url)
    # article_content=zhuce.extract_article(html,url)
    # print(article_content)
@@ -27,6 +27,9 @@ class GoogleTaskJob(object):
        self.r = redis.Redis(host=self.config.get('redis', 'host'),
                             port=self.config.get('redis', 'port'),
                             password=self.config.get('redis', 'pass'), db=0)
        self.r_6 = redis.Redis(host=self.config.get('redis', 'host'),
                               port=self.config.get('redis', 'port'),
                               password=self.config.get('redis', 'pass'), db=6)
    def getkafka(self):
        # Kafka cluster address
@@ -108,35 +111,36 @@ class GoogleTaskJob(object):

    def paserKeyMsg(self, keymsg):
        num = 1
        logger.info('----------')
        wordsCode = keymsg['wordsCode']
        id = keymsg['id']
        keyword = keymsg['keyWord']
        kwList = []
        keymsglist = self.getkeywords(keyword)
        for kw in keymsglist:
            kwmsg = {
                'kw': kw,
                'wordsCode': wordsCode,
                'sid': id
            }
            kwList.append((num, kwmsg))
            num += 1
        return kwList
    def runSpider(self, threadId, kwmsg, item, bangdan_name):
        if 'lay' in kwmsg['kw']:
            com_name = item.split('|')[2]
        else:
            com_name = item.split('|')[1]
        searchkw = com_name + ' ' + kwmsg['kw']
        print(f'======拼接的关键词是{searchkw}=={com_name}====')
        wordsCode = kwmsg['wordsCode']
        sid = kwmsg['sid']
        googleSpider = GoogleSpider(threadId, searchkw, wordsCode, sid, item, bangdan_name)
        try:
            googleSpider.get_page_html()
@@ -151,7 +155,28 @@ class GoogleTaskJob(object):
        finally:
            googleSpider.driver.quit()
        logger.info("关键词采集结束!" + searchkw)
    import random

    def get_comname(self):
        # TODO: read the company name from Redis and append it to the keyword
        # ZZSN22080900000001|沃尔玛|WMT|1
        item = baseCore.redicPullData('GOOGLE_KEYWORDS:COMPANY_NAME:2023_500')
        # item = 'ZZSN22080900000001|沃尔玛|WMT|1'
        if item:
            return item
        else:
            logger.info('====已无企业===')
            return None


# Pop and remove one element from a Redis list
def redicPullData(key, r):
    try:
        r.ping()
    except:
        r = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
    item = r.lpop(key)
    return item.decode() if item else None
if __name__ == '__main__':
    # ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
    # keymsglist=getkeywords(ss)
@@ -164,14 +189,28 @@ if __name__ == '__main__':
    print('---------------')
    while True:
        try:
            # try:
            #     googleTaskJob.r.ping()
            # except:
            #     googleTaskJob.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
            # all_keys = 'GOOGLE_KEYWORDS:COMPANY_NAME*'
            # keys = googleTaskJob.r.scan_iter(f"{key}*")
            # for key in keys:
            item = googleTaskJob.get_comname()
            bangdan_name = '2023年世界500强'
            if item:
                pass
            else:
                break
            codeList = [
                'KW-20240516-0002'
            ]
            for codeid in codeList:
                try:
                    # keymsg=baiduTaskJob.getkafka()
                    keymsg = googleTaskJob.getkeyFromredis(codeid)
                    kwList = googleTaskJob.paserKeyMsg(keymsg)
                    # kwList=reversed(kwList)
                    # randomly sample a few keywords from the list
                    # kwList = random.sample(kwList, 4)
@@ -182,9 +221,9 @@ if __name__ == '__main__':
                    continue
                if kwList:
                    # Create a thread pool with 4 workers
                    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                        # Submit one task per keyword to the pool
                        results = [executor.submit(googleTaskJob.runSpider, num, data, item, bangdan_name) for num, data in kwList]
                        # Collect the task results
                        for future in concurrent.futures.as_completed(results):
                            try:
@@ -195,5 +234,5 @@ if __name__ == '__main__':
                                # Handle exceptions raised while the task ran
                                logger.info(f"任务执行exception: {e}")
        except Exception as e:
            logger.info(f'采集异常{e}')