提交 176c0051 作者: 薛凌堃

1/11

上级 55d9be00
...@@ -663,6 +663,8 @@ class BaseCore: ...@@ -663,6 +663,8 @@ class BaseCore:
return 'cn' return 'cn'
if result[0] == '': if result[0] == '':
return 'cn' return 'cn'
if result[0] == 'ja':
return 'jp'
return result[0] return result[0]
#创建excel文件 #创建excel文件
......
...@@ -895,7 +895,7 @@ def qianyanzhishiku(): ...@@ -895,7 +895,7 @@ def qianyanzhishiku():
def shijiejingjiluntan(): def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'} allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
for i in range(76, 128): for i in range(1, 2):
# res = requests.get(url) # res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser') # soup = BeautifulSoup(res.content,'html.parser')
...@@ -1637,23 +1637,23 @@ if __name__ == '__main__': ...@@ -1637,23 +1637,23 @@ if __name__ == '__main__':
# juliangsuanshu() # juliangsuanshu()
# except Exception as e: # except Exception as e:
# pass # pass
# Run the ke36 crawler; retry once on failure, but guard the retry so a
# second failure is logged instead of crashing the whole scheduler script.
try:
    log.info('ke36')
    ke36()
except Exception as e:
    log.info(e)
    try:
        # one retry — the original re-ran ke36() unguarded here, so a
        # second exception would have propagated and aborted the script
        ke36()
    except Exception as retry_err:
        log.info(retry_err)
# try: # try:
# log.info('qianyanzhishiku') # log.info('ke36')
# qianyanzhishiku() # ke36()
# except Exception as e: # except Exception as e:
# ke36()
# pass # pass
# try: # try:
# log.info('shijiejingjiluntan') # log.info('qianyanzhishiku')
# shijiejingjiluntan() # qianyanzhishiku()
# except Exception as e: # except Exception as e:
# log.info(e)
# pass # pass
# Kick off the World Economic Forum (shijiejingjiluntan) crawler.
# Any exception is logged and swallowed so the remaining crawlers still run.
try:
    log.info('shijiejingjiluntan')
    shijiejingjiluntan()
except Exception as e:
    log.info(e)
# try: # try:
# log.info('dongfangcaifu') # log.info('dongfangcaifu')
# dongfangcaifu() # dongfangcaifu()
......
...@@ -37,8 +37,8 @@ if __name__ == "__main__": ...@@ -37,8 +37,8 @@ if __name__ == "__main__":
while True: while True:
start_time = time.time() start_time = time.time()
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code') # social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
# social_code = '91350700856994874M' social_code = '91130629MA0CG2DL51'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if social_code == None: if social_code == None:
# time.sleep(20) # time.sleep(20)
...@@ -149,3 +149,4 @@ if __name__ == "__main__": ...@@ -149,3 +149,4 @@ if __name__ == "__main__":
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
break
\ No newline at end of file
...@@ -68,7 +68,7 @@ if __name__ == '__main__': ...@@ -68,7 +68,7 @@ if __name__ == '__main__':
place = data[6] place = data[6]
if place == 1: if place == 1:
log.info(f'{com_name}--国内') log.info(f'{com_name}--国内')
baseCore.rePutIntoR('ZhuanLi_500:zgSocial_code',social_code) baseCore.rePutIntoR('Zhuanli:gwSocial_code',social_code)
continue continue
if english_name_: if english_name_:
pass pass
......
# Quick manual check that a ctwant.com article page is reachable and
# returns HTML when a browser-like User-Agent is supplied.
import requests

url = 'https://www.ctwant.com/article/308534'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
# BUG FIX: requests.get(url, headers) passed the dict as the second
# positional parameter, which is `params` (query string), so the
# User-Agent header was never sent. It must be the `headers` keyword.
req = requests.get(url, headers=headers)
print(req.text)
\ No newline at end of file
#coding=utf-8 #coding=utf-8
...@@ -25,7 +25,7 @@ from baseCore import BaseCore ...@@ -25,7 +25,7 @@ from baseCore import BaseCore
import configparser import configparser
from smart_extractor import SmartExtractor from smart_extractor import SmartExtractor
# baseCore=BaseCore()
class BaiduSpider(object): class BaiduSpider(object):
def __init__(self,searchkw,wordsCode,sid): def __init__(self,searchkw,wordsCode,sid):
...@@ -40,13 +40,15 @@ class BaiduSpider(object): ...@@ -40,13 +40,15 @@ class BaiduSpider(object):
port=self.config.get('redis', 'port'), port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0) password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1 self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver') # chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers') # self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver) # path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions() # chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location') # chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options) # proxy = baseCore.get_proxy()
# driver = webdriver.Chrome(chrome_options=chrome_options) # chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# # driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue() self.qtitle = Queue()
self.qurl = Queue() self.qurl = Queue()
self.detailList = Queue() self.detailList = Queue()
...@@ -54,14 +56,16 @@ class BaiduSpider(object): ...@@ -54,14 +56,16 @@ class BaiduSpider(object):
self.wordsCode = wordsCode self.wordsCode = wordsCode
self.sid = sid self.sid = sid
def createDriver(self): def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver') chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver) path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions() chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location') chrome_options.binary_location = self.config.get('selenium', 'binary_location')
# 设置代理 proxy = baseCore.get_proxy()
# proxy = "127.0.0.1:8080" # 代理地址和端口 chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# chrome_options.add_argument('--proxy-server=http://' + proxy)
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options) self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
#将列表数据插入到表中 meta_search_result #将列表数据插入到表中 meta_search_result
def itemInsertToTable(self,items): def itemInsertToTable(self,items):
try: try:
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -12,12 +12,16 @@ from kafka import KafkaProducer ...@@ -12,12 +12,16 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer from kafka import KafkaConsumer
import json import json
import itertools import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider from baiduSpider import BaiduSpider
import concurrent.futures import concurrent.futures
from baseCore import BaseCore from baseCore import BaseCore
from queue import Queue from queue import Queue
import configparser import configparser
from tqdm import tqdm
class BaiduTaskJob(object): class BaiduTaskJob(object):
def __init__(self): def __init__(self):
...@@ -39,7 +43,7 @@ class BaiduTaskJob(object): ...@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers], bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8'))) value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try: try:
for record in consumer: for record in tqdm(consumer, desc="Consuming messages"):
try: try:
logger.info("value:",record.value) logger.info("value:",record.value)
keymsg=record.value keymsg=record.value
...@@ -119,7 +123,15 @@ class BaiduTaskJob(object): ...@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
kwList=[] kwList=[]
if searchEngines: if searchEngines:
if '3' in searchEngines: if '3' in searchEngines:
keyword=keymsg['keyWord'] start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'采集一轮{wordsName}关键词耗时{baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
keymsglist=self.getkeywords(keyword) keymsglist=self.getkeywords(keyword)
for kw in keymsglist: for kw in keymsglist:
kwmsg={ kwmsg={
...@@ -157,6 +169,25 @@ class BaiduTaskJob(object): ...@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
# finally: # finally:
# baiduSpider.driver.quit() # baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw) # logger.info("关键词采集结束!"+searchkw)
def createDriver(self):
    """Create and return a new proxied Chrome WebDriver instance.

    Builds a fresh browser (used to replace a crashed driver in runSpider)
    with anti-automation-detection flags, a desktop User-Agent, and an HTTP
    proxy obtained from baseCore.
    """
    # Hard-coded local Windows paths to chromedriver and the Chrome binary.
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-certificate-errors')
    # Hide the "navigator.webdriver" automation fingerprint from the site.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    # NOTE(review): assumes get_proxy() returns {'http': 'http://host:port', ...};
    # the split('://')[1] strips the scheme to get "host:port" — confirm shape.
    proxy = baseCore.get_proxy()
    chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
    chrome_options.add_argument(
        'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    # chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
    return browser
def runSpider(self,kwmsg): def runSpider(self,kwmsg):
searchkw=kwmsg['kw'] searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode'] wordsCode=kwmsg['wordsCode']
...@@ -166,6 +197,8 @@ class BaiduTaskJob(object): ...@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html() baiduSpider.get_page_html()
except Exception as e: except Exception as e:
try: try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html() baiduSpider.get_page_html()
except Exception as e: except Exception as e:
logger.info('百度搜索异常'+searchkw) logger.info('百度搜索异常'+searchkw)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
...@@ -293,6 +293,7 @@ class BaseCore: ...@@ -293,6 +293,7 @@ class BaseCore:
sql = "select proxy from clb_proxy" sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql) self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall() proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = [] ip_list = []
for proxy_ in proxy_lists: for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", '')) ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
...@@ -304,8 +305,8 @@ class BaseCore: ...@@ -304,8 +305,8 @@ class BaseCore:
"port": str_ip_list[1], "port": str_ip_list[1],
} }
proxy = { proxy = {
"HTTP": proxyMeta, "http": proxyMeta,
"HTTPS": proxyMeta "https": proxyMeta
} }
proxy_list.append(proxy) proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)] return proxy_list[random.randint(0, 3)]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论