Commit 176c0051 Author: 薛凌堃

1/11

Parent 55d9be00
......@@ -663,6 +663,8 @@ class BaseCore:
return 'cn'
if result[0] == '':
return 'cn'
if result[0] == 'ja':
return 'jp'
return result[0]
# Create an Excel file
......
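The new mapping above normalizes the detector's 'ja' result to the internal 'jp' code. For context, a minimal sketch of the surrounding detection logic, assuming the `langdetect` package feeds `result` (the actual detector behind BaseCore is not visible in this diff):

```python
# A hedged sketch, not BaseCore's real implementation: assumes langdetect.
from langdetect import detect_langs

def detect_language(text: str) -> str:
    """Return a normalized language code, defaulting to 'cn'."""
    result = [lang.lang for lang in detect_langs(text)]
    if not result or result[0] in ('zh-cn', 'zh-tw', ''):
        return 'cn'
    if result[0] == 'ja':
        # langdetect reports Japanese as ISO 639-1 'ja';
        # downstream consumers expect the country-style code 'jp'.
        return 'jp'
    return result[0]
```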
......@@ -895,7 +895,7 @@ def qianyanzhishiku():
def shijiejingjiluntan():
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06', '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}
for i in range(76, 128):
for i in range(1, 2):
# res = requests.get(url)
# soup = BeautifulSoup(res.content,'html.parser')
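For reference, the `allnum` table above maps Chinese ordinal numerals to zero-padded month strings:

```python
allnum = {'一': '01', '二': '02', '三': '03', '四': '04', '五': '05', '六': '06',
          '七': '07', '八': '08', '九': '09', '十': '10', '十一': '11', '十二': '12'}

# A page heading such as '十一月' (November) resolves to the month string '11':
print(allnum['十一'])  # -> '11'
```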
......@@ -1637,23 +1637,23 @@ if __name__ == '__main__':
# juliangsuanshu()
# except Exception as e:
# pass
try:
log.info('ke36')
ke36()
except Exception as e:
ke36()
pass
# try:
# log.info('qianyanzhishiku')
# qianyanzhishiku()
# log.info('ke36')
# ke36()
# except Exception as e:
# ke36()
# pass
# try:
# log.info('shijiejingjiluntan')
# shijiejingjiluntan()
# log.info('qianyanzhishiku')
# qianyanzhishiku()
# except Exception as e:
# log.info(e)
# pass
try:
log.info('shijiejingjiluntan')
shijiejingjiluntan()
except Exception as e:
log.info(e)
pass
# try:
# log.info('dongfangcaifu')
# dongfangcaifu()
......
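Note that the new `ke36` block retries once inside the `except` handler with no guard of its own, so a second failure still propagates. A hedged sketch of a small helper that would make these try/except blocks uniform (names are illustrative, not part of the repo):

```python
import time

def run_with_retry(task, name, log, retries=1, delay=5):
    """Run a crawl task, retrying a limited number of times before giving up."""
    for attempt in range(1, retries + 2):
        try:
            log.info(name)
            task()
            return
        except Exception as e:
            log.info(f'{name} failed on attempt {attempt}: {e}')
            time.sleep(delay)
    log.info(f'{name} gave up after {retries + 1} attempts')

# run_with_retry(ke36, 'ke36', log)
# run_with_retry(shijiejingjiluntan, 'shijiejingjiluntan', log)
```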
......@@ -37,8 +37,8 @@ if __name__ == "__main__":
while True:
start_time = time.time()
# Use the social credit code pulled from Redis to look up the company's basic info in the database
social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
# social_code = '91350700856994874M'
# social_code = baseCore.redicPullData('ShangBiao:gnshSocial_code')
social_code = '91130629MA0CG2DL51'
# If there is no more data in Redis, wait
if social_code == None:
# time.sleep(20)
......@@ -149,3 +149,4 @@ if __name__ == "__main__":
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'Failed to fetch company info--{e}')
time.sleep(5)
break
\ No newline at end of file
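This hunk swaps the Redis-driven feed for the hard-coded test code '91130629MA0CG2DL51'. `redicPullData` itself is not shown here; a minimal sketch of the pattern it presumably wraps (a plain LPOP on a task list, connection details assumed):

```python
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # connection details assumed

def redicPullData(key: str):
    """Pop one social credit code from the Redis task list; None when empty."""
    value = r.lpop(key)
    return value.decode('utf-8') if value is not None else None

# social_code = redicPullData('ShangBiao:gnshSocial_code')
```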
......@@ -68,7 +68,7 @@ if __name__ == '__main__':
place = data[6]
if place == 1:
log.info(f'{com_name}--domestic')
baseCore.rePutIntoR('ZhuanLi_500:zgSocial_code',social_code)
baseCore.rePutIntoR('Zhuanli:gwSocial_code',social_code)
continue
if english_name_:
pass
......
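Here domestic companies (`place == 1`) are re-queued under 'Zhuanli:gwSocial_code' instead of 'ZhuanLi_500:zgSocial_code'. Assuming `rePutIntoR` is the push-back counterpart of `redicPullData`, a sketch:

```python
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # connection details assumed

def rePutIntoR(key: str, value: str):
    """Push a code back onto a Redis list so another worker can pick it up."""
    r.rpush(key, value)

# rePutIntoR('Zhuanli:gwSocial_code', social_code)
```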
import requests
url = 'https://www.ctwant.com/article/308534'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
req = requests.get(url, headers=headers)  # headers must be a keyword argument; the second positional parameter of requests.get is params
print(req.text)
\ No newline at end of file
#coding=utf-8
......@@ -25,7 +25,7 @@ from baseCore import BaseCore
import configparser
from smart_extractor import SmartExtractor
# baseCore=BaseCore()
class BaiduSpider(object):
def __init__(self,searchkw,wordsCode,sid):
......@@ -40,13 +40,15 @@ class BaiduSpider(object):
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=0)
self.page_num = 1
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
# chrome_driver =self.config.get('selenium', 'chrome_driver')
# self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
# path = Service(chrome_driver)
# chrome_options = webdriver.ChromeOptions()
# chrome_options.binary_location = self.config.get('selenium', 'binary_location')
# proxy = baseCore.get_proxy()
# chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
# self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# # driver = webdriver.Chrome(chrome_options=chrome_options)
self.qtitle = Queue()
self.qurl = Queue()
self.detailList = Queue()
......@@ -54,14 +56,16 @@ class BaiduSpider(object):
self.wordsCode = wordsCode
self.sid = sid
def createDriver(self):
chrome_driver =self.config.get('selenium', 'chrome_driver')
self.kafka_bootstrap_servers = self.config.get('kafka', 'bootstrap_servers')
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =self.config.get('selenium', 'binary_location')
# Set up a proxy
# proxy = "127.0.0.1:8080"  # proxy host and port
# chrome_options.add_argument('--proxy-server=http://' + proxy)
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
self.driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
# driver = webdriver.Chrome(chrome_options=chrome_options)
# Insert the list data into the meta_search_result table
def itemInsertToTable(self,items):
try:
......
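`createDriver` above now routes Chrome through a proxy via `--proxy-server`. A self-contained sketch of the same idea on the Selenium 4 API (`options=` replaces the deprecated `chrome_options=` keyword); the driver paths and the proxy dict shape are assumptions:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def create_proxied_driver(chrome_driver_path: str, binary_location: str, proxy: dict):
    """Build a Chrome driver that routes traffic through an HTTP proxy.

    `proxy` is assumed to look like {'http': 'http://1.2.3.4:8080', ...},
    matching what BaseCore.get_proxy() appears to return after this commit.
    """
    service = Service(chrome_driver_path)
    options = webdriver.ChromeOptions()
    options.binary_location = binary_location
    # Chrome wants host:port, so strip the scheme from the proxy URL.
    options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
    return webdriver.Chrome(service=service, options=options)
```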
# -*- coding: utf-8 -*-
......@@ -12,12 +12,16 @@ from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from baiduSpider import BaiduSpider
import concurrent.futures
from baseCore import BaseCore
from queue import Queue
import configparser
from tqdm import tqdm
class BaiduTaskJob(object):
def __init__(self):
......@@ -39,7 +43,7 @@ class BaiduTaskJob(object):
bootstrap_servers=[bootstrap_servers],
value_deserializer=lambda m: json.loads(m.decode('utf-8')))
try:
for record in consumer:
for record in tqdm(consumer, desc="Consuming messages"):
try:
logger.info("value:",record.value)
keymsg=record.value
......@@ -119,7 +123,15 @@ class BaiduTaskJob(object):
kwList=[]
if searchEngines:
if '3' in searchEngines:
keyword=keymsg['keyWord']
start_time = time.time()
keyword = keymsg['keyWord']
wordsName = keymsg['wordsName']
first = wordsName
if wordsName == first:
end_time = time.time()
if int(end_time - start_time) > 10:
logger.info(f'One round of the {wordsName} keyword group took {baseCore.getTimeCost(start_time,end_time)}')
logger.info(f"获取到关键词组:{wordsName}---{wordsCode}")
keymsglist=self.getkeywords(keyword)
for kw in keymsglist:
kwmsg={
......@@ -157,6 +169,25 @@ class BaiduTaskJob(object):
# finally:
# baiduSpider.driver.quit()
# logger.info("关键词采集结束!"+searchkw)
def createDriver(self):
chrome_driver = r'D:\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
proxy = baseCore.get_proxy()
chrome_options.add_argument('--proxy-server=' + proxy['http'].split('://')[1])
chrome_options.add_argument(
'user-agent=' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# chrome_options.add_argument('--headless')
browser = webdriver.Chrome(service=path, chrome_options=chrome_options)
return browser
def runSpider(self,kwmsg):
searchkw=kwmsg['kw']
wordsCode=kwmsg['wordsCode']
......@@ -166,6 +197,8 @@ class BaiduTaskJob(object):
baiduSpider.get_page_html()
except Exception as e:
try:
baiduSpider.driver.quit()
baiduSpider.driver=self.createDriver()
baiduSpider.get_page_html()
except Exception as e:
logger.info('Baidu search exception: ' + searchkw)
......
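The timing block added in this file sets `start_time` and `end_time` in the same pass and compares `wordsName` against a `first` that was just assigned from it, so the elapsed-time branch can never measure a real round. A hedged sketch of per-round timing that keeps state across messages (names are illustrative, not part of the repo):

```python
import time

class RoundTimer:
    """Log how long one full round of keyword groups takes."""

    def __init__(self):
        self.first_words_name = None
        self.round_start = None

    def observe(self, words_name: str, logger):
        if self.first_words_name is None:
            # The first message of the run marks the start of a round.
            self.first_words_name = words_name
            self.round_start = time.time()
        elif words_name == self.first_words_name:
            # Seeing the first keyword group again means a round has completed.
            elapsed = time.time() - self.round_start
            logger.info(f'One round of keyword collection took {elapsed:.1f}s')
            self.round_start = time.time()
```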
# -*- coding: utf-8 -*-
......@@ -293,6 +293,7 @@ class BaseCore:
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
......@@ -304,8 +305,8 @@ class BaseCore:
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
......
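The key change from 'HTTP'/'HTTPS' to 'http'/'https' matters because `requests` matches proxies by the lowercase scheme of the target URL, so uppercase keys are silently ignored and requests go out directly. Note also that the hard-coded `random.randint(0, 3)` assumes at least four proxies in the table; `random.choice(proxy_list)` would be safer. A usage sketch (proxy values assumed):

```python
import requests

# Shape returned by BaseCore.get_proxy() after this commit (values assumed):
proxy = {
    'http': 'http://1.2.3.4:8080',
    'https': 'http://1.2.3.4:8080',
}

# requests looks the proxy up by lowercase scheme, so these keys now match.
resp = requests.get('https://www.ctwant.com/article/308534',
                    proxies=proxy, timeout=10)
print(resp.status_code)
```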