提交 21214964 作者: LiuLiYuan

雅虎企业动态

上级 fe8d1cca
...@@ -15,15 +15,15 @@ from selenium.webdriver.chrome.service import Service ...@@ -15,15 +15,15 @@ from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook from openpyxl import Workbook
import langid import langid
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore: class BaseCore:
# 序列号 # 序列号
__seq = 0 __seq = 0
# 代理池 数据库连接 # 代理池 数据库连接
__cnx_proxy =None __cnx_proxy = None
__cursor_proxy = None __cursor_proxy = None
# agent 池 # agent 池
__USER_AGENT_LIST = [ __USER_AGENT_LIST = [
...@@ -218,8 +218,9 @@ class BaseCore: ...@@ -218,8 +218,9 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5' 'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
] ]
#Android agent池 # Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36'] __USER_PHONE_AGENT_LIST = [
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self): def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
...@@ -238,12 +239,11 @@ class BaseCore: ...@@ -238,12 +239,11 @@ class BaseCore:
self.__cnx_proxy.close() self.__cnx_proxy.close()
self.cursor.close() self.cursor.close()
self.cnx.close() self.cnx.close()
except : except:
pass pass
# 计算耗时 # 计算耗时
def getTimeCost(self,start, end): def getTimeCost(self, start, end):
seconds = int(end - start) seconds = int(end - start)
m, s = divmod(seconds, 60) m, s = divmod(seconds, 60)
h, m = divmod(m, 60) h, m = divmod(m, 60)
...@@ -256,6 +256,7 @@ class BaseCore: ...@@ -256,6 +256,7 @@ class BaseCore:
else: else:
ms = int((end - start) * 1000) ms = int((end - start) * 1000)
return "%d毫秒" % (ms) return "%d毫秒" % (ms)
# 当前时间格式化 # 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S # 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S # 2 : 010101120000 %y%m%d%H%M%S
...@@ -285,7 +286,7 @@ class BaseCore: ...@@ -285,7 +286,7 @@ class BaseCore:
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3) return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式 # 日志格式
def logFormate(self,record, handler): def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format( formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间 date=record.time, # 日志时间
level=record.level_name, # 日志等级 level=record.level_name, # 日志等级
...@@ -295,8 +296,9 @@ class BaseCore: ...@@ -295,8 +296,9 @@ class BaseCore:
msg=record.message # 日志内容 msg=record.message # 日志内容
) )
return formate return formate
# 获取logger # 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True): def getLogger(self, fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0])) dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs") dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log" filename = filename.replace(".py", "") + ".log"
...@@ -345,34 +347,34 @@ class BaseCore: ...@@ -345,34 +347,34 @@ class BaseCore:
proxy_list.append(proxy) proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)] return proxy_list[random.randint(0, 3)]
#字符串截取 # 字符串截取
def getSubStr(self, str, beginStr, endStr):
    """Trim ``str`` to the span delimited by ``beginStr`` and ``endStr``.

    The result starts at the first occurrence of ``beginStr`` (the marker
    itself is kept) and ends right after the FIRST character of the last
    occurrence of ``endStr``, searched in the already left-trimmed text.
    An empty marker, or a marker that is not found, leaves that side of
    the text untouched.
    """
    if beginStr != '':
        start = str.find(beginStr)
        # find() returns -1 when the marker is absent; that case keeps the text from index 0.
        str = str[max(start, 0):]
    if endStr != '':
        stop = str.rfind(endStr)
        if stop != -1:
            # Only one character past the match position is retained.
            str = str[:stop + 1]
    return str
# 繁体字转简体字 # 繁体字转简体字
def hant_2_hans(self, hant_str: str):
    """Convert Traditional Chinese text to Simplified Chinese.

    Delegates to ``zhconv.convert`` with the ``'zh-hans'`` target and
    returns whatever that call returns.
    """
    target_locale = 'zh-hans'
    return zhconv.convert(hant_str, target_locale)
# 判断字符串里是否含数字 # 判断字符串里是否含数字
def str_have_num(self,str_num): def str_have_num(self, str_num):
panduan = False panduan = False
for str_1 in str_num: for str_1 in str_num:
...@@ -392,7 +394,7 @@ class BaseCore: ...@@ -392,7 +394,7 @@ class BaseCore:
# return gw_item.decode() if gw_item else None # return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素 # 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
    """Pop the head element of the Redis list ``key`` through ``self.r.lpop``.

    Returns the popped value decoded via ``.decode()``, or ``None`` when
    ``lpop`` hands back a falsy result (e.g. nothing left in the list).
    """
    raw = self.r.lpop(key)
    if not raw:
        return None
    return raw.decode()
...@@ -421,15 +423,15 @@ class BaseCore: ...@@ -421,15 +423,15 @@ class BaseCore:
chrome_options.add_experimental_option('useAutomationExtension', False) chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en') chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument(self.getRandomUserAgent()) chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36') # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(chrome_options=chrome_options, service=service) driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
with open('../../base/stealth.min.js') as f: # with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
js = f.read() # js = f.read()
#
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { # driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": js # "source": js
}) # })
return driver return driver
# 根据社会信用代码获取企业信息 # 根据社会信用代码获取企业信息
...@@ -458,16 +460,16 @@ class BaseCore: ...@@ -458,16 +460,16 @@ class BaseCore:
print(e) print(e)
self.cnx.commit() self.cnx.commit()
#获取企查查token # 获取企查查token
def GetToken(self):
    """Read the Qichacha (QCC) token from the ``QCC_token`` table.

    Executes the query on ``self.cursor`` and returns the first column of
    the first row that ``fetchone()`` yields.
    """
    self.cursor.execute("select token from QCC_token ")
    first_row = self.cursor.fetchone()
    return first_row[0]
#检测语言 # 检测语言
def detect_language(self, text): def detect_language(self, text):
# 使用langid.py判断文本的语言 # 使用langid.py判断文本的语言
result = langid.classify(text) result = langid.classify(text)
...@@ -477,11 +479,11 @@ class BaseCore: ...@@ -477,11 +479,11 @@ class BaseCore:
return 'cn' return 'cn'
return result[0] return result[0]
#追加接入excel # 追加接入excel
def writerToExcel(self,detailList,filename): def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx' # filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件 # 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl') existing_data = pd.read_excel(filename, engine='openpyxl')
# 创建新的数据 # 创建新的数据
new_data = pd.DataFrame(data=detailList) new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾 # 将新数据添加到现有数据的末尾
...@@ -490,8 +492,6 @@ class BaseCore: ...@@ -490,8 +492,6 @@ class BaseCore:
combined_data.to_excel(filename, index=False) combined_data.to_excel(filename, index=False)
# return combined_data # return combined_data
#对失败或者断掉的企业 重新放入redis # 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, item):
    """Append ``item`` to the tail of the Redis list of company codes.

    Intended for companies whose crawl failed or was interrupted, so that
    they are picked up again later.
    """
    retry_queue = 'NewsEnterprise:gwqy_socialCode'
    self.r.rpush(retry_queue, item)
# 雅虎财经企业动态获取 # 雅虎财经企业动态获取
...@@ -8,9 +8,9 @@ import sys ...@@ -8,9 +8,9 @@ import sys
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
sys.path.append('D:/zzsn_spider/base') from base import BaseCore
import BaseCore from base.smart import smart_extractor
from smart import smart_extractor
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore() baseCore = BaseCore.BaseCore()
...@@ -48,7 +48,9 @@ def getZx(xydm, url, title, cnx, path): ...@@ -48,7 +48,9 @@ def getZx(xydm, url, title, cnx, path):
pub_time = pub_time[0:19] pub_time = pub_time[0:19]
content = contentElement.replace("'", "''") content = contentElement.replace("'", "''")
driverContent.close() # driverContent.close()
driverContent.quit()
# 动态信息列表 # 动态信息列表
list_info = [ list_info = [
xydm, xydm,
...@@ -159,28 +161,24 @@ def getLastUrl(): ...@@ -159,28 +161,24 @@ def getLastUrl():
def scroll(xydm, name, gpdm):
    """Scroll the page to the bottom repeatedly until the last link stops changing.

    After every scroll the value returned by ``getLastUrl()`` is compared
    with the one seen on the previous pass. The loop ends when the two are
    equal, or when ``getLastUrl()`` raises (which is logged using ``name``
    and ``gpdm``). ``xydm`` is accepted but not used by this function.
    """
    jump_to_bottom = "var q=document.documentElement.scrollTop=100000"
    previous_last = ''
    while True:
        driver.execute_script(jump_to_bottom)
        time.sleep(1)  # pause before reading the list again
        try:
            current_last = getLastUrl()
        except Exception:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            return
        if current_last == previous_last:
            return
        previous_last = current_last
#采集失败的公众号 重新放入redis #采集失败的公众号 重新放入redis
def rePutIntoR(item): def rePutIntoR(item):
...@@ -188,7 +186,7 @@ def rePutIntoR(item): ...@@ -188,7 +186,7 @@ def rePutIntoR(item):
if __name__ == "__main__": if __name__ == "__main__":
path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe' path = r'F:\spider\115\chromedriver.exe'
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4') cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor() cursor = cnx.cursor()
...@@ -198,6 +196,9 @@ if __name__ == "__main__": ...@@ -198,6 +196,9 @@ if __name__ == "__main__":
social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode') social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
if not social_code :
time.sleep(20)
continue
if social_code == 'None': if social_code == 'None':
time.sleep(20) time.sleep(20)
continue continue
...@@ -311,7 +312,7 @@ if __name__ == "__main__": ...@@ -311,7 +312,7 @@ if __name__ == "__main__":
log.info('===========连接已被关闭========等待重新连接===========') log.info('===========连接已被关闭========等待重新连接===========')
driver.quit() driver.quit()
driver = baseCore.buildDriver(path) driver = baseCore.buildDriver(path)
time.sleep(1200) time.sleep(5)
continue continue
cursor.close() cursor.close()
......
This source diff could not be displayed because it is too large. You can view the blob instead.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore from base.BaseCore import BaseCore
baseCore = BaseCore() baseCore = BaseCore()
...@@ -6,13 +8,25 @@ log =baseCore.getLogger() ...@@ -6,13 +8,25 @@ log =baseCore.getLogger()
if __name__ == '__main__':
    # Manual check: open the Yahoo Finance press-release page for one ticker
    # in a visible (non-headless) browser built by BaseCore.buildDriver.
    path = r'F:\spider\115\chromedriver.exe'
    try:
        driver = baseCore.buildDriver(path, headless=False)
        gpdm = '9021.T'
        url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
        driver.get(url)
    finally:
        # BaseCore asks callers to invoke close() before the program exits so
        # that the resources it opened are released, even if the steps above fail.
        baseCore.close()
\ No newline at end of file
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论