提交 21214964 作者: LiuLiYuan

雅虎企业动态

上级 fe8d1cca
......@@ -15,15 +15,15 @@ from selenium.webdriver.chrome.service import Service
from openpyxl import Workbook
import langid
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
__cnx_proxy =None
__cnx_proxy = None
__cursor_proxy = None
# agent 池
__USER_AGENT_LIST = [
......@@ -218,8 +218,9 @@ class BaseCore:
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
#Android agent池
__USER_PHONE_AGENT_LIST = ['Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
# Android agent池
__USER_PHONE_AGENT_LIST = [
'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R9sk) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.111 Mobile Safari/537.36']
def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project',
......@@ -238,12 +239,11 @@ class BaseCore:
self.__cnx_proxy.close()
self.cursor.close()
self.cnx.close()
except :
except:
pass
# 计算耗时
def getTimeCost(self,start, end):
def getTimeCost(self, start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
......@@ -256,6 +256,7 @@ class BaseCore:
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
......@@ -285,7 +286,7 @@ class BaseCore:
return "ZZSN" + self.getNowTime(2) + str(self.__seq).zfill(3)
# 日志格式
def logFormate(self,record, handler):
def logFormate(self, record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
......@@ -295,8 +296,9 @@ class BaseCore:
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
def getLogger(self, fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
......@@ -345,34 +347,34 @@ class BaseCore:
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
# Substring extraction between two marker strings
def getSubStr(self, str, beginStr, endStr):
    """Trim ``str`` to the span delimited by ``beginStr`` and ``endStr``.

    The result starts at the first occurrence of ``beginStr`` (the marker
    itself is kept) and ends one character past the start of the last
    occurrence of ``endStr`` within the remaining text, i.e. only the first
    character of ``endStr`` is kept.

    Args:
        str: Text to cut. The name shadows the builtin but is kept so that
            existing keyword callers continue to work.
        beginStr: Start marker. An empty string, or a marker that is not
            found, leaves the start of the text untouched.
        endStr: End marker. An empty string, or a marker that is not found,
            leaves the end of the text untouched.

    Returns:
        The trimmed text.
    """
    text = str
    if beginStr != '':
        begin = text.find(beginStr)
        # A missing start marker means "keep from the beginning".
        if begin == -1:
            begin = 0
        text = text[begin:]
    if endStr != '':
        # The end marker is searched in the already-trimmed text, from the right.
        end = text.rfind(endStr)
        if end != -1:
            text = text[0:end + 1]
    return text
# Traditional Chinese -> Simplified Chinese
def hant_2_hans(self, hant_str: str):
    """Convert ``hant_str`` from Traditional to Simplified Chinese.

    Args:
        hant_str: Text to convert.

    Returns:
        The result of ``zhconv.convert(hant_str, 'zh-hans')``.
    """
    return zhconv.convert(hant_str, 'zh-hans')
# 判断字符串里是否含数字
def str_have_num(self,str_num):
def str_have_num(self, str_num):
panduan = False
for str_1 in str_num:
......@@ -392,7 +394,7 @@ class BaseCore:
# return gw_item.decode() if gw_item else None
# Pop (fetch and remove) one element from a Redis list
def redicPullData(self, key):
    """Pop one element from the list stored at ``key`` via ``self.r.lpop``.

    Args:
        key: Name of the list to pop from.

    Returns:
        The popped item decoded with ``.decode()``, or ``None`` when
        ``lpop`` returns a falsy value (nothing popped, or an empty value).
    """
    item = self.r.lpop(key)
    return item.decode() if item else None
......@@ -415,21 +417,21 @@ class BaseCore:
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument(self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
with open('../../base/stealth.min.js') as f:
js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": js
})
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
# 根据社会信用代码获取企业信息
......@@ -458,16 +460,16 @@ class BaseCore:
print(e)
self.cnx.commit()
# Fetch the Qichacha (QCC) token
def GetToken(self):
    """Read the Qichacha token from the ``QCC_token`` table.

    A hard-coded token used to be kept here and, per the original note, had
    to be re-captured from network traffic roughly every two hours; the
    value is now read through ``self.cursor`` instead.

    Returns:
        The first column of the first row returned by the query.

    Raises:
        TypeError: If ``fetchone()`` returns ``None`` (no row available),
            because the result is subscripted unconditionally.
    """
    query = "select token from QCC_token "
    self.cursor.execute(query)
    token = self.cursor.fetchone()[0]
    return token
#检测语言
# 检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
......@@ -477,11 +479,11 @@ class BaseCore:
return 'cn'
return result[0]
#追加接入excel
def writerToExcel(self,detailList,filename):
# 追加接入excel
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename,engine='openpyxl')
existing_data = pd.read_excel(filename, engine='openpyxl')
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
......@@ -490,8 +492,6 @@ class BaseCore:
combined_data.to_excel(filename, index=False)
# return combined_data
# Re-queue an enterprise whose crawl failed or was interrupted
def rePutIntoR(self, item):
    """Push ``item`` back onto the Redis list ``NewsEnterprise:gwqy_socialCode``.

    The item is appended with ``rpush`` so that it can be picked up again
    later.

    Args:
        item: The value to put back on the list.
    """
    self.r.rpush('NewsEnterprise:gwqy_socialCode', item)
# 雅虎财经企业动态获取
# 雅虎财经企业动态获取
......@@ -8,9 +8,9 @@ import sys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
sys.path.append('D:/zzsn_spider/base')
import BaseCore
from smart import smart_extractor
from base import BaseCore
from base.smart import smart_extractor
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
baseCore = BaseCore.BaseCore()
......@@ -48,7 +48,9 @@ def getZx(xydm, url, title, cnx, path):
pub_time = pub_time[0:19]
content = contentElement.replace("'", "''")
driverContent.close()
# driverContent.close()
driverContent.quit()
# 动态信息列表
list_info = [
xydm,
......@@ -159,28 +161,24 @@ def getLastUrl():
def scroll(xydm, name, gpdm):
    """Scroll the current page down until the last link stops changing.

    Each round scrolls the module-level ``driver`` to the bottom, waits one
    second, and reads the last link with ``getLastUrl()``. The loop ends
    when that link equals the one seen in the previous round, or when
    ``getLastUrl()`` raises.

    Args:
        xydm: Not used by the current loop; it was the argument of the
            now-disabled ``selectUrl`` check and is kept for callers.
        name: Used only in the error log message.
        gpdm: Used only in the error log message.
    """
    last_url_ = ''
    while True:
        js = "var q=document.documentElement.scrollTop=100000"
        driver.execute_script(js)
        time.sleep(1)
        try:
            last_url = getLastUrl()
        except Exception:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            break
        # NOTE: the former early exit based on selectUrl(last_url_, xydm)
        # is intentionally disabled.
        if last_url_ == last_url:
            break
        last_url_ = last_url
#采集失败的公众号 重新放入redis
def rePutIntoR(item):
......@@ -188,7 +186,7 @@ def rePutIntoR(item):
if __name__ == "__main__":
path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
path = r'F:\spider\115\chromedriver.exe'
driver = baseCore.buildDriver(path)
cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
......@@ -198,6 +196,9 @@ if __name__ == "__main__":
social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
if not social_code :
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
......@@ -311,7 +312,7 @@ if __name__ == "__main__":
log.info('===========连接已被关闭========等待重新连接===========')
driver.quit()
driver = baseCore.buildDriver(path)
time.sleep(1200)
time.sleep(5)
continue
cursor.close()
......
This source diff could not be displayed because it is too large. You can view the blob instead.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from base.BaseCore import BaseCore
baseCore = BaseCore()
......@@ -6,13 +8,25 @@ log =baseCore.getLogger()
if __name__ == '__main__':
    # Manual check: open a Yahoo Finance press-release page in a visible
    # (non-headless) Chrome window built by BaseCore.buildDriver.
    path = r'F:\spider\115\chromedriver.exe'
    try:
        driver = baseCore.buildDriver(path, headless=False)
        gpdm = '9021.T'
        url = f"https://finance.yahoo.com/quote/{gpdm}/press-releases?p={gpdm}"
        driver.get(url)
    finally:
        # BaseCore asks callers to invoke close() before the program exits
        # so that the resources it holds are released.
        baseCore.close()
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论