Commit 01e0d716 Author: 薛凌堃

Merge remote-tracking branch 'origin/master'

@@ -369,7 +369,7 @@ class BaseCore:
         if beginStr=='':
             pass
         else:
-            begin=str.find(beginStr)
+            begin=str.rfind(beginStr)
             if begin==-1:
                 begin=0
             str=str[begin:]
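The switch from `str.find` to `str.rfind` changes which occurrence anchors the cut: `find` locates the first match of `beginStr`, `rfind` the last, so the kept tail now starts at the final occurrence. A minimal standalone sketch of the difference (the variable names are illustrative, not from BaseCore):

```python
# Illustrative only; `text` and `marker` are not names from BaseCore.
text = "header..marker..body..marker..tail"
marker = "marker"

first = text.find(marker)   # index of the first occurrence
last = text.rfind(marker)   # index of the last occurrence

print(text[first:])  # old behavior: keeps everything from the first marker on
print(text[last:])   # new behavior: keeps only the final marker and what follows
```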
@@ -425,11 +425,18 @@ class BaseCore:
         IP = socket.gethostbyname(socket.gethostname())
         return IP

+    def mkPath(self,path):
+        folder = os.path.exists(path)
+        if not folder:  # check whether the folder exists; create it if it does not
+            os.makedirs(path)  # makedirs also creates any missing parent directories
+        else:
+            pass
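A side note on the added `mkPath`: an exists-check followed by `makedirs` can race if another process creates the directory in between. The standard library collapses both steps into one call; a sketch:

```python
import os

def mkPath(path):
    # exist_ok=True makes makedirs a no-op when the directory already exists,
    # replacing the separate os.path.exists() check above
    os.makedirs(path, exist_ok=True)
```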
     # Build a simulated Google Chrome browser; the chromedriver location must be passed in
     # headless decides whether to run as a headless browser; the initial default is headless
     # A normal browser is useful when first working out how to parse a page, and for sites that cannot be collected headless
     # A headless browser keeps windows from popping up during subsequent collection
     def buildDriver(self, path, headless=True):
         service = Service(path)
         chrome_options = webdriver.ChromeOptions()
         if headless:
@@ -442,7 +449,7 @@ class BaseCore:
         chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
         # 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
-        driver = webdriver.Chrome(chrome_options=chrome_options, service=service)
+        driver = webdriver.Chrome(options=chrome_options, service=service)
         # with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
         #     js = f.read()
         #
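The `chrome_options=` keyword was deprecated in Selenium 4 in favor of `options=`, which is what this hunk adopts. A minimal self-contained sketch of the same construction (the chromedriver path and user-agent string below are placeholders, not values from this repo):

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(r'C:\tools\chromedriver.exe')        # placeholder path
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')              # run without a visible window
chrome_options.add_argument('user-agent=Mozilla/5.0')  # placeholder UA string

# Selenium 4 style: pass `options=`, not the deprecated `chrome_options=`
driver = webdriver.Chrome(service=service, options=chrome_options)
```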
@@ -586,3 +593,4 @@ class BaseCore:
 import json
 import json
 import time
-import numpy as np
-import pandas as pd
-import pymysql
 import requests
+import sys
 from bs4 import BeautifulSoup
 from kafka import KafkaProducer
-from NewsYahoo import news
-from base.BaseCore import BaseCore
+sys.path.append(r'F:\zzsn\zzsn_spider\base')
+import BaseCore
 import urllib3

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 taskType = '企业基本信息/雅虎财经'
-baseCore = BaseCore()
+baseCore = BaseCore.BaseCore()
 r = baseCore.r
 log = baseCore.getLogger()
 headers = {
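With a plain `import BaseCore`, the module object is what gets bound, so the class has to be reached as an attribute; that is why the constructor call changes to `BaseCore.BaseCore()`. A sketch of the two equivalent spellings:

```python
import sys
sys.path.append(r'F:\zzsn\zzsn_spider\base')

import BaseCore                  # binds the module; the class lives at BaseCore.BaseCore
baseCore = BaseCore.BaseCore()

# the replaced package-style import bound the class directly:
# from base.BaseCore import BaseCore
# baseCore = BaseCore()
```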
@@ -38,7 +34,7 @@ headers = {

 # Fetch a company's basic info and executive info from its stock ticker
-def getInfo(name,enname,gpdm, xydm, start):
+def getInfo(enname, gpdm, xydm, start):
     if 'HK' in str(gpdm):
         tmp_g = str(gpdm).split('.')[0]
         if len(tmp_g) == 5:
@@ -49,17 +45,9 @@ def getInfo(name,enname,gpdm, xydm, start):
         gpdm_ = gpdm
     retData = {}
     retData['base_info'] = {
-        '公司名称': name,
+        '公司名称': enname,
         '英文名': enname,
         '信用代码': xydm,
-        '股票代码': gpdm,
-        '地址': '',
-        '电话': '',
-        '公司网站': '',
-        '部门': '',
-        '行业': '',
-        '员工人数': '',
-        '公司简介': ''
     }
     retData['people_info'] = []
     # https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
@@ -76,22 +64,36 @@ def getInfo(name,enname,gpdm, xydm, start):
             log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")
         except:
             continue
-    if (response.status_code == 200):
-        pass
-    else:
+    try:
+        if 'lookup' in response.url:
+            log.error(f"{gpdm}------股票代码错误:{response.status_code}")
+            exeception = '股票代码错误'
+            state = 1
+            takeTime = baseCore.getTimeCost(start, time.time())
+            baseCore.recordLog(xydm, taskType, 0, takeTime, url, exeception)
+            return [state, retData]
+        elif response.status_code != 200:
             log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
             exeception = '获取基本信息接口返回失败'
             state = 0
             takeTime = baseCore.getTimeCost(start, time.time())
             baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
-        rePutIntoR('')
-        return [state,retData]
+            baseCore.rePutIntoR('BaseInfoEnterprise:gwqy_socialCode', xydm)
+            return [state, retData]
+    except:
+        log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{response.status_code}")
+        exeception = '获取基本信息接口返回失败'
+        state = 0
+        takeTime = baseCore.getTimeCost(start, time.time())
+        baseCore.recordLog(xydm, taskType, state, takeTime, url, exeception)
+        baseCore.rePutIntoR('BaseInfoEnterprise:gwqy_socialCode', xydm)
+        return [state, retData]
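The new check keys off the response URL rather than the status code: Yahoo Finance answers an unknown ticker by redirecting to its lookup/search page, and since requests follows redirects by default, `response.url` holds the final address. A standalone sketch of that detection idea (Yahoo may also require browser-like headers; this mirrors the committed `'lookup' in response.url` test):

```python
import requests

def ticker_exists(gpdm: str) -> bool:
    # the quote/profile URL pattern is the one used in this file
    url = f'https://finance.yahoo.com/quote/{gpdm}/profile?p={gpdm}'
    resp = requests.get(url, timeout=10)
    # an unknown ticker gets redirected to the lookup page, so the final URL,
    # not the status code, carries the signal
    return 'lookup' not in resp.url
```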
     state = 1
     soup = BeautifulSoup(response.content, 'html.parser')
     page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
-    name = page.find('h3',{'class':'Fz(m) Mb(10px)'}).text
+    name = page.find('h3', {'class': 'Fz(m) Mb(10px)'}).text
     try:
         com_info = page.find('div', {'class': 'Mb(25px)'})
     except:
@@ -126,7 +128,7 @@ def getInfo(name,enname,gpdm, xydm, start):
         com_jianjie = ''
     dic_com_info = {
         '公司名称': name,
-        '英文名': enname,
+        '英文名': name,
         '信用代码': xydm,
         '股票代码': gpdm,
         '地址': com_address,
@@ -189,12 +191,13 @@ def getInfo(name,enname,gpdm, xydm, start):
     retData['people_info'] = retPeople
     log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
     response.close()
-    return [state,retData]
+    return [state, retData]

 # Save the basic info
-def saveBaseInfo(info,start):
+def saveBaseInfo(info, start):
     # Send the basic info to Kafka
+    try:
         company_dict = {
             'name': info['base_info']['公司名称'],  # company name
             'shortName': '',  # short company name
@@ -207,6 +210,12 @@ def saveBaseInfo(info,start):
             'address': info['base_info']['地址'],  # address
             'status': 0,  # status
         }
+    except:
+        company_dict = {
+            'name': info['base_info']['公司名称'],  # company name
+            'socialCreditCode': info['base_info']['信用代码'],  # unified social credit code
+            'englishName': info['base_info']['英文名'],  # English name
+        }
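The fallback dict exists because the empty-ticker path builds a `base_info` with only three keys, so building the full dict raises `KeyError`. An alternative that avoids exception-driven control flow would be `dict.get` with defaults; a hypothetical sketch (`build_company_dict` is not a function in this repo):

```python
def build_company_dict(base_info: dict) -> dict:
    # missing keys fall back to '' instead of raising KeyError
    return {
        'name': base_info.get('公司名称', ''),              # company name
        'shortName': '',                                    # short name
        'socialCreditCode': base_info.get('信用代码', ''),  # unified social credit code
        'englishName': base_info.get('英文名', ''),         # English name
        'address': base_info.get('地址', ''),               # empty when the ticker was missing
        'status': 0,
    }
```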
     # print(company_dict)
     producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
     kafka_result = producer.send("regionInfo", json.dumps(company_dict, ensure_ascii=False).encode('utf8'))
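`producer.send` in kafka-python is asynchronous and returns a future, so a failed delivery stays silent unless the future is resolved. A sketch of a checked send (topic and broker address are the ones in this diff; the payload is a dummy):

```python
import json
from kafka import KafkaProducer
from kafka.errors import KafkaError

producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
future = producer.send("regionInfo",
                       json.dumps({'name': 'demo'}, ensure_ascii=False).encode('utf8'))
try:
    metadata = future.get(timeout=10)   # blocks until acked; raises on failure
    print(metadata.topic, metadata.partition, metadata.offset)
except KafkaError as e:
    print('delivery failed:', e)
finally:
    producer.close()
```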
@@ -216,7 +225,7 @@ def saveBaseInfo(info,start):

 # Save the executive info
-def savePeopleInfo(info,start):
+def savePeopleInfo(info, start):
     # Push the executive info through the interface
     list_people = info['people_info']
     list_one_info = []
@@ -240,6 +249,7 @@ def savePeopleInfo(info,start):
     json_updata = json.dumps(list_one_info)
     # print(json_updata)
     if json_updata == '[]':
+        log.info("没有高管")
         pass
     else:
         for i in range(0, 3):
@@ -274,18 +284,6 @@ def savePeopleInfo(info,start):
     return state

-def rePutIntoR(item):
-    r.rpush('BaseInfoEnterprise:gwqy_socialCode', item)
-
-# def getInfomation(social_code):
-#     sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
-#     cursor.execute(sql)
-#     data = cursor.fetchone()
-#     return data
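The module-level `rePutIntoR` goes away because the call sites now use `baseCore.rePutIntoR(key, item)` with the queue key passed explicitly. The BaseCore implementation is not shown in this commit; inferred from the call sites, it is presumably the same redis `rpush` with the key parameterized, roughly:

```python
# Assumed shape only -- BaseCore's actual source is not part of this diff.
import redis

class BaseCore:
    def __init__(self):
        self.r = redis.Redis(host='127.0.0.1', port=6379, db=0)  # placeholder connection

    def rePutIntoR(self, key, item):
        # push the social credit code back onto the named task queue for a retry
        self.r.rpush(key, item)
```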
 # Collection worker
 def beginWork():
     while True:
@@ -298,37 +296,42 @@ def beginWork():
             continue
         # Fetch the basic info from the database
         data = baseCore.getInfomation(social_code)
-        name = data[1]
         enname = data[5]
-        gpdm = data[3]
+        gpdm = '0123'
         xydm = data[2]
         # Number of times this company has already been collected for this project
         count = data[13]
         start_time = time.time()
         # Skip when the stock ticker is empty
-        if gpdm is None:
-            log.error(f"{name}--股票代码为空 跳过")
-            exception = '股票代码为空'
+        if gpdm == '':
+            info = {"base_info": {'公司名称': enname,'英文名': enname,'信用代码': xydm, }}
+            log.error(f'{xydm}....股票代码为空')
+            try:
+                saveBaseInfo(info, start_time)
+            except:
+                log.error(f'{enname}....企业基本信息Kafka操作失败')
+                exception = 'Kafka操作失败'
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
-            continue
-        try:
-            retData = getInfo(name,enname,gpdm, xydm, start_time)
+        else:
+            try:
+                retData = getInfo(enname, gpdm, xydm, start_time)
                 # Only persist when the basic info was collected successfully
                 if retData[0] == 1:
                     # Persist the company's basic info
                     try:
-                        saveBaseInfo(retData[1],start_time)
+                        saveBaseInfo(retData[1], start_time)
                     except:
-                        log.error(f'{name}....企业基本信息Kafka操作失败')
+                        log.error(f'{enname}....企业基本信息Kafka操作失败')
                         exception = 'Kafka操作失败'
                         state = 0
                         takeTime = baseCore.getTimeCost(start_time, time.time())
                         baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
                     # Persist the executive info
-                    state = savePeopleInfo(retData[1],start_time)
+                    state = savePeopleInfo(retData[1], start_time)
                     # The company counts as collected only when both the basic info and the executives were captured
                     if state == 1:
                         takeTime = baseCore.getTimeCost(start_time, time.time())
@@ -340,7 +343,7 @@ def beginWork():
             except Exception as e:
                 # If a previously unseen error occurs, record the message and the line it happened on
                 ee = e.__traceback__.tb_lineno
-                log.error(f'{name}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}')
+                log.error(f'{enname}...{xydm}...{gpdm}.....数据采集失败,原因:{ee}行 {e}')
                 state = 0
                 takeTime = baseCore.getTimeCost(start_time, time.time())
                 baseCore.recordLog(xydm, taskType, state, takeTime, '', f'数据采集失败,原因:{ee}行 {e}')
@@ -348,15 +351,11 @@ def beginWork():
         # Collection finished for this company; increment its run count
         count += 1
         runType = 'BaseInfoRunCount'
-        baseCore.updateRun(social_code,runType,count)
+        baseCore.updateRun(social_code, runType, count)
         # Release resources
         baseCore.close()

 if __name__ == '__main__':
-    cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji',charset='utf8mb4')
-    cursor = cnx.cursor()
     beginWork()
-    cursor.close()
-    cnx.close()
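The module-level connection and cursor can be dropped because database access evidently lives inside BaseCore now (`getInfomation`, `recordLog`, `updateRun`). Where a standalone connection is still wanted, a context-managed cursor avoids the manual close calls; a sketch reusing the credentials from the deleted lines:

```python
import pymysql

connection = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988',
                             db='caiji', charset='utf8mb4')
try:
    # pymysql cursors support the context-manager protocol
    with connection.cursor() as cursor:
        cursor.execute("SELECT 1")
        print(cursor.fetchone())
finally:
    connection.close()
```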
+from urllib.parse import urljoin
 import langid
 import pymysql
@@ -407,12 +408,45 @@ class GoogleSpider(object):
             else:
                 break
             # time.sleep(5)

+    def rmTagattr(self,html,url):
+        # Parse the page content with BeautifulSoup
+        # soup = BeautifulSoup(html, 'html.parser')
+        soup = self.paserUrl(html,url)
+        # Walk every tag and drop its attributes
+        for tag in soup.find_all(True):
+            if tag.name == 'img':
+                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
+            elif tag.name != 'img':
+                tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
+            else:
+                tag.attrs = {key: value for key, value in tag.attrs.items()}
+        # Print the page content with the attributes removed
+        # print(soup.prettify())
+        html = soup.prettify()
+        return html
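One observation on `rmTagattr` as committed: the `elif tag.name != 'img'` branch applies the same src-only filter to every non-img tag, so the final `else` is unreachable and attributes such as the `href` values just resolved by `paserUrl` get stripped again. A sketch of what the branching appears to intend (a guess at intent, not the committed behavior): keep `src` on images, keep `href` on links, drop everything else:

```python
from bs4 import BeautifulSoup

def rm_tag_attrs(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(True):
        if tag.name == 'img':
            tag.attrs = {k: v for k, v in tag.attrs.items() if k == 'src'}
        elif tag.name == 'a':
            tag.attrs = {k: v for k, v in tag.attrs.items() if k == 'href'}
        else:
            tag.attrs = {}  # strip everything on other tags
    return soup.prettify()

print(rm_tag_attrs('<a href="/x" class="btn"><img src="/i.png" width="9"></a>'))
```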
+    # Rewrite the relative addresses in the html into absolute addresses
+    def paserUrl(self,html,listurl):
+        soup = BeautifulSoup(html, 'html.parser')
+        # Collect all <a> and <img> tags
+        links = soup.find_all(['a', 'img'])
+        # Walk the tags and resolve each relative address against the page URL
+        for link in links:
+            if 'href' in link.attrs:
+                link['href'] = urljoin(listurl, link['href'])
+            elif 'src' in link.attrs:
+                link['src'] = urljoin(listurl, link['src'])
+        return soup
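`urljoin` resolves each `href`/`src` against the page URL and leaves already-absolute addresses untouched, which is what makes `paserUrl` safe to run over mixed content. A quick illustration:

```python
from urllib.parse import urljoin

base = 'https://example.com/news/page.html'
print(urljoin(base, '/img/logo.png'))        # https://example.com/img/logo.png
print(urljoin(base, 'story.html'))           # https://example.com/news/story.html
print(urljoin(base, 'https://other.org/a'))  # absolute URLs pass through unchanged
```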
     # Fetch the detail content of an article
     def getDetailmsg(self,detailmsg):
         try:
             detailurl=detailmsg['detailUrl']
             title = detailmsg['title']
             content,contentWithTag=self.extractorMsg(detailurl,title)
+            contentWithTag=self.rmTagattr(contentWithTag)
         except Exception as e:
             content=''
             contentWithTag=''
......
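Worth flagging in the hunk above: `rmTagattr` is declared as `rmTagattr(self, html, url)`, but the added call passes only `contentWithTag`, so it should raise a `TypeError` at runtime and drop into the `except` branch, emptying the content. A call that matches the signature would presumably look like this inside `getDetailmsg` (a sketch, not the committed line):

```python
# detailurl gives paserUrl a base URL for resolving relative links
contentWithTag = self.rmTagattr(contentWithTag, detailurl)
```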
@@ -122,6 +122,7 @@ class SougouSpider(object):
             "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
         }
         # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
+        url=f"https://www.sogou.com{url}"
         res = requests.get(url,headers=header)
         text=res.text
         # Define the regular expression
......
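The added line prefixes Sogou's relative `/link?url=...` result paths with the site origin. `urljoin` achieves the same and also tolerates a value that is already absolute; a sketch:

```python
from urllib.parse import urljoin

# equivalent to url = f"https://www.sogou.com{url}" for relative paths,
# but a no-op when url is already absolute
url = urljoin('https://www.sogou.com', '/link?url=hedJjaC2...')
print(url)  # https://www.sogou.com/link?url=hedJjaC2...
```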