Commit 7715292a  Author: 刘伟刚

Dynamic data collection for TradingView

Parent d82da41e
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-4-9
import datetime
import xlrd
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm
import pymongo
import pymysql
import time
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
import json
from requests.packages import urllib3
urllib3.disable_warnings()
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
from BaseCore import BaseCore
import configparser
import redis
"""
将请求链接分成三个
https://finance.yahoo.com/quote/WMT/financials?p=WMT
https://finance.yahoo.com/quote/WMT/balance-sheet?p=WMT
https://finance.yahoo.com/quote/WMT/cash-flow?p=WMT
雅虎财务数据流程修改
1.从sys_base_enterprise_ipo获取到国外上市和台湾企业的股票代码 category 5,6
2.从雅虎财经上请求获取财务数据和币种单位。
3.将数据插入更新到表config_finance_data_sync
信息更新的字段
social_credit_code
name
stock_code
content
level_relation
unit
create_time
4.将采集结果通知接口进行拉取数据处理
"""
class YahooCaiwu(object):
def __init__(self):
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=6)
self.driver=self.get_webdriver()
# 雅虎财经处理表格
def deal_table(self,doc_resp):
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
if pq(doc_item).text() == '':
continue
a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
content_dict[a] = a_list
b_dict = {}
for doc_item1 in pq(doc_item).children()[1]:
b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
if not b:
continue
b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
content_dict[b] = b_list
c_dict = {}
for doc_item2 in pq(doc_item1).children()[1]:
c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
if not c:
continue
c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
content_dict[c] = c_list
d_dict = {}
for doc_item3 in pq(doc_item2).children()[1]:
d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
if not d:
continue
d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
content_dict[d] = d_list
e_dict = {}
for doc_item4 in pq(doc_item3).children()[1]:
e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
if not e:
continue
e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
content_dict[e] = e_list
f_dict = {}
for doc_item5 in pq(doc_item4).children()[1]:
f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
if not f:
continue
f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
content_dict[f] = f_list
g_dict = {}
for doc_item6 in pq(doc_item5).children()[1]:
g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
if not g:
continue
g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
content_dict[g] = g_list
g_dict[g] = {}
f_dict[f] = g_dict
e_dict[e] = f_dict
d_dict[d] = e_dict
c_dict[c] = d_dict
b_dict[b] = c_dict
catalogue_dict[a] = b_dict
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
return all_dict
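# Illustrative shape of the dict returned by deal_table above (row names are assumed
# examples; the real keys depend on the statement rendered on the Yahoo Finance page):
# {
#     '表头': ['Breakdown', 'TTM', '12/31/2022', ...],
#     '目录': {'Total Revenue': {'Operating Revenue': {}, ...}, ...},
#     '内容': {'Total Revenue': ['611,289,000', ...], 'Operating Revenue': [...], ...}
# }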
def get_webdriver(self):
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
# chrome_options.add_argument('--headless')
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
executable_path =self.config.get('selenium', 'chrome_driver')
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
return driver
def conn11(self):
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='caiji', passwd='f7s0&7qqtK', db='clb_project',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def getCodeFromRedis(self):
securitiescode=self.r.lpop('NoticeEnterprise:securities_code')
# lpop returns None when the queue is empty; decode only when a value was actually popped
securitiescode = securitiescode.decode('utf-8') if securitiescode else None
return securitiescode
# 雅虎财经
def get_content2(self):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' """ # and stock_code = "SYNH"
# sql1 = f"select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='{securitiescode}' " # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
except Exception as e:
self.logger.info("数据查询异常!")
for data in result_data:
try:
data_list = list(data)
print(data_list)
social_credit_code = data_list[0]
stock = data_list[1]
securities_short_name = data_list[2] if data_list[2] is not None else ""
# content_sql = ''
self.logger.info(f"需要采集的股票代码{securities_short_name}")
stock2=str(stock)
if stock2.upper().endswith("HK") and stock2.upper().startswith("0") :
stock2=stock2[1:]
url = f'https://finance.yahoo.com/quote/{stock2}/financials?p={stock2}'
try:
print(f'正在采集:{url}')
self.driver.get(url)
# 等待页面加载完成
wait = WebDriverWait(self.driver, 300)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
try:
# driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
self.driver.find_element(By.XPATH,'//div[@id="Col1-1-Financials-Proxy"]/section/div[2]/button/div/span[text()="Expand All"]').click()
wait = WebDriverWait(self.driver, 60)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)
except Exception as e:
print(e)
pass
doc_resp1 = pq(self.driver.page_source)
unit=self.get_unit(doc_resp1)
financials1 = self.deal_table(doc_resp1)
self.driver.find_element(By.XPATH,'//div/span[text()="Quarterly"]').click()
time.sleep(5)
try:
self.driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
time.sleep(5)
except Exception as e:
print(e)
pass
doc_resp2 = pq(self.driver.page_source)
financials2 = self.deal_table(doc_resp2)
self.driver.find_element(By.XPATH,'//div/span[text()="Balance Sheet"]').click()
time.sleep(5)
try: #//*[@id="Col1-1-Financials-Proxy"]/section/div[2]/button/div/span
self.driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
time.sleep(5)
except Exception as e:
print(e)
pass
doc_resp3 = pq(self.driver.page_source)
financials3 = self.deal_table(doc_resp3)
self.driver.find_element(By.XPATH,'//div/span[text()="Quarterly"]').click()
time.sleep(5)
try:
self.driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
time.sleep(5)
except Exception as e:
print(e)
pass
doc_resp4 = pq(self.driver.page_source)
financials4 = self.deal_table(doc_resp4)
self.driver.find_element(By.XPATH,'//div/span[text()="Cash Flow"]').click()
time.sleep(5)
try:
self.driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
time.sleep(5)
except Exception as e:
print(e)
pass
doc_resp5 = pq(self.driver.page_source)
financials5 = self.deal_table(doc_resp5)
self.driver.find_element(By.XPATH,'//div/span[text()="Quarterly"]').click()
time.sleep(5)
try:
self.driver.find_element(By.XPATH,'//div/span[text()="Expand All"]').click()
time.sleep(5)
except Exception as e:
print(e)
pass
doc_resp6 = pq(self.driver.page_source)
financials6 = self.deal_table(doc_resp6)
financials_dict = {
'表1': financials1,
'表2': financials2,
'表3': financials3,
'表4': financials4,
'表5': financials5,
'表6': financials6,
}
mu_lus = ''
for i in range(1, 7):
mu_lu = financials_dict[f'表{i}']['目录']
mu_lu = json.dumps(mu_lu, ensure_ascii=False, indent=4)
mu_lus += mu_lu + '&&&&'
level_relation = mu_lus[:-4]
financials = ''
for i in range(1, 7):
a_list = financials_dict[f'表{i}']['表头']
for a in a_list:
financials += a + '\n'
b_dict = financials_dict[f'表{i}']['内容']
for key, values in b_dict.items():
financials += key + '\n'
for b in values:
financials += b + '\n'
financials += '&&&&' + '\n'
financials = financials.strip()
content = financials[:-4].strip().replace('\n&&&&\n', '&&&&')
# if content[:100] in str(content_sql).replace("\\n","\n"):
# print(f"{orc_id}:无最新数据")
# continue
# sql = "UPDATE config_finance_data_sync SET level_relation=%s, content=%s WHERE ID = %s"
# val = (level_relation, content, orc_id)
# cursor.execute(sql, val)
# conn.commit()
ipo_data={
'social_credit_code':social_credit_code,
'stock':stock,
'securities_short_name':securities_short_name,
'content':content,
'level_relation':level_relation,
'unit':unit,
'origin_type':1
}
flag=self.dataToSql(conn,cursor,ipo_data)
if flag:
# get_url = f'http://192.168.1.49:8088/sync/finance/yh?securitiesCode={stock}'
get_url = f'http://114.115.236.206:8088/sync/finance/yh?securitiesCode={stock}'
try:
resp = requests.get(get_url)
print(resp.text)
print('调用接口成功!!')
except:
with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8')as f:
f.write(stock + '\n')
except Exception as e:
print(e)
print(f'采集:{url}失败')
self.driver.quit()
time.sleep(10)
self.driver=self.get_webdriver()
except:
# time.sleep(60 * 10)
self.driver.quit()
time.sleep(10)
self.driver=self.get_webdriver()
print('出错,重试中!')
continue
# self.driver.close()
def dataToSql(self,conn,cursor,ipo_data):
try:
social_credit_code=ipo_data['social_credit_code']
stock=ipo_data['stock']
securities_short_name=ipo_data['securities_short_name']
content=ipo_data['content']
level_relation=ipo_data['level_relation']
unit=ipo_data['unit']
origin_type=ipo_data['origin_type']
if len(unit) == 0:
return False
if len(content) == 0:
return False
if len(level_relation) == 0:
return False
# 检查记录是否存在
select_sql=f"SELECT * FROM config_finance_data_sync WHERE stock_code='{stock}'"
cursor.execute(select_sql)
existing_record = cursor.fetchone()
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
if existing_record:
# 记录已存在,执行更新操作
update_param=(social_credit_code,content,level_relation,origin_type,currentdate,stock)
update_sql=f"UPDATE config_finance_data_sync SET social_credit_code=%s , content=%s , level_relation=%s,origin_type=%s ,create_time=%s WHERE stock_code=%s "
cursor.execute(update_sql,update_param)
print('更新成功')
else:
insert_param=(social_credit_code,content,level_relation,unit,stock,origin_type,currentdate)
insert_sql=f"INSERT INTO config_finance_data_sync (social_credit_code, content,level_relation,unit,stock_code,origin_type,create_time) VALUES ( %s,%s, %s, %s, %s, %s, %s)"
# 记录不存在,执行插入操作
cursor.execute(insert_sql,insert_param)
print('插入成功')
# 提交事务
conn.commit()
except Exception as e:
return False
return True
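# The method above behaves as an upsert keyed on stock_code: an existing row has its
# content / level_relation / origin_type / create_time refreshed, otherwise a new row
# (including the unit column) is inserted; empty unit/content/level_relation is rejected.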
def get_unit(self,doc_resp):
try:
resp1_table = doc_resp('#quote-header-info >div:nth-child(2)>div:nth-child(1)>div:nth-child(2)>span')
currency = pq(resp1_table[0]).text()
if 'Currency in' in currency:
result = re.findall(r'(?<=Currency in\s).*', currency)
currency=result[0]
if '(' in currency:
currency=currency.split('(')[0]
currency=str(currency).upper()+'(千)'
except Exception as e:
currency=''
return currency
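# Illustrative example for get_unit (assumed header text): a quote header such as
# "NasdaqGS - NasdaqGS Real Time Price. Currency in USD" yields "USD", which is
# returned as "USD(千)", i.e. thousands of the detected currency.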
#对比指标计算
def calculateIndexReq(self):
get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
try:
params={
'type':2
}
resp = requests.get(get_url,params=params)
print(resp.text)
text=json.loads(resp.text)
codee=text['code']
while codee==-200:
time.sleep(600)
# keep the same query parameters on the retry as on the first request
resp = requests.get(get_url,params=params)
print(resp.text)
text=json.loads(resp.text)
codee=text['code']
if codee==-200:
break
print('调用接口成功!!')
except:
print('调用失败!')
if __name__ == '__main__':
# parse_excel()
#get_content1()
yahoo=YahooCaiwu()
yahoo.get_content2()
yahoo.calculateIndexReq()
import pymysql
import requests
import json
import time
import datetime
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from BaseCore import BaseCore
baseCore=BaseCore()
logger=baseCore.getLogger()
def reqmsg(offset,operand):
header={
'Connection':'keep-alive',
'Content-Length':'268',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-platform':'"Windows"',
'sec-ch-ua-mobile':'?0',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Content-Type':'application/json',
'Accept':'*/*',
'Origin':'https://finance.yahoo.com',
'Sec-Fetch-Site':'same-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://finance.yahoo.com/screener/unsaved/ff7f1d36-5088-4986-b7ed-4c7ee4f7af57?count=100&offset=0',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'tbla_id=24af1dd7-fb87-4d22-a0cf-7c313202694c-tuct86184e4; gpp=DBAA; gpp_sid=-1; OTH=v=2&s=2&d=eyJraWQiOiIwMTY0MGY5MDNhMjRlMWMxZjA5N2ViZGEyZDA5YjE5NmM5ZGUzZWQ5IiwiYWxnIjoiUlMyNTYifQ.eyJjdSI6eyJndWlkIjoiMkZJTDdIQjVZVFcyN1NLWVFIQjNOTFU0RDQiLCJwZXJzaXN0ZW50Ijp0cnVlLCJzaWQiOiJZbXE5VW82MkRsd0QifX0.sVtMHG-HjplldMiy1GXA1thZlnwJusoLr5vAjuHOGIppAWCgZgTz4HuUAB4weZAfICtLge3MZbDnfnIDuHSm620aQ-8lc9RbpQ0_YtWbn50lbi13EgxHuDs7IDvozIqZ7Wji4DldEHGMezxWOqwzG6HeiWdu51gngtC0wYXtKGM; T=af=JnRzPTE2OTY3Mjc4NDAmcHM9THVnWDRYeE11MUNhSXkxcHFOb0pjUS0t&d=bnMBeWFob28BZwEyRklMN0hCNVlUVzI3U0tZUUhCM05MVTRENAFhYwFBRWVLWnVsbQFhbAF3ZWlnYW5nbGl1MTFAZ21haWwuY29tAXNjAW1icl9yZWdpc3RyYXRpb24BZnMBeGI5ODZlUmxJZ01nAXp6AWdNZ0lsQkE3RQFhAVFBRQFsYXQBZ01nSWxCAW51ATA-&kt=EAApyOTr6JKep3_MVia32x9bA--~I&ku=FAAfpFFRxC0lNQCwURloTuGtPI.ZMaMkip9vcBGgEYFfK9jscSEGovt9tf6JPudIpJ1LGwPF8XPDrQGyLdzpK0WeyodXshfKU_VWmF7zaHgEKwVTP6eyxJagSsjv_f.k4KH4UemJUDrEv6AlrYlxgrVtqn8oRdc0E6dse7_A.dyKxk-~E; F=d=GTRbFBs9vIsFBPqpFYbEBjWkFoKZ3VYPcOZjc86puK_qeukJy9prU1z2; PH=l=en-US; Y=v=1&n=4e4ri71j7l6on&l=lj3pdsegobxk43l22ihvwfahc77wvm4u4aao9056/o&p=n2svvhk00000000&r=1ce&intl=us; axids=gam=y-dy_7BtBG2uKuxS_spt7cbrFy7QoSkfWRrfb.CnwY_FJGvFqjAA---A&dv360=eS1Sci5ZRTQ5RTJ1RXBvZkRrM1JMM1pzM1Z0LlBqSkhYdEZnRUY1cmVqRVZWdGo4Z3hjM0NiMnVnTE5YVGFibjYzcFROS35B; gam_id=y-dy_7BtBG2uKuxS_spt7cbrFy7QoSkfWRrfb.CnwY_FJGvFqjAA---A; GUC=AQEACAJlI0llUkIdBwRd&s=AQAAAIsCEds8&g=ZSIDMg; A1=d=AQABBFnbm2QCEJD73Orb0UPzH5ts62DMAwYFEgEACAJJI2VSZdyia3sB_eMBAAcIKP9nYQIMhisIDy8f80k8wbm5XkoLgG-PKwkBBwoBCg&S=AQAAAo9aGFBw2wIEVJoc_Tspjbw; A3=d=AQABBFnbm2QCEJD73Orb0UPzH5ts62DMAwYFEgEACAJJI2VSZdyia3sB_eMBAAcIKP9nYQIMhisIDy8f80k8wbm5XkoLgG-PKwkBBwoBCg&S=AQAAAo9aGFBw2wIEVJoc_Tspjbw; cmp=t=1696898974&j=0&u=1YNN; PRF=t%3DLMT%252BAAPL%252BAAPL.BA%252BTM.BA%252B%255EIXIC%252BISP%252B032830.KS%252BABG.JO%252BWCC%252BAHT.L%252BCPB%252BVMUK.L%252BRAJESHEXPO.NS%252B8128.HK%252B5019.T%26newChartbetateaser%3D1; __gpi=UID=00000c5bc07ce289:T=1696900616:RT=1696900616:S=ALNI_MZVy68LYVM9slK8cg6vB3OeE3-uvw; A1S=d=AQABBFnbm2QCEJD73Orb0UPzH5ts62DMAwYFEgEACAJJI2VSZdyia3sB_eMBAAcIKP9nYQIMhisIDy8f80k8wbm5XkoLgG-PKwkBBwoBCg&S=AQAAAo9aGFBw2wIEVJoc_Tspjbw',
}
data={
"sortType": "DESC",
"sortField": "intradaymarketcap",
"quoteType": "EQUITY",
"offset": offset,
"query": {
"operator": "and",
"operands": [
{
"operator": "or",
"operands": [operand]
}
]
},
"size": 100,
"userId": "2FIL7HB5YTW27SKYQHB3NLU4D4",
"userIdType": "guid"
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
url='https://query2.finance.yahoo.com/v1/finance/screener?crumb=sZDWS3KsXAl&lang=en-US&region=US&formatted=true&corsDomain=finance.yahoo.com'
for i in range(0,3):
try:
response=requests.post(url=url,data=json.dumps(data),headers=header,verify=False,timeout=20,proxies=proxy)
stockmsg=response.json()
except Exception as e:
stockmsg=''
logger.info(f"第offset={offset}页请求失败{e}")
if stockmsg:
logger.info(f"第offset={offset}页请求成功")
break
return stockmsg
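# Assumed shape of the screener response consumed by listPage/getStock below:
# {
#   "finance": {
#     "result": [
#       {
#         "total": 1234,
#         "quotes": [
#           {"symbol": "AAPL", "longName": "...", "exchange": "...",
#            "fullExchangeName": "...", "financialCurrency": "...",
#            "market": "...", "shortName": "..."},
#           ...
#         ]
#       }
#     ]
#   }
# }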
def listPage():
operands=[
{
"operator": "EQ",
"operands": [
"region",
"ar"
]
},
{
"operator": "EQ",
"operands": [
"region",
"au"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ch"
]
},
{
"operator": "EQ",
"operands": [
"region",
"cn"
]
},
{
"operator": "EQ",
"operands": [
"region",
"de"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ee"
]
},
{
"operator": "EQ",
"operands": [
"region",
"at"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ca"
]
},
{
"operator": "EQ",
"operands": [
"region",
"be"
]
},
{
"operator": "EQ",
"operands": [
"region",
"cl"
]
},
{
"operator": "EQ",
"operands": [
"region",
"cz"
]
},
{
"operator": "EQ",
"operands": [
"region",
"dk"
]
},
{
"operator": "EQ",
"operands": [
"region",
"eg"
]
},
{
"operator": "EQ",
"operands": [
"region",
"fi"
]
},
{
"operator": "EQ",
"operands": [
"region",
"br"
]
},
{
"operator": "EQ",
"operands": [
"region",
"es"
]
},
{
"operator": "EQ",
"operands": [
"region",
"fr"
]
},
{
"operator": "EQ",
"operands": [
"region",
"gb"
]
},
{
"operator": "EQ",
"operands": [
"region",
"hk"
]
},
{
"operator": "EQ",
"operands": [
"region",
"id"
]
},
{
"operator": "EQ",
"operands": [
"region",
"gr"
]
},
{
"operator": "EQ",
"operands": [
"region",
"hu"
]
},
{
"operator": "EQ",
"operands": [
"region",
"il"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ie"
]
},
{
"operator": "EQ",
"operands": [
"region",
"it"
]
},
{
"operator": "EQ",
"operands": [
"region",
"in"
]
},
{
"operator": "EQ",
"operands": [
"region",
"kr"
]
},
{
"operator": "EQ",
"operands": [
"region",
"is"
]
},
{
"operator": "EQ",
"operands": [
"region",
"jp"
]
},
{
"operator": "EQ",
"operands": [
"region",
"kw"
]
},
{
"operator": "EQ",
"operands": [
"region",
"lt"
]
},
{
"operator": "EQ",
"operands": [
"region",
"lk"
]
},
{
"operator": "EQ",
"operands": [
"region",
"lv"
]
},
{
"operator": "EQ",
"operands": [
"region",
"mx"
]
},
{
"operator": "EQ",
"operands": [
"region",
"nl"
]
},
{
"operator": "EQ",
"operands": [
"region",
"nz"
]
},
{
"operator": "EQ",
"operands": [
"region",
"my"
]
},
{
"operator": "EQ",
"operands": [
"region",
"no"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ph"
]
},
{
"operator": "EQ",
"operands": [
"region",
"pe"
]
},
{
"operator": "EQ",
"operands": [
"region",
"vn"
]
},
{
"operator": "EQ",
"operands": [
"region",
"us"
]
},
{
"operator": "EQ",
"operands": [
"region",
"tr"
]
},
{
"operator": "EQ",
"operands": [
"region",
"sr"
]
},
{
"operator": "EQ",
"operands": [
"region",
"za"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ve"
]
},
{
"operator": "EQ",
"operands": [
"region",
"th"
]
},
{
"operator": "EQ",
"operands": [
"region",
"sg"
]
},
{
"operator": "EQ",
"operands": [
"region",
"tw"
]
},
{
"operator": "EQ",
"operands": [
"region",
"sa"
]
},
{
"operator": "EQ",
"operands": [
"region",
"qa"
]
},
{
"operator": "EQ",
"operands": [
"region",
"pl"
]
},
{
"operator": "EQ",
"operands": [
"region",
"se"
]
},
{
"operator": "EQ",
"operands": [
"region",
"ru"
]
},
{
"operator": "EQ",
"operands": [
"region",
"pt"
]
},
{
"operator": "EQ",
"operands": [
"region",
"pk"
]
}
]
for operand in operands:
logger.info(f'采集地域股票信息{operand}')
#第一次请求获取地域总共有的股票代码数量
try:
stockmsg=reqmsg(0,operand)
total=stockmsg['finance']['result'][0]['total']
except Exception as e:
logger.info(f'region该地域没有股票信息{operand}')
continue
for i in range(0,total,100):
logger.info(f"offset的值{i}")
stockmsg=reqmsg(i,operand)
if stockmsg:
try:
getStock(stockmsg)
except Exception as e:
logger.info(f"解析失败{e}")
time.sleep(3)
def getStock(stockmsg):
quotes=stockmsg['finance']['result'][0]['quotes']
for quote in quotes:
symbol=quote['symbol']
try:
longName=quote['longName']
except:
longName=''
try:
exchange=quote['exchange']
except:
exchange=''
try:
fullExchangeName=quote['fullExchangeName']
except:
fullExchangeName=''
try:
financialCurrency=quote['financialCurrency']
except:
financialCurrency=''
try:
market=quote['market']
except:
market=''
try:
shortName=quote['shortName']
except:
shortName=''
quotmsg={
'symbol':symbol,
'longName':longName,
'exchange':exchange,
'fullExchangeName':fullExchangeName,
'financialCurrency':financialCurrency,
'market':market,
'shortName':shortName
}
dataToSql(quotmsg)
def conn144():
conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def dataToSql(quotmsg):
conn,cursor=conn144()
try:
symbol=quotmsg['symbol']
longName=quotmsg['longName']
exchange=quotmsg['exchange']
fullExchangeName=quotmsg['fullExchangeName']
financialCurrency=quotmsg['financialCurrency']
market=quotmsg['market']
shortName=quotmsg['shortName']
# 检查记录是否存在
select_sql=f"SELECT * FROM yahoostock WHERE symbol='{symbol}'"
cursor.execute(select_sql)
existing_record = cursor.fetchone()
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
if existing_record:
# 记录已存在,执行更新操作
# update_param=(symbol,longName,exchange,fullExchangeName,financialCurrency,market,shortName,currentdate)
# update_sql=f"UPDATE yahoostock SET social_credit_code=%s , content=%s , level_relation=%s,origin_type=%s ,create_time=%s WHERE stock_code=%s "
# cursor.execute(update_sql,update_param)
logger.info(f"股票代码已采集入库过{symbol}")
else:
insert_param=(symbol,longName,exchange,fullExchangeName,financialCurrency,market,shortName,currentdate)
insert_sql=f"INSERT INTO yahoostock (symbol, longName,exchange,fullExchangeName,financialCurrency,market,shortName,currentdate) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s)"
# 记录不存在,执行插入操作
cursor.execute(insert_sql,insert_param)
logger.info(f"{symbol}股票入库添加成功")
# 提交事务
conn.commit()
except Exception as e:
return False
return True
if __name__ == '__main__':
logger.info(f"采集开始")
try:
listPage()
except Exception as e:
logger.info(f"程序异常退出{e}")
logger.info(f"采集结束")
# -*- coding: utf-8 -*-
import os
import random
import sys
import time
import logbook
import logbook.more
# 核心工具包
import pymysql
import redis
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
# 序列号
__seq = 0
# 代理池 数据库连接
__cnx_proxy =None
__cursor_proxy = None
# agent 池
__USER_AGENT_LIST = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; Lunascape 5.0 alpha2)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.7 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Maxthon;',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040614 Firefox/0.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.500.0 Safari/534.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)',
'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.370.0 Safari/533.4',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.4.154.31 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.42 Safari/525.19',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.3 (KHTML, like Gecko) Chrome/4.0.227.0 Safari/532.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.463.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.694.0 Safari/534.24',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.4 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.682.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML, like Gecko) Chrome/7.0.531.0 Safari/534.9',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; de-DE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.127 Safari/533.4',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; zh-cn) Opera 8.50',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/7.0.0 Safari/700.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/0.9.2 StumbleUpon/1.994',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.7.5) Gecko/20041110 Firefox/1.0',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; en) Opera 8.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20040913 Firefox/0.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.8 (KHTML, like Gecko) Chrome/2.0.177.1 Safari/530.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T312461)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.0; rv:1.7.3) Gecko/20041001 Firefox/0.10.1',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.864.0 Safari/535.2',
'Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0',
'Mozilla/5.0 (Windows NT 5.1; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.697.0 Safari/534.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.548.0 Safari/534.10',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.652.0 Safari/534.17',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10 ChromePlus/1.5.2.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.0 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.2 Safari/533.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1',
'Mozilla/5.0 (Windows NT 6.0; rv:2.1.1) Gecko/20110415 Firefox/4.0.2pre Fennec/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; sv-SE; rv:1.7.5) Gecko/20041108 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE; rv:1.7.5) Gecko/20041122 Firefox/1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; uZardWeb/1.0; Server_JP)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; HCI0449; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 5.0; Windows 98; DigExt); Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1);',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; InfoPath.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0a1) Gecko/20110623 Firefox/7.0a1 Fennec/7.0a1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.724.100 Safari/534.30',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; WOW64; SV1; uZardWeb/1.0; Server_HK)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.2 (KHTML, like Gecko) Chrome/3.0.191.3 Safari/531.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.38 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8b) Gecko/20050118 Firefox/1.0+',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP; rv:1.7) Gecko/20040707 Firefox/0.9.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; nl-NL; rv:1.7.5) Gecko/20041202 Firefox/1.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/1.0.156.0 Safari/528.8',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.43 Safari/534.7',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13',
'Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.19 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-CA) AppleWebKit/534.13 (KHTML like Gecko) Chrome/9.0.597.98 Safari/534.13',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6',
'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.154.6 Safari/525.19',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.599.0 Safari/534.13',
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1b2pre) Gecko/20081015 Fennec/1.0a1',
'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
]
def close(self):
try:
self.__cursor_proxy.close()
self.__cnx_proxy.close()
except :
pass
def __init__(self):
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# 计算耗时
def getTimeCost(self,start, end):
seconds = int(end - start)
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
if (h > 0):
return "%d小时%d分钟%d秒" % (h, m, s)
elif (m > 0):
return "%d分钟%d秒" % (m, s)
elif (seconds > 0):
return "%d秒" % (s)
else:
ms = int((end - start) * 1000)
return "%d毫秒" % (ms)
# 当前时间格式化
# 1 : 2001-01-01 12:00:00 %Y-%m-%d %H:%M:%S
# 2 : 010101120000 %y%m%d%H%M%S
# 时间戳 3:1690179526555 精确到秒
def getNowTime(self, type):
now_time = ""
if type == 1:
now_time = time.strftime("%Y-%m-%d %H:%M:%S")
if type == 2:
now_time = time.strftime("%y%m%d%H%M%S")
if type == 3:
now_time = int(time.time() * 1000)
return now_time
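# Illustrative return values of getNowTime:
# getNowTime(1) -> "2001-01-01 12:00:00"-style string
# getNowTime(2) -> "010101120000"-style string
# getNowTime(3) -> millisecond epoch timestamp as an int, e.g. 1690179526555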
# 日志格式
def logFormate(self,record, handler):
formate = "[{date}] [{level}] [{filename}] [{func_name}] [{lineno}] {msg}".format(
date=record.time, # 日志时间
level=record.level_name, # 日志等级
filename=os.path.split(record.filename)[-1], # 文件名
func_name=record.func_name, # 函数名
lineno=record.lineno, # 行号
msg=record.message # 日志内容
)
return formate
# 获取logger
def getLogger(self,fileLogFlag=True, stdOutFlag=True):
dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
dirname = os.path.join(dirname, "logs")
filename = filename.replace(".py", "") + ".log"
if not os.path.exists(dirname):
os.mkdir(dirname)
logbook.set_datetime_format('local')
logger = logbook.Logger(filename)
logger.handlers = []
if fileLogFlag: # 日志输出到文件
logFile = logbook.TimedRotatingFileHandler(os.path.join(dirname, filename), date_format='%Y-%m-%d',
bubble=True, encoding='utf-8')
logFile.formatter = self.logFormate
logger.handlers.append(logFile)
if stdOutFlag: # 日志打印到屏幕
logStd = logbook.more.ColorizedStderrHandler(bubble=True)
logStd.formatter = self.logFormate
logger.handlers.append(logStd)
return logger
# 获取随机的userAgent
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
# pick a random proxy from however many are configured
return random.choice(proxy_list)
# def get_proxy(self):
# ip_list = []
# with self.__cursor_proxy as cursor:
# sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
# print(sql_str)
# cursor.execute(sql_str)
# rows = cursor.fetchall()
# for row in tqdm(rows):
# str_ip = row[0]
# str_ip_list = str_ip.split('-')
# proxyMeta = "http://%(host)s:%(port)s" % {
# "host": str_ip_list[0],
# "port": str_ip_list[1],
# }
# proxy = {
# "HTTP": proxyMeta,
# "HTTPS": proxyMeta
# }
# ip_list.append(proxy)
#
# return ip_list
# def get_proxyIPPort(self):
# ip_list = []
# with self.__cursor_proxy as cursor:
# sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
# print(sql_str)
# cursor.execute(sql_str)
# rows = cursor.fetchall()
# for row in tqdm(rows):
# str_ip = row[0]
# str_ip_list = str_ip.split('-')
# proxy = {
# "host": str_ip_list[0],
# "port": str_ip_list[1],
# }
#
# ip_list.append(proxy)
#
# return ip_list
#
# -*- coding: utf-8 -*-
# 智能采集请求
# 1、考虑:请求智能采集时,不再使用实体类
# a. 仍使用:通过HTTP的 raw 请求体,直接传递HTML源文件,通过query参数传递 lang-code、link-text 参数
# b. 原因:在 postman 中,不方便进行测试,无法使用粘贴后的HTML源文件
# 2、不考虑:使用实体类,利大于弊
# a. 使用实体类,方便扩展参数字段
# b. 方便展示接口文档:调用 json_parameter_utility.get_json_parameters 函数,可显示请求实体类
class ExtractionRequest:
# 语言代码
# 1、采集“非中文”的文章时,需要用到语言代码
lang_code = ""
# 链接文本
# 1、用于采集标题,如果不提供,标题的准确度会下降
link_text = ""
# 文章页面源文件
# 1、用于采集标题、发布时间、内容等
article_html = ""
@staticmethod
def from_dict(dictionary: dict):
extraction_request = ExtractionRequest()
# 尝试方法:
# 1、将字典,更新到内部的 __dict__ 对象
# extraction_request.__dict__.update(dictionary)
# 将字典值,设置到当前对象
for key in dictionary:
setattr(extraction_request, key, dictionary[key])
return extraction_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
# 采集结果
class ExtractionResult:
# 标题
title = ""
# 发布日期
publish_date = ""
# 正文(保留所有HTML标记,如:br、img)
text = ""
# URL
url = ""
# 摘要
meta_description = ""
# 干净正文(不带HTML)
cleaned_text = ""
# 来源(目前只支持采集中文网站中的“来源”)
# source = ""
# 顶部图片(top_image:采集不到任何内容,不再使用此属性)
# top_image = ""
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
class UrlPickingRequest:
# 列表页面的响应URL
# 1、作为Base URL,用于拼接提取到的相对URL
# 2、Base URL:必须使用响应URL
# 3、示例:在 Python中,通过 requests.get(url) 请求URL后,需要使用 resp.url 作为 Base URL
list_page_resp_url = ""
# 列表页面源文件
# 1、用于提取文章网址
list_page_html = ""
@staticmethod
def from_dict(dictionary: dict):
url_picking_request = UrlPickingRequest()
# 将字典值,设置到当前对象
for key in dictionary:
setattr(url_picking_request, key, dictionary[key])
return url_picking_request
def to_dict(self):
# 转换为字典对象:
# 1、序列化为JSON时,需要调用此方法
# 2、转换为JSON字符串:json.dumps(extraction_result, default=ExtractionResult.to_dict)
data = {}
# 借助内部的 __dict__ 对象
# 1、将内部的 __dict__ 对象,更新到新的字典对象中
data.update(self.__dict__)
return data
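# A minimal usage sketch of the entities above (field values are placeholders); it only
# runs when this module is executed directly and mirrors the
# json.dumps(..., default=to_dict) pattern described in the comments.
if __name__ == '__main__':
    import json
    request = ExtractionRequest.from_dict({
        'lang_code': 'en',
        'link_text': 'Example headline',
        'article_html': '<html><body><h1>Example headline</h1></body></html>',
    })
    # serialize the request by passing its to_dict method as the JSON serializer
    print(json.dumps(request, default=ExtractionRequest.to_dict, ensure_ascii=False))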
# -*- coding: utf-8 -*-
import requests
from goose3 import Goose
from goose3.text import StopWordsChinese, StopWordsKorean, StopWordsArabic
from entity import *
from smart_extractor_utility import SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractor:
@staticmethod
def get_supported_lang_code_dict():
"""
支持语言:
1、需要分词,传递分词器(3种):
a. 中文、韩语、阿拉伯语
2、不需要分词,直接传递语言编码(16种)
a. 其中英语、俄语,单独测试
"""
supported_lang_code_dict = {
'cn': '中文', # 中文
'zh-cn': '简体中文', # 简体中文
'zh': '简体中文', # 简体中文
'ko': '韩语', # 韩语
'ar': '阿拉伯语', # 阿拉伯语
'en': '英语', # 英语
'ru': '俄语', # 俄语
'da': '丹麦语', # 丹麦语
'de': '德语', # 德语
'es': '西班牙语', # 西班牙语
'fi': '芬兰语', # 芬兰语
'fr': '法语', # 法语
'hu': '匈牙利语', # 匈牙利语
'id': '印度尼西亚语', # 印度尼西亚语
'it': '意大利语', # 意大利语
'nb': '挪威语(伯克梅尔)', # 挪威语(伯克梅尔)
'nl': '荷兰语', # 荷兰语
'no': '挪威文(耐诺斯克)', # 挪威文(耐诺斯克)
'pl': '波兰语', # 波兰语
'pt': '葡萄牙语', # 葡萄牙语
'sv': '瑞典语', # 瑞典语
}
return supported_lang_code_dict
def __init__(self, lang_code='cn'):
"""
构造器:未指定 lang_code 参数时,默认为 cn
"""
# 支持语言
supported_lang_code_list = list(SmartExtractor.get_supported_lang_code_dict())
# 初始化 goose 对象:
# 1、根据语言代码,创建 goose 对象
if lang_code is None or lang_code == 'cn' or lang_code == 'zh-cn' or lang_code == 'zh':
# 需要分词:中文
# 1、不指定lang_code参数,或不指定lang_code为 None 时,默认为中文分词
# 2、Flask Web接口:未指定get参数 lang_code 时,lang_code 会接收为 None
self.goose = Goose({'stopwords_class': StopWordsChinese})
elif lang_code == 'ko':
# 需要分词:韩语
# 1、测试:只传递语言,不传递分词器
# self.goose = Goose({'use_meta_language': False, 'target_language': 'ko'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试失败:正文采集为空
# 韩语分词:测试成功
self.goose = Goose({'stopwords_class': StopWordsKorean})
elif lang_code == 'ar':
# 需要分词:阿拉伯语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
# self.goose = Goose() # 测试成功
# self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
self.goose = Goose({'stopwords_class': StopWordsArabic})
elif lang_code == 'en':
# 单独测试:英文
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'})
# 测试成功:创建Goose对象时,不指定语言默认为英文分词
self.goose = Goose()
elif lang_code == 'ru':
# 单独测试:俄语
# self.goose = Goose({'use_meta_language': False, 'target_language': 'en'}) # 测试失败:正文采集为空
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code}) # 测试成功:直接传递语言编码
elif lang_code in supported_lang_code_list:
# 其它语言编码,统一处理,不再单独测试
self.goose = Goose({'use_meta_language': False, 'target_language': lang_code})
else:
# 未识别的语言代码
raise Exception(f'智能采集时,无法识别语言代码:{lang_code}')
def get_extraction_result(self, article, link_text=''):
"""
获取采集结果:
1、从 artcile 对象中,采集数据并封装到 ExtractionResult
"""
# 用于保存:采集后的文本
extraction_result = ExtractionResult()
# 标题
# extraction_result.title = article.title # 原办法:使用 goose 采集到的 title 中的标题
extraction_result.title = SmartExtractorUtility.get_article_title(article, link_text)
# 发布日期
extraction_result.publish_date = SmartExtractorUtility.get_publish_date(article)
# 正文(保留所有HTML标记,如:br、img)
extraction_result.text = SmartExtractorUtility.get_article_text(article)
# URL
extraction_result.url = article.final_url
# 摘要
extraction_result.meta_description = article.meta_description
# 干净正文(不带HTML)
extraction_result.cleaned_text = article.cleaned_text
# 来源(目前只支持采集中文网站中的“来源”)
extraction_result.source = ''
return extraction_result
def extract_by_url(self, url, link_text=''):
"""
按URL采集内容
"""
# 采集正文:传入url
article = self.goose.extract(url=url)
# article = goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_html(self, html, link_text=''):
"""
按HTML采集内容
"""
# 采集正文:传入html
article = self.goose.extract(raw_html=html)
return self.get_extraction_result(article, link_text)
def extract_by_url_test():
# 测试:按URL采集
url_list = [
# "http://www.news.cn/politics/2022-07/31/c_1128879636.htm", # 短文本
# "https://baijiahao.baidu.com/s?id=1741311527693101670", # 带多张图片
# "https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml", # 带多张图片,及一个视频(测试内容XPath失败)
# "http://opinion.people.com.cn/n1/2022/0803/c1003-32492653.html", # 人民网
# 韩文:中央日报-politics
# "https://www.joongang.co.kr/article/25094974",
# "https://www.joongang.co.kr/article/25094967",
# 英文:加德满都邮报-national-security
# "https://kathmandupost.com/national-security/2020/01/17/police-s-intelligence-continues-to-fail-them-as-chand-party-claims-explosion",
# "https://kathmandupost.com/national-security/2019/11/04/india-s-new-political-map-places-disputed-territory-of-kalapani-inside-its-own-borders", # 测试采集:发布时间
# 俄语:今日白俄罗斯报-word
# "https://www.sb.by/articles/byvshiy-premer-ministr-italii-zayavil-chto-strane-sleduet-otkazatsya-ot-gaza-iz-rossii.html",
# 'https://www.sb.by/articles/kryuchkov-predupredil-o-nepopravimykh-posledstviyakh-dlya-ukrainy-v-sluchae-udarov-po-krymu.html',
# 阿语
# "http://arabic.people.com.cn/n3/2022/0822/c31659-10137917.html",
# "http://arabic.people.com.cn/n3/2022/0822/c31657-10137909.html",
# 测试提取标题
# "http://www.sasac.gov.cn/n4470048/n16518962/n20928507/n20928570/c25819031/content.html",
# "http://www.forestry.gov.cn/main/102/20220823/092407820617754.html",
# "http://www.sasac.gov.cn/n2588025/n2588139/c25825832/content.html", # 标题采集为空
# 'http://www.crfeb.com.cn/1j/_124/2005409/index.html', # 内容采集失败
# 'http://www.crfeb.com.cn/1j/_124/912248/index.html', # 内容采集失败
# 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html', # 中国铁建股份有限公司-工作动态(日期采集错误)
# 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html', # 中国土木工程集团有限公司-多个栏目(日期采集错误)
# 'http://v.people.cn/n1/2022/0901/c444662-32517559.html', # 人民网视频:title必须以“元素中的标题”开始,不能判断“包含”
# 'https://www.chec.bj.cn/cn/xwzx/gsyw/2022/202207/t20220706_8128.html', # 中国港湾工程有限责任公司-公司要闻(标题采集失败)
# 'https://www.cscec.com/xwzx_new/gsyw_new/202208/3570377.html', # 中国建筑集团有限公司-中建要闻(标题采集失败)
# 'https://www.crbc.com/site/crbc/276/info/2022/46884837.html', # 中国路桥工程有限责任公司-多个栏目(标题采集失败)
# 'http://www.cgcoc.com.cn/news/432.html', # 中地海外集团有限公司-新闻中心(标题和内容采集失败)
# 'http://www.mcc.com.cn/mcc/_132154/_132572/308233/index.html' # 中国五矿(测试:正文采集失败)
# 'http://www.powerchina.cn/art/2015/5/27/art_7449_441845.html', # 中国电力建设集团(测试:标题、正文采集失败)
# 中国电力建设集团(测试:标题采集失败),相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
# 'http://world.people.com.cn/n1/2022/0624/c1002-32455607.html', # 标题采集失败:看着没有问题
# 'https://www.cscec.com/xwzx_new/zqydt_new/202209/3578274.html', # 中国建筑股份有限公司-企业动态:日期采集错误,采集到当天日期
# 'https://3g.k.sohu.com/t/n705260979' #天眼查--企业公告'
# 'https://baijiahao.baidu.com/s?id=1769415116218226935'
# 'https://m.gelonghui.com/community/post/1678728#ocr'
'http://epaper.zqrb.cn/html/2023-05/27/content_950333.htm'
]
# 语言编码
lang_code = 'cn'
# lang_code = 'ko'
# lang_code = 'en'
# lang_code = 'ru'
# lang_code = 'ar'
for url in url_list:
print()
print("-" * 100)
print('请求URL:', url)
extraction_result = SmartExtractor(lang_code).extract_by_url(url)
# 测试转换为JSON
# 1、直接转换时,会抛异常:TypeError: Object of type ExtractionResult is not JSON serializable
# print(json.dumps(extraction_result))
# print(json.dumps(extraction_result, default=ExtractionResult.to_dict)) # 转换成功:指定序列化器
# print(type(json.dumps(extraction_result.to_dict()))) # 返回类型:<class 'str'>,内容中的中文会被转义
# print(str(extraction_result.to_dict())) # 如果直接转换为字符串,中文不会被转义
# 打印测试结果
print_extraction_result(extraction_result)
def extract_by_html_test():
# 测试:按HTML采集
html = '''
<html>
<head>
<title>标题</title>
</head>
<body>
<div>标题</div>
<div>内容</div>
</body>
</html>
'''
# 测试:通过请求URL,获取完整的html
# url = "http://www.news.cn/politics/2022-07/31/c_1128879636.htm" # 测试成功
# url = "http://views.ce.cn/view/ent/202208/15/t20220815_37961634.shtml" # 1、测试失败:lxml.etree.ParserError: Document is empty
url = 'https://www.crcc.cn/art/2021/11/12/art_205_3413380.html' # 中国铁建股份有限公司-工作动态(日期采集错误)
# url = 'http://ccecc.crcc.cn/art/2015/11/19/art_7608_1136312.html' # 中国土木工程集团有限公司-多个栏目(日期采集错误)
print()
print("-" * 100)
print('请求URL:', url)
html = requests.get(url).text
# 语言编码
lang_code = 'cn'
# 采集内容
extraction_result = SmartExtractor(lang_code).extract_by_html(html)
# 打印测试结果
print_extraction_result(extraction_result)
def print_extraction_result(extraction_result):
# 打印测试结果
print("标题:", extraction_result.title) # 标题
print("发布时间:", extraction_result.publish_date) # 发布时间
print("正文:", extraction_result.text) # 正文
print("URL:", extraction_result.url) # URL
print("摘要:", extraction_result.meta_description) # 摘要
print("干净正文:", extraction_result.cleaned_text) # 干净正文
if __name__ == '__main__':
try:
# 测试:按URL采集
extract_by_url_test()
# 测试:按HTML采集
# extract_by_html_test()
except Exception as e:
print("采集失败:", e)
# -*- coding: utf-8 -*-
import re
from goose3.article import Article
from lxml import etree
from lxml.html import HtmlElement
class SmartExtractorUtility:
# 标题最小长度
title_min_len = 6
@staticmethod
def extract_publish_date(html):
pattern_list = [
# 2010-10-1 8:00:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010-10-1 8:00
r"20\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 2010年10月1日 8:00:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}:\d{1,2}",
# 2010年10月1日 8:00
r"20\d{2}年\d{1,2}月\d{1,2}日 \d{1,2}:\d{1,2}",
# 2010/10/1 8:00:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}",
# 2010/10/1 8:00
r"20\d{2}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}",
# 2010-10-1
r"20\d{2}-\d{1,2}-\d{1,2}",
# 2010年10月1日
r"20\d{2}年\d{1,2}月\d{1,2}日",
# 2010/10/1
r"20\d{2}/\d{1,2}/\d{1,2}",
# 2022.08.28
r"20\d{2}\.\d{1,2}\.\d{1,2}"
# 12-07-02 10:10
r"\d{2}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}",
# 1月前
r"\d+(&nbsp;| )*月前",
# 12天前
r"\d+(&nbsp;| )*天前",
# 2小时前
r"\d+(&nbsp;| )*小时前",
# 15分钟前
r"\d+(&nbsp;| )*分钟前",
# 昨天&nbsp;17:59
r"昨天(&nbsp;| )*\d{1,2}:\d{1,2}",
]
# 尝试匹配所有正则式
for pattern in pattern_list:
# 提取可见日期:
# 1、必须在标签内部,不能提取HTML标签属性中的日期
# 2、提取规则:必须在 > 和 < 之间,且中间不能再有 >
tag_pattern = f'>[^>]*(?P<date>{pattern})[^>]*<'
# 搜索第一个匹配项
match = re.search(tag_pattern, html)
# 如果匹配成功,返回正确的发布时间
if match:
return match.group('date')
# 所有正则式匹配失败,返回空字符串
return ""
@staticmethod
def add_html_br(cleaned_text):
# 包装HTML标记:换行
# 1、优先替换双换行:使用goose提取到的cleaned_text,都是双换行
cleaned_text = cleaned_text.replace("\n\n", "<br>")
cleaned_text = cleaned_text.replace("\n", "<br>")
return cleaned_text
@staticmethod
def get_article_title(article: Article, link_text=''):
#
# 优先提取h1、div、span、td元素中的标题
# 1、测试任务:2.智能采集\1.测试任务\国资委-新闻发布
# a. 原title标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”-国务院国有资产监督管理委员会
# b. div元素中的标题:中国能建:聚焦价值创造 打造国企改革发展“红色引擎”
# 2、测试任务:2.智能采集\1.测试任务\国家林业和草原局-地方动态
# a. 原title标题:上海完成森林资源年度监测遥感解译图斑市级质量检查_地方动态_国家林业和草原局政府网
# b. span元素中的标题:上海完成森林资源年度监测遥感解译图斑市级质量检查
#
# 根据xpath,查询标题元素时:
# 1、标签优先级:h1、特殊元素(id或class包含title)、h2、h3、div、span、td
#
title_element_list = [
'h1',
'h2',
'h3',
'div',
'span',
'td',
'p',
]
# 对比标题前,统一将空格剔除(2022-09-21):
# 1、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# 2、相比列表中的链接文本、title标签中的内容,元素中的标题,“秉承丝路精髓 抒写锦绣华章”中间多出一个空格
link_text = link_text.replace(" ", "")
tag_title = article.title.replace(" ", "")
title = None
for title_element in title_element_list:
element_list = article.raw_doc.getroottree().xpath(f'//{title_element}')
# 查询XPath成功,遍历所有元素
for element in element_list:
# 取纯文本内容,包括子元素
text = etree.tounicode(element, method='text').strip()
text_no_space = text.replace(" ", "")
# 判断标题:
# 1、如果智能采集的原title标题,以“元素内容”开头,则取元素内容
# 2、查找成功后,返回text作为标题,否则继续下一个循环
# 判断是否以“元素中的标题”开始:
# 1、title必须以“元素中的标题”开始,不能判断“包含”
# 2、测试URL:http://v.people.cn/n1/2022/0901/c444662-32517559.html
# 3、title标签:<title>亿缕阳光丨小生意,大格局--人民视频--人民网</title>
# a. 如果判断“包含”,会采集到:人民网
# b. 因为存在元素:<a href="http://www.people.com.cn/" class="clink">人民网</a>
# c. 如果判断以“元素中的标题”开始,采集到:亿缕阳光丨小生意,大格局
# d. 标题元素:<h2>亿缕阳光丨小生意,大格局</h2>
# 新方案:
# 1、对比常用元素:仍判断是否以“元素中的标题”开始
# 2、优先对比“链接文本”,其次对比“title元素”
# 3、满足最少字数:6个字
# 新方案(2022-09-21):
# 1、对比“链接文本”、“title元素”时,除了判断开始,同时允许结尾
# 2、测试任务:3.马荣:一带一路,配置不成功\中国电力建设集团(测试:标题采集失败)
# a. 列表中的链接文本:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电...
# b. title标签中的内容:<title>中国电力建设集团 公司要闻 【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠</title>
# c. 元素中的标题:【“一带一路”旗舰篇】秉承丝路精髓 抒写锦绣华章——河北电建一公司摘取“一带一路”上的“鲁班奖”桂冠
if text_no_space is not None and text_no_space != '' and len(
text_no_space) >= SmartExtractorUtility.title_min_len:
# 优先判断6个字,以方便调试:排除短文本元素
if link_text.startswith(text_no_space) or link_text.endswith(text_no_space) or tag_title.startswith(
text_no_space) or tag_title.endswith(text_no_space):
# 返回时,仍返回未剔除空格后的标题
return text
if title:
# 查找成功,返回元素中的标题
return title
else:
# 查找失败,返回提取到的title属性
# return article.title
# 新考虑:标题采集失败后,返回空值
# 1、原因:article.title 不可靠,只是提取了 title 标签中的内容
return ''
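# A minimal sketch of the acceptance rule applied above: spaces are stripped, the
# candidate must be at least title_min_len characters, and it must be a prefix or a
# suffix of the link text or of the <title> content ("contains" is not enough).
# The strings reuse the people.cn example from the comments; illustrative only.
@staticmethod
def title_rule_demo():
    tag_title = '亿缕阳光丨小生意,大格局--人民视频--人民网'.replace(' ', '')
    def accept(candidate):
        c = candidate.replace(' ', '')
        return len(c) >= SmartExtractorUtility.title_min_len and (
            tag_title.startswith(c) or tag_title.endswith(c))
    print(accept('亿缕阳光丨小生意,大格局'))  # True: the <title> starts with this element text
    print(accept('人民网'))  # False: it is a suffix, but shorter than the 6-character minimum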
@staticmethod
def get_publish_date(article: Article):
# 优先使用正则式提取日期
# 1、测试任务:加德满都邮报-national-security
# a. 使用 publish_datetime_utc 提取英文日期后,提取错误
# b. 实际日期:Friday, August 19, 2022,但提取到了:2015-02-05
# c. 原因:在下方JS中,有一段JSON文本: "datePublished": "2015-02-05T08:00:00+08:00"
# 2、注意:中文网站,都必须使用正则式
publish_date = SmartExtractorUtility.extract_publish_date(article.raw_html)
if publish_date != '':
return publish_date
else:
if article.publish_datetime_utc:
# 优先使用提取成功的 datetime
return article.publish_datetime_utc.strftime('%Y-%m-%d')
elif article.publish_date:
# 其次使用提取成功的 date 字符串
return article.publish_date
else:
# 全部提取失败,返回字符串
return ''
@staticmethod
def get_article_text(article: Article):
# 第一种方法:在纯文本(cleaned_text)基础上,添加br标签
# 1、缺点:无法获取图片,同时会丢掉原有的p标签(只能用br替补)
# text = SmartExtractor.add_html_br(article.cleaned_text)
# 第二种方法:直接获取 top_node 的HTML内容
# 1、优点:可保留原有的p标签等
# 2、缺点:无法获取图片,img标签未被保留
# text = etree.tounicode(article.top_node, method='html')
# 测试抛出异常
# raise Exception("测试抛出异常")
# 第三种方法:获取到 top_node 的xpath,再通过xpath查询原始doc
# 1、可行:通过查询原始doc,可以获取“正文”的所有HTML内容
# 2、遇到问题:获取到 top_node 的xpath不准确,与原位置偏移一个元素
# a. 测试URL:https://news.cctv.com/2022/08/16/ARTIERrXbbVtVUaQU0pMzQxf220816.shtml
# b. 获取到的xpath:/html/body/div/div[1]/div[2]/div[4]
# c. 实际xpath:/html/body/div/div[1]/div[2]/div[5]
# 3、解决办法:
# a. 优先使用id、class查询,如果没有id、class,再查询 top_node 的xpath
xpath = None
if type(article.top_node) is HtmlElement:
if 'id' in article.top_node.attrib:
xpath = "//*[@id='{}']".format(article.top_node.attrib['id'])
elif 'class' in article.top_node.attrib:
xpath = "//*[@class='{}']".format(article.top_node.attrib['class'])
else:
xpath = article.top_node.getroottree().getpath(article.top_node)
else:
# article.top_node 有时为空:
# 1、测试URL:https://baijiahao.baidu.com/s?id=1741311527693101670
# 2、输出日志:article.top_node 不是 HtmlElement 对象:None
print("SmartExtractor:article.top_node 为 {},不是 HtmlElement 对象。".format(article.top_node))
# article.top_node 为空时,直接输出 cleaned_text:
# 1、在纯文本(cleaned_text)基础上,添加br标签
text = SmartExtractorUtility.add_html_br(article.cleaned_text)
return text
# 根据xpath,查询元素
element_list = article.raw_doc.getroottree().xpath(xpath)
if element_list:
# 查询XPath成功,获取第一个元素的HTML
text = etree.tounicode(element_list[0], method='html')
else:
# 查询XPath失败,返回 top_node 原有的HTML
# 1、缺点:无法获取图片,img标签未被保留
text = etree.tounicode(article.top_node, method='html')
return text
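# A minimal sketch of the id/class-first lookup described in get_article_text: when the
# main node carries an id or class, the original document is queried by that attribute,
# and the positional XPath from getpath() is only used as a last resort. Illustrative
# only; the HTML snippet is made up.
def xpath_fallback_demo():
    from lxml import etree, html as lxml_html
    doc = lxml_html.fromstring(
        '<html><body><div class="article"><p>正文<img src="a.png"></p></div></body></html>')
    node = doc.xpath('//p')[0]
    if 'id' in node.attrib:
        xpath = "//*[@id='{}']".format(node.attrib['id'])
    elif 'class' in node.attrib:
        xpath = "//*[@class='{}']".format(node.attrib['class'])
    else:
        # the <p> has no id or class, fall back to the positional XPath
        xpath = node.getroottree().getpath(node)
    print(xpath)  # /html/body/div/p
    print(etree.tounicode(doc.xpath(xpath)[0], method='html'))  # keeps <img> and other original tags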
#coding=utf-8
import datetime
import json
import time
import pymysql
import requests
from kafka import KafkaProducer
from smart_extractor import SmartExtractor
from bs4 import BeautifulSoup
from langid import langid
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from baseCore import BaseCore
basecore = BaseCore()
log = basecore.getLogger()
r = basecore.r
def reqmsg(url):
header={
'Connection':'keep-alive',
#'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua-platform':'"Windows"',
'Accept':'*/*',
'Origin':'https://cn.tradingview.com',
'Sec-Fetch-Site':'same-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://cn.tradingview.com/',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
for i in range(0,3):
try:
response=requests.get(url=url,headers=header,timeout=10,proxies=proxy,verify=False)
searchmsg=response.json()
except Exception as e:
searchmsg=''
log.info(f'{url}---请求失败--{e}')
if searchmsg:
log.info(f'{url}---请求成功')
break
return searchmsg
def reqDetailmsg(url):
header={
'Host':'cn.tradingview.com',
'Connection':'keep-alive',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'none',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'cookiePrivacyPreferenceBannerProduction=notApplicable; cookiesSettings={"analytics":true,"advertising":true}; _ga=GA1.1.153931157.1696599356; will_start_trial=1; device_t=MzBfV0F3OjA.5HeDqPHu8F5Ux85y2Bi3xCC-liNchYNYW1zUgqB5E4s; sessionid=rcy2dho7lh83k6tasjy4jjatig31tbdf; sessionid_sign=v1:K9a7nKtEZ3MWrJqUgqr9ZaVHrjlepGyPAoGrDmq2DiM=; _gcl_au=1.1.557075741.1696651024; png=f403f4d2-d955-4385-b59c-f2d74f7ec679; etg=f403f4d2-d955-4385-b59c-f2d74f7ec679; cachec=f403f4d2-d955-4385-b59c-f2d74f7ec679; tv_ecuid=f403f4d2-d955-4385-b59c-f2d74f7ec679; _ga_YVVRYGL0E0=deleted; __gads=ID=b0fa0efe8c0ccdc3:T=1696647286:RT=1696916773:S=ALNI_MaPEozJ_doJikuSMJ0r5yFDU3j_Mw; __gpi=UID=00000c59f5923a81:T=1696647286:RT=1696916773:S=ALNI_Ma-WnwGckO3mzIStdpHv1jmEDMMvA; _sp_ses.cf1a=*; _sp_id.cf1a=8a315f91-7829-4ad7-bf4b-151a217809dd.1696599355.14.1696924687.1696916773.00da5df6-3641-4999-a8cf-e2d01afa79e7; _ga_YVVRYGL0E0=GS1.1.1696924315.18.1.1696924691.38.0.0',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
for i in range(0,3):
try:
response=requests.get(url=url,headers=header,timeout=10,proxies=proxy,verify=False)
htmltext=response.text
except Exception as e:
htmltext=''
log.info(f'{url}---详情请求失败--{e}')
if htmltext:
log.info(f'{url}---详情请求成功')
break
return htmltext
def paserList(searchmsg,social_code):
items=searchmsg['items']
for item in items:
try:
id=item['id']
title=item['title']
storyPath='https://cn.tradingview.com'+item['storyPath']
published=item['published']
published=getFormatedate(published)
#是否重复判断
flag=selectLinkMsg(storyPath,social_code)
if flag:
log.info(f'{social_code}---{storyPath}---数据已采集过')
continue
except Exception as e:
log.info(f'列表解析失败----{e}')
continue
try:
source=item['source']
except Exception as e:
source=''
try:
link=item['link']
except Exception as e:
link=''
try:
symbol=item['relatedSymbols'][0]['symbol']
except Exception as e:
symbol=''
try:
# if link:
# sourceAddress=link
# else:
# sourceAddress=storyPath
sourceAddress=storyPath
content,contentWithTag=extractorMsg(sourceAddress,title)
if content:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
detailmsg={
'content': content,
'contentWithTag': contentWithTag,
'createDate': time_now,
'publishDate': published,
'sourceAddress': sourceAddress, # 原文链接
'summary': '',
'title': title,
'socialCreditCode': social_code,
'year': published[:4]
}
sendToKafka(detailmsg)
saveLinkMsg(sourceAddress,social_code)
log.info(f'信息发送kafka成功----{sourceAddress}')
else:
log.info(f'内容抽取失败----{sourceAddress}')
except Exception as e:
log.info(f'{social_code}____{sourceAddress}详情采集异常{e}')
def getFormatedate(timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
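# A minimal sketch of the conversion above: "published" from the headlines API is a Unix
# timestamp in seconds, and datetime.fromtimestamp interprets it in the machine's local
# timezone, so the resulting date can shift by one day depending on where the crawler runs.
def getFormatedate_demo():
    print(getFormatedate(1696896000))  # '2023-10-10' on a UTC+8 machine (2023-10-10 00:00 UTC)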
def createDriver():
chrome_driver =r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path, options=chrome_options)  # Selenium 4 中 chrome_options 参数已弃用,改用 options
return driver
def extractorMsg(url,title):
content=''
contentWithTag=''
lang=detect_language(title)
sm=SmartExtractor(lang)
raw_html=reqDetailmsg(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('div[class="body-KX2tCBZq body-pIO_GYwT content-pIO_GYwT"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
log.info(f'抽取失败!!{e}')
if content:
log.info(f'抽取成功')
else:
try:
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
if content:
log.info(f'抽取成功')
else:
driver=createDriver()
driver.get(url)
time.sleep(3)
raw_html=driver.page_source
try:
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
return content,contentWithTag
def detect_language(html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
def conn144():
conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def getStockFromSql():
conn,cursor=conn144()
# 检查记录是否存在
select_sql=f"SELECT ticker,exchange,xydm FROM mgzqyjwyh_list "
cursor.execute(select_sql)
gn_result = cursor.fetchall()
conn.commit()
itemList=[]
for item in gn_result:
try:
ticker=item[0]
exchange=item[1]
xydm=item[2]
exchange=str(exchange).upper()
param=exchange+':'+ticker+'_'+xydm
r.rpush('tradview_ticker', param)
itemList.append(param)
except Exception as e:
print(e)
cursor.close()
conn.close()
return itemList
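# A minimal sketch (hypothetical values) of the queue item format produced above and how
# the main loop below takes it apart: the part before the underscore is the TradingView
# symbol "EXCHANGE:TICKER", the part after it is the credit code (xydm).
def ticker_param_demo():
    param = 'NASDAQ:AAPL_913100000000000000'  # hypothetical ticker and credit code
    ticker_param, social_code = param.split('_', 1)
    print(ticker_param)  # NASDAQ:AAPL -> goes into the headlines URL
    print(social_code)   # 913100000000000000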
def sendToKafka(detailmsg):
dic_news = {
'attachmentIds': '',
'author': '',
'content': detailmsg['content'],
'contentWithTag': detailmsg['contentWithTag'],
'createDate': detailmsg['createDate'],
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': 'Tradingview',
'publishDate': detailmsg['publishDate'],
'sid': '1711619846545776641',
'sourceAddress': detailmsg['sourceAddress'], # 原文链接
'summary': '',
'title': detailmsg['title'],
'type': 2,
'socialCreditCode': detailmsg['socialCreditCode'],
'year': detailmsg['year']
}
producer=KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
# log.info(kafka_result.get(timeout=10))
log.info('发送kafka成功!')
except Exception as e:
log.info(f"发生kafka失败{e}")
finally:
producer.close()
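# A minimal verification sketch (not part of the crawler): reading back the messages
# pushed above with kafka-python's KafkaConsumer against the same broker and topic.
def consume_research_report_demo():
    import json
    from kafka import KafkaConsumer
    consumer = KafkaConsumer('researchReportTopic',
                             bootstrap_servers=['114.115.159.144:9092'],
                             auto_offset_reset='earliest',
                             consumer_timeout_ms=5000,
                             value_deserializer=lambda v: json.loads(v.decode('utf8')))
    for msg in consumer:
        print(msg.value.get('title'), msg.value.get('sourceAddress'))
    consumer.close()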
#将连接保存到数据库
def saveLinkMsg(link,social_code):
conn,cursor=conn144()
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
# 动态信息列表
time_format = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
list_info = [
social_code,
link,
'Tradingview',
'2',
time_format
]
cursor.execute(insert_sql, tuple(list_info))
except Exception as e:
log.info(f'{link}插入库中失败{e}')
finally:
conn.commit()
cursor.close()
conn.close()
#查询是否存在
def selectLinkMsg(link,social_code):
flag=False
conn,cursor=conn144()
try:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (link, social_code))
selects = cursor.fetchone()
if selects:
log.info(f'-----{social_code}----{link}:已经存在')
flag=True
except Exception as e:
log.info(f'查询数据是否在库中失败{e}')
finally:
conn.commit()
cursor.close()
conn.close()
return flag
if __name__ == '__main__':
# url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
# searchmsg=reqmsg(url)
# print(searchmsg)
# getStockFromSql()
while True:
try:
tradview_ticker=r.lpop('tradview_ticker')
if tradview_ticker:
tradviewticker = tradview_ticker.decode(errors='ignore')
ticker_param=str(tradviewticker).split('_')[0]
social_code=str(tradviewticker).split('_')[1]
url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
searchmsg=reqmsg(url)
paserList(searchmsg,social_code)
except Exception as e:
log.info(f'redis中获取企业信息为空{e}')
break
#coding=utf-8
import datetime
import json
import time
import pymysql
import requests
from kafka import KafkaProducer
from smart_extractor import SmartExtractor
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
from langid import langid
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from base import BaseCore
basecore = BaseCore.BaseCore()
log = basecore.getLogger()
r = basecore.r
def reqmsg(url):
header={
'Connection':'keep-alive',
#'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'sec-ch-ua-platform':'"Windows"',
'Accept':'*/*',
'Origin':'https://cn.tradingview.com',
'Sec-Fetch-Site':'same-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://cn.tradingview.com/',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9'
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
for i in range(0,3):
try:
response=requests.get(url=url,headers=header,timeout=10,proxies=proxy,verify=False)
searchmsg=response.json()
except Exception as e:
searchmsg=''
log.info(f'{url}---请求失败--{e}')
if searchmsg:
log.info(f'{url}---请求成功')
break
return searchmsg
def reqDetailmsg(url):
header={
'Host':'cn.tradingview.com',
'Connection':'keep-alive',
'sec-ch-ua':'"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'none',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'cookiePrivacyPreferenceBannerProduction=notApplicable; cookiesSettings={"analytics":true,"advertising":true}; _ga=GA1.1.153931157.1696599356; will_start_trial=1; device_t=MzBfV0F3OjA.5HeDqPHu8F5Ux85y2Bi3xCC-liNchYNYW1zUgqB5E4s; sessionid=rcy2dho7lh83k6tasjy4jjatig31tbdf; sessionid_sign=v1:K9a7nKtEZ3MWrJqUgqr9ZaVHrjlepGyPAoGrDmq2DiM=; _gcl_au=1.1.557075741.1696651024; png=f403f4d2-d955-4385-b59c-f2d74f7ec679; etg=f403f4d2-d955-4385-b59c-f2d74f7ec679; cachec=f403f4d2-d955-4385-b59c-f2d74f7ec679; tv_ecuid=f403f4d2-d955-4385-b59c-f2d74f7ec679; _ga_YVVRYGL0E0=deleted; __gads=ID=b0fa0efe8c0ccdc3:T=1696647286:RT=1696916773:S=ALNI_MaPEozJ_doJikuSMJ0r5yFDU3j_Mw; __gpi=UID=00000c59f5923a81:T=1696647286:RT=1696916773:S=ALNI_Ma-WnwGckO3mzIStdpHv1jmEDMMvA; _sp_ses.cf1a=*; _sp_id.cf1a=8a315f91-7829-4ad7-bf4b-151a217809dd.1696599355.14.1696924687.1696916773.00da5df6-3641-4999-a8cf-e2d01afa79e7; _ga_YVVRYGL0E0=GS1.1.1696924315.18.1.1696924691.38.0.0',
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
for i in range(0,3):
try:
response=requests.get(url=url,headers=header,timeout=10,proxies=proxy,verify=False)
htmltext=response.text
except Exception as e:
htmltext=''
log.info(f'{url}---详情请求失败--{e}')
if htmltext:
log.info(f'{url}---详情请求成功')
break
return htmltext
def paserList(searchmsg,social_code):
items=searchmsg['items']
for item in items:
try:
id=item['id']
title=item['title']
storyPath='https://cn.tradingview.com'+item['storyPath']
published=item['published']
published=getFormatedate(published)
#是否重复判断
flag=selectLinkMsg(storyPath,social_code)
if flag:
log.info(f'{social_code}---{storyPath}---数据已采集过')
continue
except Exception as e:
log.info(f'列表解析失败----{e}')
continue
try:
source=item['source']
except Exception as e:
source=''
try:
link=item['link']
except Exception as e:
link=''
try:
symbol=item['relatedSymbols'][0]['symbol']
except Exception as e:
symbol=''
try:
# if link:
# sourceAddress=link
# else:
# sourceAddress=storyPath
sourceAddress=storyPath
content,contentWithTag=extractorMsg(sourceAddress,title)
if content:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
detailmsg={
'content': content,
'contentWithTag': contentWithTag,
'createDate': time_now,
'publishDate': published,
'sourceAddress': sourceAddress, # 原文链接
'summary': '',
'title': title,
'socialCreditCode': social_code,
'year': published[:4]
}
sendToKafka(detailmsg)
saveLinkMsg(sourceAddress,social_code)
log.info(f'信息发送kafka成功----{sourceAddress}')
else:
log.info(f'内容抽取失败----{sourceAddress}')
except Exception as e:
log.info(f'{social_code}____{sourceAddress}详情采集异常{e}')
def getFormatedate(timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
def createDriver():
chrome_driver =r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path, options=chrome_options)  # Selenium 4 中 chrome_options 参数已弃用,改用 options
return driver
def extractorMsg(url,title):
content=''
contentWithTag=''
lang=detect_language(title)
sm=SmartExtractor(lang)
raw_html=reqDetailmsg(url)
if raw_html:
try:
soup=BeautifulSoup(raw_html,'html.parser')
tdoc=soup.select('div[class="body-KX2tCBZq body-pIO_GYwT content-pIO_GYwT"]')[0]
content=tdoc.text
contentWithTag=str(tdoc)
except Exception as e:
log.info(f'抽取失败!!{e}')
if content:
log.info(f'抽取成功')
else:
try:
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
if content:
log.info(f'抽取成功')
else:
try:
article_content=paserDetail(raw_html,url)
content=article_content['content']
contentWithTag=article_content['body_html']
except Exception as e:
log.info(f'抽取失败!!{e}')
else:
driver=createDriver()
driver.get(url)
time.sleep(3)
raw_html=driver.page_source
try:
article=sm.extract_by_html(raw_html)
content=article.cleaned_text
contentWithTag=article.text
except Exception as e:
log.info(f'抽取失败!!{e}')
if content:
log.info(f'抽取成功')
else:
try:
article_content=paserDetail(raw_html,url)
content=article_content['content']
contentWithTag=article_content['body_html']
except Exception as e:
log.info(f'抽取失败!!{e}')
return content,contentWithTag
#智能抽取
def paserDetail(detailhtml,detailurl):
try:
extractor = GeneralNewsExtractor()
article_content = extractor.extract(detailhtml,host=detailurl,with_body_html=True)
except:
article_content={}
return article_content
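# A minimal sketch of the gne fallback above: GeneralNewsExtractor.extract returns a dict,
# and with with_body_html=True it also carries 'body_html', which is where contentWithTag
# is taken from. The HTML snippet and URL are made up for illustration.
def paserDetail_demo():
    html = '<html><body><h1>标题</h1><div><p>第一段正文。</p><p>第二段正文。</p></div></body></html>'
    result = paserDetail(html, 'https://example.com/demo')
    print(result.get('content'))    # plain-text body
    print(result.get('body_html'))  # body with tags kept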
def detect_language(html):
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
def conn144():
conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def getStockFromSql():
conn,cursor=conn144()
# 检查记录是否存在
select_sql=f"SELECT ticker,exchange,xydm FROM mgzqyjwyh_list "
cursor.execute(select_sql)
gn_result = cursor.fetchall()
conn.commit()
itemList=[]
for item in gn_result:
try:
ticker=item[0]
exchange=item[1]
xydm=item[2]
exchange=str(exchange).upper()
param=exchange+':'+ticker+'_'+xydm
r.rpush('tradview_ticker', param)
itemList.append(param)
except Exception as e:
print(e)
cursor.close()
conn.close()
return itemList
def sendToKafka(detailmsg):
dic_news = {
'attachmentIds': '',
'author': '',
'content': detailmsg['content'],
'contentWithTag': detailmsg['contentWithTag'],
'createDate': detailmsg['createDate'],
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': 'Tradingview',
'publishDate': detailmsg['publishDate'],
'sid': '1711619846545776641',
'sourceAddress': detailmsg['sourceAddress'], # 原文链接
'summary': '',
'title': detailmsg['title'],
'type': 2,
'socialCreditCode': detailmsg['socialCreditCode'],
'year': detailmsg['year']
}
producer=KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
log.info(kafka_result.get(timeout=10))
except Exception as e:
log.info(f"发生kafka失败{e}")
finally:
producer.close()
#将连接保存到数据库
def saveLinkMsg(link,social_code):
conn,cursor=conn144()
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,create_time) values(%s,%s,%s,%s,%s,now())'''
# 动态信息列表
time_format = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
list_info = [
social_code,
link,
'Tradingview',
'2',
time_format
]
cursor.execute(insert_sql, tuple(list_info))
except Exception as e:
log.info(f'{link}插入库中失败{e}')
finally:
conn.commit()
cursor.close()
conn.close()
#查询是否存在
def selectLinkMsg(link,social_code):
flag=False
conn,cursor=conn144()
try:
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (link, social_code))
selects = cursor.fetchone()
if selects:
log.info(f'-----{social_code}----{link}:已经存在')
flag=True
except Exception as e:
log.info(f'查询数据是否在库中失败{e}')
finally:
conn.commit()
cursor.close()
conn.close()
return flag
if __name__ == '__main__':
# url='https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol=NASDAQ%3AAAPL'
# searchmsg=reqmsg(url)
# print(searchmsg)
# getStockFromSql()
while True:
try:
tradview_ticker=r.lpop('tradview_ticker')
if tradview_ticker:
tradviewticker = tradview_ticker.decode(errors='ignore')
ticker_param=str(tradviewticker).split('_')[0]
social_code=str(tradviewticker).split('_')[1]
url=f'https://news-headlines.tradingview.com/v2/headlines?client=web&lang=zh-Hans&symbol={ticker_param}'
searchmsg=reqmsg(url)
paserList(searchmsg,social_code)
except Exception as e:
log.info(f'redis中获取企业信息为空{e}')
break