提交 20a0eb97 作者: 刘伟刚

修改代码提交

上级 b376f641
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -64,8 +64,9 @@ class YahooCaiwu(object):
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
......@@ -376,6 +377,10 @@ class YahooCaiwu(object):
#对比指标计算
def calculateIndexReq(self):
get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
try:
params={
'type':2
......@@ -399,6 +404,7 @@ if __name__ == '__main__':
# parse_excel()
#get_content1()
yahoo=YahooCaiwu()
while True:
securitiescode=''
try:
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -58,74 +58,80 @@ class YahooCaiwu(object):
# 雅虎财经处理表格
def deal_table(self,doc_resp):
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
try:
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
if pq(doc_item).text() == '':
continue
a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
content_dict[a] = a_list
b_dict = {}
for doc_item1 in pq(doc_item).children()[1]:
b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
if not b:
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
if pq(doc_item).text() == '':
continue
b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
content_dict[b] = b_list
c_dict = {}
for doc_item2 in pq(doc_item1).children()[1]:
c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
if not c:
a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
content_dict[a] = a_list
b_dict = {}
for doc_item1 in pq(doc_item).children()[1]:
b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
if not b:
continue
c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
content_dict[c] = c_list
d_dict = {}
for doc_item3 in pq(doc_item2).children()[1]:
d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
if not d:
b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
content_dict[b] = b_list
c_dict = {}
for doc_item2 in pq(doc_item1).children()[1]:
c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
if not c:
continue
d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
content_dict[d] = d_list
e_dict = {}
for doc_item4 in pq(doc_item3).children()[1]:
e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
if not e:
c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
content_dict[c] = c_list
d_dict = {}
for doc_item3 in pq(doc_item2).children()[1]:
d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
if not d:
continue
e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
content_dict[e] = e_list
f_dict = {}
for doc_item5 in pq(doc_item4).children()[1]:
f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
if not f:
d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
content_dict[d] = d_list
e_dict = {}
for doc_item4 in pq(doc_item3).children()[1]:
e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
if not e:
continue
f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
content_dict[f] = f_list
g_dict = {}
for doc_item6 in pq(doc_item5).children()[1]:
g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
if not g:
e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
content_dict[e] = e_list
f_dict = {}
for doc_item5 in pq(doc_item4).children()[1]:
f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
if not f:
continue
g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
content_dict[g] = g_list
g_dict[g] = {}
f_dict[f] = g_dict
e_dict[e] = f_dict
d_dict[d] = e_dict
c_dict[c] = d_dict
b_dict[b] = c_dict
catalogue_dict[a] = b_dict
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
content_dict[f] = f_list
g_dict = {}
for doc_item6 in pq(doc_item5).children()[1]:
g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
if not g:
continue
g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
content_dict[g] = g_list
g_dict[g] = {}
f_dict[f] = g_dict
e_dict[e] = f_dict
d_dict[d] = e_dict
c_dict[c] = d_dict
b_dict[b] = c_dict
catalogue_dict[a] = b_dict
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
except Exception as e:
all_dict['表头'] = {}
all_dict['目录'] = {}
all_dict['内容'] = {}
return all_dict
......@@ -157,7 +163,8 @@ class YahooCaiwu(object):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' """ # and stock_code = "SYNH"
# sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='RAIZ4.SA' """ # and stock_code = "SYNH"
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' and any_data='0' """ # and stock_code = "SYNH"
# sql1 = f"select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='{securitiescode}' " # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
......@@ -396,6 +403,7 @@ class YahooCaiwu(object):
print('调用接口成功!!')
except:
print('调用失败!')
if __name__ == '__main__':
# parse_excel()
#get_content1()
......
# import redis
#
#
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#
# # 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# res = r.exists(f_key)
# value = list(r.smembers(f_key))
# # 对列表进行排序
# value.sort()
# # 遍历排序后的列表
# list_data = []
# for member in value:
# member = member.decode()
# members = member.strip('[').strip(']').replace('\'','').strip().split(',')
# #获取每一个报告期
# for date in members:
# data = date.strip()
# # print(date.strip())
# list_data.append(data)
# # 放入redis
# for item in list_data:
# r.sadd(key, item)
#
# # 获取Set中的所有元素
# items = r.smembers(key)
# # print(items)
# print("======================================")
import re
from urllib.parse import quote, unquote
import requests
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import json
import difflib
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import datetime
timestamp = 1688054400  # example Unix timestamp used to sanity-check the conversion below
date = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
# Browser-like request headers for quotes.sina.com.cn.
# NOTE(review): the Cookie was captured from a live browser session and will
# eventually expire — refresh it if the Sina requests start failing.
headers={
'Connection':'keep-alive',
'Pragma':'no-cache',
'Cache-Control':'no-cache',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Referer':'https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'UOR=,finance.sina.com.cn,; SINAGLOBAL=123.149.3.173_1695815968.404462; Apache=123.149.3.173_1695815968.404463; ULV=1695816017391:2:2:2:123.149.3.173_1695815968.404463:1695815967476; lxlrttp=1578733570; U_TRS1=000000ad.bc7f83f51.651419db.690100f2; U_TRS2=000000ad.bc8a83f51.651419db.138fca70; SUB=_2AkMSSJVgf8NxqwFRmP0XzG3kbIxxyA_EieKkFGS7JRMyHRl-yD9kqhY-tRB6Oci7j27VGy6gikgIaUYBZsIPzk3PbLLC; hqEtagMode=1',
}
# Local proxy endpoints; only used where a request explicitly passes `proxies=proxy`.
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
def reqHtml(url):
    """Fetch *url* with the shared Sina browser headers and return the decoded body.

    TLS verification is disabled and the response is decoded as GB18030,
    matching the encoding served by quotes.sina.com.cn.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=10)
    response.encoding = 'GB18030'
    return response.text
# Request headers for www.qyyjt.cn API calls.
# NOTE(review): 'pcuss' looks like a short-lived session JWT and 'Cookie'
# carries WAF session values — both were captured from a browser and will
# expire; refresh them from a logged-in session when requests start failing.
headers2={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'0',
'Cookie':'HWWAFSESID=fd8b573695b0ce804b; HWWAFSESTIME=1695799275143',
'Host':'www.qyyjt.cn',
'Origin':'https://www.qyyjt.cn',
'Pragma':'no-cache',
'Referer':'https://www.qyyjt.cn/detail/enterprise/overview?code=56CD928FAD278663E73BE7486C764DA7&type=company',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'client':'pc-web;pro',
'dataid':'869',
'pcuss':'eyJ0eXAiOiJKV1QiLCJ0eXBlIjoiand0IiwiYWxnIjoiSFMyNTYifQ.eyJjcmVhdGVUaW1lIjoiMjAyMy0wOS0yNyAyMDoxODowMy40NDkiLCJleHAiOjE2OTU4MTc5ODMsInVzZXJJZCI6IjIwMjMwOTI3MTUyMzA0XzEzNTkyNDgxODM5IiwiZXhwaXJlZFRpbWUiOiIyMDIzLTA5LTI3IDIwOjMzOjAzLjQ0OSJ9.SouwRylKogHfJILh97JMnYRzcJuj2Hg30BmQa9gc-Nc',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'system':'new',
'terminal':'pc-web;pro',
'user':'847E223529194582C37A02EEEC8AC09F0D7AD12E40778D6CA9CFB91F69F8C537',
'ver':'20230914',
'x-request-id':'x1eCRO-X8D7',
}
def reqPostMsg(url):
    """POST to *url* with the qyyjt.cn session headers and return the UTF-8 body.

    TLS verification is disabled; no request body is sent (Content-Length: 0).
    """
    response = requests.post(url, headers=headers2, verify=False, timeout=10)
    response.encoding = 'utf-8'
    return response.text
def get_realurl(tmpurl):
    """Extract the percent-decoded target URL embedded in a redirect link.

    Searches *tmpurl* for a ``url=...&aid`` segment and returns the decoded
    capture group; returns '' when no such segment exists or the input is
    not a string.
    """
    try:
        # Non-greedy: capture everything between 'url=' and the next '&aid'.
        match = re.search(r'url=(.{1,}?)&aid', tmpurl)
        if match:
            result = unquote(match.group(1))
        else:
            result = ''
    except TypeError:
        # Defensive: a non-string argument yields '' instead of raising,
        # preserving the original best-effort contract (was a bare except).
        result = ''
    return result
def getFormatedate(timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
# Debug/demo run: print the example timestamp conversion, then fetch the
# Sina quarterly income page for BRK.A and dump its financial-data table.
print(date)
url='https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter'
ttext=reqHtml(url)
soup=BeautifulSoup(ttext,'html.parser')
# First (and only expected) data table inside the table wrapper div.
tdoc=soup.select('div[class="tbl_wrap"]>table[class="data_tbl os_tbl"]')[0]
print(str(tdoc))
......@@ -461,7 +461,7 @@ def listPage():
}
]
for operand in operands:
logger.info(f'采集地域股票信息{operand}')
rego=operand['operands'][1]
#第一次请求获取地域总共有的股票代码数量
try:
stockmsg=reqmsg(0,operand)
......@@ -469,21 +469,23 @@ def listPage():
except Exception as e:
logger.info(f'region该地域没有股票信息{operand}')
continue
logger.info(f'采集地域股票信息{rego}---对应的数量{total}')
for i in range(0,total,100):
logger.info(f"offset的值{i}")
stockmsg=reqmsg(i,operand)
if stockmsg:
try:
getStock(stockmsg)
getStock(stockmsg,rego)
except Exception as e:
logger.info(f"解析失败{e}")
time.sleep(3)
def getStock(stockmsg):
def getStock(stockmsg,rego):
quotes=stockmsg['finance']['result'][0]['quotes']
for quote in quotes:
symbol=quote['symbol']
logger.info(f"{rego}地区对应的股票代码{symbol}")
try:
longName=quote['longName']
except:
......
......@@ -169,15 +169,13 @@ class QQnewsTaskJob(object):
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
......
#coding=utf-8
#coding=utf-8
......@@ -402,6 +402,7 @@ class BaiduSpider(object):
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -190,7 +190,7 @@ if __name__ == '__main__':
while True:
try:
codeList=[]
codeList.append('KW-20230818-0003')
codeList.append('KW-20230925-0002')
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
......@@ -207,7 +207,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论