Commit e13aa13e  Author: 刘伟刚

Code change 4

Parent 07b5b32c
......@@ -16,9 +16,9 @@ from openpyxl import Workbook
import langid
#创建连接池
import pymysql
from pymysql import connections
from DBUtils.PooledDB import PooledDB
import pymysql
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
......
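The hunk above pulls in DBUtils' PooledDB next to pymysql, but the pool itself is not shown in this diff. Below is a minimal sketch of how such a pool could back the direct pymysql.Connect calls that follow; the pool sizes are illustrative assumptions, and the host/credentials simply mirror the values that appear elsewhere in this commit.

import pymysql
from DBUtils.PooledDB import PooledDB

# Illustrative pool; sizes are assumptions, credentials mirror conn11() below.
pool = PooledDB(
    creator=pymysql,        # pymysql creates the underlying connections
    maxconnections=5,       # upper bound on simultaneously open connections
    mincached=1,            # idle connections kept ready at startup
    blocking=True,          # wait for a free connection instead of raising
    host='114.116.44.11', port=3306,
    user='caiji', passwd='f7s0&7qqtK', db='clb_project', charset='utf8'
)

conn = pool.connection()    # borrow a connection from the pool
cursor = conn.cursor()
# ... run queries ...
cursor.close()
conn.close()                # returns the connection to the pool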
......@@ -9,7 +9,7 @@ r = basecore.r
def conn11():
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='caiji', passwd='f7s0&7qqtK', db='clb_project',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
......@@ -25,6 +25,7 @@ def yahooCodeFromSql():
print('=======')
for item in gn_social_list:
r.rpush('NoticeEnterprise:securities_code', item)
print('将股票代码放入redis结束')
except Exception as e:
log.info("数据查询异常")
finally:
......
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-4-9
import datetime
import xlrd
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm
......@@ -42,6 +44,7 @@ create_time
"""
class YahooCaiwu(object):
def __init__(self):
self.config = configparser.ConfigParser()
# 读取配置文件
......@@ -125,6 +128,7 @@ class YahooCaiwu(object):
all_dict['内容'] = content_dict
return all_dict
def get_webdriver(self):
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
......@@ -173,7 +177,7 @@ class YahooCaiwu(object):
stock2=stock2[1:]
url = f'https://finance.yahoo.com/quote/{stock2}/financials?p={stock2}'
try:
self.logger.info(f'正在采集:{url}')
print(f'正在采集:{url}')
self.driver.get(url)
# 等待页面加载完成
wait = WebDriverWait(self.driver, 300)
......@@ -293,13 +297,13 @@ class YahooCaiwu(object):
try:
resp = requests.get(get_url)
print(resp.text)
self.logger.info('调用接口成功!!')
print('调用接口成功!!')
except:
with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8')as f:
f.write(stock + '\n')
except Exception as e:
print(e)
self.logger.info(f'采集:{url}失败')
print(f'采集:{url}失败')
self.driver.quit()
time.sleep(10)
self.driver=self.get_webdriver()
......@@ -307,15 +311,16 @@ class YahooCaiwu(object):
# time.sleep(60 * 10)
self.driver.quit()
time.sleep(10)
driver=self.get_webdriver()
self.logger.info('出错,重试中!')
self.driver=self.get_webdriver()
print('出错,重试中!')
continue
# driver.close()
# self.driver.close()
def dataToSql(self,conn,cursor,ipo_data):
try:
social_credit_code=ipo_data['social_credit_code']
stock=str(ipo_data['stock'])
stock=ipo_data['stock']
securities_short_name=ipo_data['securities_short_name']
content=ipo_data['content']
level_relation=ipo_data['level_relation']
......@@ -331,23 +336,27 @@ class YahooCaiwu(object):
select_sql=f"SELECT * FROM config_finance_data_sync WHERE stock_code='{stock}'"
cursor.execute(select_sql)
existing_record = cursor.fetchone()
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
if existing_record:
# 记录已存在,执行更新操作
update_param=(social_credit_code,content,level_relation,origin_type,stock)
update_sql=f"UPDATE config_finance_data_sync SET social_credit_code=%s , content=%s , level_relation=%s,origin_type=%s WHERE stock_code=%s "
update_param=(social_credit_code,content,level_relation,origin_type,currentdate,stock)
update_sql=f"UPDATE config_finance_data_sync SET social_credit_code=%s , content=%s , level_relation=%s,origin_type=%s ,create_time=%s WHERE stock_code=%s "
cursor.execute(update_sql,update_param)
print('更新成功')
else:
insert_param=(social_credit_code,content,level_relation,unit,stock,origin_type)
insert_sql=f"INSERT INTO config_finance_data_sync (social_credit_code, content,level_relation,unit,stock_code,origin_type) VALUES ( %s, %s, %s, %s, %s, %s)"
insert_param=(social_credit_code,content,level_relation,unit,stock,origin_type,currentdate)
insert_sql=f"INSERT INTO config_finance_data_sync (social_credit_code, content,level_relation,unit,stock_code,origin_type,create_time) VALUES ( %s,%s, %s, %s, %s, %s, %s)"
# 记录不存在,执行插入操作
cursor.execute(insert_sql,insert_param)
print('插入成功')
# 提交事务
conn.commit()
        except Exception as e:
            return False
        finally:
            cursor.close()
            conn.close()
        return True  # moved out of finally so a failed insert/update is not reported as success
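The updated dataToSql() keeps the select-then-update-or-insert round trip and now stamps create_time. If config_finance_data_sync has (or could be given) a UNIQUE key on stock_code — an assumption, not something this diff shows — the same logic could be collapsed into a single upsert, sketched here:

# Hypothetical single-statement upsert; assumes a UNIQUE index on stock_code.
upsert_sql = (
    "INSERT INTO config_finance_data_sync "
    "(social_credit_code, content, level_relation, unit, stock_code, origin_type, create_time) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s) "
    "ON DUPLICATE KEY UPDATE "
    "social_credit_code=VALUES(social_credit_code), content=VALUES(content), "
    "level_relation=VALUES(level_relation), origin_type=VALUES(origin_type), "
    "create_time=VALUES(create_time)"
)
cursor.execute(upsert_sql, (social_credit_code, content, level_relation, unit, stock, origin_type, currentdate))
conn.commit()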
def get_unit(self,doc_resp):
......@@ -366,9 +375,13 @@ if __name__ == '__main__':
#get_content1()
yahoo=YahooCaiwu()
while True:
securitiescode=''
try:
securitiescode=yahoo.getCodeFromRedis()
yahoo.get_content2(securitiescode)
except Exception as e:
yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode)
if securitiescode:
yahoo.r.rpush('NoticeEnterprise:securities_code',securitiescode)
else:
time.sleep(300)
......@@ -5,7 +5,7 @@ pass=clbzzsn
[mysql]
host=114.115.159.144
username=root
username=caiji
password=zzsn9988
database=caiji
url=jdbc:mysql://114.115.159.144:3306/caiji?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false
......
# -*- coding: utf-8 -*-
import datetime
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from requests.packages import urllib3
urllib3.disable_warnings()
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import configparser
import redis
class CurrencyRate(object):
def __init__(self):
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=6)
self.driver=self.get_webdriver()
def get_webdriver(self):
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
# chrome_options.add_argument('--headless')
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
executable_path =self.config.get('selenium', 'chrome_driver')
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
return driver
def getRate(self):
rateList=[]
for result1 in result_list1:
currency_name = result1[0]
currency = result1[1]
to_USD = ''
to_CNY = ''
for i in range(len(result_list2)):
result2 = result_list2[i]
# https://qq.ip138.com/hl.asp?from=CNY&to=USD&q=1
url = f'''https://qq.ip138.com/hl.asp?from={currency}&to={result2}&q=1'''
# 等待页面加载完成
try:
self.driver.get(url)
wait = WebDriverWait(self.driver, 300)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(1)
doc_resp = pq(self.driver.page_source)
money = doc_resp('table tr:nth-child(3) td:nth-child(3)').text()
if money == '1':
money_result = money
else:
try:
money_result = round(float(money), 4)
except:
continue
if i == 0:
to_USD = money_result
else:
to_CNY = money_result
except Exception as e:
try:
self.driver.close()
self.driver.quit()
except Exception as e:
print(e)
self.driver=self.get_webdriver()
now = datetime.datetime.now()
now_time = now.strftime('%Y-%m-%d')
if to_USD == '' or to_CNY == '':
continue
result_dict = {
'币种': currency_name,
'币简称': currency,
'对美元': to_USD,
'对人民币': to_CNY,
'更新时间': now_time }
print(result_dict)
rate={
"currencyName": currency_name,
"currencyCode": currency,
"rateToUSD": to_USD,
"rateToCNY": to_CNY,
"createDate": now_time
}
rateList.append(rate)
# market_url = f'http://192.168.1.39:8088/sync/currencyRate'
market_url = f'http://114.115.236.206:8088/sync/currencyRate'
try:
resp = requests.post(market_url,json=rateList)
# 检查响应状态码
if resp.status_code == 200:
print("请求成功")
# 打印响应内容
print(resp.content)
else:
print("请求失败")
except Exception as e:
print(e)
if __name__ == '__main__':
    result_list1 = [
        ['人民币', 'CNY'],
        ['美元', 'USD'],
        ['欧元', 'EUR'],
        ['瑞士法郎', 'CHF'],
        ['加元', 'CAD'],
        ['波兰兹罗提', 'PLN'],
        ['英镑', 'GBP'],
        ['澳元', 'AUD'],
        ['泰铢', 'THB'],
        ['沙特里亚尔', 'SAR'],
        ['巴西里亚伊', 'BRL'],
        ['新土耳其新里拉', 'TRY'],
        ['新台币', 'TWD'],
        ['印度卢比', 'INR'],
        ['墨西哥比索', 'MXN'],
        ['日元', 'JPY'],
        ['瑞典克朗', 'SEK'],
        ['韩元', 'KRW'],
        ['俄罗斯卢布', 'RUB'],
        ['新加坡元', 'SGD'],
        ['港币', 'HKD']]
    result_list2 = ['USD', 'CNY']
currenRate=CurrencyRate()
currenRate.getRate()
currenRate.driver.quit()
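get_webdriver() here (and in the Yahoo scripts in this commit) passes executable_path straight to webdriver.Chrome, which only works on Selenium 3.x; Selenium 4 removed that keyword. Purely as a sketch for an eventual upgrade — an assumption about the environment, not a change this commit makes — the equivalent construction with the same config.ini keys would look like this:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def build_driver(config):
    # Same options as get_webdriver(), but the driver path goes through Service.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.binary_location = config.get('selenium', 'binary_location')
    service = Service(executable_path=config.get('selenium', 'chrome_driver'))
    return webdriver.Chrome(service=service, options=chrome_options)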
# -*- coding: utf-8 -*-
import datetime
import xlrd
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm
import pymongo
import pymysql
import time
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
import json
from requests.packages import urllib3
urllib3.disable_warnings()
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
from BaseCore import BaseCore
import configparser
import redis
class Shizhi(object):
def __init__(self):
self.config = configparser.ConfigParser()
# 读取配置文件
self.config.read('config.ini')
baseCore=BaseCore()
self.logger=baseCore.getLogger()
self.r = redis.Redis(host=self.config.get('redis', 'host'),
port=self.config.get('redis', 'port'),
password=self.config.get('redis', 'pass'), db=6)
self.driver=self.get_webdriver()
def get_webdriver(self):
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")
# chrome_options.add_argument('--headless')
chrome_options.binary_location = self.config.get('selenium', 'binary_location')
executable_path =self.config.get('selenium', 'chrome_driver')
driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
return driver
def conn11(self):
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def getmarketCap(self):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where category in ('4','5','6') """ # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
except Exception as e:
self.logger.info("数据查询异常!")
return
for data in result_data:
try:
data_list = list(data)
print(data_list)
social_credit_code = data_list[0]
stock = data_list[1]
securities_short_name = data_list[2] if data_list[2] is not None else ""
# content_sql = ''
stock2=str(stock)
if stock2.upper().endswith("HK") and stock2.upper().startswith("0") :
stock2=stock2[1:]
# https://finance.yahoo.com/quote/032830.KS?p=032830.KS
url = f'https://finance.yahoo.com/quote/{stock2}?p={stock2}'
try:
self.logger.info(f'正在采集:{url}')
self.driver.get(url)
# 等待页面加载完成
wait = WebDriverWait(self.driver, 300)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(5)
doc_resp = pq(self.driver.page_source)
unit=doc_resp('div[id="quote-header-info"]>div:nth-child(2)>div:nth-child(1)>div:nth-child(2)>span')
currency = unit.text().split("Currency in ")[1]
market_cap=doc_resp('td[data-test="MARKET_CAP-value"]')
marketcap=market_cap.text()
if marketcap and marketcap!='N/A':
# 获取当前时间
current_time = datetime.datetime.now()
currentdate = current_time.strftime("%Y-%m-%d")
print(f'信用代码:{social_credit_code} 股票代码:{stock} 币种:{currency} 市值:{marketcap} 日期:{currentdate}')
# market_url = f'http://192.168.1.39:8088/sync/marketValue'
market_url = f'http://114.115.236.206:8088/sync/marketValue'
param= {
"socialCreditCode": social_credit_code,
"stockCode": stock,
"marketValue": marketcap,
"originalUnit": currency,
"valueTime": currentdate
}
try:
resp = requests.post(market_url,json=param)
# 检查响应状态码
if resp.status_code == 200:
print("请求成功")
# 打印响应内容
print(resp.content)
else:
print("请求失败")
except:
with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8')as f:
f.write(stock + '\n')
except Exception as e:
self.driver.close()
self.driver.quit()
self.driver=self.get_webdriver()
print(e)
except Exception as e:
print(e)
self.driver.close()
self.driver.quit()
self.driver=self.get_webdriver()
if __name__ == '__main__':
shizhi=Shizhi()
shizhi.getmarketCap()
\ No newline at end of file
# -*- coding: utf-8 -*-
import datetime
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pyquery import PyQuery as pq
from openpyxl import Workbook
import pandas as pd
class WanfangSpider(object):
def __init__(self):
pass
def req(self,url):
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}
res = requests.get(url,headers=header)
if res.status_code==200:
text=res.text
print('请求成功!')
else:
text=''
print('请求失败!')
return text
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def pageList(self):
listmsg=[]
for num in range(1,73):
url=f'https://kms.wanfangdata.com.cn/IndustryYJ/Search/Cecdb?q=%E5%86%B6%E9%87%91%2B%E5%86%B6%E7%82%BC%20%E6%9C%BA%E6%9E%84%3Acsi&f=Inst.Type&PageNumber={num}'
html=self.req(url)
soup=self.paserUrl(html,url)
text=str(soup.prettify())
doc=pq(text)
liTag=doc('li[class="rt-wrap"]')
# print(liTag)
for li in liTag:
lidoc=pq(li)
title=lidoc('a[class="title"]').text()
turl=lidoc('a[class="title"]').attr('href')
msg={
'title':title,
'turl':turl
}
listmsg.append(msg)
return listmsg
def detailMsg(self,msg):
detailList=[]
turl = msg['turl']
title = msg['title']
html=self.req(turl)
soup=self.paserUrl(html,turl)
dtext=str(soup.prettify())
ddoc=pq(dtext)
a1=ddoc('table[class="detail-md"]>tr:nth-child(2)>td:nth-child(1)').text().replace(":","")
institutionType=ddoc('table[class="detail-md"]>tr:nth-child(2)>td:nth-child(2)').text()
a2=ddoc('table[class="detail-md"]>tr:nth-child(3)>td:nth-child(1)').text().replace(":","")
formerName=ddoc('table[class="detail-md"]>tr:nth-child(3)>td:nth-child(2)').text()
a3=ddoc('table[class="detail-md"]>tr:nth-child(4)>td:nth-child(1)').text().replace(":","")
leader=ddoc('table[class="detail-md"]>tr:nth-child(4)>td:nth-child(2)').text()
a4=ddoc('table[class="detail-md"]>tr:nth-child(5)>td:nth-child(1)').text().replace(":","")
establishmentDate=ddoc('table[class="detail-md"]>tr:nth-child(5)>td:nth-child(2)').text()
a5=ddoc('table[class="detail-md"]>tr:nth-child(6)>td:nth-child(1)').text().replace(":","")
introduction=ddoc('table[class="detail-md"]>tr:nth-child(6)>td:nth-child(2)').text()
a6=ddoc('table[class="detail-md"]>tr:nth-child(7)>td:nth-child(1)').text().replace(":","")
classification=ddoc('table[class="detail-md"]>tr:nth-child(7)>td:nth-child(2)').text()
a7=ddoc('table[class="detail-md"]>tr:nth-child(8)>td:nth-child(1)').text().replace(":","")
keywords=ddoc('table[class="detail-md"]>tr:nth-child(8)>td:nth-child(2)').text()
a8=ddoc('table[class="detail-md"]>tr:nth-child(9)>td:nth-child(1)').text().replace(":","")
researchEquipment=ddoc('table[class="detail-md"]>tr:nth-child(9)>td:nth-child(2)').text()
a9=ddoc('table[class="detail-md"]>tr:nth-child(10)>td:nth-child(1)').text().replace(":","")
researchAreas=ddoc('table[class="detail-md"]>tr:nth-child(10)>td:nth-child(2)').text()
a10=ddoc('table[class="detail-md"]>tr:nth-child(11)>td:nth-child(1)').text().replace(":","")
awards=ddoc('table[class="detail-md"]>tr:nth-child(11)>td:nth-child(2)').text()
a11=ddoc('table[class="detail-md"]>tr:nth-child(12)>td:nth-child(1)').text().replace(":","")
internalDepartments=ddoc('table[class="detail-md"]>tr:nth-child(12)>td:nth-child(2)').text()
a12=ddoc('table[class="detail-md"]>tr:nth-child(13)>td:nth-child(1)').text().replace(":","")
subsidiaryInstitutions=ddoc('table[class="detail-md"]>tr:nth-child(13)>td:nth-child(2)').text()
a13=ddoc('table[class="detail-md"]>tr:nth-child(14)>td:nth-child(1)').text().replace(":","")
productInformation=ddoc('table[class="detail-md"]>tr:nth-child(14)>td:nth-child(2)').text()
a14=ddoc('table[class="detail-md"]>tr:nth-child(15)>td:nth-child(1)').text().replace(":","")
publicationJournals=ddoc('table[class="detail-md"]>tr:nth-child(15)>td:nth-child(2)').text()
a15=ddoc('table[class="detail-md"]>tr:nth-child(16)>td:nth-child(1)').text().replace(":","")
mailingAddress=ddoc('table[class="detail-md"]>tr:nth-child(16)>td:nth-child(2)').text()
a16=ddoc('table[class="detail-md"]>tr:nth-child(17)>td:nth-child(1)').text().replace(":","")
tel=ddoc('table[class="detail-md"]>tr:nth-child(17)>td:nth-child(2)').text()
a17=ddoc('table[class="detail-md"]>tr:nth-child(18)>td:nth-child(1)').text().replace(":","")
faxNumber=ddoc('table[class="detail-md"]>tr:nth-child(18)>td:nth-child(2)').text()
a18=ddoc('table[class="detail-md"]>tr:nth-child(19)>td:nth-child(1)').text().replace(":","")
email=ddoc('table[class="detail-md"]>tr:nth-child(19)>td:nth-child(2)').text()
a19=ddoc('table[class="detail-md"]>tr:nth-child(20)>td:nth-child(1)').text().replace(":","")
website=ddoc('table[class="detail-md"]>tr:nth-child(20)>td:nth-child(2)').text()
detailmsg={
'title':title,
a1:institutionType,
a2:formerName,
a3:leader,
a4:establishmentDate,
a5:introduction,
a6:classification,
a7:keywords,
a8:researchEquipment,
a9:researchAreas,
a10:awards,
a11:internalDepartments,
a12:subsidiaryInstitutions,
a13:productInformation,
a14:publicationJournals,
a15:mailingAddress,
a16:tel,
a17:faxNumber,
a18:email,
a19:website
}
detailList.append(detailmsg)
self.writerToExcel(detailList)
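detailMsg() above addresses each of the 19 label/value rows with its own nth-child selector. A loop over the table rows could produce the same dictionary; this is only a sketch, assuming every data row of table.detail-md carries a label cell and a value cell as those selectors imply.

# Hypothetical loop-based variant of the row-by-row selectors above.
detailmsg = {'title': title}
for tr in ddoc('table[class="detail-md"] tr').items():
    tds = tr('td')
    if len(tds) < 2:
        continue                                   # skip header/irregular rows
    label = tds.eq(0).text().replace(':', '')     # e.g. 机构类型, 负责人, ...
    value = tds.eq(1).text()
    if label:
        detailmsg[label] = value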
def conn144(self):
conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def dataToSql(self,detailmsg):
conn,cursor=self.conn144()
try:
# 检查记录是否存在
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
print('+++++')
finally:
cursor.close()
conn.close()
# 将数据追加到excel
def writerToExcel(self,detailList):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据添加到现有数据的末尾
combined_data = existing_data.append(new_data, ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
if __name__ == '__main__':
filename='机构.xlsx'
# # 创建一个工作簿
workbook = Workbook()
workbook.save(filename)
wanfang=WanfangSpider()
lsitmsg=wanfang.pageList()
for msg in lsitmsg:
wanfang.detailMsg(msg)
\ No newline at end of file
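writerToExcel() grows the workbook with DataFrame.append, which pandas 2.x no longer provides. A sketch of the same read-append-write step using pd.concat (the function name is illustrative; the behaviour is otherwise unchanged):

import pandas as pd

def writer_to_excel(detail_list, filename):
    existing_data = pd.read_excel(filename)              # rows already saved
    new_data = pd.DataFrame(data=detail_list)            # rows from this page
    combined = pd.concat([existing_data, new_data], ignore_index=True)
    combined.to_excel(filename, index=False)             # overwrite with the merged sheet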