提交 2ce0a6ad 作者: LiuLiYuan

Changes

上级 4b1fa562
import sys
import pandas as pd
import numpy as np
import pymysql
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import requests
import certifi
from bs4 import BeautifulSoup
from base import BaseCore
import pymysql
# social_code = '91440300665899831W'
# cn = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='caiji', charset='utf8mb4')
# cursor = cn.cursor()
# sql = f"SELECT * FROM `EnterpriseInfo` WHERE SocialCode = '{social_code}'"
# cursor.execute(sql)
# data = cursor.fetchone()
# enname = data[5]
# gpdm = data[3]
# xydm = data[2]
# print(enname,gpdm,xydm)
# basecore =BaseCore.BaseCore()
# path = r'F:\spider\115\chromedriver.exe'
# driver = basecore.buildDriver(path,headless=False)
# url = 'https://www.baidu.com/'
# driver.get(url)
# time.sleep(10)
# driver.close()
# # service = Service(r'./chromedriver.exe')
# # executable_path = '.\\chromedriver.exe/'
# chrome_options = webdriver.ChromeOptions()
# # chrome_options.binary_location = r'./goole/Google/Chrome/Application/chrome.exe'
# # if headless:
# # chrome_options.add_argument('--headless')
# # chrome_options.add_argument('--disable-gpu')
# chrome_options.add_experimental_option(
# "excludeSwitches", ["enable-automation"])
# chrome_options.add_experimental_option('useAutomationExtension', False)
# chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
# chrome_options.add_argument(
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
# driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
# driver.get('www.baidu.com')
# time.sleep(10)
# driver.close()
# def a():
# return ''
# xxx = a()
# if xxx == '':
# print('xxx')
# Shared project helper and the logger obtained from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Alternative connection (clb_project database), currently disabled:
# cnx = pymysql.connect(host='114.116.44.11', user='root', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
# cursor = cnx.cursor()

# Connection to the `caiji` database; opened as soon as the module is loaded.
# NOTE(review): host and credentials are hard-coded in source -- consider
# moving them to configuration.
cnx = pymysql.connect(
    host='114.115.159.144',
    user='root',
    password='zzsn9988',
    db='caiji',
    charset='utf8mb4',
)
# NOTE(review): the name `curosr` (sic) is referenced by the entry-point code
# further down, so the spelling is kept.
curosr = cnx.cursor()

# Request headers sent with every Yahoo Finance request issued by this script.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    #'cookie': 'maex=%7B%22v2%22%3A%7B%7D%7D; GUC=AQEBBwFjY49jkEIa8gQo&s=AQAAABw20C7P&g=Y2JIFQ; A1=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A3=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc; A1S=d=AQABBBIpnmICEOnPTXZVmK6DESXgxq3niTMFEgEBBwGPY2OQYysNb2UB_eMBAAcIEimeYq3niTM&S=AQAAAobGawhriFKqJdu9-rSz9nc&j=WORLD; PRF=t%3D6954.T%252BTEL%252BSOLB.BR%252BSTM%252BEMR%252BGT%252BAMD%252BSYM.DE%252BPEMEX%252BSGO.PA%252BLRLCF%252BSYNH%252B001040.KS; cmp=t=1669714927&j=0&u=1---',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
def _node_text(node):
    """Return ``node.text``, or '' when the lookup that produced *node* found nothing."""
    return node.text if node is not None else ''


def _text_at(nodes, index):
    """Return the text of ``nodes[index]``, or '' when *nodes* is too short."""
    return nodes[index].text if index < len(nodes) else ''


def _blank_if_na(value):
    """Replace the literal placeholder "N/A" with an empty string."""
    return '' if value == 'N/A' else value


def getInfo(gpdm, xydm):
    """Scrape the Yahoo Finance profile page of one company.

    Args:
        gpdm: Ticker symbol; interpolated into the profile URL. When it
            contains 'HK', its first character is dropped before it is stored
            in the result (the URL still uses the unmodified value).
        xydm: Credit code; copied into the result unchanged.

    Returns:
        A dict with two keys:
        ``'base_info'``   -- dict of company fields (all strings);
        ``'people_info'`` -- list of dicts, one per row of the executives table.
        When the page cannot be fetched (no HTTP 200 after three attempts) or
        the profile container is missing, ``'base_info'`` holds only the two
        codes with every other field empty, and ``'people_info'`` is ``[]``.
    """
    print('开始')
    gpdm_ = gpdm
    if 'HK' in gpdm_:
        gpdm_ = gpdm_[1:]
    start = time.time()

    # Default result, returned as-is on every failure path.
    retData = {
        'base_info': {
            '公司名称': '',
            '英文名': '',
            '信用代码': xydm,
            '股票代码': gpdm_,
            '地址': '',
            '电话': '',
            '公司网站': '',
            '部门': '',
            '行业': '',
            '员工人数': '',
            '公司简介': ''
        },
        'people_info': [],
    }

    # e.g. https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE
    url = f'https://finance.yahoo.com/quote/{gpdm}/profile?p={gpdm}'

    # NOTE(review): this delay sat between interleaved revisions in the
    # unindented source; it is kept here as a pause before the first request --
    # confirm it belongs to getInfo.
    time.sleep(3)

    # `response` starts as None so that three failed attempts cannot leave it
    # unbound for the status check below.
    response = None
    for i in range(0, 3):
        try:
            # The timeout keeps a stalled connection from blocking forever.
            response = requests.get(url, headers=headers, verify=False, timeout=30)
        except Exception as exc:
            # Best-effort retry: record the failure and try again.
            log.error(f"{gpdm}---第{i}次---获取基本信息接口请求异常:{exc}")
            continue
        time.sleep(1)
        if response.status_code == 200:
            break
        log.error(f"{gpdm}---第{i}次---获取基本信息接口返回失败:{response.status_code}")

    if response is None or response.status_code != 200:
        status = response.status_code if response is not None else '请求异常'
        log.error(f"{gpdm}------获取基本信息接口重试后依然失败失败:{status}")
        return retData

    soup = BeautifulSoup(response.content, 'html.parser')
    page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
    if page is None:
        # Without the profile container none of the lookups below can work.
        log.error(f"{gpdm}------未找到企业信息页面节点")
        return retData

    # Store the heading text rather than the parsed element, so that every
    # field in the result is a plain string.
    name = _node_text(page.find('h3', {'class': 'Fz(m) Mb(10px)'}))

    com_info = page.find('div', {'class': 'Mb(25px)'})
    paragraphs = com_info.find_all('p') if com_info is not None else []

    # First paragraph: address text with the phone and website links embedded.
    com_phone = com_url = com_address = ''
    if paragraphs:
        contact = paragraphs[0]
        com_phone = _node_text(contact.find('a'))
        com_url = _node_text(contact.find('a', {'target': '_blank'}))
        # What remains after removing phone and website text is the address.
        com_address = contact.text.replace(com_phone, '').replace(com_url, '')

    # Second paragraph: highlighted spans read positionally as
    # department, industry and employee count.
    facts = paragraphs[1].find_all('span', {'class': 'Fw(600)'}) if len(paragraphs) > 1 else []
    com_bumen = _text_at(facts, 0)
    com_hangye = _text_at(facts, 1)
    com_people = _text_at(facts, 2)

    com_jianjie = _node_text(page.find('p', {'class': 'Mt(15px) Lh(1.6)'}))

    retData['base_info'] = {
        '公司名称': '',
        '英文名': name,
        '信用代码': xydm,
        '股票代码': gpdm_,
        '地址': com_address,
        '电话': com_phone,
        '公司网站': com_url,
        '部门': com_bumen,
        '行业': com_hangye,
        '员工人数': com_people,
        '公司简介': com_jianjie
    }

    # Executive information: every table row after the header row.
    table = page.find('table', {'class': 'W(100%)'})
    list_people = table.find_all('tr')[1:] if table is not None else []
    retPeople = []
    for one_people in list_people:
        cells = one_people.find_all('td')
        if not cells:
            # A row without a name cell is skipped entirely.
            continue
        retPeople.append({
            '公司名称': name,
            '股票代码': gpdm_,
            '信用代码': xydm,
            '姓名': cells[0].text,
            # "N/A" placeholders are stored as empty strings.
            '职务': _blank_if_na(_text_at(cells, 1)),
            '薪资': _blank_if_na(_text_at(cells, 2)),
            '行使': _blank_if_na(_text_at(cells, 3)),
            '出生年份': _blank_if_na(_text_at(cells, 4))
        })
    retData['people_info'] = retPeople

    log.info(f"获取基本信息--{gpdm},耗时{baseCore.getTimeCost(start, time.time())}")
    return retData


# The two trace stubs below were interleaved with getInfo's body in the source.
# They are kept at module level because the `__main__` section calls them.
def a():
    """Print trace markers; exits the interpreter on the second loop pass.

    NOTE(review): the nesting was reconstructed from an unindented source --
    confirm that `print('a....4')` belongs to the loop body rather than to the
    `if` branch.
    """
    print('a....1')
    for i in range(2):
        if i == 1:
            print('a....3')
            sys.exit(0)
        print('a....4')
    print('a....2')


def b():
    """Print a single trace marker."""
    print('b....1')
    pass
# Script entry point.
# NOTE(review): this section interleaves two variants of the entry point -- the
# a()/b() trace calls and the database-driven scrape below -- separated by a
# diff marker line. Confirm which statements are meant to remain.
if __name__ == '__main__':
# Trace calls. NOTE(review): a() contains a sys.exit(0) call, so the statements
# after it may never run.
print('a....start')
a()
print('b....start')
b()
\ No newline at end of file
# Fetch the company's gpdm and xydm from the database.
# The query selects at most one row from Tfbs whose col3 starts with 'ZZSN'
# and whose state1 is 1, ordered by date1 and then id.
sql_select = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col6 is not null and state1=1 and col3 like 'ZZSN%' order by date1 ,id LIMIT 1"
curosr.execute(sql_select)
# NOTE(review): fetchone() returns None when no row matches, which would make
# the indexing below fail -- confirm a row is always available.
data = curosr.fetchone()
# NOTE(review): `id` shadows the builtin of the same name.
id = data[0]
# Mark the fetched company's collection state as updated (currently disabled).
# sql_update = f"UPDATE Tfbs set state1 = 2 WHERE id = {id}"
# curosr.execute(sql_update)
# cnx.commit()
# Column 4 of the row is passed as the credit code, column 7 as the ticker.
xydm = data[4]
gpdm = data[7]
# Fetch the company's basic information and executive information.
retData = getInfo(gpdm,xydm)
print(retData)
# Release the database resources opened at module level.
curosr.close()
cnx.close()
# Request a fixed Yahoo Finance profile URL and print the HTTP status code.
# verify=False turns off TLS certificate verification for this request.
url = 'https://finance.yahoo.com/quote/VOW3.DE/profile?p=VOW3.DE'
req = requests.get(url=url,headers=headers,verify=False)
print(req.status_code)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论