Commit 930da4ff  Author: LiJunMing

Maintenance of the NEEQ (New Third Board) financial-data scripts

Parent e3ee9068
+++ /dev/null
import json
import random
import time

import pandas as pd
import pymysql
import requests
import urllib3
from bs4 import BeautifulSoup

from base.BaseCore import BaseCore

baseCore = BaseCore()
log = baseCore.getLogger()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Direct connection for the proxy pool; baseCore supplies the connection for the Tfbs task table
cnx = pymysql.connect(host='114.115.159.144', user='root', password='zzsn9988', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
def get_proxy():
    """Load the proxy pool from clb_proxy; rows are stored as 'host-port' strings."""
    sql = "select proxy from clb_proxy"
    cursor.execute(sql)
    proxy_lists = cursor.fetchall()
    ip_list = []
    for proxy_ in proxy_lists:
        ip_list.append(proxy_[0])  # each row is a 1-tuple; take the value instead of string-mangling its repr()
    proxy_list = []
    for str_ip in ip_list:
        host, port = str_ip.split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": host,
            "port": port,
        }
        # requests matches proxy keys against the (lowercase) URL scheme, so the keys must be lowercase
        proxy = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        proxy_list.append(proxy)
    return proxy_list
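# Illustration (assumed data shape): for a clb_proxy row stored as '1.2.3.4-8080',
# get_proxy() returns [{'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}, ...],
# which can be passed straight to requests via proxies=random.choice(get_proxy()).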
# Note: the Cookie (including auth_token) is a hard-coded, session-bound credential and will expire
headers = {
    'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
list_code = []
while True:
    # (several of these accumulators are unused leftovers)
    list_weicha = []
    list_all_info = []
    name_list = []
    start_time = time.time()
    # Fetch one pending company record
    query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and state2 is null limit 1 "
    # Single-company run, e.g. Industrial Bank (兴业银行):
    # query = "SELECT * FROM Tfbs where col3 is not null and length(col3)>3 and col3 not like 'ZZSN%' and col5='兴业银行'"
    cursor_.execute(query)
    row = cursor_.fetchone()
    if not row:
        print('No records left, stopping the script')
        break
    com_name = row[6]
    social_code = row[4]
    code = row[7]
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Mark the record as taken before scraping starts
    updateBeginSql = f"update Tfbs set state1=0,date2='{time_now}' where col3='{social_code}' "
    # print(updateBeginSql)
    cursor_.execute(updateBeginSql)
    cnx_.commit()
    t = time.time()
    ip = random.choice(get_proxy())  # the original get_proxy()[random.randint(0, 3)] breaks with fewer than 4 proxies
    url_t = f'https://www.tianyancha.com/search?key={social_code}&sessionNo={t}'
    res_t = requests.get(url_t, headers=headers, proxies=ip, verify=False)
    time.sleep(10)
    soup_t = BeautifulSoup(res_t.content, 'html.parser')
    try:
        # The first search hit links to /company/<id>; the trailing path segment is the Tianyancha id
        com_id = soup_t.find('div', {'class': 'index_header__x2QZ3'}).find('a').get('href').split('/')[-1]
        print(f"{com_name}:{com_id}")
    except Exception:
        com_id = '--'
        print(f'{com_name}: company not found')
    # colext1 stores the Tianyancha id
    updateBeginSql = f"update Tfbs set state2=0,colext1='{com_id}',date2='{time_now}' where col3='{social_code}' "
    cursor_.execute(updateBeginSql)
    cnx_.commit()
    log.info(f'{com_name}===Tianyancha id saved to DB=====')
    if com_id == '--':
        continue
    list_one_info = []
    list_all_1 = []
    list_all_2 = []
    # Collect the company's key personnel and push them through the sync API
    log.info('=====start collecting key company personnel=======')
    print(f'{social_code}:{com_id}')
    num = 1
    for page in range(1, 2):  # only the first page (20 records) is fetched
        t = int(time.time() * 1000)
        url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
        ip = random.choice(get_proxy())
        res = requests.get(url, headers=headers, proxies=ip)  # , verify=False
        time.sleep(10)
        list_all = res.json()['data']['dataList']
list_all = res.json()['data']['dataList']
if list_all:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
try:
birthYear = 2023 - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
list_all_2.append(dic_json_img)
        else:
            # Fallback for non-listed companies: dim/staff endpoint, resume scraped from the person page
            t = int(time.time() * 1000)
            url = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={com_id}&pageSize=20&pageNum={page}'
            ip = random.choice(get_proxy())
            res = requests.get(url, headers=headers, proxies=ip)  # , verify=False
            list_all = res.json()['data']['result']
            for one_info in list_all:
                name = one_info['name']
                sex = ''
                education = ''
                position = one_info['typeSore']  # field name kept verbatim from the original code
                Salary = ''
                birthYear = ''
                shareRatio = one_info['percent']
                try:
                    benefitShare = one_info['finalBenefitShares']
                except Exception:
                    benefitShare = ''
                person_id = one_info['id']
                person_url = f'https://www.tianyancha.com/human/{person_id}-c{com_id}'
                person_res = requests.get(person_url, headers=headers, proxies=ip)
                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                try:
                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
                except Exception:
                    personInfo = ''
                try:
                    person_img = one_info['logo']
                except Exception:
                    person_img = '--'
                dic_json = {
                    "socialCreditCode": social_code,
                    "name": name,
                    "sex": sex,
                    "education": education,
                    "position": position,
                    "salary": Salary,
                    "birthYear": birthYear,
                    "shareNum": '',
                    "shareRatio": shareRatio,
                    "benefitShare": benefitShare,
                    "currentTerm": '',
                    "personInfo": personInfo,
                    "sort": str(num)
                }
                dic_json_img = {
                    "socialCreditCode": social_code,
                    "name": name,
                    "sex": sex,
                    "education": education,
                    "position": position,
                    "salary": Salary,
                    "birthYear": birthYear,
                    "shareNum": '',
                    "shareRatio": shareRatio,
                    "benefitShare": benefitShare,
                    "currentTerm": '',
                    "personInfo": personInfo,
                    "头像": person_img,  # "头像" = avatar image URL
                    "sort": str(num)
                }
                num = num + 1
                list_one_info.append(dic_json)
                list_all_2.append(dic_json_img)
    log.info(f'{com_name}===collection finished for this company====')
    df_info = pd.DataFrame(list_one_info)
    df_info.to_excel('主要人员.xlsx', index=False)  # '主要人员' = key personnel
    if not list_one_info:  # the original compared json.dumps(...) against the string '[]'
        continue
    json_updata = json.dumps(list_one_info)
    response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
                             verify=False)
    print(response.text)
cursor.close()  # close the cursor before its connection
cnx.close()
baseCore.close()
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像-23年500强新榜.xlsx', index=False)  # key personnel with avatars, 2023 top-500 list
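For reference, a minimal sketch of the record format this script POSTs to the sync service. The field names and the endpoint come from the code above; build_record and the sample values are illustrative placeholders, and the endpoint's exact contract is an assumption.

import json
import requests

def build_record(social_code, name, position, sort):
    # Same field set as dic_json above; unknown fields stay empty strings
    return {
        "socialCreditCode": social_code,
        "name": name,
        "sex": "",
        "education": "",
        "position": position,
        "salary": "",
        "birthYear": "",
        "shareNum": "",
        "shareRatio": "",
        "benefitShare": "",
        "currentTerm": "",
        "personInfo": "",
        "sort": str(sort),
    }

records = [build_record("DUMMY-CODE-001", "example name", "example position", 1)]  # placeholder data
resp = requests.post('http://114.115.236.206:8088/sync/executive',
                     data=json.dumps(records), timeout=300, verify=False)
print(resp.status_code, resp.text)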
@@ -24,8 +24,9 @@ cursor_ = baseCore.cursor_
 # tracker_conf = get_tracker_conf('./client.conf')
 # client = Fdfs_client(tracker_conf)
-taskType = '企业公告/证监会'
+taskType = '企业公告/证监会/新三板'
+# todo: share-transfer notices and listing-review filings are grouped under company announcements and have no separate id
 type_map = {
     'zljgcs':'自律监管措施',
     'wxh':'问询函',
......
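type_map presumably translates an announcement-type slug into its Chinese display name. A minimal lookup sketch, assuming a get-with-default pattern (the unmapped slug below is illustrative):

type_map = {
    'zljgcs': '自律监管措施',  # self-regulatory measures
    'wxh': '问询函',           # inquiry letter
}

def type_name(slug):
    # Fall back to the raw slug when a type has no mapped label
    return type_map.get(slug, slug)

print(type_name('wxh'))       # -> 问询函
print(type_name('other-id'))  # -> other-id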
# __init__.py
__version__ = '2.2.0'
VERSION = tuple(map(int, __version__.split('.')))
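Splitting the dotted version string into an int tuple makes versions comparable element-wise, e.g.:

assert VERSION == (2, 2, 0)
assert VERSION >= (2, 0, 0)  # tuple comparison is lexicographic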