提交 dc808e12 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# cache = {
# "company": {
# "上汽集团": 1,
# "欧盟委员会": 13,
# "欧盟": 8,
# "欧盟委员会总部大厦一角": 2
# },
# "person": {
# "梅赛德斯": 2,
# "齐普策": 8,
# "韦杜姆": 5,
# "朔尔茨": 7,
# "冯德莱恩": 1,
# "布尔茨": 4,
# "纳吉": 3,
# "哈贝克": 3,
# "特斯拉": 2,
# "莫里永": 1,
# "维辛": 1,
# "康松林": 1,
# "尼古拉斯普瓦捷NiclasPoitiers": 1,
# "李缘": 1,
# "伯格": 1,
# "阿斯拉克伯格AslakBerg": 1,
# "林燕": 1,
# "华盛顿": 1,
# "布鲁盖尔Bruegel": 1,
# "川普": 1,
# "哈桑-扎米特": 1,
# "小鹏": 2,
# "康逸": 3,
# "费尔": 1,
# "埃德加博高": 1,
# "蔚来": 3,
# "杜登赫费尔": 3,
# "赵丁喆": 3,
# "卢基斯": 1,
# "斯泰兰蒂斯": 2,
# "保罗博若思": 1,
# "海国": 1,
# "斯特凡德古阿拉": 1,
# "弗兰克": 1,
# "施沃佩": 1,
# "费迪南德": 3,
# "迪尔克扬杜拉": 1,
# "米扎克": 1,
# "帕沃尔安塔利奇": 1,
# "亚采克米扎克": 1,
# "弗尔季奥蒂洛": 1,
# "张晨霖": 1,
# "基塞伊佐尔坦": 1,
# "德古阿拉": 3,
# "明道加斯普": 1,
# "杜登赫": 1,
# "奥托尔巴吉": 1,
# "郭晨": 1,
# "波罗": 1,
# "尹栋逊": 1,
# "颜景辉": 1,
# "段思瑶": 1,
# "裴健如": 1,
# "陈庆": 1,
# "纳吉马顿": 2,
# "崔东树": 1,
# "PatrickHummel": 1,
# "如蔚": 1,
# "李斌": 1,
# "福尔克•维辛": 1,
# "蔚": 1,
# "TechWeb": 1,
# "Suky": 1,
# "陈继业": 1,
# "欧方": 1,
# "齐普策OliverZipse": 1,
# "康林松OlaKaellenius": 1,
# "PFA": 1,
# "ACEA": 1,
# "希尔德加德": 1,
# "穆勒HildegardMueller": 1,
# "阿道夫乌尔索阿道夫": 1,
# "乌尔索": 1,
# "马库斯费伯MarkusFerber": 1,
# "特蕾莎里贝拉TeresaRibera": 1,
# "福尔克维辛": 2,
# "辛婧": 1,
# "殷晓圣": 3,
# "李若佳": 1,
# "刘维佳": 1,
# "萨拉热窝": 1,
# "专员薇奥莱塔布尔茨": 4,
# "哈贝克RobertHabeck": 1,
# "布特克MaximilianButek": 1,
# "关乌": 1,
# "布特克": 2,
# "俄乌": 1,
# "哈桑": 2,
# "吕瑟尔斯海姆": 2,
# "何塞普戈梅斯": 3,
# "李学军": 2,
# "刘向": 2,
# "戈梅斯": 2,
# "马灿": 2,
# "克雷希米尔": 2,
# "康林松": 3,
# "于荣": 2,
# "霍尔格格尔克": 3,
# "陈斌杰": 2,
# "梁国勇": 3,
# "李博": 2,
# "乔纳森博格": 2,
# "胡加齐": 2,
# "单玮怡": 2,
# "林剑": 3,
# "马克西米利安布特克MaximilianButek": 1,
# "何亚东": 1,
# "吕骞": 1,
# "金瑞庭": 1,
# "罗知之": 1,
# "马铭博": 1,
# "马铭": 1,
# "梅赛德斯-奔驰": 1,
# "埃隆马斯克": 1,
# "罗伯特哈贝克RobertHabeck": 1,
# "奥拉夫朔尔茨OlafScholz": 1
# },
# "location": {
# "上海市": 4,
# "北京市": 2,
# "江西省": 1,
# "赣州市": 1,
# "常州市": 2,
# "武进区": 2,
# "江苏省": 2
# },
# "sentiment": {
# "负面": 4,
# "中性": 10,
# "正面": 10
# },
# "time": {
# "17.4": 2,
# "6月12日": 6,
# "12日": 2,
# "2024年06月14日": 1,
# "2024年6月3日": 1,
# "6月13日": 1,
# "7月4日": 1,
# "38.1": 1,
# "2023年2月15日": 1,
# "6月17日": 3,
# "60.7": 1,
# "6月14日": 1
# }
# }
#
# top_keywords = {keyword_type: sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)[:10] for
# keyword_type, keyword_freq in cache.items()}
# # print(top_keywords)
#
# # 提取前十的关键词
# top_keywords_dict = {keyword_type: [keyword for keyword, freq in keywords] for keyword_type, keywords in
# top_keywords.items()}
# print(top_keywords_dict)
#
# industry_result = top_keywords_dict['industry'] if 'industry' in top_keywords_dict else []
# insert_industry = ",".join(industry_result)
# company_result = top_keywords_dict["company"] if "company" in top_keywords_dict else []
# person_result = top_keywords_dict["person"] if "person" in top_keywords_dict else []
# sentiment_result = top_keywords_dict["sentiment"] if "sentiment" in top_keywords_dict else []
# location_result = top_keywords_dict["location"] if "location" in top_keywords_dict else []
# time_result = top_keywords_dict["time"] if "time" in top_keywords_dict else []
# print(f"insert_industry:{insert_industry}")
# insert_company = ",".join(company_result)
# insert_person = ",".join(person_result)
# insert_sentiment = ",".join(sentiment_result)
# insert_location = ",".join(location_result)
# insert_time = ",".join(time_result)
# print(f"insert_company:{insert_company}")
# print(f"insert_person:{insert_person}")
# print(f"insert_sentiment:{insert_sentiment}")
# print(f"insert_location:{insert_location}")
# print(f"insert_time:{insert_time}")
# print(type(insert_industry))
#
# test_none = None
# test_set = set(test_none)
# print(test_set)
# set1 = {'万家小新,迎春,李虹萦,张灏然,王宏志,袁野,谭作钧,习近平,鄂维南,苟坪'}
# set2 = {'孟晚舟,李虹萦,张灏然,习近平,王宏志,鄂维南,张玉卓,谭作钧'}
# # 使用 & 运算符找到交集
# intersection_set = set1 & set2
#
# print(intersection_set)
import json
import json
import pandas as pd
# Demo: serialize a DataFrame that holds int64 data into a JSON string.
df = pd.DataFrame({'id': [1, 2, 3], 'value': [4, 5, 6]})
# Pin the id column to int64 first, then re-cast it through the generic
# 'int' dtype alias -- the same two-step conversion, applied in order.
for target_dtype in ('int64', 'int'):
    df['id'] = df['id'].astype(target_dtype)
# Convert the frame to one dict per row and hand that list to json.dumps.
row_dicts = df.to_dict(orient='records')
json_str = json.dumps(row_dicts)
print(json_str)
......@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS():
cnx,cursor = connectSql()
# 获取美股企业
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
us_query = "select cik from mgzqyjwyh_list where state=2 "
#ZZSN22080900000025
cursor.execute(us_query)
us_result = cursor.fetchall()
......@@ -381,7 +382,7 @@ def AnnualEnterpriseUS():
us_social_list = [item[0] for item in us_result]
print('=======')
for item in us_social_list:
r.rpush('AnnualEnterprise:usqy_socialCode', item)
r.rpush('Sec_cik_US:uscik_annualReport', item)
closeSql(cnx,cursor)
#国外企业基本信息 redis中放入id
......@@ -659,12 +660,14 @@ if __name__ == "__main__":
# zhuangjingtexind()
# NoticeEnterprise()
# NoticeDF()
AnnualEnterpriseUS()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterprise()
# BaseInfoEnterpriseAbroad()
# NewsEnterprise_task()
# NewsEnterprise()
# NoticeEnterprise()
# CorPerson()
# china100()
# global100()
......@@ -678,7 +681,7 @@ if __name__ == "__main__":
# SEC_CIK()
# dujioashou()
# omeng()
AnnualEnterprise()
# AnnualEnterprise()
# AnnualEnterpriseUS()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
......
"""
"""
......@@ -47,93 +47,100 @@ def update_table(update_sql, cursor_c, cnx_c):
if __name__ == "__main__":
key = 'Synchronize_data:info'
# result = search_formal_table('social_credit_code, name, english_name', 'sys_base_enterprise', 'yn_domestic', '1', cursor)
# for row in result:
# social_credit_code = row[0]
# name = row[1]
# english_name = row[2]
# if not english_name:
# english_name = ''
# item = social_credit_code + '|' + name + '|' + english_name
# baseCore.rePutIntoR(key, item)
while True:
info = baseCore.redicPullData(key)
if info == None:
break
else:
pass
com_code = info.split('|')[0]
com_name = info.split('|')[1]
com_english_name = info.split('|')[2]
result = search_formal_table('CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName', 'EnterpriseInfo',
'SocialCode', com_code, cursor_)
u_name, u_code, u_ename, u_short_name, u_type, u_category, u_exchange = '', '', '', '', '', '', ''
# 更新语句
update_sql = """update EnterpriseInfo set {} where SocialCode = {}"""
fields = ''
if result:
# 判断这几个值是否为空
if result[0][0] != com_name:
u_name = com_name
fields = f'CompanyName = "{com_name}", '
if not result[0][2] and com_english_name:
u_ename = com_english_name
fields += f'EnglishName = "{com_english_name}", '
if not result[0][5]:
u_place = '1'
fields += f'Place = "{u_place}", '
if not result[0][10]:
u_countryname = '中国内地'
fields += f'countryName = "{u_countryname}", '
if not result[0][3]:
result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
com_code, cursor)
if result_ipo:
# 是上市企业
if not result[0][6]:
u_ipo = '1'
fields += f'IsIPO = "{u_ipo}", '
if result_ipo[0][1]:
u_code = result_ipo[0][1]
fields += f'SecuritiesCode = "{u_code}", '
if not result[0][4] and result_ipo[0][2]:
u_short_name = result_ipo[0][2]
fields += f'SecuritiesShortName = "{u_short_name}", '
if not result[0][7] and result_ipo[0][3]:
u_type = result_ipo[0][3]
fields += f'SecuritiesType = "{u_type}", '
if not result[0][8] and result_ipo[0][4]:
u_category = result_ipo[0][4]
fields += f'Category = "{u_category}", '
if not result[0][9] and result_ipo[0][5]:
u_exchange = result_ipo[0][5]
fields += f'Exchange = "{u_exchange}", '
else: # 可能不是上市企业
result = search_formal_table('social_credit_code, name, english_name', 'sys_base_enterprise', 'yn_domestic', '1', cursor)
for row in result:
social_credit_code = row[0]
name = row[1]
english_name = row[2]
if not english_name:
english_name = ''
item = social_credit_code + '|' + name + '|' + english_name
baseCore.rePutIntoR(key, item)
# while True:
# # info = baseCore.redicPullData(key)
# info = ""
# if info == None:
# break
# else:
# pass
# log.info(f"当前企业---{info}---")
# com_code = info.split('|')[0]
# com_name = info.split('|')[1]
# com_english_name = info.split('|')[2]
# result = search_formal_table('CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName', 'EnterpriseInfo',
# 'SocialCode', com_code, cursor_)
# u_name, u_code, u_ename, u_short_name, u_type, u_category, u_exchange = '', '', '', '', '', '', ''
# # 更新语句
# update_sql = """update EnterpriseInfo set {} where SocialCode = {}"""
# fields = ''
# if result:
# # 判断这几个值是否为空
# if result[0][0] != com_name:
# u_name = com_name
# fields = f'CompanyName = "{com_name}", '
# if not result[0][2] and com_english_name:
# u_ename = com_english_name
# fields += f'EnglishName = "{com_english_name}", '
# if not result[0][5]:
# u_place = '1'
# fields += f'Place = "{u_place}", '
# if not result[0][10]:
# u_countryname = '中国内地'
# fields += f'countryName = "{u_countryname}", '
# if not result[0][3]:
# result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
# com_code, cursor)
# if len(result_ipo) == 1:
# # 是上市企业
# if not result[0][6]:
# u_ipo = '1'
# fields += f'IsIPO = "{u_ipo}", '
# if result_ipo[0][1]:
# u_code = result_ipo[0][1]
# fields += f'SecuritiesCode = "{u_code}", '
# if not result[0][4] and result_ipo[0][2]:
# u_short_name = result_ipo[0][2]
# fields += f'SecuritiesShortName = "{u_short_name}", '
# if not result[0][7] and result_ipo[0][3]:
# u_type = result_ipo[0][3]
# fields += f'SecuritiesType = "{u_type}", '
# if not result[0][8] and result_ipo[0][4]:
# u_category = result_ipo[0][4]
# fields += f'Category = "{u_category}", '
# if not result[0][9] and result_ipo[0][5]:
# u_exchange = result_ipo[0][5]
# fields += f'Exchange = "{u_exchange}", '
# else:
# if len(result_ipo) > 1:
# # 记录下
# baseCore.rePutIntoR(key, "Synchronize_data:More")
# # 可能不是上市企业
# # if fields:
# # update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"')
# # print(update_sql)
# # continue
# pass
#
# if fields:
# update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"')
# print(update_sql)
# continue
pass
if fields:
update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"')
log.info(f'更新的sql语句--{update_sql}')
update_table(update_sql, cursor_, cnx_)
else:
result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
com_code, cursor)
if result_ipo:
SecuritiesCode = result_ipo[1]
SecuritiesShortName = result_ipo[2]
securities_type = result_ipo[3]
Category = result_ipo[4]
exchange = result_ipo[5]
sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, SecuritiesCode, 1, 1, securities_type, Category, exchange, '中国内地'))
baseCore.cnx.commit()
log.info(f'{com_name}==={com_name}===上市企业===插入成功')
else:
sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, Place, isIPO, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, 1, 1, '中国内地'))
baseCore.cnx.commit()
log.info(f'{com_name}==={com_name}===非上市企业===插入成功')
\ No newline at end of file
# log.info(f'更新的sql语句--{update_sql}')
# update_table(update_sql, cursor_, cnx_)
# else:
# result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
# com_code, cursor)
# if result_ipo:
# SecuritiesCode = result_ipo[1]
# SecuritiesShortName = result_ipo[2]
# securities_type = result_ipo[3]
# Category = result_ipo[4]
# exchange = result_ipo[5]
# sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
# baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, SecuritiesCode, 1, 1, securities_type, Category, exchange, '中国内地'))
# baseCore.cnx.commit()
# log.info(f'{com_name}==={com_name}===上市企业===插入成功')
# else:
# sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, Place, isIPO, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
# baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, 1, 1, '中国内地'))
# baseCore.cnx.commit()
# log.info(f'{com_name}==={com_name}===非上市企业===插入成功')
\ No newline at end of file
......@@ -16,7 +16,7 @@ cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息']
'股东信息0621']
class File():
......@@ -164,6 +164,20 @@ class Info():
db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass
def insert_into(self, dic_info):
if dic_info['股东序号序号']:
db_storage2.find_one_and_update(
{
'序号': str(dic_info['序号']),
"股东序号序号": str(dic_info['股东序号序号'])
},
{'$set': dic_info}, upsert=True)
else:
result = db_storage2.insert_one(dic_info)
print(result)
pass
if __name__ == '__main__':
# token = Token()
......
......@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查'
@retry(tries=5, delay=3)
def getTycIdByXYDM(com_name, s):
retData={'state':False, 'tycData':None, 'reput':True}
retData={'state': False, 'tycData': None, 'reput': True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
# url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3"
ip = baseCore.get_proxy()
......
......@@ -25,7 +25,7 @@ taskType = '天眼查/股东信息'
from classtool import Token, Info
token = Token()
info = Info()
Info = Info()
@retry(tries=3, delay=1)
def get_html(tycid, driver, dic_info):
......@@ -90,22 +90,6 @@ def get_page(url, s, headers):
@retry(tries=5, delay=3)
def get_page1(url, s, headers):
ip = baseCore.get_proxy()
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzYzNjcxMTc0NiIsImlhdCI6MTcxNDk1Njg3MywiZXhwIjoxNzE3NTQ4ODczfQ.qMEvtETT7RS3Rhwq9idu5H2AKMxc2cjtr5bDDW6C6yOFKR-ErgDwT4SOBX9PB2LWDexAG2hNaeAvn6swr-n6VA',
'X-TYCID': 'dad485900fcc11ee8c0de34479b5b939',
'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
if res.status_code != 200:
raise
......@@ -181,7 +165,7 @@ def doJob():
for i in range(1000):
# while True:
# todo:设置cookies的使用
dic_info = {}
headers = {
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Type': 'application/json',
......@@ -196,7 +180,7 @@ def doJob():
continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('shareHolderInfo')
# item = '1|914401010885128005'
# item = '900|微创心律管理|None|罗七一|健康科技|¥ 90 亿|¥ 90 亿|¥ 92 亿|823|861|911|ZZSN231108150127681|MicroPort Cardiac Rhythm Management International Limited|中国|None'
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C'
if item == None:
......@@ -204,8 +188,31 @@ def doJob():
continue
start = time.time()
no = item.split('|')[0]
social_code = item.split('|')[1]
social_code = item.split('|')[11]
recept_name = item.split('|')[12]
dic_info = {"序号": item.split('|')[0],
"企业名称(榜单公布)": item.split('|')[1],
"企业别称": item.split('|')[2],
"门人/联合创始": item.split('|')[3],
"行业": item.split('|')[4],
"企业估值(2022年)": item.split('|')[5],
"企业估值(2023年)": item.split('|')[6],
"企业估值(2024年)": item.split('|')[7],
"2022年独角兽排名": item.split('|')[8],
"2023年独角兽排名": item.split('|')[9],
"2024年独角兽排名": item.split('|')[10],
"企业信用代码(中国内地企业需填写信用代码)": item.split('|')[11],
"企业名称(企查查)": item.split('|')[12],
"所属国家": item.split('|')[13]
}
if "ZZSN" in social_code:
dic_info['前十大股东名称'] = ''
dic_info['持股比例'] = ''
dic_info['认缴出资额'] = ''
dic_info['股东序号序号'] = ''
Info.insert_into(dic_info)
break
try:
try:
data = baseCore.getInfomation(social_code)
......@@ -237,7 +244,8 @@ def doJob():
tycid = ''
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm, s)
retData = getTycIdByXYDM(recept_name, s)
# retData = getTycIdByXYDM("极星汽车销售有限公司", s)
if retData['state']:
tycid = retData['tycData']['id']
......@@ -269,16 +277,20 @@ def doJob():
baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f"{no}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(3)
continue
break
elif charge == -2:
# 该企业没有股东信息
token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('shareHolderInfo', item)
# baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f"{no}---{xydm}----{tycid}----没有股东信息或需要滑动验证----重新放入redis")
time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
dic_info['前十大股东名称'] = ''
dic_info['持股比例'] = ''
dic_info['认缴出资额'] = ''
dic_info['股东序号序号'] = ''
Info.insert_into(dic_info)
break
else:
log.info(f"{no}---{xydm}----{tycid}")
......@@ -310,7 +322,7 @@ def doJob():
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?_={}&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1'
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1'
total_page = total_page3
data_page_one = data_page3
flag = 3
......@@ -325,15 +337,52 @@ def doJob():
baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
# # todo:获取页数
# total_page = 34
# flag = 2
# todo: 测试程序是否执行到这一步
# todo:获取页数
log.info(f'总数为{total_page}')
if int(total_page % 20) == 0:
maxpage = int((total_page / 20) + 1)
else:
maxpage = int((total_page / 20) + 1) + 1
for page in range(1, maxpage):
if page == 1:
data_page = data_page_one
errorCode = data_page['errorCode']
else:
res = None
for d in range(3):
ip = baseCore.get_proxy()
if flag == 1:
url_ = url
payload = {"gid": f"{tycid}", "pageSize": 10, "pageNum": f"{page}", "sortField": "",
"sortType": "-100", "historyType": 1}
try:
res = s.post(url=url_, headers=headers, data=json.dumps(payload), proxies=ip,
timeout=(5, 10))
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
else:
url_ = url.format(tycid, page)
try:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
res.close()
if errorCode == 0:
pass
else:
......@@ -359,27 +408,40 @@ def doJob():
# res.close()
log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----')
for idx,holder_info in enumerate(list_all):
shareHolderName, percent = '', ''
if flag == 1:
holder_info = list_all[0]
shareHolderName = holder_info['shareHolderName']
percent = holder_info['percent']
capitalTotal = holder_info['capitalTotal']
elif flag == 3:
holder_info = list_all[0]
shareHolderName = holder_info['name']
percent = holder_info['proportion']
capitalTotal = ''
else:
holder_info = list_all[0]
shareHolderName = holder_info['holder_name']
percent = holder_info['longHeldRatioWithUnit']
capitalTotal = ''
if shareHolderName and percent:
dic_info['最大持股名称'] = shareHolderName
if page == 1:
dic_info['股东序号序号'] = idx + 1
else:
dic_info['股东序号序号'] = idx + 1 + (10 * (page-1))
dic_info['前十大股东名称'] = shareHolderName
dic_info['持股比例'] = percent
# todo: 更新字段
# info.update_holder(no, dic_info)
dic_info['认缴出资额'] = capitalTotal
# todo: 插入一条新纪录
log.info(dic_info)
try:
del dic_info['_id']
except:
pass
Info.insert_into(dic_info)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
# time.sleep(randint(5,10))
time.sleep(5)
......@@ -395,7 +457,7 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
# break
......
import json
import openpyxl
import redis
from bs4 import BeautifulSoup
import langid
from base.BaseCore import BaseCore
baseCore =BaseCore()
import pymysql
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数"))
# cnx_ = baseCore.cnx
# cursor_ = baseCore.cursor
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji',
charset='utf8mb4')
cursor_ = cnx_.cursor()
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s "
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql,(200,'91350000158142711F'))
# cnx_.commit()
import time
# from getTycId import getTycIdByXYDM
# social_code = '91440101231247350J'
# data = baseCore.getInfomation(social_code)
# tycid = data[11]
# if tycid == None:
# print(data)
# retData = getTycIdByXYDM(social_code)
# tycid = retData['tycData']['id']
# print(tycid)
# time_struct = time.localtime(int(1692762780000 / 1000)) # 首先把时间戳转换为结构化时间
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间
# print(time_format)
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
# #原键名
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode'
# #目标键名
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode'
# values = r.lrange(key1,0,-1)
# for value in values:
# r.rpush(key2, value)
#
# # 关闭Redis连接
# r.close()
# Scratch check of list truthiness: list_all is empty (falsy), so the else
# branch runs and '---' is printed instead of the length.
list_all = []
if list_all:
print(len(list_all))
else:
print('---')
# 先采两千强和独角兽
# 连接到Redis服务器
redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
# 打开Excel文件
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年福布斯2000强榜单(已排除2023年).xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年福布斯2000强.xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年独角兽企业(已排除2024年).xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2024胡润独角兽(4).xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年世界500强企业39家(已排除23年上榜企业)2.xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年世界500强名单.xlsx')
# workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年欧盟2500(已排除2022年).xlsx')
workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年欧盟2500强.xlsx')
# 选择要读取的工作表
worksheet = workbook['Sheet1']
# worksheet = workbook['sheet1']
# 选择要读取的列
column_index = 0 # 选择第2列
# 遍历指定列的单元格,并将值放入Redis列表
# Push one pipe-delimited record per data row onto the Redis keyword queue.
for row in worksheet.iter_rows(values_only=True):
    # A row without a second column cannot produce a record, so report it and
    # move on. The previous version wrapped row[1] in a bare try/except whose
    # handler printed row[1] again -- re-raising the same IndexError instead
    # of skipping the row.
    if len(row) < 2:
        print(row)
        continue
    # Skip the header row, whichever label the sheet uses in its first column.
    if row[0] in ('序列', '序号', '排序'):
        continue
    # 309
    # Alternative record layouts kept for the other ranking workbooks; enable
    # the one whose label matches the workbook being loaded.
    # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2022年福布斯2000强"
    # item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年福布斯2000强"
    # item = str(row[2])+ "|"+str(row[3]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年独角兽"
    # item = str(row[2])+ "|"+str(row[4]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2024年独角兽"
    # item = str(row[1])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2022年世界500强"
    # item = str(row[1])+ "|"+str(row[3]) + "|" + str(row[5])+ "|" + str(row[6])+ "|" + "2023年世界500强"
    # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2023年欧盟2500"
    # Active layout: columns 2, 5, 4 and 1 followed by the list label.
    item = "|".join([str(row[2]), str(row[5]), str(row[4]), str(row[1]), "2022年欧盟2500"])
    redis_client.rpush('GOOGLE_KEYWORDS:COMPANY_NAME', item)
    # redis_client.rpush('BAIDU_KEYWORDS:COMPANY_NAME', item)
    print(item)
    # break
# 关闭Excel文件
workbook.close()
......@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count):
return True, dic_user_count
# 修改token使用时间
updateTokeen(token, 3)
pagecount = json_search['app_msg_cnt'] # 837
# 保存数据到数据库
return insertWxList(dic_url, json_search, page, user_name), dic_user_count
......@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count):
origin = dic_url['name']
biz = dic_url['biz']
# retFlag, dic_user_count = getPageData(dic_url, 1, dic_user_count)
for page in range(1, 6):
retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
time.sleep(random.randint(60, 181))
......@@ -311,12 +313,12 @@ def getnumber_redis():
if __name__ == "__main__":
getFromSql()
# getFromSql()
# numbers = getnumber_redis()
# log.info("当前批次采集公众号个数{}".format(numbers))
# time.sleep(3)
# dic_user_count = {}
dic_user_count = {}
# # dic_user_count = {
# # 'name': '',
# # 'use_count': 0,
......@@ -344,5 +346,5 @@ if __name__ == "__main__":
# for key, value in dic_user_count.items():
# log.info(f"====账号{key},使用次数{value}")
# # break
# # infoSourceCode = 'IN-20220917-0159'
# # getWxList(infoSourceCode)
infoSourceCode = 'IN-20231110-0003'
getWxList(infoSourceCode, dic_user_count)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论