提交 e471e82e 作者: XveLingKun

06-27

上级 a0ee390b
# cache = {
# "company": {
# "上汽集团": 1,
# "欧盟委员会": 13,
# "欧盟": 8,
# "欧盟委员会总部大厦一角": 2
# },
# "person": {
# "梅赛德斯": 2,
# "齐普策": 8,
# "韦杜姆": 5,
# "朔尔茨": 7,
# "冯德莱恩": 1,
# "布尔茨": 4,
# "纳吉": 3,
# "哈贝克": 3,
# "特斯拉": 2,
# "莫里永": 1,
# "维辛": 1,
# "康松林": 1,
# "尼古拉斯普瓦捷NiclasPoitiers": 1,
# "李缘": 1,
# "伯格": 1,
# "阿斯拉克伯格AslakBerg": 1,
# "林燕": 1,
# "华盛顿": 1,
# "布鲁盖尔Bruegel": 1,
# "川普": 1,
# "哈桑-扎米特": 1,
# "小鹏": 2,
# "康逸": 3,
# "费尔": 1,
# "埃德加博高": 1,
# "蔚来": 3,
# "杜登赫费尔": 3,
# "赵丁喆": 3,
# "卢基斯": 1,
# "斯泰兰蒂斯": 2,
# "保罗博若思": 1,
# "海国": 1,
# "斯特凡德古阿拉": 1,
# "弗兰克": 1,
# "施沃佩": 1,
# "费迪南德": 3,
# "迪尔克扬杜拉": 1,
# "米扎克": 1,
# "帕沃尔安塔利奇": 1,
# "亚采克米扎克": 1,
# "弗尔季奥蒂洛": 1,
# "张晨霖": 1,
# "基塞伊佐尔坦": 1,
# "德古阿拉": 3,
# "明道加斯普": 1,
# "杜登赫": 1,
# "奥托尔巴吉": 1,
# "郭晨": 1,
# "波罗": 1,
# "尹栋逊": 1,
# "颜景辉": 1,
# "段思瑶": 1,
# "裴健如": 1,
# "陈庆": 1,
# "纳吉马顿": 2,
# "崔东树": 1,
# "PatrickHummel": 1,
# "如蔚": 1,
# "李斌": 1,
# "福尔克•维辛": 1,
# "蔚": 1,
# "TechWeb": 1,
# "Suky": 1,
# "陈继业": 1,
# "欧方": 1,
# "齐普策OliverZipse": 1,
# "康林松OlaKaellenius": 1,
# "PFA": 1,
# "ACEA": 1,
# "希尔德加德": 1,
# "穆勒HildegardMueller": 1,
# "阿道夫乌尔索阿道夫": 1,
# "乌尔索": 1,
# "马库斯费伯MarkusFerber": 1,
# "特蕾莎里贝拉TeresaRibera": 1,
# "福尔克维辛": 2,
# "辛婧": 1,
# "殷晓圣": 3,
# "李若佳": 1,
# "刘维佳": 1,
# "萨拉热窝": 1,
# "专员薇奥莱塔布尔茨": 4,
# "哈贝克RobertHabeck": 1,
# "布特克MaximilianButek": 1,
# "关乌": 1,
# "布特克": 2,
# "俄乌": 1,
# "哈桑": 2,
# "吕瑟尔斯海姆": 2,
# "何塞普戈梅斯": 3,
# "李学军": 2,
# "刘向": 2,
# "戈梅斯": 2,
# "马灿": 2,
# "克雷希米尔": 2,
# "康林松": 3,
# "于荣": 2,
# "霍尔格格尔克": 3,
# "陈斌杰": 2,
# "梁国勇": 3,
# "李博": 2,
# "乔纳森博格": 2,
# "胡加齐": 2,
# "单玮怡": 2,
# "林剑": 3,
# "马克西米利安布特克MaximilianButek": 1,
# "何亚东": 1,
# "吕骞": 1,
# "金瑞庭": 1,
# "罗知之": 1,
# "马铭博": 1,
# "马铭": 1,
# "梅赛德斯-奔驰": 1,
# "埃隆马斯克": 1,
# "罗伯特哈贝克RobertHabeck": 1,
# "奥拉夫朔尔茨OlafScholz": 1
# },
# "location": {
# "上海市": 4,
# "北京市": 2,
# "江西省": 1,
# "赣州市": 1,
# "常州市": 2,
# "武进区": 2,
# "江苏省": 2
# },
# "sentiment": {
# "负面": 4,
# "中性": 10,
# "正面": 10
# },
# "time": {
# "17.4": 2,
# "6月12日": 6,
# "12日": 2,
# "2024年06月14日": 1,
# "2024年6月3日": 1,
# "6月13日": 1,
# "7月4日": 1,
# "38.1": 1,
# "2023年2月15日": 1,
# "6月17日": 3,
# "60.7": 1,
# "6月14日": 1
# }
# }
#
# top_keywords = {keyword_type: sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)[:10] for
# keyword_type, keyword_freq in cache.items()}
# # print(top_keywords)
#
# # 提取前十的关键词
# top_keywords_dict = {keyword_type: [keyword for keyword, freq in keywords] for keyword_type, keywords in
# top_keywords.items()}
# print(top_keywords_dict)
#
# industry_result = top_keywords_dict['industry'] if 'industry' in top_keywords_dict else []
# insert_industry = ",".join(industry_result)
# company_result = top_keywords_dict["company"] if "company" in top_keywords_dict else []
# person_result = top_keywords_dict["person"] if "person" in top_keywords_dict else []
# sentiment_result = top_keywords_dict["sentiment"] if "sentiment" in top_keywords_dict else []
# location_result = top_keywords_dict["location"] if "location" in top_keywords_dict else []
# time_result = top_keywords_dict["time"] if "time" in top_keywords_dict else []
# print(f"insert_industry:{insert_industry}")
# insert_company = ",".join(company_result)
# insert_person = ",".join(person_result)
# insert_sentiment = ",".join(sentiment_result)
# insert_location = ",".join(location_result)
# insert_time = ",".join(time_result)
# print(f"insert_company:{insert_company}")
# print(f"insert_person:{insert_person}")
# print(f"insert_sentiment:{insert_sentiment}")
# print(f"insert_location:{insert_location}")
# print(f"insert_time:{insert_time}")
# print(type(insert_industry))
#
# test_none = None
# test_set = set(test_none)
# print(test_set)
# set1 = {'万家小新,迎春,李虹萦,张灏然,王宏志,袁野,谭作钧,习近平,鄂维南,苟坪'}
# set2 = {'孟晚舟,李虹萦,张灏然,习近平,王宏志,鄂维南,张玉卓,谭作钧'}
# # 使用 & 运算符找到交集
# intersection_set = set1 & set2
#
# print(intersection_set)
import json

import pandas as pd

# Demo: serializing a DataFrame whose columns are numpy int64 to JSON.
# json.dumps cannot handle numpy int64 scalars directly, so the column is
# cast to plain Python-compatible 'int' before conversion.
df = pd.DataFrame({'id': [1, 2, 3], 'value': [4, 5, 6]})
# Cast id to int so that to_dict() yields JSON-serializable values.
df['id'] = df['id'].astype('int')
# records orientation -> list of {column: value} dicts, one per row.
json_str = json.dumps(df.to_dict(orient='records'))
print(json_str)
...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task(): ...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS(): def AnnualEnterpriseUS():
cnx,cursor = connectSql() cnx,cursor = connectSql()
# 获取美股企业 # 获取美股企业
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'" # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' " # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
us_query = "select cik from mgzqyjwyh_list where state=2 "
#ZZSN22080900000025 #ZZSN22080900000025
cursor.execute(us_query) cursor.execute(us_query)
us_result = cursor.fetchall() us_result = cursor.fetchall()
...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS(): ...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS():
us_social_list = [item[0] for item in us_result] us_social_list = [item[0] for item in us_result]
print('=======') print('=======')
for item in us_social_list: for item in us_social_list:
r.rpush('AnnualEnterprise:usqy_socialCode', item) r.rpush('Sec_cik_US:uscik_annualReport', item)
closeSql(cnx,cursor) closeSql(cnx,cursor)
#国外企业基本信息 redis中放入id #国外企业基本信息 redis中放入id
...@@ -659,12 +660,14 @@ if __name__ == "__main__": ...@@ -659,12 +660,14 @@ if __name__ == "__main__":
# zhuangjingtexind() # zhuangjingtexind()
# NoticeEnterprise() # NoticeEnterprise()
# NoticeDF() # NoticeDF()
AnnualEnterpriseUS()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
# BaseInfoEnterprise() # BaseInfoEnterprise()
# BaseInfoEnterpriseAbroad() # BaseInfoEnterpriseAbroad()
# NewsEnterprise_task() # NewsEnterprise_task()
# NewsEnterprise() # NewsEnterprise()
# NoticeEnterprise()
# CorPerson() # CorPerson()
# china100() # china100()
# global100() # global100()
...@@ -678,7 +681,7 @@ if __name__ == "__main__": ...@@ -678,7 +681,7 @@ if __name__ == "__main__":
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
AnnualEnterprise() # AnnualEnterprise()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
......
""" """
...@@ -47,93 +47,100 @@ def update_table(update_sql, cursor_c, cnx_c): ...@@ -47,93 +47,100 @@ def update_table(update_sql, cursor_c, cnx_c):
if __name__ == "__main__": if __name__ == "__main__":
key = 'Synchronize_data:info' key = 'Synchronize_data:info'
# result = search_formal_table('social_credit_code, name, english_name', 'sys_base_enterprise', 'yn_domestic', '1', cursor) result = search_formal_table('social_credit_code, name, english_name', 'sys_base_enterprise', 'yn_domestic', '1', cursor)
# for row in result: for row in result:
# social_credit_code = row[0] social_credit_code = row[0]
# name = row[1] name = row[1]
# english_name = row[2] english_name = row[2]
# if not english_name: if not english_name:
# english_name = '' english_name = ''
# item = social_credit_code + '|' + name + '|' + english_name item = social_credit_code + '|' + name + '|' + english_name
# baseCore.rePutIntoR(key, item) baseCore.rePutIntoR(key, item)
while True:
info = baseCore.redicPullData(key) # while True:
if info == None: # # info = baseCore.redicPullData(key)
break # info = ""
else: # if info == None:
pass # break
com_code = info.split('|')[0] # else:
com_name = info.split('|')[1] # pass
com_english_name = info.split('|')[2] # log.info(f"当前企业---{info}---")
result = search_formal_table('CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName', 'EnterpriseInfo', # com_code = info.split('|')[0]
'SocialCode', com_code, cursor_) # com_name = info.split('|')[1]
u_name, u_code, u_ename, u_short_name, u_type, u_category, u_exchange = '', '', '', '', '', '', '' # com_english_name = info.split('|')[2]
# 更新语句 # result = search_formal_table('CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName', 'EnterpriseInfo',
update_sql = """update EnterpriseInfo set {} where SocialCode = {}""" # 'SocialCode', com_code, cursor_)
fields = '' # u_name, u_code, u_ename, u_short_name, u_type, u_category, u_exchange = '', '', '', '', '', '', ''
if result: # # 更新语句
# 判断这几个值是否为空 # update_sql = """update EnterpriseInfo set {} where SocialCode = {}"""
if result[0][0] != com_name: # fields = ''
u_name = com_name # if result:
fields = f'CompanyName = "{com_name}", ' # # 判断这几个值是否为空
if not result[0][2] and com_english_name: # if result[0][0] != com_name:
u_ename = com_english_name # u_name = com_name
fields += f'EnglishName = "{com_english_name}", ' # fields = f'CompanyName = "{com_name}", '
if not result[0][5]: # if not result[0][2] and com_english_name:
u_place = '1' # u_ename = com_english_name
fields += f'Place = "{u_place}", ' # fields += f'EnglishName = "{com_english_name}", '
if not result[0][10]: # if not result[0][5]:
u_countryname = '中国内地' # u_place = '1'
fields += f'countryName = "{u_countryname}", ' # fields += f'Place = "{u_place}", '
if not result[0][3]: # if not result[0][10]:
result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code', # u_countryname = '中国内地'
com_code, cursor) # fields += f'countryName = "{u_countryname}", '
if result_ipo: # if not result[0][3]:
# 是上市企业 # result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
if not result[0][6]: # com_code, cursor)
u_ipo = '1' # if len(result_ipo) == 1:
fields += f'IsIPO = "{u_ipo}", ' # # 是上市企业
if result_ipo[0][1]: # if not result[0][6]:
u_code = result_ipo[0][1] # u_ipo = '1'
fields += f'SecuritiesCode = "{u_code}", ' # fields += f'IsIPO = "{u_ipo}", '
if not result[0][4] and result_ipo[0][2]: # if result_ipo[0][1]:
u_short_name = result_ipo[0][2] # u_code = result_ipo[0][1]
fields += f'SecuritiesShortName = "{u_short_name}", ' # fields += f'SecuritiesCode = "{u_code}", '
if not result[0][7] and result_ipo[0][3]: # if not result[0][4] and result_ipo[0][2]:
u_type = result_ipo[0][3] # u_short_name = result_ipo[0][2]
fields += f'SecuritiesType = "{u_type}", ' # fields += f'SecuritiesShortName = "{u_short_name}", '
if not result[0][8] and result_ipo[0][4]: # if not result[0][7] and result_ipo[0][3]:
u_category = result_ipo[0][4] # u_type = result_ipo[0][3]
fields += f'Category = "{u_category}", ' # fields += f'SecuritiesType = "{u_type}", '
if not result[0][9] and result_ipo[0][5]: # if not result[0][8] and result_ipo[0][4]:
u_exchange = result_ipo[0][5] # u_category = result_ipo[0][4]
fields += f'Exchange = "{u_exchange}", ' # fields += f'Category = "{u_category}", '
else: # 可能不是上市企业 # if not result[0][9] and result_ipo[0][5]:
# u_exchange = result_ipo[0][5]
# fields += f'Exchange = "{u_exchange}", '
# else:
# if len(result_ipo) > 1:
# # 记录下
# baseCore.rePutIntoR(key, "Synchronize_data:More")
# # 可能不是上市企业
# # if fields:
# # update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"')
# # print(update_sql)
# # continue
# pass
#
# if fields: # if fields:
# update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"') # update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"')
# print(update_sql) # log.info(f'更新的sql语句--{update_sql}')
# continue # update_table(update_sql, cursor_, cnx_)
pass # else:
# result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code',
if fields: # com_code, cursor)
update_sql = update_sql.format(fields.rstrip(', '), f'"{com_code}"') # if result_ipo:
log.info(f'更新的sql语句--{update_sql}') # SecuritiesCode = result_ipo[1]
update_table(update_sql, cursor_, cnx_) # SecuritiesShortName = result_ipo[2]
else: # securities_type = result_ipo[3]
result_ipo = search_formal_table('social_credit_code, securities_code, securities_short_name, securities_type, category,exchange', 'sys_base_enterprise_ipo', 'social_credit_code', # Category = result_ipo[4]
com_code, cursor) # exchange = result_ipo[5]
if result_ipo: # sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
SecuritiesCode = result_ipo[1] # baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, SecuritiesCode, 1, 1, securities_type, Category, exchange, '中国内地'))
SecuritiesShortName = result_ipo[2] # baseCore.cnx.commit()
securities_type = result_ipo[3] # log.info(f'{com_name}==={com_name}===上市企业===插入成功')
Category = result_ipo[4] # else:
exchange = result_ipo[5] # sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, Place, isIPO, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, SecuritiesCode, SecuritiesShortName, Place, isIPO, SecuritiesType, Category, Exchange, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)' # baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, 1, 1, '中国内地'))
baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, SecuritiesCode, 1, 1, securities_type, Category, exchange, '中国内地')) # baseCore.cnx.commit()
baseCore.cnx.commit() # log.info(f'{com_name}==={com_name}===非上市企业===插入成功')
log.info(f'{com_name}==={com_name}===上市企业===插入成功') \ No newline at end of file
else:
sqlInsert = 'insert into EnterpriseInfo(CompanyName, SocialCode, EnglishName, Place, isIPO, countryName) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
baseCore.cursor.execute(sqlInsert, (com_name, com_code, com_english_name, 1, 1, '中国内地'))
baseCore.cnx.commit()
log.info(f'{com_name}==={com_name}===非上市企业===插入成功')
\ No newline at end of file
...@@ -16,7 +16,7 @@ cursor = baseCore.cursor ...@@ -16,7 +16,7 @@ cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息'] '股东信息0621']
class File(): class File():
...@@ -164,6 +164,20 @@ class Info(): ...@@ -164,6 +164,20 @@ class Info():
db_storage2.update_one({'序号': str(no)}, { db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}}) '$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass pass
def insert_into(self, dic_info):
if dic_info['股东序号序号']:
db_storage2.find_one_and_update(
{
'序号': str(dic_info['序号']),
"股东序号序号": str(dic_info['股东序号序号'])
},
{'$set': dic_info}, upsert=True)
else:
result = db_storage2.insert_one(dic_info)
print(result)
pass
if __name__ == '__main__': if __name__ == '__main__':
# token = Token() # token = Token()
......
...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查' ...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查'
@retry(tries=5, delay=3) @retry(tries=5, delay=3)
def getTycIdByXYDM(com_name, s): def getTycIdByXYDM(com_name, s):
retData={'state':False, 'tycData':None, 'reput':True} retData={'state': False, 'tycData': None, 'reput': True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}" url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
# url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3" # url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3"
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
......
...@@ -25,7 +25,7 @@ taskType = '天眼查/股东信息' ...@@ -25,7 +25,7 @@ taskType = '天眼查/股东信息'
from classtool import Token, Info from classtool import Token, Info
token = Token() token = Token()
info = Info() Info = Info()
@retry(tries=3, delay=1) @retry(tries=3, delay=1)
def get_html(tycid, driver, dic_info): def get_html(tycid, driver, dic_info):
...@@ -90,22 +90,6 @@ def get_page(url, s, headers): ...@@ -90,22 +90,6 @@ def get_page(url, s, headers):
@retry(tries=5, delay=3) @retry(tries=5, delay=3)
def get_page1(url, s, headers): def get_page1(url, s, headers):
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzYzNjcxMTc0NiIsImlhdCI6MTcxNDk1Njg3MywiZXhwIjoxNzE3NTQ4ODczfQ.qMEvtETT7RS3Rhwq9idu5H2AKMxc2cjtr5bDDW6C6yOFKR-ErgDwT4SOBX9PB2LWDexAG2hNaeAvn6swr-n6VA',
'X-TYCID': 'dad485900fcc11ee8c0de34479b5b939',
'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10)) res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
if res.status_code != 200: if res.status_code != 200:
raise raise
...@@ -181,7 +165,7 @@ def doJob(): ...@@ -181,7 +165,7 @@ def doJob():
for i in range(1000): for i in range(1000):
# while True: # while True:
# todo:设置cookies的使用 # todo:设置cookies的使用
dic_info = {}
headers = { headers = {
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Type': 'application/json', 'Content-Type': 'application/json',
...@@ -196,7 +180,7 @@ def doJob(): ...@@ -196,7 +180,7 @@ def doJob():
continue continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('shareHolderInfo') item = baseCore.redicPullData('shareHolderInfo')
# item = '1|914401010885128005' # item = '900|微创心律管理|None|罗七一|健康科技|¥ 90 亿|¥ 90 亿|¥ 92 亿|823|861|911|ZZSN231108150127681|MicroPort Cardiac Rhythm Management International Limited|中国|None'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C' # social_code = '91110108780992804C'
if item == None: if item == None:
...@@ -204,8 +188,31 @@ def doJob(): ...@@ -204,8 +188,31 @@ def doJob():
continue continue
start = time.time() start = time.time()
no = item.split('|')[0] no = item.split('|')[0]
social_code = item.split('|')[1] social_code = item.split('|')[11]
recept_name = item.split('|')[12]
dic_info = {"序号": item.split('|')[0],
"企业名称(榜单公布)": item.split('|')[1],
"企业别称": item.split('|')[2],
"门人/联合创始": item.split('|')[3],
"行业": item.split('|')[4],
"企业估值(2022年)": item.split('|')[5],
"企业估值(2023年)": item.split('|')[6],
"企业估值(2024年)": item.split('|')[7],
"2022年独角兽排名": item.split('|')[8],
"2023年独角兽排名": item.split('|')[9],
"2024年独角兽排名": item.split('|')[10],
"企业信用代码(中国内地企业需填写信用代码)": item.split('|')[11],
"企业名称(企查查)": item.split('|')[12],
"所属国家": item.split('|')[13]
}
if "ZZSN" in social_code:
dic_info['前十大股东名称'] = ''
dic_info['持股比例'] = ''
dic_info['认缴出资额'] = ''
dic_info['股东序号序号'] = ''
Info.insert_into(dic_info)
break
try: try:
try: try:
data = baseCore.getInfomation(social_code) data = baseCore.getInfomation(social_code)
...@@ -237,7 +244,8 @@ def doJob(): ...@@ -237,7 +244,8 @@ def doJob():
tycid = '' tycid = ''
if tycid == None or tycid == '': if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(xydm, s) retData = getTycIdByXYDM(recept_name, s)
# retData = getTycIdByXYDM("极星汽车销售有限公司", s)
if retData['state']: if retData['state']:
tycid = retData['tycData']['id'] tycid = retData['tycData']['id']
...@@ -269,16 +277,20 @@ def doJob(): ...@@ -269,16 +277,20 @@ def doJob():
baseCore.rePutIntoR('shareHolderInfo', item) baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f"{no}---{xydm}----{tycid}----请求失败----重新放入redis") log.info(f"{no}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(3) time.sleep(3)
continue break
elif charge == -2: elif charge == -2:
# 该企业没有股东信息 # 该企业没有股东信息
token.updateTokeen(id_cookie, 2) token.updateTokeen(id_cookie, 2)
baseCore.rePutIntoR('shareHolderInfo', item) # baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f"{no}---{xydm}----{tycid}----没有股东信息或需要滑动验证----重新放入redis") log.info(f"{no}---{xydm}----{tycid}----没有股东信息或需要滑动验证----重新放入redis")
time.sleep(5) time.sleep(5)
# log.info(f"{id}---{xydm}----{tycid}----没有核心人员") dic_info['前十大股东名称'] = ''
continue dic_info['持股比例'] = ''
dic_info['认缴出资额'] = ''
dic_info['股东序号序号'] = ''
Info.insert_into(dic_info)
break
else: else:
log.info(f"{no}---{xydm}----{tycid}") log.info(f"{no}---{xydm}----{tycid}")
...@@ -310,7 +322,7 @@ def doJob(): ...@@ -310,7 +322,7 @@ def doJob():
flag = 1 flag = 1
else: else:
if total_page3 == charge: if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?_={}&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1' url = 'https://capi.tianyancha.com/cloud-listed-company/listed/holder/topTen?&gid={}&pageSize=20&pageNum={}&percentLevel=-100&type=1'
total_page = total_page3 total_page = total_page3
data_page_one = data_page3 data_page_one = data_page3
flag = 3 flag = 3
...@@ -325,15 +337,52 @@ def doJob(): ...@@ -325,15 +337,52 @@ def doJob():
baseCore.rePutIntoR('shareHolderInfo', item) baseCore.rePutIntoR('shareHolderInfo', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====') log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue continue
# # todo:获取页数 # todo:获取页数
# total_page = 34
# flag = 2
# todo: 测试程序是否执行到这一步
log.info(f'总数为{total_page}') log.info(f'总数为{total_page}')
if int(total_page % 20) == 0:
maxpage = int((total_page / 20) + 1)
else:
maxpage = int((total_page / 20) + 1) + 1
for page in range(1, maxpage):
if page == 1:
data_page = data_page_one data_page = data_page_one
errorCode = data_page['errorCode'] errorCode = data_page['errorCode']
else:
res = None
for d in range(3):
ip = baseCore.get_proxy()
if flag == 1:
url_ = url
payload = {"gid": f"{tycid}", "pageSize": 10, "pageNum": f"{page}", "sortField": "",
"sortType": "-100", "historyType": 1}
try:
res = s.post(url=url_, headers=headers, data=json.dumps(payload), proxies=ip,
timeout=(5, 10))
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
else:
url_ = url.format(tycid, page)
try:
res = s.get(url_, headers=headers, proxies=ip, timeout=(5, 10)) # ,verify=False
except requests.exceptions.RequestException as e:
log.info(e)
time.sleep(1)
continue
data_page = res.json()
errorCode = res.json()['errorCode']
if errorCode != 0:
continue
else:
break
res.close()
if errorCode == 0: if errorCode == 0:
pass pass
else: else:
...@@ -359,27 +408,40 @@ def doJob(): ...@@ -359,27 +408,40 @@ def doJob():
# res.close() # res.close()
log.info(f'----flag:{flag}----') log.info(f'----flag:{flag}----')
log.info(f'-----list_all:{len(list_all)}----') log.info(f'-----list_all:{len(list_all)}----')
for idx,holder_info in enumerate(list_all):
shareHolderName, percent = '', '' shareHolderName, percent = '', ''
if flag == 1: if flag == 1:
holder_info = list_all[0]
shareHolderName = holder_info['shareHolderName'] shareHolderName = holder_info['shareHolderName']
percent = holder_info['percent'] percent = holder_info['percent']
capitalTotal = holder_info['capitalTotal']
elif flag == 3: elif flag == 3:
holder_info = list_all[0]
shareHolderName = holder_info['name'] shareHolderName = holder_info['name']
percent = holder_info['proportion'] percent = holder_info['proportion']
capitalTotal = ''
else: else:
holder_info = list_all[0]
shareHolderName = holder_info['holder_name'] shareHolderName = holder_info['holder_name']
percent = holder_info['longHeldRatioWithUnit'] percent = holder_info['longHeldRatioWithUnit']
capitalTotal = ''
if shareHolderName and percent: if shareHolderName and percent:
dic_info['最大持股名称'] = shareHolderName if page == 1:
dic_info['股东序号序号'] = idx + 1
else:
dic_info['股东序号序号'] = idx + 1 + (10 * (page-1))
dic_info['前十大股东名称'] = shareHolderName
dic_info['持股比例'] = percent dic_info['持股比例'] = percent
# todo: 更新字段 dic_info['认缴出资额'] = capitalTotal
# info.update_holder(no, dic_info) # todo: 插入一条新纪录
log.info(dic_info)
try:
del dic_info['_id']
except:
pass
Info.insert_into(dic_info)
log.info('=========成功======') log.info('=========成功======')
token.updateTokeen(id_cookie, 3) token.updateTokeen(id_cookie, 3)
# time.sleep(randint(5,10)) # time.sleep(randint(5,10))
time.sleep(5) time.sleep(5)
...@@ -395,7 +457,7 @@ def doJob(): ...@@ -395,7 +457,7 @@ def doJob():
takeTime = baseCore.getTimeCost(start, time.time()) takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}') baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5) time.sleep(5)
break # break
......
import json import openpyxl
import redis import redis
from bs4 import BeautifulSoup # 先采两千强和独角兽
import langid # 连接到Redis服务器
redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
from base.BaseCore import BaseCore
baseCore =BaseCore() # 打开Excel文件
import pymysql # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年福布斯2000强榜单(已排除2023年).xlsx')
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数")) # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年福布斯2000强.xlsx')
# cnx_ = baseCore.cnx # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年独角兽企业(已排除2024年).xlsx')
# cursor_ = baseCore.cursor # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2024胡润独角兽(4).xlsx')
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji', # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年世界500强企业39家(已排除23年上榜企业)2.xlsx')
charset='utf8mb4') # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年世界500强名单.xlsx')
cursor_ = cnx_.cursor() # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年欧盟2500(已排除2022年).xlsx')
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s " workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年欧盟2500强.xlsx')
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql,(200,'91350000158142711F')) # 选择要读取的工作表
# cnx_.commit() worksheet = workbook['Sheet1']
# worksheet = workbook['sheet1']
import time
# from getTycId import getTycIdByXYDM # 选择要读取的列
# social_code = '91440101231247350J' column_index = 0 # 选择第2列
# data = baseCore.getInfomation(social_code)
# tycid = data[11] # 遍历指定列的单元格,并将值放入Redis列表
# if tycid == None: for row in worksheet.iter_rows(values_only=True):
# print(data) try:
# retData = getTycIdByXYDM(social_code) cell_value = row[1]
# tycid = retData['tycData']['id'] except:
# print(tycid) print(row[1])
continue
# time_struct = time.localtime(int(1692762780000 / 1000)) # 首先把时间戳转换为结构化时间 # print(type(cell_value))
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间 # print(cell_value)
# print(time_format) if row[0] == '序列' or row[0] == '序号' or row[0] == '排序':
continue
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
# #原键名 # 309
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode' # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2022年福布斯2000强"
# #目标键名 # item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年福布斯2000强"
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode' # item = str(row[2])+ "|"+str(row[3]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年独角兽"
# values = r.lrange(key1,0,-1) # item = str(row[2])+ "|"+str(row[4]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2024年独角兽"
# for value in values: # item = str(row[1])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2022年世界500强"
# r.rpush(key2, value) # item = str(row[1])+ "|"+str(row[3]) + "|" + str(row[5])+ "|" + str(row[6])+ "|" + "2023年世界500强"
# # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2023年欧盟2500"
# # 关闭Redis连接 item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[1])+ "|" + "2022年欧盟2500"
# r.close() redis_client.rpush('GOOGLE_KEYWORDS:COMPANY_NAME', item)
# redis_client.rpush('BAIDU_KEYWORDS:COMPANY_NAME', item)
print(item)
list_all = [] # break
if list_all: # 关闭Excel文件
print(len(list_all)) workbook.close()
else:
print('---')
...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count):
return True, dic_user_count return True, dic_user_count
# 修改token使用时间 # 修改token使用时间
updateTokeen(token, 3) updateTokeen(token, 3)
pagecount = json_search['app_msg_cnt'] # 837
# 保存数据到数据库 # 保存数据到数据库
return insertWxList(dic_url, json_search, page, user_name), dic_user_count return insertWxList(dic_url, json_search, page, user_name), dic_user_count
...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count): ...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count):
origin = dic_url['name'] origin = dic_url['name']
biz = dic_url['biz'] biz = dic_url['biz']
# retFlag, dic_user_count = getPageData(dic_url, 1, dic_user_count)
for page in range(1, 6): for page in range(1, 6):
retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count) retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
time.sleep(random.randint(60, 181)) time.sleep(random.randint(60, 181))
...@@ -311,12 +313,12 @@ def getnumber_redis(): ...@@ -311,12 +313,12 @@ def getnumber_redis():
if __name__ == "__main__": if __name__ == "__main__":
getFromSql() # getFromSql()
# numbers = getnumber_redis() # numbers = getnumber_redis()
# log.info("当前批次采集公众号个数{}".format(numbers)) # log.info("当前批次采集公众号个数{}".format(numbers))
# time.sleep(3) # time.sleep(3)
# dic_user_count = {} dic_user_count = {}
# # dic_user_count = { # # dic_user_count = {
# # 'name': '', # # 'name': '',
# # 'use_count': 0, # # 'use_count': 0,
...@@ -344,5 +346,5 @@ if __name__ == "__main__": ...@@ -344,5 +346,5 @@ if __name__ == "__main__":
# for key, value in dic_user_count.items(): # for key, value in dic_user_count.items():
# log.info(f"====账号{key},使用次数{value}") # log.info(f"====账号{key},使用次数{value}")
# # break # # break
# # infoSourceCode = 'IN-20220917-0159' infoSourceCode = 'IN-20231110-0003'
# # getWxList(infoSourceCode) getWxList(infoSourceCode, dic_user_count)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论