提交 dc808e12 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# cache = {
# "company": {
# "上汽集团": 1,
# "欧盟委员会": 13,
# "欧盟": 8,
# "欧盟委员会总部大厦一角": 2
# },
# "person": {
# "梅赛德斯": 2,
# "齐普策": 8,
# "韦杜姆": 5,
# "朔尔茨": 7,
# "冯德莱恩": 1,
# "布尔茨": 4,
# "纳吉": 3,
# "哈贝克": 3,
# "特斯拉": 2,
# "莫里永": 1,
# "维辛": 1,
# "康松林": 1,
# "尼古拉斯普瓦捷NiclasPoitiers": 1,
# "李缘": 1,
# "伯格": 1,
# "阿斯拉克伯格AslakBerg": 1,
# "林燕": 1,
# "华盛顿": 1,
# "布鲁盖尔Bruegel": 1,
# "川普": 1,
# "哈桑-扎米特": 1,
# "小鹏": 2,
# "康逸": 3,
# "费尔": 1,
# "埃德加博高": 1,
# "蔚来": 3,
# "杜登赫费尔": 3,
# "赵丁喆": 3,
# "卢基斯": 1,
# "斯泰兰蒂斯": 2,
# "保罗博若思": 1,
# "海国": 1,
# "斯特凡德古阿拉": 1,
# "弗兰克": 1,
# "施沃佩": 1,
# "费迪南德": 3,
# "迪尔克扬杜拉": 1,
# "米扎克": 1,
# "帕沃尔安塔利奇": 1,
# "亚采克米扎克": 1,
# "弗尔季奥蒂洛": 1,
# "张晨霖": 1,
# "基塞伊佐尔坦": 1,
# "德古阿拉": 3,
# "明道加斯普": 1,
# "杜登赫": 1,
# "奥托尔巴吉": 1,
# "郭晨": 1,
# "波罗": 1,
# "尹栋逊": 1,
# "颜景辉": 1,
# "段思瑶": 1,
# "裴健如": 1,
# "陈庆": 1,
# "纳吉马顿": 2,
# "崔东树": 1,
# "PatrickHummel": 1,
# "如蔚": 1,
# "李斌": 1,
# "福尔克•维辛": 1,
# "蔚": 1,
# "TechWeb": 1,
# "Suky": 1,
# "陈继业": 1,
# "欧方": 1,
# "齐普策OliverZipse": 1,
# "康林松OlaKaellenius": 1,
# "PFA": 1,
# "ACEA": 1,
# "希尔德加德": 1,
# "穆勒HildegardMueller": 1,
# "阿道夫乌尔索阿道夫": 1,
# "乌尔索": 1,
# "马库斯费伯MarkusFerber": 1,
# "特蕾莎里贝拉TeresaRibera": 1,
# "福尔克维辛": 2,
# "辛婧": 1,
# "殷晓圣": 3,
# "李若佳": 1,
# "刘维佳": 1,
# "萨拉热窝": 1,
# "专员薇奥莱塔布尔茨": 4,
# "哈贝克RobertHabeck": 1,
# "布特克MaximilianButek": 1,
# "关乌": 1,
# "布特克": 2,
# "俄乌": 1,
# "哈桑": 2,
# "吕瑟尔斯海姆": 2,
# "何塞普戈梅斯": 3,
# "李学军": 2,
# "刘向": 2,
# "戈梅斯": 2,
# "马灿": 2,
# "克雷希米尔": 2,
# "康林松": 3,
# "于荣": 2,
# "霍尔格格尔克": 3,
# "陈斌杰": 2,
# "梁国勇": 3,
# "李博": 2,
# "乔纳森博格": 2,
# "胡加齐": 2,
# "单玮怡": 2,
# "林剑": 3,
# "马克西米利安布特克MaximilianButek": 1,
# "何亚东": 1,
# "吕骞": 1,
# "金瑞庭": 1,
# "罗知之": 1,
# "马铭博": 1,
# "马铭": 1,
# "梅赛德斯-奔驰": 1,
# "埃隆马斯克": 1,
# "罗伯特哈贝克RobertHabeck": 1,
# "奥拉夫朔尔茨OlafScholz": 1
# },
# "location": {
# "上海市": 4,
# "北京市": 2,
# "江西省": 1,
# "赣州市": 1,
# "常州市": 2,
# "武进区": 2,
# "江苏省": 2
# },
# "sentiment": {
# "负面": 4,
# "中性": 10,
# "正面": 10
# },
# "time": {
# "17.4": 2,
# "6月12日": 6,
# "12日": 2,
# "2024年06月14日": 1,
# "2024年6月3日": 1,
# "6月13日": 1,
# "7月4日": 1,
# "38.1": 1,
# "2023年2月15日": 1,
# "6月17日": 3,
# "60.7": 1,
# "6月14日": 1
# }
# }
#
# top_keywords = {keyword_type: sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)[:10] for
# keyword_type, keyword_freq in cache.items()}
# # print(top_keywords)
#
# # 提取前十的关键词
# top_keywords_dict = {keyword_type: [keyword for keyword, freq in keywords] for keyword_type, keywords in
# top_keywords.items()}
# print(top_keywords_dict)
#
# industry_result = top_keywords_dict['industry'] if 'industry' in top_keywords_dict else []
# insert_industry = ",".join(industry_result)
# company_result = top_keywords_dict["company"] if "company" in top_keywords_dict else []
# person_result = top_keywords_dict["person"] if "person" in top_keywords_dict else []
# sentiment_result = top_keywords_dict["sentiment"] if "sentiment" in top_keywords_dict else []
# location_result = top_keywords_dict["location"] if "location" in top_keywords_dict else []
# time_result = top_keywords_dict["time"] if "time" in top_keywords_dict else []
# print(f"insert_industry:{insert_industry}")
# insert_company = ",".join(company_result)
# insert_person = ",".join(person_result)
# insert_sentiment = ",".join(sentiment_result)
# insert_location = ",".join(location_result)
# insert_time = ",".join(time_result)
# print(f"insert_company:{insert_company}")
# print(f"insert_person:{insert_person}")
# print(f"insert_sentiment:{insert_sentiment}")
# print(f"insert_location:{insert_location}")
# print(f"insert_time:{insert_time}")
# print(type(insert_industry))
#
# test_none = None
# test_set = set(test_none)   # NOTE(review): set(None) raises TypeError — None is not iterable
# print(test_set)
# set1 = {'万家小新,迎春,李虹萦,张灏然,王宏志,袁野,谭作钧,习近平,鄂维南,苟坪'}
# set2 = {'孟晚舟,李虹萦,张灏然,习近平,王宏志,鄂维南,张玉卓,谭作钧'}
# # NOTE(review): each set literal holds ONE comma-joined string, so the &
# # intersection compares whole strings and is always empty here; split each
# # string on ',' into individual names first if a per-name intersection is wanted.
# intersection_set = set1 & set2
#
# print(intersection_set)
# Demo: serialize a pandas DataFrame with an int64 column to a JSON string.
# json.dumps cannot handle numpy.int64 scalars, so the column is converted
# to a plain integer dtype before dumping the records.
import json

import pandas as pd

# Sample frame; force the 'id' column to numpy int64 to reproduce the issue.
df = pd.DataFrame({'id': [1, 2, 3], 'value': [4, 5, 6]})
df['id'] = df['id'].astype('int64')  # ensure id column is int64

# Convert int64 -> int so json.dumps can serialize the record dicts.
# NOTE(review): astype('int') still maps to a numpy integer dtype; recent
# pandas versions already emit native Python ints from to_dict(), so this
# step may be redundant — confirm against the pandas version in use.
df['id'] = df['id'].astype('int')

# DataFrame -> list of row dicts -> JSON string.
json_str = json.dumps(df.to_dict(orient='records'))
print(json_str)
...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task(): ...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS(): def AnnualEnterpriseUS():
cnx,cursor = connectSql() cnx,cursor = connectSql()
# 获取美股企业 # 获取美股企业
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'" # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' " # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
us_query = "select cik from mgzqyjwyh_list where state=2 "
#ZZSN22080900000025 #ZZSN22080900000025
cursor.execute(us_query) cursor.execute(us_query)
us_result = cursor.fetchall() us_result = cursor.fetchall()
...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS(): ...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS():
us_social_list = [item[0] for item in us_result] us_social_list = [item[0] for item in us_result]
print('=======') print('=======')
for item in us_social_list: for item in us_social_list:
r.rpush('AnnualEnterprise:usqy_socialCode', item) r.rpush('Sec_cik_US:uscik_annualReport', item)
closeSql(cnx,cursor) closeSql(cnx,cursor)
#国外企业基本信息 redis中放入id #国外企业基本信息 redis中放入id
...@@ -659,12 +660,14 @@ if __name__ == "__main__": ...@@ -659,12 +660,14 @@ if __name__ == "__main__":
# zhuangjingtexind() # zhuangjingtexind()
# NoticeEnterprise() # NoticeEnterprise()
# NoticeDF() # NoticeDF()
AnnualEnterpriseUS()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
# BaseInfoEnterprise() # BaseInfoEnterprise()
# BaseInfoEnterpriseAbroad() # BaseInfoEnterpriseAbroad()
# NewsEnterprise_task() # NewsEnterprise_task()
# NewsEnterprise() # NewsEnterprise()
# NoticeEnterprise()
# CorPerson() # CorPerson()
# china100() # china100()
# global100() # global100()
...@@ -678,7 +681,7 @@ if __name__ == "__main__": ...@@ -678,7 +681,7 @@ if __name__ == "__main__":
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
AnnualEnterprise() # AnnualEnterprise()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
......
...@@ -16,7 +16,7 @@ cursor = baseCore.cursor ...@@ -16,7 +16,7 @@ cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息'] '股东信息0621']
class File(): class File():
...@@ -164,6 +164,20 @@ class Info(): ...@@ -164,6 +164,20 @@ class Info():
db_storage2.update_one({'序号': str(no)}, { db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}}) '$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass pass
def insert_into(self, dic_info):
if dic_info['股东序号序号']:
db_storage2.find_one_and_update(
{
'序号': str(dic_info['序号']),
"股东序号序号": str(dic_info['股东序号序号'])
},
{'$set': dic_info}, upsert=True)
else:
result = db_storage2.insert_one(dic_info)
print(result)
pass
if __name__ == '__main__': if __name__ == '__main__':
# token = Token() # token = Token()
......
...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查' ...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查'
@retry(tries=5, delay=3) @retry(tries=5, delay=3)
def getTycIdByXYDM(com_name, s): def getTycIdByXYDM(com_name, s):
retData={'state':False, 'tycData':None, 'reput':True} retData={'state': False, 'tycData': None, 'reput': True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}" url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
# url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3" # url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3"
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
......
import json import openpyxl
import redis import redis
from bs4 import BeautifulSoup # 先采两千强和独角兽
import langid # 连接到Redis服务器
redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
from base.BaseCore import BaseCore
baseCore =BaseCore() # 打开Excel文件
import pymysql # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年福布斯2000强榜单(已排除2023年).xlsx')
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数")) # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年福布斯2000强.xlsx')
# cnx_ = baseCore.cnx # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年独角兽企业(已排除2024年).xlsx')
# cursor_ = baseCore.cursor # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2024胡润独角兽(4).xlsx')
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji', # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年世界500强企业39家(已排除23年上榜企业)2.xlsx')
charset='utf8mb4') # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年世界500强名单.xlsx')
cursor_ = cnx_.cursor() # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年欧盟2500(已排除2022年).xlsx')
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s " workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年欧盟2500强.xlsx')
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql,(200,'91350000158142711F')) # 选择要读取的工作表
# cnx_.commit() worksheet = workbook['Sheet1']
# worksheet = workbook['sheet1']
import time
# from getTycId import getTycIdByXYDM # 选择要读取的列
# social_code = '91440101231247350J' column_index = 0 # 选择第2列
# data = baseCore.getInfomation(social_code)
# tycid = data[11] # 遍历指定列的单元格,并将值放入Redis列表
# if tycid == None: for row in worksheet.iter_rows(values_only=True):
# print(data) try:
# retData = getTycIdByXYDM(social_code) cell_value = row[1]
# tycid = retData['tycData']['id'] except:
# print(tycid) print(row[1])
continue
# time_struct = time.localtime(int(1692762780000 / 1000)) # 首先把时间戳转换为结构化时间 # print(type(cell_value))
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间 # print(cell_value)
# print(time_format) if row[0] == '序列' or row[0] == '序号' or row[0] == '排序':
continue
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
# #原键名 # 309
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode' # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2022年福布斯2000强"
# #目标键名 # item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年福布斯2000强"
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode' # item = str(row[2])+ "|"+str(row[3]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年独角兽"
# values = r.lrange(key1,0,-1) # item = str(row[2])+ "|"+str(row[4]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2024年独角兽"
# for value in values: # item = str(row[1])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2022年世界500强"
# r.rpush(key2, value) # item = str(row[1])+ "|"+str(row[3]) + "|" + str(row[5])+ "|" + str(row[6])+ "|" + "2023年世界500强"
# # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2023年欧盟2500"
# # 关闭Redis连接 item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[1])+ "|" + "2022年欧盟2500"
# r.close() redis_client.rpush('GOOGLE_KEYWORDS:COMPANY_NAME', item)
# redis_client.rpush('BAIDU_KEYWORDS:COMPANY_NAME', item)
print(item)
list_all = [] # break
if list_all: # 关闭Excel文件
print(len(list_all)) workbook.close()
else:
print('---')
...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count):
return True, dic_user_count return True, dic_user_count
# 修改token使用时间 # 修改token使用时间
updateTokeen(token, 3) updateTokeen(token, 3)
pagecount = json_search['app_msg_cnt'] # 837
# 保存数据到数据库 # 保存数据到数据库
return insertWxList(dic_url, json_search, page, user_name), dic_user_count return insertWxList(dic_url, json_search, page, user_name), dic_user_count
...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count): ...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count):
origin = dic_url['name'] origin = dic_url['name']
biz = dic_url['biz'] biz = dic_url['biz']
# retFlag, dic_user_count = getPageData(dic_url, 1, dic_user_count)
for page in range(1, 6): for page in range(1, 6):
retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count) retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
time.sleep(random.randint(60, 181)) time.sleep(random.randint(60, 181))
...@@ -311,12 +313,12 @@ def getnumber_redis(): ...@@ -311,12 +313,12 @@ def getnumber_redis():
if __name__ == "__main__": if __name__ == "__main__":
getFromSql() # getFromSql()
# numbers = getnumber_redis() # numbers = getnumber_redis()
# log.info("当前批次采集公众号个数{}".format(numbers)) # log.info("当前批次采集公众号个数{}".format(numbers))
# time.sleep(3) # time.sleep(3)
# dic_user_count = {} dic_user_count = {}
# # dic_user_count = { # # dic_user_count = {
# # 'name': '', # # 'name': '',
# # 'use_count': 0, # # 'use_count': 0,
...@@ -344,5 +346,5 @@ if __name__ == "__main__": ...@@ -344,5 +346,5 @@ if __name__ == "__main__":
# for key, value in dic_user_count.items(): # for key, value in dic_user_count.items():
# log.info(f"====账号{key},使用次数{value}") # log.info(f"====账号{key},使用次数{value}")
# # break # # break
# # infoSourceCode = 'IN-20220917-0159' infoSourceCode = 'IN-20231110-0003'
# # getWxList(infoSourceCode) getWxList(infoSourceCode, dic_user_count)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论