提交 dc808e12 作者: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# cache = {
# "company": {
# "上汽集团": 1,
# "欧盟委员会": 13,
# "欧盟": 8,
# "欧盟委员会总部大厦一角": 2
# },
# "person": {
# "梅赛德斯": 2,
# "齐普策": 8,
# "韦杜姆": 5,
# "朔尔茨": 7,
# "冯德莱恩": 1,
# "布尔茨": 4,
# "纳吉": 3,
# "哈贝克": 3,
# "特斯拉": 2,
# "莫里永": 1,
# "维辛": 1,
# "康松林": 1,
# "尼古拉斯普瓦捷NiclasPoitiers": 1,
# "李缘": 1,
# "伯格": 1,
# "阿斯拉克伯格AslakBerg": 1,
# "林燕": 1,
# "华盛顿": 1,
# "布鲁盖尔Bruegel": 1,
# "川普": 1,
# "哈桑-扎米特": 1,
# "小鹏": 2,
# "康逸": 3,
# "费尔": 1,
# "埃德加博高": 1,
# "蔚来": 3,
# "杜登赫费尔": 3,
# "赵丁喆": 3,
# "卢基斯": 1,
# "斯泰兰蒂斯": 2,
# "保罗博若思": 1,
# "海国": 1,
# "斯特凡德古阿拉": 1,
# "弗兰克": 1,
# "施沃佩": 1,
# "费迪南德": 3,
# "迪尔克扬杜拉": 1,
# "米扎克": 1,
# "帕沃尔安塔利奇": 1,
# "亚采克米扎克": 1,
# "弗尔季奥蒂洛": 1,
# "张晨霖": 1,
# "基塞伊佐尔坦": 1,
# "德古阿拉": 3,
# "明道加斯普": 1,
# "杜登赫": 1,
# "奥托尔巴吉": 1,
# "郭晨": 1,
# "波罗": 1,
# "尹栋逊": 1,
# "颜景辉": 1,
# "段思瑶": 1,
# "裴健如": 1,
# "陈庆": 1,
# "纳吉马顿": 2,
# "崔东树": 1,
# "PatrickHummel": 1,
# "如蔚": 1,
# "李斌": 1,
# "福尔克•维辛": 1,
# "蔚": 1,
# "TechWeb": 1,
# "Suky": 1,
# "陈继业": 1,
# "欧方": 1,
# "齐普策OliverZipse": 1,
# "康林松OlaKaellenius": 1,
# "PFA": 1,
# "ACEA": 1,
# "希尔德加德": 1,
# "穆勒HildegardMueller": 1,
# "阿道夫乌尔索阿道夫": 1,
# "乌尔索": 1,
# "马库斯费伯MarkusFerber": 1,
# "特蕾莎里贝拉TeresaRibera": 1,
# "福尔克维辛": 2,
# "辛婧": 1,
# "殷晓圣": 3,
# "李若佳": 1,
# "刘维佳": 1,
# "萨拉热窝": 1,
# "专员薇奥莱塔布尔茨": 4,
# "哈贝克RobertHabeck": 1,
# "布特克MaximilianButek": 1,
# "关乌": 1,
# "布特克": 2,
# "俄乌": 1,
# "哈桑": 2,
# "吕瑟尔斯海姆": 2,
# "何塞普戈梅斯": 3,
# "李学军": 2,
# "刘向": 2,
# "戈梅斯": 2,
# "马灿": 2,
# "克雷希米尔": 2,
# "康林松": 3,
# "于荣": 2,
# "霍尔格格尔克": 3,
# "陈斌杰": 2,
# "梁国勇": 3,
# "李博": 2,
# "乔纳森博格": 2,
# "胡加齐": 2,
# "单玮怡": 2,
# "林剑": 3,
# "马克西米利安布特克MaximilianButek": 1,
# "何亚东": 1,
# "吕骞": 1,
# "金瑞庭": 1,
# "罗知之": 1,
# "马铭博": 1,
# "马铭": 1,
# "梅赛德斯-奔驰": 1,
# "埃隆马斯克": 1,
# "罗伯特哈贝克RobertHabeck": 1,
# "奥拉夫朔尔茨OlafScholz": 1
# },
# "location": {
# "上海市": 4,
# "北京市": 2,
# "江西省": 1,
# "赣州市": 1,
# "常州市": 2,
# "武进区": 2,
# "江苏省": 2
# },
# "sentiment": {
# "负面": 4,
# "中性": 10,
# "正面": 10
# },
# "time": {
# "17.4": 2,
# "6月12日": 6,
# "12日": 2,
# "2024年06月14日": 1,
# "2024年6月3日": 1,
# "6月13日": 1,
# "7月4日": 1,
# "38.1": 1,
# "2023年2月15日": 1,
# "6月17日": 3,
# "60.7": 1,
# "6月14日": 1
# }
# }
#
# top_keywords = {keyword_type: sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)[:10] for
# keyword_type, keyword_freq in cache.items()}
# # print(top_keywords)
#
# # 提取前十的关键词
# top_keywords_dict = {keyword_type: [keyword for keyword, freq in keywords] for keyword_type, keywords in
# top_keywords.items()}
# print(top_keywords_dict)
#
# industry_result = top_keywords_dict['industry'] if 'industry' in top_keywords_dict else []
# insert_industry = ",".join(industry_result)
# company_result = top_keywords_dict["company"] if "company" in top_keywords_dict else []
# person_result = top_keywords_dict["person"] if "person" in top_keywords_dict else []
# sentiment_result = top_keywords_dict["sentiment"] if "sentiment" in top_keywords_dict else []
# location_result = top_keywords_dict["location"] if "location" in top_keywords_dict else []
# time_result = top_keywords_dict["time"] if "time" in top_keywords_dict else []
# print(f"insert_industry:{insert_industry}")
# insert_company = ",".join(company_result)
# insert_person = ",".join(person_result)
# insert_sentiment = ",".join(sentiment_result)
# insert_location = ",".join(location_result)
# insert_time = ",".join(time_result)
# print(f"insert_company:{insert_company}")
# print(f"insert_person:{insert_person}")
# print(f"insert_sentiment:{insert_sentiment}")
# print(f"insert_location:{insert_location}")
# print(f"insert_time:{insert_time}")
# print(type(insert_industry))
#
# test_none = None
# test_set = set(test_none)   # NOTE(review): set(None) raises TypeError — None is not iterable
# print(test_set)
# set1 = {'万家小新,迎春,李虹萦,张灏然,王宏志,袁野,谭作钧,习近平,鄂维南,苟坪'}
# set2 = {'孟晚舟,李虹萦,张灏然,习近平,王宏志,鄂维南,张玉卓,谭作钧'}
# # NOTE(review): each set literal holds ONE comma-joined string, so the &
# # intersection compares whole strings and is always empty here; split each
# # string on ',' into individual names first if a per-name intersection is wanted.
# intersection_set = set1 & set2
#
# print(intersection_set)
# Demo: serialize a pandas DataFrame with an int64 column to a JSON string.
# json.dumps cannot handle numpy.int64 scalars, so the column is converted
# to a plain integer dtype before dumping the records.
import json

import pandas as pd

# Sample frame; force the 'id' column to numpy int64 to reproduce the issue.
df = pd.DataFrame({'id': [1, 2, 3], 'value': [4, 5, 6]})
df['id'] = df['id'].astype('int64')  # ensure id column is int64

# Convert int64 -> int so json.dumps can serialize the record dicts.
# NOTE(review): astype('int') still maps to a numpy integer dtype; recent
# pandas versions already emit native Python ints from to_dict(), so this
# step may be redundant — confirm against the pandas version in use.
df['id'] = df['id'].astype('int')

# DataFrame -> list of row dicts -> JSON string.
json_str = json.dumps(df.to_dict(orient='records'))
print(json_str)
...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task(): ...@@ -372,8 +372,9 @@ def AnnualEnterpriseXueQ_task():
def AnnualEnterpriseUS(): def AnnualEnterpriseUS():
cnx,cursor = connectSql() cnx,cursor = connectSql()
# 获取美股企业 # 获取美股企业
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'" # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null and CreateTime='2023-08-15 14:00:00'"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' " # us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
us_query = "select cik from mgzqyjwyh_list where state=2 "
#ZZSN22080900000025 #ZZSN22080900000025
cursor.execute(us_query) cursor.execute(us_query)
us_result = cursor.fetchall() us_result = cursor.fetchall()
...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS(): ...@@ -381,7 +382,7 @@ def AnnualEnterpriseUS():
us_social_list = [item[0] for item in us_result] us_social_list = [item[0] for item in us_result]
print('=======') print('=======')
for item in us_social_list: for item in us_social_list:
r.rpush('AnnualEnterprise:usqy_socialCode', item) r.rpush('Sec_cik_US:uscik_annualReport', item)
closeSql(cnx,cursor) closeSql(cnx,cursor)
#国外企业基本信息 redis中放入id #国外企业基本信息 redis中放入id
...@@ -659,12 +660,14 @@ if __name__ == "__main__": ...@@ -659,12 +660,14 @@ if __name__ == "__main__":
# zhuangjingtexind() # zhuangjingtexind()
# NoticeEnterprise() # NoticeEnterprise()
# NoticeDF() # NoticeDF()
AnnualEnterpriseUS()
# AnnualEnterpriseIPO() # AnnualEnterpriseIPO()
# AnnualEnterprise() # AnnualEnterprise()
# BaseInfoEnterprise() # BaseInfoEnterprise()
# BaseInfoEnterpriseAbroad() # BaseInfoEnterpriseAbroad()
# NewsEnterprise_task() # NewsEnterprise_task()
# NewsEnterprise() # NewsEnterprise()
# NoticeEnterprise()
# CorPerson() # CorPerson()
# china100() # china100()
# global100() # global100()
...@@ -678,7 +681,7 @@ if __name__ == "__main__": ...@@ -678,7 +681,7 @@ if __name__ == "__main__":
# SEC_CIK() # SEC_CIK()
# dujioashou() # dujioashou()
# omeng() # omeng()
AnnualEnterprise() # AnnualEnterprise()
# AnnualEnterpriseUS() # AnnualEnterpriseUS()
# NoticeEnterprise_task() # NoticeEnterprise_task()
# AnnualEnterprise_task() # AnnualEnterprise_task()
......
...@@ -16,7 +16,7 @@ cursor = baseCore.cursor ...@@ -16,7 +16,7 @@ cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息'] '股东信息0621']
class File(): class File():
...@@ -164,6 +164,20 @@ class Info(): ...@@ -164,6 +164,20 @@ class Info():
db_storage2.update_one({'序号': str(no)}, { db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}}) '$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass pass
def insert_into(self, dic_info):
if dic_info['股东序号序号']:
db_storage2.find_one_and_update(
{
'序号': str(dic_info['序号']),
"股东序号序号": str(dic_info['股东序号序号'])
},
{'$set': dic_info}, upsert=True)
else:
result = db_storage2.insert_one(dic_info)
print(result)
pass
if __name__ == '__main__': if __name__ == '__main__':
# token = Token() # token = Token()
......
...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查' ...@@ -64,7 +64,7 @@ taskType = '天眼查企业id/天眼查'
@retry(tries=5, delay=3) @retry(tries=5, delay=3)
def getTycIdByXYDM(com_name, s): def getTycIdByXYDM(com_name, s):
retData={'state':False, 'tycData':None, 'reput':True} retData={'state': False, 'tycData': None, 'reput': True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}" url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
# url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3" # url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3"
ip = baseCore.get_proxy() ip = baseCore.get_proxy()
......
import json import openpyxl
import redis import redis
from bs4 import BeautifulSoup # 先采两千强和独角兽
import langid # 连接到Redis服务器
redis_client = redis.Redis(host="114.116.90.53", port=6380, password='clbzzsn', db=6)
from base.BaseCore import BaseCore
baseCore =BaseCore() # 打开Excel文件
import pymysql # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年福布斯2000强榜单(已排除2023年).xlsx')
# print(baseCore.detect_language("是对jhjjhjhhjjhjhjh的浮点数")) # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年福布斯2000强.xlsx')
# cnx_ = baseCore.cnx # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年独角兽企业(已排除2024年).xlsx')
# cursor_ = baseCore.cursor # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2024胡润独角兽(4).xlsx')
cnx_ = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='caiji', # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年世界500强企业39家(已排除23年上榜企业)2.xlsx')
charset='utf8mb4') # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年世界500强名单.xlsx')
cursor_ = cnx_.cursor() # workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2023年欧盟2500(已排除2022年).xlsx')
# updateBeginSql = f"update Tfbs set state3=%s where col3=%s " workbook = openpyxl.load_workbook(r'D:\kkwork\企业数据\数据组提供\企业裁员数据\2022年欧盟2500强.xlsx')
# # print(updateBeginSql)
# cursor_.execute(updateBeginSql,(200,'91350000158142711F')) # 选择要读取的工作表
# cnx_.commit() worksheet = workbook['Sheet1']
# worksheet = workbook['sheet1']
import time
# from getTycId import getTycIdByXYDM # 选择要读取的列
# social_code = '91440101231247350J' column_index = 0 # 选择第2列
# data = baseCore.getInfomation(social_code)
# tycid = data[11] # 遍历指定列的单元格,并将值放入Redis列表
# if tycid == None: for row in worksheet.iter_rows(values_only=True):
# print(data) try:
# retData = getTycIdByXYDM(social_code) cell_value = row[1]
# tycid = retData['tycData']['id'] except:
# print(tycid) print(row[1])
continue
# time_struct = time.localtime(int(1692762780000 / 1000)) # 首先把时间戳转换为结构化时间 # print(type(cell_value))
# time_format = time.strftime("%Y-%m-%d %H-%M-%S", time_struct) # 把结构化时间转换为格式化时间 # print(cell_value)
# print(time_format) if row[0] == '序列' or row[0] == '序号' or row[0] == '排序':
continue
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=6)
# #原键名 # 309
# key1 = 'CorPersonEnterpriseFbs:gnqy_socialCode' # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2022年福布斯2000强"
# #目标键名 # item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年福布斯2000强"
# key2 = 'NewsEnterpriseFbs:gnqy_socialCode' # item = str(row[2])+ "|"+str(row[3]) + "|" + str(row[3])+ "|" + str(row[1])+ "|" + "2023年独角兽"
# values = r.lrange(key1,0,-1) # item = str(row[2])+ "|"+str(row[4]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2024年独角兽"
# for value in values: # item = str(row[1])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[0])+ "|" + "2022年世界500强"
# r.rpush(key2, value) # item = str(row[1])+ "|"+str(row[3]) + "|" + str(row[5])+ "|" + str(row[6])+ "|" + "2023年世界500强"
# # item = ""+ "|"+str(row[3]) + "|" + str(row[2])+ "|" + str(row[1])+ "|" + "2023年欧盟2500"
# # 关闭Redis连接 item = str(row[2])+ "|"+str(row[5]) + "|" + str(row[4])+ "|" + str(row[1])+ "|" + "2022年欧盟2500"
# r.close() redis_client.rpush('GOOGLE_KEYWORDS:COMPANY_NAME', item)
# redis_client.rpush('BAIDU_KEYWORDS:COMPANY_NAME', item)
print(item)
list_all = [] # break
if list_all: # 关闭Excel文件
print(len(list_all)) workbook.close()
else:
print('---')
...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count): ...@@ -261,6 +261,7 @@ def getPageData(dic_url, page, dic_user_count):
return True, dic_user_count return True, dic_user_count
# 修改token使用时间 # 修改token使用时间
updateTokeen(token, 3) updateTokeen(token, 3)
pagecount = json_search['app_msg_cnt'] # 837
# 保存数据到数据库 # 保存数据到数据库
return insertWxList(dic_url, json_search, page, user_name), dic_user_count return insertWxList(dic_url, json_search, page, user_name), dic_user_count
...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count): ...@@ -280,6 +281,7 @@ def getWxList(infoSourceCode, dic_user_count):
origin = dic_url['name'] origin = dic_url['name']
biz = dic_url['biz'] biz = dic_url['biz']
# retFlag, dic_user_count = getPageData(dic_url, 1, dic_user_count)
for page in range(1, 6): for page in range(1, 6):
retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count) retFlag, dic_user_count = getPageData(dic_url, page, dic_user_count)
time.sleep(random.randint(60, 181)) time.sleep(random.randint(60, 181))
...@@ -311,12 +313,12 @@ def getnumber_redis(): ...@@ -311,12 +313,12 @@ def getnumber_redis():
if __name__ == "__main__": if __name__ == "__main__":
getFromSql() # getFromSql()
# numbers = getnumber_redis() # numbers = getnumber_redis()
# log.info("当前批次采集公众号个数{}".format(numbers)) # log.info("当前批次采集公众号个数{}".format(numbers))
# time.sleep(3) # time.sleep(3)
# dic_user_count = {} dic_user_count = {}
# # dic_user_count = { # # dic_user_count = {
# # 'name': '', # # 'name': '',
# # 'use_count': 0, # # 'use_count': 0,
...@@ -344,5 +346,5 @@ if __name__ == "__main__": ...@@ -344,5 +346,5 @@ if __name__ == "__main__":
# for key, value in dic_user_count.items(): # for key, value in dic_user_count.items():
# log.info(f"====账号{key},使用次数{value}") # log.info(f"====账号{key},使用次数{value}")
# # break # # break
# # infoSourceCode = 'IN-20220917-0159' infoSourceCode = 'IN-20231110-0003'
# # getWxList(infoSourceCode) getWxList(infoSourceCode, dic_user_count)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论