提交 4c5b1a70 作者: XveLingKun

0906

上级 255f8c19
...@@ -2,6 +2,13 @@ ...@@ -2,6 +2,13 @@
<project version="4"> <project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false"> <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData> <serverData>
<paths name="root@114.115.141.81:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.115.141.81:22 (2)"> <paths name="root@114.115.141.81:22 (2)">
<serverdata> <serverdata>
<mappings> <mappings>
...@@ -16,6 +23,13 @@ ...@@ -16,6 +23,13 @@
</mappings> </mappings>
</serverdata> </serverdata>
</paths> </paths>
<paths name="root@114.116.49.86:22 (2)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
<paths name="root@114.116.54.108:22"> <paths name="root@114.116.54.108:22">
<serverdata> <serverdata>
<mappings> <mappings>
......
...@@ -7,16 +7,17 @@ import pymongo ...@@ -7,16 +7,17 @@ import pymongo
from bson import ObjectId from bson import ObjectId
from openpyxl import Workbook, load_workbook from openpyxl import Workbook, load_workbook
from base.BaseCore import BaseCore import sys
sys.path.append('../../base')
baseCore = BaseCore() import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger() log = baseCore.getLogger()
cnx = baseCore.cnx cnx = baseCore.cnx
cursor = baseCore.cursor cursor = baseCore.cursor
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息'] '天眼查登录信息']
db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[ db_storage2 = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'股东信息0621'] '最大股东信息0902']
class File(): class File():
...@@ -160,10 +161,12 @@ class Info(): ...@@ -160,10 +161,12 @@ class Info():
def update_holder(self, no, dic_info): def update_holder(self, no, dic_info):
db_storage2.update_one({'序号': str(no)}, {'$set': {'最大持股名称': dic_info['最大持股名称'], '持股比例': dic_info['持股比例'], '企业标签': dic_info['企业标签']}}) db_storage2.update_one({'序号': str(no)}, {'$set': {'最大持股名称': dic_info['最大持股名称'], '持股比例': dic_info['持股比例'], '企业标签': dic_info['企业标签']}})
pass pass
def update_info(self, no, dic_info): def update_info(self, no, dic_info):
db_storage2.update_one({'序号': str(no)}, { db_storage2.update_one({'序号': str(no)}, {
'$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}}) '$set': {'股东企业信用代码': dic_info['股东企业信用代码'], '股东企业标签': dic_info['股东企业标签']}})
pass pass
def insert_into(self, dic_info): def insert_into(self, dic_info):
if dic_info['股东序号序号']: if dic_info['股东序号序号']:
...@@ -179,6 +182,16 @@ class Info(): ...@@ -179,6 +182,16 @@ class Info():
print(result) print(result)
pass pass
def bigshearholder_insert(self,dic_info):
insertion_result = db_storage2.insert_one(dic_info)
inserted_id = insertion_result.inserted_id
return inserted_id
def bigupdate_info(self, no, dic_info):
db_storage2.update_one({'企业信用代码(中国内地企业需填写信用代码)': str(no)}, {
'$set': {'最大持股企业信用代码': dic_info['最大持股企业信用代码'], '最大持股企业标签': dic_info['最大持股企业标签']}})
pass
from selenium import webdriver from selenium import webdriver
class Driver(): class Driver():
......
...@@ -26,7 +26,7 @@ if __name__ == "__main__": ...@@ -26,7 +26,7 @@ if __name__ == "__main__":
name = input('所属用户:') name = input('所属用户:')
driver = create_driver() driver = create_driver()
driver.get(url) driver.get(url)
time.sleep(60) time.sleep(80)
cookies = driver.get_cookies() cookies = driver.get_cookies()
# print(driver.get_cookies()) # print(driver.get_cookies())
......
...@@ -180,32 +180,26 @@ def doJob(): ...@@ -180,32 +180,26 @@ def doJob():
continue continue
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息 # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('shareHolderInfo') item = baseCore.redicPullData('shareHolderInfo')
# item = '900|微创心律管理|None|罗七一|健康科技|¥ 90 亿|¥ 90 亿|¥ 92 亿|823|861|911|ZZSN231108150127681|MicroPort Cardiac Rhythm Management International Limited|中国|None' # item = '91310115MA1HB3LY4M|上海商汤科技开发有限公司'
# 判断 如果Redis中已经没有数据,则等待 # 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C' # social_code = '91110108780992804C'
if item == None: if item == None:
time.sleep(30 * 60) time.sleep(30 * 60)
continue continue
start = time.time() start = time.time()
no = item.split('|')[0] # no = item.split('|')[0]
social_code = item.split('|')[11] # social_code = item.split('|')[11]
social_code = item.split('|')[0]
com_name = item.split('|')[1]
recept_name = item.split('|')[12] # recept_name = item.split('|')[12]
dic_info = {"序号": item.split('|')[0], dic_info = {"序号": item.split('|')[0],
"企业名称(榜单公布)": item.split('|')[1], "企业信用代码(中国内地企业需填写信用代码)": social_code,
"企业别称": item.split('|')[2], "企业名称(企查查/天眼查)": com_name
"门人/联合创始": item.split('|')[3],
"行业": item.split('|')[4],
"企业估值(2022年)": item.split('|')[5],
"企业估值(2023年)": item.split('|')[6],
"企业估值(2024年)": item.split('|')[7],
"2022年独角兽排名": item.split('|')[8],
"2023年独角兽排名": item.split('|')[9],
"2024年独角兽排名": item.split('|')[10],
"企业信用代码(中国内地企业需填写信用代码)": item.split('|')[11],
"企业名称(企查查)": item.split('|')[12],
"所属国家": item.split('|')[13]
} }
"""
最大持股企业、最大持股企业原文名称、最大持股企业所属国家、持股比例、最大持股企业信用代码、最大持股企业标签
"""
if "ZZSN" in social_code: if "ZZSN" in social_code:
dic_info['前十大股东名称'] = '' dic_info['前十大股东名称'] = ''
dic_info['持股比例'] = '' dic_info['持股比例'] = ''
...@@ -244,7 +238,7 @@ def doJob(): ...@@ -244,7 +238,7 @@ def doJob():
tycid = '' tycid = ''
if tycid == None or tycid == '': if tycid == None or tycid == '':
try: try:
retData = getTycIdByXYDM(recept_name, s) retData = getTycIdByXYDM(com_name, s)
# retData = getTycIdByXYDM("极星汽车销售有限公司", s) # retData = getTycIdByXYDM("极星汽车销售有限公司", s)
if retData['state']: if retData['state']:
tycid = retData['tycData']['id'] tycid = retData['tycData']['id']
......
"""采集最大股东信息的相关信息"""
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from getTycId import getTycIdByXYDM
import sys
sys.path.append('../../base')
import BaseCore
baseCore = BaseCore.BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/股东信息'
from classtool import Token, Info
token = Token()
Info = Info()
@retry(tries=3, delay=1)
def get_html(tycid, driver, dic_info):
    """Open the tianyancha company page for *tycid* and enrich *dic_info*.

    Fills in the credit code ('最大持股企业信用代码') and the filtered tag list
    ('最大持股企业标签') parsed from the page's __NEXT_DATA__ JSON, then
    returns the same dict. Retried up to 3 times on any parsing failure.
    """
    driver.get(url=f"https://www.tianyancha.com/company/{tycid}")
    time.sleep(3)  # give the SPA time to render
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dic_info['最大持股企业信用代码'] = soup.find(
        'span', attrs={'class': 'index_detail-credit-code__fH1Ny'}).text
    next_data = json.loads(soup.find('script', attrs={'id': '__NEXT_DATA__'}).text)
    raw_tags = next_data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']['tagListV2']
    # Drop boilerplate/status tags and red (risk-colored, '#FF463C') tags.
    skip_titles = ['存续', '曾用名', '竞争风险', '司法案件', '合作风险', '股权出质', '仍注册']
    dic_info['最大持股企业标签'] = [
        tag['title'] for tag in raw_tags
        if tag['title'] not in skip_titles and tag['color'] != '#FF463C'
    ]
    return dic_info
@retry(tries=5, delay=3)
def get_page(url, s, headers):
    """GET *url* through a fresh proxy and return (total, json_payload).

    *total* is data['data']['total'] from the JSON response. Any non-200
    status or unexpected response shape raises, which triggers @retry.
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: a bare `raise` outside an except clause raises
        # RuntimeError("No active exception to re-raise"); raise a
        # descriptive error so retry logs show the real cause.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    data_page = res.json()
    # log.info(f'接口获取总数---{data_page}')
    try:
        total_page_ = data_page['data']['total']
    except (KeyError, TypeError):
        # Response JSON did not have the expected shape; let @retry try again.
        raise
    return total_page_, data_page
@retry(tries=5, delay=3)
def get_page1(url, s, headers):
    """GET *url* through a fresh proxy and return (total, json_payload).

    Like get_page, but reads data['data']['stockHolder']['total'] — the
    shareholder-specific endpoint shape. Failures raise and trigger @retry.
    """
    ip = baseCore.get_proxy()
    res = s.get(url=url, headers=headers, proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: bare `raise` outside an except clause would raise
        # RuntimeError instead of a meaningful error.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    data_page = res.json()
    # log.info(f'接口获取总数---{data_page}')
    try:
        total_page_ = data_page['data']['stockHolder']['total']
    except (KeyError, TypeError):
        # Unexpected response shape; let @retry try again.
        raise
    return total_page_, data_page
@retry(tries=5, delay=3)
def post_page(url, s, headers, payload):
    """POST *payload* (JSON-encoded) to *url* and return (total, json_payload).

    *total* is data['data']['total'] from the JSON response. Any non-200
    status or unexpected response shape raises, which triggers @retry.
    """
    ip = baseCore.get_proxy()
    res = s.post(url=url, headers=headers, data=json.dumps(payload), proxies=ip, timeout=(5, 10))
    if res.status_code != 200:
        # BUG FIX: bare `raise` outside an except clause would raise
        # RuntimeError instead of a meaningful error.
        raise requests.HTTPError(f'unexpected status {res.status_code} for {url}')
    json_info = res.json()
    try:
        total_page_ = json_info['data']['total']
    except (KeyError, TypeError):
        # Unexpected response shape; let @retry try again.
        raise
    return total_page_, json_info
from selenium import webdriver
def create_driver():
    """Start a maximized Microsoft Edge session using the local msedgedriver binary."""
    driver_path = r'D:\soft\msedgedriver.exe'
    capabilities = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [],
            # launch the browser maximized
            "args": ["--start-maximized"],
        },
    }
    return webdriver.Edge(executable_path=driver_path, capabilities=capabilities)
def login(driver):
    """Load a stored account cookie set into *driver* and build a matching requests.Session.

    Returns (driver, id_cookie, session). When no account cookies are
    available it sleeps 30 minutes and returns (driver, '', '') so the
    caller can detect the empty id_cookie and retry.
    """
    cookies_list, id_cookie, user_name = token.get_cookies()
    if cookies_list:
        pass
    else:
        log.info("没有账号了,等待30分钟")
        time.sleep(30 * 60)
        # BUG FIX: previously returned ('', '', ''), which made the caller's
        # `driver, id_cookie, s = login(driver)` overwrite its driver with ''
        # and crash on the next iteration. Return the driver unchanged.
        return driver, '', ''
    log.info(f'=====当前使用的是{user_name}的cookie======')
    for cookie in cookies_list:
        driver.add_cookie(cookie)
    time.sleep(3)
    driver.refresh()
    time.sleep(3)
    # Mirror the browser cookies into a requests session for API calls.
    cookies = {cookie['name']: cookie['value'] for cookie in cookies_list}
    s = requests.Session()
    s.cookies.update(cookies)
    return driver, id_cookie, s
def doJob():
    """Main crawl loop: pull items from Redis and collect largest-shareholder info.

    Each Redis item has the form 'no|company_name|tycid' (see the example
    below). For every item it logs in with a rotating cookie, scrapes the
    tianyancha company page via get_html, and persists the result through
    Info.bigupdate_info. Failed items are pushed back into Redis.
    """
    # for social_code in social_code_list:
    driver = create_driver()
    url = 'https://www.tianyancha.com/'
    driver.get(url)
    driver.maximize_window()
    # Bounded loop instead of `while True` so the process eventually exits.
    for i in range(1000):
        # while True:
        # todo: set up cookie usage
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'version': 'TYC-Web'
        }
        # Rotate to the next available account cookie; skip this round if none.
        driver, id_cookie, s = login(driver)
        if id_cookie:
            pass
        else:
            continue
        # Fetch the next work item (credit code based) from Redis.
        item = baseCore.redicPullData('BigShareHolder:comname')
        dic_info = {}
        # item = '91310115MA1HB3LY4M|上海阡伦科技有限公司|3476165132'
        # If Redis has no more data, wait 30 minutes before polling again.
        # Big_item = inserted_id + "|" + shareHolderName + "|" + big_tycid
        if item == None:
            time.sleep(30 * 60)
            continue
        start = time.time()
        no = item.split('|')[0]
        # todo: locate this record by its credit code
        tycid = item.split('|')[2]
        com_name = item.split('|')[1]
        try:
            # Missing tianyancha id: resolve it from the company name first.
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name, s)
                    # retData = getTycIdByXYDM("极星汽车销售有限公司", s)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        xydm = retData['tycData']['taxCode']
                    else:
                        # Lookup failed: record the failure and park the item
                        # in the error queue.
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(com_name, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={com_name}====重新放入redis====')
                        baseCore.rePutIntoR('BigShareHolder:Error', item)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(com_name, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('BigShareHolder:Error', item)
                    continue
            log.info(f"---{com_name}----{tycid}----开始采集股东信息")
            try:
                dic_info = get_html(tycid, driver, dic_info)
                charge = 0
            # get_html's three retries all failed
            except:
                charge = -1
            if charge == -1:
                # Mark the cookie as suspect and requeue the item.
                token.updateTokeen(id_cookie, 3)
                baseCore.rePutIntoR('BigShareHolder:comname', item)
                log.info(f"---{com_name}----{tycid}----请求失败----重新放入redis")
                time.sleep(3)
                continue
            else:
                t = int(time.time() * 1000)
                # Persist the scraped credit code and tags for record `no`.
                Info.bigupdate_info(no, dic_info)
        except Exception as e:
            token.updateTokeen(id_cookie, 3)
            # token.updateTokeen(id_cookie, 2)
            log.info(f'==={com_name}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # Push the item back into Redis so it is retried later.
            baseCore.rePutIntoR('BigShareHolder:comname', item)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(com_name, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # df_img = pd.DataFrame(list_all_2)
    # df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
doJob()
\ No newline at end of file
...@@ -74,6 +74,7 @@ if __name__ == "__main__": ...@@ -74,6 +74,7 @@ if __name__ == "__main__":
# loadinfo = [token,cookies] # loadinfo = [token,cookies]
# 保存到数据库中 # 保存到数据库中
# insert = f"insert into weixin_tokenCookies_person (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
insert = f"insert into weixin_tokenCookies (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())" insert = f"insert into weixin_tokenCookies (token,cookies,create_time,fenghao_time,user_name,update_time) values ('{token}','{escape_string(cookies)}',now(),DATE_SUB(NOW(), INTERVAL 1 DAY),'{user_name}',now())"
cursor_.execute(insert) cursor_.execute(insert)
cnx_.commit() cnx_.commit()
......
import pandas as pd import pandas as pd
...@@ -2,20 +2,20 @@ import pandas as pd ...@@ -2,20 +2,20 @@ import pandas as pd
import pymongo import pymongo
# 7649 # 7649
data_list = [] data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['新华丝路-丝路商机100+'] db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='shencai', password='shencai_zzsn008').ZZSN['国务院问答对']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}}) # datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# 导出标签是空的数据 # 导出标签是空的数据
datas = db_stroage.find() datas = db_stroage.find()
link = [] link = []
for data in datas: for data in datas:
del data['_id'] del data['_id']
del data['id'] # del data['id']
# if data['标题'] not in link: if data['问题']:
# data_list.append(data) data_list.append(data)
# link.append(data['标题']) else:
data_list.append(data) continue
# print(data) # print(data)
print(len(data_list)) print(len(data_list))
df = pd.DataFrame(data_list) df = pd.DataFrame(data_list)
df.to_excel('./新华丝路-丝路投资2.xlsx',index=False) df.to_excel('./国务院问答对.xlsx',index=False)
\ No newline at end of file \ No newline at end of file
# 读取表中的数据,转化成list # 读取表中的数据,转化成list
...@@ -44,7 +44,8 @@ def getrequest(href, headers): ...@@ -44,7 +44,8 @@ def getrequest(href, headers):
def classify_report_type(title): def classify_report_type(title):
if "年年度报告" in title or re.match(r'\d{4}年度报告', title): type_pattern = r'(.*?)\d{4}年?(年度财务报告|年报|年度报告)'
if "年年度报告" in title or re.match(type_pattern, title):
return "年度报告" return "年度报告"
elif "半年" in title: elif "半年" in title:
return "半年度报告" return "半年度报告"
...@@ -95,15 +96,16 @@ def parase(com_name, social_code, dataJson): ...@@ -95,15 +96,16 @@ def parase(com_name, social_code, dataJson):
"报告年份": year "报告年份": year
} }
db_storage2.insert_one(dic_info) db_storage2.insert_one(dic_info)
time.sleep(1) time.sleep(2)
if __name__ == "__main__": if __name__ == "__main__":
dataList = getcomlist(file_path, sheet_name) dataList = getcomlist(file_path, sheet_name)
# print(dataList) # print(dataList)
for item in enumerate(dataList): for item in enumerate(dataList):
social_code = item[1] # print(item)
com_name = item[2] social_code = item[1][1]
com_name = item[1][2]
print(f"正在采集:{com_name}") print(f"正在采集:{com_name}")
href = url.format(com_name, 1) href = url.format(com_name, 1)
dataJson = getrequest(href, headers) dataJson = getrequest(href, headers)
...@@ -116,5 +118,5 @@ if __name__ == "__main__": ...@@ -116,5 +118,5 @@ if __name__ == "__main__":
href_page = url.format(com_name, page) href_page = url.format(com_name, page)
dataJson_page = getrequest(href_page, headers) dataJson_page = getrequest(href_page, headers)
parase(com_name, social_code, dataJson_page) parase(com_name, social_code, dataJson_page)
time.sleep(2) time.sleep(5)
++ "b/\345\233\275\345\212\241\351\231\242\351\227\256\347\255\224\345\257\271\345\244\204\347\220\206/qa\351\200\211\347\231\273.py"
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论