丁双波 / zzsn_spider / Commits / 862e97ab

Commit 862e97ab, authored 2024-01-31 by 薛凌堃
Commit message: 1/31
Parent commit: 1d1053c8
Showing 8 changed files with 878 additions and 259 deletions (+878 -259)
comData/Tyc/CorePerson.py        +205 -77
comData/Tyc/CorePerson2.py       +220 -122
comData/YanBao/resentYanbao.py   +37  -35
comData/dingzhi/dfsm_sasac.py    +145 -0
comData/dingzhi/gzyw_sasac.py    +157 -0
comData/dingzhi/zzcx.py          +52  -0
comData/policylaw/ClassTool.py   +2   -1
test.py                          +60  -24
comData/Tyc/CorePerson.py @ 862e97ab

@@ -2,32 +2,99 @@
#先采集天眼查id,再通过id采集核心人员信息
import datetime
import json
import os
import subprocess
import sys
import requests, time, random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()

headers = {
    'Cookie': 'HWWAFSESID=38a70202d86311cd90f; HWWAFSESTIME=1706662296323; jsid=SEO-BING-ALL-SY-000001; TYCID=e35f3910bfd211eeac66555a29ade465; ssuid=6800091776; sajssdk_2015_cross_new_user=1; csrfToken=e85dxv9-DXNUkQ7yuzIgZrbs; bannerFlag=true; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1706662300; _ga=GA1.2.1071312772.1706662301; _gid=GA1.2.1602571847.1706662301; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103126138%22%2C%22userId%22:%22304029617%22}; tyc-user-info-save-time=1706662339304; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyNjEzOCIsImlhdCI6MTcwNjY2MjMzOCwiZXhwIjoxNzA5MjU0MzM4fQ.z9cOzr0YWyU_rxTZNn8ojsxfMAdre4NbQLzwgKAGdI-CCcfPvuBBrL4tFP5HmR5pDv204e4P4k4Ll4kKPhBQTg; tyc-user-phone=%255B%252217103126138%2522%255D; searchSessionId=1706667106.29658260; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22304029617%22%2C%22first_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMThkNWQwMDA5ZTgxNTMtMDFjNzlhNGQ2NWEwOWY5LTRjNjU3YjU4LTkyMTYwMC0xOGQ1ZDAwMDllOTE0ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNDAyOTYxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22304029617%22%7D%2C%22%24device_id%22%3A%2218d5d0009e8153-01c79a4d65a09f9-4c657b58-921600-18d5d0009e914e%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1706667529',
    # 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}

cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
ip_num = 0


def get_proxy(ip_num):
    sql = "select proxy from clb_proxy"
    cursor_.execute(sql)
    proxy_lists = cursor_.fetchall()
    cnx_.commit()
    ip_list = []
    for proxy_ in proxy_lists:
        ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
    proxy_list = []
    for str_ip in ip_list:
        str_ip_list = str_ip.split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": str_ip_list[0],
            "port": str_ip_list[1],
        }
        proxy = {
            "http": proxyMeta,
            "https": proxyMeta
        }
        proxy_list.append(proxy)
    return proxy_list[ip_num]


@retry(tries=3, delay=1)
def get_html(tycid, ip_num):
    url = f"https://www.tianyancha.com/company/{tycid}"
    ip = get_proxy(ip_num)
    response = requests.get(url=url, headers=headers, proxies=ip)
    if response.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url, ip_num):
    ip = get_proxy(ip_num)
    res = requests.get(url=url, headers=headers, proxies=ip)
    if res.status_code == 200:
        pass
    else:
        ip_num += 1
        raise
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        # social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        # social_code = '9135020056842712XB'
        social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue
...
@@ -35,15 +102,29 @@ def doJob():
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                #数据重新塞入redis
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)
...
@@ -58,28 +139,111 @@ def doJob():
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                #todo:先确定接口走哪个
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                try:
                    url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                    charge = get_html(tycid, ip_num)
                except Exception as e:
                    charge = -1
                    log.info(e)
                total_page = 0
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    # 获取当前进程pid
                    current_pid = baseCore.getPID()
                    # todo: 重新启动新进程,杀死当前进程
                    subprocess.Popen([sys.executable] + sys.argv)
                    os.kill(current_pid, 9)
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1, ip_num)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2, ip_num)
                    except:
                        total_page2 = 0
                    time.sleep(2)
                    try:
                        total_page3 = get_page(url3, ip_num)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    continue
                #todo:获取页数
                time.sleep(2)
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        ip = baseCore.get_proxy()
                        # res = requests.get(url,headers=headers,proxies=ip) # ,verify=False
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers, proxies=ip)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']
...
@@ -135,15 +299,7 @@ def doJob():
                            num = num + 1
                            list_one_info.append(dic_json)
                            # list_all_2.append(dic_json_img)
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
...
@@ -153,8 +309,13 @@ def doJob():
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
...
@@ -166,53 +327,20 @@ def doJob():
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
...
@@ -229,29 +357,29 @@ def doJob():
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
...
@@ -259,7 +387,7 @@ def doJob():
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    continue
...
@@ -272,7 +400,7 @@ def doJob():
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
...
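A short, self-contained sketch of the retry-plus-proxy-rotation pattern that get_html/get_page rely on above: the retry decorator re-runs the function on any exception, and a non-200 response bumps a counter so the next attempt can pick a different proxy. The proxy addresses, the URL and the shared counter below are illustrative assumptions, not values from this repository (where the proxy list comes from the clb_proxy table):

import requests
from retry import retry

PROXIES = [  # placeholder proxies for the sketch only
    {"http": "http://127.0.0.1:8001", "https": "http://127.0.0.1:8001"},
    {"http": "http://127.0.0.1:8002", "https": "http://127.0.0.1:8002"},
]
attempt = {"n": 0}  # shared counter so each retry moves on to the next proxy

@retry(tries=3, delay=1)
def fetch(url):
    proxy = PROXIES[attempt["n"] % len(PROXIES)]
    resp = requests.get(url, proxies=proxy, timeout=10)
    if resp.status_code != 200:
        attempt["n"] += 1  # rotate before the decorator retries
        raise RuntimeError(f"HTTP {resp.status_code}")
    return resp

# fetch('https://www.example.com')  # gives up after three failed attempts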
comData/Tyc/CorePerson2.py @ 862e97ab

@@ -7,6 +7,8 @@ import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM

baseCore = BaseCore()
...
@@ -19,77 +21,207 @@ headers = {
}
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_

list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员'
requests.adapters.DEFAULT_RETRIES = 5
ip_num = 0


@retry(tries=3, delay=1)
def get_html(tycid):
    url = f"https://www.tianyancha.com/company/{tycid}"
    # ip = baseCore.get_proxy()
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        pass
    else:
        raise
        # return -1
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        tmp_field = soup.find('div', class_='dim-tab-root').find('span').text
        if '最新公示' in tmp_field:
            total = soup.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
            return int(total)
        else:
            return 0
    except:
        return 0


@retry(tries=3, delay=1)
def get_page(url):
    # ip = baseCore.get_proxy()
    res = requests.get(url=url, headers=headers)
    time.sleep(1)
    total_page_ = res.json()['data']['total']
    return total_page_


def doJob():
    while True:
        # 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
        social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
        # 判断 如果Redis中已经没有数据,则等待
        # social_code = '91320691550279691N'
        if social_code == None:
            time.sleep(20)
            continue
        if 'ZZSN' in social_code:
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                id = data[0]
                com_name = data[1]
                xydm = data[2]
                tycid = data[11]
                count = data[17]
            else:
                # 数据重新塞入redis
                # log.info(f'数据库中无该企业{social_code}')
                sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
                cursor.execute(sql)
                data = cursor.fetchone()
                id = data[0]
                com_name = data[3]
                xydm = data[1]
                conut = 0
                # 写入数据库
                insert = "INSERT INTO EnterpriseInfo(com_name, xydm, social_credit_code) VALUES (%s,%s,%s)"
                cursor_.execute(insert, (com_name, xydm, social_code))
                cnx_.commit()
                tycid = ''
                # baseCore.rePutIntoR('CorPersonEnterpriseNone:gnqy_socialCode', social_code)
                # continue
            # id = data[0]
            # com_name = data[1]
            # xydm = data[2]
            # tycid = data[11]
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(com_name)
                    if retData['state']:
                        tycid = retData['tycData']['id']
                        # todo:写入数据库
                        updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
                        cursor_.execute(updateSql)
                        cnx_.commit()
                    else:
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
            list_one_info = []
            num = 1
            for page in range(1, 2):
                t = int(time.time() * 1000)
                # todo:先确定接口走哪个
                #https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_=1692929256462&gid=209370942&pageSize=20&pageNum=1
                try:
                    url = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum={page}'
                    charge = get_html(tycid)
                except Exception as e:
                    charge = -1
                    log.info(e)
                t = int(time.time() * 1000)
                if charge == -1:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====页面请求失败===重新放入redis====')
                    log.info(f"{id}---{xydm}----{tycid}----请求失败")
                    continue
                elif charge == 0:
                    log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
                    url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page1 = get_page(url1)
                    except:
                        total_page1 = 0
                    url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
                    total_page = total_page1
                    flag = 2
                else:
                    log.info(f"{id}---{xydm}----{tycid}----有最新公示")
                    url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
                    try:
                        total_page2 = get_page(url2)
                    except:
                        total_page2 = 0
                    time.sleep(1)
                    try:
                        total_page3 = get_page(url3)
                    except:
                        total_page3 = 0
                    if total_page2 == charge:
                        url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                        total_page = total_page2
                        flag = 1
                    else:
                        if total_page3 == charge:
                            url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
                            total_page = total_page3
                            flag = 3
                        else:
                            total_page = 0
                            flag = 0
                            log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                            continue
                if total_page == 0:
                    # 重新塞入redis
                    baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                    log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
                    break
                # todo:获取页数
                for page in range(1, int((total_page / 20) + 1) + 1):
                    for c in range(3):
                        # ip = baseCore.get_proxy()
                        url_ = url.format(t, tycid, page)
                        res = requests.get(url_, headers=headers)  # ,verify=False
                        time.sleep(1)
                        if res.status_code == 200:
                            break
                        else:
                            if c == 2:
                                res = ''
                                break
                            continue
                    if res:
                        pass
                    else:
                        # 重新塞入redis
                        baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
                        log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
                        continue
                    try:
                        list_all = res.json()['data']['dataList']
                    except:
                        list_all = res.json()['data']['result']
                    if list_all:
                        pass
                    else:
                        log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
                    if flag == 1:
                        for one_info in list_all:
                            name = one_info['name']
                            sex = one_info['sex']
                            education = one_info['education']
                            position = one_info['position']
                            Salary = one_info['salary']
                            #todo:获取当前年份
                            now = datetime.datetime.now()
                            year = now.year
                            try:
...
@@ -105,47 +237,40 @@ def doJob():
                            except:
                                person_img = '--'
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            dic_json_img = {
                                "socialCreditCode": social_code,
                                "name": name,
                                "sex": sex,
                                "education": education,
                                "position": position,
                                "salary": Salary,
                                "birthYear": birthYear,
                                "shareNum": StockKeepings,
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm,
                                "personInfo": personInfo,
                                "头像": person_img,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                            # list_all_2.append(dic_json_img)
                    elif flag == 3:
                        for one_info in list_all:
                            name = one_info['personal_name']
                            try:
...
@@ -155,8 +280,13 @@ def doJob():
                            education = ''
                            position = one_info['position_name']
                            Salary = ''
                            try:
                                birthYear = one_info['year_of_birth']
                            except:
                                birthYear = ''
                            personInfo = one_info['resume_cn']
                            timestamp = int(int(one_info['employ_date']) / 10000)
                            currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
                            dic_json = {
                                "socialCreditCode": social_code,
                                "name": name,
...
@@ -168,59 +298,24 @@ def doJob():
                                "shareNum": '',
                                "shareRatio": '',
                                "benefitShare": '',
                                "currentTerm": currentTerm + '至-',
                                "personInfo": personInfo,
                                "sort": str(num)
                            }
                            num = num + 1
                            list_one_info.append(dic_json)
                    else:
                        try:
                            list_all = res.json()['data']['result']
                        except Exception as e:
                            log.info(res.json())
                            continue
                        # todo:增加一种情况
                        if list_all:
                            for one_info in list_all:
                                name = one_info['name']
                                try:
                                    position = one_info['typeSore']
                                except:
                                    position = ''
                                person_id = one_info['id']
                                person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
                                # person_res = requests.get(person_url, headers=headers, proxies=ip)
                                person_res = requests.get(person_url, headers=headers)
                                person_soup = BeautifulSoup(person_res.content, 'html.parser')
                                try:
                                    personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
...
@@ -233,29 +328,29 @@ def doJob():
                                dic_json = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "sort": str(num)
                                }
                                dic_json_img = {
                                    "socialCreditCode": social_code,
                                    "name": name,
                                    "sex": '',
                                    "education": '',
                                    "position": position,
                                    "salary": '',
                                    "birthYear": '',
                                    "shareNum": '',
                                    "shareRatio": '',
                                    "benefitShare": '',
                                    "currentTerm": '',
                                    "personInfo": personInfo,
                                    "头像": person_img,
...
@@ -263,25 +358,28 @@ def doJob():
                                }
                                num = num + 1
                                list_one_info.append(dic_json)
                # print(list_one_info)
                json_updata = json.dumps(list_one_info)
                if json_updata == '[]':
                    log.indo(f'---{social_code}---无高管信息---')
                    continue
                else:
                    pass
                response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300, verify=False)
                print(response.text)
                log.info('=========成功======')
        except Exception as e:
            log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
            log.info(e)
            # 重新塞入redis
            baseCore.rePutIntoR('CorPersonEnterpriseError:gnqy_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # df_img = pd.DataFrame(list_all_2)
    # df_img.to_excel('企业主要人员-头像.xlsx',index=False)


if __name__ == "__main__":
...
comData/YanBao/resentYanbao.py
浏览文件 @
862e97ab
...
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
...
@@ -160,6 +160,7 @@ def uptoOBS(pdf_url, name_pdf, type_id, pathType, header):
break
break
except
Exception
as
e
:
except
Exception
as
e
:
time
.
sleep
(
3
)
time
.
sleep
(
3
)
log
.
info
(
e
)
continue
continue
if
page_size
<
1
:
if
page_size
<
1
:
...
@@ -206,7 +207,8 @@ def download(data, order_by,header):
...
@@ -206,7 +207,8 @@ def download(data, order_by,header):
come
=
data
[
'come'
]
come
=
data
[
'come'
]
except
:
except
:
come
=
''
come
=
''
if
publishDate
<
'2024-01-29'
:
return
tf_url
=
add_check_url
(
sourceAddress
)
tf_url
=
add_check_url
(
sourceAddress
)
if
tf_url
:
if
tf_url
:
dic_result
=
{
dic_result
=
{
...
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
...
@@ -1726,12 +1728,12 @@ if __name__ == '__main__':
# qianyanzhishiku()
# qianyanzhishiku()
# except Exception as e:
# except Exception as e:
# pass
# pass
try
:
#
try:
log
.
info
(
'shijiejingjiluntan'
)
#
log.info('shijiejingjiluntan')
shijiejingjiluntan
()
#
shijiejingjiluntan()
except
Exception
as
e
:
#
except Exception as e:
log
.
info
(
e
)
#
log.info(e)
pass
#
pass
# try:
# try:
# log.info('dongfangcaifu')
# log.info('dongfangcaifu')
# dongfangcaifu()
# dongfangcaifu()
...
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
...
@@ -1749,31 +1751,31 @@ if __name__ == '__main__':
# except Exception as e:
# except Exception as e:
# log.info(e)
# log.info(e)
# pass
# pass
#
#
try:
try
:
#
log.info('dongfangcaifu4')
log
.
info
(
'dongfangcaifu4'
)
#
dongfangcaifu4()
dongfangcaifu4
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu5')
log
.
info
(
'dongfangcaifu5'
)
#
dongfangcaifu5()
dongfangcaifu5
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu6')
log
.
info
(
'dongfangcaifu6'
)
#
dongfangcaifu6()
dongfangcaifu6
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
#
#
try:
try
:
#
log.info('dongfangcaifu7')
log
.
info
(
'dongfangcaifu7'
)
#
dongfangcaifu7()
dongfangcaifu7
()
#
except Exception as e:
except
Exception
as
e
:
#
log.info(e)
log
.
info
(
e
)
#
pass
pass
comData/dingzhi/dfsm_sasac.py (new file, 0 → 100644) @ 862e97ab

import requests
import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


def two_dfsm_mtgc():
    info_list = []
    """
    地方扫描
    """
    url_list = [
        'http://www.sasac.gov.cn/n2588025/n2588129/index.html',
        # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
    ]
    for url in url_list:
        res = requests.get(url=url, headers=headers)
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        pages = soup.find('td', class_='pages')
        pages_tag = pages['id'].split('pag_')[1]
        pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
        # print(pages)
        # for page in range(378,int(pages)+1):
        for page in range(1, 378):
            log.info(f'==============开始采集第{page}页===============')
            if page == 1:
                url = 'http://www.sasac.gov.cn/n2588025/n2588129/index.html'
            else:
                url = f'http://www.sasac.gov.cn/n2588025/n2588129/index_{pages_tag}_{int(pages)+1-page}.html'
            try:
                res = requests.get(url=url, headers=headers)
            except:
                continue
            res.encoding = res.apparent_encoding
            res_text = res.text
            soup = BeautifulSoup(res_text, 'html.parser')
            li_list = soup.find('span', id=f'comp_{pages_tag}')
            if li_list:
                li_list = li_list.find_all('li')
            else:
                li_list = soup.find_all('li')
            for li in li_list:
                # print(type(li))
                if len(li):
                    a = li.find('a')
                    # print(a)
                    href = a['href']
                    if 'http' in href:
                        href = href
                    else:
                        href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                    # print(href)
                    try:
                        flag = r.sismember('IN-20240129-0019-test', href)
                        if flag:
                            log.info('信息已采集入库过')
                            continue
                        # else:
                        #     log.info(f'未采到----{page}-----{href}')
                        #     continue
                    except Exception as e:
                        continue
                    # href = "http://www.sasac.gov.cn/n2588025/n2588129/c2711101/content.html"
                    try:
                        title = a['title']
                    except:
                        title = ''
                    # print(title)
                    try:
                        res_href = requests.get(url=href, headers=headers, verify=False)
                    except:
                        continue
                    res_href.encoding = res_href.apparent_encoding
                    href_text = res_href.text
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    result = i_soup.find(class_='zsy_cotitle')
                    try:
                        if result:
                            result = result.find('p').text
                            pub_source = result.split('发布时间:')[0].replace('文章来源:', '').strip()
                            pub_time = result.split('发布时间:')[1]
                            # print(pub_source,pub_time)
                            try:
                                i_soup.find('div', id='div_div').decompose()
                                i_soup.find('div', id='qr_container').decompose()
                            except:
                                pass
                            contentWithTag = str(i_soup.find(class_='zsy_comain'))
                            content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                        else:
                            result = i_soup.find(class_='lyshijian').find_all('span')
                            try:
                                pub_source = str(result[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        if title == '':
                            log.info(f'title为空----{page}--{title}--{href}')
                            continue
                        info_code = 'IN-20240129-0019'
                        result_dict = {
                            'id': '',
                            'sid': '1751849444877144065',
                            'title': title,
                            'organ': pub_source,
                            'origin': '国务院国有资产监督管理委员会',
                            # '摘要': zhaiyao,
                            'source': 16,
                            'content': content,
                            'contentWithTag': contentWithTag,
                            'publishDate': pub_time,
                            'sourceAddress': href,
                        }
                        log.info(f'{page}--{title}--{href}')
                        # info_list.append(result_dict)
                        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                        try:
                            kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                            r.sadd(info_code + '-test', href)
                            log.info('发送kafka成功!')
                        except Exception as e:
                            log.info(e)
                        finally:
                            producer.close()
                    except:
                        continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    two_dfsm_mtgc()
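The new sasac.gov.cn spiders above walk a section's archive backwards: the landing page exposes maxPageNum for the list component in an inline script, page 1 stays index.html, and page N maps to index_<pages_tag>_<maxPageNum+1-N>.html. A small sketch of that URL arithmetic; the tag and page count below are made-up example values rather than scraped ones:

def archive_urls(base, pages_tag, max_page_num, first_n=3):
    # page 1 is the landing page; older pages count down from max_page_num
    urls = []
    for page in range(1, first_n + 1):
        if page == 1:
            urls.append(f'{base}/index.html')
        else:
            urls.append(f'{base}/index_{pages_tag}_{max_page_num + 1 - page}.html')
    return urls

print(archive_urls('http://www.sasac.gov.cn/n2588025/n2588129', '4278129', 377))
# ['.../index.html', '.../index_4278129_376.html', '.../index_4278129_375.html']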
comData/dingzhi/gzyw_sasac.py (new file, 0 → 100644) @ 862e97ab

import json
import sys
import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
}


#国资要闻
def gzyw():
    info_list = []
    url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
    res = requests.get(url=url, headers=headers)
    res.encoding = res.apparent_encoding
    res_text = res.text
    soup = BeautifulSoup(res_text, 'html.parser')
    # pages = soup.find('td',id='pag_4278129')
    pages = soup.find('td', class_='pages')
    pages_tag = pages['id'].split('pag_')[1]
    pages = str(pages).split(f'maxPageNum{pages_tag}=')[1].split('";')[0]
    # print(pages)
    for page in range(1, int(pages) + 1):
        log.info(f'==============开始采集第{page}页===============')
        if page == 1:
            url = 'http://www.sasac.gov.cn/n2588025/n2643314/index.html'
        else:
            #http://www.sasac.gov.cn/n2588025/n2643309/index_4278129_131.html
            url = f'http://www.sasac.gov.cn/n2588025/n2643314/index_{pages_tag}_{int(pages)+1-page}.html'
        try:
            res = requests.get(url=url, headers=headers)
        except:
            continue
        res.encoding = res.apparent_encoding
        res_text = res.text
        soup = BeautifulSoup(res_text, 'html.parser')
        li_list = soup.find('span', id=f'comp_{pages_tag}')
        if li_list:
            li_list = li_list.find_all('li')
        else:
            li_list = soup.find_all('li')
        for li in li_list:
            # print(type(li))
            if len(li):
                a = li.find('a')
                # print(a)
                href = a['href']
                if 'http' in href:
                    href = href
                else:
                    href = 'http://www.sasac.gov.cn/' + str(href).replace('../../', '')
                # print(href)
                try:
                    flag = r.sismember('IN-20240129-0002-test', href)
                    if flag:
                        # log.info('信息已采集入库过')
                        continue
                    # else:
                    #     log.info(f'未采到----{page}-----{href}')
                except Exception as e:
                    continue
                try:
                    title = a['title']
                except:
                    title = ''
                # print(title)
                try:
                    res_href = requests.get(url=href, headers=headers, verify=False)
                except:
                    continue
                res_href.encoding = res_href.apparent_encoding
                href_text = res_href.text
                i_soup = BeautifulSoup(href_text, 'html.parser')
                result = i_soup.find(class_='zsy_cotitle')
                try:
                    if result:
                        result_ = result.find('p').text
                        pub_source = result_.split('发布时间:')[0].replace('文章来源:', '').strip()
                        pub_time = result_.split('发布时间:')[1]
                        # print(pub_source,pub_time)
                        if title == '':
                            result.find('p').decompose()
                            title = result.text.strip().replace(' ', '').replace('\n', '').replace('\t', '')
                        try:
                            i_soup.find('div', id='div_div').decompose()
                            i_soup.find('div', id='qr_container').decompose()
                        except:
                            pass
                        contentWithTag = str(i_soup.find(class_='zsy_comain'))
                        content = str(i_soup.find(class_='zsy_comain').text).replace('扫一扫在手机打开当前页', '')
                    else:
                        result = i_soup.find(class_='lyshijian')
                        if result:
                            result_ = result.find_all('span')
                            try:
                                pub_source = str(result_[0]).split('文章来源:')[1].split('</span>')[0].strip()
                                pub_time = str(result_[1]).split('发布时间:')[1].split('</span>')[0].strip()
                            except:
                                pub_time = str(result_[0]).split('发布时间:')[1].split('</span>')[0].strip()
                                pub_source = ''
                            if title == '':
                                result.find('p').decompose()
                                title = result.text.strip()
                            contentWithTag = str(i_soup.find(class_='articlecontent'))
                            content = str(i_soup.find(class_='articlecontent').text)
                        else:
                            result = i_soup.find(class_='pages-date')
                            pub_source = result.find('span').text.replace('来源:', '').strip()
                            pub_time = result.text
                            pub_time = pub_time.split('来源')[0].strip()
                            contentWithTag = str(i_soup.find(class_='pages_content'))
                            content = str(i_soup.find(class_='pages_content').text)
                            # content = str(i_soup.find(class_='articlecontent').text)
                    if title == '':
                        log.info(f'title为空----{page}--{title}--{href}')
                        continue
                    # zhaiyao = HanLP.extractSummary(content,6)
                    info_code = 'IN-20240129-0002'
                    result_dict = {
                        'id': '',
                        'sid': '1751810519211053058',
                        'title': title,
                        'organ': pub_source,
                        'origin': '国务院国有资产监督管理委员会',
                        # '摘要': zhaiyao,
                        'source': 16,
                        'content': content,
                        'contentWithTag': contentWithTag,
                        'publishDate': pub_time,
                        'sourceAddress': href,
                    }
                    log.info(f'{page}--{title}--{href}')
                    # info_list.append(result_dict)
                    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                    try:
                        kafka_result = producer.send("crawlerInfo", json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
                        r.sadd(info_code + '-test', href)
                        log.info('发送kafka成功!')
                    except Exception as e:
                        log.info(e)
                    finally:
                        producer.close()
                except:
                    continue


if __name__ == "__main__":
    r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
    gzyw()
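Both sasac spiders deduplicate on the article URL: a Redis set named after the info_code remembers what has already been pushed to Kafka, and the URL is added to the set only after the producer.send() call goes out. A minimal sketch of that dedup-then-publish step, using placeholder Redis and Kafka endpoints rather than the production addresses used above:

import json
import redis
from kafka import KafkaProducer

r = redis.Redis(host='127.0.0.1', port=6379, db=5)               # placeholder Redis
producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])   # placeholder broker

def publish_once(info_code, href, record):
    if r.sismember(info_code + '-test', href):
        return False                                  # already collected, skip
    producer.send('crawlerInfo', json.dumps(record, ensure_ascii=False).encode('utf8'))
    r.sadd(info_code + '-test', href)                 # remember it only after sending
    return True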
comData/dingzhi/zzcx.py (new file, 0 → 100644) @ 862e97ab

"""
中证智能财讯
"""
import json

import requests
from bs4 import BeautifulSoup


def zzcx():
    url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
    payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Length': '56',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'zycna=VEwasVGF9akBAXuVA58n9CJm',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Origin': 'https://zzcx.cs.com.cn',
        'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
    }
    payload = json.dumps(payload)
    result_json = requests.post(url=url, data=payload, headers=headers).json()
    print(result_json)
    pages = result_json['data']['pages']
    for page in range(1, int(pages + 1)):
        payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
        payload_page = json.dumps(payload_page)
        datas = requests.post(url=url, data=payload_page, headers=headers)
        records = datas.json()['data']['records']
        for news in records:
            title = news['title']
            news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
            news_req = requests.get(url=news_url, headers=headers)
            news_soup = BeautifulSoup(news_req.content, 'html.parser')
            detail_info = news_soup.find('div', class_='subTitle___svblj')
            div_list = detail_info.find_all('div')
            origin = div_list[0].text
            publishDate = div_list[1].text


if __name__ == "__main__":
    zzcx()
comData/policylaw/ClassTool.py @ 862e97ab

@@ -85,7 +85,8 @@ class ClassTool():
            '来源': dic_news['labels'][0]['relationName'],
            '创建时间': dic_news['createDate'],
            '带标签内容': dic_news['contentWithTag'][:100],
            '发布时间': dic_news['publishDate'],
            '标题': dic_news['title']
        }
        self.db_storage.insert_one(aaa_dic)
...
test.py @ 862e97ab

@@ -112,27 +112,63 @@ from base.BaseCore import BaseCore
#
# code = use_ocr(out_img_path)
# 验证码输入框元素.send_keys(code)
# import requests
# headers = {
#     # 'Accept': '*/*',
#     # 'Accept-Encoding': 'gzip, deflate, br',
#     # 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
#     # 'Cache-Control': 'no-cache',
#     # 'Connection': 'keep-alive',
#     # 'Host': 'search-api-web.eastmoney.com',
#     # 'Pragma': 'no-cache',
#     # 'Sec-Fetch-Dest': 'script',
#     # 'Sec-Fetch-Mode': 'no-cors',
#     # 'Sec-Fetch-Site': 'same-site',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
#     # 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
#     # 'sec-ch-ua-mobile': '?0',
#     # 'sec-ch-ua-platform': '"Windows"'
# }
# url = "https://www-private-oss.mob.com/academy_reports/2023/03/31/Mob%E7%A0%94%E7%A9%B6%E9%99%A2%E3%80%8A2023%E5%B9%B4%E4%B8%AD%E5%9B%BD%E6%96%87%E6%97%85%E4%BA%A7%E4%B8%9A%E5%8F%91%E5%B1%95%E8%B6%8B%E5%8A%BF%E6%8A%A5%E5%91%8A%E3%80%8B.pdf?response-content-disposition=attachment&OSSAccessKeyId=LTAI5t5mdPuMS9gNj93RPowJ&Expires=1703839064&Signature=X1mpakYGVaBffNokvhvW917UH%2Fk%3D"
#
# # res = requests.get(url).text[1:-1]
# res = requests.get(url=url, headers=headers)
# with open('./a.pdf','wb') as f:
#     f.write(res.content)
import datetime
import json
import requests
import pymongo
from base import BaseCore

baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').中科软['数据源_0504']

datas = db_storage.find({'postCode': '2'}).limit(5)
for data in datas:
    title = data['titleForeign']
    contentWithTag = data['richTextForeign']
    summary = data['contentForeign']
    dic_info = {
        'title': title,
        'summary': summary,
        'contentWithTag': contentWithTag
    }
    headers = {
        'Content-Type': 'application/json',
    }
    dic_info_ = json.dumps(dic_info)
    # print(dic_info_)
    # with open('./data.json','w') as f:
    #     f.write(dic_info_)
    # break
    # req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
    req = requests.post('http://117.78.23.14:5000/translate', data=dic_info_, headers=headers)
    log.info(req.text)
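The reworked test.py reads a handful of documents from MongoDB and posts their title/summary/contentWithTag as JSON to the translate service, then logs the raw response. A small sketch of that call with a timeout and basic error handling; the endpoint is the one used above, while the wrapper function and its defaults are illustrative assumptions:

import json
import requests

def translate(doc, endpoint='http://117.78.23.14:5000/translate'):
    # serialize the three fields the service expects and post them as JSON
    payload = json.dumps({
        'title': doc.get('title', ''),
        'summary': doc.get('summary', ''),
        'contentWithTag': doc.get('contentWithTag', ''),
    })
    try:
        resp = requests.post(endpoint, data=payload,
                             headers={'Content-Type': 'application/json'}, timeout=60)
        return resp.text
    except requests.RequestException as exc:
        return f'translate failed: {exc}'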