提交 d711ce98 作者: 薛凌堃

补采天眼查企业动态

上级 f424eb1b
# Company-news collector: pulls target companies from Redis, scrapes Tianyancha news.
import json
import random
import requests, time, pymysql
import jieba
import sys
from kafka import KafkaProducer
from getTycId import getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys.path.append('D:\\zzsn_spider\\base')
import BaseCore
from smart import smart_extractor
import urllib3
# Requests are made with verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Warm up jieba's dictionary at import time so the first real cut is fast.
jieba.cut("必须加载jieba")
# Initialize the smart article extractor for Chinese-language pages.
smart =smart_extractor.SmartExtractor('cn')
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# NOTE(review): hardcoded DB credentials in source — should be moved to config/env.
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='dbScore', charset='utf8mb4')
cursor = cnx.cursor()
# Page size used by the Tianyancha news API (ps= query parameter).
pageSize = 10
log.info(f'======================当前脚本进程为{baseCore.getPID()}==============================')
# Browser-like headers for capi.tianyancha.com; User-Agent is rotated per request.
# NOTE(review): X-AUTH-TOKEN is a hardcoded JWT with an expiry — it will go stale.
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Host': 'capi.tianyancha.com',
'Origin': 'https://www.tianyancha.com',
'Referer': 'https://www.tianyancha.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
'X-AUTH-TOKEN': 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODcwMzc1MjYwMCIsImlhdCI6MTY5NzE5MDMwMywiZXhwIjoxNjk5NzgyMzAzfQ.awXuS-59RzK35r0aUJq4Rj83JzyAOvsdUfL_ojp66CVQMjlLv_ZDD9g5gCoZKE21LN1JYRMLNZhuWsHhxapROw',
'X-TYCID': '6f6298905d3011ee96146793e725899d',
'sec-ch-ua': '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'version': 'TYC-Web'
}
# Second DB connection pair owned by BaseCore (used for brpa_source_article_news).
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Task label written into the run/audit log for every recordLog call.
taskType = '企业动态/天眼查/补采20W+'
def beinWork(tyc_code, social_code, start_time):
    """Collect the news list for one company from Tianyancha and ingest it.

    Fetches the paged news API for ``tyc_code``, skips articles already in
    ``brpa_source_article_news``, extracts article bodies with the smart
    extractor, stores new rows, and forwards each article to Kafka.

    :param tyc_code: Tianyancha company id (used in the API URL).
    :param social_code: unified social credit code (DB/Kafka key).
    :param start_time: epoch seconds when processing started (for logs).
    :return: dict with 'total', 'up_okCount', 'up_errorCount',
             'up_repetCount' (and 'state': True when total is 0).
    """
    time.sleep(3)
    retData = {'total': 0, 'up_okCount': 0, 'up_errorCount': 0, 'up_repetCount': 0}
    t = time.time()
    url = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={t}&id={tyc_code}&ps={pageSize}&pn=1&emotion=-100&event=-100'
    response = None
    try:
        # FIX: the original loop had no try/except, so the first network error
        # aborted instead of retrying; now it retries up to 3 times like the
        # per-page loop below.
        for m in range(0, 3):
            try:
                ip = baseCore.get_proxy()
                headers['User-Agent'] = baseCore.getRandomUserAgent()
                response = requests.get(url=url, headers=headers, proxies=ip, verify=False)
                time.sleep(random.randint(3, 5))
                break
            except:
                pass
        # If all attempts failed, response is None and this raises into the
        # except branch below (same error path as the original).
        if (response.status_code == 200):
            pass
    except Exception as e:
        log.error(f"{tyc_code}-----获取总数接口失败")
        error = '获取总数接口失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{error}----{e}')
        return retData
    try:
        json_1 = json.loads(response.content.decode('utf-8'))
        total = json_1['data']['total']
    except:
        log.error(f"{tyc_code}-----获取总数失败")
        e = '获取总数失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, e)
        return retData
    if (total > 0):
        # Ceiling division: number of pages at pageSize items per page.
        if (total % pageSize == 0):
            totalPage = total // pageSize
        else:
            totalPage = total // pageSize + 1
    else:
        log.error(f"{tyc_code}--------总数为0")
        retData['state'] = True
        return retData
    log.info(f"{tyc_code}-------总数:{total}----总页数:{totalPage}")
    retData['total'] = total
    up_okCount = 0
    up_errorCount = 0
    up_repetCount = 0
    for num in range(1, totalPage + 1):
        time.sleep(3)
        log.info(f"获取分页数据--{tyc_code}----分页{num}----开始")
        start_page = time.time()
        url_page = f'https://capi.tianyancha.com/cloud-yq-news/company/detail/publicmsg/news/webSimple?_={time.time()}&id={tyc_code}&ps={pageSize}&pn={num}&emotion=-100&event=-100'
        response_page = None
        for m in range(0, 3):
            try:
                ip = baseCore.get_proxy()
                headers['User-Agent'] = baseCore.getRandomUserAgent()
                response_page = requests.get(url=url_page, headers=headers, proxies=ip, verify=False)
                time.sleep(1)
                break
            except:
                pass
        # FIX: guard against response_page being unbound/None when all three
        # attempts failed (the original raised an uncaught NameError here).
        if response_page is not None and response_page.status_code == 200:
            pass
        else:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            e = '获取分页数据失败'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
            up_errorCount = up_errorCount + pageSize
            continue
        try:
            json_page = json.loads(response_page.content.decode('utf-8'))
            info_list_page = json_page['data']['items']
        except:
            log.error(f"{tyc_code}--{num}页---获取分页数据失败")
            e = '获取分页数据失败'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, url_page, e)
            up_errorCount = up_errorCount + pageSize
            continue
        pageIndex = 0
        for info_page in info_list_page:
            pageIndex = pageIndex + 1
            title = info_page['title']
            source = info_page['website']
            link = info_page['uri']
            # Dedup check: skip articles already stored for this company.
            try:
                sel_sql = '''select social_credit_code from brpa_source_article_news where source_address = %s and social_credit_code=%s and type='2' '''
                cursor_.execute(sel_sql, (link, social_code))
            except Exception as e:
                print(e)
            selects = cursor_.fetchone()
            if selects:
                log.info(f'{tyc_code}-----{social_code}----{link}:已经存在')
                # TODO: if this item exists, everything after it was already
                # collected — the function could return early here.
                retData['up_okCount'] = up_okCount
                retData['up_errorCount'] = up_errorCount
                retData['up_repetCount'] = up_repetCount
                # return retData
                continue
            # 'rtm' is a millisecond epoch timestamp; fall back to "now" when
            # missing or malformed.
            try:
                time_struct = time.localtime(int(info_page['rtm'] / 1000))
                time_format = time.strftime("%Y-%m-%d %H:%M:%S", time_struct)
            except:
                time_format = baseCore.getNowTime(1)
            try:
                # FIX: fetch the article once and reuse the result — the
                # original called extract_by_url(link) twice, downloading the
                # page twice.
                article = smart.extract_by_url(link)
                contentText = article.text          # body with HTML tags
                content = article.cleaned_text      # plain-text body
            except Exception as e:
                contentText = ''
            if contentText == '':
                log.error(f'获取正文失败:--------{tyc_code}--------{num}--------{link}')
                e = '获取正文失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
                up_errorCount = up_errorCount + 1
                # FIX: parameterized insert — the original interpolated scraped
                # title/abstract into the SQL string (injection / quote breakage).
                try:
                    insert_err_sql = "insert into dt_err(xydm,`from`,url,title,pub_date,zhaiyao,create_date,state,pageNo,pageIndex) values(%s,%s,%s,%s,%s,%s,now(),1,%s,%s)"
                    cursor.execute(insert_err_sql, (social_code, source, link, title, time_format, info_page['abstracts'], num, pageIndex))
                    cnx.commit()
                except:
                    pass
                continue
            try:
                insert_sql = '''insert into brpa_source_article_news(social_credit_code,source_address,origin,type,publish_time,content,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
                up_okCount = up_okCount + 1
                list_info = [
                    social_code,
                    link,
                    f'天眼查-{source}',
                    '2',
                    time_format,
                    content[:500]
                ]
                cursor_.execute(insert_sql, tuple(list_info))
                cnx_.commit()
                # Log one line per stored article so per-company progress is visible.
                log.info(f'{social_code}----{link}:新增一条')
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Payload forwarded to Kafka after the DB insert succeeds.
                dic_news = {
                    'attachmentIds': '',
                    'author': '',
                    'content': content,
                    'contentWithTag': contentText,
                    'createDate': time_now,
                    'deleteFlag': '0',
                    'id': '',
                    'keyWords': '',
                    'lang': 'zh',
                    'origin': source,
                    'publishDate': time_format,
                    # 'sid': '1684032033495392257',
                    'sid': '1714852232679067649',
                    'sourceAddress': link,  # original article URL
                    'summary': info_page['abstracts'],
                    'title': title,
                    'type': 2,
                    'socialCreditCode': social_code,
                    'year': time_format[:4]
                }
            except Exception as e:
                log.info(f'传输失败:{social_code}----{link}')
                error = '数据库传输失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, f'{error}----{e}')
                continue
            # Forward the record to Kafka.
            # NOTE(review): a new KafkaProducer per message is expensive —
            # consider reusing one producer; kept as-is to preserve the
            # per-message failure isolation of the original.
            try:
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("researchReportTopic",
                                             json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
                dic_result = {
                    'success': 'ture',
                    'message': '操作成功',
                    'code': '200',
                }
                log.info(dic_result)
                state = 1
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, '成功')
            except Exception as e:
                dic_result = {
                    'success': 'false',
                    'message': '操作失败',
                    'code': '204',
                    'e': e
                }
                log.error(dic_result)
                e = 'Kafka操作失败'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(social_code, taskType, state, takeTime, link, e)
        log.info(f"获取分页数据--{tyc_code}----分页{num},耗时{baseCore.getTimeCost(start_page, time.time())}")
    retData['up_okCount'] = up_okCount
    retData['up_errorCount'] = up_errorCount
    retData['up_repetCount'] = up_repetCount
    return retData
# 日志信息保存至现已创建好数据库中,因此并没有再对此前保存日志信息数据库进行保存
def doJob():
    """Main worker loop: pull social credit codes from Redis and collect news.

    For each code popped from 'NewsEnterprise:gnqybc_socialCode' it looks up
    the company record, resolves a missing Tianyancha id via getTycIdByXYDM,
    runs beinWork() to collect the company's news, and bumps the run counter.
    On any failure the code is pushed back into Redis for a later retry.
    Runs forever (sleeps 20s when the queue is empty).
    """
    while True:
        social_code = baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
        # social_code = '91440300665899831W'
        # Queue drained — wait and poll again.
        if social_code == None:
            time.sleep(20)
            continue
        start = time.time()
        try:
            data = baseCore.getInfomation(social_code)
            if len(data) != 0:
                pass
            else:
                # No company record found — requeue the code for later.
                baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                continue
            id = data[0]
            xydm = data[2]
            tycid = data[11]
            # Resolve the Tianyancha id when the record doesn't have one yet.
            if tycid == None or tycid == '':
                try:
                    retData = getTycIdByXYDM(xydm)
                    if retData['tycData'] and retData['reput']:
                        tycid = retData['tycData']['id']
                        # FIX: parameterized update — the original interpolated
                        # values into the SQL string via an f-string.
                        updateSql = "update EnterpriseInfo set TYCID = %s where SocialCode = %s"
                        cursor_.execute(updateSql, (tycid, xydm))
                        cnx_.commit()
                    elif not retData['tycData'] and retData['reput']:
                        # Lookup ran but found nothing — log and requeue.
                        state = 0
                        takeTime = baseCore.getTimeCost(start, time.time())
                        baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                        log.info(f'======={social_code}====重新放入redis====')
                        baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                        continue
                    elif not retData['reput'] and not retData['tycData']:
                        # Lookup says the company should not be retried — drop it.
                        continue
                except:
                    state = 0
                    takeTime = baseCore.getTimeCost(start, time.time())
                    baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
                    baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
                    continue
            count = data[17]
            log.info(f"{id}---{xydm}----{tycid}----开始处理")
            start_time = time.time()
            # Collect this company's news feed.
            retData = beinWork(tycid, xydm, start_time)
            # Bump the per-company run counter after collection.
            runType = 'NewsRunCount'
            count += 1
            baseCore.updateRun(social_code, runType, count)
            total = retData['total']
            up_okCount = retData['up_okCount']
            up_errorCount = retData['up_errorCount']
            up_repetCount = retData['up_repetCount']
            log.info(
                f"{id}---{xydm}----{tycid}----结束处理,耗时{baseCore.getTimeCost(start_time, time.time())}---总数:{total}---成功数:{up_okCount}----失败数:{up_errorCount}--重复数:{up_repetCount}")
        except Exception as e:
            log.info(f'==={social_code}=====获取企业信息失败====')
            # Requeue so the company is retried later.
            baseCore.rePutIntoR('NewsEnterprise:gnqybc_socialCode', social_code)
            state = 0
            takeTime = baseCore.getTimeCost(start, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
            time.sleep(5)
        # break
    # Unreachable while the loop above runs forever; kept for a clean shutdown
    # path if the loop is ever given an exit condition.
    cursor.close()
    cnx.close()
    # Release BaseCore-held resources (its DB connection, etc.).
    baseCore.close()
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    doJob()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论