Commit 87ecb399 Author: LiuLiYuan

Sina Finance enterprise news 10/10

Parent 5007e9f7
"""
Sina Finance (新浪财经): domestic enterprise news
"""
import json
import re
import time
import jieba
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.smart import smart_extractor
from base.BaseCore import BaseCore
# Initialization: warm up jieba for Chinese word segmentation
jieba.cut("必须加载jieba")
smart = smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = baseCore.r
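# Browser-like request headers, sent with every request to reduce the chance of being blocked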
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache'
}
taskType = '企业动态/新浪财经'
pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
# Fetch the page and return the parsed response
@retry(tries=3, delay=1)
def getrequests(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers,proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
# Parse the article content and assemble the news record
def getDic(social_code, title, href, pub_time):
start_time = time.time()
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
href_ = href.replace('https', 'http')
    try:
        # Extract the page once and reuse the result for both fields
        article = smart.extract_by_url(href_)
        # Body text with HTML tags
        contentText = article.text
        # Plain body text without tags
        content = article.cleaned_text
if content == '':
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
except:
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '新浪财经',
'publishDate': pub_time,
'sid': '1684032033495392257',
        'sourceAddress': href,  # Link to the original article
'summary': '',
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
try:
sendKafka(dic_news, start_time)
log.info(f'Kafka发送成功')
try:
insertMysql(social_code, href)
log.info(f'数据库保存成功')
except:
log.error(f'{href}===数据入库失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
except:
log.error(f'{href}===发送Kafka失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
return 1
# Send the record to Kafka
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
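    # Create a producer for this message and send synchronously, waiting for the broker acknowledgement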
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
        'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
    # Transfer succeeded; record it in the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
# Save the record to MySQL, used later for de-duplication
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    # Fields for the news record
list_info = [
social_code,
link,
'新浪财经',
'2',
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# Check whether this article has already been collected
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (url, social_code))
selects = cursor.fetchone()
return selects
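# Main loop: pull company codes from Redis and crawl each company's Sina Finance news pages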
def doJob():
while True:
start_time = time.time()
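        # Pull the next social credit code from the Redis task queue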
social_code = baseCore.redicPullData('NewsEnterprise:gnqy_nyse_socialCode')
# social_code = '914403007261824992'
        if not social_code or social_code == 'None':
            print(f'============已没有数据============等待===============')
            time.sleep(1800)
            continue
data = baseCore.getInfomation(social_code)
gpdm = data[3]
log.info(f'{social_code}==={gpdm}===开始采集')
exchange = data[10]
        if not gpdm:
log.error(f'{social_code}===股票代码为空')
continue
        # Prefix the stock code according to its exchange (bj/sh/sz)
if exchange == 1:
gpdm_ = 'bj' + gpdm
elif exchange == 2:
gpdm_ = 'sh' + gpdm
elif exchange == 3:
gpdm_ = 'sz' + gpdm
else:
log.info(f'{social_code}==={gpdm}===不在北京、上海、深圳交易所')
continue
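        # Collect the company's articles from the paginated news list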
page = 1
num_ok = 0
        num_error = 0
while True:
url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={gpdm_}&Page={page}'
soup = getrequests(url)
if '拒绝访问' in soup.text:
log.error(f'{social_code}===ip封禁')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
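                # Put the company back on the queue and back off for 30 minutes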
r.rpush('NewsEnterprise:gnqy_nyse_socialCode',social_code)
time.sleep(1800)
break
try:
ul = soup.find('div', class_='datelist').find('ul')
a_list = ul.find_all('a')
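                # Publish times are plain text inside the list; pull them out with the date regex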
time_list = re.findall(pattern, str(ul))
for i in range(len(a_list)):
try:
                        title = a_list[i].text.strip()
if title == '':
continue
href = a_list[i].get('href')
selects = selectUrl(href,social_code)
if selects:
log.info(f'{href}===已采集')
continue
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
pub_time = time_list[i].replace('\xa0', ' ') + ":00"
flg = getDic(social_code,title,href,pub_time)
if flg == 0:
num_error += 1
else:
num_ok += 1
time.sleep(0.5)
except Exception as e:
ee = e.__traceback__.tb_lineno
log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
break
except:
log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
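            # Check the pager text for a "下一页" (next page) link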
next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
if '下一页' not in next_flg:
break
page += 1
break
log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
if __name__ == "__main__":
doJob()
"""
Sina Finance (新浪财经): Hong Kong enterprise news
"""
from datetime import datetime
import json
import re
import time
import jieba
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from base.smart import smart_extractor
from base.BaseCore import BaseCore
# Initialization: warm up jieba for Chinese word segmentation
jieba.cut("必须加载jieba")
smart = smart_extractor.SmartExtractor('cn')
baseCore = BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
r = baseCore.r
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache'
}
taskType = '企业动态/新浪财经'
# Normalize a publish-time string to the "%Y-%m-%d %H:%M:%S" format
def format_time(time_str):
    try:
        # Try to parse the string in the full format first
        datetime_obj = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
        # If it round-trips unchanged, it is already well formed
        if datetime_obj.strftime("%Y-%m-%d %H:%M:%S") == time_str:
            return time_str
    except ValueError:
        pass
    # Otherwise assume the seconds part is missing ("%Y-%m-%d %H:%M") and pad it
    formatted_time = datetime.strptime(time_str, "%Y-%m-%d %H:%M").strftime("%Y-%m-%d %H:%M:%S")
    return formatted_time
# Fetch the page and return the parsed response
@retry(tries=3, delay=1)
def getrequests(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers,proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
# Parse the article content and assemble the news record
def getDic(social_code, title, href, pub_time):
start_time = time.time()
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
href_ = href.replace('https', 'http')
    try:
        # Extract the page once and reuse the result for both fields
        article = smart.extract_by_url(href_)
        # Body text with HTML tags
        contentText = article.text
        # Plain body text without tags
        content = article.cleaned_text
if content == '':
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
except:
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': 'zh',
'origin': '新浪财经',
'publishDate': pub_time,
'sid': '1684032033495392257',
        'sourceAddress': href,  # Link to the original article
'summary': '',
'title': title,
'type': 2,
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
# try:
# sendKafka(dic_news, start_time)
# log.info(f'Kafka发送成功')
# try:
# insertMysql(social_code, href)
# log.info(f'数据库保存成功')
# except:
# log.error(f'{href}===数据入库失败')
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
# except:
# log.error(f'{href}===发送Kafka失败')
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
    return 1
# Send the record to Kafka
@retry(tries=3, delay=1)
def sendKafka(dic_news, start_time):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
        'success': 'true',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
    # Transfer succeeded; record it in the log
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(dic_news['socialCreditCode'], taskType, state, takeTime, dic_news['sourceAddress'], '')
# Save the record to MySQL, used later for de-duplication
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    # Fields for the news record
list_info = [
social_code,
link,
'新浪财经',
'2',
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
# Check whether this article has already been collected
@retry(tries=3, delay=1)
def selectUrl(url, social_code):
sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s and type='2' '''
cursor.execute(sel_sql, (url, social_code))
selects = cursor.fetchone()
return selects
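# Main flow (currently in debug mode): crawl the Sina Finance HK news pages for a single hard-coded company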
def doJob():
# while True:
start_time = time.time()
# social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
social_code = '91330000747735638J'
if not social_code or social_code == 'None':
time.sleep(20)
data = baseCore.getInfomation(social_code)
gpdm = data[3]
log.info(f'{social_code}==={gpdm}===开始采集')
# if gpdm == '' or not gpdm:
# log.error(f'{social_code}===股票代码为空')
# continue
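        # Sina uses 5-digit HK stock codes: drop any suffix after '.' and zero-pad to 5 digits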
gpdm_ = gpdm.split('.')[0]
if len(gpdm_) != 5:
gpdm_ = gpdm_.zfill(5)
page = 1
num_ok = 0
        num_error = 0
while True:
url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
soup = getrequests(url)
if '拒绝访问' in soup.text:
log.error(f'{social_code}===ip封禁')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
# r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
time.sleep(1800)
break
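            # Stop paging once the page reports "暂无数据" (no data)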
next_flg = soup.find('div',class_='part02').text
if '暂无数据' in next_flg:
break
try:
li_list = soup.find('ul', class_='list01').find_all('li')
for li in li_list:
try:
a = li.find('a')
if a:
title = a.text
if title == '':
continue
href = a.get('href')
selects = selectUrl(href,social_code)
if selects:
log.info(f'{href}===已采集过')
continue
pub_time = format_time(li.find('span').text)
print(title)
flag = getDic(social_code,title,href,pub_time)
if flag == 1:
num_ok += 1
else:
num_error += 1
time.sleep(0.5)
except Exception as e:
ee = e.__traceback__.tb_lineno
log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
continue
                # For incremental runs: stop once an already-collected link is hit
# if selects:
# break
except:
log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
page += 1
log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
if __name__ == "__main__":
doJob()
baseCore.close()