Commit 1c9b2f85  Author: LiuLiYuan

新浪财经 (Sina Finance) 10/13

Parent 864508c6
......@@ -25,18 +25,15 @@ headers = {
'Cache-Control': 'no-cache',
'Pragma': 'no-cache'
}
taskType = '新浪财经/天眼查'
taskType = '企业动态/新浪财经'
# 获取企业信息
def getinfomation(social_code):
selectSql = f"select * from mgzqjywyh_list where state = '2' and xydm='{social_code}' "
selectSql = f"select * from mgzqyjwyh_list where state = '2' and xydm='{social_code}' "
cursor.execute(selectSql)
data = cursor.fetchone()
cnx.commit()
data = list(data)
cursor.close()
cnx.close()
return data
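Note: the lookup above fixes the table name (mgzqjywyh_list → mgzqyjwyh_list) but still builds the SQL with an f-string. Purely as an illustration, a parameterized variant, assuming the same module-level pymysql-style cnx/cursor objects used in this file, could look like:

# Illustrative sketch only, not part of the commit: same query, bound parameter.
def getinfomation_param(social_code):
    select_sql = "select * from mgzqyjwyh_list where state = '2' and xydm = %s"
    cursor.execute(select_sql, (social_code,))
    data = cursor.fetchone()
    cnx.commit()
    return list(data) if data else None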
......@@ -66,13 +63,19 @@ def getDic(social_code, li):
contentText = smart.extract_by_url(href_).text
# 不带标签正文
content = smart.extract_by_url(href_).cleaned_text
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
if content == '':
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
except:
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return
return 0
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': '',
'author': author,
......@@ -110,6 +113,7 @@ def getDic(social_code, li):
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
return 1
# 数据发送至Kafka
@retry(tries=3, delay=1)
......@@ -157,15 +161,16 @@ def selectUrl(url, social_code):
def doJob():
# while True:
# social_code = ''
# # 从redis中获取企业信用代码
# try:
# data = getinfomation(social_code)
# com_code = data[6]
com_code = 'AAPL'
social_code = 'ZZSN22080900000004'
log.info(f'{social_code}==={com_code}===开始采集')
num_ok = 0
num_error = 0
# 从redis中获取企业信用代码
# social_code = baseCore.redicPullData('NewsEnterprise:nyse_socialCode')
social_code = 'ZZSN22080900000010'
start_time = time.time()
try:
data = getinfomation(social_code)
com_code = data[3]
log.info(f'{social_code}==={com_code}===开始采集')
pageIndex = 1
while True:
# 拼接链接
......@@ -188,13 +193,15 @@ def doJob():
if selects:
log.info(f'{url}==已采集过')
else:
getDic(social_code, li)
break
break
flg = getDic(social_code, li)
if flg == 1:
num_ok += 1
else:
num_error += 1
# # 如果采集到已采集过动态,证明最新发布动态已经全部采集过
# 增量使用
# if selects:
# break
if selects:
break
next = soup_home.select('body > div > div.xb_news > div.xb_pages > a')
for i in range(5):
if len(next) == 0:
......@@ -203,18 +210,24 @@ def doJob():
break
if len(next) == 2:
break
if len(next) == 1:
text_flag = next[0].text
if text_flag != '下页':
break
pageIndex += 1
time.sleep(2)
log.info(f'{social_code}==={com_code}===企业整体耗时{baseCore.getTimeCost(start_time,time.time())}')
# except:
# log.info(f'==={social_code}=====获取企业信息失败====')
# #重新塞入redis
# baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode',social_code)
# state = 0
# takeTime = baseCore.getTimeCost(start, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
# time.sleep(5)
log.info(f'{social_code}==={com_code}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
except Exception as e:
print(e)
log.info(f'==={social_code}=====获取企业信息失败====')
# 重新塞入redis
baseCore.rePutIntoR('NewsEnterprise:nyse_socialCode', social_code)
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
if __name__ == "__main__":
doJob()
baseCore.close()
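Note on the new pager check in doJob above: paging now stops when the pager holds a single link whose text is not '下页' ("next page"). A minimal sketch against made-up markup; only the CSS selector and the '下页' label come from the code above:

# Illustration only: the sample HTML is invented.
from bs4 import BeautifulSoup

html = '<body><div><div class="xb_news"><div class="xb_pages"><a>上页</a></div></div></div></body>'
soup_home = BeautifulSoup(html, 'html.parser')
pager = soup_home.select('body > div > div.xb_news > div.xb_pages > a')
if len(pager) == 1 and pager[0].text != '下页':
    print('last page reached, stop paging')   # fires here: the only link is 上页 ("previous page")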
......@@ -30,33 +30,68 @@ headers = {
taskType = '企业动态/新浪财经/国内'
pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
# 获取响应页面
@retry(tries=3, delay=1)
def getrequests(url):
ip = baseCore.get_proxy()
req = requests.get(url, headers=headers,proxies=ip)
req = requests.get(url, headers=headers, proxies=ip)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
return soup
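Note: getrequests combines @retry, a per-request proxy, and apparent_encoding (Sina pages are often GBK-encoded, so the charset guess matters). A self-contained sketch of the same pattern; baseCore.get_proxy() is project-specific, so a placeholder proxy dict and User-Agent stand in for it:

# Sketch of the retried, proxied fetch; proxy and UA values are placeholders.
import requests
from bs4 import BeautifulSoup
from retry import retry

@retry(tries=3, delay=1)
def fetch(url, proxy=None):
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, proxies=proxy, timeout=20)
    resp.encoding = resp.apparent_encoding   # guess the charset from the body
    return BeautifulSoup(resp.text, 'html.parser')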
# 解析内容
# 页面内容解析
def getContent(href):
soup = getrequests(href)
if 'http://stock.finance' in href:
div_content = soup.find('div',class_='blk_container')
content = div_content.text
contentWithTag = str(div_content)
else:
div_content = soup.find('div', attrs={'id': 'artibody'})
div_list = div_content.find_all('div')
for div in div_list:
try:
if div.get('class')[0] != 'img_wrapper':
div.decompose()
except:
div.decompose()
script_list = div_content.find_all('script')
for script in script_list:
script.decompose()
style_list = div_content.find_all('style')
for style in style_list:
style.decompose()
img_list = div_content.find_all('img')
content = div_content.text.lstrip().strip()
contentWithTag = str(div_content)
for img in img_list:
img_src = img.get('src')
if 'https' not in img_src:
contentWithTag = contentWithTag.replace(img_src, f'https:{img_src}')
return content, contentWithTag
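Note: replacing smart.extract_by_url with getContent also normalizes protocol-relative image URLs (src="//...") so they stay loadable once the HTML is stored. The core of that step, run on an invented snippet:

# Illustration only: the HTML fragment is made up.
from bs4 import BeautifulSoup

html = '<div id="artibody"><p>text</p><img src="//n.sinaimg.cn/finance/x.jpg"></div>'
div = BeautifulSoup(html, 'html.parser').find('div', attrs={'id': 'artibody'})
content_with_tag = str(div)
for img in div.find_all('img'):
    src = img.get('src')
    if 'https' not in src:
        content_with_tag = content_with_tag.replace(src, f'https:{src}')
print(content_with_tag)   # the <img> now carries https://n.sinaimg.cn/finance/x.jpg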
# 获取解析内容并发送
def getDic(social_code, title, href, pub_time):
start_time = time.time()
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
href_ = href.replace('https', 'http')
pattern = re.compile(r'[\u4e00-\u9fa5]+')
try:
# 带标签正文
contentText = smart.extract_by_url(href_).text
# 不带标签正文
content = smart.extract_by_url(href_).cleaned_text
content, contentWithTag = getContent(href)
if content == '':
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
matches = re.findall(pattern, content)
if len(matches) == 0:
log.error(f'{href}===页面解析乱码')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析乱码')
return 0
except:
log.error(f'{href}===页面解析失败')
state = 0
......@@ -68,7 +103,7 @@ def getDic(social_code, title, href, pub_time):
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'contentWithTag': contentWithTag,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
......@@ -84,12 +119,11 @@ def getDic(social_code, title, href, pub_time):
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
try:
sendKafka(dic_news, start_time)
log.info(f'Kafka发送成功')
try:
insertMysql(social_code, href)
insertMysql(social_code, href, pub_time,title,content)
log.info(f'数据库保存成功')
except:
log.error(f'{href}===数据入库失败')
......@@ -126,14 +160,17 @@ def sendKafka(dic_news, start_time):
# 数据保存入库,用于判重
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
def insertMysql(social_code, link, pub_time, title, content):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) values(%s,%s,%s,%s,%s,%s,%s,now())'''
# 动态信息列表
list_info = [
social_code,
link,
'新浪财经',
'2',
pub_time,
title,
content[0:500]
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
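Note: the dedup insert now also records publish_time, title and the first 500 characters of content. A hedged, self-contained sketch of the same statement, assuming a pymysql connection is passed in (the project keeps cnx/cursor as globals instead):

# Sketch only; connection handling here differs from the module's globals.
def insert_article(cnx, social_code, link, pub_time, title, content):
    sql = ('insert into brpa_source_article'
           '(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) '
           'values(%s,%s,%s,%s,%s,%s,%s,now())')
    with cnx.cursor() as cursor:
        cursor.execute(sql, (social_code, link, '新浪财经', '2', pub_time, title, content[:500]))
    cnx.commit()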
......@@ -175,7 +212,7 @@ def doJob():
continue
page = 1
num_ok = 0
num_error =0
num_error = 0
while True:
url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={gpdm_}&Page={page}'
soup = getrequests(url)
......@@ -184,7 +221,7 @@ def doJob():
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
r.rpush('NewsEnterprise:gnqy_nyse_socialCode',social_code)
r.rpush('NewsEnterprise:gnqy_nyse_socialCode', social_code)
time.sleep(1800)
break
try:
......@@ -197,14 +234,16 @@ def doJob():
if title == '':
continue
href = a_list[i].get('href')
selects = selectUrl(href,social_code)
selects = selectUrl(href, social_code)
if selects:
log.info(f'{href}===已采集')
continue
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
pub_time = time_list[i].replace('\xa0', ' ') + ":00"
flg = getDic(social_code,title,href,pub_time)
if 'https://cj.sina' not in href and 'https://t.cj.sina' not in href:
print(href)
flg = getDic(social_code, title, href, pub_time)
if flg == 0:
num_error += 1
else:
......@@ -216,7 +255,7 @@ def doJob():
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
break
# break
except:
log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
state = 0
......@@ -226,9 +265,11 @@ def doJob():
if '下一页' not in next_flg:
break
page += 1
break
# break
log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
log.info(
f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
break
if __name__ == "__main__":
......
......@@ -55,24 +55,51 @@ def getrequests(url):
soup = BeautifulSoup(req.text, 'html.parser')
return soup
# 页面内容解析
def getContent(href):
soup = getrequests(href)
div_content = soup.find('div',attrs={'id':'artibody'})
div_list = div_content.find_all('div')
for div in div_list:
try:
if div.get('class')[0] != 'img_wrapper':
div.decompose()
except:
div.decompose()
script_list = div_content.find_all('script')
for script in script_list:
script.decompose()
style_list = div_content.find_all('style')
for style in style_list:
style.decompose()
img_list = div_content.find_all('img')
content = div_content.text.lstrip().strip()
contentWithTag = str(div_content)
for img in img_list:
img_src = img.get('src')
if 'https' not in img_src:
contentWithTag = contentWithTag.replace(img_src,f'https:{img_src}')
return content,contentWithTag
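Note: this second copy of getContent uses the same tag-stripping approach as the one above: keep only 'img_wrapper' <div>s and drop <script>/<style> tags before taking the text. In isolation, on invented markup:

# Illustration only: sample markup is invented.
from bs4 import BeautifulSoup

html = ('<div id="artibody"><p>正文</p><div class="img_wrapper"><img src="//x/y.jpg"></div>'
        '<div class="ad">ad</div><script>var a=1;</script></div>')
body = BeautifulSoup(html, 'html.parser').find('div', attrs={'id': 'artibody'})
for div in body.find_all('div'):
    cls = div.get('class')
    if not cls or cls[0] != 'img_wrapper':
        div.decompose()               # drop non-image blocks (ads, recommendations)
for tag in body.find_all(['script', 'style']):
    tag.decompose()
print(body.get_text(strip=True))      # -> 正文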
# 解析内容
# 获取解析内容并发送
def getDic(social_code, title, href, pub_time):
start_time = time.time()
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
href_ = href.replace('https', 'http')
pattern = re.compile(r'[\u4e00-\u9fa5]+')
try:
# 带标签正文
contentText = smart.extract_by_url(href_).text
# 不带标签正文
content = smart.extract_by_url(href_).cleaned_text
content,contentWithTag = getContent(href)
if content == '':
log.error(f'{href}===页面解析失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析失败')
return 0
matches = re.findall(pattern, content)
if len(matches) == 0:
log.error(f'{href}===页面解析乱码')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===页面解析乱码')
return 0
except:
log.error(f'{href}===页面解析失败')
state = 0
......@@ -84,7 +111,7 @@ def getDic(social_code, title, href, pub_time):
'attachmentIds': '',
'author': '',
'content': content,
'contentWithTag': contentText,
'contentWithTag': contentWithTag,
'createDate': time_now,
'deleteFlag': '0',
'id': '',
......@@ -100,23 +127,22 @@ def getDic(social_code, title, href, pub_time):
'socialCreditCode': social_code,
'year': pub_time[:4]
}
# print(dic_news)
# try:
# sendKafka(dic_news, start_time)
# log.info(f'Kafka发送成功')
# try:
# insertMysql(social_code, href)
# log.info(f'数据库保存成功')
# except:
# log.error(f'{href}===数据入库失败')
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
# except:
# log.error(f'{href}===发送Kafka失败')
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
try:
sendKafka(dic_news, start_time)
log.info(f'Kafka发送成功')
try:
insertMysql(social_code, href,pub_time)
log.info(f'数据库保存成功')
except:
log.error(f'{href}===数据入库失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===数据入库失败')
except:
log.error(f'{href}===发送Kafka失败')
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
return 1
......@@ -142,14 +168,17 @@ def sendKafka(dic_news, start_time):
# 数据保存入库,用于判重
@retry(tries=3, delay=1)
def insertMysql(social_code, link):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
def insertMysql(social_code, link,pub_time,title,content):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) values(%s,%s,%s,%s,%s,%s,%s,now())'''
# 动态信息列表
list_info = [
social_code,
link,
'新浪财经',
'2',
pub_time,
title,
content[:500]
]
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
......@@ -174,9 +203,9 @@ def doJob():
data = baseCore.getInfomation(social_code)
gpdm = data[3]
log.info(f'{social_code}==={gpdm}===开始采集')
# if gpdm == '' or not gpdm:
# log.error(f'{social_code}===股票代码为空')
# continue
if gpdm == '' or not gpdm:
log.error(f'{social_code}===股票代码为空')
continue
gpdm_ = gpdm.split('.')[0]
if len(gpdm_) != 5:
gpdm_ = gpdm_.zfill(5)
......@@ -191,7 +220,7 @@ def doJob():
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
# r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
time.sleep(1800)
break
next_flg = soup.find('div',class_='part02').text
......@@ -212,13 +241,12 @@ def doJob():
log.info(f'{href}===已采集过')
continue
pub_time = format_time(li.find('span').text)
print(title)
flag = getDic(social_code,title,href,pub_time)
if flag == 1:
num_ok += 1
else:
num_error += 1
time.sleep(0.5)
time.sleep(1)
except Exception as e:
ee = e.__traceback__.tb_lineno
log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
......
......@@ -10,9 +10,6 @@ import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry
from obs import ObsClient
import fitz
from urllib.parse import unquote
from base.BaseCore import BaseCore
taskType = '企业公告/新浪财经/国内'
......@@ -30,141 +27,41 @@ headers = {
}
pattern = r"\d{4}-\d{2}-\d{2}"
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
# 获取文件大小
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(pdf_url, pdf_name, type_id, social_code):
start_time = time.time()
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': 'group1', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'Liuliyuans',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
for i in range(0, 3):
try:
name = pdf_name + '.pdf'
now_time = time.strftime("%Y-%m")
result = obsClient.putContent('zzsn', f'ZJH/{now_time}/' + name, content=response.content)
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
break
except:
time.sleep(3)
continue
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
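Note: the uptoOBS helper shown here (the call site below switches to baseCore.uptoOBS) couples the OBS upload with PyMuPDF text extraction. The extraction step on its own, with pdf_bytes standing in for response.content:

# Sketch of the fitz (PyMuPDF) part only; pdf_bytes is a placeholder.
import fitz  # PyMuPDF

def pdf_pages_and_text(pdf_bytes):
    text = ''
    with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
        page_count = doc.page_count
        for page in doc.pages():
            text += page.get_text()
    return page_count, text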
def secrchATT(item_id, name, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
cursor_.execute(sel_sql, (item_id, name, type_id, order_by))
selects = cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(retData, com_name, year, pdf_name, num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size)
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
log.info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, pdf_name, type_id, order_by)
id = selects[0]
return id
def ifInstert(social_code, pdf_url):
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='新浪财经' and type='1' '''
cursor.execute(sel_sql, (social_code, pdf_url))
selects = cursor.fetchone()
return selects
@retry(tries=3, delay=1)
def sendKafka(dic_news):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
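Note: the inline KafkaProducer block further down in GetContent is consolidated into this retried sendKafka helper; because the send is confirmed with future.get(timeout=10), a broker failure raises and lets @retry re-attempt. Roughly equivalent, with only the broker and topic taken from this file:

# Rough equivalent of sendKafka; everything except broker/topic is illustrative.
import json
from kafka import KafkaProducer
from retry import retry

@retry(tries=3, delay=1)
def send_report(dic_news):
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    future = producer.send('researchReportTopic',
                           json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
    return future.get(timeout=10)   # blocking get: failures raise and trigger the retry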
def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_name, num):
# 判断文件是否已经存在obs服务器中
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
soup = getrequests(pdf_url)
pdf_url = soup.find('table',attrs={'id':'allbulletin'}).find('tr',class_='gray').find('a').get('href')
now_time = time.strftime("%Y-%m")
file_path = 'ZJH/' + now_time + '/' + pdf_name + '.pdf'
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
log.info('=====文件不存在obs=====')
pass
else:
log.info(f'=====文件存在obs========{file_path}')
file_path = 'XLCJNotice/' + now_time + '/' + pdf_name + '.pdf'
response = baseCore.obsexist(file_path)
if not response:
return False
# 上传至华为云服务器
retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
retData = baseCore.uptoOBS(pdf_url, pdf_name, 8, social_code, 'XLCJNotice', taskType, start_time, 'LiuLiYuan')
# 附件插入att数据库
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
return False
att_id = tableUpdate(retData, com_name, year, pdf_name, num)
att_id = baseCore.tableUpdate(retData, com_name, year, pdf_name, num)
if att_id:
pass
else:
......@@ -195,11 +92,7 @@ def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time, com_n
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
sendKafka(dic_news)
dic_result = {
'success': 'ture',
'message': '操作成功',
......@@ -249,6 +142,7 @@ def getrequests(url):
return soup
# 数据库插入
@retry(tries=3, delay=1)
def insertMysql(social_code, link, pub_time):
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
......@@ -265,6 +159,8 @@ def insertMysql(social_code, link, pub_time):
def doJob():
# while True:
# social_code = baseCore.redicPullData()
start_time = time.time()
social_code = '91440300192185379H'
data = baseCore.getInfomation(social_code)
......@@ -277,8 +173,9 @@ def doJob():
# continue
page = 1
num = 1
# while True:
while True:
url = f'https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={gpdm}&Page={page}'
print(url)
soup = getrequests(url)
# if '拒绝访问' in soup.text:
# log.error(f'{social_code}===ip封禁')
......@@ -303,29 +200,39 @@ def doJob():
continue
href = a_list[i].get('href')
if 'http' not in href:
href = 'https://finance.sina.com.cn' + href
selects = ifInstert(short_name, social_code, href)
href = 'https://vip.stock.finance.sina.com.cn' + href
soup_href = getrequests(href)
th = soup_href.find('table',attrs={'id':'allbulletin'}).find_all('th')[0]
if '下载公告' in th.text:
pdf_url = th.find('a',attrs={'target':'_blank'}).get('href')
else:
log.error(f"{href}===没有公告下载地址")
continue
selects = ifInstert(social_code, pdf_url)
if selects:
log.info(f'{href}===已采集')
log.info(f'{pdf_url}===已采集')
continue
pub_time = format_time(time_list[i])
year = pub_time[:4]
result = GetContent(href, name_pdf, social_code, year, pub_time, start_time, com_name, num)
num += 1
if result:
# 公告信息列表
log.info(f'{short_name}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, href, '成功')
# 发送kafka成功之后 再插入数据库
insert = insertMysql(social_code, href, pub_time)
if insert:
log.info(f'===={social_code}========{name_pdf}=====插入库成功')
pass
else:
continue
print(name_pdf,pub_time,pdf_url)
# result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time, com_name, num)
# num += 1
# if result:
# # 公告信息列表
# log.info(f'{short_name}==============解析传输操作成功')
# state = 1
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, href, '成功')
# try:
# # 发送kafka成功之后 再插入数据库
# insertMysql(social_code, href, pub_time)
# log.info(f"{social_code}==={href}===数据库插入成功")
# except:
# log.info(f"{social_code}==={href}===数据库插入失败")
# continue
# else:
# continue
# break
# time.sleep(0.5)
# except Exception as e:
# ee = e.__traceback__.tb_lineno
......@@ -339,12 +246,11 @@ def doJob():
# state = 0
# takeTime = baseCore.getTimeCost(start_time, time.time())
# baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
# next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
# if '下一页' not in next_flg:
# break
# page += 1
# break
next_flg = soup.select('#con02-7 > table > tr')[1].select('div')[2].text
if '下一页' not in next_flg:
break
page += 1
break
# log.info(
# f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
......