Commit 864508c6 · Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -27,7 +27,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/国内'
 pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
 # Fetch the response page
@@ -28,7 +28,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/香港'
 # Check that the timestamp is in the expected format
@@ -51,7 +51,7 @@ def format_time(time_str):
 def getrequests(url):
     ip = baseCore.get_proxy()
     req = requests.get(url, headers=headers, proxies=ip)
-    req.encoding = req.apparent_encoding
+    req.encoding = 'gbk'
     soup = BeautifulSoup(req.text, 'html.parser')
     return soup
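A note on this change: `apparent_encoding` runs byte-level charset detection on every response, which is slow and can mis-guess on short or ambiguous pages, garbling the Chinese text. Sina's HK company-news pages are served in GBK (a superset of GB2312), so pinning the codec skips detection and decodes reliably. A minimal sketch of the difference; the stock code in the URL is an illustrative value, not taken from this commit:

```python
import requests

url = 'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/1/code/00700/.phtml'
req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# apparent_encoding guesses the charset from the raw bytes; a wrong guess
# (e.g. Windows-1252 instead of GBK) turns req.text into mojibake.
print(req.apparent_encoding)

# Pinning the encoding avoids the guess entirely.
req.encoding = 'gbk'
print(req.text[:200])
```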
@@ -117,7 +117,7 @@ def getDic(social_code, title, href, pub_time):
         # state = 0
         # takeTime = baseCore.getTimeCost(start_time, time.time())
         # baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
-        # return 1
+        return 1
     # Send the data to Kafka
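For context on the `return 1` being uncommented: `doJob()` below treats `flag == 1` from `getDic()` as a successfully collected article (`num_ok`) and anything else as a failure (`num_error`). Judging from the commented-out error logging around it, this looks like the Kafka-send failure branch, so returning 1 here makes a failed send count as a success to the caller. A hypothetical skeleton of that contract, not the real `getDic`:

```python
def send_to_kafka(payload):
    # Hypothetical stand-in for the real Kafka producer call.
    raise RuntimeError('kafka send failed')

def getDic(social_code, title, href, pub_time):
    # Illustrative skeleton only.
    try:
        send_to_kafka({'code': social_code, 'title': title,
                       'href': href, 'pub_time': pub_time})
        return 1  # doJob() counts flag == 1 in num_ok
    except Exception:
        # With `return 1` uncommented in this branch, a failed Kafka send is
        # reported exactly like a success; returning 0 instead would route
        # the article into num_error.
        return 1

print(getDic('91330000747735638J', 'title', 'http://example.com', '2023-01-01 00:00'))
```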
@@ -165,77 +165,77 @@ def selectUrl(url, social_code):
 def doJob():
     # while True:
     start_time = time.time()
-    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
-    social_code = '91330000747735638J'
+    social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
+    # social_code = '91330000747735638J'
     if not social_code or social_code == 'None':
         time.sleep(20)
     data = baseCore.getInfomation(social_code)
     gpdm = data[3]
     log.info(f'{social_code}==={gpdm}===开始采集')
     # if gpdm == '' or not gpdm:
     #     log.error(f'{social_code}===股票代码为空')
     #     continue
     gpdm_ = gpdm.split('.')[0]
     if len(gpdm_) != 5:
         gpdm_ = gpdm_.zfill(5)
     page = 1
     num_ok = 0
     num_error = 0
     while True:
         url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
         soup = getrequests(url)
         if '拒绝访问' in soup.text:
             log.error(f'{social_code}===ip封禁')
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
             # r.rpush('NewsEnterprise:xgqy_nyse_socialCode', social_code)
             time.sleep(1800)
             break
         next_flg = soup.find('div', class_='part02').text
         if '暂无数据' in next_flg:
             break
         try:
             li_list = soup.find('ul', class_='list01').find_all('li')
             for li in li_list:
                 try:
                     a = li.find('a')
                     if a:
                         title = a.text
                         if title == '':
                             continue
                         href = a.get('href')
                         selects = selectUrl(href, social_code)
                         if selects:
                             log.info(f'{href}===已采集过')
                             continue
                         pub_time = format_time(li.find('span').text)
                         print(title)
                         flag = getDic(social_code, title, href, pub_time)
                         if flag == 1:
                             num_ok += 1
                         else:
                             num_error += 1
                         time.sleep(0.5)
                 except Exception as e:
                     ee = e.__traceback__.tb_lineno
                     log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
                     state = 0
                     takeTime = baseCore.getTimeCost(start_time, time.time())
                     baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
                     continue
             # For incremental runs
             # if selects:
             #     break
         except:
             log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
             state = 0
             takeTime = baseCore.getTimeCost(start_time, time.time())
             baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
         page += 1
     log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
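The substantive change in this hunk is the first two lines: the hardcoded test code `91330000747735638J` is swapped for a pull from the Redis list `NewsEnterprise:xgqy_nyse_socialCode`; the commented-out `r.rpush(...)` in the IP-ban branch is the matching requeue side. A minimal sketch of that list-as-work-queue pattern with redis-py, assuming `baseCore.redicPullData()` is essentially a pop on this list (the real helper may differ):

```python
import redis

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
QUEUE = 'NewsEnterprise:xgqy_nyse_socialCode'

# Producer side: seed the queue with social credit codes to crawl.
r.rpush(QUEUE, '91330000747735638J')

# Consumer side: pop one code per iteration; None means the queue is empty,
# which matches the `if not social_code or social_code == 'None'` guard above.
social_code = r.lpop(QUEUE)
print(social_code)

# On an IP ban, the commented-out r.rpush(...) would push the code back onto
# the tail of the list so it is retried after the 1800 s cool-down.
```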