Commit 864508c6 Author: 刘伟刚

Merge remote-tracking branch 'origin/master'

@@ -27,7 +27,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/国内'
 pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"
 # fetch the response page
...
@@ -28,7 +28,7 @@ headers = {
     'Cache-Control': 'no-cache',
     'Pragma': 'no-cache'
 }
-taskType = '企业动态/新浪财经'
+taskType = '企业动态/新浪财经/香港'
 # check that the timestamp is in the expected format
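The comment above introduces format_time, and the `pattern` defined in the first hunk (`\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}`) suggests it validates a 'YYYY-MM-DD HH:MM' shape. A hypothetical sketch of such a check; the real body is not shown in this diff, so treat the logic as an assumption:

import re

pattern = r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}"

def format_time(time_str):
    # Hypothetical: collapse whitespace, then keep only timestamps that
    # match the 'YYYY-MM-DD HH:MM' shape; return '' for anything else.
    cleaned = ' '.join(time_str.split())
    return cleaned if re.match(pattern, cleaned) else ''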
@@ -51,7 +51,7 @@ def format_time(time_str):
 def getrequests(url):
     ip = baseCore.get_proxy()
     req = requests.get(url, headers=headers,proxies=ip)
-    req.encoding = req.apparent_encoding
+    req.encoding = 'gbk'
     soup = BeautifulSoup(req.text, 'html.parser')
     return soup
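This hunk pins the response encoding to 'gbk' instead of trusting req.apparent_encoding, whose charset sniffing can misread short or script-heavy Sina pages and produce mojibake. A minimal sketch of the same helper; the gb18030 widening, the timeout, and the stand-in headers dict are my assumptions, not part of the commit:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the module-level headers dict

def getrequests(url, proxies=None):
    req = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    # gb18030 is a superset of gbk: it decodes the same pages plus the
    # rare characters gbk lacks, so it is a safe widening of this pin.
    req.encoding = 'gb18030'
    return BeautifulSoup(req.text, 'html.parser')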
@@ -117,7 +117,7 @@ def getDic(social_code, title, href, pub_time):
         # state = 0
         # takeTime = baseCore.getTimeCost(start_time, time.time())
         # baseCore.recordLog(social_code, taskType, state, takeTime, href, f'{href}===发送Kafka失败')
-        # return 1
+        return 1
     # send the data to Kafka
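Uncommenting `return 1` matters because the caller treats the return value as a success flag: with the whole branch commented out, getDic could fall through and return None, which the caller counts as a failure. How doJob consumes it, per the next hunk:

flag = getDic(social_code, title, href, pub_time)
if flag == 1:
    num_ok += 1     # getDic signalled success
else:
    num_error += 1  # any other value (e.g. None from a fall-through) counts as a failure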
@@ -165,77 +165,77 @@ def selectUrl(url, social_code):
 def doJob():
-    # while True:
-    start_time = time.time()
-    # social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
-    social_code = '91330000747735638J'
-    if not social_code or social_code == 'None':
-        time.sleep(20)
-    data = baseCore.getInfomation(social_code)
-    gpdm = data[3]
-    log.info(f'{social_code}==={gpdm}===开始采集')
-    # if gpdm == '' or not gpdm:
-    #     log.error(f'{social_code}===股票代码为空')
-    #     continue
-    gpdm_ = gpdm.split('.')[0]
-    if len(gpdm_) != 5:
-        gpdm_ = gpdm_.zfill(5)
-    page = 1
-    num_ok = 0
-    num_error =0
-    while True:
-        url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
-        soup = getrequests(url)
-        if '拒绝访问' in soup.text:
-            log.error(f'{social_code}===ip封禁')
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
-            # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
-            time.sleep(1800)
-            break
-        next_flg = soup.find('div',class_='part02').text
-        if '暂无数据' in next_flg:
-            break
-        try:
-            li_list = soup.find('ul', class_='list01').find_all('li')
-            for li in li_list:
-                try:
-                    a = li.find('a')
-                    if a:
-                        title = a.text
-                        if title == '':
-                            continue
-                        href = a.get('href')
-                        selects = selectUrl(href,social_code)
-                        if selects:
-                            log.info(f'{href}===已采集过')
-                            continue
-                        pub_time = format_time(li.find('span').text)
-                        print(title)
-                        flag = getDic(social_code,title,href,pub_time)
-                        if flag == 1:
-                            num_ok += 1
-                        else:
-                            num_error += 1
-                        time.sleep(0.5)
-                except Exception as e:
-                    ee = e.__traceback__.tb_lineno
-                    log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
-                    state = 0
-                    takeTime = baseCore.getTimeCost(start_time, time.time())
-                    baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
-                    continue
-            # for incremental runs
-            # if selects:
-            #     break
-        except:
-            log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
-            state = 0
-            takeTime = baseCore.getTimeCost(start_time, time.time())
-            baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
-        page += 1
-    log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
+    while True:
+        start_time = time.time()
+        social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
+        # social_code = '91330000747735638J'
+        if not social_code or social_code == 'None':
+            time.sleep(20)
+        data = baseCore.getInfomation(social_code)
+        gpdm = data[3]
+        log.info(f'{social_code}==={gpdm}===开始采集')
+        # if gpdm == '' or not gpdm:
+        #     log.error(f'{social_code}===股票代码为空')
+        #     continue
+        gpdm_ = gpdm.split('.')[0]
+        if len(gpdm_) != 5:
+            gpdm_ = gpdm_.zfill(5)
+        page = 1
+        num_ok = 0
+        num_error =0
+        while True:
+            url = f'http://stock.finance.sina.com.cn/hkstock/go.php/CompanyNews/page/{page}/code/{gpdm_}/.phtml'
+            soup = getrequests(url)
+            if '拒绝访问' in soup.text:
+                log.error(f'{social_code}===ip封禁')
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'{social_code}===ip封禁')
+                # r.rpush('NewsEnterprise:xgqy_nyse_socialCode',social_code)
+                time.sleep(1800)
+                break
+            next_flg = soup.find('div',class_='part02').text
+            if '暂无数据' in next_flg:
+                break
+            try:
+                li_list = soup.find('ul', class_='list01').find_all('li')
+                for li in li_list:
+                    try:
+                        a = li.find('a')
+                        if a:
+                            title = a.text
+                            if title == '':
+                                continue
+                            href = a.get('href')
+                            selects = selectUrl(href,social_code)
+                            if selects:
+                                log.info(f'{href}===已采集过')
+                                continue
+                            pub_time = format_time(li.find('span').text)
+                            print(title)
+                            flag = getDic(social_code,title,href,pub_time)
+                            if flag == 1:
+                                num_ok += 1
+                            else:
+                                num_error += 1
+                            time.sleep(0.5)
+                    except Exception as e:
+                        ee = e.__traceback__.tb_lineno
+                        log.error(f'{social_code}===信息采集失败==原因:{ee}行 {e}')
+                        state = 0
+                        takeTime = baseCore.getTimeCost(start_time, time.time())
+                        baseCore.recordLog(social_code, taskType, state, takeTime, url, f'信息采集失败==原因:{ee}行 {e}')
+                        continue
+                # for incremental runs
+                # if selects:
+                #     break
+            except:
+                log.error(f"{social_code}==={gpdm}===第{page}页获取信息列表失败")
+                state = 0
+                takeTime = baseCore.getTimeCost(start_time, time.time())
+                baseCore.recordLog(social_code, taskType, state, takeTime, url, f'获取信息列表失败')
+            page += 1
+        log.info(f'{social_code}==={gpdm}===企业整体耗时{baseCore.getTimeCost(start_time, time.time())}===成功{num_ok}条,失败{num_error}条')
...
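The large hunk above is mostly re-indentation: the body of doJob moves under an outer `while True:` that pulls one credit code from Redis per pass instead of using the hardcoded test code. One detail worth noting: the empty-queue branch sleeps but does not `continue`, so a 'None' pull still flows into baseCore.getInfomation. A minimal sketch of the intended loop, with that fix and the wrapper name marked as my assumptions:

import time

def doJob():
    while True:
        social_code = baseCore.redicPullData('NewsEnterprise:xgqy_nyse_socialCode')
        if not social_code or social_code == 'None':
            time.sleep(20)  # queue empty: wait before polling again
            continue        # assumption: absent in the commit, which falls through
        process_company(social_code)  # hypothetical wrapper for the per-company body above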