提交 20a0eb97 作者: 刘伟刚

修改代码提交

上级 b376f641
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -64,8 +64,9 @@ class YahooCaiwu(object):
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
......@@ -376,6 +377,10 @@ class YahooCaiwu(object):
#对比指标计算
def calculateIndexReq(self):
get_url = 'http://114.115.236.206:8088/sync/calculateIndex'
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
try:
params={
'type':2
......@@ -399,6 +404,7 @@ if __name__ == '__main__':
# parse_excel()
#get_content1()
yahoo=YahooCaiwu()
while True:
securitiescode=''
try:
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -58,74 +58,80 @@ class YahooCaiwu(object):
# 雅虎财经处理表格
def deal_table(self,doc_resp):
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
try:
all_dict = {}
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
if pq(doc_item).text() == '':
continue
a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
content_dict[a] = a_list
b_dict = {}
for doc_item1 in pq(doc_item).children()[1]:
b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
if not b:
if len(doc_items)<1:
resp1_table = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(4)>div>div').children()
if resp1_table:
catalogue_title = pq(resp1_table[0]).text().split('\n')
doc_items = pq(resp1_table[1]).children()
catalogue_dict = {}
content_dict = {}
for doc_item in doc_items:
if pq(doc_item).text() == '':
continue
b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
content_dict[b] = b_list
c_dict = {}
for doc_item2 in pq(doc_item1).children()[1]:
c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
if not c:
a = pq(pq(doc_item).children()[0]).text().split('\n')[0]
a_list = pq(pq(doc_item).children()[0]).text().split('\n')[1:]
content_dict[a] = a_list
b_dict = {}
for doc_item1 in pq(doc_item).children()[1]:
b = pq(pq(doc_item1).children()[0]).text().split('\n')[0]
if not b:
continue
c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
content_dict[c] = c_list
d_dict = {}
for doc_item3 in pq(doc_item2).children()[1]:
d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
if not d:
b_list = pq(pq(doc_item1).children()[0]).text().split('\n')[1:]
content_dict[b] = b_list
c_dict = {}
for doc_item2 in pq(doc_item1).children()[1]:
c = pq(pq(doc_item2).children()[0]).text().split('\n')[0]
if not c:
continue
d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
content_dict[d] = d_list
e_dict = {}
for doc_item4 in pq(doc_item3).children()[1]:
e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
if not e:
c_list = pq(pq(doc_item2).children()[0]).text().split('\n')[1:]
content_dict[c] = c_list
d_dict = {}
for doc_item3 in pq(doc_item2).children()[1]:
d = pq(pq(doc_item3).children()[0]).text().split('\n')[0]
if not d:
continue
e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
content_dict[e] = e_list
f_dict = {}
for doc_item5 in pq(doc_item4).children()[1]:
f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
if not f:
d_list = pq(pq(doc_item3).children()[0]).text().split('\n')[1:]
content_dict[d] = d_list
e_dict = {}
for doc_item4 in pq(doc_item3).children()[1]:
e = pq(pq(doc_item4).children()[0]).text().split('\n')[0]
if not e:
continue
f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
content_dict[f] = f_list
g_dict = {}
for doc_item6 in pq(doc_item5).children()[1]:
g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
if not g:
e_list = pq(pq(doc_item4).children()[0]).text().split('\n')[1:]
content_dict[e] = e_list
f_dict = {}
for doc_item5 in pq(doc_item4).children()[1]:
f = pq(pq(doc_item5).children()[0]).text().split('\n')[0]
if not f:
continue
g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
content_dict[g] = g_list
g_dict[g] = {}
f_dict[f] = g_dict
e_dict[e] = f_dict
d_dict[d] = e_dict
c_dict[c] = d_dict
b_dict[b] = c_dict
catalogue_dict[a] = b_dict
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
f_list = pq(pq(doc_item5).children()[0]).text().split('\n')[1:]
content_dict[f] = f_list
g_dict = {}
for doc_item6 in pq(doc_item5).children()[1]:
g = pq(pq(doc_item6).children()[0]).text().split('\n')[0]
if not g:
continue
g_list = pq(pq(doc_item6).children()[0]).text().split('\n')[1:]
content_dict[g] = g_list
g_dict[g] = {}
f_dict[f] = g_dict
e_dict[e] = f_dict
d_dict[d] = e_dict
c_dict[c] = d_dict
b_dict[b] = c_dict
catalogue_dict[a] = b_dict
all_dict['表头'] = catalogue_title
all_dict['目录'] = catalogue_dict
all_dict['内容'] = content_dict
except Exception as e:
all_dict['表头'] = {}
all_dict['目录'] = {}
all_dict['内容'] = {}
return all_dict
......@@ -157,7 +163,8 @@ class YahooCaiwu(object):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' """ # and stock_code = "SYNH"
# sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='RAIZ4.SA' """ # and stock_code = "SYNH"
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where exchange='8' and any_data='0' """ # and stock_code = "SYNH"
# sql1 = f"select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='{securitiescode}' " # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
......@@ -396,6 +403,7 @@ class YahooCaiwu(object):
print('调用接口成功!!')
except:
print('调用失败!')
if __name__ == '__main__':
# parse_excel()
#get_content1()
......
# import redis
#
#
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn',db=3)
#
# # 获取所有键
# keys = r.keys('*')
# # print(keys)
# for key in keys:
# f_key = key.decode()
# print(f_key)
# print("----------")
# res = r.exists(f_key)
# value = list(r.smembers(f_key))
# # 对列表进行排序
# value.sort()
# # 遍历排序后的列表
# list_data = []
# for member in value:
# member = member.decode()
# members = member.strip('[').strip(']').replace('\'','').strip().split(',')
# #获取每一个报告期
# for date in members:
# data = date.strip()
# # print(date.strip())
# list_data.append(data)
# # 放入redis
# for item in list_data:
# r.sadd(key, item)
#
# # 获取Set中的所有元素
# items = r.smembers(key)
# # print(items)
# print("======================================")
import re
from urllib.parse import quote, unquote
import requests
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import json
import difflib
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import datetime
timestamp = 1688054400  # example Unix timestamp used to sanity-check the conversion below
date = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
# Browser-like request headers for quotes.sina.com.cn.
# NOTE(review): the Cookie was captured from a live browser session and will
# eventually expire — refresh it if the Sina requests start failing.
headers={
'Connection':'keep-alive',
'Pragma':'no-cache',
'Cache-Control':'no-cache',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-User':'?1',
'Sec-Fetch-Dest':'document',
'Referer':'https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cookie':'UOR=,finance.sina.com.cn,; SINAGLOBAL=123.149.3.173_1695815968.404462; Apache=123.149.3.173_1695815968.404463; ULV=1695816017391:2:2:2:123.149.3.173_1695815968.404463:1695815967476; lxlrttp=1578733570; U_TRS1=000000ad.bc7f83f51.651419db.690100f2; U_TRS2=000000ad.bc8a83f51.651419db.138fca70; SUB=_2AkMSSJVgf8NxqwFRmP0XzG3kbIxxyA_EieKkFGS7JRMyHRl-yD9kqhY-tRB6Oci7j27VGy6gikgIaUYBZsIPzk3PbLLC; hqEtagMode=1',
}
# Local proxy endpoints; only used where a request explicitly passes `proxies=proxy`.
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
def reqHtml(url):
    """Fetch *url* with the shared Sina browser headers and return the decoded body.

    TLS verification is disabled and the response is decoded as GB18030,
    matching the encoding served by quotes.sina.com.cn.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=10)
    response.encoding = 'GB18030'
    return response.text
# Request headers for www.qyyjt.cn API calls.
# NOTE(review): 'pcuss' looks like a short-lived session JWT and 'Cookie'
# carries WAF session values — both were captured from a browser and will
# expire; refresh them from a logged-in session when requests start failing.
headers2={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Content-Length':'0',
'Cookie':'HWWAFSESID=fd8b573695b0ce804b; HWWAFSESTIME=1695799275143',
'Host':'www.qyyjt.cn',
'Origin':'https://www.qyyjt.cn',
'Pragma':'no-cache',
'Referer':'https://www.qyyjt.cn/detail/enterprise/overview?code=56CD928FAD278663E73BE7486C764DA7&type=company',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'client':'pc-web;pro',
'dataid':'869',
'pcuss':'eyJ0eXAiOiJKV1QiLCJ0eXBlIjoiand0IiwiYWxnIjoiSFMyNTYifQ.eyJjcmVhdGVUaW1lIjoiMjAyMy0wOS0yNyAyMDoxODowMy40NDkiLCJleHAiOjE2OTU4MTc5ODMsInVzZXJJZCI6IjIwMjMwOTI3MTUyMzA0XzEzNTkyNDgxODM5IiwiZXhwaXJlZFRpbWUiOiIyMDIzLTA5LTI3IDIwOjMzOjAzLjQ0OSJ9.SouwRylKogHfJILh97JMnYRzcJuj2Hg30BmQa9gc-Nc',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"',
'system':'new',
'terminal':'pc-web;pro',
'user':'847E223529194582C37A02EEEC8AC09F0D7AD12E40778D6CA9CFB91F69F8C537',
'ver':'20230914',
'x-request-id':'x1eCRO-X8D7',
}
def reqPostMsg(url):
    """POST to *url* with the qyyjt.cn session headers and return the UTF-8 body.

    TLS verification is disabled; no request body is sent (Content-Length: 0).
    """
    response = requests.post(url, headers=headers2, verify=False, timeout=10)
    response.encoding = 'utf-8'
    return response.text
def get_realurl(tmpurl):
    """Extract the percent-decoded target URL embedded in a redirect link.

    Searches *tmpurl* for a ``url=...&aid`` segment and returns the decoded
    capture group; returns '' when no such segment exists or the input is
    not a string.
    """
    try:
        # Non-greedy: capture everything between 'url=' and the next '&aid'.
        match = re.search(r'url=(.{1,}?)&aid', tmpurl)
        if match:
            result = unquote(match.group(1))
        else:
            result = ''
    except TypeError:
        # Defensive: a non-string argument yields '' instead of raising,
        # preserving the original best-effort contract (was a bare except).
        result = ''
    return result
def getFormatedate(timestamp):
date = datetime.datetime.fromtimestamp(timestamp)
formatted_date = date.strftime('%Y-%m-%d')
return formatted_date
# Debug/demo run: print the example timestamp conversion, then fetch the
# Sina quarterly income page for BRK.A and dump its financial-data table.
print(date)
url='https://quotes.sina.com.cn/usstock/hq/income.php?s=brk.a&t=quarter'
ttext=reqHtml(url)
soup=BeautifulSoup(ttext,'html.parser')
# First (and only expected) data table inside the table wrapper div.
tdoc=soup.select('div[class="tbl_wrap"]>table[class="data_tbl os_tbl"]')[0]
print(str(tdoc))
......@@ -461,7 +461,7 @@ def listPage():
}
]
for operand in operands:
logger.info(f'采集地域股票信息{operand}')
rego=operand['operands'][1]
#第一次请求获取地域总共有的股票代码数量
try:
stockmsg=reqmsg(0,operand)
......@@ -469,21 +469,23 @@ def listPage():
except Exception as e:
logger.info(f'region该地域没有股票信息{operand}')
continue
logger.info(f'采集地域股票信息{rego}---对应的数量{total}')
for i in range(0,total,100):
logger.info(f"offset的值{i}")
stockmsg=reqmsg(i,operand)
if stockmsg:
try:
getStock(stockmsg)
getStock(stockmsg,rego)
except Exception as e:
logger.info(f"解析失败{e}")
time.sleep(3)
def getStock(stockmsg):
def getStock(stockmsg,rego):
quotes=stockmsg['finance']['result'][0]['quotes']
for quote in quotes:
symbol=quote['symbol']
logger.info(f"{rego}地区对应的股票代码{symbol}")
try:
longName=quote['longName']
except:
......
......@@ -169,15 +169,13 @@ class QQnewsTaskJob(object):
qqnewsSpider.get_page_html()
except Exception as e:
logger.info('搜狗搜索异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
if qqnewsSpider.detailList.qsize() != 0:
try:
qqnewsSpider.get_detail_html()
except Exception as e:
logger.info('详情解析异常'+searchkw)
finally:
qqnewsSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
if __name__ == '__main__':
# ss='道地西洋参+(销售市场|交易市场|直播带货|借助大会平台|网店|微商|电商|农民博主|推介宣传|高品质定位|西洋参产品经营者加盟|引进龙头企业|西洋参冷风库|建设农旅中心|农产品展销中心|精品民宿|温泉)'
......
#coding=utf-8
#coding=utf-8
......@@ -402,6 +402,7 @@ class BaiduSpider(object):
# 使用langid.py判断文本的语言
lang, confidence = langid.classify(text)
return lang
# 获取详情页
def get_detail_html(self):
# 获取当前窗口的句柄
......
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
......@@ -190,7 +190,7 @@ if __name__ == '__main__':
while True:
try:
codeList=[]
codeList.append('KW-20230818-0003')
codeList.append('KW-20230925-0002')
for codeid in codeList:
try:
# keymsg=baiduTaskJob.getkafka()
......@@ -207,7 +207,7 @@ if __name__ == '__main__':
continue
if kwList:
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
# 获取任务的执行结果
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论