11/28

e00e2f5b · 薛凌堃 · 119a9a33 · e00e2f5b · e00e2f5b · e00e2f5b
--- a/SASAC/down.py
+++ b/SASAC/down.py
+import os
+import time
+
+import requests
+from retry import retry
+from base.BaseCore import BaseCore
+baseCore = BaseCore()
+log = baseCore.getLogger()
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8461',
+    'Content-Type': 'application/octet-stream',
+}
+
+
+@retry(tries=3, delay=5)
+def getContent(url):
+    """Download ``url`` and return the raw response body.
+
+    The request is sent with the module-level ``headers`` and a 120 second
+    timeout. The ``retry`` decorator is configured for 3 tries with a
+    5 second delay between them.
+
+    :param url: address of the file to download.
+    :return: the undecoded response body (``bytes``).
+    :raises RuntimeError: if the response status code is not 200.
+    """
+    req = requests.get(url, headers=headers, timeout=120)
+    if req.status_code != 200:
+        # A bare ``raise`` with no active exception only produces an opaque
+        # "No active exception to reraise" RuntimeError; raise the same type
+        # explicitly so the failure names the URL and status involved.
+        raise RuntimeError(f'unexpected HTTP status {req.status_code} for {url}')
+    # ``content`` is the raw byte payload, so no text encoding has to be
+    # detected before returning it.
+    return req.content
+
+
+if __name__ == '__main__':
+    # Consume the Redis list 'Download:gwshrfe'. Entries without 'http' are
+    # treated as a file name; entries containing 'http' are treated as a URL
+    # that is downloaded under the most recently seen file name.
+    processed_urls = set()  # URLs already handled during this run
+    count_dict = {}         # base file name -> number of times it was seen
+    file_name = None        # name applied to the URLs that follow it
+    try:
+        while True:
+            item = baseCore.redicPullData('Download:gwshrfe')
+            if not item or item == 'None':
+                log.info('已没有数据')
+                # Pause before polling again so an empty queue does not turn
+                # into a tight loop that floods the log.
+                time.sleep(2)
+                continue
+
+            if 'http' not in item:
+                # File name entry: a repeated name gets a numeric suffix
+                # (_2, _3, ...) so it does not reuse the earlier directory.
+                seen = count_dict.get(item, 0) + 1
+                count_dict[item] = seen
+                file_name = item if seen == 1 else f'{item}_{seen}'
+                continue
+
+            # Otherwise the entry is a link.
+            url = item
+            if url in processed_urls:
+                log.info(f'{url}该链接已处理过')
+                continue
+            if file_name is None:
+                # A link arrived before any file name; there is nothing to
+                # name the download after, so skip it instead of crashing.
+                log.error(f'{url}===缺少文件名,已跳过')
+                continue
+            log.info(f'{file_name}==={url}===开始采集')
+
+            try:
+                content = getContent(url)
+            except Exception:
+                # ``Exception`` (not a bare except) so Ctrl-C still stops
+                # the script while a download is in progress.
+                log.error(f'{file_name}==={url}===解析失败')
+                time.sleep(2)
+                continue
+            # The saved file keeps the extension found at the end of the URL.
+            category = os.path.splitext(url)[1]
+            path = f'./文件1/{file_name}'
+            file_path = f'{path}/{file_name}{category}'
+
+            try:
+                os.makedirs(path, exist_ok=True)
+                with open(file_path, 'wb') as f:
+                    f.write(content)
+                log.info(f'{url}===下载成功')
+            except OSError:
+                log.error(f'{url}===下载失败')
+            # Mark the URL as handled whether or not the write succeeded.
+            processed_urls.add(url)
+            time.sleep(2)
+    finally:
+        # The loop has no normal exit, so call baseCore.close() when the
+        # script is interrupted or fails.
+        baseCore.close()
\ No newline at end of file
--- a/SASAC/download.py
+++ b/SASAC/download.py
+import functools
+import http.server
+import socketserver
+
+# Port the static file server listens on.
+PORT = 8001
+# Directory whose contents are served.
+DIRECTORY = r"D:\kkwork\zzsn_spider\SASAC"
+# SimpleHTTPRequestHandler sets ``self.directory`` in ``__init__`` (defaulting
+# to the current working directory), so assigning a class attribute afterwards
+# has no effect; bind the ``directory`` keyword argument instead.
+Handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory=DIRECTORY)
+with socketserver.TCPServer(("", PORT), Handler) as httpd:
+    print("Serving at port", PORT)
+    httpd.serve_forever()
\ No newline at end of file
--- a/base/BaseCore.py
+++ b/base/BaseCore.py
@@ -728,6 +728,12 @@ class BaseCore:
    #
    #         return retData

+
+    def deliteATT(self,id):
+        """Delete the ``clb_sys_attachment`` row with the given id and commit.
+
+        :param id: value matched against the ``id`` column.
+        """
+        # Bind the id as a query parameter (as secrchATT does) rather than
+        # formatting it into the SQL text, so quotes in the value cannot
+        # alter the statement.
+        delitesql = "delete from clb_sys_attachment where id = %s"
+        self.cursor_.execute(delitesql, (id,))
+        self.cnx_.commit()
+
    def secrchATT(self, item_id, year, type_id):
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
        self.cursor_.execute(sel_sql, (item_id, year, type_id))

--- a/base/RedisPPData.py
+++ b/base/RedisPPData.py
@@ -126,14 +126,19 @@ def NoticeEnterprise_task():
 def NoticeDF():
    cnx, cursor = connectSql()
    # 获取美股企业
-    mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
-    cursor.execute(mg_query)
-    cnx.commit()
-    mg_result = cursor.fetchall()
-    mg_social_list = [item[0] for item in mg_result]
-    print('=======')
-    for item in mg_social_list:
-        r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)
+    # # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
+    # cursor.execute(mg_query)
+    # cnx.commit()
+    # mg_result = cursor.fetchall()
+    # mg_social_list = [item[0] for item in mg_result]
+    # print('=======')
+    # for item in mg_social_list:
+    #     if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
+    #         r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
+    #     else:
+    #         continue
+    #     # r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)

    # 获取港股企业
    gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 And SecuritiesCode like '%.HK'"

--- a/comData/noticeReport/东方财富网-港股公告.py
+++ b/comData/noticeReport/东方财富网-港股公告.py