提交 e00e2f5b 作者: 薛凌堃

11/28

上级 119a9a33
import os
import time
import requests
from retry import retry
from base.BaseCore import BaseCore
# Shared project helper object; this script uses it for its logger, for
# redicPullData (presumably a Redis queue pop - confirm in BaseCore) and close().
baseCore = BaseCore()
log = baseCore.getLogger()
# HTTP headers sent with every download request made by getContent.
headers = {
# User-Agent string of the WeChat mini-program runtime on Windows.
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309080f)XWEB/8461',
# NOTE(review): a Content-Type on a body-less GET is unusual; presumably the
# target server expects it - confirm before removing.
'Content-Type': 'application/octet-stream',
}
@retry(tries=3, delay=5)
def getContent(url):
    """Download ``url`` and return the raw response body.

    The request is sent with the module-level ``headers`` and a 120 second
    timeout. The function is wrapped by ``retry(tries=3, delay=5)``, so a
    raised exception triggers another attempt according to that decorator.

    :param url: address of the file to download.
    :return: the undecoded response body as ``bytes``.
    :raises requests.HTTPError: when the status code is anything but 200.
    """
    req = requests.get(url, headers=headers, timeout=120)
    if req.status_code != 200:
        # A bare ``raise`` with no active exception only yields
        # "RuntimeError: No active exception to re-raise"; raise an explicit
        # error that carries the status code and the response instead.
        raise requests.HTTPError(
            f'unexpected status {req.status_code} for {url}', response=req
        )
    # The raw bytes are returned, so no text-encoding detection is needed.
    return req.content
if __name__ == '__main__':
    # Links already handled during this run (set gives O(1) membership).
    url_list = set()
    # Number of times each raw file name has been pulled from the queue.
    count_dict = {}
    # Name applied to the links that follow it; None until a name arrives.
    file_name = None
    try:
        while True:
            item = baseCore.redicPullData('Download:gwshrfe')
            if not item or item == 'None':
                log.info('已没有数据')
                # Wait before polling again so an empty queue does not
                # become a busy loop that floods the log.
                time.sleep(2)
                continue
            if 'http' not in item:
                # The entry is a file name. The first occurrence is used
                # as-is; later occurrences get a "_<count>" suffix.
                file_name_ = item
                count_dict[file_name_] = count_dict.get(file_name_, 0) + 1
                if count_dict[file_name_] > 1:
                    file_name = file_name_ + '_' + str(count_dict[file_name_])
                else:
                    file_name = file_name_
                continue
            # Otherwise the entry is a download link.
            url = item
            if url in url_list:
                log.info(f'{url}该链接已处理过')
                continue
            if file_name is None:
                # A link arrived before any name entry; without this guard
                # the f-strings below would fail with NameError.
                log.error(f'{url}===缺少文件名')
                continue
            log.info(f'{file_name}==={url}===开始采集')
            try:
                content = getContent(url)
            except Exception:
                # Failed links are only logged, not re-queued
                # (a former ``r.sadd('gwshrfe', url)`` call is disabled).
                log.error(f'{file_name}==={url}===解析失败')
                time.sleep(2)
                continue
            # The saved file keeps whatever extension the link ends with.
            category = os.path.splitext(url)[1]
            path = f'./文件1/{file_name}'
            os.makedirs(path, exist_ok=True)
            file = f'{path}/{file_name}{category}'
            try:
                with open(file, 'wb') as f:
                    f.write(content)
                log.info(f'{url}===下载成功')
            except Exception:
                log.error(f'{url}===下载失败')
            # The link counts as handled whether or not the write succeeded.
            url_list.add(url)
            time.sleep(2)
    finally:
        # The loop never ends normally, so close() was unreachable when it
        # followed the loop; run it on any exit (Ctrl+C, unexpected error).
        baseCore.close()
\ No newline at end of file
import functools
import http.server
import socketserver

# TCP port the static file server listens on.
PORT = 8001
# Directory whose contents are served.
DIRECTORY = r"D:\kkwork\zzsn_spider\SASAC"

# Bind the directory through the handler's constructor argument.
# SimpleHTTPRequestHandler.__init__ assigns ``self.directory`` on every
# instance (falling back to the current working directory), so setting a
# class attribute afterwards is ignored and would also modify the shared
# standard-library class.
Handler = functools.partial(
    http.server.SimpleHTTPRequestHandler, directory=DIRECTORY
)

with socketserver.TCPServer(("", PORT), Handler) as httpd:
    print("Serving at port", PORT)
    httpd.serve_forever()
\ No newline at end of file
......@@ -728,6 +728,12 @@ class BaseCore:
#
# return retData
def deliteATT(self, id):
    """Delete the ``clb_sys_attachment`` row whose ``id`` column equals ``id``.

    Executes the statement on ``self.cursor_`` and commits on ``self.cnx_``.

    :param id: value of the ``id`` column identifying the row to delete.
    """
    # Pass the value as a bound parameter (the same ``%s`` style secrchATT
    # uses) instead of formatting it into the SQL text, so quotes inside the
    # value can neither break nor alter the statement.
    delitesql = "delete from clb_sys_attachment where id = %s"
    self.cursor_.execute(delitesql, (id,))
    self.cnx_.commit()
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
......
......@@ -126,14 +126,19 @@ def NoticeEnterprise_task():
def NoticeDF():
cnx, cursor = connectSql()
# 获取美股企业
mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
cursor.execute(mg_query)
cnx.commit()
mg_result = cursor.fetchall()
mg_social_list = [item[0] for item in mg_result]
print('=======')
for item in mg_social_list:
r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)
# # mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
# mg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=3 AND a.Place=0 AND SecuritiesCode is not null AND SecuritiesCode not LIKE '%.%'"
# cursor.execute(mg_query)
# cnx.commit()
# mg_result = cursor.fetchall()
# mg_social_list = [item[0] for item in mg_result]
# print('=======')
# for item in mg_social_list:
# if r.lrem('NoticeEnterprise:mgqy_socialCode_add', 0, item) == 0:
# r.lpush('NoticeEnterprise:mgqy_socialCode_add', item)
# else:
# continue
# # r.rpush('NoticeEnterprise:mgqy_socialCode_add', item)
# 获取港股企业
gg_query = "SELECT a.SocialCode From EnterpriseInfo a ,EnterpriseType b WHERE a.SocialCode = b.SocialCode AND b.type=2 And SecuritiesCode like '%.HK'"
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论