提交 a0ee390b 作者: XveLingKun

证监会公告

上级 79448081
......@@ -51,18 +51,29 @@ def convert_size(size_bytes):
return f"{size_bytes:.2f} {units[i]}"
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'acw_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; cdn_sec_tc=b7ccf59b17183478493685990e058ea61e8fcc97b3e3ceeb9ca72237eb; acw_sc__v3=666be84cf4454ec8c2436572df9f9e6dc78b409b; tfstk=fqG-Yit_Tnxu599nN49m-_AIYB8Dsb3ru0u1tkXQNmEI7DCnr3uQAkigmbqor2ZKpoisr_xrxDEIS2DuVDDuAyiif42H87mY9mn0qT0H46hKjmf3V3PSp6FriYf3q3PKRcVpjhAMs4uzaWtMjvDJxuN_WgaCxuNbhEjiRARMs4u5rzTilCYevQh4AkNCFk6Xky48OzwSAowb5Pj7OWiIlEU37k1QAzgfGzznwK58vaZN9vHWr405ar5CObNUelUOK6CLOzwRU4Zxy4hYy8ETn4oFqbimRbcz3KW0TqDtvvi6mTqSBPnYIYKOwcnzRmFo1eJz3JGK9rk2v9Etd4DZd-LWNqF82-U8eaBQwvir98kR8FubNmkab8920rhosJEaHitSoqE7Bvnk06ZoBqiYIjjcs5MZDXe_1gPEsfIB8GqT-TTvk9WUFrfOmApHgpiI6rEMumWFL-U4klYvk9WUFrzYjEyVL9yYu',
'Host': 'static.sse.com.cn',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
# headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
try:
file_size = int(response.headers.get('Content-Length'))
break
except:
file_size = 0
break
except Exception as e:
time.sleep(3)
continue
page_size = 0
......@@ -78,7 +89,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
except Exception as e:
log.error(f'文件损坏')
return retData
......@@ -156,10 +167,11 @@ def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
@retry(tries=3, delay=5)
def RequestUrl(url, payload, social_code,start_time):
ip = baseCore.get_proxy()
# ip = baseCore.get_proxy()
# proxy = {'https': 'http://127.0.0.1:8888', 'http': 'http://127.0.0.1:8888'}
response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
# response = requests.post(url=url, headers=headers, data=payload, proxies=ip)
response = requests.post(url=url, headers=headers, data=payload)
# response = requests.post(url=url, data=payload)
response.encoding = response.apparent_encoding
if response.status_code == 200:
......@@ -463,6 +475,13 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
pub_time = date_object.strftime("%Y-%m-%d %H:%M:%S")
year = pub_time[:4]
report_type = td_list[4].text.strip()
# 获取当前年份
current_year = datetime.now().year
# print(current_year)
if int(current_year) < int(year):
continue
if str(current_year)[:1] < year[:1]: # 防止年份出现6005这种切出来股票代码的情况
continue
# 判断数据库中是否有该条资讯
ifexist = ifInstert(short_name, social_code, pdf_url)
......@@ -489,7 +508,7 @@ def SpiderByZJH(url, payload, dic_info, start_time,num): # dic_info 数据库
else:
log.info(f'======={short_name}========{code}===已存在')
# continue
break
return
if __name__ == '__main__':
num = 0
......@@ -528,8 +547,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
social_code = '91370000163446410B'
social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode_add')
# social_code = '91370000163446410B'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论