Commit baba9e5d  Author: 薛凌堃

Maintenance of the policy and regulation scripts

Parent f2ff6737
@@ -505,27 +505,36 @@ class BaseCore:
             for i in range(0, 3):
                 try:
                     response = requests.get(file_href, headers=headers, verify=False, timeout=20)
-                    file_size = int(response.headers.get('Content-Length'))
                     break
-                except:
+                except Exception as e:
                     time.sleep(3)
+                    if i == 2:
+                        return retData
                     continue
+            try:
+                if response.status_code == 200:
+                    file_size = int(response.headers.get('Content-Length'))
+                else:
+                    return retData
+            except:
+                file_size = ''
             for i in range(0, 3):
                 try:
                     name = str(self.getuuid()) + category
                     result = obsClient.putContent('zzsn', 'PolicyDocuments/' + name, content=response.content)
                     break
                 except:
                     time.sleep(3)
                     continue
             time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
             retData['state'] = True
             retData['path'] = result['body']['objectUrl'].split('.com')[1]
             retData['full_path'] = result['body']['objectUrl']
-            retData['file_size'] = self.convert_size(file_size)
+            try:
+                retData['file_size'] = self.convert_size(file_size)
+            except:
+                retData['file_size'] = ''
             retData['create_time'] = time_now
             return retData
         except Exception as e:
......
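Review note: in the old code, int(response.headers.get('Content-Length')) ran inside the retry loop, so a response without a Content-Length header (headers.get returns None, and int(None) raises TypeError) consumed a retry even though the download itself had succeeded. The new code retries only the network call, bails out after the third failure, and parses the size defensively. A minimal standalone sketch of that flow (fetch_with_retries and its defaults are illustrative, not part of the commit):

    import time
    import requests

    def fetch_with_retries(url, headers, attempts=3):
        """Retry only the download; size parsing can no longer eat a retry."""
        response = None
        for i in range(attempts):
            try:
                response = requests.get(url, headers=headers, verify=False, timeout=20)
                break
            except Exception:
                time.sleep(3)
                if i == attempts - 1:   # last attempt failed: give up
                    return None, ''
        # Content-Length is optional in HTTP; parse it defensively
        try:
            file_size = int(response.headers.get('Content-Length'))
        except (TypeError, ValueError):
            file_size = ''
        return response, file_size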
@@ -34,8 +34,8 @@ def get_content3():
     doc_href = soup.find('div', class_='zsy_content')
     try:
         org_content = doc_href.select('.zsy_cotitle')[0]
-        org = re.findall('文章来源:(.*?)发布时间:', org_content)[0].strip()
-    except:
+        org = re.findall('文章来源:(.*?)发布时间:', str(org_content))[0].strip()
+    except Exception as e:
         org = ''
     try:
         contentWithTag = doc_href.find('div', class_='zsy_comain')
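Review note: doc_href.select('.zsy_cotitle')[0] returns a bs4.element.Tag, and re.findall raises "TypeError: expected string or bytes-like object" when handed a Tag, so the old line always fell through to the except branch and org stayed empty. Wrapping the tag in str() lets the regex run over its HTML. A minimal reproduction (the sample HTML is made up for illustration):

    import re
    from bs4 import BeautifulSoup

    html = '<div class="zsy_cotitle"><p>文章来源:国务院国资委 发布时间:2023-01-01</p></div>'
    tag = BeautifulSoup(html, 'html.parser').select('.zsy_cotitle')[0]

    # re.findall(pattern, tag) would raise TypeError; str(tag) yields the HTML
    org = re.findall('文章来源:(.*?)发布时间:', str(tag))[0].strip()
    print(org)  # -> 国务院国资委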
@@ -103,7 +103,7 @@ def get_content3():
         'id': '',  #
         'labels': [{'relationId': "1642", 'relationName': "国务院国资委", 'labelMark': "policy"}],
         # related-label id, related-label name, related-label mark
-        'origin': '',  # authority that published the policy
+        'origin': org,  # authority that published the policy
         'organ': org,  # authority that issued the policy document
         'topicClassification': '',  # policy document category
         'issuedNumber': pub_hao,  # document reference number
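Review note: 'origin' was previously hard-coded to an empty string even when the page carried a publishing authority; it now reuses the org value parsed earlier, so 'origin' and 'organ' fall back to '' together whenever the regex fails. A hedged sketch of the resulting fragment (field set abbreviated; build_policy_record is illustrative, not in the codebase):

    def build_policy_record(org, pub_hao):
        """Assemble the metadata fragment sent downstream (abbreviated)."""
        return {
            'labels': [{'relationId': "1642", 'relationName': "国务院国资委",
                        'labelMark': "policy"}],
            'origin': org,            # publishing authority, '' if parsing failed
            'organ': org,             # issuing authority, same parsed value
            'issuedNumber': pub_hao,  # document reference number
        }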
@@ -168,10 +168,10 @@ def get_content3():
             href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
             # check whether this URL has already been crawled
             is_href = baseTool.db_storage.find_one({'网址': href})
-            if is_href:
-                num += 1
-                log.info('已采集----------跳过')
-                continue
+            # if is_href:
+            #     num += 1
+            #     log.info('已采集----------跳过')
+            #     continue
             title = doc_item('a').attr('title')
             pub_time = doc_item('span').text().replace('[', '').replace(']', '')
         except:
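Review note: with the dedup branch commented out, every listing entry is reprocessed on each run, and the find_one lookup still hits MongoDB even though its result is now ignored. If full re-crawls alternate with incremental ones, a flag would avoid editing comments back and forth (sketch only; should_skip and skip_crawled are not in the codebase):

    def should_skip(href, db_storage, skip_crawled=True):
        """True when href was already collected and dedup is enabled."""
        if not skip_crawled:
            return False              # full re-crawl: process everything
        return db_storage.find_one({'网址': href}) is not None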
@@ -184,9 +184,9 @@ def get_content3():
     end_time = time.time()
     log.info(f'共抓取国资委文件{count}条数据,耗时{end_time - start_time}')
-# partOne()
+partOne()
 # for incremental runs, partTwo() must stay commented out
-partTwo()
+# partTwo()
 if __name__ == "__main__":
......
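Review note: the entry point is toggled by editing comments (partOne() switched on here, partTwo() switched off), which is easy to forget to revert; the inline comment already warns that incremental runs must keep partTwo() disabled. A command-line switch would make the mode explicit (sketch only; the --mode flag and the exact mode-to-function mapping are assumptions):

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description='SASAC policy crawler')
        parser.add_argument('--mode', choices=['incremental', 'full'],
                            default='incremental')
        args = parser.parse_args()
        if args.mode == 'incremental':
            partOne()   # per the inline comment, skip partTwo() here
        else:
            partTwo()   # full pass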