提交 98ca1672 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

...@@ -49,6 +49,7 @@ def RequestUrl(url, payload, item_id, start_time): ...@@ -49,6 +49,7 @@ def RequestUrl(url, payload, item_id, start_time):
if response.status_code == 200: if response.status_code == 200:
# 请求成功,处理响应数据 # 请求成功,处理响应数据
# print(response.text) # print(response.text)
soup = BeautifulSoup(response.text, 'html.parser')
pass pass
else: else:
# 请求失败,输出错误信息 # 请求失败,输出错误信息
...@@ -56,9 +57,7 @@ def RequestUrl(url, payload, item_id, start_time): ...@@ -56,9 +57,7 @@ def RequestUrl(url, payload, item_id, start_time):
state = 0 state = 0
takeTime = baseCore.getTimeCost(start_time, time.time()) takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(item_id, taskType, state, takeTime, url, '请求失败') baseCore.recordLog(item_id, taskType, state, takeTime, url, '请求失败')
soup = ''
soup = BeautifulSoup(response.text, 'html.parser')
return soup return soup
...@@ -89,6 +88,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -89,6 +88,8 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
# years = dic_info['call_year'] # years = dic_info['call_year']
short_name = dic_info[4] short_name = dic_info[4]
soup = RequestUrl(url, payload, item_id, start_time) soup = RequestUrl(url, payload, item_id, start_time)
if soup == '':
return
# 先获取页数 # 先获取页数
page = soup.find('div', class_='pages').find('ul', class_='g-ul').text page = soup.find('div', class_='pages').find('ul', class_='g-ul').text
...@@ -106,8 +107,9 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -106,8 +107,9 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
else: else:
# http://eid.csrc.gov.cn/101811/index_3_f.html # http://eid.csrc.gov.cn/101811/index_3_f.html
href = url.split('index')[0] + f'index_{i}_f.html' href = url.split('index')[0] + f'index_{i}_f.html'
soup = RequestUrl(href, payload, item_id, start_time) soup = RequestUrl(href, payload, item_id, start_time)
if soup == '':
continue
tr_list = soup.find('div', id='txt').find_all('tr') tr_list = soup.find('div', id='txt').find_all('tr')
for tr in tr_list[1:]: for tr in tr_list[1:]:
td_list = tr.find_all('td') td_list = tr.find_all('td')
...@@ -210,7 +212,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time): ...@@ -210,7 +212,7 @@ def SpiderByZJH(url, payload, dic_info, num, start_time):
baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e) baseCore.recordLog(item_id, taskType, state, takeTime, pdf_url, e)
continue continue
else: else:
continue continue
def getUrl(code, url_parms, Catagory2_parms): def getUrl(code, url_parms, Catagory2_parms):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论