提交 e7e9818f 作者: 薛凌堃

福建省人民政府

上级 2a7b3088
import time import time
...@@ -52,6 +52,7 @@ def getContent(num, url, publishDate): ...@@ -52,6 +52,7 @@ def getContent(num, url, publishDate):
url_ = url.replace(url_, '') url_ = url.replace(url_, '')
soup = getSoup(url) soup = getSoup(url)
policy.paserUrl(soup, url)
contentWithTag = soup.find('div', class_='TRS_Editor') contentWithTag = soup.find('div', class_='TRS_Editor')
try: try:
scripts = contentWithTag.find_all('script') scripts = contentWithTag.find_all('script')
...@@ -65,10 +66,13 @@ def getContent(num, url, publishDate): ...@@ -65,10 +66,13 @@ def getContent(num, url, publishDate):
style.decompose() style.decompose()
except: except:
pass pass
a_list = contentWithTag.find_all('a') a_list = soup.find('div', class_='xl_list1').find_all('a')
for a in a_list: for a in a_list:
fj_href = a.get('href').replace('./', url_) fj_href = a.get('href')
if fj_href:
pass
else:
continue
fj_title = a.text.lstrip().strip() fj_title = a.text.lstrip().strip()
category = os.path.splitext(fj_href)[1] category = os.path.splitext(fj_href)[1]
if category not in fj_title: if category not in fj_title:
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论