提交 623b6803 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
...@@ -119,7 +121,7 @@ class SougouSpider(object): ...@@ -119,7 +121,7 @@ class SougouSpider(object):
"connection":"Keep-Alive", "connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36" "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
} }
url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.' # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
res = requests.get(url,headers=header) res = requests.get(url,headers=header)
text=res.text text=res.text
# 定义正则表达式 # 定义正则表达式
...@@ -321,7 +323,7 @@ class SougouSpider(object): ...@@ -321,7 +323,7 @@ class SougouSpider(object):
detailurl=detailmsg['detailUrl'] detailurl=detailmsg['detailUrl']
title = detailmsg['title'] title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title) content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag) contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e: except Exception as e:
content='' content=''
contentWithTag='' contentWithTag=''
...@@ -467,9 +469,10 @@ class SougouSpider(object): ...@@ -467,9 +469,10 @@ class SougouSpider(object):
# } # }
# return detailmsg # return detailmsg
def rmTagattr(self,html): def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容 # 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
# 遍历所有标签,并去掉属性 # 遍历所有标签,并去掉属性
for tag in soup.find_all(True): for tag in soup.find_all(True):
if tag.name == 'img': if tag.name == 'img':
...@@ -482,6 +485,21 @@ class SougouSpider(object): ...@@ -482,6 +485,21 @@ class SougouSpider(object):
# print(soup.prettify()) # print(soup.prettify())
html=soup.prettify() html=soup.prettify()
return html return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self, html, listurl):
    """Rewrite relative URLs in *html* into absolute ones.

    :param html: HTML text (anything BeautifulSoup accepts).
    :param listurl: base URL of the page the HTML was fetched from,
        used as the join base for relative links.
    :return: the parsed BeautifulSoup object with rewritten links.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only <a> and <img> carry the links this crawler keeps.
    for link in soup.find_all(['a', 'img']):
        # Fix: two independent checks instead of if/elif, so a tag that
        # carries both attributes gets both rewritten (the original
        # elif silently skipped 'src' whenever 'href' was present).
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        if 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return soup
def getProcessitem(self,bdetail): def getProcessitem(self,bdetail):
nowDate=self.getNowDate() nowDate=self.getNowDate()
content=bdetail['content'] content=bdetail['content']
...@@ -531,6 +549,6 @@ class SougouSpider(object): ...@@ -531,6 +549,6 @@ class SougouSpider(object):
if __name__ == '__main__':
    zhuce = SougouSpider()
    # zhuce.run()
    # zhuce.driver.close()
\ No newline at end of file
#coding=utf-8
from urllib.parse import urljoin
import pymysql
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
...@@ -151,7 +153,36 @@ class BaiduSpider(object): ...@@ -151,7 +153,36 @@ class BaiduSpider(object):
# 将时间转换为字符串 # 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S") currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate return currentdate
def rmTagattr(self, html, url):
    """Strip tag attributes from *html*, keeping only 'src'.

    Relative links are first absolutized via paserUrl(). Every tag then
    keeps only its 'src' attribute (relevant for <img>); all other
    attributes — including the freshly absolutized 'href' — are dropped.
    NOTE(review): the original if/elif/else was redundant: the 'img' and
    non-'img' branches were identical and the final else was unreachable,
    so the three branches collapse to this single rule with no behavior
    change. If <a href> was meant to survive, that is a separate bug to
    confirm with the author.

    :param html: HTML fragment to clean.
    :param url: base URL used to resolve relative links.
    :return: prettified HTML string.
    """
    # Resolve relative href/src values before attributes are stripped.
    soup = self.paserUrl(html, url)
    for tag in soup.find_all(True):
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    # print(soup.prettify())
    return soup.prettify()
# 将html中的相对地址转换成绝对地址
def paserUrl(self, html, listurl):
    """Rewrite relative URLs in *html* into absolute ones.

    :param html: HTML text (anything BeautifulSoup accepts).
    :param listurl: base URL of the page the HTML was fetched from,
        used as the join base for relative links.
    :return: the parsed BeautifulSoup object with rewritten links.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only <a> and <img> carry the links this crawler keeps.
    for link in soup.find_all(['a', 'img']):
        # Fix: two independent checks instead of if/elif, so a tag that
        # carries both attributes gets both rewritten (the original
        # elif silently skipped 'src' whenever 'href' was present).
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        if 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return soup
#智能抽取 #智能抽取
def paserDetail(self,detailhtml,detailurl): def paserDetail(self,detailhtml,detailurl):
try: try:
...@@ -298,7 +329,7 @@ class BaiduSpider(object): ...@@ -298,7 +329,7 @@ class BaiduSpider(object):
detailurl=detailmsg['detailUrl'] detailurl=detailmsg['detailUrl']
title = detailmsg['title'] title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title) content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag) contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e: except Exception as e:
content='' content=''
contentWithTag='' contentWithTag=''
...@@ -445,21 +476,7 @@ class BaiduSpider(object): ...@@ -445,21 +476,7 @@ class BaiduSpider(object):
# 'kword':kword # 'kword':kword
# } # }
# return detailmsg # return detailmsg
def rmTagattr(self, html):
    """Strip tag attributes from *html*, keeping only 'src'.

    Every tag keeps only its 'src' attribute (relevant for <img>); all
    other attributes are dropped. NOTE(review): the original
    if/elif/else was redundant — the 'img' and non-'img' branches were
    identical and the final else was unreachable — so the three branches
    collapse to this single rule with no behavior change.

    :param html: HTML fragment to clean.
    :return: prettified HTML string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(True):
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    # print(soup.prettify())
    return soup.prettify()
def getProcessitem(self,bdetail): def getProcessitem(self,bdetail):
nowDate=self.getNowDate() nowDate=self.getNowDate()
content=bdetail['content'] content=bdetail['content']
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论