提交 623b6803 作者: 薛凌堃

Merge remote-tracking branch 'origin/master'

#coding=utf-8
from urllib.parse import urljoin
import pymysql
import requests
from bs4 import BeautifulSoup
...@@ -119,7 +121,7 @@ class SougouSpider(object): ...@@ -119,7 +121,7 @@ class SougouSpider(object):
"connection":"Keep-Alive", "connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36" "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
} }
url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.' # url = 'https://www.sogou.com/link?url=hedJjaC291NbWrwHYHKCyPQj_ei8OKC13fJZ5YRQyvgjcXe6RUhCEXfbi95UdEys0ztd7q5nl6o.'
res = requests.get(url,headers=header) res = requests.get(url,headers=header)
text=res.text text=res.text
# 定义正则表达式 # 定义正则表达式
...@@ -321,7 +323,7 @@ class SougouSpider(object): ...@@ -321,7 +323,7 @@ class SougouSpider(object):
detailurl=detailmsg['detailUrl'] detailurl=detailmsg['detailUrl']
title = detailmsg['title'] title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title) content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag) contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e: except Exception as e:
content='' content=''
contentWithTag='' contentWithTag=''
...@@ -467,9 +469,10 @@ class SougouSpider(object): ...@@ -467,9 +469,10 @@ class SougouSpider(object):
# } # }
# return detailmsg # return detailmsg
def rmTagattr(self,html): def rmTagattr(self,html,url):
# 使用BeautifulSoup解析网页内容 # 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(html, 'html.parser') # soup = BeautifulSoup(html, 'html.parser')
soup = self.paserUrl(html,url)
# 遍历所有标签,并去掉属性 # 遍历所有标签,并去掉属性
for tag in soup.find_all(True): for tag in soup.find_all(True):
if tag.name == 'img': if tag.name == 'img':
...@@ -482,6 +485,21 @@ class SougouSpider(object): ...@@ -482,6 +485,21 @@ class SougouSpider(object):
# print(soup.prettify()) # print(soup.prettify())
html=soup.prettify() html=soup.prettify()
return html return html
# 将html中的相对地址转换成绝对地址
def paserUrl(self, html, listurl):
    """Rewrite relative URLs in *html* into absolute ones.

    :param html: HTML text (anything BeautifulSoup accepts).
    :param listurl: base URL of the page the HTML was fetched from,
        used as the join base for relative links.
    :return: the parsed BeautifulSoup object with rewritten links.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only <a> and <img> carry the links this crawler keeps.
    for link in soup.find_all(['a', 'img']):
        # Fix: two independent checks instead of if/elif, so a tag that
        # carries both attributes gets both rewritten (the original
        # elif silently skipped 'src' whenever 'href' was present).
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        if 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return soup
def getProcessitem(self,bdetail): def getProcessitem(self,bdetail):
nowDate=self.getNowDate() nowDate=self.getNowDate()
content=bdetail['content'] content=bdetail['content']
...@@ -531,6 +549,6 @@ class SougouSpider(object): ...@@ -531,6 +549,6 @@ class SougouSpider(object):
if __name__ == '__main__':
    zhuce = SougouSpider()
    # zhuce.run()
    # zhuce.driver.close()
\ No newline at end of file
#coding=utf-8
from urllib.parse import urljoin
import pymysql
from bs4 import BeautifulSoup
from gne import GeneralNewsExtractor
...@@ -151,7 +153,36 @@ class BaiduSpider(object): ...@@ -151,7 +153,36 @@ class BaiduSpider(object):
# 将时间转换为字符串 # 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S") currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
return currentdate return currentdate
def rmTagattr(self, html, url):
    """Strip tag attributes from *html*, keeping only 'src'.

    Relative links are first absolutized via paserUrl(). Every tag then
    keeps only its 'src' attribute (relevant for <img>); all other
    attributes — including the freshly absolutized 'href' — are dropped.
    NOTE(review): the original if/elif/else was redundant: the 'img' and
    non-'img' branches were identical and the final else was unreachable,
    so the three branches collapse to this single rule with no behavior
    change. If <a href> was meant to survive, that is a separate bug to
    confirm with the author.

    :param html: HTML fragment to clean.
    :param url: base URL used to resolve relative links.
    :return: prettified HTML string.
    """
    # Resolve relative href/src values before attributes are stripped.
    soup = self.paserUrl(html, url)
    for tag in soup.find_all(True):
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    # print(soup.prettify())
    return soup.prettify()
# 将html中的相对地址转换成绝对地址
def paserUrl(self, html, listurl):
    """Rewrite relative URLs in *html* into absolute ones.

    :param html: HTML text (anything BeautifulSoup accepts).
    :param listurl: base URL of the page the HTML was fetched from,
        used as the join base for relative links.
    :return: the parsed BeautifulSoup object with rewritten links.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only <a> and <img> carry the links this crawler keeps.
    for link in soup.find_all(['a', 'img']):
        # Fix: two independent checks instead of if/elif, so a tag that
        # carries both attributes gets both rewritten (the original
        # elif silently skipped 'src' whenever 'href' was present).
        if 'href' in link.attrs:
            link['href'] = urljoin(listurl, link['href'])
        if 'src' in link.attrs:
            link['src'] = urljoin(listurl, link['src'])
    return soup
#智能抽取 #智能抽取
def paserDetail(self,detailhtml,detailurl): def paserDetail(self,detailhtml,detailurl):
try: try:
...@@ -298,7 +329,7 @@ class BaiduSpider(object): ...@@ -298,7 +329,7 @@ class BaiduSpider(object):
detailurl=detailmsg['detailUrl'] detailurl=detailmsg['detailUrl']
title = detailmsg['title'] title = detailmsg['title']
content,contentWithTag=self.extractorMsg(detailurl,title) content,contentWithTag=self.extractorMsg(detailurl,title)
contentWithTag=self.rmTagattr(contentWithTag) contentWithTag=self.rmTagattr(contentWithTag,detailurl)
except Exception as e: except Exception as e:
content='' content=''
contentWithTag='' contentWithTag=''
...@@ -445,21 +476,7 @@ class BaiduSpider(object): ...@@ -445,21 +476,7 @@ class BaiduSpider(object):
# 'kword':kword # 'kword':kword
# } # }
# return detailmsg # return detailmsg
def rmTagattr(self, html):
    """Strip tag attributes from *html*, keeping only 'src'.

    Every tag keeps only its 'src' attribute (relevant for <img>); all
    other attributes are dropped. NOTE(review): the original
    if/elif/else was redundant — the 'img' and non-'img' branches were
    identical and the final else was unreachable — so the three branches
    collapse to this single rule with no behavior change.

    :param html: HTML fragment to clean.
    :return: prettified HTML string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(True):
        tag.attrs = {key: value for key, value in tag.attrs.items() if key == 'src'}
    # print(soup.prettify())
    return soup.prettify()
def getProcessitem(self,bdetail): def getProcessitem(self,bdetail):
nowDate=self.getNowDate() nowDate=self.getNowDate()
content=bdetail['content'] content=bdetail['content']
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论