import re

import requests
from bs4 import BeautifulSoup
from retry import retry

from base.BaseCore import BaseCore
# Shared BaseCore instance; getrequest() calls its get_proxy() before each request.
baseCore = BaseCore()
@retry(tries=3, delay=2)
def getrequest(url_news):
    """Fetch ``url_news`` through a proxy and return the response.

    The proxy settings come from ``baseCore.get_proxy()`` and are passed to
    ``requests.get`` as ``proxies`` (presumably a requests-style proxies
    mapping -- confirm against BaseCore). The call uses a 20 second timeout.

    The function is wrapped in ``retry(tries=3, delay=2)``, so an exception
    raised here triggers another attempt before it reaches the caller
    (presumably up to 3 attempts, 2 seconds apart -- see the ``retry``
    package documentation).

    Args:
        url_news: URL of the page to download.

    Returns:
        The ``requests.Response`` whose status code is 200.

    Raises:
        RuntimeError: the server answered with a status code other than 200.
        requests.RequestException: the request itself failed.
    """
    ip = baseCore.get_proxy()
    res_news = requests.get(url_news, proxies=ip, timeout=20)
    if res_news.status_code != 200:
        # The previous bare ``raise`` had no active exception to re-raise and
        # only worked by accident ("RuntimeError: No active exception to
        # re-raise"). Raise the same exception type explicitly, with a
        # message that says what actually went wrong.
        raise RuntimeError(
            f'unexpected HTTP status {res_news.status_code} for {url_news}'
        )
    return res_news


# Inline-style declarations removed by rm_style_attr(), applied in this order.
# re.DOTALL replaces the former mid-pattern ``(?s)``: Python 3.11+ rejects a
# global flag that is not at the start of the expression, and the resulting
# re.error used to be swallowed, so nothing was ever stripped.
_STRIPPED_STYLE_PATTERNS = tuple(
    re.compile(prop + r':.+?;', re.DOTALL)
    for prop in ('visibility', 'font-family', 'color', 'font-size')
)


def rm_style_attr(soup):
    """Strip selected declarations from inline ``style`` attributes, in place.

    For every element under ``soup`` that has a ``style`` attribute, the
    ``visibility``, ``font-family``, ``color`` and ``font-size`` declarations
    are removed from the attribute value. Afterwards the first
    ``div#js_content`` found under ``soup`` (if any) gets a fixed-width,
    centred style.

    Notes:
        * Only declarations terminated by ``;`` are matched; a trailing
          declaration without a semicolon is left untouched.
        * The ``color:`` pattern also matches the tail of longer property
          names such as ``background-color:``.

    Args:
        soup: a BeautifulSoup document or tag (anything offering
            ``find_all(style=True)`` and ``select(css_selector)``).

    Returns:
        The same ``soup`` object, modified in place.
    """
    for style_tag in soup.find_all(style=True):
        try:
            styleattr = style_tag['style']
            for pattern in _STRIPPED_STYLE_PATTERNS:
                styleattr = pattern.sub('', styleattr)
            style_tag['style'] = styleattr
        except (KeyError, TypeError):
            # Best effort: skip an element whose style value is missing or
            # is not a string instead of aborting the whole clean-up.
            continue

    matches = soup.select('div[id="js_content"]')
    if matches:
        # Only the first matching container is restyled.
        matches[0]['style'] = 'width: 814px ; margin: 0 auto;'

    return soup


# Phrases whose presence in the page text makes the script report '失败'.
_UNAVAILABLE_MARKERS = (
    '此内容发送失败无法查看',
    '该页面不存在',
    '该内容已被发布者删除',
    '此内容因违规无法查看',
)


def _fetch_article(url_news):
    """Return the response for ``url_news``, or ``None`` if every attempt fails.

    getrequest() (proxy + retry) is tried first; when it raises, a single
    direct request without proxy is made as a fallback.
    """
    try:
        return getrequest(url_news)
    except Exception:
        # Deliberate best effort: any failure on the proxy path falls
        # through to the direct request below.
        pass
    try:
        res_news = requests.get(url_news, timeout=20)
    except requests.RequestException:
        return None
    print('请求成功')
    return res_news


def _drop_unusable_images(news_html):
    """Remove ``<img>`` tags from ``news_html`` that the script cannot use.

    An image is removed when its ``data-src`` is missing or empty, mentions
    ``gif``, contains no ``/`` or no ``wx_fmt=`` part, or cannot be
    downloaded.
    """
    for img_one in news_html.find_all('img'):
        url_src = img_one.get('data-src')
        usable = (
            bool(url_src)
            and 'gif' not in url_src
            and '/' in url_src
            and 'wx_fmt=' in url_src
        )
        if usable:
            try:
                # The response body is not used; the request only checks
                # that the image can be fetched at all.
                requests.get(url_src, timeout=20)
            except requests.RequestException:
                usable = False
        if not usable:
            img_one.extract()


def main():
    """Download one sample article, clean its body markup and print it."""
    # Alternative sample article:
    # "http://mp.weixin.qq.com/s?__biz=MjM5NDMxOTMwNg==&mid=2653175413&idx=1&sn=8c0853ddab6e27799c4452e0b6e63156&chksm=bd5900d08a2e89c698de51f102b7423b33a27522966ca2218ca1b8ef290837b0087173c74bcb#rd"
    url_news = "http://mp.weixin.qq.com/s?__biz=MzU4ODQwNTIxMw==&mid=2247528290&idx=4&sn=370655b44dfd31b99984e2eeeb4868e0&chksm=fddf6fd0caa8e6c63a0b5e4fece250415fcb56f03f305450b1434978769b443eaa416342326e#rd"

    res_news = _fetch_article(url_news)
    if res_news is None:
        # Previously this case crashed with AttributeError on ``None.content``.
        print(f'--error--{url_news}-----------request failed')
        return

    soup_news = BeautifulSoup(res_news.content, 'html.parser')
    page_text = soup_news.text
    if any(marker in page_text for marker in _UNAVAILABLE_MARKERS):
        # Reported only; processing continues as it did before.
        print('失败')

    news_html = soup_news.find('div', {'id': 'js_content'})
    if news_html is None:
        # Previously this case crashed with AttributeError on ``None.text``.
        print(f'--error--{url_news}-----------js_content not found')
        return
    try:
        news_html['style'] = 'width: 814px ; margin: 0 auto;'
        news_html = rm_style_attr(news_html)
        del news_html['id']
        del news_html['class']
    except Exception as e:
        print(e)
        return

    _drop_unusable_images(news_html)

    # Drop inline styles from every element below the article container;
    # the container itself keeps the style assigned above.
    for tag in news_html.find_all(style=True):
        del tag['style']

    # Emit <section> elements as plain <div>s.
    for section in news_html.find_all('section'):
        section.name = 'div'
    print(news_html)


if __name__ == "__main__":
    main()