from kafka import KafkaConsumer
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
from selenium import webdriver
import datetime
import time
import redis
import hashlib

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import random
from kafka import KafkaProducer

# r = redis.Redis(host="localhost", port=6379)


# 将数据转换成hash值，用来对文章url进行去重，实现增量爬虫
def get_md5(val):
    """Return the hexadecimal MD5 digest of the string *val*.

    The text is encoded as UTF-8 before hashing. The digest serves as a
    short, fixed-length fingerprint for de-duplication.
    """
    return hashlib.md5(val.encode('utf-8')).hexdigest()


# 使用redis的set对uid进行去重，对爬取过的uid不再全部爬取
def add_uid(name_uid):
    """Register *name_uid* in the Redis set ``name_uid`` and report duplicates.

    Returns True when the uid was already in the set (SADD added nothing)
    and False when it was stored for the first time.

    NOTE(review): relies on a module-level Redis client named ``r``; the
    only visible assignments of ``r`` in this file are commented out.
    """
    added_count = r.sadd('name_uid', name_uid)
    # SADD reports how many members were actually inserted; 0 means duplicate.
    return added_count == 0


# 使用redis的set对文章url进行去重
def add_url(article_url):
    """Register an article URL in the Redis set ``article_url`` and report duplicates.

    Only the MD5 fingerprint of the URL is stored. Returns True when the
    fingerprint was already present (SADD added nothing) and False when it
    was stored for the first time.

    NOTE(review): relies on a module-level Redis client named ``r``; the
    only visible assignments of ``r`` in this file are commented out.
    """
    # Every positional argument after the key is a set member, so the former
    # trailing ``3`` was being stored as a bogus member next to the
    # fingerprints. Only the fingerprint is added now.
    added_count = r.sadd('article_url', get_md5(article_url))
    return added_count == 0


# 使用模拟浏览器来获取cookie值
def get_cookie():
    """Open weibo.com in Chrome and return its cookies as a ``name -> value`` dict.

    A Chrome instance is started through the chromedriver at the hard-coded
    path below, the home page is loaded, and the cookies are read once the
    ``#app`` element is clickable (waiting up to 10 seconds). The returned
    dict can be passed directly to ``requests`` (e.g. ``session.cookies.update``).

    Raises whatever Selenium raises when the driver cannot start or the wait
    times out; the browser is closed in either case.
    """
    path = r"F:\spider\117\chromedriver.exe"
    service = Service(path)

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

    browser = webdriver.Chrome(options=chrome_options, service=service)
    try:
        browser.get("https://weibo.com/")
        # Wait for the page shell to appear before reading the cookies.
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"app\"]")))
        cookie_list = browser.get_cookies()
    finally:
        # Always shut the browser down, even when loading or waiting fails;
        # otherwise every failed attempt leaves a Chrome process behind.
        browser.quit()

    # Selenium returns a list of cookie dicts; requests wants name -> value.
    # r.set("cookies", str(cookies), ex=600)  # optional Redis caching, disabled
    return {cookie['name']: cookie['value'] for cookie in cookie_list}


# 代码主程序，通过给出的用户url来获取用户发布的文章
def _collapse_newline_padding(text):
    """Drop spaces that directly follow a newline, then squeeze newline runs to one."""
    return re.sub('\n+', '\n', re.sub('\n +', '\n', text))


def _with_current_year(date_text):
    """Prefix the current year when *date_text* has only two ``-`` separated parts.

    The page sometimes shows a date as month-day only; such values get the
    current local year prepended. Other values are returned unchanged.
    """
    if len(date_text.split('-')) == 2:
        return str(time.localtime().tm_year) + '-' + date_text
    return date_text


def _pick_article_link(url_structs):
    """Return ``(title, url)`` for the article linked from one post, or None.

    The last ``url_struct`` entry whose ``long_url`` contains ``weibo.com``
    wins. If an entry lacks the expected fields, the first entry is used
    instead and the article URL is built from its ``page_id``. None is
    returned when the post carries no usable article link.
    """
    if not url_structs:
        return None

    title = None
    link = None
    try:
        for entry in url_structs:
            if "weibo.com" in entry['long_url']:
                title = entry['url_title']
                link = entry['long_url']
    except (KeyError, TypeError):
        try:
            title = url_structs[0]['url_title']
            link = "https://weibo.com/ttarticle/p/show?id=" + url_structs[0]['page_id']
        except (KeyError, IndexError, TypeError):
            return None

    if link is None:
        return None
    return title, link


def _scrape_article_with_browser(article_url):
    """Render an article in headless Chrome and return ``(date, text, html)``.

    Used for articles whose static HTML lacks the expected publish-time
    element. Raises if the page does not render within 10 seconds or lacks
    the expected elements; the browser is closed in every case.
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    # ``options=`` matches the keyword get_cookie() passes to webdriver.Chrome.
    browser = webdriver.Chrome(options=opt)
    try:
        browser.get(article_url)
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable(
            (By.XPATH, "/html/body/div/div[2]/div[1]/div[2]/div/span[1]")))
        page_source = browser.page_source
    finally:
        # One browser is started per article, so it must always be released.
        browser.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    published = soup.find('div', {'class': 'm-box-col m-box-center-a'}).find('span').text.split(' ')[0]
    body = soup.find('div', {'class': 'art-con-new'})
    return published, body.text, str(body)


def get_content_by_user_uid(url, sid):
    """Crawl the articles published by the Weibo user behind *url*.

    Steps:
      1. obtain browser cookies via ``get_cookie()`` and attach them to a
         ``requests`` session;
      2. resolve the profile URL to the user's uid and screen name through
         the ``ajax/profile/info`` endpoint;
      3. walk pages 1..9 of ``ajax/statuses/mymblog`` (``feature=10``),
         stopping at the first empty page or the first page that fails;
      4. for each post that links an article, download and parse it
         (falling back to headless Chrome when the static HTML has no
         publish time) and build one record dict per article.

    Args:
        url: profile page URL of the Weibo user.
        sid: identifier taken from the Kafka message; copied into each record.

    Returns:
        None. NOTE(review): the collected records are currently neither
        returned nor published; the Kafka publishing step is kept below as
        disabled code.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
        # "accept": "application/json, text/plain, */*",
    }
    s = requests.session()

    # get_cookie() already returns a plain name -> value dict. Passing it
    # through str()/regex/json.loads breaks on values that contain quotes
    # or braces, so it is used as-is.
    s.cookies.update(get_cookie())

    list_all_info = []  # one record dict per crawled article

    # Normalise the profile URL and pick the lookup parameter.
    if url.endswith("/"):
        url = url[:-1]
    profile_key = url.split('/')[-1].split('?')[0]
    # URLs with a query string and a '%' are looked up by screen name
    # (presumably a percent-encoded nickname); everything else by "custom".
    lookup_param = "screen_name" if ("?" in url and "%" in url) else "custom"
    url_get_uid = f"https://weibo.com/ajax/profile/info?{lookup_param}={profile_key}"

    try:
        user_info = s.get(url_get_uid, headers=headers).json()['data']['user']
        weibo_name = user_info['screen_name']  # account display name
        uid = user_info['id']  # numeric Weibo uid
        origin = "微博-" + weibo_name
    except Exception:
        print(f"{url}:uid获取失败")
        return

    # Incremental mode (disabled): crawl only the first page for known uids.
    # if add_uid(uid):
    #     num_page = 2
    # else:
    #     num_page = 1000
    num_page = 10

    for page in range(1, num_page):
        try:
            url_all_con = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page={page}&feature=10"
            list_all_con = s.get(url_all_con, headers=headers).json()['data']['list']

            if not list_all_con:  # an empty page means the last page was passed
                break

            for one_con in list_all_con:
                picked = _pick_article_link(one_con.get('url_struct'))
                if picked is None:
                    # No article link on this post. Skip it rather than
                    # reuse the title/URL left over from the previous post.
                    continue
                title_one_con, url_one_con = picked

                # De-duplication (disabled): skip URLs already crawled.
                # if add_url(url_one_con):
                #     continue

                res_one_con = None
                for _ in range(3):  # up to three attempts per article
                    try:
                        res_one_con = s.get(url_one_con, headers=headers)
                        break
                    except requests.RequestException:
                        time.sleep(2)
                if res_one_con is None:
                    # All attempts failed. Do not fall through with a
                    # missing or stale response from an earlier article.
                    print("{}:的:{}:获取失败".format(weibo_name, url_one_con))
                    continue

                soup_one_con = BeautifulSoup(res_one_con.content, 'html.parser')

                try:
                    time_tag = soup_one_con.find('span', {'class': 'time'})
                    if time_tag is None:
                        # The static HTML has no publish time, so render
                        # the page in a real browser instead.
                        one_time, one_content, one_content_html = _scrape_article_with_browser(url_one_con)
                    else:
                        one_time = time_tag.text.split(' ')[0]
                        body = soup_one_con.find('div', {'class': 'WB_editor_iframe_new'})
                        if body is None:
                            body = soup_one_con.find('div', {'class': 'WB_editor_iframe_word'})
                        # If neither container exists, ``body`` is None and
                        # the attribute access below raises; the article is
                        # then reported as failed by the handler.
                        one_content = _collapse_newline_padding(body.text)
                        one_content_html = _collapse_newline_padding(str(body))

                    list_all_info.append({
                        "author": weibo_name,  # account display name
                        "title": title_one_con,  # article title
                        "publishDate": _with_current_year(one_time),  # publish date
                        "sourceAddress": url_one_con,  # article URL
                        "content": one_content,  # plain-text body
                        "contentWithTag": one_content_html,  # body with HTML tags
                        "sid": sid,  # id received from Kafka
                        "lang": "zh_cn",  # language
                        "origin": origin,  # source label
                        "originalDate": "",  # original date (unused)
                        "summary": "",  # summary (unused)
                        "source": "15",  # collection-source code
                    })

                    # Pause 1.5-2 seconds between articles.
                    time.sleep(random.uniform(1.5, 2))
                except Exception:
                    print("{}:的:{}:获取失败".format(weibo_name, url_one_con))

            print("{}:的:{}:页获取成功".format(weibo_name, page))
        except Exception:
            print("{}:的:{}:页获取失败".format(weibo_name, page))
            break

    # Publishing step (disabled): send every record to Kafka as JSON.
    # for one_news_info in list_all_info:
    #     for num_pro in range(0, 3):
    #         try:
    #             producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    #             kafka_result = producer.send("crawlerInfo",
    #                                          json.dumps(one_news_info, ensure_ascii=False).encode('utf8'))
    #             print(kafka_result.get(timeout=10))
    #             break
    #         except:
    #             time.sleep(5)
    #             print('发送kafka失败！正在重新发送！')
    #             continue
    return


def consume():
    """Consume crawl requests from Kafka and crawl each requested profile.

    Subscribes to the ``weiBoCrawl`` topic as consumer group ``python_weibo``,
    starting from the earliest offset when the group has no committed
    position. Each message is expected to be UTF-8 JSON with a ``siteUri``
    (profile URL) and an ``id``; both are passed to
    ``get_content_by_user_uid``. Messages that cannot be decoded or lack
    those fields are reported and skipped so one bad message does not stop
    the consumer.

    The original note here mentioned auto-commit settings
    (``auto_commit_enable=True, auto_commit_interval_ms=3000``); no such
    options are passed explicitly below.
    """
    consumer = KafkaConsumer("weiBoCrawl", auto_offset_reset='earliest', group_id="python_weibo",
                             bootstrap_servers=['114.115.159.144:9092'])
    # consumer = KafkaConsumer("pythonInfo", auto_offset_reset='earliest', bootstrap_servers=['114.115.159.144:9092'])
    for message in consumer:
        try:
            mes_dict = json.loads(message.value.decode('utf-8'))
            url = mes_dict['siteUri']
            sid = mes_dict['id']
        except (ValueError, KeyError, TypeError, AttributeError) as err:
            # ValueError covers both invalid UTF-8 and invalid JSON;
            # AttributeError covers a message whose value is None.
            print(f"skipping malformed Kafka message: {err!r}")
            continue

        get_content_by_user_uid(url, sid)


if __name__ == "__main__":
    # Script entry point: crawl a single hard-coded profile.
    # The Redis client and the Kafka consumer loop are left disabled here:
    # r = redis.Redis(host="localhost",port=6379)
    # consume()
    demo_profile_url = 'https://weibo.com/u/1689572847'
    demo_sid = '1571698920447193090'
    get_content_by_user_uid(demo_profile_url, demo_sid)

