"""
知网论文采集 模拟点击 封ip
"""
import pymysql
import requests,re,time,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from base.BaseCore import BaseCore
# Shared project helper; this script only uses the two attributes below.
baseCore = BaseCore()
# NOTE(review): presumably the database connection behind `cursor` — it is not
# used anywhere else in this file; confirm against base.BaseCore.
cnx = baseCore.cnx
# Cursor used by get_proxy() to run SQL (execute / fetchall).
cursor = baseCore.cursor

def get_proxy():
    """Load the proxy pool from the ``clb_proxy`` table.

    Every row's ``proxy`` column is split on ``-``; the first field is used
    as the host and the second as the port (further fields are ignored).

    Returns:
        list[dict]: one ``{"HTTP": url, "HTTPS": url}`` mapping per usable
        row, with ``url`` of the form ``http://host:port``.  Rows whose value
        is empty or has no port field are skipped instead of aborting the
        whole load.
    """
    sql = "select proxy from clb_proxy"
    cursor.execute(sql)

    proxy_list = []
    for row in cursor.fetchall():
        # Take the column value itself rather than re-parsing repr(row):
        # stripping "('" / "',)" from the tuple's string form breaks as soon
        # as the value contains a quote or backslash.
        # Assumes the cursor yields sequence rows (tuples) — TODO confirm
        # BaseCore does not hand out a dict cursor.
        raw = row[0]
        if not raw:
            continue
        fields = str(raw).strip().split('-')
        if len(fields) < 2 or not fields[0] or not fields[1]:
            # Malformed entry (no "host-port" pair): nothing usable here.
            continue
        proxy_meta = f"http://{fields[0]}:{fields[1]}"
        # Key names are part of the returned shape and are left unchanged.
        # NOTE(review): `requests` looks up lower-case scheme keys
        # ("http"/"https"); verify how callers consume these dicts.
        proxy_list.append({
            "HTTP": proxy_meta,
            "HTTPS": proxy_meta,
        })
    return proxy_list

# Browser start-up parameters: Chrome capabilities, with one proxy from the
# pool registered on them when the pool is not empty.
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
# get_proxy() returns a plain list of {"HTTP": "http://host:port", ...} dicts,
# so it has no `.proxy` attribute; a selenium Proxy object has to be built
# from one of its entries before it can be added to the capabilities.
_proxy_pool = get_proxy()
if _proxy_pool:
    # Selenium's manual proxy settings take "host:port" without the scheme.
    _proxy_address = random.choice(_proxy_pool)["HTTP"].replace("http://", "", 1)
    _browser_proxy = Proxy()
    _browser_proxy.proxy_type = ProxyType.MANUAL
    _browser_proxy.http_proxy = _proxy_address
    _browser_proxy.ssl_proxy = _proxy_address
    _browser_proxy.add_to_capabilities(capabilities)



# Read the indicator workbook and collect the companies to look up.
info = pd.read_excel('全球创新指标数据(1).xlsx')
enterprise_name_list = []
industry_list = []
for metric, company, sector in zip(info['contrast_name'],
                                   info['enterprise_name'],
                                   info['industry']):
    # Only "number of published papers" rows are relevant.
    if metric != '发表论文数量':
        continue
    # The run is currently restricted to this single company.
    if company != '中国石油天然气股份有限公司':
        continue
    industry_list.append(sector)
    enterprise_name_list.append(company)

# One row per selected company; the count column starts out blank.
df_all = pd.DataFrame({
    '公司名称': enterprise_name_list,
    '行业': industry_list,
})
df_all['文章发表数'] = ''
# for year in range(2022,1989,-1):
#     df_all[f'{year}'] = ''
# print(df_all)

# Accumulates every record produced by get_num().
list_one_info = []

def get_num(com_name,com_industry):
    """Collect per-year paper counts for one company from a CNKI result page.

    Opens the search URL built from ``com_name`` in the module-level
    ``browser``, clicks two page elements located by fixed XPaths (with fixed
    pauses in between), then parses the rendered HTML: every ``<li>`` of the
    ``<ul>`` inside the fourth ``div.resultlist`` yields one record whose
    ``year`` is the ``<a>`` text and whose ``num`` is the text between the
    parentheses of the ``<span>``.  When that list is empty, a single record
    with blank ``year`` and ``num`` is added instead.

    Args:
        com_name: company name, used as the search keyword and stored in
            each record.
        com_industry: industry label stored in each record.

    Returns:
        The module-level ``list_one_info`` list, to which the new records
        have been appended.

    Any lookup failure (element not found, fewer than four result panels,
    missing ``<a>``/``<span>``) propagates to the caller.
    """
    def _row(year, num):
        # Every record shares the same layout and source label.
        return {
            'enterprise_name': com_name,
            'year': year,
            'num': num,
            'source': '国内外企业发布文章数量来源：中国知网',
            'industry': com_industry,
        }

    # Load the result page for this company.
    browser.get(f'https://kns.cnki.net/kns8/DefaultResult/Index?dbcode=CFLQ&kw={com_name}&korder=AF')
    time.sleep(2)

    # First click, then give the page time to react.
    browser.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div/a/span').click()
    print('点击1成功')
    time.sleep(3)

    # Second click: third <dl> header inside #divGroup.
    browser.find_element(By.XPATH, '//*[@id="divGroup"]/dl[3]/dt').click()
    print("点击2成功")
    time.sleep(1)

    # Parse a snapshot of the rendered page.
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    panels = soup.find_all('div', {'class': 'resultlist'})
    year_items = panels[3].find('ul').find_all('li')

    if not year_items:
        list_one_info.append(_row('', ''))
        return list_one_info

    for item in year_items:
        year_label = item.find('a').text
        span_text = item.find('span').text
        # Keep only what sits between "(" and ")".
        count = span_text.split('(')[1].split(')')[0]
        list_one_info.append(_row(year_label, count))

    return list_one_info

# Raw string: "\C" and "\c" are not valid escape sequences in a normal literal
# (same resulting path, without the invalid-escape warning).
chromedriver = r'D:\Chrome\chromedriver.exe'

# Hand the prepared capabilities to the driver; without this they are built
# but never applied to the browser session.
browser = webdriver.Chrome(chromedriver, desired_capabilities=capabilities)

try:
    for com_name, com_industry in zip(df_all['公司名称'], df_all['行业']):
        try:
            list_one_info = get_num(com_name, com_industry)
        except Exception as exc:
            # Best effort per company: report the failure and move on.
            # `Exception` (not a bare except) so Ctrl-C still stops the run.
            print(f'get_num failed for {com_name}: {exc!r}')
            continue

        print(list_one_info)
finally:
    # Always shut the browser and its chromedriver process down.
    browser.quit()

# Export everything that was collected, one row per (company, year) record.
df_info = pd.DataFrame(list_one_info)
df_info.to_excel('年份-论文发表数量.xlsx',index=False)
