提交 ee09212f 作者: 薛凌堃

知网专家采集

上级 241a04dd
......@@ -51,9 +51,9 @@ def parse_excel():
def get_content1():
print_result_list = []
result_dict_list = []
# query = {"专家库主键id":"141"}
# for db_dict in db_storage.find(query):
for db_dict in db_storage.find():
query = {"专家库主键id":"143"}
for db_dict in db_storage.find(query):
# for db_dict in db_storage.find():
del db_dict['_id']
result_dict_list.append(db_dict)
for result_dict in result_dict_list:
......
# -*- coding: utf-8 -*-
# @Author: MENG
# @Time : 2022-4-9
import xlrd
from tqdm import tqdm
import pymongo
import pymysql
import time
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
import json
from requests.packages import urllib3
# Silence urllib3 warnings; the HTTP requests in this file are sent with verify=False.
urllib3.disable_warnings()
# Handle to the '知网-研究中心专家' collection of the ZZSN database; parse_excel()
# writes to it and get_content1() reads from it.
# NOTE(review): the host and credentials are hard-coded in source — consider
# loading them from configuration or the environment instead.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['知网-研究中心专家']
# CNKI experts: load the Excel sheet into MongoDB
def parse_excel():
    """Import the expert spreadsheet into the ``db_storage`` collection.

    The first sheet of '知网-研究中心专家.xlsx' is read; its first row supplies
    the column names.  Each following row is turned into one document holding
    the expert name, the primary id (stored as a string), the CNKI code, the
    page count and the saved search payload ('代码').

    Rows without a primary id are skipped instead of aborting the import, and
    nothing is written when the sheet has no usable data rows (``insert_many``
    rejects an empty list).

    NOTE(review): recent xlrd releases no longer read .xlsx files — confirm
    the installed version supports this workbook.
    """
    sheet = xlrd.open_workbook('知网-研究中心专家.xlsx').sheets()[0]
    # The first row holds the field names used as dictionary keys.
    rows_tag = sheet.row_values(0)
    result_dict_list = []
    for i in range(1, sheet.nrows):
        # Pair the column names with this row's cell values.
        result_dict = dict(zip(rows_tag, sheet.row_values(i)))
        c = result_dict['专家库主键id']
        if c is None or c == '':
            # Without an id the record cannot be matched later on.
            print(f'第{i + 1}行缺少专家库主键id,已跳过')
            continue
        d = result_dict['专家库知网code码']
        if d:
            # Numeric cells come back as floats; normalise to a digit string.
            d = str(int(d))
        total_page = result_dict['页数']
        a_dict = {
            '云协作专家': result_dict['云协作专家'],
            '专家库主键id': str(int(c)),
            '专家库知网code码': d,
            # A blank page-count cell is stored as 0 rather than crashing int('').
            '页数': int(total_page) if total_page not in ('', None) else 0,
            '代码': result_dict['代码'],
        }
        print(a_dict)
        result_dict_list.append(a_dict)
    if result_dict_list:
        db_storage.insert_many(result_dict_list)
# CNKI experts: collect papers
# TODO: when updating, do not upload records whose required fields are empty
_CNKI_GRID_URL = "https://kns.cnki.net/kns8s/brief/grid"
_CNKI_UPLOAD_URL = 'http://114.116.19.92:8088/api/reptile/autoSaveExpertPaper'

# Headers for the result-list (grid) request.
# NOTE(review): the Cookie is a captured browser session and is presumably stale.
_CNKI_SEARCH_HEADERS = {
    'Connection': 'keep-alive',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://kns.cnki.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://kns.cnki.net/kns8/defaultresult/index',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'Ecp_ClientId=1210719153902607419; cnkiUserKey=124a20fb-4ebb-86f9-fcc8-5ba2e8da45a2; Ecp_ClientIp=221.15.216.161; Ecp_IpLoginFail=211126125.41.173.138; ASP.NET_SessionId=k5molf2cg114sjxy0lhjceyp; SID_kns8=123106; CurrSortField=%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27); CurrSortFieldType=desc; SID_kns_new=kns123123; dsorder=pubdate; dSearchFold=undefined; dstyle=listmode; language=undefined; SID_kcms=124103; _pk_ref=%5B%22%22%2C%22%22%2C1637905762%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dk_wIlObu07-p-iE_2Ec5ow7fGwitei5-u-u-hlhx-Z3%26wd%3D%26eqid%3Db0aef0a90017e0810000000261a0492d%22%5D; _pk_ses=*; dperpage=50; searchTimeFlag=1; _pk_id=abbb4caf-5c9c-4e46-b660-e356d71710f1.1626680366.29.1637909392.1637899142.; CurrSortField=%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27); CurrSortFieldType=desc'
}

# Headers for the per-paper detail page request.
_CNKI_DETAIL_HEADERS = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="21", " Not;A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'cangjieConfig_NZKPT2=%7B%22status%22%3Atrue%2C%22startTime%22%3A%222021-12-23%22%2C%22endTime%22%3A%222022-05-26%22%2C%22orginHosts%22%3A%22kns.cnki.net%22%2C%22type%22%3A%22mix%22%2C%22poolSize%22%3A%2210%22%2C%22intervalTime%22%3A10000%2C%22persist%22%3Afalse%7D; Hm_lvt_38f33a73da35494cc56a660420d5b6be=1653730524; Hm_lpvt_38f33a73da35494cc56a660420d5b6be=1653731762; Ecp_ClientId=a220517174803460325; knsLeftGroupSelectItem=1%3B2%3B; Ecp_ClientIp=1.193.57.250; ASP.NET_SessionId=g3lo0aj14clgsxzbfps5ae2i; SID_kns8=123145; dblang=ch; _pk_ses=*; SID_kns_new=kns123165; Ecp_IpLoginFail=2205281.193.37.253; CurrSortField=%e7%9b%b8%e5%85%b3%e5%ba%a6%2frelevant%2c(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27time%27)+desc; CurrSortFieldType=desc; SID_kcms=015126022; SID_docpre=006007; yeswholedownload=%3Btzjd199805016; _pk_id=2af0698c-d628-4f4c-94ea-1175726c8139.1652780940.2.1653731390.1653730486.'
}


def _cnki_page_size(payload):
    """Return the ``pageSize`` value of a saved payload, or 20 if it has none."""
    import re
    found = re.search(r'pageSize=(\d+)', payload)
    size = int(found.group(1)) if found else 0
    return size if size > 0 else 20


def _cnki_payload_for_page(payload, page):
    """Return a copy of the payload whose ``pageNum`` field requests ``page``."""
    import re
    return re.sub(r'&pageNum=\d+&', f'&pageNum={page}&', payload)


def _cnki_total_count(doc):
    """Read the number of hits from a result page; raise ValueError if absent."""
    text = doc('.pagerTitleCell em').text().replace(',', '').strip()
    if not text.isdigit():
        raise ValueError(f'cannot read result count from response: {text!r}')
    return int(text)


def _cnki_fetch_result_page(payload):
    """POST a payload to the grid endpoint and return the parsed result page.

    Retries after 10 s on request/parse errors and after 10 minutes when the
    response asks for a captcha; pauses 15 s after every request.
    """
    while True:
        try:
            resp_text = requests.post(_CNKI_GRID_URL, headers=_CNKI_SEARCH_HEADERS, verify=False, data=payload).text
            time.sleep(15)
            doc = pq(resp_text)
            blocked = '请输入验证码' in doc.text()
        except Exception as e:
            print(f'连接超时!==={e}')
            time.sleep(10)
            continue
        if blocked:
            print('验证码!')
            time.sleep(600)
            continue
        return doc


def _cnki_to_int(text):
    """Convert a counter cell to int; blank or non-numeric text counts as 0."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return 0


def _cnki_fetch_summary(detail_url):
    """Return the abstract text of a paper's detail page, or '' on any failure."""
    if not detail_url:
        return ''
    try:
        resp_text = requests.request("GET", detail_url, headers=_CNKI_DETAIL_HEADERS, verify=False).text
        detail = pq(resp_text)
        summary = detail('.abstract-text').text()
        if not summary:
            summary = detail('#ChDivSummary').text()
        return summary
    except Exception:
        return ''
    finally:
        # Throttle detail-page requests.
        time.sleep(20)


def _cnki_upload_papers(post_dict):
    """POST one expert's papers to the storage API, retrying every 10 s until it succeeds."""
    headers = {
        'Content-Type': 'application/json'
    }
    while True:
        try:
            resp_json = requests.post(url=_CNKI_UPLOAD_URL, headers=headers, verify=False, data=json.dumps(post_dict)).json()
            print('推送:', resp_json['msg'])
            return
        except Exception as e:
            print(e)
            print('数据传接口失败!')
            time.sleep(10)


def get_content1(min_expert_id=144):
    """Collect the papers of every stored expert and upload them.

    For each expert document in ``db_storage`` whose '专家库主键id' is greater
    than ``min_expert_id``, the saved search payload ('代码') is replayed
    against the CNKI grid endpoint page by page.  Each result row is parsed,
    its abstract is fetched from the detail page, and papers with a title,
    author and abstract are uploaded in one request per expert.  A summary of
    everything collected is finally written to 'experct_data.xlsx'.

    Args:
        min_expert_id: only experts with an id strictly above this value are
            processed (default 144, the previously hard-coded threshold).

    Notes:
        * The id is compared in Python rather than with a ``$gt`` query:
          parse_excel() stores the id as a string, and a numeric ``$gt``
          filter does not match string values.
        * The page count is derived from the payload's own ``pageSize``.
        * Duplicate titles are skipped before the detail request is made.
        * An error while processing one expert is printed and, after a 30 s
          pause, processing moves on to the next expert.
    """
    experts = []
    for db_dict in db_storage.find():
        db_dict.pop('_id', None)
        try:
            expert_id = int(db_dict['专家库主键id'])
        except (KeyError, TypeError, ValueError):
            continue
        if expert_id > min_expert_id:
            experts.append(db_dict)

    print_result_list = []
    for result_dict in experts:
        try:
            a = result_dict['云协作专家']
            c = int(result_dict['专家库主键id'])
            payload_ = result_dict['代码']
            if not payload_:
                print(f'{a}代码为空')
                continue
            time.sleep(2)

            # A first request is only used to learn how many hits there are.
            total_count = _cnki_total_count(_cnki_fetch_result_page(payload_))
            page_size = _cnki_page_size(payload_)
            total_page = -(-total_count // page_size)  # ceiling division
            print(f'正在采集{a},共{total_page}页')

            aaa_dict_list = []
            set_title = set()
            for page in tqdm(range(1, total_page + 1)):
                doc_resp = _cnki_fetch_result_page(_cnki_payload_for_page(payload_, page))
                for doc_item in doc_resp('.result-table-list tr').items():
                    title = doc_item('.name').text().replace('\n', '').replace('免费', '').strip()
                    if title == '' or title in set_title:
                        continue
                    set_title.add(title)
                    date = doc_item('.date').text()
                    if date:
                        # Keep the date part only.
                        date = date.split(' ')[0]
                    authors = '; '.join(
                        author_item.text()
                        for author_item in doc_item('.author .KnowledgeNetLink').items()
                    )
                    aaa_dict = {
                        'title': title,
                        'author': authors,
                        'agency': doc_item('.source').text(),
                        'pubdate': date,
                        'baseData': doc_item('.data').text(),
                        'citedcount': _cnki_to_int(doc_item('.quote').text()),
                        'downloadcount': _cnki_to_int(doc_item('.download').text()),
                        'summary': _cnki_fetch_summary(doc_item('.name .fz14').attr('href')),
                    }
                    # Papers without an author or abstract are not uploaded.
                    if aaa_dict['summary'] == '' or aaa_dict['author'] == '':
                        continue
                    aaa_dict_list.append(aaa_dict)

            post_dict = {
                "authorId": c,
                "authorName": a,
                "ikCode": '',
                "papers": aaa_dict_list
            }
            print(len(aaa_dict_list))
            print_result_list.append(post_dict)
            if aaa_dict_list:
                _cnki_upload_papers(post_dict)
        except Exception as e:
            print(e)
            time.sleep(30)
            print('出错,重试中!')
            continue

    import pandas as pd
    df = pd.DataFrame(print_result_list)
    df.to_excel('experct_data.xlsx', index=False)
# Yahoo Finance: convert the financials table into dictionaries
def deal_table(doc_resp):
    """Extract header, row hierarchy and row values from a financials page.

    Args:
        doc_resp: PyQuery document of the page.

    Returns:
        A dict with three entries:
        '表头' – the header texts (list of str),
        '目录' – nested dicts mirroring the row hierarchy, keyed by row title,
        '内容' – a flat mapping of row title to its list of cell texts.
    """
    content = {}
    deepest_level = 6  # rows are followed through levels 0..6 below the table body

    def read_row(node):
        # The first child holds the row title followed by its cell texts.
        lines = pq(pq(node).children()[0]).text().split('\n')
        return lines[0], lines[1:]

    def walk(node, level):
        # Build the catalogue of the rows nested in the second child of ``node``.
        subtree = {}
        if level > deepest_level:
            return subtree
        for child in pq(node).children()[1]:
            label, cells = read_row(child)
            if not label:
                continue
            content[label] = cells
            subtree[label] = walk(child, level + 1)
        return subtree

    table_parts = doc_resp('#Col1-1-Financials-Proxy section div:nth-child(3)>div>div').children()
    header = pq(table_parts[0]).text().split('\n')
    catalogue = {}
    for row in pq(table_parts[1]).children():
        if pq(row).text() == '':
            continue
        label, cells = read_row(row)
        content[label] = cells
        catalogue[label] = walk(row, 1)

    return {
        '表头': header,
        '目录': catalogue,
        '内容': content,
    }
# Yahoo Finance
# Tab clicked before each of the six table captures; None keeps the view the
# page opens with.
_YAHOO_TAB_SEQUENCE = (None, 'Quarterly', 'Balance Sheet', 'Quarterly', 'Cash Flow', 'Quarterly')


def _yahoo_click_span(driver, label):
    """Click the span with the given text and wait 8 s for the page to update."""
    driver.find_element_by_xpath(f'//div/span[text()="{label}"]').click()
    time.sleep(8)


def _yahoo_collect_tables(driver):
    """Walk through the tab sequence and return the six parsed tables in order."""
    tables = []
    for tab in _YAHOO_TAB_SEQUENCE:
        if tab is not None:
            _yahoo_click_span(driver, tab)
        try:
            _yahoo_click_span(driver, 'Expand All')
        except Exception:
            # Best effort: the button may be missing or already used.
            pass
        tables.append(deal_table(pq(driver.page_source)))
    return tables


def _yahoo_level_relation(tables):
    """Serialise every table's row hierarchy as JSON, joined by '&&&&'."""
    return '&&&&'.join(
        json.dumps(table['目录'], ensure_ascii=False, indent=4) for table in tables
    )


def _yahoo_content(tables):
    """Flatten header and row values of all tables into one '&&&&'-separated text."""
    lines = []
    for table in tables:
        lines.extend(table['表头'])
        for key, values in table['内容'].items():
            lines.append(key)
            lines.extend(values)
        lines.append('&&&&')
    financials = ''.join(line + '\n' for line in lines).strip()
    # Drop the trailing separator and put the remaining ones inline.
    return financials[:-4].strip().replace('\n&&&&\n', '&&&&')


def get_content2():
    """Scrape Yahoo Finance financial statements for the configured stocks.

    Reads (id, stock_code) rows with ``origin_type = 1`` from
    ``config_finance_data_sync``, opens each stock's financials page in
    headless Chrome, captures six tables (the initial view and one after each
    tab click in ``_YAHOO_TAB_SEQUENCE``), stores the serialised hierarchy and
    values back into the row, and then calls the sync endpoint for that id.
    Ids whose sync call fails are appended to
    '雅虎财经-财务数据_发送错误ID.txt'.

    The browser and the database connection are always released, also when
    an error interrupts the run.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('--headless')
    executable_path = r"D:\chrome\chromedriver.exe"
    driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
    try:
        conn = pymysql.Connect(host='114.115.159.144', port=3306, user='root', passwd='zzsn9988', db='clb_project',
                               charset='utf8')
        try:
            cursor = conn.cursor()
            sql1 = """SELECT id, stock_code from config_finance_data_sync WHERE origin_type = 1"""
            cursor.execute(sql1)
            result_data = cursor.fetchall()
            for data in result_data:
                try:
                    data_list = list(data)
                    print(data_list)
                    orc_id = data_list[0]
                    stock = data_list[1]
                    url = f'https://finance.yahoo.com/quote/{stock}/financials?p={stock}'
                    try:
                        print(f'正在采集:{url}')
                        driver.get(url)
                        time.sleep(8)
                        tables = _yahoo_collect_tables(driver)
                        level_relation = _yahoo_level_relation(tables)
                        content = _yahoo_content(tables)
                        sql = "UPDATE config_finance_data_sync SET level_relation=%s, content=%s WHERE ID = %s"
                        val = (level_relation, content, orc_id)
                        cursor.execute(sql, val)
                        conn.commit()
                        get_url = f'http://114.115.215.250:8089/synFinanceData/yh?id={orc_id}'
                        try:
                            requests.get(get_url)
                        except Exception:
                            with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8') as err_file:
                                # The id comes from the database and may not be a str.
                                err_file.write(str(orc_id) + '\n')
                    except Exception as e:
                        print(f'采集:{url}失败')
                        print(e)
                except Exception:
                    time.sleep(60 * 60)
                    print('出错,重试中!')
                    continue
        finally:
            conn.close()
    finally:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()
# Script entry point: only the CNKI paper collection runs by default.
if __name__ == '__main__':
    # Importing the expert spreadsheet into MongoDB is disabled here:
    # parse_excel()
    get_content1()
    # The Yahoo Finance scraper is disabled here:
    # get_content2()
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请先注册或者登录后发表评论