Commit c887e9d2 by XveLingKun

0717

Parent 57e944a7
One file's source diff could not be displayed because it is too large.
@@ -52,8 +52,10 @@ if __name__ == "__main__":
     opt.add_experimental_option("excludeSwitches", ["enable-automation"])
     opt.add_experimental_option('excludeSwitches', ['enable-logging'])
     opt.add_experimental_option('useAutomationExtension', False)
-    opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
-    chromedriver = r'F:\spider\cmd100\chromedriver.exe'
+    # opt.binary_location = r'F:\spider\Google\Chrome\Application\chrome.exe'
+    # chromedriver = r'F:\spider\cmd100\chromedriver.exe'
+    opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
+    chromedriver = r'D:\cmd100\chromedriver.exe'
     browser = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)
     url = "https://mp.weixin.qq.com/"
     browser.get(url)
...
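Note on the hunk above: both the old and new Chrome paths are passed through chrome_options= and executable_path=, which are deprecated and removed in recent Selenium 4 releases. A minimal sketch of the same setup on the Selenium 4 API, assuming the project can upgrade (paths and options copied from the diff above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

opt = Options()
opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
opt.add_experimental_option('useAutomationExtension', False)
opt.binary_location = r'D:\Google\Chrome\Application\chrome.exe'

# Selenium 4 wraps the driver path in a Service object instead of executable_path
service = Service(executable_path=r'D:\cmd100\chromedriver.exe')
browser = webdriver.Chrome(service=service, options=opt)
browser.get("https://mp.weixin.qq.com/")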
@@ -3,7 +3,9 @@ import time
 import json
 import pymongo

-url = "https://web.archive.org/web/20230702131549/https://www.forbes.com/lists/global2000/"
+# url = "https://web.archive.org/web/20230702131549/https://www.forbes.com/lists/global2000/"
+url = "https://web.archive.org/web/20220929184024/https://www.forbes.com/lists/global2000/"
 db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
     '福布斯企业人数']
 headers = {
@@ -25,7 +27,7 @@ headers = {
 import requests
 from bs4 import BeautifulSoup
+requests.adapters.DEFAULT_RETRIES = 5
 proxies = {
     'https': 'http://127.0.0.1:1080',
     'http': 'http://127.0.0.1:1080',
@@ -46,7 +48,7 @@ with open('./a.txt', 'r', encoding='utf-8') as f:
     dataJson = f.read()
 dataJson = json.loads(dataJson)
 tableDates = dataJson['tableData']
-for tableDate in tableDates[894:]:
+for tableDate in tableDates:
     uri = tableDate['uri']
     rank = tableDate['rank']
@@ -79,4 +81,5 @@ for tableDate in tableDates[894:]:
     db_storage.insert_one(dic)
     print(f'{rank}==={organizationName}===已入库')
     req.close()
-    time.sleep(1)
+    # time.sleep(1)
+    break
\ No newline at end of file
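The hunks above switch the list URL to the 2022 snapshot, drop the [894:] resume slice, comment out the per-request sleep, and lean on requests.adapters.DEFAULT_RETRIES = 5 for resilience. That module-level constant only governs connection-level retries; a sketch of a more explicit alternative (not part of the commit, names are illustrative) mounts an HTTPAdapter with a Retry policy on a Session:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 5 times with exponential backoff, also on common transient status codes.
retry_policy = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry_policy))
session.mount('http://', HTTPAdapter(max_retries=retry_policy))

# session.get(...) can then replace the bare requests.get(...) calls in the script.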
import json

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from retry import retry

db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '2022年福布斯企业人数']
url = 'https://web.archive.org/web/20220929184024/https://www.forbes.com/lists/global2000/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Cookie': 'lux_uid=166447682647510727; donation-identifier=aab33e1c4e293a8fcd5490465688bb01; bafp=79fcddb0-4e71-11ee-8a81-b762f64bf85c',
    'Priority': 'u=0, i',
    'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
}
proxies = {
    'https': 'http://127.0.0.1:1080',
    'http': 'http://127.0.0.1:1080',
}


@retry(tries=5, delay=2)
def detail(href):
    # Fetch a company detail page and return all of its <script> tags
    try:
        req = requests.get(headers=headers, url=href, verify=False, proxies=proxies)
        soup_ = BeautifulSoup(req.text, 'lxml')
        scripts = soup_.find_all('script')
        req.close()
        return scripts
    except:
        raise


@retry(tries=3, delay=2)
def spider():
    # Walk the Global 2000 list page and fetch each company's employee count
    response = requests.get(url=url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'html.parser')
    # print(soup)
    tables = soup.find_all('div', class_="table-row-group")
    print(len(tables))
    for idx, table in enumerate(tables):
        print(f'正在遍历第{idx}个table')
        a_list = table.find_all('a', class_="table-row")
        for a in a_list:
            rank = a.find('div', class_="rank").text.replace('.', '')
            print(f'排名: {rank}')
            organizationName = a.find('div', class_="organizationName").text
            href = a.get('href')
            try:
                scripts = detail(href)
            except:
                print(f'error--:{idx},{rank},{organizationName}')
                item = str(idx) + ',' + rank + ',' + organizationName
                with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                    f.write(item)
                continue
            # print(scripts)
            for script in scripts:
                if 'numberOfEmployees' in script.text:
                    break
            else:
                continue
                # print(f'{rank}--{uri}---not found')
            try:
                employeesJson = script.text
                # print(employeesJson)
                employeesJson = json.loads(employeesJson)
                numberOfEmployees = employeesJson['numberOfEmployees'].replace(',', '')
            except:
                numberOfEmployees = '--'
            dic = {
                '排名': rank,
                '企业名称': organizationName,
                '员工人数': numberOfEmployees,
            }
            # print(dic)
            db_storage.insert_one(dic)
            print(f'{rank}==={organizationName}===已入库')


def spider2():
    # Read the Excel sheet of companies still missing employee counts
    df = pd.read_excel('./2022年福布斯榜单.xlsx', sheet_name='待补充')
    # Get the rows as a list
    data = df.values.tolist()
    for idx, row in enumerate(data):
        # Rank and company name columns
        rank = row[1]
        organizationName = row[2]
        # Lower-case the name and turn spaces into hyphens to build the URL slug
        organizationName = organizationName.lower().replace(' ', '-')
        href = f'https://web.archive.org/web/20220929184024/https://www.forbes.com/companies/{organizationName}/?list=global2000'
        # Fetch the detail page
        try:
            scripts = detail(href)
        except:
            print(f'error--:{idx},{rank},{organizationName}')
            item = str(idx) + ',' + rank + ',' + organizationName
            with open('./error_2022.txt', 'a', encoding='utf-8') as f:
                f.write(item)
            continue
        # print(scripts)
        for script in scripts:
            if 'numberOfEmployees' in script.text:
                break
        else:
            continue
            # print(f'{rank}--{uri}---not found')
        try:
            employeesJson = script.text
            # print(employeesJson)
            employeesJson = json.loads(employeesJson)
            numberOfEmployees = employeesJson['numberOfEmployees'].replace(',', '')
        except:
            numberOfEmployees = '--'
        dic = {
            '排名': rank,
            '企业名称': organizationName,
            '员工人数': numberOfEmployees,
        }
        # print(dic)
        db_storage.insert_one(dic)
        print(f'{rank}==={organizationName}===已入库')


if __name__ == '__main__':
    # spider()
    spider2()
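In the new file above, detail() scans every <script> tag for one whose body parses as JSON with a top-level numberOfEmployees field, presumably a schema.org-style data block on the company page. A minimal standalone sketch of that extraction, using a made-up payload rather than a real Forbes response:

import json

# Hypothetical script-tag body; the real page structure may differ.
script_text = '{"@type": "Organization", "name": "Example Corp", "numberOfEmployees": "164,000"}'

data = json.loads(script_text)
number_of_employees = data['numberOfEmployees'].replace(',', '')
print(number_of_employees)  # -> 164000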
import pandas as pd
import pymongo

# 7649
data_list = []
db_stroage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN['全球企业资讯']
# datas = db_stroage.find({"内容": {"$ne": None, "$exists": True}})
# Export the records whose 标签 (label) field is empty
datas = db_stroage.find({"标签": ""})
link = []
for data in datas:
    del data['_id']
    del data['id']
    if data['标题'] not in link:
        data_list.append(data)
        link.append(data['标题'])
        # print(data)
print(len(data_list))
df = pd.DataFrame(data_list)
df.to_excel('./不保留企业资讯.xlsx', index=False)
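The de-duplication above keeps the first record per title via a manually maintained link list. An equivalent sketch using pandas, assuming the cursor results are first materialized into a list (column names copied from the script):

import pandas as pd

records = list(datas)  # 'datas' is the find() cursor from the script above
df = pd.DataFrame(records).drop(columns=['_id', 'id'], errors='ignore')
# Keep the first record per 标题, matching the manual de-duplication above
df = df.drop_duplicates(subset=['标题'], keep='first')
df.to_excel('./不保留企业资讯.xlsx', index=False)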
import json
import re
import threading
import time
import uuid

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '智库-不保留222']
lock = threading.Lock()


class EsMethod(object):
    def __init__(self):
        # Create the Elasticsearch client and supply its credentials
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum):
        # Query one 200-hit page for the subject, sorted by createDate descending
        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "subjectId": "1537739653432397825"
                            }
                        },
                        {
                            "match": {
                                "deleteFlag": "1"
                            }
                        },
                        {
                            "range": {
                                "createDate": {
                                    "gte": "2023-12-31T00:00:00",
                                    "lte": "2024-07-02T12:00:00"
                                }
                            }
                        }
                    ]
                }
            },
            "sort": [
                {
                    "createDate": {
                        "order": "desc"
                    }
                }
            ],
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        result = self.es.search(index=index_name, doc_type='_doc', body=body)
        # log.info(result)
        return result


def clean_html_tag(content):
    # todo: assumes paragraphs are delimited by </p> in production content
    ori_text = re.sub(r"(<\/p\s*>)", "\t", content)
    # Strip image tags
    ori_text = re.sub(r"<img.*?/>", "", ori_text)
    tag_content_list = ori_text.split("\t") if "<p" in ori_text else ori_text
    temp_content_list = []
    if type(tag_content_list) is list:
        for text in tag_content_list:
            bs = BeautifulSoup(text, 'lxml')
            ori_match_content = bs.text.strip()
            temp_content_list.append(ori_match_content)
        match_content = "\n".join(temp_content_list)
    else:
        bs1 = BeautifulSoup(tag_content_list, 'lxml')
        match_content = bs1.text.strip()
    # if "参考文献" not in tag_content_list:
    #     match_content = temp_content
    # else:
    #     match_content = temp_content.split("参考文献")[0]
    return match_content


def preprocess(text: str):
    text = text.strip().strip('\n').strip()
    text = re.sub(' +', '', text)
    text = re.sub('\n+', '\n', text)
    return text


def main(page, p, esMethod):
    # Export one 200-hit page: clean the HTML fields and insert each record into MongoDB
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']
    # if total == 0:
    #     log.info('++++已没有数据+++++')
    #     return
    try:
        msglist = result['hits']['hits']
    except:
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        try:
            content = mms['_source']['content']
        except:
            continue
        try:
            clean_content = clean_html_tag(content)
            pre_content = preprocess(clean_content)
        except:
            pre_content = content
        try:
            summary = mms['_source']['summary']
        except:
            summary = ''
        try:
            clean_summary = clean_html_tag(summary)
            pre_summary = preprocess(clean_summary)
        except:
            pre_summary = summary
        try:
            contentRaw = mms['_source']['contentRaw']
        except:
            contentRaw = ''
        try:
            clean_contentRaw = clean_html_tag(contentRaw)
            pre_contentRaw = preprocess(clean_contentRaw)
        except:
            pre_contentRaw = contentRaw
        try:
            titleRaw = mms['_source']['titleRaw']
        except:
            titleRaw = ''
        try:
            summaryRaw = mms['_source']['summaryRaw']
        except:
            summaryRaw = ''
        try:
            clean_summaryRaw = clean_html_tag(summaryRaw)
            pre_summaryRaw = preprocess(clean_summaryRaw)
        except:
            pre_summaryRaw = summaryRaw
        contentWithTag = mms['_source']['contentWithTag']
        log.info(f'{id}--{title}---')
        # labels = mms['_source']['labels']
        # tags = []
        # for label in labels:
        #     label_name = label['labelMark']
        #     if label_name == "dynamic_tags":
        #         relationName = label['relationName']
        #         tags.append(relationName)
        #     else:
        #         continue
        # info_tags = ','.join(tags)
        # Store the record in MongoDB
        dic = {
            "id": id,
            "标题": title,
            "摘要": pre_summary,
            "内容": pre_content,
            "标题译文": titleRaw,
            "摘要译文": pre_summaryRaw,
            "内容译文": pre_contentRaw,
            "正文html": contentWithTag,
            "标签": '',
            "状态": "通过",
        }
        db_storage.insert_one(dic)


def run_threads(num_threads, esMethod, j):
    # Start num_threads workers, each exporting a different 200-hit page, then wait for them
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(9):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
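Both export scripts page through results with from/size, which Elasticsearch caps at index.max_result_window (10,000 hits by default); the loops above stay just under that limit, but a deeper export would need the scroll API or search_after. A rough sketch of a scroll-based variant that reuses the same query body (the function name and the 5-minute scroll window are illustrative, not part of the commit):

def export_all(es, index_name, body):
    # Walk every match 200 at a time with a scroll cursor instead of from/size paging.
    body = dict(body, size=200)
    body.pop("from", None)
    result = es.search(index=index_name, body=body, scroll='5m')
    scroll_id = result['_scroll_id']
    hits = result['hits']['hits']
    while hits:
        for hit in hits:
            yield hit
        result = es.scroll(scroll_id=scroll_id, scroll='5m')
        scroll_id = result['_scroll_id']
        hits = result['hits']['hits']
    es.clear_scroll(scroll_id=scroll_id)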
import json
import re
import threading
import time
import uuid

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote

baseCore = BaseCore.BaseCore(sqlFlg=False)
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '全球企业资讯0710']
lock = threading.Lock()


class EsMethod(object):
    def __init__(self):
        # Create the Elasticsearch client and supply its credentials
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum):
        # Query one 200-hit page for the subject, sorted by createDate descending
        body = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "subjectId": "1734030182269853697"
                            }
                        },
                        {
                            "range": {
                                "createDate": {
                                    "gte": "2024-07-01T00:00:00",
                                    "lte": "2024-07-11T00:00:00"
                                }
                            }
                        }
                    ]
                }
            },
            "sort": [
                {
                    "createDate": {
                        "order": "desc"
                    }
                }
            ],
            "track_total_hits": True,
            "size": 200,
            "from": pnum
        }
        result = self.es.search(index=index_name, doc_type='_doc', body=body)
        # log.info(result)
        return result


def clean_html_tag(content):
    # todo: assumes paragraphs are delimited by </p> in production content
    ori_text = re.sub(r"(<\/p\s*>)", "\t", content)
    # Strip image tags
    ori_text = re.sub(r"<img.*?/>", "", ori_text)
    tag_content_list = ori_text.split("\t") if "<p" in ori_text else ori_text
    temp_content_list = []
    if type(tag_content_list) is list:
        for text in tag_content_list:
            bs = BeautifulSoup(text, 'lxml')
            ori_match_content = bs.text.strip()
            temp_content_list.append(ori_match_content)
        match_content = "\n".join(temp_content_list)
    else:
        bs1 = BeautifulSoup(tag_content_list, 'lxml')
        match_content = bs1.text.strip()
    # if "参考文献" not in tag_content_list:
    #     match_content = temp_content
    # else:
    #     match_content = temp_content.split("参考文献")[0]
    return match_content


def preprocess(text: str):
    text = text.strip().strip('\n').strip()
    text = re.sub(' +', '', text)
    text = re.sub('\n+', '\n', text)
    return text


def main(page, p, esMethod):
    # Export one 200-hit page: clean the HTML fields, collect the dynamic tags, insert into MongoDB
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    total = result['hits']['total']['value']
    # if total == 0:
    #     log.info('++++已没有数据+++++')
    #     return
    try:
        msglist = result['hits']['hits']
    except:
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')
    for mms in msglist:
        id = mms['_id']
        title = mms['_source']['title']
        try:
            content = mms['_source']['content']
        except:
            continue
        try:
            contentWithTag = mms['_source']['contentWithTag']
        except:
            continue
        try:
            clean_content = clean_html_tag(content)
            pre_content = preprocess(clean_content)
        except:
            pre_content = content
        try:
            summary = mms['_source']['summary']
        except:
            summary = ''
        try:
            clean_summary = clean_html_tag(summary)
            pre_summary = preprocess(clean_summary)
        except:
            pre_summary = summary
        try:
            contentRaw = mms['_source']['contentRaw']
        except:
            contentRaw = ''
        try:
            clean_contentRaw = clean_html_tag(contentRaw)
            pre_contentRaw = preprocess(clean_contentRaw)
        except:
            pre_contentRaw = contentRaw
        try:
            titleRaw = mms['_source']['titleRaw']
        except:
            titleRaw = ''
        try:
            summaryRaw = mms['_source']['summaryRaw']
        except:
            summaryRaw = ''
        try:
            clean_summaryRaw = clean_html_tag(summaryRaw)
            pre_summaryRaw = preprocess(clean_summaryRaw)
        except:
            pre_summaryRaw = summaryRaw
        log.info(f'{id}--{title}---')
        labels = mms['_source']['labels']
        tags = []
        for label in labels:
            label_name = label['labelMark']
            if label_name == "dynamic_tags":
                relationName = label['relationName']
                tags.append(relationName)
            else:
                continue
        info_tags = ','.join(tags)
        # Store the record in MongoDB
        dic = {
            "id": id,
            "标题": title,
            "摘要": pre_summary,
            "内容": pre_content,
            "带标签内容": contentWithTag,
            "标题译文": titleRaw,
            "摘要译文": pre_summaryRaw,
            "内容译文": pre_contentRaw,
            "标签": info_tags,
        }
        db_storage.insert_one(dic)


def run_threads(num_threads, esMethod, j):
    # Start num_threads workers, each exporting a different 200-hit page, then wait for them
    threads = []
    for i in range(num_threads):
        page = j + i + 1
        p = j + i * 200
        thread = threading.Thread(target=main, args=(page, p, esMethod))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()


if __name__ == "__main__":
    j = 0
    for i in range(2):
        esMethod = EsMethod()
        # result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
        # total = result['hits']['total']['value']
        # if total == 0:
        #     log.info('++++已没有数据+++++')
        #     break
        start = time.time()
        num_threads = 5
        run_threads(num_threads, esMethod, j)
        j += 1000
        log.info(f'5线程 每个处理200条数据 总耗时{time.time() - start}秒')
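Both ES export scripts create a lock = threading.Lock() that is never used. MongoClient is thread-safe, so the plain insert_one calls from the worker threads are fine as written; the lock only becomes necessary if shared mutable state is added later. A small sketch under that assumption (the errors list is hypothetical, not part of the commit):

errors = []  # hypothetical shared list collected across worker threads

def record_error(item):
    # Guard shared mutable state when several threads append to it concurrently.
    with lock:
        errors.append(item)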