提交 36594c06 作者: ctt

新增小样本多分类和小样本多标签分类

上级 0e893d10
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @FileName: FewMultiClassConfig.py
# @Software: PyCharm
from model.base.views.config.BaseConfig import BaseConfig
class FewMultiConfig(BaseConfig):
    """Configuration wrapper used by the few-shot multi-class classifier.

    The class adds nothing of its own: construction is delegated entirely to
    ``BaseConfig``.
    """

    def __init__(self, config_path):
        """Initialise the parent configuration from ``config_path``."""
        super(FewMultiConfig, self).__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @FileName: FewMultiLabelEvaluator.py
# @Software: PyCharm
from model.base import BaseEvaluator
class FewMultiEvaluator(BaseEvaluator.BaseEvaluator):
    """Evaluator placeholder for the few-shot multi-class classifier.

    No behaviour is added on top of ``BaseEvaluator.BaseEvaluator``; the
    constructor simply forwards ``config_path`` to it.
    """

    def __init__(self, config_path):
        """Initialise the parent evaluator from ``config_path``."""
        super(FewMultiEvaluator, self).__init__(config_path)
\ No newline at end of file
import os
import sys
import paddle
from paddle.metric import Accuracy
from collections import defaultdict
from paddlenlp.utils.log import logger
from dataclasses import dataclass, field
from paddlenlp.prompt import (
AutoTemplate,
PromptModelForSequenceClassification,
PromptTrainer,
PromptTuningArguments,
SoftVerbalizer,
)
from paddlenlp.trainer import EarlyStoppingCallback, PdArgumentParser
from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer
from model.base import BaseModel, BaseRunner, BaseDataProcess
from model.classify.views.few_multi_class.data.FewMultiDataLoader import FewMultiDataLoader
from model.classify.views.few_multi_class.FewMultiClassConfig import FewMultiConfig
@dataclass
class DataArguments:
    """Command-line options that describe the dataset location and the prompt."""

    # Directory that holds the dataset files.
    data_dir: str = field(
        default="./data/",
        metadata=dict(
            help="Path to a dataset which includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional)."
        ),
    )
    # Prompt text handed to the template builder.
    prompt: str = field(
        default=None,
        metadata=dict(help="The input prompt for tuning."),
    )
@dataclass
class ModelArguments:
    """Command-line options that select the pretrained model and export format."""

    # NOTE: the default used to read "ernie-3.0-platform_base-zh", which looks
    # like the result of a project-wide search/replace; the multi-label runner
    # and the shipped configs all use "ernie-3.0-base-zh".
    model_name_or_path: str = field(
        default="ernie-3.0-base-zh",
        metadata={"help": "Build-in pretrained model name or the path to local model."},
    )
    export_type: str = field(
        default='paddle',
        metadata={"help": "The type to export. Support `paddle` and `onnx`."},
    )
class FewMultiModel(BaseModel.BaseModel):
    """Model placeholder for the few-shot multi-class classifier.

    Everything is inherited from ``BaseModel.BaseModel``; the constructor only
    forwards ``config_path``.
    """

    def __init__(self, config_path):
        """Initialise the parent model from ``config_path``."""
        super(FewMultiModel, self).__init__(config_path)
class FewMultiDataProcess(BaseDataProcess.BaseDataProcess):
    """Data-processing placeholder for the few-shot multi-class classifier.

    Everything is inherited from ``BaseDataProcess.BaseDataProcess``; the
    constructor only forwards ``config_path``.
    """

    def __init__(self, config_path):
        """Initialise the parent data processor from ``config_path``."""
        super(FewMultiDataProcess, self).__init__(config_path)
class FewMultiRunner(BaseRunner.BaseRunner):
    """Prompt-tuning runner for few-shot multi-class text classification.

    The configuration wrapped by ``FewMultiConfig`` is translated into
    command-line style arguments, which ``PdArgumentParser`` then parses into
    ``ModelArguments``, ``DataArguments`` and ``PromptTuningArguments``.
    """

    def __init__(self, config_path):
        """Remember ``config_path`` and load the configuration stored there."""
        super().__init__(config_path)
        self.config_path = config_path
        self.config = FewMultiConfig(self.config_path)

    def _build_argv(self):
        """Translate the parsed configuration into an ``argv``-style list.

        Returns:
            list: ``[<this file>, '--key', 'value', ...]`` built from the
            ``embedding``, ``model`` and ``runner`` sections of the config,
            followed by the fixed training flags and ``--data_dir``.
        """
        parsed = self.config._parsed_file
        argv = [os.path.abspath(__file__)]
        for key, value in parsed['embedding'].items():
            if key == 'name':
                argv.extend(['--model_name_or_path', value])
            elif key not in ('tokenizer_path', 'embedding_path'):
                argv.extend(['--' + key, str(value)])
        for key, value in parsed['model'].items():
            if key == 'model_name':
                # The configured name is not forwarded; checkpoints always go
                # to a fixed directory.
                argv.extend(['--output_dir', './checkpoints/'])
            elif key != 'model_path':
                argv.extend(['--' + key, str(value)])
        for key, value in parsed['runner'].items():
            argv.extend(['--' + key, str(value)])
        # The dataset is expected in a "data" directory next to the config file.
        data_path = os.path.join(os.path.dirname(self.config_path), 'data')
        argv.extend(['--do_train', '--do_eval', 'true', '--load_best_model_at_end', '--do_predict', 'true', '--logging_steps', '5'])
        argv.extend(['--data_dir', data_path])
        return argv

    @staticmethod
    def _load_label_words(label_file):
        """Read ``label.txt`` into a mapping of label -> list of label words.

        Each non-empty line is either ``label==word`` or just ``label``; in the
        latter case the part after the last ``##`` is used as the word.
        Blank lines are skipped so that a trailing newline does not register
        an empty-string label.
        """
        label_words = defaultdict(list)
        with open(label_file, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                data = line.split("==")
                word = data[1] if len(data) > 1 else data[0].split("##")[-1]
                label_words[data[0]].append(word)
        return label_words

    def train(self, logger):
        """Train, evaluate, predict and optionally export the prompt model.

        Args:
            logger: object exposing ``info``; note that this parameter shadows
                the module-level PaddleNLP ``logger`` inside this method.
        """
        print(self.config)
        argv = self._build_argv()
        print(argv)
        # PdArgumentParser reads sys.argv, so it is swapped in for the duration
        # of the run and restored afterwards instead of being left modified
        # for the rest of the process.
        original_argv = sys.argv
        sys.argv = argv
        try:
            self._fit(logger)
        finally:
            sys.argv = original_argv

    def _fit(self, logger):
        """Run the prompt-tuning pipeline using the arguments in ``sys.argv``."""
        parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments))
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
        paddle.set_device(training_args.device)

        # Load the pretrained language model.
        model = AutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

        # Define the template for preprocess and the verbalizer for postprocess.
        template = AutoTemplate.create_from(data_args.prompt, tokenizer, training_args.max_seq_length, model=model)
        logger.info("Using template: {}".format(template.prompt))
        label_file = os.path.join(data_args.data_dir, "label.txt")
        label_words = self._load_label_words(label_file)
        verbalizer = SoftVerbalizer(label_words, tokenizer, model)

        # Load the few-shot datasets.
        train_ds, dev_ds, test_ds = FewMultiDataLoader.load_local_dataset(
            data_path=data_args.data_dir, splits=["train", "dev", "test"], label_list=verbalizer.labels_to_ids
        )

        # Define the criterion.
        criterion = paddle.nn.CrossEntropyLoss()

        # Initialize the prompt model with the above variables.
        prompt_model = PromptModelForSequenceClassification(
            model, template, verbalizer, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout
        )

        # Define the metric function.
        def compute_metrics(eval_preds):
            metric = Accuracy()
            correct = metric.compute(paddle.to_tensor(eval_preds.predictions), paddle.to_tensor(eval_preds.label_ids))
            metric.update(correct)
            acc = metric.accumulate()
            return {"accuracy": acc}

        # Define the early-stopping callback.
        callbacks = [EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.0)]

        # Initialize the trainer.
        trainer = PromptTrainer(
            model=prompt_model,
            tokenizer=tokenizer,
            args=training_args,
            criterion=criterion,
            train_dataset=train_ds,
            eval_dataset=dev_ds,
            callbacks=callbacks,
            compute_metrics=compute_metrics,
        )

        # Training.
        if training_args.do_train:
            train_result = trainer.train(resume_from_checkpoint=None)
            metrics = train_result.metrics
            trainer.save_model()
            trainer.log_metrics("train", metrics)
            trainer.save_metrics("train", metrics)
            trainer.save_state()

        # Prediction.
        if training_args.do_predict:
            test_ret = trainer.predict(test_ds)
            trainer.log_metrics("test", test_ret.metrics)

        # Export static model.
        if training_args.do_export:
            export_path = os.path.join(training_args.output_dir, "export")
            trainer.export_model(export_path, export_type=model_args.export_type)
# if __name__ == '__main__':
# FewMultiRunner('/data/ctt/platform_zzsn_new/media/123456/config.yaml').train('')
\ No newline at end of file
# -*- coding: utf-8 -*-
# @Time : 2023/2/7 15:48
# @Author : ctt
# @File : __init__.py
# @Project : platform_zzsn
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/6/1 11:19
# @Author : 程婷婷
# @FileName: FewMultiDataLoader.py
# @Software: PyCharm
import os
from paddlenlp.datasets import load_dataset
class FewMultiDataLoader():
    """Loader for single-label few-shot classification datasets stored as text files."""

    def __init__(self, config_path):
        """Remember ``config_path``.

        The class has no explicit base class, so ``config_path`` must not be
        forwarded to ``object.__init__`` (doing so raises ``TypeError``).
        """
        super().__init__()
        self.config_path = config_path

    @staticmethod
    def _read_examples(data_file, label_list):
        """Yield one example dict per non-empty line of ``data_file``.

        A line without a tab yields ``{"text_a": text}``; a line of the form
        ``text<TAB>label`` yields ``{"text_a": text, "labels": label_list[label]}``.

        Raises:
            ValueError: if a line has more than two tab-separated fields.
            KeyError: if a label is missing from ``label_list``.
        """
        with open(data_file, "r", encoding="utf-8") as fp:
            for line_no, line in enumerate(fp, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no example.
                    continue
                fields = line.split("\t")
                if len(fields) == 1:
                    yield {"text_a": fields[0]}
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        "{}:{}: expected 'text<TAB>label' but found {} tab-separated fields".format(
                            data_file, line_no, len(fields)
                        )
                    )
                text, label = fields
                yield {"text_a": text, "labels": label_list[label]}

    @staticmethod
    def load_local_dataset(data_path, splits, label_list):
        """
        Read datasets from files.

        Args:
            data_path (str):
                Path to the dataset directory, including label.txt, train.txt,
                dev.txt, test.txt (and data.txt).
            splits (list):
                Which file(s) to load, such as ['train', 'dev', 'test'].
            label_list(dict):
                A dictionary to encode labels as ids, which should be compatible
                with that of verbalizer.

        Returns:
            list: one dataset per entry of ``splits``, in the same order.
        """
        assert isinstance(splits, list) and len(splits) > 0

        split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
        dataset = []
        for split in splits:
            data_file = os.path.join(data_path, split_map[split])
            dataset.append(
                load_dataset(
                    FewMultiDataLoader._read_examples, data_file=data_file, label_list=label_list, lazy=False
                )
            )
        return dataset
# -*- coding: utf-8 -*-
# @Time : 2023/2/7 19:59
# @Author : ctt
# @File : __init__.py
# @Project : platform_zzsn
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/11 20:06
# @FileName: FewMultiLabelConfig.py
# @Software: PyCharm
from model.base.views.config.BaseConfig import BaseConfig
class FewMultiLabelConfig(BaseConfig):
    """Configuration wrapper used by the few-shot multi-label classifier.

    The class adds nothing of its own: construction is delegated entirely to
    ``BaseConfig``.
    """

    def __init__(self, config_path):
        """Initialise the parent configuration from ``config_path``."""
        super(FewMultiLabelConfig, self).__init__(config_path)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2021/5/14 14:14
# @FileName: FewMultiLabelEvaluator.py
# @Software: PyCharm
import numpy as np
from paddle.metric import Metric
from paddlenlp.utils.log import logger
from sklearn.metrics import classification_report, f1_score
from model.base import BaseEvaluator
class FewMultiLabelEvaluator(BaseEvaluator.BaseEvaluator):
    """Evaluator placeholder for the few-shot multi-label classifier.

    No behaviour is added on top of ``BaseEvaluator.BaseEvaluator``; the
    constructor simply forwards ``config_path`` to it.
    """

    def __init__(self, config_path):
        """Initialise the parent evaluator from ``config_path``."""
        super(FewMultiLabelEvaluator, self).__init__(config_path)
class MetricReport(Metric):
    """
    F1 score for multi-label text classification task.

    Probabilities and labels are accumulated across ``update`` calls; a
    probability strictly greater than ``threshold`` counts as a positive
    prediction.

    Args:
        name (str): Name returned by ``name()``.
        average (str): Stored on the instance; the scores returned by
            ``accumulate`` are always both the micro and the macro average.
        threshold (float): Decision threshold applied to the probabilities.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, name="MetricReport", average="micro", threshold=0.5):
        super(MetricReport, self).__init__()
        self.average = average
        self.threshold = threshold
        self._name = name
        self.reset()

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None
        # Cleared as well so that predictions from a previous run cannot be
        # mistaken for current ones.
        self.y_pred = None

    @staticmethod
    def _to_numpy(values):
        """Return ``values`` as a numpy array, calling ``.numpy()`` when available."""
        return values.numpy() if hasattr(values, "numpy") else np.asarray(values)

    def f1_score(self, y_prob):
        """
        Compute micro f1 score and macro f1 score
        """
        self.y_pred = y_prob > self.threshold
        # Inside a method the bare name ``f1_score`` resolves to the
        # module-level scikit-learn function, not to this method.
        micro_f1_score = f1_score(y_pred=self.y_pred, y_true=self.y_true, average="micro")
        macro_f1_score = f1_score(y_pred=self.y_pred, y_true=self.y_true, average="macro")
        return micro_f1_score, macro_f1_score

    def update(self, probs, labels):
        """
        Update the probability and label
        """
        probs = self._to_numpy(probs)
        labels = self._to_numpy(labels)
        if self.y_prob is not None:
            self.y_prob = np.concatenate([self.y_prob, probs], axis=0)
        else:
            self.y_prob = probs
        if self.y_true is not None:
            self.y_true = np.concatenate([self.y_true, labels], axis=0)
        else:
            self.y_true = labels

    def accumulate(self):
        """
        Returns micro f1 score and macro f1 score
        """
        micro_f1_score, macro_f1_score = self.f1_score(y_prob=self.y_prob)
        return micro_f1_score, macro_f1_score

    def report(self):
        """
        Logs the classification report
        """
        self.y_pred = self.y_prob > self.threshold
        logger.info("classification report:\n" + classification_report(self.y_true, self.y_pred, digits=4))

    def name(self):
        """
        Returns metric name
        """
        return self._name
import os
import sys
import paddle
import paddle.nn.functional as F
from collections import defaultdict
from dataclasses import dataclass, field
from paddlenlp.prompt import (
AutoTemplate,
PromptModelForSequenceClassification,
PromptTrainer,
PromptTuningArguments,
SoftVerbalizer,
)
from paddlenlp.trainer import EarlyStoppingCallback, PdArgumentParser
from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer
from paddlenlp.utils.log import logger
from model.base import BaseRunner
from model.classify.views.few_multi_label.FewMultiLabelEvaluator import MetricReport
from model.classify.views.few_multi_label.FewMultiLabelConfig import FewMultiLabelConfig
from model.classify.views.few_multi_label.data.FewMultiLabelDataLoader import FewMultiLabelDataLoader
@dataclass
class DataArguments:
    """Command-line options that describe the dataset location and the prompt."""

    # Directory that holds the dataset files.
    data_dir: str = field(
        default="./data",
        metadata=dict(help="The dataset dictionary includes train.txt, dev.txt and label.txt files."),
    )
    # Prompt text handed to the template builder.
    prompt: str = field(
        default=None,
        metadata=dict(help="The input prompt for tuning."),
    )
@dataclass
class ModelArguments:
    """Command-line options that select the pretrained model and export format."""

    # Name of a built-in pretrained model, or a path to a local one.
    model_name_or_path: str = field(
        default="ernie-3.0-base-zh",
        metadata=dict(help="The build-in pretrained model or the path to local model."),
    )
    # Format used when the trained model is exported.
    export_type: str = field(
        default='paddle',
        metadata=dict(help="The type to export. Support `paddle` and `onnx`."),
    )
class FewMultiLabelRunner(BaseRunner.BaseRunner):
    """Prompt-tuning runner for few-shot multi-label text classification.

    The configuration wrapped by ``FewMultiLabelConfig`` is translated into
    command-line style arguments, which ``PdArgumentParser`` then parses into
    ``ModelArguments``, ``DataArguments`` and ``PromptTuningArguments``.
    """

    def __init__(self, config_path):
        """Remember ``config_path`` and load the configuration stored there."""
        super().__init__(config_path)
        self.config_path = config_path
        self.config = FewMultiLabelConfig(self.config_path)

    def _build_argv(self):
        """Translate the parsed configuration into an ``argv``-style list.

        Returns:
            list: ``[<this file>, '--key', 'value', ...]`` built from the
            ``embedding``, ``model`` and ``runner`` sections of the config,
            followed by the fixed training flags and ``--data_dir``.
        """
        parsed = self.config._parsed_file
        argv = [os.path.abspath(__file__)]
        for key, value in parsed['embedding'].items():
            if key == 'name':
                argv.extend(['--model_name_or_path', value])
            elif key not in ('tokenizer_path', 'embedding_path'):
                argv.extend(['--' + key, str(value)])
        for key, value in parsed['model'].items():
            if key == 'model_name':
                # The configured name is not forwarded; checkpoints always go
                # to a fixed directory.
                argv.extend(['--output_dir', './checkpoints/'])
            elif key != 'model_path':
                argv.extend(['--' + key, str(value)])
        for key, value in parsed['runner'].items():
            argv.extend(['--' + key, str(value)])
        # The dataset is expected in a "data" directory next to the config file.
        data_path = os.path.join(os.path.dirname(self.config_path), 'data')
        argv.extend(
            ['--do_train', '--do_eval', 'true', '--load_best_model_at_end', '--do_predict', 'true', '--logging_steps',
             '5'])
        argv.extend(['--data_dir', data_path])
        return argv

    @staticmethod
    def _load_label_words(label_file):
        """Read ``label.txt`` into a mapping of label -> list of label words.

        Each non-empty line is either ``label==word`` or just ``label``; in the
        latter case the part after the last ``##`` is used as the word.
        Blank lines are skipped so that a trailing newline does not register
        an empty-string label.
        """
        label_words = defaultdict(list)
        with open(label_file, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                data = line.split("==")
                word = data[1] if len(data) > 1 else data[0].split("##")[-1]
                label_words[data[0]].append(word)
        return label_words

    def train(self, logger):
        """Train, evaluate, predict and optionally export the prompt model.

        Args:
            logger: object exposing ``info``; note that this parameter shadows
                the module-level PaddleNLP ``logger`` inside this method.
        """
        # Parse the arguments.
        print(self.config)
        argv = self._build_argv()
        print(argv)
        # PdArgumentParser reads sys.argv, so it is swapped in for the duration
        # of the run and restored afterwards instead of being left modified
        # for the rest of the process.
        original_argv = sys.argv
        sys.argv = argv
        try:
            self._fit(logger)
        finally:
            sys.argv = original_argv

    def _fit(self, logger):
        """Run the prompt-tuning pipeline using the arguments in ``sys.argv``."""
        parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments))
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
        training_args.print_config(model_args, "Model")
        training_args.print_config(data_args, "Data")
        paddle.set_device(training_args.device)

        # Load the pretrained language model.
        model = AutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

        # Define the template for preprocess and the verbalizer for postprocess.
        template = AutoTemplate.create_from(data_args.prompt, tokenizer, training_args.max_seq_length, model=model)
        logger.info("Using template: {}".format(template.prompt))
        label_file = os.path.join(data_args.data_dir, "label.txt")
        label_words = self._load_label_words(label_file)
        verbalizer = SoftVerbalizer(label_words, tokenizer, model)

        # Load the few-shot datasets.
        train_ds, dev_ds, test_ds = FewMultiLabelDataLoader.load_local_dataset(
            data_path=data_args.data_dir, splits=["train", "dev", "test"], label_list=verbalizer.labels_to_ids
        )

        # Define the criterion.
        criterion = paddle.nn.BCEWithLogitsLoss()

        # Initialize the prompt model with the above variables.
        prompt_model = PromptModelForSequenceClassification(
            model, template, verbalizer, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout
        )

        # Define the metric function.
        def compute_metrics(eval_preds):
            metric = MetricReport()
            # The criterion works on logits, so probabilities are obtained
            # with a sigmoid before thresholding inside the metric.
            preds = F.sigmoid(paddle.to_tensor(eval_preds.predictions))
            metric.update(preds, paddle.to_tensor(eval_preds.label_ids))
            micro_f1_score, macro_f1_score = metric.accumulate()
            return {"micro_f1_score": micro_f1_score, "macro_f1_score": macro_f1_score}

        # Define the early-stopping callback.
        callbacks = [EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.0)]

        # Initialize the trainer.
        trainer = PromptTrainer(
            model=prompt_model,
            tokenizer=tokenizer,
            args=training_args,
            criterion=criterion,
            train_dataset=train_ds,
            eval_dataset=dev_ds,
            callbacks=callbacks,
            compute_metrics=compute_metrics,
        )

        # Training.
        if training_args.do_train:
            train_result = trainer.train(resume_from_checkpoint=None)
            metrics = train_result.metrics
            trainer.save_model()
            trainer.log_metrics("train", metrics)
            trainer.save_metrics("train", metrics)
            trainer.save_state()

        # Prediction.
        if training_args.do_predict:
            test_ret = trainer.predict(test_ds)
            trainer.log_metrics("test", test_ret.metrics)

        # Export static model.
        if training_args.do_export:
            export_path = os.path.join(training_args.output_dir, "export")
            trainer.export_model(export_path, export_type=model_args.export_type)
# -*- coding: utf-8 -*-
# @Time : 2023/2/9 9:53
# @Author : ctt
# @File : __init__.py
# @Project : platform_zzsn
# -*- coding: utf-8 -*-
# @Time : 2023/2/9 10:01
# @Author : ctt
# @File : FewMultiLabelDataLoader
# @Project : platform_zzsn
import os
from paddlenlp.datasets import load_dataset
class FewMultiLabelDataLoader():
    """Loader for multi-label few-shot classification datasets stored as text files."""

    def __init__(self, config_path):
        """Remember ``config_path``.

        The class has no explicit base class, so ``config_path`` must not be
        forwarded to ``object.__init__`` (doing so raises ``TypeError``).
        """
        super().__init__()
        self.config_path = config_path

    @staticmethod
    def _read_examples(data_file, label_list):
        """Yield one example dict per non-empty line of ``data_file``.

        A line without a tab yields ``{"text_a": text}``. A line of the form
        ``text<TAB>l1,l2`` yields ``{"text_a": text, "labels": [...]}`` where
        the list holds one float per key of ``label_list`` (in iteration
        order): 1.0 if that label is listed on the line, 0.0 otherwise.
        Labels on the line that are absent from ``label_list`` are ignored.

        Raises:
            ValueError: if a line has more than two tab-separated fields.
        """
        with open(data_file, "r", encoding="utf-8") as fp:
            for line_no, line in enumerate(fp, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no example.
                    continue
                fields = line.split("\t")
                if len(fields) == 1:
                    yield {"text_a": fields[0]}
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        "{}:{}: expected 'text<TAB>labels' but found {} tab-separated fields".format(
                            data_file, line_no, len(fields)
                        )
                    )
                text, raw_labels = fields
                # Strip each label so "a, b" is treated like "a,b"; a set keeps
                # the membership test constant-time per known label.
                present = {name.strip() for name in raw_labels.split(",")}
                labels = [float(1) if name in present else float(0) for name in label_list]
                yield {"text_a": text, "labels": labels}

    @staticmethod
    def load_local_dataset(data_path, splits, label_list):
        """
        Load dataset for multi-label classification from files, where
        there is one example per line. Text and label are separated
        by '\t', and multiple labels are delimited by ','.

        Args:
            data_path (str):
                Path to the dataset directory, including label.txt, train.txt,
                dev.txt (and data.txt).
            splits (list):
                Which file(s) to load, such as ['train', 'dev', 'test'].
            label_list (dict):
                The dictionary that maps labels to indices.

        Returns:
            list: one dataset per entry of ``splits``, in the same order.
        """
        split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
        datasets = []
        for split in splits:
            data_file = os.path.join(data_path, split_map[split])
            datasets.append(
                load_dataset(
                    FewMultiLabelDataLoader._read_examples, data_file=data_file, label_list=label_list, lazy=False
                )
            )
        return datasets
# -*- coding: utf-8 -*-
# @Time : 2023/2/9 10:00
# @Author : ctt
# @File : __init__.py
# @Project : platform_zzsn
# Configuration for the few-shot multi-class classifier.

# Locations of the raw train/test files (left empty here).
data_process:
  test_file_path: ""
  train_file_path: ""

# Pretrained language model and tokenisation settings.
embedding:
  name: "ernie-3.0-base-zh"
  tokenizer_path: null
  embedding_path: null
  max_seq_length: 128

# Training hyper-parameters.
model:
  model_name: multi_class_model
  model_path: ""
  learning_rate: 3e-5
  ppt_learning_rate: 3e-4
  num_train_epochs: 10
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 32
  metric_for_best_model: accuracy
  evaluation_strategy: epoch
  save_strategy: epoch
  save_total_limit: 1
  # Prompt text used to build the template.
  prompt: "这条新闻写的是"

evaluate:
  average: micro

# Runtime settings.
runner:
  device: cpu
\ No newline at end of file
# Configuration for the few-shot multi-label classifier.

# Locations of the raw train/test files (left empty here).
data_process:
  test_file_path: ''
  train_file_path: ''

# Pretrained language model and tokenisation settings.
embedding:
  name: 'ernie-3.0-base-zh'
  tokenizer_path: null
  embedding_path: null
  max_seq_length: 128

# Training hyper-parameters.
model:
  model_name: multi_class_model
  model_path: ''
  learning_rate: 3e-5
  ppt_learning_rate: 3e-4
  num_train_epochs: 10
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 32
  # Must name a metric the multi-label runner actually reports. Its
  # compute_metrics returns only micro_f1_score and macro_f1_score, so the
  # previous value "accuracy" could not be found when picking the best model.
  metric_for_best_model: micro_f1_score
  evaluation_strategy: epoch
  save_strategy: epoch
  save_total_limit: 1
  # Prompt text used to build the template.
  prompt: "这句话包含的要素有"

evaluate:
  average: micro

# Runtime settings.
runner:
  device: cpu
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论