新增小样本多分类和小样本多标签分类

36594c06 · ctt · 0e893d10 · 36594c06 · 36594c06 · 36594c06
--- a/model/classify/views/few_multi_class/FewMultiClassConfig.py
+++ b/model/classify/views/few_multi_class/FewMultiClassConfig.py
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Time    : 2021/5/11 20:06
+# @FileName: FewMultiClassConfig.py
+# @Software: PyCharm
+from model.base.views.config.BaseConfig import BaseConfig
+class FewMultiConfig(BaseConfig):
+    """Configuration object for the few-shot multi-class classifier.
+    Thin subclass of ``BaseConfig``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base configuration class.
+        super(FewMultiConfig, self).__init__(config_path)
--- a/model/classify/views/few_multi_class/FewMultiClassEvaluator.py
+++ b/model/classify/views/few_multi_class/FewMultiClassEvaluator.py
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Time    : 2021/5/14 14:14
+# @FileName: FewMultiClassEvaluator.py
+# @Software: PyCharm
+from model.base import BaseEvaluator
+class FewMultiEvaluator(BaseEvaluator.BaseEvaluator):
+    """Evaluator for the few-shot multi-class classifier.
+    Thin subclass of ``BaseEvaluator.BaseEvaluator``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base evaluator.
+        super(FewMultiEvaluator, self).__init__(config_path)
\ No newline at end of file
--- a/model/classify/views/few_multi_class/FewMultiClassRunner.py
+++ b/model/classify/views/few_multi_class/FewMultiClassRunner.py
+import os
+import sys
+import paddle
+from paddle.metric import Accuracy
+from collections import defaultdict
+from paddlenlp.utils.log import logger
+from dataclasses import dataclass, field
+from paddlenlp.prompt import (
+    AutoTemplate,
+    PromptModelForSequenceClassification,
+    PromptTrainer,
+    PromptTuningArguments,
+    SoftVerbalizer,
+)
+from paddlenlp.trainer import EarlyStoppingCallback, PdArgumentParser
+from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer
+from model.base import BaseModel, BaseRunner, BaseDataProcess
+from model.classify.views.few_multi_class.data.FewMultiDataLoader import FewMultiDataLoader
+from model.classify.views.few_multi_class.FewMultiClassConfig import FewMultiConfig
+@dataclass
+class DataArguments:
+    """Dataset-related options parsed from the command line by ``PdArgumentParser``."""
+    # Dataset directory; the runner reads label.txt and the split files from it.
+    data_dir: str = field(
+        default="./data/",
+        metadata=dict(help="Path to a dataset which includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional)."),
+    )
+    # Prompt text the runner hands to AutoTemplate.create_from.
+    prompt: str = field(
+        default=None,
+        metadata=dict(help="The input prompt for tuning."),
+    )
+@dataclass
+class ModelArguments:
+    """Model-related options parsed from the command line by ``PdArgumentParser``."""
+    # Name or local path passed to AutoModelForMaskedLM / AutoTokenizer.from_pretrained.
+    # The default matches the model name used by the multi-label runner and the
+    # shipped YAML configs ("ernie-3.0-base-zh").
+    model_name_or_path: str = field(default="ernie-3.0-base-zh", metadata={"help": "Build-in pretrained model name or the path to local model."})
+    # Export format handed to trainer.export_model when do_export is set.
+    export_type: str = field(default='paddle', metadata={"help": "The type to export. Support `paddle` and `onnx`."})
+class FewMultiModel(BaseModel.BaseModel):
+    """Model wrapper for the few-shot multi-class classifier.
+    Thin subclass of ``BaseModel.BaseModel``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base model.
+        super(FewMultiModel, self).__init__(config_path)
+class FewMultiDataProcess(BaseDataProcess.BaseDataProcess):
+    """Data-processing step for the few-shot multi-class classifier.
+    Thin subclass of ``BaseDataProcess.BaseDataProcess``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base data processor.
+        super(FewMultiDataProcess, self).__init__(config_path)
+class FewMultiRunner(BaseRunner.BaseRunner):
+    """
+    Runner for few-shot multi-class classification using PaddleNLP prompt tuning.
+    The parsed config is converted into command-line style arguments, which are
+    then parsed into dataclasses with ``PdArgumentParser``.
+    """
+    def __init__(self, config_path):
+        """
+        Args:
+            config_path (str): Path to the config file. The dataset is read from
+                the ``data`` directory located next to this file.
+        """
+        super().__init__(config_path)
+        self.config_path = config_path
+        self.config = FewMultiConfig(self.config_path)
+    def _build_cli_args(self):
+        """
+        Translate the parsed config into an argv-style list (without argv[0]).
+        Returns:
+            list: Flags and values in the order embedding, model, runner,
+            fixed training flags, ``--data_dir``.
+        """
+        parsed = self.config._parsed_file
+        cli_args = []
+        for key, value in parsed['embedding'].items():
+            if key == 'name':
+                # The embedding name doubles as the pretrained model identifier.
+                cli_args.extend(['--model_name_or_path', value])
+            elif key not in ('tokenizer_path', 'embedding_path'):
+                cli_args.extend(['--' + key, str(value)])
+        for key, value in parsed['model'].items():
+            if key == 'model_name':
+                # Only the presence of the key matters; its value is not forwarded.
+                cli_args.extend(['--output_dir', './checkpoints/'])
+            elif key != 'model_path':
+                cli_args.extend(['--' + key, str(value)])
+        for key, value in parsed['runner'].items():
+            cli_args.extend(['--' + key, str(value)])
+        cli_args.extend(['--do_train', '--do_eval', 'true', '--load_best_model_at_end', '--do_predict', 'true', '--logging_steps', '5'])
+        cli_args.extend(['--data_dir', os.path.join(os.path.dirname(self.config_path), 'data')])
+        return cli_args
+    def train(self, logger):
+        """
+        Train, evaluate, predict on the test split and optionally export the model.
+        Args:
+            logger: Object exposing ``info(msg)``. Note that this parameter
+                shadows the module-level ``logger`` import inside the method.
+        """
+        logger.info("Config: {}".format(self.config))
+        argv = [os.path.abspath(__file__)] + self._build_cli_args()
+        logger.info("Arguments: {}".format(argv))
+        parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments))
+        # The parser reads sys.argv; swap it in only for the duration of the
+        # parse so the host process keeps its own command line afterwards.
+        saved_argv = sys.argv
+        sys.argv = argv
+        try:
+            model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+        finally:
+            sys.argv = saved_argv
+        paddle.set_device(training_args.device)
+        # Load the pretrained language model.
+        model = AutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+        # Define the template for preprocess and the verbalizer for postprocess.
+        template = AutoTemplate.create_from(data_args.prompt, tokenizer, training_args.max_seq_length, model=model)
+        logger.info("Using template: {}".format(template.prompt))
+        # Each line of label.txt is either "label" or "label==word"; without an
+        # explicit word, the last "##"-separated segment of the label is used.
+        label_file = os.path.join(data_args.data_dir, "label.txt")
+        label_words = defaultdict(list)
+        with open(label_file, "r", encoding="utf-8") as fp:
+            for line in fp:
+                data = line.strip().split("==")
+                word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+                label_words[data[0]].append(word)
+        verbalizer = SoftVerbalizer(label_words, tokenizer, model)
+        # Load the few-shot datasets.
+        train_ds, dev_ds, test_ds = FewMultiDataLoader.load_local_dataset(
+            data_path=data_args.data_dir, splits=["train", "dev", "test"], label_list=verbalizer.labels_to_ids
+        )
+        # Define the criterion.
+        criterion = paddle.nn.CrossEntropyLoss()
+        # Initialize the prompt model with the above variables.
+        prompt_model = PromptModelForSequenceClassification(
+            model, template, verbalizer, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout
+        )
+        # Define the metric function.
+        def compute_metrics(eval_preds):
+            metric = Accuracy()
+            correct = metric.compute(paddle.to_tensor(eval_preds.predictions), paddle.to_tensor(eval_preds.label_ids))
+            metric.update(correct)
+            acc = metric.accumulate()
+            return {"accuracy": acc}
+        # Define the early-stopping callback.
+        callbacks = [EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.0)]
+        # Initialize the trainer.
+        trainer = PromptTrainer(
+            model=prompt_model,
+            tokenizer=tokenizer,
+            args=training_args,
+            criterion=criterion,
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            callbacks=callbacks,
+            compute_metrics=compute_metrics,
+        )
+        # Training.
+        if training_args.do_train:
+            train_result = trainer.train(resume_from_checkpoint=None)
+            metrics = train_result.metrics
+            trainer.save_model()
+            trainer.log_metrics("train", metrics)
+            trainer.save_metrics("train", metrics)
+            trainer.save_state()
+        # Prediction.
+        if training_args.do_predict:
+            test_ret = trainer.predict(test_ds)
+            trainer.log_metrics("test", test_ret.metrics)
+        # Export static model.
+        if training_args.do_export:
+            export_path = os.path.join(training_args.output_dir, "export")
+            trainer.export_model(export_path, export_type=model_args.export_type)
+# if __name__ == '__main__':
+#     FewMultiRunner('/data/ctt/platform_zzsn_new/media/123456/config.yaml').train('')
\ No newline at end of file
--- a/model/classify/views/few_multi_class/__init__.py
+++ b/model/classify/views/few_multi_class/__init__.py
+# -*- coding: utf-8 -*-
+# @Time : 2023/2/7 15:48
+# @Author : ctt
+# @File : __init__.py
+# @Project : platform_zzsn
--- a/model/classify/views/few_multi_class/data/FewMultiDataLoader.py
+++ b/model/classify/views/few_multi_class/data/FewMultiDataLoader.py
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Time    : 2021/6/1 11:19
+# @Author  : 程婷婷
+# @FileName: FewMultiDataLoader.py
+# @Software: PyCharm
+import os
+from paddlenlp.datasets import load_dataset
+class FewMultiDataLoader():
+    """Loads few-shot multi-class datasets from tab-separated text files."""
+    def __init__(self, config_path):
+        """
+        Args:
+            config_path (str): Path to the config file; stored on the instance.
+        """
+        # This class has no base class, so forwarding config_path to
+        # object.__init__ raises TypeError; keep the path on the instance instead.
+        self.config_path = config_path
+    @staticmethod
+    def load_local_dataset(data_path, splits, label_list):
+        """
+        Read datasets from files.
+        Each line holds either a text alone or ``text<TAB>label``.
+        Args:
+            data_path (str):
+                Path to the dataset directory, including label.txt, train.txt,
+                dev.txt, test.txt (and data.txt).
+            splits (list):
+                Which file(s) to load, such as ['train', 'dev', 'test'].
+            label_list(dict):
+                A dictionary to encode labels as ids, which should be compatible
+                with that of verbalizer.
+        Returns:
+            list: One dataset per entry of ``splits``, in the same order.
+        """
+        def _reader(data_file, label_list):
+            with open(data_file, "r", encoding="utf-8") as fp:
+                for line in fp:
+                    data = line.strip().split("\t")
+                    if len(data) == 1:
+                        # Unlabeled example: text only.
+                        yield {"text_a": data[0]}
+                    else:
+                        text, label = data
+                        yield {"text_a": text, "labels": label_list[label]}
+        assert isinstance(splits, list) and len(splits) > 0
+        split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
+        dataset = []
+        for split in splits:
+            data_file = os.path.join(data_path, split_map[split])
+            dataset.append(load_dataset(_reader, data_file=data_file, label_list=label_list, lazy=False))
+        return dataset
--- a/model/classify/views/few_multi_class/data/__init__.py
+++ b/model/classify/views/few_multi_class/data/__init__.py
+# -*- coding: utf-8 -*-
+# @Time : 2023/2/7 19:59
+# @Author : ctt
+# @File : __init__.py
+# @Project : platform_zzsn
--- a/model/classify/views/few_multi_label/FewMultiLabelConfig.py
+++ b/model/classify/views/few_multi_label/FewMultiLabelConfig.py
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Time    : 2021/5/11 20:06
+# @FileName: FewMultiLabelConfig.py
+# @Software: PyCharm
+from model.base.views.config.BaseConfig import BaseConfig
+class FewMultiLabelConfig(BaseConfig):
+    """Configuration object for the few-shot multi-label classifier.
+    Thin subclass of ``BaseConfig``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base configuration class.
+        super(FewMultiLabelConfig, self).__init__(config_path)
--- a/model/classify/views/few_multi_label/FewMultiLabelEvaluator.py
+++ b/model/classify/views/few_multi_label/FewMultiLabelEvaluator.py
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Time    : 2021/5/14 14:14
+# @FileName: FewMultiLabelEvaluator.py
+# @Software: PyCharm
+import numpy as np
+from paddle.metric import Metric
+from paddlenlp.utils.log import logger
+from sklearn.metrics import classification_report, f1_score
+from model.base import BaseEvaluator
+class FewMultiLabelEvaluator(BaseEvaluator.BaseEvaluator):
+    """Evaluator for the few-shot multi-label classifier.
+    Thin subclass of ``BaseEvaluator.BaseEvaluator``; it adds no behavior of its own.
+    """
+    def __init__(self, config_path):
+        # Hand the path straight to the base evaluator.
+        super(FewMultiLabelEvaluator, self).__init__(config_path)
+class MetricReport(Metric):
+    """
+    F1 score for multi-label text classification task.
+    Probabilities and labels are accumulated across ``update`` calls and
+    binarized with ``threshold`` when the scores are computed.
+    Args:
+        name (str): Value returned by ``name()``.
+        average (str): Stored on the instance; the methods below do not read it
+            and always compute both micro and macro F1.
+        threshold (float): A probability strictly greater than this value counts
+            as a positive prediction.
+    """
+    def __init__(self, name="MetricReport", average="micro", threshold=0.5):
+        super(MetricReport, self).__init__()
+        self.average = average
+        self.threshold = threshold
+        self._name = name
+        self.reset()
+    def reset(self):
+        """
+        Resets all of the metric state.
+        """
+        self.y_prob = None
+        self.y_true = None
+        self.y_pred = None
+    def f1_score(self, y_prob):
+        """
+        Compute micro f1 score and macro f1 score
+        Args:
+            y_prob: Probabilities to binarize; compared against ``self.y_true``.
+        Returns:
+            tuple: ``(micro_f1_score, macro_f1_score)``.
+        """
+        self.y_pred = y_prob > self.threshold
+        # The bare name ``f1_score`` below is the sklearn function imported at
+        # module level, not this method.
+        micro_f1_score = f1_score(y_pred=self.y_pred, y_true=self.y_true, average="micro")
+        macro_f1_score = f1_score(y_pred=self.y_pred, y_true=self.y_true, average="macro")
+        return micro_f1_score, macro_f1_score
+    def update(self, probs, labels):
+        """
+        Update the probability and label
+        Args:
+            probs: Object exposing ``numpy()`` with the predicted probabilities.
+            labels: Object exposing ``numpy()`` with the ground-truth labels.
+        """
+        # Convert both inputs first so a failure leaves the state untouched.
+        batch_prob = probs.numpy()
+        batch_true = labels.numpy()
+        if self.y_prob is not None:
+            self.y_prob = np.append(self.y_prob, batch_prob, axis=0)
+        else:
+            self.y_prob = batch_prob
+        if self.y_true is not None:
+            self.y_true = np.append(self.y_true, batch_true, axis=0)
+        else:
+            self.y_true = batch_true
+    def accumulate(self):
+        """
+        Returns micro f1 score and macro f1 score
+        """
+        return self.f1_score(y_prob=self.y_prob)
+    def report(self):
+        """
+        Logs the classification report through the module-level logger.
+        """
+        self.y_pred = self.y_prob > self.threshold
+        logger.info("classification report:\n" + classification_report(self.y_true, self.y_pred, digits=4))
+    def name(self):
+        """
+        Returns metric name
+        """
+        return self._name
--- a/model/classify/views/few_multi_label/FewMultiLabelRunner.py
+++ b/model/classify/views/few_multi_label/FewMultiLabelRunner.py
+import os
+import sys
+import paddle
+import paddle.nn.functional as F
+from collections import defaultdict
+from dataclasses import dataclass, field
+from paddlenlp.prompt import (
+    AutoTemplate,
+    PromptModelForSequenceClassification,
+    PromptTrainer,
+    PromptTuningArguments,
+    SoftVerbalizer,
+)
+from paddlenlp.trainer import EarlyStoppingCallback, PdArgumentParser
+from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer
+from paddlenlp.utils.log import logger
+from model.base import BaseRunner
+from model.classify.views.few_multi_label.FewMultiLabelEvaluator import MetricReport
+from model.classify.views.few_multi_label.FewMultiLabelConfig import FewMultiLabelConfig
+from model.classify.views.few_multi_label.data.FewMultiLabelDataLoader import FewMultiLabelDataLoader
+@dataclass
+class DataArguments:
+    """Dataset-related options parsed from the command line by ``PdArgumentParser``."""
+    # Dataset directory; the runner reads label.txt and the split files from it.
+    data_dir: str = field(
+        default="./data",
+        metadata=dict(help="The dataset dictionary includes train.txt, dev.txt and label.txt files."),
+    )
+    # Prompt text the runner hands to AutoTemplate.create_from.
+    prompt: str = field(
+        default=None,
+        metadata=dict(help="The input prompt for tuning."),
+    )
+@dataclass
+class ModelArguments:
+    """Model-related options parsed from the command line by ``PdArgumentParser``."""
+    # Name or local path passed to AutoModelForMaskedLM / AutoTokenizer.from_pretrained.
+    model_name_or_path: str = field(
+        default="ernie-3.0-base-zh",
+        metadata=dict(help="The build-in pretrained model or the path to local model."),
+    )
+    # Export format handed to trainer.export_model when do_export is set.
+    export_type: str = field(
+        default='paddle',
+        metadata=dict(help="The type to export. Support `paddle` and `onnx`."),
+    )
+class FewMultiLabelRunner(BaseRunner.BaseRunner):
+    """
+    Runner for few-shot multi-label classification using PaddleNLP prompt tuning.
+    The parsed config is converted into command-line style arguments, which are
+    then parsed into dataclasses with ``PdArgumentParser``.
+    """
+    def __init__(self, config_path):
+        """
+        Args:
+            config_path (str): Path to the config file. The dataset is read from
+                the ``data`` directory located next to this file.
+        """
+        super().__init__(config_path)
+        self.config_path = config_path
+        self.config = FewMultiLabelConfig(self.config_path)
+    def _build_cli_args(self):
+        """
+        Translate the parsed config into an argv-style list (without argv[0]).
+        Returns:
+            list: Flags and values in the order embedding, model, runner,
+            fixed training flags, ``--data_dir``.
+        """
+        parsed = self.config._parsed_file
+        cli_args = []
+        for key, value in parsed['embedding'].items():
+            if key == 'name':
+                # The embedding name doubles as the pretrained model identifier.
+                cli_args.extend(['--model_name_or_path', value])
+            elif key not in ('tokenizer_path', 'embedding_path'):
+                cli_args.extend(['--' + key, str(value)])
+        for key, value in parsed['model'].items():
+            if key == 'model_name':
+                # Only the presence of the key matters; its value is not forwarded.
+                cli_args.extend(['--output_dir', './checkpoints/'])
+            elif key != 'model_path':
+                cli_args.extend(['--' + key, str(value)])
+        for key, value in parsed['runner'].items():
+            cli_args.extend(['--' + key, str(value)])
+        cli_args.extend(
+            ['--do_train', '--do_eval', 'true', '--load_best_model_at_end', '--do_predict', 'true', '--logging_steps',
+             '5'])
+        cli_args.extend(['--data_dir', os.path.join(os.path.dirname(self.config_path), 'data')])
+        return cli_args
+    def train(self, logger):
+        """
+        Train, evaluate, predict on the test split and optionally export the model.
+        Args:
+            logger: Object exposing ``info(msg)``. Note that this parameter
+                shadows the module-level ``logger`` import inside the method.
+        """
+        # Parse the arguments.
+        logger.info("Config: {}".format(self.config))
+        argv = [os.path.abspath(__file__)] + self._build_cli_args()
+        logger.info("Arguments: {}".format(argv))
+        parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments))
+        # The parser reads sys.argv; swap it in only for the duration of the
+        # parse so the host process keeps its own command line afterwards.
+        saved_argv = sys.argv
+        sys.argv = argv
+        try:
+            model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+        finally:
+            sys.argv = saved_argv
+        training_args.print_config(model_args, "Model")
+        training_args.print_config(data_args, "Data")
+        paddle.set_device(training_args.device)
+        # Load the pretrained language model.
+        model = AutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+        # Define the template for preprocess and the verbalizer for postprocess.
+        template = AutoTemplate.create_from(data_args.prompt, tokenizer, training_args.max_seq_length, model=model)
+        logger.info("Using template: {}".format(template.prompt))
+        # Each line of label.txt is either "label" or "label==word"; without an
+        # explicit word, the last "##"-separated segment of the label is used.
+        label_file = os.path.join(data_args.data_dir, "label.txt")
+        label_words = defaultdict(list)
+        with open(label_file, "r", encoding="utf-8") as fp:
+            for line in fp:
+                data = line.strip().split("==")
+                word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+                label_words[data[0]].append(word)
+        verbalizer = SoftVerbalizer(label_words, tokenizer, model)
+        # Load the few-shot datasets.
+        train_ds, dev_ds, test_ds = FewMultiLabelDataLoader.load_local_dataset(
+            data_path=data_args.data_dir, splits=["train", "dev", "test"], label_list=verbalizer.labels_to_ids
+        )
+        # Define the criterion.
+        criterion = paddle.nn.BCEWithLogitsLoss()
+        # Initialize the prompt model with the above variables.
+        prompt_model = PromptModelForSequenceClassification(
+            model, template, verbalizer, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout
+        )
+        # Define the metric function.
+        def compute_metrics(eval_preds):
+            metric = MetricReport()
+            # The criterion works on logits, so squash them before thresholding.
+            preds = F.sigmoid(paddle.to_tensor(eval_preds.predictions))
+            metric.update(preds, paddle.to_tensor(eval_preds.label_ids))
+            micro_f1_score, macro_f1_score = metric.accumulate()
+            return {"micro_f1_score": micro_f1_score, "macro_f1_score": macro_f1_score}
+        # Define the early-stopping callback.
+        callbacks = [EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.0)]
+        # Initialize the trainer.
+        trainer = PromptTrainer(
+            model=prompt_model,
+            tokenizer=tokenizer,
+            args=training_args,
+            criterion=criterion,
+            train_dataset=train_ds,
+            eval_dataset=dev_ds,
+            callbacks=callbacks,
+            compute_metrics=compute_metrics,
+        )
+        # Training.
+        if training_args.do_train:
+            train_result = trainer.train(resume_from_checkpoint=None)
+            metrics = train_result.metrics
+            trainer.save_model()
+            trainer.log_metrics("train", metrics)
+            trainer.save_metrics("train", metrics)
+            trainer.save_state()
+        # Prediction.
+        if training_args.do_predict:
+            test_ret = trainer.predict(test_ds)
+            trainer.log_metrics("test", test_ret.metrics)
+        # Export static model.
+        if training_args.do_export:
+            export_path = os.path.join(training_args.output_dir, "export")
+            trainer.export_model(export_path, export_type=model_args.export_type)
--- a/model/classify/views/few_multi_label/__init__.py
+++ b/model/classify/views/few_multi_label/__init__.py
+# -*- coding: utf-8 -*-
+# @Time : 2023/2/9 9:53
+# @Author : ctt
+# @File : __init__.py
+# @Project : platform_zzsn
--- a/model/classify/views/few_multi_label/data/FewMultiLabelDataLoader.py
+++ b/model/classify/views/few_multi_label/data/FewMultiLabelDataLoader.py
+# -*- coding: utf-8 -*-
+# @Time : 2023/2/9 10:01
+# @Author : ctt
+# @File : FewMultiLabelDataLoader
+# @Project : platform_zzsn
+import os
+from paddlenlp.datasets import load_dataset
+class FewMultiLabelDataLoader():
+    """Loads few-shot multi-label datasets from tab-separated text files."""
+    def __init__(self, config_path):
+        """
+        Args:
+            config_path (str): Path to the config file; stored on the instance.
+        """
+        # This class has no base class, so forwarding config_path to
+        # object.__init__ raises TypeError; keep the path on the instance instead.
+        self.config_path = config_path
+    @staticmethod
+    def load_local_dataset(data_path, splits, label_list):
+        """
+        Load dataset for multi-label classification from files, where
+        there is one example per line. Text and label are separated
+        by '\t', and multiple labels are delimited by ','.
+        Args:
+            data_path (str):
+                Path to the dataset directory, including label.txt, train.txt,
+                dev.txt (and data.txt).
+            splits (list):
+                Which file(s) to load, such as ['train', 'dev', 'test'].
+            label_list (dict):
+                The dictionary that maps labels to indices.
+        Returns:
+            list: One dataset per entry of ``splits``, in the same order.
+        """
+        def _reader(data_file, label_list):
+            with open(data_file, "r", encoding="utf-8") as fp:
+                for line in fp:
+                    data = line.strip().split("\t")
+                    if len(data) == 1:
+                        # Unlabeled example: text only.
+                        yield {"text_a": data[0]}
+                    else:
+                        text, label = data
+                        # A set keeps the membership test constant-time per label.
+                        active = set(label.strip().split(","))
+                        # Multi-hot vector following the iteration order of label_list.
+                        multi_hot = [1.0 if name in active else 0.0 for name in label_list]
+                        yield {"text_a": text, "labels": multi_hot}
+        split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
+        datasets = []
+        for split in splits:
+            data_file = os.path.join(data_path, split_map[split])
+            datasets.append(load_dataset(_reader, data_file=data_file, label_list=label_list, lazy=False))
+        return datasets
--- a/model/classify/views/few_multi_label/data/__init__.py
+++ b/model/classify/views/few_multi_label/data/__init__.py
+# -*- coding: utf-8 -*-
+# @Time : 2023/2/9 10:00
+# @Author : ctt
+# @File : __init__.py
+# @Project : platform_zzsn
--- a/static/common/config_data/few_multi_class.yaml
+++ b/static/common/config_data/few_multi_class.yaml
+# Config for the few-shot multi-class runner (FewMultiRunner.train).
+# The runner turns the `embedding`, `model` and `runner` sections into
+# command-line arguments; `data_process` and `evaluate` are not read by it.
+data_process:
+  test_file_path: ''
+  train_file_path: ''
+embedding:
+  # `name` is forwarded as --model_name_or_path.
+  name: 'ernie-3.0-base-zh'
+  # tokenizer_path and embedding_path are skipped by the runner.
+  tokenizer_path: null
+  embedding_path: null
+  # Every other key is forwarded as --<key> <value>.
+  max_seq_length: 128
+model:
+  # The presence of `model_name` adds `--output_dir ./checkpoints/`; its value is not forwarded.
+  model_name: multi_class_model
+  # model_path is skipped by the runner.
+  model_path: ''
+  # Every other key below is forwarded as --<key> <value>.
+  learning_rate: 3e-5
+  ppt_learning_rate: 3e-4
+  num_train_epochs: 10
+  per_device_train_batch_size: 8
+  per_device_eval_batch_size: 32
+  # Matches the "accuracy" key returned by the runner's compute_metrics.
+  metric_for_best_model: accuracy
+  evaluation_strategy: epoch
+  save_strategy: epoch
+  save_total_limit: 1
+  # Passed as --prompt and handed to AutoTemplate.create_from.
+  prompt: "这条新闻写的是"
+evaluate:
+  average: micro
+runner:
+  # Every key in this section is forwarded as --<key> <value>.
+  device: cpu
\ No newline at end of file
--- a/static/common/config_data/few_multi_label.yaml
+++ b/static/common/config_data/few_multi_label.yaml
+# Config for the few-shot multi-label runner (FewMultiLabelRunner.train).
+# The runner turns the `embedding`, `model` and `runner` sections into
+# command-line arguments; `data_process` and `evaluate` are not read by it.
+data_process:
+  test_file_path: ''
+  train_file_path: ''
+embedding:
+  # `name` is forwarded as --model_name_or_path.
+  name: 'ernie-3.0-base-zh'
+  # tokenizer_path and embedding_path are skipped by the runner.
+  tokenizer_path: null
+  embedding_path: null
+  # Every other key is forwarded as --<key> <value>.
+  max_seq_length: 128
+model:
+  # The presence of `model_name` adds `--output_dir ./checkpoints/`; its value is not forwarded.
+  model_name: multi_class_model
+  # model_path is skipped by the runner.
+  model_path: ''
+  # Every other key below is forwarded as --<key> <value>.
+  learning_rate: 3e-5
+  ppt_learning_rate: 3e-4
+  num_train_epochs: 10
+  per_device_train_batch_size: 8
+  per_device_eval_batch_size: 32
+  # Must name a key returned by the multi-label compute_metrics, which reports
+  # only micro_f1_score and macro_f1_score (there is no "accuracy" metric here).
+  metric_for_best_model: micro_f1_score
+  evaluation_strategy: epoch
+  save_strategy: epoch
+  save_total_limit: 1
+  # Passed as --prompt and handed to AutoTemplate.create_from.
+  prompt: "这句话包含的要素有"
+evaluate:
+  average: micro
+runner:
+  # Every key in this section is forwarded as --<key> <value>.
+  device: cpu
\ No newline at end of file