大模型部署框架

894b7158 · XveLingKun · 894b7158 · 894b7158 · 894b7158 · 894b7158
--- a/.dockerignore
+++ b/.dockerignore
+.vscode
+.git
+.github
+.venv
+cache
+data
+docker
+saves
+hf_cache
+ms_cache
+om_cache
+output
+.dockerignore
+.gitattributes
+.gitignore
--- a/.env.local
+++ b/.env.local
+# Note: .env files are not actually supported; this file only lists the available variables for reference
+# api
+API_HOST=
+API_PORT=
+API_KEY=
+API_MODEL_NAME=
+API_VERBOSE=
+FASTAPI_ROOT_PATH=
+MAX_CONCURRENT=
+# general
+DISABLE_VERSION_CHECK=
+FORCE_CHECK_IMPORTS=
+ALLOW_EXTRA_ARGS=
+LLAMAFACTORY_VERBOSITY=
+USE_MODELSCOPE_HUB=
+USE_OPENMIND_HUB=
+USE_RAY=
+RECORD_VRAM=
+# torchrun
+FORCE_TORCHRUN=
+MASTER_ADDR=
+MASTER_PORT=
+NNODES=
+NODE_RANK=
+NPROC_PER_NODE=
+# wandb
+WANDB_DISABLED=
+WANDB_PROJECT=
+WANDB_API_KEY=
+# gradio ui
+GRADIO_SHARE=
+GRADIO_SERVER_NAME=
+GRADIO_SERVER_PORT=
+GRADIO_ROOT_PATH=
+GRADIO_IPV6=
+# setup
+ENABLE_SHORT_CONSOLE=
+# reserved (do not use)
+LLAMABOARD_ENABLED=
+LLAMABOARD_WORKDIR=
--- a/.gitattributes
+++ b/.gitattributes
+# Auto detect text files and perform LF normalization
+* text=auto
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# vscode
+.vscode/
+
+# uv
+uv.lock
+
+# custom .gitignore
+ms_cache/
+hf_cache/
+om_cache/
+cache/
+config/
+saves/
+output/
+wandb/
+swanlog/
+generated_predictions.jsonl
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+    -   id: check-ast
+    -   id: check-added-large-files
+        args: ['--maxkb=25000']
+    -   id: check-merge-conflict
+    -   id: check-yaml
+    -   id: debug-statements
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
+    -   id: no-commit-to-branch
+        args: ['--branch', 'main']
+
+-   repo: https://github.com/asottile/pyupgrade
+    rev: v3.17.0
+    hooks:
+    -   id: pyupgrade
+        args: [--py38-plus]
+
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+    -   id: ruff
+        args: [--fix]
+    -   id: ruff-format
--- a/CITATION.cff
+++ b/CITATION.cff
+cff-version: 1.2.0
+date-released: 2024-03
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Zheng"
+  given-names: "Yaowei"
+- family-names: "Zhang"
+  given-names: "Richong"
+- family-names: "Zhang"
+  given-names: "Junhao"
+- family-names: "Ye"
+  given-names: "Yanhan"
+- family-names: "Luo"
+  given-names: "Zheyan"
+- family-names: "Feng"
+  given-names: "Zhangchi"
+- family-names: "Ma"
+  given-names: "Yongqiang"
+title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
+url: "https://arxiv.org/abs/2403.13372"
+preferred-citation:
+  type: conference-paper
+  conference:
+    name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)"
+  authors:
+    - family-names: "Zheng"
+      given-names: "Yaowei"
+    - family-names: "Zhang"
+      given-names: "Richong"
+    - family-names: "Zhang"
+      given-names: "Junhao"
+    - family-names: "Ye"
+      given-names: "Yanhan"
+    - family-names: "Luo"
+      given-names: "Zheyan"
+    - family-names: "Feng"
+      given-names: "Zhangchi"
+    - family-names: "Ma"
+      given-names: "Yongqiang"
+  title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models"
+  url: "https://arxiv.org/abs/2403.13372"
+  year: 2024
+  publisher: "Association for Computational Linguistics"
+  address: "Bangkok, Thailand"
--- a/LICENSE
+++ b/LICENSE
--- a/MANIFEST.in
+++ b/MANIFEST.in
+include LICENSE requirements.txt
--- a/Makefile
+++ b/Makefile
+# None of the targets below produce a file named after themselves, so mark them all phony.
+.PHONY: build commit license quality style test
+
+# Paths handed to the license checker and to ruff.
+check_dirs := scripts src tests setup.py
+
+# Install the `build` package and build the distribution with it.
+build:
+	pip install build && python -m build
+
+# Install the pre-commit hooks and run them against every file.
+commit:
+	pre-commit install
+	pre-commit run --all-files
+
+# Run the license check script over $(check_dirs).
+license:
+	python3 tests/check_license.py $(check_dirs)
+
+# Lint and verify formatting without modifying any file.
+quality:
+	ruff check $(check_dirs)
+	ruff format --check $(check_dirs)
+
+# Apply ruff's automatic fixes and reformat in place.
+style:
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)
+
+# Run the test suite with CUDA_VISIBLE_DEVICES emptied and WANDB_DISABLED set to true.
+test:
+	CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/
--- a/README.md
+++ b/README.md
--- a/README_zh.md
+++ b/README_zh.md
--- a/assets/benchmark.svg
+++ b/assets/benchmark.svg
--- a/assets/logo.png
+++ b/assets/logo.png
--- a/assets/wechat.jpg
+++ b/assets/wechat.jpg
--- a/assets/wechat_npu.jpg
+++ b/assets/wechat_npu.jpg
--- a/bench.py
+++ b/bench.py
+"""
+速度并发测试
+"""
+import aiohttp
+import asyncio
+import time
+from tqdm import tqdm
+
+
+async def fetch(session, url):
+    """
+    参数:
+        session (aiohttp.ClientSession): 用于请求的会话。
+        url (str): 要发送请求的 URL。
+
+    返回:
+        tuple: 包含完成 token 数量和请求时间。
+    """
+    start_time = time.time()
+
+    # 固定请求的内容
+    json_payload = {
+    "model": "qwen",
+    "messages": [
+        {
+            "role": "system",
+            "content": "你是国资国企数智化分析平台人工智能助手国资小研，是由项目组基于国产开源大语言模型结合国资国企领域知识训练开发的垂直领域大模型。你的任务是针对国资国企研究领域的问题和要求提供适当的答复和支持。"
+        },
+        {
+            "role": "user",
+            "content": " 如何确保农产品质量安全？"
+        }
+    ],
+    "do_sample": True,
+    "temperature":  0.95,
+    "top_p": 0.0,
+    "n": 1,
+    "max_tokens": 8192,
+    "stream": False
+}
+    async with session.post(url, json=json_payload) as response:
+        response_json = await response.json()
+        end_time = time.time()
+        request_time = end_time - start_time
+        completion_tokens = response_json['usage']['completion_tokens']  # 从返回的参数里获取生成的 token 的数量
+        return completion_tokens, request_time
+
+
+async def bound_fetch(sem, session, url, pbar):
+    # 使用信号量 sem 来限制并发请求的数量，确保不会超过最大并发请求数
+    async with sem:
+        result = await fetch(session, url)
+        pbar.update(1)
+        return result
+
+
+async def run(load_url, max_concurrent_requests, total_requests):
+    """
+    通过发送多个并发请求来运行基准测试。
+
+    参数:
+        load_url (str): 要发送请求的URL。
+        max_concurrent_requests (int): 最大并发请求数。
+        total_requests (int): 要发送的总请求数。
+
+    返回:
+        tuple: 包含完成 token 总数列表和响应时间列表。
+    """
+    # 创建 Semaphore 来限制并发请求的数量
+    sem = asyncio.Semaphore(max_concurrent_requests)
+
+    # 创建一个异步的HTTP会话
+    async with aiohttp.ClientSession() as session:
+        tasks = []
+
+        # 创建一个进度条来可视化请求的进度
+        with tqdm(total=total_requests) as pbar:
+            # 循环创建任务，直到达到总请求数
+            for _ in range(total_requests):
+                # 为每个请求创建一个任务，确保它遵守信号量的限制
+                task = asyncio.ensure_future(bound_fetch(sem, session, load_url, pbar))
+                tasks.append(task)  # 将任务添加到任务列表中
+
+            # 等待所有任务完成并收集它们的结果
+            results = await asyncio.gather(*tasks)
+
+        # 计算所有结果中的完成token总数
+        completion_tokens = sum(result[0] for result in results)
+
+        # 从所有结果中提取响应时间
+        response_times = [result[1] for result in results]
+
+        # 返回完成token的总数和响应时间的列表
+        return completion_tokens, response_times
+
+
+if __name__ == '__main__':
+    import sys
+
+    if len(sys.argv) != 3:
+        print("Usage: python bench.py <C> <N>")
+        sys.exit(1)
+
+    C = int(sys.argv[1])  # 最大并发数
+    N = int(sys.argv[2])  # 请求总数
+
+    # vllm 和 ollama 都兼容了 openai 的 api 让测试变得更简单了
+    url = 'http://localhost:8000/v1/chat/completions'
+
+    start_time = time.time()
+    completion_tokens, response_times = asyncio.run(run(url, C, N))
+    end_time = time.time()
+
+    # 计算总时间
+    total_time = end_time - start_time
+    # 计算每个请求的平均时间
+    avg_time_per_request = sum(response_times) / len(response_times)
+    # 计算每秒生成的 token 数量
+    tokens_per_second = completion_tokens / total_time
+
+    print(f'Performance Results:')
+    print(f'  Total requests            : {N}')
+    print(f'  Max concurrent requests   : {C}')
+    print(f'  Total time                : {total_time:.2f} seconds')
+    print(f'  Average time per request  : {avg_time_per_request:.2f} seconds')
+    print(f'  Tokens per second         : {tokens_per_second:.2f}')
--- a/data/README.md
+++ b/data/README.md
--- a/data/README_zh.md
+++ b/data/README_zh.md
--- a/data/alpaca_en_demo.json
+++ b/data/alpaca_en_demo.json
--- a/data/alpaca_zh_demo.json
+++ b/data/alpaca_zh_demo.json
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import datasets
+
+
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
+
+_DESCRIPTION = "BELLE multiturn chat dataset."
+
+_CITATION = """\
+@article{belle2023exploring,
+  title={Exploring the Impact of Instruction Data Scaling on Large Language Models},
+  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
+  journal={arXiv preprint arXiv:2303.14742},
+  year={2023}
+}
+"""
+
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
+_LICENSE = "gpl-3.0"
+_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
+
+
+class BelleMultiturn(datasets.GeneratorBasedBuilder):
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self):
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager):
+        file_path = dl_manager.download(_URL)
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
+
+    def _generate_examples(self, filepath: str):
+        with open(filepath, encoding="utf-8") as f:
+            for key, row in enumerate(f):
+                data = json.loads(row)
+                conversations = []
+                prompt = data["instruction"].strip()
+                response = data["output"].strip()
+
+                assist_idx = prompt.rfind("Assistant:")
+                human_idx = prompt.rfind("Human:")
+                query = prompt[human_idx + 6 : assist_idx].strip()
+                prompt = prompt[:human_idx].strip()
+                conversations.insert(0, {"from": "gpt", "value": response})
+                conversations.insert(0, {"from": "human", "value": query})
+
+                while prompt.rfind("Assistant:") != -1:
+                    assist_idx = prompt.rfind("Assistant:")
+                    human_idx = prompt.rfind("Human:")
+                    if human_idx != -1:
+                        old_query = prompt[human_idx + 6 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 10 :].strip()
+                        conversations.insert(0, {"from": "gpt", "value": old_resp})
+                        conversations.insert(0, {"from": "human", "value": old_query})
+                    else:
+                        break
+                    prompt = prompt[:human_idx].strip()
+
+                yield key, {"conversations": conversations}
--- a/data/c4_demo.json
+++ b/data/c4_demo.json
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
--- a/data/dpo_en_demo.json
+++ b/data/dpo_en_demo.json
--- a/data/dpo_zh_demo.json
+++ b/data/dpo_zh_demo.json
--- a/data/glaive_toolcall_en_demo.json
+++ b/data/glaive_toolcall_en_demo.json
--- a/data/glaive_toolcall_zh_demo.json
+++ b/data/glaive_toolcall_zh_demo.json
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import datasets
+
+
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
+_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
+_CITATION = ""
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
+_LICENSE = "mit"
+_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
+_URLS = {
+    "train": [
+        _URL + "harmless-base/train.jsonl.gz",
+        _URL + "helpful-base/train.jsonl.gz",
+        _URL + "helpful-online/train.jsonl.gz",
+        _URL + "helpful-rejection-sampled/train.jsonl.gz",
+    ],
+    "test": [
+        _URL + "harmless-base/test.jsonl.gz",
+        _URL + "helpful-base/test.jsonl.gz",
+        _URL + "helpful-online/test.jsonl.gz",
+        _URL + "helpful-rejection-sampled/test.jsonl.gz",
+    ],
+}
+
+
+class HhRlhfEn(datasets.GeneratorBasedBuilder):
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self) -> datasets.DatasetInfo:
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "chosen": datasets.Value("string"),
+                "rejected": datasets.Value("string"),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager):
+        file_path = dl_manager.download_and_extract(_URLS)
+        return [
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
+        ]
+
+    def _generate_examples(self, filepaths: list[str]):
+        key = 0
+        for filepath in filepaths:
+            with open(filepath, encoding="utf-8") as f:
+                for row in f:
+                    data = json.loads(row)
+                    chosen = data["chosen"]
+                    rejected = data["rejected"]
+
+                    assist_idx = rejected.rfind("\n\nAssistant: ")
+                    r_reject = rejected[assist_idx + 13 :].strip()
+                    assist_idx = chosen.rfind("\n\nAssistant: ")
+                    r_accept = chosen[assist_idx + 13 :].strip()
+
+                    human_idx = chosen.rfind("\n\nHuman: ")
+                    query = chosen[human_idx + 9 : assist_idx].strip()
+                    prompt = chosen[:human_idx]
+                    history = []
+
+                    while prompt.rfind("\n\nAssistant: ") != -1:
+                        assist_idx = prompt.rfind("\n\nAssistant: ")
+                        human_idx = prompt.rfind("\n\nHuman: ")
+                        if human_idx != -1:
+                            old_query = prompt[human_idx + 9 : assist_idx].strip()
+                            old_resp = prompt[assist_idx + 13 :].strip()
+                            history.insert(0, (old_query, old_resp))
+                        else:
+                            break
+                        prompt = prompt[:human_idx]
+
+                    yield key, {"instruction": query, "chosen": r_accept, "rejected": r_reject, "history": history}
+                    key += 1
--- a/data/identity.json
+++ b/data/identity.json
--- a/data/kto_en_demo.json
+++ b/data/kto_en_demo.json
--- a/data/mllm_audio_demo.json
+++ b/data/mllm_audio_demo.json
+[
+  {
+    "messages": [
+      {
+        "content": "<audio>What's that sound?",
+        "role": "user"
+      },
+      {
+        "content": "It is the sound of glass shattering.",
+        "role": "assistant"
+      }
+    ],
+    "audios": [
+      "mllm_demo_data/1.mp3"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<audio>What can you hear?",
+        "role": "user"
+      },
+      {
+        "content": "A woman is coughing.",
+        "role": "assistant"
+      }
+    ],
+    "audios": [
+      "mllm_demo_data/2.wav"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<audio>What does the person say?",
+        "role": "user"
+      },
+      {
+        "content": "Mister Quiller is the apostle of the middle classes and we are glad to welcome his gospel.",
+        "role": "assistant"
+      }
+    ],
+    "audios": [
+      "mllm_demo_data/3.flac"
+    ]
+  }
+]
--- a/data/mllm_demo.json
+++ b/data/mllm_demo.json
+[
+  {
+    "messages": [
+      {
+        "content": "<image>Who are they?",
+        "role": "user"
+      },
+      {
+        "content": "They're Kane and Gretzka from Bayern Munich.",
+        "role": "assistant"
+      },
+      {
+        "content": "What are they doing?<image>",
+        "role": "user"
+      },
+      {
+        "content": "They are celebrating on the soccer field.",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/1.jpg",
+      "mllm_demo_data/1.jpg"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<image>Who is he?",
+        "role": "user"
+      },
+      {
+        "content": "He's Thomas Muller from Bayern Munich.",
+        "role": "assistant"
+      },
+      {
+        "content": "Why is he on the ground?",
+        "role": "user"
+      },
+      {
+        "content": "Because he's sliding on his knees to celebrate.",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/2.jpg"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<image>Please describe this image",
+        "role": "user"
+      },
+      {
+        "content": "Chinese astronaut Gui Haichao is giving a speech.",
+        "role": "assistant"
+      },
+      {
+        "content": "What has he accomplished?",
+        "role": "user"
+      },
+      {
+        "content": "He was appointed to be a payload specialist on Shenzhou 16 mission in June 2022, thus becoming the first Chinese civilian of Group 3 in space on 30 May 2023. He is responsible for the on-orbit operation of space science experimental payloads.",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/3.jpg"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<image>他们是谁？",
+        "role": "user"
+      },
+      {
+        "content": "他们是拜仁慕尼黑的凯恩和格雷茨卡。",
+        "role": "assistant"
+      },
+      {
+        "content": "他们在做什么？<image>",
+        "role": "user"
+      },
+      {
+        "content": "他们在足球场上庆祝。",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/1.jpg",
+      "mllm_demo_data/1.jpg"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<image>他是谁？",
+        "role": "user"
+      },
+      {
+        "content": "他是来自拜仁慕尼黑的托马斯·穆勒。",
+        "role": "assistant"
+      },
+      {
+        "content": "他为什么在地上？",
+        "role": "user"
+      },
+      {
+        "content": "因为他正在双膝跪地滑行庆祝。",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/2.jpg"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<image>请描述这张图片",
+        "role": "user"
+      },
+      {
+        "content": "中国宇航员桂海潮正在讲话。",
+        "role": "assistant"
+      },
+      {
+        "content": "他取得过哪些成就？",
+        "role": "user"
+      },
+      {
+        "content": "他于2022年6月被任命为神舟十六号任务的有效载荷专家，从而成为2023年5月30日进入太空的首位平民宇航员。他负责在轨操作空间科学实验有效载荷。",
+        "role": "assistant"
+      }
+    ],
+    "images": [
+      "mllm_demo_data/3.jpg"
+    ]
+  }
+]
--- a/data/mllm_demo_data/1.jpg
+++ b/data/mllm_demo_data/1.jpg
--- a/data/mllm_demo_data/1.mp3
+++ b/data/mllm_demo_data/1.mp3
--- a/data/mllm_demo_data/1.mp4
+++ b/data/mllm_demo_data/1.mp4
--- a/data/mllm_demo_data/2.avi
+++ b/data/mllm_demo_data/2.avi
--- a/data/mllm_demo_data/2.jpg
+++ b/data/mllm_demo_data/2.jpg
--- a/data/mllm_demo_data/2.wav
+++ b/data/mllm_demo_data/2.wav
--- a/data/mllm_demo_data/3.flac
+++ b/data/mllm_demo_data/3.flac
--- a/data/mllm_demo_data/3.jpg
+++ b/data/mllm_demo_data/3.jpg
--- a/data/mllm_demo_data/3.mp4
+++ b/data/mllm_demo_data/3.mp4
--- a/data/mllm_video_demo.json
+++ b/data/mllm_video_demo.json
+[
+  {
+    "messages": [
+      {
+        "content": "<video>Why is this video funny?",
+        "role": "user"
+      },
+      {
+        "content": "Because a baby is reading, and he is so cute!",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/1.mp4"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<video>What is she doing?",
+        "role": "user"
+      },
+      {
+        "content": "She is cooking.",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/2.avi"
+    ]
+  },
+  {
+    "messages": [
+      {
+        "content": "<video>What's in the video?",
+        "role": "user"
+      },
+      {
+        "content": "A baby is playing in the living room.",
+        "role": "assistant"
+      }
+    ],
+    "videos": [
+      "mllm_demo_data/3.mp4"
+    ]
+  }
+]
--- a/data/ultra_chat/ultra_chat.py
+++ b/data/ultra_chat/ultra_chat.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import datasets
+
+
+_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
+
+_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
+
+_CITATION = """\
+@misc{UltraChat,
+  author = {Ding, Ning and Chen, Yulin and Xu, Bokai and Hu, Shengding and others},
+  title = {UltraChat: A Large-scale Auto-generated Multi-round Dialogue Data},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\\url{https://github.com/thunlp/ultrachat}},
+}
+"""
+
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
+_LICENSE = "cc-by-nc-4.0"
+_BASE_DATA_URL = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl"
+
+
+class UltraChat(datasets.GeneratorBasedBuilder):
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self):
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager):
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
+
+    def _generate_examples(self, filepaths: list[str]):
+        for filepath in filepaths:
+            with open(filepath, encoding="utf-8") as f:
+                for row in f:
+                    try:
+                        data = json.loads(row)
+                    except Exception:
+                        continue
+                    key: int = data["id"]
+                    content: list[str] = data["data"]
+                    if len(content) % 2 == 1:
+                        content.pop(-1)
+                    if len(content) < 2:
+                        continue
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
+                    yield key, {"conversations": conversations}
--- a/data/wiki_demo.txt
+++ b/data/wiki_demo.txt
--- a/docker/docker-cuda/Dockerfile
+++ b/docker/docker-cuda/Dockerfile
+# By default, use the official NVIDIA image that ships PyTorch 2.3.0
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
+FROM ${BASE_IMAGE}
+
+# Define environments
+ENV MAX_JOBS=4
+ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+# Define installation arguments
+ARG INSTALL_BNB=false
+ARG INSTALL_VLLM=false
+ARG INSTALL_DEEPSPEED=false
+ARG INSTALL_FLASHATTN=false
+ARG INSTALL_LIGER_KERNEL=false
+ARG INSTALL_HQQ=false
+ARG INSTALL_EETQ=false
+ARG PIP_INDEX=https://pypi.org/simple
+ARG HTTP_PROXY=
+
+# Set the working directory
+WORKDIR /app
+
+# Set http proxy
+# NOTE: variables exported inside a RUN step do not persist into later steps,
+# which is why every pip call below passes --proxy explicitly.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        echo "Configuring proxy..."; \
+        export http_proxy=$HTTP_PROXY; \
+        export https_proxy=$HTTP_PROXY; \
+    fi
+
+# Install the requirements
+COPY requirements.txt /app
+RUN pip config set global.index-url "$PIP_INDEX" && \
+    pip config set global.extra-index-url "$PIP_INDEX" && \
+    python -m pip install --upgrade pip && \
+    if [ -n "$HTTP_PROXY" ]; then \
+        python -m pip install --proxy=$HTTP_PROXY -r requirements.txt; \
+    else \
+        python -m pip install -r requirements.txt; \
+    fi
+
+# Copy the rest of the application into the image
+COPY . /app
+
+# Install the LLaMA Factory
+# The comparisons use the POSIX "=" operator: RUN executes under /bin/sh, and
+# a strictly POSIX `test` rejects "==", which would silently skip every extra.
+RUN EXTRA_PACKAGES="metrics"; \
+    if [ "$INSTALL_BNB" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
+    fi; \
+    if [ "$INSTALL_VLLM" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
+    fi; \
+    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
+    fi; \
+    if [ "$INSTALL_LIGER_KERNEL" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},liger-kernel"; \
+    fi; \
+    if [ "$INSTALL_HQQ" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},hqq"; \
+    fi; \
+    if [ "$INSTALL_EETQ" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},eetq"; \
+    fi; \
+    if [ -n "$HTTP_PROXY" ]; then \
+        pip install --proxy=$HTTP_PROXY -e ".[$EXTRA_PACKAGES]"; \
+    else \
+        pip install -e ".[$EXTRA_PACKAGES]"; \
+    fi
+
+# Rebuild flash attention
+RUN pip uninstall -y transformer-engine flash-attn && \
+    if [ "$INSTALL_FLASHATTN" = "true" ]; then \
+        pip uninstall -y ninja && \
+        if [ -n "$HTTP_PROXY" ]; then \
+            pip install --proxy=$HTTP_PROXY ninja && \
+            pip install --proxy=$HTTP_PROXY --no-cache-dir flash-attn --no-build-isolation; \
+        else \
+            pip install ninja && \
+            pip install --no-cache-dir flash-attn --no-build-isolation; \
+        fi; \
+    fi
+
+
+# Unset http proxy
+# NOTE: like the export step above, this only affects the shell of this RUN step.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        unset http_proxy; \
+        unset https_proxy; \
+    fi
+
+# Set up volumes
+VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
+
+# Expose port 7860 for the LLaMA Board
+ENV GRADIO_SERVER_PORT=7860
+EXPOSE 7860
+
+# Expose port 8000 for the API service
+ENV API_PORT=8000
+EXPOSE 8000
--- a/docker/docker-cuda/docker-compose.yml
+++ b/docker/docker-cuda/docker-compose.yml
+services:
+  llamafactory:
+    build:
+      # The dockerfile path is given relative to the build context below.
+      dockerfile: ./docker/docker-cuda/Dockerfile
+      context: ../..
+      # Build arguments forwarded to the Dockerfile. An optional extra is
+      # installed only when its INSTALL_* flag is the string "true".
+      args:
+        INSTALL_BNB: "false"
+        INSTALL_VLLM: "false"
+        INSTALL_DEEPSPEED: "false"
+        INSTALL_FLASHATTN: "false"
+        INSTALL_LIGER_KERNEL: "false"
+        INSTALL_HQQ: "false"
+        INSTALL_EETQ: "false"
+        PIP_INDEX: https://pypi.org/simple
+    container_name: llamafactory
+    # Host directories (relative to this file) bind-mounted into the container.
+    volumes:
+      - ../../hf_cache:/root/.cache/huggingface
+      - ../../ms_cache:/root/.cache/modelscope
+      - ../../om_cache:/root/.cache/openmind
+      - ../../data:/app/data
+      - ../../output:/app/output
+    # Published ports match the ones the Dockerfile exposes:
+    # 7860 (GRADIO_SERVER_PORT) and 8000 (API_PORT).
+    ports:
+      - "7860:7860"
+      - "8000:8000"
+    ipc: host
+    tty: true
+    shm_size: "16gb"
+    stdin_open: true
+    # Start an interactive shell rather than launching a service directly.
+    command: bash
+    deploy:
+      resources:
+        reservations:
+          devices:
+          # Reserve every NVIDIA GPU on the host for this container.
+          - driver: nvidia
+            count: "all"
+            capabilities: [gpu]
+    restart: unless-stopped
--- a/docker/docker-npu/Dockerfile
+++ b/docker/docker-npu/Dockerfile
+# Use the Ubuntu 22.04 image with CANN 8.0.0 (910B, Python 3.10)
+# More versions can be found at https://hub.docker.com/r/ascendai/cann/tags
+# FROM ascendai/cann:8.0.rc1-910-ubuntu22.04-py3.8
+FROM ascendai/cann:8.0.0-910b-ubuntu22.04-py3.10
+# FROM ascendai/cann:8.0.rc1-910-openeuler22.03-py3.8
+# FROM ascendai/cann:8.0.rc1-910b-openeuler22.03-py3.8
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Define installation arguments
+ARG INSTALL_DEEPSPEED=false
+ARG PIP_INDEX=https://pypi.org/simple
+ARG TORCH_INDEX=https://download.pytorch.org/whl/cpu
+ARG HTTP_PROXY=
+
+# Set the working directory
+WORKDIR /app
+
+# Set http proxy
+# NOTE(review): variables exported inside a RUN step are not visible to later
+# instructions (each RUN uses a new shell), so this step effectively only logs
+# that a proxy is configured; the pip commands below pass --proxy explicitly.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        echo "Configuring proxy..."; \
+        export http_proxy=$HTTP_PROXY; \
+        export https_proxy=$HTTP_PROXY; \
+    fi
+
+# Install the requirements
+# requirements.txt is copied on its own before the rest of the source tree
+# (presumably so the dependency layer can be cached independently).
+COPY requirements.txt /app
+# Use PIP_INDEX as the primary index and TORCH_INDEX as an extra index, upgrade
+# pip, then install the requirements (through the proxy when HTTP_PROXY is set).
+RUN pip config set global.index-url "$PIP_INDEX" && \
+    pip config set global.extra-index-url "$TORCH_INDEX" && \
+    python -m pip install --upgrade pip && \
+    if [ -n "$HTTP_PROXY" ]; then \
+        python -m pip install --proxy=$HTTP_PROXY -r requirements.txt; \
+    else \
+        python -m pip install -r requirements.txt; \
+    fi
+
+# Copy the rest of the application into the image
+COPY . /app
+
+# Install the LLaMA Factory
+# Installs the project in editable mode with the "torch-npu" and "metrics"
+# extras, plus "deepspeed" when INSTALL_DEEPSPEED is "true".
+# The test uses the POSIX "=" operator (not the bash-only "==") so that it also
+# works when the RUN shell is a plain POSIX sh such as dash.
+RUN EXTRA_PACKAGES="torch-npu,metrics"; \
+    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
+    fi; \
+    if [ -n "$HTTP_PROXY" ]; then \
+        pip install --proxy="$HTTP_PROXY" -e ".[$EXTRA_PACKAGES]"; \
+    else \
+        pip install -e ".[$EXTRA_PACKAGES]"; \
+    fi
+
+# Unset http proxy
+# NOTE(review): each RUN instruction runs in its own shell, so this `unset`
+# only affects this single build step; it does not change later layers or the
+# runtime environment of the container. Confirm whether it is still needed.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        unset http_proxy; \
+        unset https_proxy; \
+    fi
+
+# Set up volumes
+# NOTE(review): docker-compose.yml also mounts /root/.cache/openmind, which is
+# not declared as a volume here — confirm whether it should be listed too.
+VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
+
+# Expose port 7860 for the LLaMA Board
+# (ENV uses the same key=value form as the ENV at the top of this file; the
+# space-separated form is legacy syntax.)
+ENV GRADIO_SERVER_PORT=7860
+EXPOSE 7860
+
+# Expose port 8000 for the API service
+ENV API_PORT=8000
+EXPOSE 8000
--- a/docker/docker-npu/docker-compose.yml
+++ b/docker/docker-npu/docker-compose.yml
+services:
+  llamafactory:
+    build:
+      # The dockerfile path is given relative to the build context below.
+      dockerfile: ./docker/docker-npu/Dockerfile
+      context: ../..
+      # Build arguments forwarded to the Dockerfile; deepspeed is installed
+      # only when INSTALL_DEEPSPEED is the string "true".
+      args:
+        INSTALL_DEEPSPEED: "false"
+        PIP_INDEX: https://pypi.org/simple
+    container_name: llamafactory
+    volumes:
+      # Host cache/data directories (relative to this file).
+      - ../../hf_cache:/root/.cache/huggingface
+      - ../../ms_cache:/root/.cache/modelscope
+      - ../../om_cache:/root/.cache/openmind
+      - ../../data:/app/data
+      - ../../output:/app/output
+      # Host Ascend paths (dcmi, npu-smi, driver, install info) mounted at the
+      # same locations inside the container.
+      - /usr/local/dcmi:/usr/local/dcmi
+      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
+      - /etc/ascend_install.info:/etc/ascend_install.info
+    # Published ports match the ones the Dockerfile exposes:
+    # 7860 (GRADIO_SERVER_PORT) and 8000 (API_PORT).
+    ports:
+      - "7860:7860"
+      - "8000:8000"
+    ipc: host
+    tty: true
+    shm_size: "16gb"
+    stdin_open: true
+    # Start an interactive shell rather than launching a service directly.
+    command: bash
+    # Host device nodes passed through to the container.
+    # NOTE(review): only /dev/davinci0 is listed — presumably further
+    # /dev/davinciN entries are needed to use more than one NPU; confirm.
+    devices:
+      - /dev/davinci0
+      - /dev/davinci_manager
+      - /dev/devmm_svm
+      - /dev/hisi_hdc
+    restart: unless-stopped
--- a/docker/docker-rocm/Dockerfile
+++ b/docker/docker-rocm/Dockerfile
+FROM hardandheavy/transformers-rocm:2.2.0
+
+# Define environments
+ENV MAX_JOBS=4
+ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+# Define installation arguments
+ARG INSTALL_BNB=false
+ARG INSTALL_VLLM=false
+ARG INSTALL_DEEPSPEED=false
+ARG INSTALL_FLASHATTN=false
+ARG INSTALL_LIGER_KERNEL=false
+ARG INSTALL_HQQ=false
+ARG PIP_INDEX=https://pypi.org/simple
+ARG HTTP_PROXY=
+
+# Set the working directory
+WORKDIR /app
+
+# Set http proxy
+# NOTE(review): variables exported inside a RUN step are not visible to later
+# instructions (each RUN uses a new shell), so this step effectively only logs
+# that a proxy is configured; the pip commands below pass --proxy explicitly.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        echo "Configuring proxy..."; \
+        export http_proxy=$HTTP_PROXY; \
+        export https_proxy=$HTTP_PROXY; \
+    fi
+
+# Install the requirements
+# requirements.txt is copied on its own before the rest of the source tree
+# (presumably so the dependency layer can be cached independently).
+COPY requirements.txt /app
+# Point pip at PIP_INDEX, upgrade pip, then install the requirements (through
+# the proxy when HTTP_PROXY is set).
+# NOTE(review): extra-index-url is set to the same PIP_INDEX value as index-url,
+# which looks redundant — confirm whether a different extra index was intended.
+RUN pip config set global.index-url "$PIP_INDEX" && \
+    pip config set global.extra-index-url "$PIP_INDEX" && \
+    python -m pip install --upgrade pip && \
+    if [ -n "$HTTP_PROXY" ]; then \
+        python -m pip install --proxy=$HTTP_PROXY -r requirements.txt; \
+    else \
+        python -m pip install -r requirements.txt; \
+    fi
+
+# Copy the rest of the application into the image
+COPY . /app
+
+# Install the LLaMA Factory
+# Installs the project in editable mode with the "metrics" extra, plus one
+# extra per INSTALL_* build argument whose value is "true".
+# The tests use the POSIX "=" operator (not the bash-only "==") so that they
+# also work when the RUN shell is a plain POSIX sh such as dash.
+RUN EXTRA_PACKAGES="metrics"; \
+    if [ "$INSTALL_BNB" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
+    fi; \
+    if [ "$INSTALL_VLLM" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
+    fi; \
+    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
+    fi; \
+    if [ "$INSTALL_LIGER_KERNEL" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},liger-kernel"; \
+    fi; \
+    if [ "$INSTALL_HQQ" = "true" ]; then \
+        EXTRA_PACKAGES="${EXTRA_PACKAGES},hqq"; \
+    fi; \
+    if [ -n "$HTTP_PROXY" ]; then \
+        pip install --proxy="$HTTP_PROXY" -e ".[$EXTRA_PACKAGES]"; \
+    else \
+        pip install -e ".[$EXTRA_PACKAGES]"; \
+    fi
+
+# Rebuild flash attention
+# Any preinstalled transformer-engine / flash-attn is always removed; ninja and
+# flash-attn are reinstalled only when INSTALL_FLASHATTN is "true".
+# The test uses the POSIX "=" operator (not the bash-only "==") so that it also
+# works when the RUN shell is a plain POSIX sh such as dash.
+RUN pip uninstall -y transformer-engine flash-attn && \
+    if [ "$INSTALL_FLASHATTN" = "true" ]; then \
+        pip uninstall -y ninja && \
+        if [ -n "$HTTP_PROXY" ]; then \
+            pip install --proxy="$HTTP_PROXY" ninja && \
+            pip install --proxy="$HTTP_PROXY" --no-cache-dir flash-attn --no-build-isolation; \
+        else \
+            pip install ninja && \
+            pip install --no-cache-dir flash-attn --no-build-isolation; \
+        fi; \
+    fi
+
+# Unset http proxy
+# NOTE(review): each RUN instruction runs in its own shell, so this `unset`
+# only affects this single build step; it does not change later layers or the
+# runtime environment of the container. Confirm whether it is still needed.
+RUN if [ -n "$HTTP_PROXY" ]; then \
+        unset http_proxy; \
+        unset https_proxy; \
+    fi
+
+# Set up volumes
+# NOTE(review): docker-compose.yml also mounts /root/.cache/openmind and
+# /app/saves, which are not declared as volumes here — confirm whether they
+# should be listed too.
+VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
+
+# Expose port 7860 for the LLaMA Board
+# (ENV uses the same key=value form as the ENV lines at the top of this file;
+# the space-separated form is legacy syntax.)
+ENV GRADIO_SERVER_PORT=7860
+EXPOSE 7860
+
+# Expose port 8000 for the API service
+ENV API_PORT=8000
+EXPOSE 8000
--- a/docker/docker-rocm/docker-compose.yml
+++ b/docker/docker-rocm/docker-compose.yml
+services:
+  llamafactory:
+    build:
+      # The dockerfile path is given relative to the build context below.
+      dockerfile: ./docker/docker-rocm/Dockerfile
+      context: ../..
+      # Build arguments forwarded to the Dockerfile. An optional extra is
+      # installed only when its INSTALL_* flag is the string "true".
+      args:
+        INSTALL_BNB: "false"
+        INSTALL_VLLM: "false"
+        INSTALL_DEEPSPEED: "false"
+        INSTALL_FLASHATTN: "false"
+        INSTALL_LIGER_KERNEL: "false"
+        INSTALL_HQQ: "false"
+        PIP_INDEX: https://pypi.org/simple
+    container_name: llamafactory
+    # Host directories (relative to this file) bind-mounted into the container.
+    volumes:
+      - ../../hf_cache:/root/.cache/huggingface
+      - ../../ms_cache:/root/.cache/modelscope
+      - ../../om_cache:/root/.cache/openmind
+      - ../../data:/app/data
+      - ../../output:/app/output
+      - ../../saves:/app/saves
+    # Published ports match the ones the Dockerfile exposes:
+    # 7860 (GRADIO_SERVER_PORT) and 8000 (API_PORT).
+    ports:
+      - "7860:7860"
+      - "8000:8000"
+    ipc: host
+    tty: true
+    shm_size: "16gb"
+    stdin_open: true
+    # Start an interactive shell rather than launching a service directly.
+    command: bash
+    # Host device nodes passed through to the container
+    # (presumably the nodes ROCm needs for GPU access — confirm).
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    restart: unless-stopped
--- a/evaluation/ceval/ceval.py
+++ b/evaluation/ceval/ceval.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@article{huang2023ceval,
+  title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
+  author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and others},
+  journal={arXiv preprint arXiv:2305.08322},
+  year={2023}
+}
+"""
+
+_DESCRIPTION = """\
+C-Eval is a comprehensive Chinese evaluation suite for foundation models.
+It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
+"""
+
+_HOMEPAGE = "https://cevalbenchmark.com"
+
+_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
+
+_URL = "ceval.zip"
+
+task_list = [
+    "computer_network",
+    "operating_system",
+    "computer_architecture",
+    "college_programming",
+    "college_physics",
+    "college_chemistry",
+    "advanced_mathematics",
+    "probability_and_statistics",
+    "discrete_mathematics",
+    "electrical_engineer",
+    "metrology_engineer",
+    "high_school_mathematics",
+    "high_school_physics",
+    "high_school_chemistry",
+    "high_school_biology",
+    "middle_school_mathematics",
+    "middle_school_biology",
+    "middle_school_physics",
+    "middle_school_chemistry",
+    "veterinary_medicine",
+    "college_economics",
+    "business_administration",
+    "marxism",
+    "mao_zedong_thought",
+    "education_science",
+    "teacher_qualification",
+    "high_school_politics",
+    "high_school_geography",
+    "middle_school_politics",
+    "middle_school_geography",
+    "modern_chinese_history",
+    "ideological_and_moral_cultivation",
+    "logic",
+    "law",
+    "chinese_language_and_literature",
+    "art_studies",
+    "professional_tour_guide",
+    "legal_professional",
+    "high_school_chinese",
+    "high_school_history",
+    "middle_school_history",
+    "civil_servant",
+    "sports_science",
+    "plant_protection",
+    "basic_medicine",
+    "clinical_medicine",
+    "urban_and_rural_planner",
+    "accountant",
+    "fire_engineer",
+    "environmental_impact_assessment_engineer",
+    "tax_accountant",
+    "physician",
+]
+
+
+class CevalConfig(datasets.BuilderConfig):
+    """Builder config for a single C-Eval task, pinned to dataset version 1.0.0."""
+
+    def __init__(self, **kwargs):
+        # Callers supply everything except the version, which is fixed here.
+        pinned_version = datasets.Version("1.0.0")
+        super().__init__(version=pinned_version, **kwargs)
+
+
+class Ceval(datasets.GeneratorBasedBuilder):
+    """Dataset builder that exposes every entry of ``task_list`` as a C-Eval config."""
+
+    BUILDER_CONFIGS = [CevalConfig(name=task_name) for task_name in task_list]
+
+    def _info(self):
+        """Describe the dataset: an int32 ``id`` column, string columns, and metadata."""
+        string_columns = ("question", "A", "B", "C", "D", "answer", "explanation")
+        schema = {"id": datasets.Value("int32")}
+        schema.update({column: datasets.Value("string") for column in string_columns})
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(schema),
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download and extract the archive, then map each split to its CSV file."""
+        root = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        # (split, CSV path) pairs, in the order the generators are returned.
+        split_files = [
+            (datasets.Split.TEST, os.path.join(root, "test", f"{task_name}_test.csv")),
+            (datasets.Split.VALIDATION, os.path.join(root, "val", f"{task_name}_val.csv")),
+            (datasets.Split.TRAIN, os.path.join(root, "dev", f"{task_name}_dev.csv")),
+        ]
+        return [
+            datasets.SplitGenerator(name=split, gen_kwargs={"filepath": csv_path})
+            for split, csv_path in split_files
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield ``(index, row)`` pairs read from the CSV file at ``filepath``."""
+        rows = pd.read_csv(filepath, encoding="utf-8").to_dict(orient="records")
+        for index, row in enumerate(rows):
+            # Columns absent from the CSV are filled with empty strings.
+            row.setdefault("answer", "")
+            row.setdefault("explanation", "")
+            yield index, row
--- a/evaluation/ceval/ceval.zip
+++ b/evaluation/ceval/ceval.zip
--- a/evaluation/ceval/mapping.json
+++ b/evaluation/ceval/mapping.json
+{
+  "accountant": {
+    "name": "注册会计师",
+    "category": "Other"
+  },
+  "advanced_mathematics": {
+    "name": "高等数学",
+    "category": "STEM"
+  },
+  "art_studies": {
+    "name": "艺术学",
+    "category": "Humanities"
+  },
+  "basic_medicine": {
+    "name": "基础医学",
+    "category": "Other"
+  },
+  "business_administration": {
+    "name": "工商管理",
+    "category": "Social Sciences"
+  },
+  "chinese_language_and_literature": {
+    "name": "中国语言文学",
+    "category": "Humanities"
+  },
+  "civil_servant": {
+    "name": "公务员",
+    "category": "Other"
+  },
+  "clinical_medicine": {
+    "name": "临床医学",
+    "category": "Other"
+  },
+  "college_chemistry": {
+    "name": "大学化学",
+    "category": "STEM"
+  },
+  "college_economics": {
+    "name": "大学经济学",
+    "category": "Social Sciences"
+  },
+  "college_physics": {
+    "name": "大学物理",
+    "category": "STEM"
+  },
+  "college_programming": {
+    "name": "大学编程",
+    "category": "STEM"
+  },
+  "computer_architecture": {
+    "name": "计算机组成",
+    "category": "STEM"
+  },
+  "computer_network": {
+    "name": "计算机网络",
+    "category": "STEM"
+  },
+  "discrete_mathematics": {
+    "name": "离散数学",
+    "category": "STEM"
+  },
+  "education_science": {
+    "name": "教育学",
+    "category": "Social Sciences"
+  },
+  "electrical_engineer": {
+    "name": "注册电气工程师",
+    "category": "STEM"
+  },
+  "environmental_impact_assessment_engineer": {
+    "name": "环境影响评价工程师",
+    "category": "Other"
+  },
+  "fire_engineer": {
+    "name": "注册消防工程师",
+    "category": "Other"
+  },
+  "high_school_biology": {
+    "name": "高中生物",
+    "category": "STEM"
+  },
+  "high_school_chemistry": {
+    "name": "高中化学",
+    "category": "STEM"
+  },
+  "high_school_chinese": {
+    "name": "高中语文",
+    "category": "Humanities"
+  },
+  "high_school_geography": {
+    "name": "高中地理",
+    "category": "Social Sciences"
+  },
+  "high_school_history": {
+    "name": "高中历史",
+    "category": "Humanities"
+  },
+  "high_school_mathematics": {
+    "name": "高中数学",
+    "category": "STEM"
+  },
+  "high_school_physics": {
+    "name": "高中物理",
+    "category": "STEM"
+  },
+  "high_school_politics": {
+    "name": "高中政治",
+    "category": "Social Sciences"
+  },
+  "ideological_and_moral_cultivation": {
+    "name": "思想道德修养与法律基础",
+    "category": "Humanities"
+  },
+  "law": {
+    "name": "法学",
+    "category": "Humanities"
+  },
+  "legal_professional": {
+    "name": "法律职业资格",
+    "category": "Humanities"
+  },
+  "logic": {
+    "name": "逻辑学",
+    "category": "Humanities"
+  },
+  "mao_zedong_thought": {
+    "name": "毛泽东思想和中国特色社会主义理论体系概论",
+    "category": "Social Sciences"
+  },
+  "marxism": {
+    "name": "马克思主义基本原理",
+    "category": "Social Sciences"
+  },
+  "metrology_engineer": {
+    "name": "注册计量师",
+    "category": "STEM"
+  },
+  "middle_school_biology": {
+    "name": "初中生物",
+    "category": "STEM"
+  },
+  "middle_school_chemistry": {
+    "name": "初中化学",
+    "category": "STEM"
+  },
+  "middle_school_geography": {
+    "name": "初中地理",
+    "category": "Social Sciences"
+  },
+  "middle_school_history": {
+    "name": "初中历史",
+    "category": "Humanities"
+  },
+  "middle_school_mathematics": {
+    "name": "初中数学",
+    "category": "STEM"
+  },
+  "middle_school_physics": {
+    "name": "初中物理",
+    "category": "STEM"
+  },
+  "middle_school_politics": {
+    "name": "初中政治",
+    "category": "Social Sciences"
+  },
+  "modern_chinese_history": {
+    "name": "近代史纲要",
+    "category": "Humanities"
+  },
+  "operating_system": {
+    "name": "操作系统",
+    "category": "STEM"
+  },
+  "physician": {
+    "name": "医师资格",
+    "category": "Other"
+  },
+  "plant_protection": {
+    "name": "植物保护",
+    "category": "Other"
+  },
+  "probability_and_statistics": {
+    "name": "概率统计",
+    "category": "STEM"
+  },
+  "professional_tour_guide": {
+    "name": "导游资格",
+    "category": "Humanities"
+  },
+  "sports_science": {
+    "name": "体育学",
+    "category": "Other"
+  },
+  "tax_accountant": {
+    "name": "税务师",
+    "category": "Other"
+  },
+  "teacher_qualification": {
+    "name": "教师资格",
+    "category": "Social Sciences"
+  },
+  "urban_and_rural_planner": {
+    "name": "注册城乡规划师",
+    "category": "Other"
+  },
+  "veterinary_medicine": {
+    "name": "兽医学",
+    "category": "STEM"
+  }
+}
--- a/evaluation/cmmlu/cmmlu.py
+++ b/evaluation/cmmlu/cmmlu.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import datasets
+import pandas as pd
+
+
+# BibTeX entry for the CMMLU paper (the author field is now properly closed).
+_CITATION = """\
+@article{li2023cmmlu,
+  title={CMMLU: Measuring massive multitask language understanding in Chinese},
+  author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and others},
+  journal={arXiv preprint arXiv:2306.09212},
+  year={2023}
+}
+"""
+
+_DESCRIPTION = """\
+CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge
+and reasoning abilities of LLMs within the Chinese language and cultural context.
+"""
+
+_HOMEPAGE = "https://github.com/haonan-li/CMMLU"
+
+_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
+
+_URL = "cmmlu.zip"
+
+task_list = [
+    "agronomy",
+    "anatomy",
+    "ancient_chinese",
+    "arts",
+    "astronomy",
+    "business_ethics",
+    "chinese_civil_service_exam",
+    "chinese_driving_rule",
+    "chinese_food_culture",
+    "chinese_foreign_policy",
+    "chinese_history",
+    "chinese_literature",
+    "chinese_teacher_qualification",
+    "clinical_knowledge",
+    "college_actuarial_science",
+    "college_education",
+    "college_engineering_hydrology",
+    "college_law",
+    "college_mathematics",
+    "college_medical_statistics",
+    "college_medicine",
+    "computer_science",
+    "computer_security",
+    "conceptual_physics",
+    "construction_project_management",
+    "economics",
+    "education",
+    "electrical_engineering",
+    "elementary_chinese",
+    "elementary_commonsense",
+    "elementary_information_and_technology",
+    "elementary_mathematics",
+    "ethnology",
+    "food_science",
+    "genetics",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_geography",
+    "high_school_mathematics",
+    "high_school_physics",
+    "high_school_politics",
+    "human_sexuality",
+    "international_law",
+    "journalism",
+    "jurisprudence",
+    "legal_and_moral_basis",
+    "logical",
+    "machine_learning",
+    "management",
+    "marketing",
+    "marxist_theory",
+    "modern_chinese",
+    "nutrition",
+    "philosophy",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_study",
+    "sociology",
+    "sports_science",
+    "traditional_chinese_medicine",
+    "virology",
+    "world_history",
+    "world_religions",
+]
+
+
+class CMMLUConfig(datasets.BuilderConfig):
+    """Builder config for a single CMMLU task, pinned to dataset version 1.0.1."""
+
+    def __init__(self, **kwargs):
+        # Callers supply everything except the version, which is fixed here.
+        pinned_version = datasets.Version("1.0.1")
+        super().__init__(version=pinned_version, **kwargs)
+
+
+class CMMLU(datasets.GeneratorBasedBuilder):
+    """Dataset builder that exposes every entry of ``task_list`` as a CMMLU config."""
+
+    BUILDER_CONFIGS = [CMMLUConfig(name=task_name) for task_name in task_list]
+
+    def _info(self):
+        """Describe the dataset: six string columns plus descriptive metadata."""
+        string_columns = ("question", "A", "B", "C", "D", "answer")
+        schema = {column: datasets.Value("string") for column in string_columns}
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(schema),
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download and extract the archive, then map each split to its CSV file."""
+        root = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        # (split, CSV path) pairs, in the order the generators are returned.
+        split_files = [
+            (datasets.Split.TEST, os.path.join(root, f"test/{task_name}.csv")),
+            (datasets.Split.TRAIN, os.path.join(root, f"dev/{task_name}.csv")),
+        ]
+        return [
+            datasets.SplitGenerator(name=split, gen_kwargs={"filepath": csv_path})
+            for split, csv_path in split_files
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield ``(index, row)`` pairs with ``Question``/``Answer`` renamed to lower case."""
+        table = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8")
+        for index, row in enumerate(table.to_dict(orient="records")):
+            # A missing source column results in an empty string.
+            row["question"] = row.pop("Question", "")
+            row["answer"] = row.pop("Answer", "")
+            yield index, row
--- a/evaluation/cmmlu/cmmlu.zip
+++ b/evaluation/cmmlu/cmmlu.zip
--- a/evaluation/cmmlu/mapping.json
+++ b/evaluation/cmmlu/mapping.json
+{
+  "agronomy": {
+    "name": "农学",
+    "category": "Other"
+  },
+  "anatomy": {
+    "name": "解剖学",
+    "category": "STEM"
+  },
+  "ancient_chinese": {
+    "name": "古汉语",
+    "category": "Social Sciences"
+  },
+  "arts": {
+    "name": "艺术学",
+    "category": "Humanities"
+  },
+  "astronomy": {
+    "name": "天文学",
+    "category": "STEM"
+  },
+  "business_ethics": {
+    "name": "商业伦理",
+    "category": "Social Sciences"
+  },
+  "chinese_civil_service_exam": {
+    "name": "中国公务员考试",
+    "category": "Social Sciences"
+  },
+  "chinese_driving_rule": {
+    "name": "中国驾驶规则",
+    "category": "Other"
+  },
+  "chinese_food_culture": {
+    "name": "中国饮食文化",
+    "category": "Social Sciences"
+  },
+  "chinese_foreign_policy": {
+    "name": "中国外交政策",
+    "category": "Social Sciences"
+  },
+  "chinese_history": {
+    "name": "中国历史",
+    "category": "Humanities"
+  },
+  "chinese_literature": {
+    "name": "中国文学",
+    "category": "Humanities"
+  },
+  "chinese_teacher_qualification": {
+    "name": "中国教师资格",
+    "category": "Social Sciences"
+  },
+  "college_actuarial_science": {
+    "name": "大学精算学",
+    "category": "STEM"
+  },
+  "college_education": {
+    "name": "大学教育学",
+    "category": "Social Sciences"
+  },
+  "college_engineering_hydrology": {
+    "name": "大学工程水文学",
+    "category": "STEM"
+  },
+  "college_law": {
+    "name": "大学法律",
+    "category": "Humanities"
+  },
+  "college_mathematics": {
+    "name": "大学数学",
+    "category": "STEM"
+  },
+  "college_medical_statistics": {
+    "name": "大学医学统计",
+    "category": "STEM"
+  },
+  "clinical_knowledge": {
+    "name": "临床知识",
+    "category": "Other"
+  },
+  "college_medicine": {
+    "name": "大学医学",
+    "category": "Other"
+  },
+  "computer_science": {
+    "name": "计算机科学",
+    "category": "STEM"
+  },
+  "computer_security": {
+    "name": "计算机安全",
+    "category": "Other"
+  },
+  "conceptual_physics": {
+    "name": "概念物理学",
+    "category": "STEM"
+  },
+  "construction_project_management": {
+    "name": "建设工程管理",
+    "category": "Other"
+  },
+  "economics": {
+    "name": "经济学",
+    "category": "Social Sciences"
+  },
+  "education": {
+    "name": "教育学",
+    "category": "Social Sciences"
+  },
+  "elementary_chinese": {
+    "name": "小学语文",
+    "category": "Social Sciences"
+  },
+  "elementary_commonsense": {
+    "name": "小学常识",
+    "category": "Other"
+  },
+  "elementary_information_and_technology": {
+    "name": "小学信息技术",
+    "category": "Other"
+  },
+  "electrical_engineering": {
+    "name": "电气工程",
+    "category": "STEM"
+  },
+  "elementary_mathematics": {
+    "name": "初等数学",
+    "category": "STEM"
+  },
+  "ethnology": {
+    "name": "民族学",
+    "category": "Social Sciences"
+  },
+  "food_science": {
+    "name": "食品科学",
+    "category": "Other"
+  },
+  "genetics": {
+    "name": "遗传学",
+    "category": "STEM"
+  },
+  "global_facts": {
+    "name": "全球事实",
+    "category": "Humanities"
+  },
+  "high_school_biology": {
+    "name": "高中生物",
+    "category": "STEM"
+  },
+  "high_school_chemistry": {
+    "name": "高中化学",
+    "category": "STEM"
+  },
+  "high_school_geography": {
+    "name": "高中地理",
+    "category": "Social Sciences"
+  },
+  "high_school_mathematics": {
+    "name": "高中数学",
+    "category": "STEM"
+  },
+  "high_school_physics": {
+    "name": "高中物理学",
+    "category": "STEM"
+  },
+  "high_school_politics": {
+    "name": "高中政治",
+    "category": "Social Sciences"
+  },
+  "human_sexuality": {
+    "name": "人类性行为",
+    "category": "Other"
+  },
+  "international_law": {
+    "name": "国际法学",
+    "category": "Humanities"
+  },
+  "journalism": {
+    "name": "新闻学",
+    "category": "Social Sciences"
+  },
+  "jurisprudence": {
+    "name": "法理学",
+    "category": "Humanities"
+  },
+  "legal_and_moral_basis": {
+    "name": "法律与道德基础",
+    "category": "Other"
+  },
+  "logical": {
+    "name": "逻辑学",
+    "category": "Humanities"
+  },
+  "machine_learning": {
+    "name": "机器学习",
+    "category": "STEM"
+  },
+  "management": {
+    "name": "管理学",
+    "category": "Social Sciences"
+  },
+  "marketing": {
+    "name": "市场营销",
+    "category": "Social Sciences"
+  },
+  "marxist_theory": {
+    "name": "马克思主义理论",
+    "category": "Humanities"
+  },
+  "modern_chinese": {
+    "name": "现代汉语",
+    "category": "Social Sciences"
+  },
+  "nutrition": {
+    "name": "营养学",
+    "category": "Other"
+  },
+  "philosophy": {
+    "name": "哲学",
+    "category": "Humanities"
+  },
+  "professional_accounting": {
+    "name": "专业会计",
+    "category": "Social Sciences"
+  },
+  "professional_law": {
+    "name": "专业法学",
+    "category": "Humanities"
+  },
+  "professional_medicine": {
+    "name": "专业医学",
+    "category": "Other"
+  },
+  "professional_psychology": {
+    "name": "专业心理学",
+    "category": "Social Sciences"
+  },
+  "public_relations": {
+    "name": "公共关系",
+    "category": "Social Sciences"
+  },
+  "security_study": {
+    "name": "安全研究",
+    "category": "Social Sciences"
+  },
+  "sociology": {
+    "name": "社会学",
+    "category": "Social Sciences"
+  },
+  "sports_science": {
+    "name": "体育学",
+    "category": "Other"
+  },
+  "traditional_chinese_medicine": {
+    "name": "中医中药",
+    "category": "Other"
+  },
+  "virology": {
+    "name": "病毒学",
+    "category": "STEM"
+  },
+  "world_history": {
+    "name": "世界历史",
+    "category": "Humanities"
+  },
+  "world_religions": {
+    "name": "世界宗教",
+    "category": "Humanities"
+  }
+}
--- a/evaluation/mmlu/mapping.json
+++ b/evaluation/mmlu/mapping.json
+{
+  "abstract_algebra": {
+    "name": "abstract algebra",
+    "category": "STEM"
+  },
+  "anatomy": {
+    "name": "anatomy",
+    "category": "Other"
+  },
+  "astronomy": {
+    "name": "astronomy",
+    "category": "STEM"
+  },
+  "business_ethics": {
+    "name": "business ethics",
+    "category": "Other"
+  },
+  "clinical_knowledge": {
+    "name": "clinical knowledge",
+    "category": "Other"
+  },
+  "college_biology": {
+    "name": "college biology",
+    "category": "STEM"
+  },
+  "college_chemistry": {
+    "name": "college chemistry",
+    "category": "STEM"
+  },
+  "college_computer_science": {
+    "name": "college computer science",
+    "category": "STEM"
+  },
+  "college_mathematics": {
+    "name": "college mathematics",
+    "category": "STEM"
+  },
+  "college_medicine": {
+    "name": "college medicine",
+    "category": "Other"
+  },
+  "college_physics": {
+    "name": "college physics",
+    "category": "STEM"
+  },
+  "computer_security": {
+    "name": "computer security",
+    "category": "STEM"
+  },
+  "conceptual_physics": {
+    "name": "conceptual physics",
+    "category": "STEM"
+  },
+  "econometrics": {
+    "name": "econometrics",
+    "category": "Social Sciences"
+  },
+  "electrical_engineering": {
+    "name": "electrical engineering",
+    "category": "STEM"
+  },
+  "elementary_mathematics": {
+    "name": "elementary mathematics",
+    "category": "STEM"
+  },
+  "formal_logic": {
+    "name": "formal logic",
+    "category": "Humanities"
+  },
+  "global_facts": {
+    "name": "global facts",
+    "category": "Other"
+  },
+  "high_school_biology": {
+    "name": "high school biology",
+    "category": "STEM"
+  },
+  "high_school_chemistry": {
+    "name": "high school chemistry",
+    "category": "STEM"
+  },
+  "high_school_computer_science": {
+    "name": "high school computer science",
+    "category": "STEM"
+  },
+  "high_school_european_history": {
+    "name": "high school european history",
+    "category": "Humanities"
+  },
+  "high_school_geography": {
+    "name": "high school geography",
+    "category": "Social Sciences"
+  },
+  "high_school_government_and_politics": {
+    "name": "high school government and politics",
+    "category": "Social Sciences"
+  },
+  "high_school_macroeconomics": {
+    "name": "high school macroeconomics",
+    "category": "Social Sciences"
+  },
+  "high_school_mathematics": {
+    "name": "high school mathematics",
+    "category": "STEM"
+  },
+  "high_school_microeconomics": {
+    "name": "high school microeconomics",
+    "category": "Social Sciences"
+  },
+  "high_school_physics": {
+    "name": "high school physics",
+    "category": "STEM"
+  },
+  "high_school_psychology": {
+    "name": "high school psychology",
+    "category": "Social Sciences"
+  },
+  "high_school_statistics": {
+    "name": "high school statistics",
+    "category": "STEM"
+  },
+  "high_school_us_history": {
+    "name": "high school us history",
+    "category": "Humanities"
+  },
+  "high_school_world_history": {
+    "name": "high school world history",
+    "category": "Humanities"
+  },
+  "human_aging": {
+    "name": "human aging",
+    "category": "Other"
+  },
+  "human_sexuality": {
+    "name": "human sexuality",
+    "category": "Social Sciences"
+  },
+  "international_law": {
+    "name": "international law",
+    "category": "Humanities"
+  },
+  "jurisprudence": {
+    "name": "jurisprudence",
+    "category": "Humanities"
+  },
+  "logical_fallacies": {
+    "name": "logical fallacies",
+    "category": "Humanities"
+  },
+  "machine_learning": {
+    "name": "machine learning",
+    "category": "STEM"
+  },
+  "management": {
+    "name": "management",
+    "category": "Other"
+  },
+  "marketing": {
+    "name": "marketing",
+    "category": "Other"
+  },
+  "medical_genetics": {
+    "name": "medical genetics",
+    "category": "Other"
+  },
+  "miscellaneous": {
+    "name": "miscellaneous",
+    "category": "Other"
+  },
+  "moral_disputes": {
+    "name": "moral disputes",
+    "category": "Humanities"
+  },
+  "moral_scenarios": {
+    "name": "moral scenarios",
+    "category": "Humanities"
+  },
+  "nutrition": {
+    "name": "nutrition",
+    "category": "Other"
+  },
+  "philosophy": {
+    "name": "philosophy",
+    "category": "Humanities"
+  },
+  "prehistory": {
+    "name": "prehistory",
+    "category": "Humanities"
+  },
+  "professional_accounting": {
+    "name": "professional accounting",
+    "category": "Other"
+  },
+  "professional_law": {
+    "name": "professional law",
+    "category": "Humanities"
+  },
+  "professional_medicine": {
+    "name": "professional medicine",
+    "category": "Other"
+  },
+  "professional_psychology": {
+    "name": "professional psychology",
+    "category": "Social Sciences"
+  },
+  "public_relations": {
+    "name": "public relations",
+    "category": "Social Sciences"
+  },
+  "security_studies": {
+    "name": "security studies",
+    "category": "Social Sciences"
+  },
+  "sociology": {
+    "name": "sociology",
+    "category": "Social Sciences"
+  },
+  "us_foreign_policy": {
+    "name": "us foreign policy",
+    "category": "Social Sciences"
+  },
+  "virology": {
+    "name": "virology",
+    "category": "Other"
+  },
+  "world_religions": {
+    "name": "world religions",
+    "category": "Humanities"
+  }
+}
--- a/evaluation/mmlu/mmlu.py
+++ b/evaluation/mmlu/mmlu.py
+# Copyright 2025 the LlamaFactory team.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import datasets
+import pandas as pd
+
+
+# Static dataset metadata. The four values below are passed unchanged to
+# `datasets.DatasetInfo` in `MMLU._info`.
+
+# BibTeX entry for the MMLU paper.
+_CITATION = """\
+@article{hendryckstest2021,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and others},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+"""
+
+# Human-readable description of the benchmark.
+_DESCRIPTION = """\
+Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart,
+Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
+"""
+
+# Upstream project page of the benchmark.
+_HOMEPAGE = "https://github.com/hendrycks/test"
+
+# License string reported for the dataset.
+_LICENSE = "MIT"
+
+# Archive handed to `dl_manager.download_and_extract` in `MMLU._split_generators`.
+# It is a bare file name, not an absolute URL.
+_URL = "mmlu.zip"
+
+# The 57 MMLU subject identifiers. One `MMLUConfig` is created per entry (see
+# `MMLU.BUILDER_CONFIGS`), and each name is also the stem of that subject's CSV
+# files (`<name>_test.csv`, `<name>_val.csv`, `<name>_dev.csv`).
+# The order here is the order of the builder configurations.
+task_list = [
+    "high_school_european_history",
+    "business_ethics",
+    "clinical_knowledge",
+    "medical_genetics",
+    "high_school_us_history",
+    "high_school_physics",
+    "high_school_world_history",
+    "virology",
+    "high_school_microeconomics",
+    "econometrics",
+    "college_computer_science",
+    "high_school_biology",
+    "abstract_algebra",
+    "professional_accounting",
+    "philosophy",
+    "professional_medicine",
+    "nutrition",
+    "global_facts",
+    "machine_learning",
+    "security_studies",
+    "public_relations",
+    "professional_psychology",
+    "prehistory",
+    "anatomy",
+    "human_sexuality",
+    "college_medicine",
+    "high_school_government_and_politics",
+    "college_chemistry",
+    "logical_fallacies",
+    "high_school_geography",
+    "elementary_mathematics",
+    "human_aging",
+    "college_mathematics",
+    "high_school_psychology",
+    "formal_logic",
+    "high_school_statistics",
+    "international_law",
+    "high_school_mathematics",
+    "high_school_computer_science",
+    "conceptual_physics",
+    "miscellaneous",
+    "high_school_chemistry",
+    "marketing",
+    "professional_law",
+    "management",
+    "college_physics",
+    "jurisprudence",
+    "world_religions",
+    "sociology",
+    "us_foreign_policy",
+    "high_school_macroeconomics",
+    "computer_security",
+    "moral_scenarios",
+    "moral_disputes",
+    "electrical_engineering",
+    "astronomy",
+    "college_biology",
+]
+
+
+class MMLUConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+
+
+class MMLU(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        MMLUConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "question": datasets.Value("string"),
+                "A": datasets.Value("string"),
+                "B": datasets.Value("string"),
+                "C": datasets.Value("string"),
+                "D": datasets.Value("string"),
+                "answer": datasets.Value("string"),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "test", f"{task_name}_test.csv"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "val", f"{task_name}_val.csv"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "dev", f"{task_name}_dev.csv"),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_csv(filepath, header=None)
+        df.columns = ["question", "A", "B", "C", "D", "answer"]
+
+        yield from enumerate(df.to_dict(orient="records"))
--- a/evaluation/mmlu/mmlu.zip
+++ b/evaluation/mmlu/mmlu.zip
--- a/examples/README.md
+++ b/examples/README.md
+We provide diverse examples of fine-tuning LLMs.
+
+Make sure to execute these commands in the `LLaMA-Factory` directory.
+
+## Table of Contents
+
+- [LoRA Fine-Tuning](#lora-fine-tuning)
+- [QLoRA Fine-Tuning](#qlora-fine-tuning)
+- [Full-Parameter Fine-Tuning](#full-parameter-fine-tuning)
+- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
+- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
+- [Extras](#extras)
+
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
+
+By default, LLaMA-Factory uses all visible computing devices.
+
+## Examples
+
+### LoRA Fine-Tuning
+
+#### (Continuous) Pre-Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+```
+
+#### Supervised Fine-Tuning
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
+```
+
+#### DPO/ORPO/SimPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+```
+
+#### Multimodal DPO/ORPO/SimPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
+```
+
+#### Reward Modeling
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
+```
+
+#### PPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+```
+
+#### KTO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+```
+
+#### Preprocess Dataset
+
+This is useful for large datasets. Use `tokenized_path` in the config to load the preprocessed dataset.
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
+```
+
+#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
+
+```bash
+llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+```
+
+#### Supervised Fine-Tuning on Multiple Nodes
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+```
+
+#### Supervised Fine-Tuning with Ray on 4 GPUs
+
+```bash
+USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+```
+
+### QLoRA Fine-Tuning
+
+#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended)
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+```
+
+#### Supervised Fine-Tuning with 4-bit Bitsandbytes Quantization on Ascend NPU
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+```
+
+#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
+```
+
+#### Supervised Fine-Tuning with 4-bit AWQ Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
+```
+
+#### Supervised Fine-Tuning with 2-bit AQLM Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
+```
+
+### Full-Parameter Fine-Tuning
+
+#### Supervised Fine-Tuning on Single Node
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### Supervised Fine-Tuning on Multiple Nodes
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
+```
+
+### Merging LoRA Adapters and Quantization
+
+#### Merge LoRA Adapters
+
+Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### Quantizing Model using AutoGPTQ
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+#### Save Ollama modelfile
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
+```
+
+### Inferring LoRA Fine-Tuned Models
+
+#### Batch Generation using vLLM Tensor Parallel
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### Use CLI ChatBox
+
+```bash
+llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Use Web UI ChatBox
+
+```bash
+llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Launch OpenAI-style API
+
+```bash
+llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### Extras
+
+#### Full-Parameter Fine-Tuning using GaLore
+
+```bash
+llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using APOLLO
+
+```bash
+llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using BAdam
+
+```bash
+llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using Adam-mini
+
+```bash
+llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
+```
+
+#### LoRA+ Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### PiSSA Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
+```
+
+#### Mixture-of-Depths Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro Fine-Tuning
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA Fine-Tuning
+
+```bash
+bash examples/extras/fsdp_qlora/train.sh
+```
+
+#### Computing BLEU and ROUGE Scores
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
+我们提供了多样化的大模型微调示例脚本。
+
+请确保在 `LLaMA-Factory` 目录下执行下述命令。
+
+## 目录
+
+- [LoRA 微调](#lora-微调)
+- [QLoRA 微调](#qlora-微调)
+- [全参数微调](#全参数微调)
+- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
+- [推理 LoRA 模型](#推理-lora-模型)
+- [杂项](#杂项)
+
+使用 `CUDA_VISIBLE_DEVICES`（GPU）或 `ASCEND_RT_VISIBLE_DEVICES`（NPU）选择计算设备。
+
+LLaMA-Factory 默认使用所有可见的计算设备。
+
+## 示例
+
+### LoRA 微调
+
+#### （增量）预训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+```
+
+#### 指令监督微调
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
+```
+
+#### DPO/ORPO/SimPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+```
+
+#### 多模态 DPO/ORPO/SimPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
+```
+
+#### 奖励模型训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
+```
+
+#### PPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+```
+
+#### KTO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+```
+
+#### 预处理数据集
+
+对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
+```
+
+#### 在 MMLU/CMMLU/C-Eval 上评估
+
+```bash
+llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+```
+
+#### 多机指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 DeepSpeed ZeRO-3 平均分配显存
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+```
+
+#### 使用 Ray 在 4 张 GPU 上微调
+
+```bash
+USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+```
+
+### QLoRA 微调
+
+#### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调（推荐）
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+```
+
+#### 在 NPU 上基于 4 比特 Bitsandbytes 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+```
+
+#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
+```
+
+#### 基于 4 比特 AWQ 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
+```
+
+#### 基于 2 比特 AQLM 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
+```
+
+### 全参数微调
+
+#### 在单机上进行指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### 在多机上进行指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
+```
+
+### 合并 LoRA 适配器与模型量化
+
+#### 合并 LoRA 适配器
+
+注：请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 AutoGPTQ 量化模型
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+#### 保存 Ollama 配置文件
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
+```
+
+### 推理 LoRA 模型
+
+#### 使用 vLLM+TP 批量推理
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### 使用命令行对话框
+
+```bash
+llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 使用浏览器对话框
+
+```bash
+llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 启动 OpenAI 风格 API
+
+```bash
+llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### 杂项
+
+#### 使用 GaLore 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### 使用 APOLLO 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
+```
+
+#### 使用 BAdam 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### 使用 Adam-mini 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
+```
+
+#### LoRA+ 微调
+
+```bash
+llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### PiSSA 微调
+
+```bash
+llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
+```
+
+#### 深度混合微调
+
+```bash
+llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro 微调
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA 微调
+
+```bash
+bash examples/extras/fsdp_qlora/train.sh
+```
+
+#### 计算 BLEU 和 ROUGE 分数
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/examples/accelerate/fsdp_config.yaml
+++ b/examples/accelerate/fsdp_config.yaml
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_forward_prefetch: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: true # offload may affect training speed
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16 # or fp16
+num_machines: 1 # the number of nodes
+num_processes: 2 # the number of GPUs in all nodes
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/deepspeed/ds_z0_config.json
+++ b/examples/deepspeed/ds_z0_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/examples/deepspeed/ds_z2_config.json
+++ b/examples/deepspeed/ds_z2_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/examples/deepspeed/ds_z2_offload_config.json
+++ b/examples/deepspeed/ds_z2_offload_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/examples/deepspeed/ds_z3_config.json
+++ b/examples/deepspeed/ds_z3_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": false,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  }
+}
--- a/examples/deepspeed/ds_z3_offload_config.json
+++ b/examples/deepspeed/ds_z3_offload_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": false,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  }
+}
--- a/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/examples/extras/adam_mini/qwen2_full_sft.yaml
+### model
+model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_adam_mini: true
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: qwen
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/qwen2-1_5b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_apollo: true
+apollo_layerwise: true  # choices: [true, false], use false for DDP training
+apollo_target: all
+apollo_rank: 128
+apollo_scale: 32.0
+apollo_scale_type: channel
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1  # use 1 for layerwise apollo
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_mode: layer
+badam_switch_mode: ascending
+badam_switch_interval: 50
+badam_verbose: 2
+# deepspeed: examples/deepspeed/ds_z3_config.json
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/fsdp_qlora/train.sh
+++ b/examples/extras/fsdp_qlora/train.sh
+#!/bin/bash
+# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
+
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
+    --config_file examples/accelerate/fsdp_config.yaml \
+    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_galore: true
+galore_layerwise: true  # choices: [true, false], use false for DDP training
+galore_target: all
+galore_rank: 128
+galore_scale: 2.0
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1  # use 1 for layerwise galore
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/llama_pro/expand.sh
+++ b/examples/extras/llama_pro/expand.sh
+#!/bin/bash
+
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-pro \
+    --num_expand 8
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
+### model
+model_name_or_path: models/llama3-8b-pro
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: freeze
+freeze_trainable_layers: 8
+freeze_trainable_modules: all
+use_llama_pro: true
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b-pro/freeze/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+loraplus_lr_ratio: 16.0
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+mixture_of_depths: convert
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b-mod/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+optim: paged_adamw_8bit
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/nlg_eval/llama3_lora_predict.yaml
+++ b/examples/extras/nlg_eval/llama3_lora_predict.yaml
+# Batch generation can be SLOW with this config.
+# For faster inference, we recommend using `scripts/vllm_infer.py`.
+
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+trust_remote_code: true
+
+### method
+stage: sft
+do_predict: true
+finetuning_type: lora
+
+### dataset
+eval_dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/predict
+overwrite_output_dir: true
+
+### eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
+ddp_timeout: 180000000
--- a/examples/extras/pissa/init.sh
+++ b/examples/extras/pissa/init.sh
+#!/bin/bash
+
+python scripts/pissa_init.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-pissa
--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+pissa_init: true
+pissa_iter: 16
+pissa_convert: true
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/inference/llama3.yaml
+++ b/examples/inference/llama3.yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+trust_remote_code: true
--- a/examples/inference/llama3_full_sft.yaml
+++ b/examples/inference/llama3_full_sft.yaml
+model_name_or_path: saves/llama3-8b/full/sft
+template: llama3
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+trust_remote_code: true
--- a/examples/inference/llama3_lora_sft.yaml
+++ b/examples/inference/llama3_lora_sft.yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+trust_remote_code: true
--- a/examples/inference/llama3_sglang.yaml
+++ b/examples/inference/llama3_sglang.yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: sglang
+trust_remote_code: true
--- a/examples/inference/llama3_vllm.yaml
+++ b/examples/inference/llama3_vllm.yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: vllm
+vllm_enforce_eager: true
+trust_remote_code: true
--- a/examples/inference/llava1_5.yaml
+++ b/examples/inference/llava1_5.yaml
+model_name_or_path: llava-hf/llava-1.5-7b-hf
+template: llava
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+trust_remote_code: true
--- a/examples/inference/qwen2_vl.yaml
+++ b/examples/inference/qwen2_vl.yaml
+model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+template: qwen2_vl
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang]
+trust_remote_code: true
--- a/examples/merge_lora/llama3_full_sft.yaml
+++ b/examples/merge_lora/llama3_full_sft.yaml
+### model
+model_name_or_path: saves/llama3-8b/full/sft
+template: llama3
+trust_remote_code: true
+
+### export
+export_dir: output/llama3_full_sft
+export_size: 5
+export_device: cpu
+export_legacy_format: false
--- a/examples/merge_lora/llama3_gptq.yaml
+++ b/examples/merge_lora/llama3_gptq.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+trust_remote_code: true
+
+### export
+export_dir: output/llama3_gptq
+export_quantization_bit: 4
+export_quantization_dataset: data/c4_demo.json
+export_size: 5
+export_device: cpu
+export_legacy_format: false
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
+### Note: DO NOT use a quantized model or set quantization_bit when merging LoRA adapters
+
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+trust_remote_code: true
+
+### export
+export_dir: output/llama3_lora_sft
+export_size: 5
+export_device: cpu
+export_legacy_format: false
--- a/examples/merge_lora/qwen2vl_lora_sft.yaml
+++ b/examples/merge_lora/qwen2vl_lora_sft.yaml
+### Note: DO NOT use a quantized model or set quantization_bit when merging LoRA adapters
+
+### model
+model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+adapter_name_or_path: saves/qwen2_vl-7b/lora/sft
+template: qwen2_vl
+trust_remote_code: true
+
+### export
+export_dir: output/qwen2_vl_lora_sft
+export_size: 5
+export_device: cpu
+export_legacy_format: false
--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: alpaca_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_full/qwen2vl_full_sft.yaml
+++ b/examples/train_full/qwen2vl_full_sft.yaml
+### model
+model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+freeze_vision_tower: true  # choices: [true, false]
+freeze_multi_modal_projector: true  # choices: [true, false]
+freeze_language_model: false  # choices: [true, false]
+deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
+
+### dataset
+dataset: mllm_demo,identity,alpaca_en_demo
+template: qwen2_vl
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/qwen2_vl-7b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: dpo
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+pref_beta: 0.1
+pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
+
+### dataset
+dataset: dpo_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/lora/dpo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 5.0e-6
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: dpo_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_eval.yaml
+++ b/examples/train_lora/llama3_lora_eval.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+trust_remote_code: true
+
+### method
+finetuning_type: lora
+
+### dataset
+task: mmlu_test  # choices: [mmlu_test, ceval_validation, cmmlu_test]
+template: fewshot
+lang: en
+n_shot: 5
+
+### output
+save_dir: saves/llama3-8b/lora/eval
+
+### eval
+batch_size: 4
--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/llama3_lora_kto.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: kto
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+pref_beta: 0.1
+
+### dataset
+dataset: kto_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/kto
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 5.0e-6
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ b/examples/train_lora/llama3_lora_ppo.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+reward_model: saves/llama3-8b/lora/reward
+trust_remote_code: true
+
+### method
+stage: ppo
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/ppo
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### generate
+max_new_tokens: 512
+top_k: 0
+top_p: 0.9
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: pt
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: c4_demo
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/lora/pretrain
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: c4_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: rm
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: dpo_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/lora/reward
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: dpo_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: alpaca_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds3.yaml
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/llama3_lora_sft_ray.yaml
--- a/examples/train_lora/llama3_preprocess.yaml
+++ b/examples/train_lora/llama3_preprocess.yaml
--- a/examples/train_lora/llava1_5_lora_sft.yaml
+++ b/examples/train_lora/llava1_5_lora_sft.yaml
--- a/examples/train_lora/qwen2vl_lora_dpo.yaml
+++ b/examples/train_lora/qwen2vl_lora_dpo.yaml
--- a/examples/train_lora/qwen2vl_lora_sft.yaml
+++ b/examples/train_lora/qwen2vl_lora_sft.yaml
--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
--- a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+++ b/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
--- a/inference.yaml
+++ b/inference.yaml
--- a/pyproject.toml
+++ b/pyproject.toml
--- a/requirements.txt
+++ b/requirements.txt
--- a/scripts/api_example/test_image.py
+++ b/scripts/api_example/test_image.py
--- a/scripts/api_example/test_toolcall.py
+++ b/scripts/api_example/test_toolcall.py
--- a/scripts/convert_ckpt/llamafy_baichuan2.py
+++ b/scripts/convert_ckpt/llamafy_baichuan2.py
--- a/scripts/convert_ckpt/llamafy_qwen.py
+++ b/scripts/convert_ckpt/llamafy_qwen.py
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
--- a/scripts/loftq_init.py
+++ b/scripts/loftq_init.py
--- a/scripts/pissa_init.py
+++ b/scripts/pissa_init.py
--- a/scripts/stat_utils/cal_flops.py
+++ b/scripts/stat_utils/cal_flops.py
--- a/scripts/stat_utils/cal_lr.py
+++ b/scripts/stat_utils/cal_lr.py
--- a/scripts/stat_utils/cal_mfu.py
+++ b/scripts/stat_utils/cal_mfu.py
--- a/scripts/stat_utils/cal_ppl.py
+++ b/scripts/stat_utils/cal_ppl.py
--- a/scripts/stat_utils/length_cdf.py
+++ b/scripts/stat_utils/length_cdf.py
--- a/scripts/vllm_infer.py
+++ b/scripts/vllm_infer.py
--- a/setup.py
+++ b/setup.py
--- a/speed.bash
+++ b/speed.bash
--- a/speed_test.py
+++ b/speed_test.py
--- a/speed_test_time.py
+++ b/speed_test_time.py
--- a/src/api.py
+++ b/src/api.py
--- a/src/llamafactory/__init__.py
+++ b/src/llamafactory/__init__.py
--- a/src/llamafactory/api/__init__.py
+++ b/src/llamafactory/api/__init__.py
--- a/src/llamafactory/api/app.py
+++ b/src/llamafactory/api/app.py
--- a/src/llamafactory/api/chat.py
+++ b/src/llamafactory/api/chat.py
--- a/src/llamafactory/api/common.py
+++ b/src/llamafactory/api/common.py
--- a/src/llamafactory/api/protocol.py
+++ b/src/llamafactory/api/protocol.py
--- a/src/llamafactory/chat/__init__.py
+++ b/src/llamafactory/chat/__init__.py
--- a/src/llamafactory/chat/base_engine.py
+++ b/src/llamafactory/chat/base_engine.py
--- a/src/llamafactory/chat/chat_model.py
+++ b/src/llamafactory/chat/chat_model.py
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
--- a/src/llamafactory/chat/sglang_engine.py
+++ b/src/llamafactory/chat/sglang_engine.py
--- a/src/llamafactory/chat/vllm_engine.py
+++ b/src/llamafactory/chat/vllm_engine.py
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
--- a/src/llamafactory/data/__init__.py
+++ b/src/llamafactory/data/__init__.py
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
--- a/src/llamafactory/data/converter.py
+++ b/src/llamafactory/data/converter.py
--- a/src/llamafactory/data/data_utils.py
+++ b/src/llamafactory/data/data_utils.py
--- a/src/llamafactory/data/formatter.py
+++ b/src/llamafactory/data/formatter.py
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
--- a/src/llamafactory/data/processor/__init__.py
+++ b/src/llamafactory/data/processor/__init__.py
--- a/src/llamafactory/data/processor/feedback.py
+++ b/src/llamafactory/data/processor/feedback.py
--- a/src/llamafactory/data/processor/pairwise.py
+++ b/src/llamafactory/data/processor/pairwise.py
--- a/src/llamafactory/data/processor/pretrain.py
+++ b/src/llamafactory/data/processor/pretrain.py
--- a/src/llamafactory/data/processor/processor_utils.py
+++ b/src/llamafactory/data/processor/processor_utils.py
--- a/src/llamafactory/data/processor/supervised.py
+++ b/src/llamafactory/data/processor/supervised.py
--- a/src/llamafactory/data/processor/unsupervised.py
+++ b/src/llamafactory/data/processor/unsupervised.py
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
--- a/src/llamafactory/data/tool_utils.py
+++ b/src/llamafactory/data/tool_utils.py
--- a/src/llamafactory/eval/__init__.py
+++ b/src/llamafactory/eval/__init__.py
--- a/src/llamafactory/eval/evaluator.py
+++ b/src/llamafactory/eval/evaluator.py
--- a/src/llamafactory/eval/template.py
+++ b/src/llamafactory/eval/template.py
--- a/src/llamafactory/extras/__init__.py
+++ b/src/llamafactory/extras/__init__.py
--- a/src/llamafactory/extras/constants.py
+++ b/src/llamafactory/extras/constants.py
--- a/src/llamafactory/extras/env.py
+++ b/src/llamafactory/extras/env.py
--- a/src/llamafactory/extras/logging.py
+++ b/src/llamafactory/extras/logging.py
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
--- a/src/llamafactory/extras/packages.py
+++ b/src/llamafactory/extras/packages.py
--- a/src/llamafactory/extras/ploting.py
+++ b/src/llamafactory/extras/ploting.py
--- a/src/llamafactory/hparams/__init__.py
+++ b/src/llamafactory/hparams/__init__.py
--- a/src/llamafactory/hparams/data_args.py
+++ b/src/llamafactory/hparams/data_args.py
--- a/src/llamafactory/hparams/evaluation_args.py
+++ b/src/llamafactory/hparams/evaluation_args.py
--- a/src/llamafactory/hparams/finetuning_args.py
+++ b/src/llamafactory/hparams/finetuning_args.py
--- a/src/llamafactory/hparams/generating_args.py
+++ b/src/llamafactory/hparams/generating_args.py
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
--- a/src/llamafactory/hparams/training_args.py
+++ b/src/llamafactory/hparams/training_args.py
--- a/src/llamafactory/launcher.py
+++ b/src/llamafactory/launcher.py
--- a/src/llamafactory/model/__init__.py
+++ b/src/llamafactory/model/__init__.py
--- a/src/llamafactory/model/adapter.py
+++ b/src/llamafactory/model/adapter.py
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
--- a/src/llamafactory/model/model_utils/__init__.py
+++ b/src/llamafactory/model/model_utils/__init__.py
--- a/src/llamafactory/model/model_utils/attention.py
+++ b/src/llamafactory/model/model_utils/attention.py
--- a/src/llamafactory/model/model_utils/checkpointing.py
+++ b/src/llamafactory/model/model_utils/checkpointing.py
--- a/src/llamafactory/model/model_utils/embedding.py
+++ b/src/llamafactory/model/model_utils/embedding.py
--- a/src/llamafactory/model/model_utils/liger_kernel.py
+++ b/src/llamafactory/model/model_utils/liger_kernel.py
--- a/src/llamafactory/model/model_utils/longlora.py
+++ b/src/llamafactory/model/model_utils/longlora.py
--- a/src/llamafactory/model/model_utils/misc.py
+++ b/src/llamafactory/model/model_utils/misc.py
--- a/src/llamafactory/model/model_utils/mod.py
+++ b/src/llamafactory/model/model_utils/mod.py
--- a/src/llamafactory/model/model_utils/moe.py
+++ b/src/llamafactory/model/model_utils/moe.py
--- a/src/llamafactory/model/model_utils/packing.py
+++ b/src/llamafactory/model/model_utils/packing.py
--- a/src/llamafactory/model/model_utils/quantization.py
+++ b/src/llamafactory/model/model_utils/quantization.py
--- a/src/llamafactory/model/model_utils/rope.py
+++ b/src/llamafactory/model/model_utils/rope.py
--- a/src/llamafactory/model/model_utils/unsloth.py
+++ b/src/llamafactory/model/model_utils/unsloth.py
--- a/src/llamafactory/model/model_utils/valuehead.py
+++ b/src/llamafactory/model/model_utils/valuehead.py
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
--- a/src/llamafactory/model/patcher.py
+++ b/src/llamafactory/model/patcher.py
--- a/src/llamafactory/train/__init__.py
+++ b/src/llamafactory/train/__init__.py
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
--- a/src/llamafactory/train/dpo/__init__.py
+++ b/src/llamafactory/train/dpo/__init__.py
--- a/src/llamafactory/train/dpo/trainer.py
+++ b/src/llamafactory/train/dpo/trainer.py
--- a/src/llamafactory/train/dpo/workflow.py
+++ b/src/llamafactory/train/dpo/workflow.py
--- a/src/llamafactory/train/kto/__init__.py
+++ b/src/llamafactory/train/kto/__init__.py
--- a/src/llamafactory/train/kto/trainer.py
+++ b/src/llamafactory/train/kto/trainer.py
--- a/src/llamafactory/train/kto/workflow.py
+++ b/src/llamafactory/train/kto/workflow.py
--- a/src/llamafactory/train/ppo/__init__.py
+++ b/src/llamafactory/train/ppo/__init__.py
--- a/src/llamafactory/train/ppo/ppo_utils.py
+++ b/src/llamafactory/train/ppo/ppo_utils.py
--- a/src/llamafactory/train/ppo/trainer.py
+++ b/src/llamafactory/train/ppo/trainer.py
--- a/src/llamafactory/train/ppo/workflow.py
+++ b/src/llamafactory/train/ppo/workflow.py
--- a/src/llamafactory/train/pt/__init__.py
+++ b/src/llamafactory/train/pt/__init__.py
--- a/src/llamafactory/train/pt/trainer.py
+++ b/src/llamafactory/train/pt/trainer.py
--- a/src/llamafactory/train/pt/workflow.py
+++ b/src/llamafactory/train/pt/workflow.py
--- a/src/llamafactory/train/rm/__init__.py
+++ b/src/llamafactory/train/rm/__init__.py
--- a/src/llamafactory/train/rm/metric.py
+++ b/src/llamafactory/train/rm/metric.py
--- a/src/llamafactory/train/rm/trainer.py
+++ b/src/llamafactory/train/rm/trainer.py
--- a/src/llamafactory/train/rm/workflow.py
+++ b/src/llamafactory/train/rm/workflow.py
--- a/src/llamafactory/train/sft/__init__.py
+++ b/src/llamafactory/train/sft/__init__.py
--- a/src/llamafactory/train/sft/metric.py
+++ b/src/llamafactory/train/sft/metric.py
--- a/src/llamafactory/train/sft/trainer.py
+++ b/src/llamafactory/train/sft/trainer.py
--- a/src/llamafactory/train/sft/workflow.py
+++ b/src/llamafactory/train/sft/workflow.py
--- a/src/llamafactory/train/test_utils.py
+++ b/src/llamafactory/train/test_utils.py
--- a/src/llamafactory/train/trainer_utils.py
+++ b/src/llamafactory/train/trainer_utils.py
--- a/src/llamafactory/train/tuner.py
+++ b/src/llamafactory/train/tuner.py
--- a/src/llamafactory/webui/__init__.py
+++ b/src/llamafactory/webui/__init__.py
--- a/src/llamafactory/webui/chatter.py
+++ b/src/llamafactory/webui/chatter.py
--- a/src/llamafactory/webui/common.py
+++ b/src/llamafactory/webui/common.py
--- a/src/llamafactory/webui/components/__init__.py
+++ b/src/llamafactory/webui/components/__init__.py
--- a/src/llamafactory/webui/components/chatbot.py
+++ b/src/llamafactory/webui/components/chatbot.py
--- a/src/llamafactory/webui/components/data.py
+++ b/src/llamafactory/webui/components/data.py
--- a/src/llamafactory/webui/components/eval.py
+++ b/src/llamafactory/webui/components/eval.py
--- a/src/llamafactory/webui/components/export.py
+++ b/src/llamafactory/webui/components/export.py
--- a/src/llamafactory/webui/components/infer.py
+++ b/src/llamafactory/webui/components/infer.py
--- a/src/llamafactory/webui/components/top.py
+++ b/src/llamafactory/webui/components/top.py
--- a/src/llamafactory/webui/components/train.py
+++ b/src/llamafactory/webui/components/train.py
--- a/src/llamafactory/webui/control.py
+++ b/src/llamafactory/webui/control.py
--- a/src/llamafactory/webui/css.py
+++ b/src/llamafactory/webui/css.py
--- a/src/llamafactory/webui/engine.py
+++ b/src/llamafactory/webui/engine.py
--- a/src/llamafactory/webui/interface.py
+++ b/src/llamafactory/webui/interface.py
--- a/src/llamafactory/webui/locales.py
+++ b/src/llamafactory/webui/locales.py
--- a/src/llamafactory/webui/manager.py
+++ b/src/llamafactory/webui/manager.py
--- a/src/llamafactory/webui/runner.py
+++ b/src/llamafactory/webui/runner.py
--- a/src/train.py
+++ b/src/train.py
--- a/src/webui.py
+++ b/src/webui.py
--- a/tests/check_license.py
+++ b/tests/check_license.py
--- a/tests/data/processor/test_feedback.py
+++ b/tests/data/processor/test_feedback.py
--- a/tests/data/processor/test_pairwise.py
+++ b/tests/data/processor/test_pairwise.py
--- a/tests/data/processor/test_processor_utils.py
+++ b/tests/data/processor/test_processor_utils.py
--- a/tests/data/processor/test_supervised.py
+++ b/tests/data/processor/test_supervised.py
--- a/tests/data/processor/test_unsupervised.py
+++ b/tests/data/processor/test_unsupervised.py
--- a/tests/data/test_collator.py
+++ b/tests/data/test_collator.py
--- a/tests/data/test_converter.py
+++ b/tests/data/test_converter.py
--- a/tests/data/test_formatter.py
+++ b/tests/data/test_formatter.py
--- a/tests/data/test_loader.py
+++ b/tests/data/test_loader.py
--- a/tests/data/test_mm_plugin.py
+++ b/tests/data/test_mm_plugin.py
--- a/tests/data/test_template.py
+++ b/tests/data/test_template.py
--- a/tests/e2e/test_chat.py
+++ b/tests/e2e/test_chat.py
--- a/tests/e2e/test_sglang.py
+++ b/tests/e2e/test_sglang.py
--- a/tests/e2e/test_train.py
+++ b/tests/e2e/test_train.py
--- a/tests/eval/test_eval_template.py
+++ b/tests/eval/test_eval_template.py
--- a/tests/model/model_utils/test_attention.py
+++ b/tests/model/model_utils/test_attention.py
--- a/tests/model/model_utils/test_checkpointing.py
+++ b/tests/model/model_utils/test_checkpointing.py
--- a/tests/model/model_utils/test_misc.py
+++ b/tests/model/model_utils/test_misc.py
--- a/tests/model/model_utils/test_packing.py
+++ b/tests/model/model_utils/test_packing.py
--- a/tests/model/model_utils/test_visual.py
+++ b/tests/model/model_utils/test_visual.py
--- a/tests/model/test_base.py
+++ b/tests/model/test_base.py
--- a/tests/model/test_freeze.py
+++ b/tests/model/test_freeze.py
--- a/tests/model/test_full.py
+++ b/tests/model/test_full.py
--- a/tests/model/test_lora.py
+++ b/tests/model/test_lora.py
--- a/tests/model/test_pissa.py
+++ b/tests/model/test_pissa.py
--- a/tests/train/test_sft_trainer.py
+++ b/tests/train/test_sft_trainer.py