Related issue: [[Bug] Garbled text output likes ה during inference when fast_inference=True(VLLM) is enabled using Qwen3-1.7B · Issue #3320 · unslothai/unsloth](https://github.com/unslothai/unsloth/issues/3320)

Generation was slow at first, so I tried accelerating it with vLLM by enabling UNSLOTH_VLLM_STANDBY and fast_inference. In practice this gave no speedup, because the ordinary generate() path never actually goes through vLLM; you have to call fast_generate instead.
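For reference, the vLLM path is only exercised through fast_generate. A minimal sketch, assuming a model loaded with fast_inference=True as in the script below (SamplingParams is vLLM's own API; the exact fast_generate signature may differ across unsloth versions):

from vllm import SamplingParams

prompts = ["Solve: 1 + 1 = ?"]  # any plain-text prompts
sampling_params = SamplingParams(temperature=0.0, max_tokens=1024)
# fast_generate routes through the in-process vLLM engine;
# plain model.generate() does not, hence no speedup from enabling it alone.
outputs = model.fast_generate(prompts, sampling_params=sampling_params)
texts = [out.outputs[0].text for out in outputs]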

That was when I ran into the garbled-output problem. I ruled out vLLM and the chat template, and eventually found that only FastLanguageModel is affected while FastModel works fine. I finally pinned it down by reading the unsloth source inside the venv with gpt-5-codex.

TL;DR: after every generation, unsloth resets the tokenizer's padding_side to "right", so the fix is to manually set it back to "left" before each batched tokenization.
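In code, the workaround is a single line before every batched tokenization (same model and tokenizer as in the script below):

tokenizer.padding_side = "left"  # unsloth flips this back to "right" after generate()
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=1024)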

The runnable reference script is below:

File: test_fast_language_model.py

import os
os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
import random

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

import torch

# This is a core helper function to create the full prompt from the problem text.
def create_prompt(problem_text: str, tokenizer) -> str:
    """Formats the problem using the Qwen3 chat template."""
    return tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful math assistant."},
            {
                "role": "user",
                "content": f"Solve the problem step-by-step and state the answer.\n\nProblem:\n{problem_text}",
            },
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
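# For reference, the "qwen3" template renders ChatML-style prompts, roughly
# (assumed shape; the exact special tokens depend on the template version):
#   <|im_start|>system\nYou are a helpful math assistant.<|im_end|>\n
#   <|im_start|>user\n...<|im_end|>\n
#   <|im_start|>assistant\n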

# This function defines what constitutes "gibberish" or "corrupted" output.
def check_for_corruption(text: str) -> str | None:
    """Checks the generated text for known corruption patterns."""
    stripped_text = text.strip()
    if "ה" in stripped_text or "ת" in stripped_text:
        return "Detected Hebrew characters (garbled)"
    if "is the is the" in stripped_text.lower():
        return "Inappropriate content: contains 'is the is the'"
    if "the the the" in stripped_text.lower():
        return "Inappropriate content: contains 'the the the'"
    return None

# Encapsulates the testing logic for a single batch.
def run_batch_test(model, tokenizer, batch_size: int, prompts_pool: list) -> list:
    """Runs the generation and corruption check for a given batch size."""
    prompts = [create_prompt(random.choice(prompts_pool), tokenizer) for _ in range(batch_size)]

    # Pick the device explicitly rather than catching exceptions from .to("cuda"),
    # which would mislabel unrelated errors as CUDA problems.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        print("[WARN] CUDA not available, falling back to CPU for inputs.")
    tokenizer.padding_side = "left"  # Unsloth resets this to "right" after generate()
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)

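    # Left padding matters for batched decoder-only generation: with right
    # padding, new tokens are generated after a run of pad tokens, which is
    # exactly the kind of corruption this script detects.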
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )
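    # Decode only the continuation: slice off the prompt tokens so they are
    # not re-decoded into the output text.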
    texts = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

    results = []
    for text in texts:
        reason = check_for_corruption(text)
        results.append((reason, text))
    return results

def main():
    # --- Configuration ---
    MODEL_NAME = "unsloth/Qwen3-1.7B-Base"
    BATCH_SIZES_TO_TEST = [4, 8]

    proven_failure_prompts = [
        r"What is the product of the squares of the solutions of $2x^2 + 13x + 6 = 0$?",
        r"The solutions of $x(x-3)=1$ may be expressed in the form $\frac{a+\sqrt{b}}{c}$ and $\frac{a-\sqrt{b}}{c}$, where $a$, $b$, and $c$ are prime numbers.  Find $abc$.",
        r"Simplify $\n\root 3 \of {x \root 3 \of {x \root 3 \of {x \sqrt{x}}}}.\n$ Express your answer in simplest radical form in terms of $x$.",
        r"Simplify $(3-i)(6+2i)$.",
        r"Given that $f(x)$ is a function such that $f(1)=2$, $f(4)=3$, $f(7)=4$, and $f^{-1}(x)$ is the inverse of $f(x)$, what is $f^{-1}(f^{-1}(3))$?",
        r"I choose a random integer $n$ between $1$ and $10$ inclusive. What is the probability that for the $n$ I chose, there exist no real solutions to the equation $x(x+5) = -n$? Express your answer as a common fraction.",
        r"What is the domain of the function $k(y) = \frac{1}{2y+1}~?$ Express your answer in interval notation.",
        r"Evaluate $\log_{\sqrt8}(64\sqrt{8})$.",
    ]

    # --- Model and Tokenizer Setup ---
    print(f"Loading model: {MODEL_NAME}...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME, max_seq_length=2048, dtype=None, load_in_4bit=True,
        fast_inference=True,
    )
    tokenizer = get_chat_template(tokenizer, chat_template="qwen3")
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("Model and tokenizer loaded successfully.\n")

    # --- Testing Loop and Reporting ---
    final_summary = {}
    for bs in BATCH_SIZES_TO_TEST:
        print(f"\n{'='*20} TESTING BATCH SIZE: {bs} {'='*20}")
        results = run_batch_test(model, tokenizer, bs, proven_failure_prompts)
        corrupted_count = sum(1 for reason, _ in results if reason is not None)
        final_summary[bs] = corrupted_count

        print(f"Status: {'FAILED' if corrupted_count > 0 else 'PASSED'} | Found {corrupted_count}/{bs} corrupted outputs.")
        for i, (reason, text) in enumerate(results):
            preview = text[:15].strip().replace("\n", " ")
            status = "CORRUPTED" if reason else "OK"
            print(f"  - Example {i+1}: Status = {status} | Preview = '{preview}...'")

    # --- Final Summary ---
    print(f"\n\n{'='*25} FINAL SUMMARY {'='*25}")
    for bs, count in final_summary.items():
        result = "FAILED" if count > 0 else "PASSED"
        print(f"Batch Size: {bs:<4} | Corrupted: {count:<4} | Result: {result}")


if __name__ == "__main__":
    main()
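To see the reset directly, here is a quick interactive check (a sketch reusing the model and tokenizer from the script; which unsloth versions are affected may vary):

tokenizer.padding_side = "left"
_ = model.generate(**tokenizer(["test"], return_tensors="pt").to("cuda"), max_new_tokens=1)
print(tokenizer.padding_side)  # prints "right" on affected versions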