Related issue: [[Bug] Garbled text output likes ה during inference when fast_inference=True(VLLM) is enabled using Qwen3-1.7B · Issue #3320 · unslothai/unsloth](https://github.com/unslothai/unsloth/issues/3320)
Generation was slow at first, so I tried speeding it up with vLLM by enabling UNSLOTH_VLLM_STANDBY and fast_inference. In practice this gave no speedup, because the ordinary generate() path never goes through vLLM; you have to call fast_generate instead (sketched below).
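For reference, this is roughly what the fast_generate path looks like; a minimal sketch following the unsloth/vLLM docs as I understand them, with the sampling values as placeholders:

```python
from vllm import SamplingParams

# fast_generate routes through vLLM; plain model.generate() does not.
sampling_params = SamplingParams(temperature=0.0, max_tokens=1024)
outputs = model.fast_generate([prompt], sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```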
That is when I hit the garbled-output problem. I ruled out vLLM and the chat template one by one, and found that FastLanguageModel was affected while FastModel worked fine; in the end I read through the unsloth code in my venv with gpt-5-codex and tracked down the cause.
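For comparison, the FastModel path (which produced clean output in my tests) is a drop-in sketch, assuming it accepts the same keyword arguments for this checkpoint:

```python
from unsloth import FastModel

# Same checkpoint, loaded via FastModel instead of FastLanguageModel;
# this loader did not show the corruption in my testing.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/Qwen3-1.7B-Base",
    max_seq_length=2048,
    load_in_4bit=True,
)
```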
TL;DR: every time unsloth finishes a generate() call, it sets the tokenizer's padding_side back to "right", so you have to manually reset it to "left" (decoder-only models need left padding for batched generation) before tokenizing the next batch.
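The workaround in isolation is a single line before each batched tokenization; a minimal sketch:

```python
# unsloth flips padding_side back to "right" after every generate() call;
# batched decoder-only generation needs left padding, so reset it each time.
tokenizer.padding_side = "left"
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=1024)
```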
The full runnable repro script I used:
File: test_fast_language_model.py

```python
import os
os.environ["UNSLOTH_VLLM_STANDBY"] = "1"

import random

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template


# This is a core helper function to create the full prompt from the problem text.
def create_prompt(problem_text: str, tokenizer) -> str:
    """Formats the problem using the Qwen3 chat template."""
    return tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a helpful math assistant."},
            {
                "role": "user",
                "content": f"Solve the problem step-by-step and state the answer.\n\nProblem:\n{problem_text}",
            },
        ],
        tokenize=False,
        add_generation_prompt=True,
    )


# This function defines what constitutes "gibberish" or "corrupted" output.
def check_for_corruption(text: str) -> str | None:
    """Checks the generated text for known corruption patterns."""
    stripped_text = text.strip()
    if "ה" in stripped_text or "ת" in stripped_text:
        return "Detected Hebrew characters (garbled)"
    if "is the is the" in stripped_text.lower():
        return "Degenerate repetition: contains 'is the is the'"
    if "the the the" in stripped_text.lower():
        return "Degenerate repetition: contains 'the the the'"
    return None


# Encapsulates the testing logic for a single batch.
def run_batch_test(model, tokenizer, batch_size: int, prompts_pool: list) -> list:
    """Runs the generation and corruption check for a given batch size."""
    prompts = [create_prompt(random.choice(prompts_pool), tokenizer) for _ in range(batch_size)]
    # Unsloth resets padding_side to "right" after generate(), which corrupts
    # batched decoder-only generation, so force it back before every tokenization.
    tokenizer.padding_side = "left"
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )
    # Decode only the newly generated tokens, skipping the prompt portion.
    texts = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    results = []
    for text in texts:
        reason = check_for_corruption(text)
        results.append((reason, text))
    return results


def main():
    # --- Configuration ---
    MODEL_NAME = "unsloth/Qwen3-1.7B-Base"
    BATCH_SIZES_TO_TEST = [4, 8]
    proven_failure_prompts = [
        r"What is the product of the squares of the solutions of $2x^2 + 13x + 6 = 0$?",
        r"The solutions of $x(x-3)=1$ may be expressed in the form $\frac{a+\sqrt{b}}{c}$ and $\frac{a-\sqrt{b}}{c}$, where $a$, $b$, and $c$ are prime numbers. Find $abc$.",
        r"Simplify $\root 3 \of {x \root 3 \of {x \root 3 \of {x \sqrt{x}}}}.$ Express your answer in simplest radical form in terms of $x$.",
        r"Simplify $(3-i)(6+2i)$.",
        r"Given that $f(x)$ is a function such that $f(1)=2$, $f(4)=3$, $f(7)=4$, and $f^{-1}(x)$ is the inverse of $f(x)$, what is $f^{-1}(f^{-1}(3))$?",
        r"I choose a random integer $n$ between $1$ and $10$ inclusive. What is the probability that for the $n$ I chose, there exist no real solutions to the equation $x(x+5) = -n$? Express your answer as a common fraction.",
        r"What is the domain of the function $k(y) = \frac{1}{2y+1}~?$ Express your answer in interval notation.",
        r"Evaluate $\log_{\sqrt8}(64\sqrt{8})$.",
    ]

    # --- Model and Tokenizer Setup ---
    print(f"Loading model: {MODEL_NAME}...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME, max_seq_length=2048, dtype=None, load_in_4bit=True,
        fast_inference=True,
    )
    tokenizer = get_chat_template(tokenizer, chat_template="qwen3")
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("Model and tokenizer loaded successfully.\n")

    # --- Testing Loop and Reporting ---
    final_summary = {}
    for bs in BATCH_SIZES_TO_TEST:
        print(f"\n{'='*20} TESTING BATCH SIZE: {bs} {'='*20}")
        results = run_batch_test(model, tokenizer, bs, proven_failure_prompts)
        corrupted_count = sum(1 for reason, _ in results if reason is not None)
        final_summary[bs] = corrupted_count
        print(f"Status: {'FAILED' if corrupted_count > 0 else 'PASSED'} | Found {corrupted_count}/{bs} corrupted outputs.")
        for i, (reason, text) in enumerate(results):
            preview = text[:15].strip().replace("\n", " ")
            status = "CORRUPTED" if reason else "OK"
            print(f"  - Example {i+1}: Status = {status} | Preview = '{preview}...'")

    # --- Final Summary ---
    print(f"\n\n{'='*25} FINAL SUMMARY {'='*25}")
    for bs, count in final_summary.items():
        result = "FAILED" if count > 0 else "PASSED"
        print(f"Batch Size: {bs:<4} | Corrupted: {count:<4} | Result: {result}")


if __name__ == "__main__":
    main()
```