import json
import os
import platform
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face repository id handed to the tokenizer and model loaders.
MODEL_REPO: str = "Qwen/Qwen3-1.7B-Base"

# Prompts evaluated one at a time; each produces one entry in the
# "test_cases" list of the generated reference file.
TEST_PROMPTS: list[str] = [
    "The capital of France is",
    "Two plus two equals",
    "Once upon a time, there was a",
]

# Number of highest-valued last-position logits recorded per prompt.
# NOTE: the output key is the literal "top_10", so changing this value
# makes that key name inaccurate.
TOP_K: int = 10
def main() -> None:
    """Generate the Qwen3 forward-pass reference file.

    Loads ``MODEL_REPO`` in float32 on CPU, runs one forward pass per entry
    of ``TEST_PROMPTS`` and records, for the last token position, the
    ``TOP_K`` largest logits and the last entry of the returned
    ``hidden_states``. The per-prompt records, together with library
    versions, platform string and selected model-config fields, are written
    as indented JSON to ``qwen3_forward_reference.json`` in this script's
    directory. Progress is printed to stdout.
    """
    # Determinism settings. ``setdefault`` keeps any value that is already
    # exported in the environment.
    # NOTE(review): the model is placed on CPU below, so the cuBLAS workspace
    # setting presumably only matters if the device is changed — confirm.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":16:8")
    torch.use_deterministic_algorithms(True)
    torch.manual_seed(0)

    import transformers as hf_transformers

    # Captured once; reused for both the banner and the JSON metadata.
    torch_version = torch.__version__
    hf_version = hf_transformers.__version__
    host = platform.platform()

    print(f"Qwen3 forward-pass reference generation for {MODEL_REPO}")
    print(f" {len(TEST_PROMPTS)} prompts, top-{TOP_K} logits per prompt")
    print(f" torch {torch_version}, transformers {hf_version}")
    print(f" platform {host}")
    print()

    print("Loading model + tokenizer ...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        dtype=torch.float32,
        device_map="cpu",
    )
    model.eval()

    config = model.config
    print(
        f" hidden_size={config.hidden_size}, num_layers={config.num_hidden_layers}, "
        f"vocab_size={config.vocab_size}, head_dim={config.head_dim}, "
        f"num_kv_heads={config.num_key_value_heads}"
    )
    print()

    # Per-prompt records; the same list object is referenced from ``report``.
    cases: list = []
    report: dict = {
        "model_repo": MODEL_REPO,
        "methodology": "from-first-principles forward-pass oracle "
        "(transformers.AutoModelForCausalLM, F32 CPU)",
        "torch_version": torch_version,
        "transformers_version": hf_version,
        "platform": host,
        "hidden_size": config.hidden_size,
        "num_layers": config.num_hidden_layers,
        "vocab_size": config.vocab_size,
        "head_dim": config.head_dim,
        "num_attention_heads": config.num_attention_heads,
        "num_kv_heads": config.num_key_value_heads,
        # These three may be absent from the config; None is stored then.
        "max_position_embeddings": getattr(config, "max_position_embeddings", None),
        "rope_theta": getattr(config, "rope_theta", None),
        "rms_norm_eps": getattr(config, "rms_norm_eps", None),
        # Hard-coded flag, not read from the config.
        "use_qk_norm": True,
        "test_cases": cases,
    }

    with torch.no_grad():
        for prompt in TEST_PROMPTS:
            token_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
            tokens = token_ids[0].tolist()

            forward = model(
                input_ids=token_ids,
                output_hidden_states=True,
                use_cache=False,
                return_dict=True,
            )

            # Logits at the last sequence position of the single batch row.
            final_logits = forward.logits[0, -1].float()
            best_vals, best_ids = torch.topk(final_logits, TOP_K)
            ranked_logits = best_vals.tolist()
            ranked_ids = best_ids.tolist()

            # NOTE(review): whether hidden_states[-1] is taken before or
            # after the model's final normalization depends on the
            # transformers implementation — confirm before treating it as
            # the raw residual stream.
            last_residual = forward.hidden_states[-1][0, -1, :].float().tolist()

            top1_id = ranked_ids[0]
            top1_text = tokenizer.decode([top1_id])
            print(
                f" prompt='{prompt}': {len(tokens)} tokens, "
                f"top1=({top1_id}, '{top1_text}', {ranked_logits[0]:.4f})"
            )

            cases.append(
                {
                    "prompt": prompt,
                    "tokens": tokens,
                    "top_10": [
                        {"index": token_id, "logit": logit}
                        for token_id, logit in zip(ranked_ids, ranked_logits)
                    ],
                    "last_residual_f32": last_residual,
                }
            )

    # The reference file is written into this script's directory.
    out_path = Path(__file__).with_name("qwen3_forward_reference.json")
    with out_path.open("w") as handle:
        json.dump(report, handle, indent=2)

    size_bytes = out_path.stat().st_size
    print(
        f"\nSaved {len(cases)} test cases to {out_path} "
        f"({size_bytes / 1024:.1f} KB)"
    )
# Run the reference generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()