from __future__ import annotations
import argparse
import sys
# Largest value representable as an unsigned 32-bit token id.
_U32_MAX = 0xFFFF_FFFF

# Context size used when --n-ctx is omitted (or given as 0).
_DEFAULT_N_CTX = 4096


def _parse_prompt_ids(raw: str) -> list[int]:
    """Parse a comma-separated string of u32 token ids.

    Blank fields (for example from a trailing comma) are skipped.

    Args:
        raw: The raw ``--prompt-ids`` value, e.g. ``"1,2,3"``.

    Returns:
        The token ids in the order given; empty if *raw* has no ids.

    Raises:
        ValueError: If a field is not an integer or is outside the
            unsigned 32-bit range.
    """
    ids: list[int] = []
    for field in raw.split(","):
        field = field.strip()
        if not field:
            continue
        try:
            tok = int(field)
        except ValueError:
            raise ValueError(f"invalid token id {field!r}") from None
        if not 0 <= tok <= _U32_MAX:
            raise ValueError(f"token id {tok} is outside the u32 range")
        ids.append(tok)
    return ids


def main() -> int:
    """Print the top-k logits at the last prompt position as a parity reference.

    Parses the command line, loads the GGUF model through llama-cpp-python,
    evaluates the prompt token ids, and writes one
    ``REF_LOGIT idx=<rank> token=<id> value=<logit>`` line per top-k entry
    to stdout. Progress and error messages go to stderr.

    Returns:
        0 on success; 2 if the arguments are invalid or llama-cpp-python
        is not installed.
    """
    p = argparse.ArgumentParser(description="qwen35 parity reference (llama-cpp-python)")
    p.add_argument("weights", help="path to .gguf")
    p.add_argument(
        "--prompt-ids",
        default="1,2,3",
        help="comma-separated u32 token ids (default: 1,2,3)",
    )
    p.add_argument("--top-k", type=int, default=16)
    p.add_argument(
        "--n-ctx",
        type=int,
        default=None,
        help=f"context size (default: {_DEFAULT_N_CTX})",
    )
    args = p.parse_args()

    # Validate the cheap inputs first so bad arguments fail fast, before the
    # backend import and the model load.
    try:
        prompt_ids = _parse_prompt_ids(args.prompt_ids)
    except ValueError as exc:
        print(f"ERROR: bad --prompt-ids: {exc}", file=sys.stderr)
        return 2
    if not prompt_ids:
        print("ERROR: empty prompt-ids", file=sys.stderr)
        return 2
    if args.top_k < 0:
        # A negative bound would turn the slice below into "all but the last k".
        print("ERROR: --top-k must be >= 0", file=sys.stderr)
        return 2

    # Imported lazily so a missing optional dependency yields an install hint
    # instead of a traceback at module import time.
    try:
        from llama_cpp import Llama
    except ImportError:
        print(
            "ERROR: `llama-cpp-python` not installed. Run:\n"
            "    pip install llama-cpp-python\n"
            "(set CMAKE_ARGS='-DGGML_METAL=on' on macOS for Metal builds).",
            file=sys.stderr,
        )
        return 2

    print(f"# Loading {args.weights} via llama-cpp-python…", file=sys.stderr)
    llm = Llama(
        model_path=args.weights,
        # Both an omitted flag and an explicit 0 fall back to the default.
        n_ctx=args.n_ctx or _DEFAULT_N_CTX,
        logits_all=True,
        verbose=False,
    )
    llm.eval(prompt_ids)

    # The last entry holds the logits for the final evaluated position.
    logits = llm.eval_logits[-1]
    n_vocab = len(logits)
    print(f"# REF logits: len={n_vocab}", file=sys.stderr)

    # Highest logit first; keep only the requested number of entries.
    pairs = sorted(enumerate(logits), key=lambda kv: kv[1], reverse=True)[: args.top_k]
    for rank, (tok_id, val) in enumerate(pairs):
        print(f"REF_LOGIT idx={rank} token={tok_id} value={val:.6f}")
    return 0
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())