#!/usr/bin/env python3
"""Compare layer 0 outputs between llama-cpp-python internals and our computation."""
import numpy as np
from llama_cpp import Llama
model_path = "/home/joseph/Models/qwen2.5-0.5b-instruct-q4_k_m.gguf"

print("Loading model...")
llm = Llama(
    model_path=model_path,
    n_ctx=512,
    verbose=False,
    logits_all=True,
)

# Per the original author's note, llama-cpp-python does not expose internal
# hidden states directly, so this script inspects the final logits instead.

# Start with the smallest possible input: a single token.
tokens = [28]  # "="
print(f"Input tokens: {tokens}")

# Clear any previous state, then run the model over the token list.
llm.reset()
llm.eval(tokens)

# NOTE(review): `_scores` is an underscore-prefixed attribute of Llama and may
# change between llama-cpp-python releases. The last row is copied into a
# fresh array before it is inspected.
logits_single = np.array(llm._scores[-1])
logit_min = logits_single.min()
logit_max = logits_single.max()

print(f"\nSingle token '=' (token 28) logits:")
print(f" min: {logit_min:.4f}, max: {logit_max:.4f}")
print(f" Token 17 ('2') logit: {logits_single[17]:.4f}")
print(f" First 10: {logits_single[:10]}")

# Five highest-scoring token ids, best first, paired with their logits.
top_idx = np.argsort(logits_single)[::-1][:5]
top_pairs = list(zip(top_idx, logits_single[top_idx]))
print(f" Top 5: {top_pairs}")
# Compare with our Rust output for single token
# From our earlier tests:
# Single token "=" gives Token 17 rank around 3880 in Rust
# But llama-cpp should be different
# Also check the embedding itself
# We can do this by reading the tensor directly from the GGUF file
print("\n=== Token Embedding Check ===")
from gguf import GGUFReader

reader = GGUFReader(model_path)

# Scan the tensors stored in the GGUF file for the token-embedding matrix and
# report its shape, tensor type, and the shape of the array the reader exposes.
for tensor in reader.tensors:
    if tensor.name == "token_embd.weight":
        print(f"Embedding tensor shape: {tensor.shape}")
        print(f"Embedding tensor type: {tensor.tensor_type}")
        # The shape should be [hidden_size, vocab_size] = [896, 151936].
        # Check what the reader actually hands back for the tensor payload.
        data = tensor.data
        print(f"Data shape: {data.shape}")
        # For token 28, the embedding would be the 896 consecutive elements
        # [28*896 : (28+1)*896], assuming the hidden dimension is the
        # fastest-varying one (TODO confirm against the printed shape).
        # That element range only applies to dequantized values; for a
        # quantized tensor type `data` presumably holds the raw quantized
        # bytes, so it must be dequantized before it can be indexed this way.
        break
else:
    # The loop finished without hitting `break`: say so instead of printing
    # nothing, which would look like a check that was never run.
    print("Embedding tensor 'token_embd.weight' not found in GGUF file")