import numpy as np
from gguf import GGUFReader, GGUFValueType
from llama_cpp import Llama
# Inspect selected tensor metadata straight from the GGUF file on disk.
model_path = "/home/joseph/Models/qwen2.5-0.5b-instruct-q4_k_m.gguf"
reader = GGUFReader(model_path)

print("=== GGUF Tensor Info ===")
for tensor in reader.tensors:
    # Limit output to the tensors relevant to the embedding -> logits path.
    if tensor.name in ['token_embd.weight', 'output.weight', 'output_norm.weight', 'blk.0.attn_q.weight']:
        print(f"\n{tensor.name}:")
        print(f" Shape (ne): {tensor.shape}")
        print(f" Type: {tensor.tensor_type}")
        print(f" Data shape: {tensor.data.shape}")
        if tensor.name == 'output.weight':
            # Qwen2.5-0.5B dimensions; hard-coded here -- TODO confirm against
            # the GGUF metadata keys instead of relying on these constants.
            hidden_size = 896
            vocab_size = 151936
            expected_elements = hidden_size * vocab_size
            print(f" Expected elements (896 x 151936): {expected_elements}")
# Load the model via llama.cpp bindings (embedding=True so hidden states can
# be pulled later; verbose=False keeps llama.cpp's own logging quiet).
llm = Llama(model_path=model_path, n_ctx=512, verbose=False, embedding=True)

for tensor in reader.tensors:
    if tensor.name == 'output.weight':
        print("\n=== output.weight details ===")
        print(f" Type: {tensor.tensor_type} (Q8_0 = 8)")
        # A Q8_0 block packs 32 int8 weights plus one fp16 scale: 34 bytes.
        # 896 hidden elements -> 28 blocks per column.
        blocks_per_column = 896 // 32
        bytes_per_block = 32 + 2
        expected_bytes = blocks_per_column * bytes_per_block * 151936
        print(f" Expected raw bytes: {expected_bytes}")
        print(f" Actual raw data len: {len(tensor.data.tobytes())}")
# Notes documenting the GGUF dimension-order convention for the logits matmul.
_MATMUL_ORDER_NOTES = (
    "\n=== Verifying matrix multiplication order ===",
    "\nAccording to GGUF spec:",
    " - ne[0] is the fastest-varying dimension (contiguous)",
    " - For shape [896, 151936], each column of 896 elements is contiguous",
    " - output.weight[i, j] = data[i + j * 896]",
    " - logits[j] = sum_i(hidden[i] * output.weight[i, j])",
    " - This matches vec_mat implementation",
)
for _note in _MATMUL_ORDER_NOTES:
    print(_note)
print("\n=== Checking first few dequantized values ===")
for tensor in reader.tensors:
    if tensor.name == 'output.weight':
        # GGUFReader exposes the raw quantized bytes, not dequantized floats;
        # the notes below interpret the packed layout.
        data = tensor.data
        print(f" Raw data dtype: {data.dtype}")
        print(f" Raw data shape: {data.shape}")
        print("\n IMPORTANT: The shape (151936, 952) means:")
        print(" - 151936 rows (one per vocab token)")
        print(" - 952 bytes per row = 28 Q8_0 blocks * 34 bytes/block")
        print(" - 28 blocks * 32 elements/block = 896 elements")
        print(" - So data[vocab_idx, :] = packed Q8_0 for hidden weights of that vocab token")
        print("\n This means the weight is stored as [vocab_size, hidden_size] in memory!")
        print(" But GGUF reports shape as [896, 151936]...")
        print(" The GGUF shape [ne0, ne1] = [896, 151936] might mean:")
        print(" - GGUF convention: ne[0] is inner dimension")
        print(" - But physical storage could be [ne1, packed_ne0]")