candle-pyo3 0.3.2

from candle.utils import load_safetensors, save_gguf, load_gguf
from candle.models.bert import BertModel, Config
import json
from candle import Tensor
from tqdm import tqdm
from dataclasses import fields
import os
import time

from huggingface_hub import hf_hub_download
from transformers import BertTokenizer, AutoModel
import torch

if __name__ == "__main__":
    model_name = "intfloat/e5-small-v2"
    model_file = hf_hub_download(repo_id=model_name, filename="model.safetensors")
    config_file = hf_hub_download(repo_id=model_name, filename="config.json")

    tensors = load_safetensors(model_file)
    config = Config()
    with open(config_file, "r") as f:
        raw_config = json.load(f)
        for field in fields(config):
            if field.name in raw_config:
                setattr(config, field.name, raw_config[field.name])

    # Load the model
    model = BertModel(config)
    model.load_state_dict(tensors)

    hf_model = AutoModel.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    sentences = [
        "The cat sits outside",
        "A man is playing guitar",
        "I love pasta",
        "The new movie is awesome",
        "The cat plays in the garden",
        "A woman watches TV",
        "The new movie is so great",
        "Do you like pizza?",
    ]

    def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor):
        """Average the hidden states according to the attention mask"""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    tokenized = tokenizer(sentences, padding=True)
    tokens = Tensor(tokenized["input_ids"])
    token_type_ids = Tensor(tokenized["token_type_ids"])
    attention_mask = Tensor(tokenized["attention_mask"])
    encoder_out, _ = model.forward(tokens, token_type_ids, attention_mask=attention_mask)

    hf_tokenized = tokenizer(sentences, padding=True, return_tensors="pt")
    hf_result = hf_model(**hf_tokenized)["last_hidden_state"]

    hf_pooled = average_pool(hf_result, hf_tokenized["attention_mask"])
    candle_pooled = average_pool(torch.tensor(encoder_out.values()), hf_tokenized["attention_mask"])

    loss = torch.nn.L1Loss()
    error = loss(hf_pooled, candle_pooled).mean().item()
    print(f"Mean error between torch-reference and candle: {error}")

    # Quantize all attention 'weights'
    quantized_tensors = {}
    for name, tensor in tqdm(tensors.items(), desc="Quantizing tensors to 5-Bit"):
        if name.endswith("weight") and ("attention" in name or "intermediate" in name or "output" in name):
            # check if the tensor is k-quantizable
            if tensor.shape[-1] % 256 == 0:
                new_tensor = tensor.quantize("q4k")
            else:
                new_tensor = tensor.quantize("q5_0")
            quantized_tensors[name] = new_tensor
        else:
            quantized_tensors[name] = tensor.quantize("q8_0")

    print(f"Saving quantized tensors")
    # Remove all None values from the config
    config_to_save = {k: v for k, v in config.__dict__.items() if v is not None}
    # Save the model
    quantized_model_file = "e5_small.gguf"
    save_gguf(quantized_model_file, quantized_tensors, config_to_save)

    file_size_mb = os.path.getsize(model_file) / 1024 / 1024
    file_size_mb_compressed = os.path.getsize(quantized_model_file) / 1024 / 1024
    print(f"Compressed model from {file_size_mb:.2f} MB to {file_size_mb_compressed:.2f} MB")
    # Load the model from the gguf
    tensors, raw_config = load_gguf(quantized_model_file)
    config = Config()
    for field in fields(config):
        if field.name in raw_config:
            setattr(config, field.name, raw_config[field.name])
    model = BertModel(config)
    # "embeddings.position_ids" is missing in the gguf as it is i64
    model.load_state_dict(tensors, strict=False)

    # Run the model again
    encoder_out_2, pooled_output_2 = model.forward(tokens, token_type_ids)
    encoder_out_2, pooled_output_2 = encoder_out_2.to_device("cpu"), pooled_output_2.to_device("cpu")

    candle_pooled_2 = average_pool(torch.tensor(encoder_out_2.values()), hf_tokenized["attention_mask"])
    error = loss(hf_pooled, candle_pooled_2).mean().item()
    print(f"Mean error between torch-reference and quantized-candle: {error}")