embedding 0.1.1

A Rust library and CLI for training embeddings from scratch
Documentation
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use embedding::*;

/// Benchmarks end-to-end skip-gram training on a four-sentence toy corpus.
///
/// Each timed iteration builds a fresh `EmbeddingModel` from a clone of the
/// config and trains it for 5 epochs, so the measurement covers model
/// construction, training, and dropping the model.
///
/// # Panics
/// Panics if `model.train` fails, because its result is unwrapped.
fn bench_skipgram_training(c: &mut Criterion) {
    let sentences = vec![
        vec!["the".to_string(), "quick".to_string(), "brown".to_string(), "fox".to_string()],
        vec!["jumps".to_string(), "over".to_string(), "the".to_string(), "lazy".to_string()],
        vec!["dog".to_string(), "and".to_string(), "cat".to_string()],
        vec!["machine".to_string(), "learning".to_string(), "is".to_string(), "fun".to_string()],
    ];

    let (vocab, reverse_vocab) = build_vocab(&sentences);
    let training_data = TrainingData {
        sentences,
        vocab,
        reverse_vocab,
    };

    let config = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 5,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    c.bench_function("skipgram_training", |b| {
        b.iter(|| {
            let mut model = EmbeddingModel::new(config.clone(), training_data.vocab.len());
            // Hide the input from the optimizer so training cannot be specialised
            // to this particular corpus.
            model.train(black_box(&training_data)).unwrap();
            // Observe the trained model itself. Black-boxing `()` carries no
            // data dependency on the training work and is effectively a no-op.
            black_box(&model);
        })
    });
}

/// Benchmarks end-to-end CBOW training on a four-sentence toy corpus.
///
/// Each timed iteration builds a fresh `EmbeddingModel` from a clone of the
/// config and trains it for 5 epochs, so the measurement covers model
/// construction, training, and dropping the model.
///
/// # Panics
/// Panics if `model.train` fails, because its result is unwrapped.
fn bench_cbow_training(c: &mut Criterion) {
    let sentences = vec![
        vec!["the".to_string(), "quick".to_string(), "brown".to_string(), "fox".to_string()],
        vec!["jumps".to_string(), "over".to_string(), "the".to_string(), "lazy".to_string()],
        vec!["dog".to_string(), "and".to_string(), "cat".to_string()],
        vec!["machine".to_string(), "learning".to_string(), "is".to_string(), "fun".to_string()],
    ];

    let (vocab, reverse_vocab) = build_vocab(&sentences);
    let training_data = TrainingData {
        sentences,
        vocab,
        reverse_vocab,
    };

    let config = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 5,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::Cbow,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    c.bench_function("cbow_training", |b| {
        b.iter(|| {
            let mut model = EmbeddingModel::new(config.clone(), training_data.vocab.len());
            // Hide the input from the optimizer so training cannot be specialised
            // to this particular corpus.
            model.train(black_box(&training_data)).unwrap();
            // Observe the trained model itself. Black-boxing `()` carries no
            // data dependency on the training work and is effectively a no-op.
            black_box(&model);
        })
    });
}

/// Measures `EmbeddingModel::similarity` for the word pair "fox" / "dog".
///
/// A skip-gram model is trained once (10 epochs) on a four-sentence toy
/// corpus before timing starts; only the similarity call sits inside the
/// timed closure.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_similarity_calculation(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| token.to_string()).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    // Train once up front so the timed region contains only the lookup.
    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    c.bench_function("similarity_calculation", |bencher| {
        bencher.iter(|| {
            let score = model.similarity("fox", "dog", &training_data);
            black_box(score);
        })
    });
}

/// Measures `EmbeddingModel::get_embedding` over a fixed list of seven words.
///
/// A skip-gram model is trained once (10 epochs) on a four-sentence toy
/// corpus before timing starts. Each timed iteration looks up every word in
/// the list, so one sample covers seven lookups.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_embedding_retrieval(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| String::from(token)).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    // Train once up front so the timed region contains only the lookups.
    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    // Words to look up on every iteration; all of them occur in the corpus.
    let words = vec!["the", "fox", "dog", "machine", "learning", "quick", "brown"];

    c.bench_function("embedding_retrieval", |bencher| {
        bencher.iter(|| {
            black_box(&words).iter().for_each(|word| {
                black_box(model.get_embedding(word, &training_data));
            });
        })
    });
}

/// Measures `build_vocab` on the output of `load_text_data` for a short
/// three-sentence English text.
///
/// The text is loaded once, outside the timed closure; only the
/// `build_vocab` call (and dropping its result) is timed.
fn bench_vocab_building(c: &mut Criterion) {
    // Split across lines for readability; `concat!` yields one `&'static str`.
    let text = concat!(
        "The quick brown fox jumps over the lazy dog. ",
        "The fox is quick and the dog is lazy. ",
        "Machine learning is fascinating and powerful.",
    );
    let loaded = load_text_data(text);

    c.bench_function("vocab_building", |bencher| {
        bencher.iter(|| {
            let built = build_vocab(&loaded);
            black_box(built);
        })
    });
}

/// Measures `EmbeddingModel::semantic_search` for the query "fox" with a
/// final argument of 5.
///
/// A skip-gram model is trained once (10 epochs) on a four-sentence toy
/// corpus before timing starts; only the search call is timed.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_semantic_search(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| token.to_string()).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    // Train once up front so the timed region contains only the search.
    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    c.bench_function("semantic_search", |bencher| {
        bencher.iter(|| {
            let hits = model.semantic_search("fox", &training_data, 5);
            black_box(hits);
        })
    });
}

/// Measures `EmbeddingModel::analogy` for the word triple
/// ("fox", "dog", "cat") with a final argument of 3.
///
/// A skip-gram model is trained once (10 epochs) on a four-sentence toy
/// corpus before timing starts; only the analogy call is timed.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_analogy(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| token.to_string()).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    // Train once up front so the timed region contains only the analogy query.
    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    c.bench_function("analogy", |bencher| {
        bencher.iter(|| {
            let answers = model.analogy("fox", "dog", "cat", &training_data, 3);
            black_box(answers);
        })
    });
}

/// Measures `LSHIndex::query` for the word "fox" with a final argument of 5.
///
/// Setup outside the timed closure: a skip-gram model is trained once
/// (10 epochs) on a four-sentence toy corpus, then an `LSHIndex` created with
/// `LSHIndex::new(4, 100)` is built from that model. Only the query is timed.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_lsh_query(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| token.to_string()).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    // NOTE(review): the second argument (100) presumably mirrors
    // `embedding_dim` above — confirm against `LSHIndex::new`.
    let mut index = LSHIndex::new(4, 100);
    index.build(&model, &training_data);

    c.bench_function("lsh_query", |bencher| {
        bencher.iter(|| {
            let neighbours = index.query("fox", &model, &training_data, 5);
            black_box(neighbours);
        })
    });
}

/// Measures `EmbeddingModel::sentence_embedding` for the three-token
/// sentence ["the", "quick", "fox"].
///
/// A skip-gram model is trained once (10 epochs) on a four-sentence toy
/// corpus before timing starts; only the sentence-embedding call is timed.
///
/// # Panics
/// Panics if the one-off training call fails, because its result is unwrapped.
fn bench_sentence_embedding(c: &mut Criterion) {
    // Hand-tokenised toy corpus; converted to owned strings below.
    let raw_sentences: [&[&str]; 4] = [
        &["the", "quick", "brown", "fox"],
        &["jumps", "over", "the", "lazy"],
        &["dog", "and", "cat"],
        &["machine", "learning", "is", "fun"],
    ];
    let corpus: Vec<Vec<String>> = raw_sentences
        .iter()
        .map(|tokens| tokens.iter().map(|&token| token.to_string()).collect())
        .collect();

    let (vocab, reverse_vocab) = build_vocab(&corpus);
    let training_data = TrainingData {
        sentences: corpus,
        vocab,
        reverse_vocab,
    };

    let hyper_params = TrainingConfig {
        embedding_dim: 100,
        learning_rate: 0.025,
        epochs: 10,
        batch_size: 32,
        context_window: 2,
        negative_samples: 5,
        model_type: ModelType::SkipGram,
        lr_schedule: LearningRateSchedule::Constant,
        early_stopping: None,
        l2_regularization: None,
        gradient_clip: None,
    };

    // Train once up front so the timed region contains only the embedding call.
    let vocab_size = training_data.vocab.len();
    let mut model = EmbeddingModel::new(hyper_params, vocab_size);
    model.train(&training_data).unwrap();

    // Query sentence, allocated once and reused by every iteration.
    let sentence: Vec<String> = ["the", "quick", "fox"]
        .iter()
        .map(|&token| token.to_string())
        .collect();

    c.bench_function("sentence_embedding", |bencher| {
        bencher.iter(|| {
            let embedded = model.sentence_embedding(&sentence, &training_data);
            black_box(embedded);
        })
    });
}

// Registers every benchmark in this file under the `benches` group, using
// Criterion's default configuration, and generates the `main` entry point
// that runs the group.
criterion_group! {
    name = benches;
    config = Criterion::default();
    targets =
        bench_skipgram_training,
        bench_cbow_training,
        bench_similarity_calculation,
        bench_embedding_retrieval,
        bench_vocab_building,
        bench_semantic_search,
        bench_analogy,
        bench_lsh_query,
        bench_sentence_embedding
}
criterion_main!(benches);