word_vec-rs 0.1.0

Memory-efficient library for working with word2vec vectors
pub mod as_vector;
pub mod error;
pub mod export;
pub mod iter;
pub mod parse;
pub mod space;
pub mod vector;

use parse::Word2VecParser;
use space::VecSpace;
use std::time::Instant;
use vector::OwnedVector;

/// Loads the GoogleNews word2vec model in binary format (with term indexing
/// enabled), then prints how many vectors were loaded and how long it took.
///
/// This function never returns: after printing, the process is kept alive with
/// `en_space` still in scope.
/// NOTE(review): presumably this is so the memory footprint of the loaded space
/// can be inspected from outside the process — confirm.
///
/// # Panics
///
/// Panics if the model file cannot be parsed.
fn main() {
    let start = Instant::now();
    let en_space = Word2VecParser::new()
        .binary()
        .index_terms(true)
        .parse_file("./GoogleNews-vectors-negative300.bin")
        .expect("failed to parse ./GoogleNews-vectors-negative300.bin");

    println!(
        "loading {} vectors took: {:?}",
        en_space.len(),
        start.elapsed()
    );

    // Keep the process alive without burning a CPU core. `park` may wake
    // spuriously, so it is called in a loop; an empty `loop {}` would spin at
    // 100% CPU instead.
    loop {
        std::thread::park();
    }
}

/// Interactive lookup loop over two vector spaces.
///
/// Loads `./enja.para.lang0.vec` and `./enja.para.lang1.vec` (both with term
/// indexing enabled), prints `Loaded`, then reads lines from stdin. Every
/// non-blank line is passed to [`print_top_k`] as a query against the first
/// space, with the 10 best matches taken from the second space.
///
/// Returns once stdin reaches end of input.
///
/// # Panics
///
/// Panics if either vector file cannot be parsed or if reading from stdin fails.
pub fn main2() {
    let en_space = Word2VecParser::new()
        .index_terms(true)
        .parse_file("./enja.para.lang0.vec")
        .expect("failed to parse ./enja.para.lang0.vec");

    let ja_space = Word2VecParser::new()
        .index_terms(true)
        .parse_file("./enja.para.lang1.vec")
        .expect("failed to parse ./enja.para.lang1.vec");

    println!("Loaded");
    let mut buf = String::new();
    loop {
        // Reuse one buffer for every line; `read_line` appends, so reset first.
        buf.clear();

        // `read_line` yields `Ok(0)` only at end of input. Without this check a
        // closed stdin would look like an endless stream of blank lines and the
        // loop would spin forever.
        let bytes_read = std::io::stdin()
            .read_line(&mut buf)
            .expect("failed to read from stdin");
        if bytes_read == 0 {
            break;
        }

        let txt = buf.trim();
        if txt.is_empty() {
            continue;
        }

        print_top_k(&en_space, txt, &ja_space, 10);
    }
}

/// Prints the `k` best matches in `space` for a query built from `term`.
///
/// `term` is split on whitespace and each word is looked up in `src_space` via
/// `find_term`; words that are not found are skipped. The vectors of the found
/// words are summed into a single query vector, which is then scored against
/// `space` with `top_k`, using `cosine` as the scoring closure.
///
/// If no word of `term` is found in `src_space`, a "not found" message is
/// printed and nothing else happens. The reported duration covers only the
/// `top_k` call, not the lookup or summation.
fn print_top_k(src_space: &VecSpace, term: &str, space: &VecSpace, k: usize) {
    // `split_whitespace` copes with tabs and repeated spaces, which a plain
    // `split(' ')` would turn into empty or unsplit tokens.
    let subterms: Vec<_> = term
        .split_whitespace()
        .filter_map(|word| src_space.find_term(word))
        .collect();

    let (first, rest) = match subterms.split_first() {
        Some(parts) => parts,
        None => {
            println!("Term {term:?} not found");
            return;
        }
    };

    // Seed the query with the first vector, then add the remaining ones.
    let mut qvec: OwnedVector = borrowme::ToOwned::to_owned(first);
    for &sub in rest {
        qvec = qvec + sub;
    }

    let start = Instant::now();
    let top = space.top_k(k, |o| qvec.cosine(o));
    let dur = start.elapsed();

    println!("Top k={k} for {term:?} (in: {dur:?}):");

    for (sim, vec) in top {
        println!("- {} ({})", vec.term(), sim);
    }

    println!();
}