dataflow_nlp 0.1.1

Dataflow is a data processing library, primarily for machine learning.
Documentation
// TOKENIZATION TESTS
use super::*;

#[test]
fn tokenize_alphabet() {
    let letters: Vec<String> = vec!["h", "e", "l", "l", "o"]
        .iter()
        .map(|t| (*t).to_string())
        .collect();
    let tokenizer = AlphabetTokenizer::load();
    assert_eq!(tokenizer.tokenize("hello"), letters);
}

#[test]
fn tokenize_spaces() {
    let tokens: Vec<String> = vec!["hello", "how", "are", "you"]
        .iter()
        .map(|t| (*t).to_string())
        .collect();
    let tokenizer = WhitespaceTokenizer::load();
    assert_eq!(tokenizer.tokenize("hello how are you"), tokens);
}

#[test]
fn tokenize_sentences() {
    let tokens: Vec<String> = vec!["hello, how are you?", "good, how are you?"]
        .iter()
        .map(|t| (*t).to_string())
        .collect();
    let tokenizer = SentenceTokenizer::load();
    assert_eq!(
        tokenizer.tokenize("hello, how are you? good, how are you?"),
        tokens
    );
}

#[test]
fn tokenize_bpe() {
    let tokens: Vec<String> = vec!["hello", ",", " ", "how", " ", "are", " ", "you"]
        .iter()
        .map(|str| str.to_string())
        .collect();
    let tokenizer = BPETokenizer::load();
    assert_eq!(
        tokenizer.batch_tokenize(vec!["hello, how are you".to_string()]),
        vec![tokens.clone()]
    );
    assert_eq!(tokenizer.tokenize("hello, how are you"), tokens);
}

#[test]
fn tokenize_wordpiece() {
    let tokens: Vec<String> = vec!["hello", ",", "how", "are", "you"]
        .iter()
        .map(|str| str.to_string())
        .collect();
    let tokenizer = WordpieceTokenizer::load();
    assert_eq!(
        tokenizer.batch_tokenize(vec!["hello, how are you".to_string()]),
        vec![tokens.clone()]
    );
    assert_eq!(tokenizer.tokenize("hello, how are you"), tokens);
}

#[test]
fn untokenize_bpe() {
    let sentence = "hello, how are you?";
    let tokenizer = BPETokenizer::load();
    let tokens = tokenizer.tokenize(sentence);
    assert_eq!(tokenizer.untokenize(tokens), sentence.to_string());
}

#[test]
fn untokenize_wordpiece() {
    let sentence = "hello, how are you?";
    let tokenizer = WordpieceTokenizer::load();
    let tokens = tokenizer.tokenize(sentence);
    assert_eq!(tokenizer.untokenize(tokens), sentence.to_string());
}

#[test]
fn untokenize_alphabet() {
    let sentence = "hello, how are you?";
    let tokenizer = AlphabetTokenizer::load();
    let tokens = tokenizer.tokenize(sentence);
    assert_eq!(tokenizer.untokenize(tokens), sentence.to_string());
}

#[test]
fn untokenize_spaces() {
    let sentence = "hello, how are you?";
    let tokenizer = WhitespaceTokenizer::load();
    let tokens = tokenizer.tokenize(sentence);
    assert_eq!(tokenizer.untokenize(tokens), sentence.to_string());
}

#[test]
fn untokenize_sentences() {
    let sentence = "hello how are you? good, how are you?";
    let tokenizer = WhitespaceTokenizer::load();
    let tokens = tokenizer.tokenize(sentence);
    assert_eq!(tokenizer.untokenize(tokens), sentence.to_string());
}