//! Integration tests for `bert_tokenizer` 0.1.3.
//!
//! This crate is a Rust port of Google's BERT WordPiece tokenizer.
//! See the crate documentation for usage details.
use bert_tokenizer::{FullTokenizer, Tokenizer};

#[test]
fn test_full_tokenizer_uncased() {
    // Build a lowercasing tokenizer from the uncased BERT-Base vocabulary.
    let tokenizer = FullTokenizer::new()
        .vocab_from_file("tests/uncased_L-12_H-768_A-12/vocab.txt")
        .do_lower_case(true)
        .build();

    // Input is lowercased and punctuation is split into its own token.
    let expected = vec!["hello", "world", "!"];
    let tokens = tokenizer.tokenize("Hello world!");
    assert_eq!(tokens, expected);

    // A round trip through vocabulary ids must reproduce the same tokens.
    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let roundtrip = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(roundtrip, expected);

    // Detokenization joins tokens with single spaces.
    assert_eq!(tokenizer.convert_tokens_to_string(&roundtrip), "hello world !");
}

#[test]
fn test_full_tokenizer_cased() {
    // Build a case-preserving tokenizer from the cased BERT-Base vocabulary.
    let tokenizer = FullTokenizer::new()
        .vocab_from_file("tests/cased_L-12_H-768_A-12/vocab.txt")
        .do_lower_case(false)
        .build();

    // Original casing is kept; punctuation is still split off.
    let expected = vec!["Hello", "world", "!"];
    let tokens = tokenizer.tokenize("Hello world!");
    assert_eq!(tokens, expected);

    // A round trip through vocabulary ids must reproduce the same tokens.
    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let roundtrip = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(roundtrip, expected);

    // Detokenization joins tokens with single spaces.
    assert_eq!(tokenizer.convert_tokens_to_string(&roundtrip), "Hello world !");
}

#[test]
fn test_full_tokenizer_cased_strip_accents() {
    // Case-preserving tokenizer with explicit accent stripping enabled.
    let tokenizer = FullTokenizer::new()
        .vocab_from_file("tests/cased_L-12_H-768_A-12/vocab.txt")
        .do_lower_case(false)
        .do_strip_accents(true)
        .build();

    // "wörld" has its diacritic removed, so it matches the in-vocab "world".
    let expected = vec!["Hello", "world", "!"];
    let tokens = tokenizer.tokenize("Hello wörld!");
    assert_eq!(tokens, expected);

    // A round trip through vocabulary ids must reproduce the same tokens.
    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let roundtrip = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(roundtrip, expected);

    // Detokenization joins tokens with single spaces.
    assert_eq!(tokenizer.convert_tokens_to_string(&roundtrip), "Hello world !");
}

#[test]
fn test_full_tokenizer_cased_no_strip_accents() {
    // Builder defaults only: no lowercasing / accent-stripping flags set.
    let tokenizer = FullTokenizer::new()
        .vocab_from_file("tests/cased_L-12_H-768_A-12/vocab.txt")
        .build();

    // With accents kept, "wörld" is out of vocabulary and falls back to
    // WordPiece subwords ("##" marks word-internal pieces).
    let expected = vec!["Hello", "w", "##ö", "##rl", "##d", "!"];
    let tokens = tokenizer.tokenize("Hello wörld!");
    assert_eq!(tokens, expected);

    // A round trip through vocabulary ids must reproduce the same tokens.
    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let roundtrip = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(roundtrip, expected);

    // Detokenization rejoins the "##" pieces into "wörld".
    assert_eq!(tokenizer.convert_tokens_to_string(&roundtrip), "Hello wörld !");
}