use crate::tokenize::*;

#[test]
fn test_regexp_tokenizer() {
let s = "fox can't jump 32.3 feet, right?";
let tokenizer = RegexpTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
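    // The default pattern keeps only runs of two or more word characters,
    // so the trailing "t" of "can't" and the "3" after the decimal point
    // are dropped, along with all punctuation.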
let b: &[_] = &["fox", "can", "jump", "32", "feet", "right"];
assert_eq!(tokens, b);
}

#[test]
fn test_regexp_tokenizer_error() {
let tokenizer = RegexpTokenizerParams::default().pattern("(").build();
assert!(tokenizer.is_err());
}
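
// A hedged sketch of the same builder with a custom pattern: `\b\w+\b` is an
// illustrative choice (not the crate default) that also keeps single-character
// word fragments such as the trailing "t" of "can't".
#[test]
fn test_regexp_tokenizer_custom_pattern() {
    let tokenizer = RegexpTokenizerParams::default()
        .pattern(r"\b\w+\b")
        .build()
        .unwrap();
    let tokens: Vec<&str> = tokenizer.tokenize("fox can't").collect();
    let b: &[_] = &["fox", "can", "t"];
    assert_eq!(tokens, b);
}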

#[test]
fn test_unicode_tokenizer() {
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
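    // With word_bounds disabled, tokenization falls back to plain Unicode
    // word segmentation: punctuation is dropped, while "can't" and "32.3"
    // survive as single tokens.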
let tokenizer = UnicodeWordTokenizerParams::default()
.word_bounds(false)
.build()
.unwrap();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
let b: &[_] = &[
"The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right",
];
assert_eq!(tokens, b);
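
    // With the default word_bounds behaviour, punctuation marks are kept
    // as separate tokens; only whitespace is discarded.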
let tokenizer = UnicodeWordTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
let b: &[_] = &[
"The", "quick", "(", "\"", "brown", "\"", ")", "fox", "can't", "jump", "32.3", "feet", ",",
"right", "?",
];
assert_eq!(tokens, b);
}
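
// Hedged check that building with word_bounds(true) explicitly matches the
// Default construction (test_tokenizer_defaults below asserts the default
// for word_bounds is `true`).
#[test]
fn test_unicode_tokenizer_word_bounds_builder() {
    let s = "fox can't";
    let tokenizer = UnicodeWordTokenizerParams::default()
        .word_bounds(true)
        .build()
        .unwrap();
    let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
    let default_tokenizer = UnicodeWordTokenizer::default();
    let default_tokens: Vec<&str> = default_tokenizer.tokenize(s).collect();
    assert_eq!(tokens, default_tokens);
}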

#[test]
fn test_vtext_tokenizer_all_lang() {
let tokenizer = VTextTokenizer::default();
for (s, tokens_ref) in [
("23.2 meters", vec!["23.2", "meters"]),
("11,2 m", vec!["11,2", "m"]),
("1 ..", vec!["1", ".."]),
("I ...", vec!["I", "..."]),
(", o ! o", vec![",", "o", "!", "o"]),
("... ok.", vec!["...", "ok", "."]),
("porte-manteau", vec!["porte-manteau"]),
("name@domain.com", vec!["name@domain.com"]),
("1/2", vec!["1/2"]),
("and/or", vec!["and", "/", "or"]),
("8:30", vec!["8:30"]),
("B&B", vec!["B&B"]),
]
.iter()
{
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(&tokens, tokens_ref);
}
}

#[test]
fn test_vtext_tokenizer_en() {
let tokenizer = VTextTokenizer::default();
for (s, tokens_ref) in [
("We can't", vec!["We", "ca", "n't"]),
("it's", vec!["it", "'s"]),
("it’s", vec!["it", "’s"]),
]
.iter()
{
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(&tokens, tokens_ref);
}
}
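
// Hedged sketch: requesting lang("en") through the builder should reproduce
// the English contraction handling of the Default tokenizer above
// (assumption: the default language is English, consistent with
// test_vtext_tokenizer_en).
#[test]
fn test_vtext_tokenizer_en_params() {
    let tokenizer = VTextTokenizerParams::default().lang("en").build().unwrap();
    let tokens: Vec<&str> = tokenizer.tokenize("We can't").collect();
    let b: &[_] = &["We", "ca", "n't"];
    assert_eq!(tokens, b);
}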

#[test]
fn test_vtext_tokenizer_fr() {
let tokenizer = VTextTokenizerParams::default().lang("fr").build().unwrap();
for (s, tokens_ref) in [("l'image", vec!["l'", "image"])].iter() {
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
assert_eq!(&tokens, tokens_ref);
}
}

#[test]
fn test_vtext_tokenizer_invalid_lang() {
let tokenizer = VTextTokenizerParams::default()
.lang("unknown")
.build()
.unwrap();
assert_eq!(tokenizer.params.lang, "any");
}

#[test]
fn test_character_tokenizer() {
let s = "fox can't";
let tokenizer = CharacterTokenizer::default();
let tokens: Vec<&str> = tokenizer.tokenize(s).collect();
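    // The default window size is four characters, so the output is the
    // sequence of overlapping character 4-grams, whitespace included.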
let b: &[_] = &["fox ", "ox c", "x ca", " can", "can'", "an't"];
assert_eq!(tokens, b);
}
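
// Hedged sketch: assumes a CharacterTokenizerParams builder with a
// window_size setter, mirroring the other *Params builders used in this
// module; with a window of 2, the 4-grams asserted above become bigrams.
#[test]
fn test_character_tokenizer_window_size() {
    let tokenizer = CharacterTokenizerParams::default()
        .window_size(2)
        .build()
        .unwrap();
    let tokens: Vec<&str> = tokenizer.tokenize("fox").collect();
    let b: &[_] = &["fo", "ox"];
    assert_eq!(tokens, b);
}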

#[test]
fn test_tokenizer_defaults() {
let tokenizer = UnicodeWordTokenizer::default();
    assert!(tokenizer.params.word_bounds);
}