use super::*;
/// `normalize` must fold uppercase ASCII letters to lowercase and leave the
/// space between the words in place.
#[test]
fn normalize_lowercases_ascii() {
    let folded = normalize("Hello World");
    assert_eq!(folded, "hello world");
}
/// Accented letters must come out as their plain lowercase ASCII base letters.
///
/// Most inputs are Polish; the last row (`Mañana`) is Spanish, despite the
/// name of the test.
#[test]
fn normalize_strips_polish_diacritics() {
    // (raw input, expected normalized form)
    let cases = [
        ("Kraków", "krakow"),
        ("żółć", "zolc"),
        ("łąka", "laka"),
        ("Mañana", "manana"),
    ];
    for &(raw, folded) in &cases {
        assert_eq!(normalize(raw), folded);
    }
}
/// Letters such as `Ł`, `ß`, `ø` and `Þ` must be folded to ASCII
/// replacements, including multi-letter ones (`ß` -> `ss`, `Þ` -> `th`).
#[test]
fn normalize_folds_non_decomposing_letters() {
    // (raw input, expected normalized form)
    let cases = [
        ("Łódź", "lodz"),
        ("Straße", "strasse"),
        ("Bjørn", "bjorn"),
        ("Þórr", "thorr"),
    ];
    for &(raw, folded) in &cases {
        assert_eq!(normalize(raw), folded);
    }
}
/// An Arabic word written with short-vowel marks must normalize to the same
/// string as the same word written without them.
#[test]
fn normalize_strips_arabic_harakat() {
    let vocalized = normalize("كَتَبَ");
    let bare = normalize("كتب");
    assert_eq!(vocalized, bare);
}
/// Halfwidth katakana and ordinary (fullwidth) katakana spelling the same word
/// must normalize to the same string.
///
/// The halfwidth input is written with `\u{...}` escapes so that editors or
/// text pipelines that apply compatibility normalization cannot silently turn
/// it into the fullwidth form, which would make the comparison vacuous.
#[test]
fn normalize_unifies_cjk_halfwidth_fullwidth() {
    // U+FF76 U+FF80 U+FF76 U+FF85: halfwidth KA, TA, KA, NA.
    let halfwidth = "\u{FF76}\u{FF80}\u{FF76}\u{FF85}";
    let fullwidth = "カタカナ";

    // Guard the fixtures themselves: if both inputs are already identical the
    // assertion below proves nothing about `normalize`.
    assert_ne!(
        halfwidth, fullwidth,
        "test fixtures must differ before normalization"
    );
    assert_eq!(normalize(halfwidth), normalize(fullwidth));
}
/// Running `normalize` on its own output must not change it further.
///
/// The sample mixes accented Latin, em dashes, kanji and Polish letters.
#[test]
fn normalize_is_idempotent() {
    let first_pass = normalize("Café — 東京 — żółć");
    let second_pass = normalize(&first_pass);
    assert_eq!(first_pass, second_pass);
}
/// A six-letter Latin word must yield its four overlapping trigrams, in order.
#[test]
fn ngrams_emits_trigrams_for_latin() {
    let expected = vec!["kit", "itt", "tte", "ten"];
    assert_eq!(ngrams("kitten"), expected);
}
/// A three-character CJK string must yield its two overlapping bigrams, in
/// order.
#[test]
fn ngrams_emits_bigrams_for_cjk() {
    let expected = vec!["日本", "本語"];
    assert_eq!(ngrams("日本語"), expected);
}
/// For input that switches from CJK to Latin, no gram may straddle the script
/// boundary: the CJK part yields a bigram and the Latin part yields trigrams.
#[test]
fn ngrams_mixed_script_splits_at_boundary() {
    let expected = vec!["東京", "tok", "oky", "kyo"];
    assert_eq!(ngrams("東京tokyo"), expected);
}
/// A two-letter Latin run followed by a single CJK character must produce no
/// grams at all.
#[test]
fn ngrams_drops_runs_too_short() {
    let grams = ngrams("ab東");
    assert!(grams.is_empty(), "got {:?}", grams);
}
/// A trigram from the middle of a word (`cat` inside `Concatenate`) must be
/// among the grams produced from the normalized word.
#[test]
fn ngrams_substring_inside_word_is_indexable() {
    let haystack = normalize("Concatenate");
    let grams = ngrams(&haystack);
    let found = grams.iter().any(|gram| *gram == "cat");
    assert!(found, "trigrams: {:?}", grams);
}
/// The empty string must produce no grams.
#[test]
fn ngrams_empty_input_returns_empty() {
    let grams = ngrams("");
    assert!(grams.is_empty());
}
/// `is_cjk` must accept a kanji, a hiragana, a katakana and a hangul character,
/// and reject an ASCII letter, an accented Latin letter and ASCII punctuation.
#[test]
fn is_cjk_classifies_common_scripts() {
    let cjk = ['東', 'あ', 'カ', '한'];
    for &ch in &cjk {
        assert!(is_cjk(ch), "expected {:?} to be classified as CJK", ch);
    }

    let not_cjk = ['a', 'ą', ','];
    for &ch in &not_cjk {
        assert!(!is_cjk(ch), "expected {:?} not to be classified as CJK", ch);
    }
}