#![warn(missing_docs)]
pub mod flash;
pub mod fold;
pub mod html;
pub mod ngram;
pub mod similarity;
pub mod spans;
pub mod stopwords;
pub mod subword;
pub mod tokenize;
pub mod unicode;
pub use flash::{FlashText, KeywordMatch};
pub use fold::{fold, strip_diacritics};
pub use html::decode_entities;
pub use spans::clean_span_boundary;
#[allow(deprecated)]
pub use subword::{BpeTokenizer, SubwordTokenizer, VocabTokenizer};
pub use tokenize::Token;
pub use tokenize::TokenRef;
pub use unicode::{nfc, nfkc};
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ScrubConfig {
pub normalize_newlines: bool,
pub remove_zero_width: bool,
pub remove_bidi_controls: bool,
pub collapse_whitespace: bool,
pub normalization: ScrubNormalization,
pub case: ScrubCase,
pub strip_diacritics: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ScrubNormalization {
None,
Nfc,
Nfkc,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ScrubCase {
None,
Lower,
#[cfg(feature = "casefold")]
NfkcCasefold,
}
impl ScrubConfig {
#[cfg(feature = "casefold")]
pub fn search_key() -> Self {
Self {
normalize_newlines: true,
remove_zero_width: false, remove_bidi_controls: true,
collapse_whitespace: true,
normalization: ScrubNormalization::Nfkc,
case: ScrubCase::NfkcCasefold,
strip_diacritics: true,
}
}
#[cfg(not(feature = "casefold"))]
pub fn search_key() -> Self {
Self {
normalize_newlines: true,
remove_zero_width: false, remove_bidi_controls: true,
collapse_whitespace: true,
normalization: ScrubNormalization::Nfkc,
case: ScrubCase::Lower,
strip_diacritics: true,
}
}
pub fn search_key_strict_invisibles() -> Self {
let mut cfg = Self::search_key();
cfg.remove_zero_width = true;
cfg
}
}
impl Default for ScrubConfig {
fn default() -> Self {
Self {
normalize_newlines: false,
remove_zero_width: false,
remove_bidi_controls: false,
collapse_whitespace: false,
normalization: ScrubNormalization::Nfc,
case: ScrubCase::Lower,
strip_diacritics: true,
}
}
}
pub fn scrub(text: &str) -> String {
scrub_with(text, &ScrubConfig::default())
}
pub fn scrub_with(text: &str, cfg: &ScrubConfig) -> String {
let mut s = text.to_string();
let mut buf = String::new();
if cfg.normalize_newlines {
unicode::normalize_newlines_into(&s, &mut buf);
std::mem::swap(&mut s, &mut buf);
}
if cfg.remove_zero_width {
unicode::remove_zero_width_into(&s, &mut buf);
std::mem::swap(&mut s, &mut buf);
}
if cfg.remove_bidi_controls {
unicode::remove_bidi_controls_into(&s, &mut buf);
std::mem::swap(&mut s, &mut buf);
}
s = match cfg.normalization {
ScrubNormalization::None => s,
ScrubNormalization::Nfc => unicode::nfc(&s),
ScrubNormalization::Nfkc => unicode::nfkc(&s),
};
s = match cfg.case {
ScrubCase::None => s,
ScrubCase::Lower => fold::fold(&s),
#[cfg(feature = "casefold")]
ScrubCase::NfkcCasefold => fold::fold_nfkc_casefold(&s),
};
if cfg.strip_diacritics {
s = fold::strip_diacritics(&s);
}
if cfg.collapse_whitespace {
unicode::collapse_whitespace_into(&s, &mut buf);
std::mem::swap(&mut s, &mut buf);
}
s
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_scrub() {
let text = "Müller";
assert_eq!(scrub(text), "muller");
}
#[test]
fn test_scrub_with_preserves_when_disabled() {
let cfg = ScrubConfig {
normalization: ScrubNormalization::None,
case: ScrubCase::None,
strip_diacritics: false,
..ScrubConfig::default()
};
assert_eq!(scrub_with("Müller", &cfg), "Müller");
}
#[test]
#[cfg(feature = "casefold")]
fn test_search_key_nfkc_casefold_expands_sharp_s() {
let cfg = ScrubConfig::search_key();
assert_eq!(scrub_with("Straße", &cfg), scrub_with("strasse", &cfg));
}
#[test]
fn test_search_key_strict_invisibles_removes_zwj_family() {
let cfg = ScrubConfig::search_key_strict_invisibles();
let text = "a\u{200B}b\u{200C}c\u{200D}d\u{2060}e\u{FEFF}f";
assert_eq!(scrub_with(text, &cfg), "abcdef");
}
#[test]
fn test_search_key_preserves_zwj_by_default() {
let cfg = ScrubConfig::search_key();
let text = "x\u{200D}y"; let out = scrub_with(text, &cfg);
assert_eq!(out, text);
}
#[test]
#[cfg(feature = "serde")]
fn test_serde_roundtrip_scrub_config() {
let cfg = ScrubConfig {
normalize_newlines: true,
remove_zero_width: false,
remove_bidi_controls: true,
collapse_whitespace: true,
normalization: ScrubNormalization::Nfkc,
case: ScrubCase::Lower,
strip_diacritics: true,
};
let s = serde_json::to_string(&cfg).expect("serialize");
let de: ScrubConfig = serde_json::from_str(&s).expect("deserialize");
assert_eq!(cfg.normalize_newlines, de.normalize_newlines);
assert_eq!(cfg.remove_zero_width, de.remove_zero_width);
assert_eq!(cfg.remove_bidi_controls, de.remove_bidi_controls);
assert_eq!(cfg.collapse_whitespace, de.collapse_whitespace);
assert_eq!(cfg.normalization, de.normalization);
assert_eq!(cfg.case, de.case);
assert_eq!(cfg.strip_diacritics, de.strip_diacritics);
}
#[test]
#[cfg(feature = "serde")]
fn test_serde_roundtrip_token() {
let t = Token {
text: "東京".to_string(),
start: 1,
end: 3,
};
let s = serde_json::to_string(&t).expect("serialize");
let de: Token = serde_json::from_str(&s).expect("deserialize");
assert_eq!(t, de);
}
#[test]
#[cfg(feature = "serde")]
fn test_serde_roundtrip_keyword_match() {
let m = KeywordMatch {
keyword: "Müller".to_string(),
value: "muller".to_string(),
start: 0,
end: 6,
};
let s = serde_json::to_string(&m).expect("serialize");
let de: KeywordMatch = serde_json::from_str(&s).expect("deserialize");
assert_eq!(m, de);
}
}