// txtfp 0.2.1 — text fingerprinting: MinHash + LSH, SimHash, and ONNX
// semantic embeddings.
//! UAX #29 grapheme-cluster tokenizer.
//!
//! Useful when the unit of comparison is the user-perceived character
//! rather than the word: emoji deduplication, character-level shingling
//! on languages without word boundaries, fuzzy matching of mixed-script
//! identifiers.

use alloc::borrow::Cow;
use alloc::boxed::Box;

use unicode_segmentation::UnicodeSegmentation;

use super::{TokenStream, Tokenizer};

/// Grapheme-cluster tokenizer (UAX #29 extended grapheme clusters).
///
/// Splits text into user-perceived characters. Family ZWJ sequences
/// (`πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦`), regional-indicator flag pairs (`πŸ‡ΊπŸ‡Έ`), and combining-mark
/// composites (`e + β—ŒΜ` β†’ `Γ©`) are each a single grapheme.
///
/// # Performance
///
/// Zero-sized (`Copy`), so it costs nothing to construct or pass around;
/// `Default` yields the same (only) value. The `for_each_token` impl walks
/// `graphemes(true)` directly and yields borrowed `&str` slices β€” no
/// allocation per token. The boxed-iterator `tokens` path allocates only
/// the box itself.
///
/// # Use cases
///
/// - Character-level shingling on languages without word boundaries
///   (Thai, Chinese, …).
/// - Emoji deduplication.
/// - Fuzzy matching of mixed-script identifiers where word boundaries
///   are ambiguous.
///
/// # Example
///
/// ```
/// use txtfp::{GraphemeTokenizer, Tokenizer};
///
/// let mut count = 0;
/// GraphemeTokenizer.for_each_token("πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§", &mut |_| count += 1);
/// assert_eq!(count, 1);                                              // family emoji = 1 grapheme
/// ```
#[derive(Default, Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct GraphemeTokenizer;

impl Tokenizer for GraphemeTokenizer {
    /// Boxed-iterator token stream over extended grapheme clusters.
    ///
    /// Tokens are borrowed slices of `input`; only the box allocates.
    fn tokens<'a>(&'a self, input: &'a str) -> TokenStream<'a> {
        // `graphemes(true)` selects extended grapheme clusters (vs legacy).
        let clusters = input.graphemes(true).filter(|g| !g.is_empty());
        TokenStream::Borrowed(Box::new(clusters))
    }

    #[inline]
    fn name(&self) -> Cow<'static, str> {
        // Stable identifier; callers may persist it alongside fingerprints.
        Cow::Borrowed("grapheme-uax29")
    }

    /// Allocation-free fast path: invoke `f` once per non-empty grapheme.
    #[inline]
    fn for_each_token(&self, input: &str, f: &mut dyn FnMut(&str)) {
        input
            .graphemes(true)
            .filter(|g| !g.is_empty())
            .for_each(|g| f(g));
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::String;
    use alloc::vec::Vec;

    /// Collect all tokens of `s` as owned `String`s via the boxed path.
    fn collect(s: &str) -> Vec<String> {
        GraphemeTokenizer
            .tokens(s)
            .into_string_iter()
            .collect::<Vec<_>>()
    }

    #[test]
    fn empty_input_yields_no_tokens() {
        // Edge case: the empty string must produce an empty stream,
        // not a single empty token.
        assert!(collect("").is_empty());
    }

    #[test]
    fn ascii_is_per_char() {
        assert_eq!(collect("abc"), ["a", "b", "c"]);
    }

    #[test]
    fn flag_emoji_is_one_grapheme() {
        // πŸ‡ΊπŸ‡Έ (regional indicator pair) should be one extended grapheme.
        let toks = collect("πŸ‡ΊπŸ‡Έ");
        assert_eq!(toks.len(), 1);
    }

    #[test]
    fn family_zwj_is_one_grapheme() {
        // πŸ‘¨β€πŸ‘©β€πŸ‘§ (man + ZWJ + woman + ZWJ + girl).
        let toks = collect("πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§");
        assert_eq!(toks.len(), 1);
    }

    #[test]
    fn combining_marks_glue_to_base() {
        // 'e' + combining acute = single grapheme.
        let toks = collect("e\u{0301}");
        assert_eq!(toks.len(), 1);
    }

    #[test]
    fn for_each_token_agrees_with_tokens() {
        // The allocation-free path must yield exactly the same sequence
        // as the boxed-iterator path.
        let input = "aΓ©\u{0301}πŸ‡ΊπŸ‡Έ x";
        let mut streamed = Vec::new();
        GraphemeTokenizer.for_each_token(input, &mut |t| streamed.push(String::from(t)));
        assert_eq!(streamed, collect(input));
    }

    #[test]
    fn name_is_stable() {
        assert_eq!(GraphemeTokenizer.name(), "grapheme-uax29");
    }
}