// llm_utils 0.0.11
//
// The best possible text chunker and text splitter and other text tools.
// Documentation
use regex::Regex;
use std::sync::LazyLock;

/// Policy for how runs of newline characters are rewritten when cleaning text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Newlines {
    /// Replace every run of one or more newlines with a single space.
    Space,
    /// Collapse every run of one or more newlines into a single newline.
    Single,
    /// Collapse every run of two or more newlines into exactly two newlines.
    /// This is the default policy.
    #[default]
    TwoPlus,
    /// Leave newline runs untouched.
    None,
}
/// Configuration for a text-cleaning pass.
///
/// The derived `Default` selects the default `Newlines` policy and leaves both
/// removal flags set to `false`.
#[derive(Default)]
pub struct TextCleaner {
    /// How runs of newline characters are rewritten.
    pub newlines: Newlines,
    /// When `true`, characters matched by `UNWANTED_CHARS_REGEX` are removed.
    pub remove_non_basic_ascii: bool,
    /// When `true`, text matched by `CITATIONS_REGEX` is removed.
    /// NOTE(review): the field name is misspelled ("remve"); renaming it would
    /// break callers that set the public field directly.
    pub remve_citations: bool,
}
impl TextCleaner {
    /// Creates a cleaner with the default configuration: runs of two or more
    /// newlines are collapsed to two, and nothing is removed.
    pub fn new() -> Self {
        Self::default()
    }

    /// Keeps newline runs as they are (line endings are still normalized).
    pub fn do_not_reduce_newlines(mut self) -> Self {
        self.newlines = Newlines::None;
        self
    }

    /// Replaces every run of one or more newlines with a single space.
    pub fn reduce_newlines_to_single_space(mut self) -> Self {
        self.newlines = Newlines::Space;
        self
    }

    /// Collapses every run of one or more newlines into a single newline.
    pub fn reduce_newlines_to_single_newline(mut self) -> Self {
        self.newlines = Newlines::Single;
        self
    }

    /// Collapses every run of two or more newlines into exactly two newlines.
    pub fn reduce_newlines_to_double_newline(mut self) -> Self {
        self.newlines = Newlines::TwoPlus;
        self
    }

    /// Enables removal of characters matched by `UNWANTED_CHARS_REGEX`.
    pub fn remove_non_basic_ascii(mut self) -> Self {
        self.remove_non_basic_ascii = true;
        self
    }

    /// Enables removal of bracketed numeric citations matched by
    /// `CITATIONS_REGEX`.
    pub fn remove_citations(mut self) -> Self {
        // The field name carries a historical typo; it is public, so it stays.
        self.remve_citations = true;
        self
    }

    /// Cleans `text` according to this configuration and returns a new string.
    ///
    /// Steps, in order:
    /// 1. Line-ending, paragraph-separator and whitespace sequences are
    ///    normalized to `"\n"`, `"\n\n"` and `" "` respectively.
    /// 2. Unwanted characters and citations are removed, if enabled.
    /// 3. Newline runs are rewritten according to the configured policy.
    /// 4. Runs of spaces are collapsed and the result is trimmed.
    pub fn run(&self, text: &str) -> String {
        let text = END_OF_LINE_REGEX.replace_all(text, "\n");
        let text = END_OF_PARAGRAPH_REGEX.replace_all(&text, "\n\n");
        let text = WHITE_SPACE_REGEX.replace_all(&text, " ");

        // Removals run BEFORE newline reduction: deleting content that sits
        // between newlines (e.g. "a\n¡\nb") joins the surrounding newlines
        // into a longer run, which must still be subject to the policy below.
        let text = if self.remove_non_basic_ascii {
            UNWANTED_CHARS_REGEX.replace_all(&text, "")
        } else {
            text
        };

        let text = if self.remve_citations {
            CITATIONS_REGEX.replace_all(&text, "")
        } else {
            text
        };

        let text = match self.newlines {
            Newlines::Space => SINGLE_NEWLINE_REGEX.replace_all(&text, " "),
            Newlines::Single => SINGLE_NEWLINE_REGEX.replace_all(&text, "\n"),
            Newlines::TwoPlus => TWO_PLUS_NEWLINE_REGEX.replace_all(&text, "\n\n"),
            Newlines::None => text,
        };

        // Removals and the `Space` policy can leave adjacent spaces behind.
        SINGLE_SPACE_REGEX
            .replace_all(&text, " ")
            .trim()
            .to_string()
    }
}

/// Normalizes line endings, paragraph separators and whitespace variants.
///
/// Line-ending sequences become `"\n"`, paragraph separators become `"\n\n"`,
/// and every whitespace sequence matched by `WHITE_SPACE_REGEX` becomes a
/// single ASCII space. The result is not trimmed.
pub fn normalize_whitespace(text: &str) -> String {
    let lines_unified = END_OF_LINE_REGEX.replace_all(text, "\n");
    let paragraphs_unified = END_OF_PARAGRAPH_REGEX.replace_all(&lines_unified, "\n\n");
    let spaces_unified = WHITE_SPACE_REGEX.replace_all(&paragraphs_unified, " ");
    spaces_unified.into_owned()
}

/// Deletes every character run matched by `UNWANTED_CHARS_REGEX` and trims
/// leading and trailing whitespace from what remains.
pub fn strip_unwanted_chars(text: &str) -> String {
    let kept = UNWANTED_CHARS_REGEX.replace_all(text, "");
    kept.trim().to_owned()
}

/// Collapses runs of spaces to one space and runs of newlines to one newline,
/// then trims leading and trailing whitespace.
pub fn reduce_to_single_whitespace(text: &str) -> String {
    let spaces_collapsed = SINGLE_SPACE_REGEX.replace_all(text, " ");
    let newlines_collapsed = SINGLE_NEWLINE_REGEX.replace_all(&spaces_collapsed, "\n");
    String::from(newlines_collapsed.trim())
}

//
// Newlines
//
/// Regex fragments for sequences treated as a line ending.
///
/// Most fragments match both the real control character and its escaped,
/// spelled-out form (a backslash followed by a letter). The fragments are
/// joined with `|`, so their order is significant.
pub static END_OF_LINE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // --- ASCII ---
        // Windows CRLF: has to precede the lone-CR entry so the pair is
        // consumed as one match rather than as a CR followed by a newline.
        r"(\\r\\n|\r\n)",
        // Lone carriage return (MacOS).
        r"(\\r|\r)",
        // Vertical tab.
        r"(\\v|\v)",
        // Form feed.
        r"(\\f|\f)",
        // Spelled-out newline: a backslash followed by `n`.
        r"\\n",
        // --- Unicode ---
        // U+2028 LINE SEPARATOR.
        r"\u{2028}",
    ])
});
/// Matches any one of the line-ending sequences in `END_OF_LINE_SEQUENCES`.
pub static END_OF_LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    // Alternatives keep the list order, so CRLF is tried before a lone CR.
    let alternation = END_OF_LINE_SEQUENCES.join("|");
    Regex::new(&alternation).unwrap()
});
/// Matches a run of one or more consecutive newline characters.
pub static SINGLE_NEWLINE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{1,}").unwrap());

//
// Paragraphs
//
/// Regex fragments for sequences treated as a paragraph break.
pub static END_OF_PARAGRAPH_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    // --- Unicode ---
    // U+2029 PARAGRAPH SEPARATOR is the only entry.
    [r"\u{2029}"].to_vec()
});
/// Matches any one of the paragraph-break sequences in
/// `END_OF_PARAGRAPH_SEQUENCES`.
pub static END_OF_PARAGRAPH_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    let alternation = END_OF_PARAGRAPH_SEQUENCES.join("|");
    Regex::new(&alternation).unwrap()
});
/// Matches a run of two or more consecutive newline characters.
pub static TWO_PLUS_NEWLINE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\n{2,}").unwrap());

//
// White space
//
/// Regex fragments for sequences treated as horizontal white space.
///
/// Each match is replaced by a single ASCII space by the functions in this
/// file that use `WHITE_SPACE_REGEX`.
pub static WHITE_SPACE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // --- ASCII ---
        // Spelled-out `\s`: a backslash followed by `s`.
        r"\\s",
        // Tab, either spelled out (backslash + `t`) or the real character.
        r"(\\t|\t)",
        // --- Unicode ---
        r"\u{0020}", // plain space
        r"\u{00A0}", // no-break space
        r"\u{1680}",
        r"\u{2000}",
        r"\u{2001}",
        r"\u{2002}",
        r"\u{2003}",
        r"\u{2004}",
        r"\u{2005}",
        r"\u{2006}",
        r"\u{2007}",
        r"\u{2008}",
        r"\u{2009}",
        r"\u{200A}",
        // Also listed in END_OF_LINE_SEQUENCES; callers in this file apply
        // the end-of-line regex first, so it is rewritten there.
        r"\u{2028}",
        r"\u{202F}",
        r"\u{205F}",
        r"\u{3000}",
        r"\u{0009}", // tab again, by code point
    ])
});

/// Matches any one of the white-space sequences in `WHITE_SPACE_SEQUENCES`.
pub static WHITE_SPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    let alternation = WHITE_SPACE_SEQUENCES.join("|");
    Regex::new(&alternation).unwrap()
});
/// Matches a run of one or more consecutive ASCII space characters.
pub static SINGLE_SPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" {1,}").unwrap());

//
// Unwanted characters
//
/// Matches runs of characters outside the allowed set: ASCII letters and
/// digits, the punctuation listed in the character class, and whitespace
/// (`\s`).
pub static UNWANTED_CHARS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"[^a-zA-Z0-9.,?!:;'\"\-\(\)\[\]\{\}$&@#%^*()\s]+"#).unwrap());
/// Matches a bracketed number of one to three digits, e.g. `[7]` or `[123]`.
pub static CITATIONS_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\d{1,3}\]").unwrap());

#[cfg(test)]
mod tests {
    use super::*;

    // Exercises `normalize_whitespace` on spaces, line endings and paragraphs.
    #[test]
    fn test_normalize_whitespace() {
        // A real tab, a no-break space, and spelled-out `\s` / `\t` all become
        // a plain space.
        let ascii_text = "Ascii\tspaces here. Unicode\u{00A0}spaces here. Literal\\sspaces\\t.";
        let ascii_result = "Ascii spaces here. Unicode spaces here. Literal spaces .";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);
        // Real newlines are kept; U+2028 and spelled-out `\n` become newlines.
        let ascii_text = "Ascii\nnewlines
. Unicode\u{2028}newlines.
. Literal\\nnewlines.\\n";
        let ascii_result = "Ascii\nnewlines\n. Unicode\nnewlines.\n. Literal\nnewlines.\n";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);
        // CRLF pairs (real or spelled out) become `\n`; U+2029 becomes `\n\n`.
        let ascii_text = "Ascii\n\nparagraphs\r\n\r\n.Unicode\u{2029}paragraphs.
 Literal\\n\\nparagraphs.\\r\\n\\r\\n";
        let ascii_result =
            "Ascii\n\nparagraphs\n\n.Unicode\n\nparagraphs.\n\n Literal\n\nparagraphs.\n\n";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);
    }

    // With the single-space policy every newline run turns into one space,
    // adjacent spaces are merged, and the result is trimmed.
    #[test]
    fn test_clean_to_single_spaces() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n And\nof course, newlines.\n\n";
        let ascii_result = "Ascii spaces here. Unicode spaces here. And of course, newlines.";
        assert_eq!(
            TextCleaner::new()
                .reduce_newlines_to_single_space()
                .run(ascii_text),
            ascii_result
        );
    }

    // With the single-newline policy a double newline collapses to one while
    // existing single newlines are preserved.
    #[test]
    fn test_clean_to_single_newlines() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\nAnd of course, newlines.\n\nCool.";
        let ascii_result =
            "Ascii spaces here. Unicode spaces here.\nAnd of course, newlines.\nCool.";
        assert_eq!(
            TextCleaner::new()
                .reduce_newlines_to_single_newline()
                .run(ascii_text),
            ascii_result
        );
    }

    // With the double-newline policy runs of two or more newlines (including
    // those produced from CRLF pairs and U+2029) end up as exactly two.
    #[test]
    fn test_clean_to_double_newlines() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n\nAscii\n\nparagraphs.\r\n\r\nUnicode\u{2029}paragraphs.
Cool.";
        let ascii_result =
            "Ascii spaces here. Unicode spaces here.\n\nAscii\n\nparagraphs.\n\nUnicode\n\nparagraphs.\n\nCool.";
        assert_eq!(
            TextCleaner::new()
                .reduce_newlines_to_double_newline()
                .run(ascii_text),
            ascii_result
        );
    }

    // With removal of non-basic characters enabled, the allowed ASCII
    // punctuation survives and the trailing run of non-ASCII symbols is
    // dropped; the final trim removes the space left before them.
    #[test]
    fn test_strip_unwanted_chars() {
        let ascii_text = r#"This is a "test" sentence. It include's 'single' and "double" quotes, as well as other basic punctuation characters like commas, periods, question marks?, exclamation marks!, colons:, semicolons;, hyphens-, parentheses(), square brackets[], curly braces{}, and special characters $&@#%^*(). It also includes some advanced punctuation characters that should be removed, such as ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"#;
        let ascii_result = r#"This is a "test" sentence. It include's 'single' and "double" quotes, as well as other basic punctuation characters like commas, periods, question marks?, exclamation marks!, colons:, semicolons;, hyphens-, parentheses(), square brackets[], curly braces{}, and special characters $&@#%^*(). It also includes some advanced punctuation characters that should be removed, such as"#;
        assert_eq!(
            TextCleaner::new()
                .do_not_reduce_newlines()
                .remove_non_basic_ascii()
                .run(ascii_text),
            ascii_result
        );
    }
}