Skip to main content

shadowforge_lib/domain/scrubber/
mod.rs

1//! Linguistic stylometric fingerprint scrubbing.
2//!
3//! Deterministic statistical normalisation of text to destroy authorship
4//! attribution fingerprints. No LLM, no network calls — purely local
5//! text transformation.
6
7use unicode_segmentation::UnicodeSegmentation;
8
9use crate::domain::types::StyloProfile;
10
/// Normalise punctuation to ASCII equivalents.
///
/// Smart quotes → ASCII, em-dashes → `--`, ellipses → `...`.
#[must_use]
pub fn normalize_punctuation(text: &str) -> String {
    // Fold over chars so multi-char replacements (dashes, ellipsis) can
    // append more than one output character per input character.
    text.chars()
        .fold(String::with_capacity(text.len()), |mut out, ch| {
            match ch {
                // Single curly/low-9 quotes → ASCII apostrophe.
                '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => out.push('\''),
                // Double curly/low-9 quotes → ASCII double quote.
                '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => out.push('"'),
                // En/em dash → double hyphen.
                '\u{2013}' | '\u{2014}' => out.push_str("--"),
                // Horizontal ellipsis → three dots.
                '\u{2026}' => out.push_str("..."),
                // Non-breaking space → plain space.
                '\u{00A0}' => out.push(' '),
                other => out.push(other),
            }
            out
        })
}
29
30/// Expand common English contractions to full forms.
31#[must_use]
32pub fn expand_contractions(text: &str) -> String {
33    // Process word by word, preserving whitespace structure
34    let mut result = String::with_capacity(text.len());
35    let mut last_end = 0;
36
37    for (start, word) in text.unicode_word_indices() {
38        // Preserve any whitespace/punctuation before this word
39        result.push_str(&text[last_end..start]);
40        last_end = start + word.len();
41
42        // Check for contractions in the surrounding context
43        let lower = word.to_lowercase();
44        let expansion = match lower.as_str() {
45            "don't" => Some("do not"),
46            "doesn't" => Some("does not"),
47            "didn't" => Some("did not"),
48            "won't" => Some("will not"),
49            "wouldn't" => Some("would not"),
50            "couldn't" => Some("could not"),
51            "shouldn't" => Some("should not"),
52            "isn't" => Some("is not"),
53            "aren't" => Some("are not"),
54            "wasn't" => Some("was not"),
55            "weren't" => Some("were not"),
56            "haven't" => Some("have not"),
57            "hasn't" => Some("has not"),
58            "hadn't" => Some("had not"),
59            "can't" => Some("cannot"),
60            "it's" => Some("it is"),
61            "i'm" => Some("I am"),
62            "i've" => Some("I have"),
63            "i'll" => Some("I will"),
64            "i'd" => Some("I would"),
65            "we're" => Some("we are"),
66            "we've" => Some("we have"),
67            "we'll" => Some("we will"),
68            "they're" => Some("they are"),
69            "they've" => Some("they have"),
70            "they'll" => Some("they will"),
71            "you're" => Some("you are"),
72            "you've" => Some("you have"),
73            "you'll" => Some("you will"),
74            "he's" => Some("he is"),
75            "she's" => Some("she is"),
76            "that's" => Some("that is"),
77            "there's" => Some("there is"),
78            "here's" => Some("here is"),
79            "what's" => Some("what is"),
80            "who's" => Some("who is"),
81            "let's" => Some("let us"),
82            _ => None,
83        };
84
85        if let Some(expanded) = expansion {
86            result.push_str(expanded);
87        } else {
88            result.push_str(word);
89        }
90    }
91
92    // Append any trailing text after the last word
93    if last_end < text.len() {
94        result.push_str(&text[last_end..]);
95    }
96
97    result
98}
99
/// Count the words in a sentence using Unicode word boundaries.
///
/// Delegates to `unicode_words` (UAX #29 default word segmentation);
/// punctuation- or whitespace-only input yields 0.
fn word_count(sentence: &str) -> usize {
    sentence.unicode_words().count()
}
104
105/// Apply all scrubbing transformations to the input text.
106///
107/// Operations applied:
108/// 1. Punctuation normalisation (if `profile.normalize_punctuation`)
109/// 2. Contraction expansion
110/// 3. Whitespace normalisation
111#[must_use]
112pub fn scrub_text(text: &str, profile: &StyloProfile) -> String {
113    if text.is_empty() {
114        return String::new();
115    }
116
117    let mut result = text.to_owned();
118
119    // Step 1: Normalise punctuation
120    if profile.normalize_punctuation {
121        result = normalize_punctuation(&result);
122    }
123
124    // Step 2: Expand contractions
125    result = expand_contractions(&result);
126
127    // Step 3: Normalise whitespace (collapse runs of spaces)
128    result = collapse_whitespace(&result);
129
130    result
131}
132
/// Collapse runs of whitespace into single spaces, trimming leading/trailing.
fn collapse_whitespace(text: &str) -> String {
    // `split_whitespace` already skips leading/trailing whitespace and
    // treats any run of Unicode whitespace as a single separator, so
    // rejoining with a single space gives the collapsed, trimmed form.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
154
155/// Compute the average sentence length in words for the given text.
156#[must_use]
157pub fn average_sentence_length(text: &str) -> f64 {
158    let sentences: Vec<&str> = text.split_sentence_bounds().collect();
159    let sentence_words: Vec<usize> = sentences
160        .iter()
161        .map(|s| word_count(s))
162        .filter(|&count| count > 0)
163        .collect();
164
165    if sentence_words.is_empty() {
166        return 0.0;
167    }
168
169    let total_words: usize = sentence_words.iter().sum();
170    #[expect(
171        clippy::cast_precision_loss,
172        reason = "sentence counts never exceed 2^52"
173    )]
174    {
175        total_words as f64 / sentence_words.len() as f64
176    }
177}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `StyloProfile` with the standard fixture values.
    fn test_profile(normalize: bool) -> StyloProfile {
        StyloProfile {
            target_vocab_size: 5000,
            target_avg_sentence_len: 15.0,
            normalize_punctuation: normalize,
        }
    }

    #[test]
    fn punctuation_normalisation_smart_quotes() {
        let output =
            normalize_punctuation("\u{201C}Hello,\u{201D} she said. \u{2018}Goodbye.\u{2019}");
        assert_eq!(output, "\"Hello,\" she said. 'Goodbye.'");
    }

    #[test]
    fn punctuation_normalisation_em_dashes() {
        assert_eq!(normalize_punctuation("word\u{2014}another"), "word--another");
    }

    #[test]
    fn punctuation_normalisation_ellipsis() {
        assert_eq!(normalize_punctuation("wait\u{2026}"), "wait...");
    }

    #[test]
    fn contraction_expansion() {
        let output = expand_contractions("I can't believe they're here.");
        assert!(output.contains("cannot"));
        assert!(output.contains("they are"));
    }

    #[test]
    fn scrub_idempotent() {
        // Scrubbing already-scrubbed text must be a no-op.
        let profile = test_profile(true);
        let once = scrub_text("Don't worry\u{2014}it's fine!", &profile);
        let twice = scrub_text(&once, &profile);
        assert_eq!(once, twice);
    }

    #[test]
    fn non_latin_passes_through() {
        let arabic = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627} \u{0628}\u{0627}\u{0644}\u{0639}\u{0627}\u{0644}\u{0645}";
        // Should pass through without panic or data loss
        let output = scrub_text(arabic, &test_profile(true));
        assert!(!output.is_empty());
    }

    #[test]
    fn chinese_passes_through() {
        let chinese = "\u{4F60}\u{597D}\u{4E16}\u{754C}";
        assert_eq!(scrub_text(chinese, &test_profile(false)), chinese);
    }

    #[test]
    fn whitespace_collapse() {
        assert_eq!(collapse_whitespace("  hello   world  "), "hello world");
    }

    #[test]
    fn average_sentence_length_basic() {
        assert!(average_sentence_length("Hello world. This is a test.") > 1.0);
    }

    #[test]
    fn empty_text_scrubs_to_empty() {
        assert!(scrub_text("", &test_profile(true)).is_empty());
    }
}
274}