//! Text scrubbing utilities: punctuation normalization, contraction
//! expansion, and whitespace collapsing.

use unicode_segmentation::UnicodeSegmentation;

use crate::domain::types::StyloProfile;

#[must_use]
/// Replaces common typographic punctuation with ASCII equivalents.
///
/// Curly quotes become straight quotes, en/em dashes become `--`,
/// the horizontal ellipsis becomes `...`, and the no-break space
/// becomes a plain space. All other characters pass through unchanged.
pub fn normalize_punctuation(text: &str) -> String {
    /// ASCII substitution for a typographic character, if one applies.
    fn ascii_form(ch: char) -> Option<&'static str> {
        match ch {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => Some("'"),
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => Some("\""),
            '\u{2013}' | '\u{2014}' => Some("--"),
            '\u{2026}' => Some("..."),
            '\u{00A0}' => Some(" "),
            _ => None,
        }
    }

    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        match ascii_form(ch) {
            Some(replacement) => normalized.push_str(replacement),
            None => normalized.push(ch),
        }
    }
    normalized
}

30#[must_use]
32pub fn expand_contractions(text: &str) -> String {
33 let mut result = String::with_capacity(text.len());
35 let mut last_end = 0;
36
37 for (start, word) in text.unicode_word_indices() {
38 result.push_str(&text[last_end..start]);
40 last_end = start + word.len();
41
42 let lower = word.to_lowercase();
44 let expansion = match lower.as_str() {
45 "don't" => Some("do not"),
46 "doesn't" => Some("does not"),
47 "didn't" => Some("did not"),
48 "won't" => Some("will not"),
49 "wouldn't" => Some("would not"),
50 "couldn't" => Some("could not"),
51 "shouldn't" => Some("should not"),
52 "isn't" => Some("is not"),
53 "aren't" => Some("are not"),
54 "wasn't" => Some("was not"),
55 "weren't" => Some("were not"),
56 "haven't" => Some("have not"),
57 "hasn't" => Some("has not"),
58 "hadn't" => Some("had not"),
59 "can't" => Some("cannot"),
60 "it's" => Some("it is"),
61 "i'm" => Some("I am"),
62 "i've" => Some("I have"),
63 "i'll" => Some("I will"),
64 "i'd" => Some("I would"),
65 "we're" => Some("we are"),
66 "we've" => Some("we have"),
67 "we'll" => Some("we will"),
68 "they're" => Some("they are"),
69 "they've" => Some("they have"),
70 "they'll" => Some("they will"),
71 "you're" => Some("you are"),
72 "you've" => Some("you have"),
73 "you'll" => Some("you will"),
74 "he's" => Some("he is"),
75 "she's" => Some("she is"),
76 "that's" => Some("that is"),
77 "there's" => Some("there is"),
78 "here's" => Some("here is"),
79 "what's" => Some("what is"),
80 "who's" => Some("who is"),
81 "let's" => Some("let us"),
82 _ => None,
83 };
84
85 if let Some(expanded) = expansion {
86 result.push_str(expanded);
87 } else {
88 result.push_str(word);
89 }
90 }
91
92 if last_end < text.len() {
94 result.push_str(&text[last_end..]);
95 }
96
97 result
98}
99
100fn word_count(sentence: &str) -> usize {
102 sentence.unicode_words().count()
103}
104
105#[must_use]
112pub fn scrub_text(text: &str, profile: &StyloProfile) -> String {
113 if text.is_empty() {
114 return String::new();
115 }
116
117 let mut result = text.to_owned();
118
119 if profile.normalize_punctuation {
121 result = normalize_punctuation(&result);
122 }
123
124 result = expand_contractions(&result);
126
127 result = collapse_whitespace(&result);
129
130 result
131}
132
/// Collapses every run of Unicode whitespace into a single ASCII space
/// and strips leading/trailing whitespace entirely.
fn collapse_whitespace(text: &str) -> String {
    // `split_whitespace` uses the same `char::is_whitespace` predicate as
    // the manual scan it replaces, and already skips leading/trailing runs.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}

155#[must_use]
157pub fn average_sentence_length(text: &str) -> f64 {
158 let sentences: Vec<&str> = text.split_sentence_bounds().collect();
159 let sentence_words: Vec<usize> = sentences
160 .iter()
161 .map(|s| word_count(s))
162 .filter(|&count| count > 0)
163 .collect();
164
165 if sentence_words.is_empty() {
166 return 0.0;
167 }
168
169 let total_words: usize = sentence_words.iter().sum();
170 #[expect(
171 clippy::cast_precision_loss,
172 reason = "sentence counts never exceed 2^52"
173 )]
174 {
175 total_words as f64 / sentence_words.len() as f64
176 }
177}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the profile shared by the scrub tests; only the
    /// punctuation flag varies between cases.
    fn profile(normalize_punctuation: bool) -> StyloProfile {
        StyloProfile {
            target_vocab_size: 5000,
            target_avg_sentence_len: 15.0,
            normalize_punctuation,
        }
    }

    #[test]
    fn punctuation_normalisation_smart_quotes() {
        assert_eq!(
            normalize_punctuation("\u{201C}Hello,\u{201D} she said. \u{2018}Goodbye.\u{2019}"),
            "\"Hello,\" she said. 'Goodbye.'"
        );
    }

    #[test]
    fn punctuation_normalisation_em_dashes() {
        assert_eq!(normalize_punctuation("word\u{2014}another"), "word--another");
    }

    #[test]
    fn punctuation_normalisation_ellipsis() {
        assert_eq!(normalize_punctuation("wait\u{2026}"), "wait...");
    }

    #[test]
    fn contraction_expansion() {
        let expanded = expand_contractions("I can't believe they're here.");
        assert!(expanded.contains("cannot"));
        assert!(expanded.contains("they are"));
    }

    #[test]
    fn scrub_idempotent() {
        // Scrubbing already-scrubbed text must be a no-op.
        let once = scrub_text("Don't worry\u{2014}it's fine!", &profile(true));
        let twice = scrub_text(&once, &profile(true));
        assert_eq!(once, twice);
    }

    #[test]
    fn non_latin_passes_through() {
        let arabic = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627} \u{0628}\u{0627}\u{0644}\u{0639}\u{0627}\u{0644}\u{0645}";
        assert!(!scrub_text(arabic, &profile(true)).is_empty());
    }

    #[test]
    fn chinese_passes_through() {
        let chinese = "\u{4F60}\u{597D}\u{4E16}\u{754C}";
        assert_eq!(scrub_text(chinese, &profile(false)), chinese);
    }

    #[test]
    fn whitespace_collapse() {
        assert_eq!(collapse_whitespace("  hello   world  "), "hello world");
    }

    #[test]
    fn average_sentence_length_basic() {
        assert!(average_sentence_length("Hello world. This is a test.") > 1.0);
    }

    #[test]
    fn empty_text_scrubs_to_empty() {
        assert!(scrub_text("", &profile(true)).is_empty());
    }
}