1use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::HashSet;
9use std::sync::OnceLock;
10use tracing::info;
11
12#[cfg(test)]
13mod tests;
14
/// Configuration for conversation preprocessing.
///
/// Every field carries a serde default (see the `default_*` helpers below) so
/// partially-specified configs deserialize without error.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreprocessingConfig {
    /// Strip tool-call / tool-result markup blocks from message content.
    #[serde(default = "default_true")]
    pub remove_tool_artifacts: bool,

    /// Strip CLI noise (git, cargo, npm, file listings, tree output).
    #[serde(default = "default_true")]
    pub remove_cli_output: bool,

    /// Redact metadata such as UUIDs, timestamps, session ids, and file paths.
    #[serde(default = "default_true")]
    pub remove_metadata: bool,

    /// Minimum cleaned-content length in bytes; shorter messages are dropped.
    #[serde(default = "default_min_length")]
    pub min_content_length: usize,

    /// Jaccard word-similarity at or above which a message counts as a
    /// duplicate. Values >= 1.0 disable deduplication entirely.
    #[serde(default = "default_dedupe_threshold")]
    pub dedupe_threshold: f32,

    /// Drop messages whose cleaned content is empty, and strip empty-JSON /
    /// placeholder fragments during cleaning.
    #[serde(default = "default_true")]
    pub remove_empty_content: bool,

    /// NOTE(review): no code in this file reads this flag — confirm it is
    /// wired up elsewhere before relying on it.
    #[serde(default = "default_true")]
    pub remove_duplicate_headers: bool,
}
46
/// Serde default: `true` (shared by the boolean config fields).
fn default_true() -> bool {
    true
}
50
/// Serde default for `min_content_length`, in bytes.
fn default_min_length() -> usize {
    50
}
54
/// Serde default for `dedupe_threshold` (Jaccard word similarity).
fn default_dedupe_threshold() -> f32 {
    0.95
}
58
59impl Default for PreprocessingConfig {
60 fn default() -> Self {
61 Self {
62 remove_tool_artifacts: true,
63 remove_cli_output: true,
64 remove_metadata: false,
67 min_content_length: 50,
68 dedupe_threshold: 0.95,
69 remove_empty_content: true,
70 remove_duplicate_headers: true,
71 }
72 }
73}
74
/// Pre-compiled regexes used throughout preprocessing; built exactly once via
/// [`preprocessing_regexes`] and shared for the lifetime of the process.
struct PreprocessingRegexes {
    // Tool-call / tool-result markup blocks (multiline, non-greedy).
    function_calls_block: Regex,
    antml_invoke_block: Regex,
    antml_parameter_block: Regex,
    function_results_block: Regex,
    result_block: Regex,
    tool_output_tags: Regex,
    // CLI noise, matched line-by-line.
    git_status_output: Regex,
    git_diff_output: Regex,
    cargo_output: Regex,
    npm_output: Regex,
    file_listing: Regex,
    tree_output: Regex,
    // Metadata to redact or strip.
    uuid_pattern: Regex,
    timestamp_iso: Regex,
    unix_timestamp: Regex,
    session_id_pattern: Regex,
    file_path_metadata: Regex,
    // Empty / placeholder content fragments.
    empty_content_json: Regex,
    empty_text_json: Regex,
    placeholder_message: Regex,
    // Whitespace normalization.
    multiple_newlines: Regex,
    multiple_spaces: Regex,
}
99
100fn compile_static_regex(pattern: &str, label: &str) -> Regex {
101 Regex::new(pattern).unwrap_or_else(|err| panic!("invalid preprocessing regex '{label}': {err}"))
102}
103
/// Lazily compile the shared regex set exactly once.
///
/// Tag names are assembled with `format!` so the literal markup strings never
/// appear verbatim in this source file (otherwise this file's own content
/// would be mangled if run through the preprocessor).
fn preprocessing_regexes() -> &'static PreprocessingRegexes {
    static REGEXES: OnceLock<PreprocessingRegexes> = OnceLock::new();
    REGEXES.get_or_init(|| PreprocessingRegexes {
        function_calls_block: compile_static_regex(
            &format!(
                r"(?s)<{}>{}</{}>",
                "function_calls", r".*?", "function_calls"
            ),
            "function_calls_block",
        ),
        antml_invoke_block: compile_static_regex(
            &format!(
                r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
                "antml", "invoke", "antml", "invoke"
            ),
            "antml_invoke_block",
        ),
        antml_parameter_block: compile_static_regex(
            &format!(
                r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
                "antml", "parameter", "antml", "parameter"
            ),
            "antml_parameter_block",
        ),
        function_results_block: compile_static_regex(
            &format!(
                r"(?s)<{}>{}</{}>",
                "function_results", r".*?", "function_results"
            ),
            "function_results_block",
        ),
        result_block: compile_static_regex(
            &format!(r"(?s)<{}>{}</{}>", "result", r".*?", "result"),
            "result_block",
        ),
        tool_output_tags: compile_static_regex(
            r"(?s)<(output|name|value)>.*?</(output|name|value)>",
            "tool_output_tags",
        ),
        // Line-anchored git status/diff noise.
        git_status_output: compile_static_regex(
            r"(?m)^\s*(On branch|Your branch|Changes (?:not staged|to be committed)|Untracked files|nothing to commit|modified:|new file:|deleted:).*$",
            "git_status_output",
        ),
        git_diff_output: compile_static_regex(
            r"(?m)^(diff --git|index [0-9a-f]+\.\.[0-9a-f]+|--- a/|--- /|\+\+\+ a/|\+\+\+ b/|@@\s*-\d+.*@@|Binary files).*$",
            "git_diff_output",
        ),
        // cargo build/check diagnostics and progress lines.
        cargo_output: compile_static_regex(
            r"(?m)^(\s*(Compiling|Finished|Running|warning:|error\[E|-->|note:|help:)).*$",
            "cargo_output",
        ),
        npm_output: compile_static_regex(
            r"(?m)^(npm (WARN|ERR!|notice)|added \d+ packages|up to date|audited \d+ packages).*$",
            "npm_output",
        ),
        // `ls -l`-style listings: totals line or permission-bits columns.
        file_listing: compile_static_regex(
            r"(?m)^(total \d+|[drwx-]{10}\s+\d+|[-lrwx]{10}\s+\d+).*$",
            "file_listing",
        ),
        tree_output: compile_static_regex(r"(?m)^[│├└─\s]+[\w.-]+/?$", "tree_output"),
        uuid_pattern: compile_static_regex(
            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            "uuid_pattern",
        ),
        timestamp_iso: compile_static_regex(
            r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?",
            "timestamp_iso",
        ),
        // 10-digit epoch seconds starting 16/17 (roughly 2020-2026 era).
        unix_timestamp: compile_static_regex(r"\b1[6-7]\d{8}\b", "unix_timestamp"),
        session_id_pattern: compile_static_regex(
            r#"(session_id|sessionId|session-id|conv_id|conversation_id)["']?\s*[:=]\s*["']?[\w-]+"#,
            "session_id_pattern",
        ),
        file_path_metadata: compile_static_regex(
            r#""(path|file_path|filepath)"\s*:\s*"[^"]+""#,
            "file_path_metadata",
        ),
        empty_content_json: compile_static_regex(
            r#""content"\s*:\s*\[\s*\]"#,
            "empty_content_json",
        ),
        empty_text_json: compile_static_regex(r#""text"\s*:\s*"""#, "empty_text_json"),
        placeholder_message: compile_static_regex(
            r"(?i)(placeholder|lorem ipsum|TODO:|FIXME:|XXX:)",
            "placeholder_message",
        ),
        // Collapse 3+ newlines / 2+ spaces during final normalization.
        multiple_newlines: compile_static_regex(r"\n{3,}", "multiple_newlines"),
        multiple_spaces: compile_static_regex(r" {2,}", "multiple_spaces"),
    })
}
194
/// Hash a string after case-folding and collapsing all whitespace runs, so
/// superficially different copies of the same text hash identically.
fn content_hash(s: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Rebuild the text as lowercase words separated by single spaces.
    let mut normalized = String::with_capacity(s.len());
    for word in s.to_lowercase().split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    let mut hasher = DefaultHasher::new();
    normalized.hash(&mut hasher);
    hasher.finish()
}
210
/// Jaccard similarity over whitespace-separated words.
///
/// Returns a value in `[0.0, 1.0]`. Two empty strings are defined as a
/// perfect match (1.0). Comparison is case-sensitive, unlike the
/// case-folding done by `content_hash`.
fn content_similarity(a: &str, b: &str) -> f32 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    // Both empty: define as identical rather than computing 0/0.
    if words_a.is_empty() && words_b.is_empty() {
        return 1.0;
    }

    let intersection = words_a.intersection(&words_b).count();
    // At least one set is non-empty here, so the union is never zero; the
    // previous `union == 0` guard was unreachable and has been removed.
    let union = words_a.union(&words_b).count();

    intersection as f32 / union as f32
}
229
/// A single conversation message flowing through the preprocessor.
#[derive(Debug, Clone)]
pub struct Message {
    /// Speaker role label (passed through unchanged).
    pub role: String,
    /// Raw message text; replaced by cleaned text after filtering.
    pub content: String,
    /// Optional free-form metadata, carried through preprocessing untouched.
    pub metadata: Option<serde_json::Value>,
}
237
/// Per-category counters describing what `filter_conversation` removed.
#[derive(Debug, Clone, Default)]
pub struct PreprocessingStats {
    // Number of messages fed in.
    pub total_input: usize,
    // Dropped because the content was mostly tool-call markup.
    pub filtered_tool_artifacts: usize,
    // Dropped because the content was mostly CLI output.
    pub filtered_cli_output: usize,
    // NOTE(review): never incremented in this file — confirm intended use.
    pub filtered_metadata: usize,
    // Dropped because the cleaned content was empty.
    pub filtered_empty: usize,
    // Dropped as near-duplicates of earlier kept messages.
    pub filtered_duplicates: usize,
    // Dropped because the cleaned content was shorter than the minimum.
    pub filtered_below_min_length: usize,
    // Number of messages that survived.
    pub total_output: usize,
}
250
251impl PreprocessingStats {
252 pub fn total_filtered(&self) -> usize {
253 self.filtered_tool_artifacts
254 + self.filtered_cli_output
255 + self.filtered_metadata
256 + self.filtered_empty
257 + self.filtered_duplicates
258 + self.filtered_below_min_length
259 }
260
261 pub fn filter_rate(&self) -> f32 {
262 if self.total_input == 0 {
263 return 0.0;
264 }
265 self.total_filtered() as f32 / self.total_input as f32
266 }
267}
268
/// Stateful message preprocessor: cleans content per the configuration and
/// remembers hashes of previously emitted messages for exact-duplicate checks.
pub struct Preprocessor {
    config: PreprocessingConfig,
    // Normalized-content hashes already emitted by `filter_message`.
    seen_hashes: HashSet<u64>,
}
274
impl Preprocessor {
    /// Create a preprocessor with the given configuration and an empty
    /// dedupe cache.
    pub fn new(config: PreprocessingConfig) -> Self {
        Self {
            config,
            seen_hashes: HashSet::new(),
        }
    }

    /// Create a preprocessor with `PreprocessingConfig::default()`.
    pub fn with_defaults() -> Self {
        Self::new(PreprocessingConfig::default())
    }

    /// Clear the exact-duplicate hash cache used by [`Self::filter_message`].
    pub fn reset_dedupe_cache(&mut self) {
        self.seen_hashes.clear();
    }

    /// Clean one message and return the cleaned text, or `None` to drop it.
    ///
    /// Drops the message when the cleaned content is shorter (in bytes) than
    /// `min_content_length`, or when its normalized hash was seen before by
    /// this instance. Note: dedupe here is exact hash equality, not the
    /// similarity-based dedupe of [`Self::filter_conversation`], and it is
    /// skipped entirely when `dedupe_threshold >= 1.0`.
    pub fn filter_message(&mut self, content: &str) -> Option<String> {
        let cleaned = self.extract_semantic_content(content);

        if cleaned.len() < self.config.min_content_length {
            return None;
        }

        if self.config.dedupe_threshold < 1.0 {
            let hash = content_hash(&cleaned);
            if self.seen_hashes.contains(&hash) {
                return None;
            }
            self.seen_hashes.insert(hash);
        }

        Some(cleaned)
    }

    /// Clean and filter a whole conversation, preserving message order.
    ///
    /// Per-message pipeline: drop mostly-tool-artifact content, drop
    /// mostly-CLI content, clean the text, drop empty or too-short results,
    /// then drop near-duplicates of earlier kept messages (Jaccard word
    /// similarity >= `dedupe_threshold`). Returns the surviving messages plus
    /// per-category statistics.
    pub fn filter_conversation(
        &mut self,
        messages: Vec<Message>,
    ) -> (Vec<Message>, PreprocessingStats) {
        let mut stats = PreprocessingStats {
            total_input: messages.len(),
            ..Default::default()
        };

        let mut result = Vec::new();
        // Cleaned contents of kept messages, scanned pairwise for the
        // similarity dedupe below (O(n^2) worst case over kept messages).
        let mut previous_contents: Vec<String> = Vec::new();

        for msg in messages {
            // Cheap whole-message rejections run before full cleaning.
            if self.config.remove_tool_artifacts && self.is_mostly_tool_artifact(&msg.content) {
                stats.filtered_tool_artifacts += 1;
                continue;
            }

            if self.config.remove_cli_output && self.is_mostly_cli_output(&msg.content) {
                stats.filtered_cli_output += 1;
                continue;
            }

            let cleaned = self.extract_semantic_content(&msg.content);

            if self.config.remove_empty_content && cleaned.trim().is_empty() {
                stats.filtered_empty += 1;
                continue;
            }

            // Byte length, consistent with `filter_message`.
            if cleaned.len() < self.config.min_content_length {
                stats.filtered_below_min_length += 1;
                continue;
            }

            // Similarity dedupe is disabled when the threshold is >= 1.0.
            if self.config.dedupe_threshold < 1.0 {
                let is_duplicate = previous_contents
                    .iter()
                    .any(|prev| content_similarity(prev, &cleaned) >= self.config.dedupe_threshold);

                if is_duplicate {
                    stats.filtered_duplicates += 1;
                    continue;
                }
            }

            previous_contents.push(cleaned.clone());

            result.push(Message {
                role: msg.role,
                content: cleaned,
                metadata: msg.metadata,
            });
        }

        stats.total_output = result.len();

        info!(
            "Preprocessing complete: {}/{} messages kept ({:.1}% filtered)",
            stats.total_output,
            stats.total_input,
            stats.filter_rate() * 100.0
        );

        (result, stats)
    }

    /// Strip tool markup, CLI noise, metadata, and placeholder fragments from
    /// `raw` (each pass gated by its config flag), then collapse repeated
    /// whitespace and trim the result.
    pub fn extract_semantic_content(&self, raw: &str) -> String {
        let regexes = preprocessing_regexes();
        let mut result = raw.to_string();

        if self.config.remove_tool_artifacts {
            result = regexes
                .function_calls_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .antml_invoke_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .antml_parameter_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .function_results_block
                .replace_all(&result, "")
                .to_string();
            result = regexes.result_block.replace_all(&result, "").to_string();
            result = regexes
                .tool_output_tags
                .replace_all(&result, "")
                .to_string();
        }

        if self.config.remove_cli_output {
            result = regexes
                .git_status_output
                .replace_all(&result, "")
                .to_string();
            result = regexes.git_diff_output.replace_all(&result, "").to_string();
            result = regexes.cargo_output.replace_all(&result, "").to_string();
            result = regexes.npm_output.replace_all(&result, "").to_string();
            result = regexes.file_listing.replace_all(&result, "").to_string();
            result = regexes.tree_output.replace_all(&result, "").to_string();
        }

        if self.config.remove_metadata {
            // UUIDs and timestamps are replaced with placeholders so the
            // surrounding sentence still reads; ids and paths are removed.
            result = regexes
                .uuid_pattern
                .replace_all(&result, "[UUID]")
                .to_string();
            result = regexes
                .timestamp_iso
                .replace_all(&result, "[TIMESTAMP]")
                .to_string();
            result = regexes
                .unix_timestamp
                .replace_all(&result, "[TIMESTAMP]")
                .to_string();
            result = regexes
                .session_id_pattern
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .file_path_metadata
                .replace_all(&result, "")
                .to_string();
        }

        if self.config.remove_empty_content {
            result = regexes
                .empty_content_json
                .replace_all(&result, "")
                .to_string();
            result = regexes.empty_text_json.replace_all(&result, "").to_string();
            result = regexes
                .placeholder_message
                .replace_all(&result, "")
                .to_string();
        }

        // Final whitespace normalization runs unconditionally.
        result = regexes
            .multiple_newlines
            .replace_all(&result, "\n\n")
            .to_string();
        result = regexes
            .multiple_spaces
            .replace_all(&result, " ")
            .to_string();

        result.trim().to_string()
    }

    /// True when more than 80% of the message's bytes are tool-call/result
    /// markup (i.e. almost nothing remains after stripping those blocks).
    fn is_mostly_tool_artifact(&self, content: &str) -> bool {
        let regexes = preprocessing_regexes();
        let original_len = content.len();
        if original_len == 0 {
            return false;
        }

        let mut cleaned = content.to_string();
        cleaned = regexes
            .function_calls_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .antml_invoke_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .antml_parameter_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .function_results_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes.result_block.replace_all(&cleaned, "").to_string();

        // Byte-length ratio; `trim` keeps leftover whitespace from counting
        // as surviving content.
        let remaining_len = cleaned.trim().len();
        let artifact_ratio = 1.0 - (remaining_len as f32 / original_len as f32);

        artifact_ratio > 0.8
    }

    /// True when more than 70% of the message's lines match a known CLI
    /// output pattern (git, cargo, npm, file listings, tree).
    fn is_mostly_cli_output(&self, content: &str) -> bool {
        let regexes = preprocessing_regexes();
        let lines: Vec<&str> = content.lines().collect();
        if lines.is_empty() {
            return false;
        }

        let cli_lines = lines
            .iter()
            .filter(|line| {
                regexes.git_status_output.is_match(line)
                    || regexes.git_diff_output.is_match(line)
                    || regexes.cargo_output.is_match(line)
                    || regexes.npm_output.is_match(line)
                    || regexes.file_listing.is_match(line)
                    || regexes.tree_output.is_match(line)
            })
            .count();

        let cli_ratio = cli_lines as f32 / lines.len() as f32;

        cli_ratio > 0.7
    }
}
543
/// Scores describing how faithfully a set of chunks preserves a source text.
/// All ratio fields are in `[0.0, 1.0]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextIntegrityMetrics {
    /// Fraction of the original's sentences preserved intact in the chunks.
    pub sentence_integrity: f32,

    /// Fraction of chunks that end at a word boundary (not cut mid-word).
    pub word_integrity: f32,

    /// Fraction of chunks whose byte length is in the optimal size band.
    pub chunk_quality: f32,

    /// Weighted blend: 0.5 * sentence + 0.3 * word + 0.2 * chunk quality.
    pub overall: f32,

    /// Number of chunks evaluated.
    pub chunk_count: usize,

    /// Mean chunk length in bytes (integer division, truncating).
    pub avg_chunk_length: usize,
}
584
585impl TextIntegrityMetrics {
586 pub const THRESHOLD: f32 = 0.90;
588
589 pub const OPTIMAL_MIN: usize = 200;
591 pub const OPTIMAL_MAX: usize = 800;
592
593 pub fn compute(original: &str, chunks: &[String]) -> Self {
595 if chunks.is_empty() {
596 return Self {
597 sentence_integrity: 0.0,
598 word_integrity: 0.0,
599 chunk_quality: 0.0,
600 overall: 0.0,
601 chunk_count: 0,
602 avg_chunk_length: 0,
603 };
604 }
605
606 let sentence_integrity = Self::compute_sentence_integrity(original, chunks);
607 let word_integrity = Self::compute_word_integrity(chunks);
608 let chunk_quality = Self::compute_chunk_quality(chunks);
609
610 let overall = sentence_integrity * 0.5 + word_integrity * 0.3 + chunk_quality * 0.2;
612
613 let total_chars: usize = chunks.iter().map(|c| c.len()).sum();
614 let avg_chunk_length = total_chars / chunks.len();
615
616 Self {
617 sentence_integrity,
618 word_integrity,
619 chunk_quality,
620 overall,
621 chunk_count: chunks.len(),
622 avg_chunk_length,
623 }
624 }
625
626 pub fn passes_threshold(&self) -> bool {
628 self.overall >= Self::THRESHOLD
629 }
630
631 pub fn recommendation(&self) -> IntegrityRecommendation {
633 if self.overall >= 0.95 {
634 IntegrityRecommendation::Excellent
635 } else if self.overall >= Self::THRESHOLD {
636 IntegrityRecommendation::Good
637 } else if self.overall >= 0.70 {
638 IntegrityRecommendation::Warn
639 } else {
640 IntegrityRecommendation::Purge
641 }
642 }
643
644 fn compute_sentence_integrity(original: &str, chunks: &[String]) -> f32 {
646 let original_sentences = Self::count_sentences(original);
647 if original_sentences == 0 {
648 return 1.0; }
650
651 let preserved_sentences: usize = chunks
652 .iter()
653 .map(|c| Self::count_complete_sentences(c))
654 .sum();
655
656 let ratio = preserved_sentences as f32 / original_sentences as f32;
658
659 ratio.min(1.0)
661 }
662
663 fn compute_word_integrity(chunks: &[String]) -> f32 {
665 if chunks.is_empty() {
666 return 1.0;
667 }
668
669 let complete_endings = chunks
670 .iter()
671 .filter(|c| Self::ends_at_word_boundary(c))
672 .count();
673
674 complete_endings as f32 / chunks.len() as f32
675 }
676
677 fn compute_chunk_quality(chunks: &[String]) -> f32 {
679 if chunks.is_empty() {
680 return 0.0;
681 }
682
683 let optimal_count = chunks
684 .iter()
685 .filter(|c| {
686 let len = c.len();
687 (Self::OPTIMAL_MIN..=Self::OPTIMAL_MAX).contains(&len)
688 })
689 .count();
690
691 optimal_count as f32 / chunks.len() as f32
692 }
693
694 fn count_sentences(text: &str) -> usize {
696 text.chars()
697 .filter(|&c| c == '.' || c == '!' || c == '?')
698 .count()
699 }
700
701 fn count_complete_sentences(chunk: &str) -> usize {
703 let trimmed = chunk.trim();
704 if trimmed.is_empty() {
705 return 0;
706 }
707
708 let sentences = Self::count_sentences(trimmed);
710
711 if Self::ends_at_sentence_boundary(trimmed) {
713 sentences
714 } else {
715 sentences.saturating_sub(1)
717 }
718 }
719
720 fn ends_at_sentence_boundary(text: &str) -> bool {
722 let trimmed = text.trim();
723 if trimmed.is_empty() {
724 return true;
725 }
726
727 let last_char = trimmed.chars().last().unwrap_or(' ');
728 matches!(last_char, '.' | '!' | '?' | ':' | '"' | '\'' | ')' | ']')
729 }
730
731 fn ends_at_word_boundary(text: &str) -> bool {
735 let trimmed = text.trim_end();
736 if trimmed.is_empty() {
737 return true;
738 }
739
740 let last_char = trimmed.chars().last().unwrap_or(' ');
741 last_char.is_whitespace() || last_char.is_ascii_punctuation()
744 }
745}
746
/// Coarse verdict derived from [`TextIntegrityMetrics::recommendation`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IntegrityRecommendation {
    /// overall >= 0.95
    Excellent,
    /// overall >= THRESHOLD (0.90)
    Good,
    /// overall >= 0.70
    Warn,
    /// overall < 0.70
    Purge,
}
759
760impl IntegrityRecommendation {
761 pub fn as_str(&self) -> &'static str {
762 match self {
763 Self::Excellent => "EXCELLENT",
764 Self::Good => "GOOD",
765 Self::Warn => "WARN",
766 Self::Purge => "PURGE",
767 }
768 }
769
770 pub fn emoji(&self) -> &'static str {
771 match self {
772 Self::Excellent => "✅",
773 Self::Good => "✅",
774 Self::Warn => "⚠️",
775 Self::Purge => "❌",
776 }
777 }
778}
779
impl std::fmt::Display for TextIntegrityMetrics {
    /// One-line human-readable summary, e.g.
    /// `✅ GOOD: 92.0% (sentence: 95.0%, word: 90.0%, chunk: 85.0%) - 10 chunks, avg 400ch`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rec = self.recommendation();
        write!(
            f,
            "{} {}: {:.1}% (sentence: {:.1}%, word: {:.1}%, chunk: {:.1}%) - {} chunks, avg {}ch",
            rec.emoji(),
            rec.as_str(),
            self.overall * 100.0,
            self.sentence_integrity * 100.0,
            self.word_integrity * 100.0,
            self.chunk_quality * 100.0,
            self.chunk_count,
            self.avg_chunk_length
        )
    }
}
797
// Test-only convenience constructor; not compiled into release builds.
#[cfg(test)]
impl Message {
    /// Build a `Message` with no metadata from anything string-like.
    pub fn new(role: impl Into<String>, content: impl Into<String>) -> Self {
        Self {
            role: role.into(),
            content: content.into(),
            metadata: None,
        }
    }
}
808
#[cfg(test)]
mod integrity_tests {
    use super::*;

    /// Well-formed chunks covering the whole original should score highly.
    #[test]
    fn test_perfect_integrity() {
        let original = "This is the first sentence with some padding text to make it longer. \
            Here is another sentence that continues the thought and adds context. \
            The third sentence provides more information about the topic at hand. \
            Finally we conclude with a fourth sentence that wraps everything up nicely.";
        let chunks = vec![
            "This is the first sentence with some padding text to make it longer. \
            Here is another sentence that continues the thought and adds context."
                .to_string(),
            "The third sentence provides more information about the topic at hand. \
            Finally we conclude with a fourth sentence that wraps everything up nicely."
                .to_string(),
        ];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert!(
            metrics.sentence_integrity >= 0.9,
            "sentence_integrity: {}",
            metrics.sentence_integrity
        );
        assert!(
            metrics.word_integrity >= 0.9,
            "word_integrity: {}",
            metrics.word_integrity
        );
        assert!(metrics.overall >= 0.75, "overall: {}", metrics.overall);
    }

    /// Chunks split mid-word should fail the threshold and be flagged PURGE.
    #[test]
    fn test_poor_integrity() {
        let original = "This is a complete sentence with many words.";
        let chunks = vec![
            "This is a compl".to_string(),
            "ete sentence wi".to_string(),
            "th many words".to_string(),
        ];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert!(metrics.word_integrity < 0.9);
        assert!(!metrics.passes_threshold());
        assert_eq!(metrics.recommendation(), IntegrityRecommendation::Purge);
    }

    /// No chunks at all is the degenerate all-zero case.
    #[test]
    fn test_empty_chunks() {
        let original = "Some text";
        let chunks: Vec<String> = vec![];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert_eq!(metrics.chunk_count, 0);
        assert_eq!(metrics.overall, 0.0);
    }

    /// Each recommendation bucket maps to its documented `overall` range.
    #[test]
    fn test_recommendation_levels() {
        let m = TextIntegrityMetrics {
            sentence_integrity: 1.0,
            word_integrity: 1.0,
            chunk_quality: 0.9,
            overall: 0.97,
            chunk_count: 10,
            avg_chunk_length: 400,
        };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Excellent);

        let m = TextIntegrityMetrics { overall: 0.92, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Good);

        let m = TextIntegrityMetrics { overall: 0.75, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Warn);

        let m = TextIntegrityMetrics { overall: 0.50, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Purge);
    }
}