// rust_memex/preprocessing/mod.rs
1//! Preprocessing module for filtering noise from conversation exports before embedding.
2//!
3//! Problem: ~36-40% of conversation exports are NOISE (tool outputs, metadata, CLI commands).
4//! This module filters noise BEFORE embedding to save vector space and improve search quality.
5
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::HashSet;
9use std::sync::OnceLock;
10use tracing::info;
11
12#[cfg(test)]
13mod tests;
14
15/// Configuration for the preprocessing pipeline.
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct PreprocessingConfig {
18    /// Remove MCP tool artifacts (function_calls, invoke, parameter, etc.)
19    #[serde(default = "default_true")]
20    pub remove_tool_artifacts: bool,
21
22    /// Remove CLI command outputs (git status, cargo build, file listings)
23    #[serde(default = "default_true")]
24    pub remove_cli_output: bool,
25
26    /// Remove system metadata (timestamps, UUIDs, session IDs)
27    #[serde(default = "default_true")]
28    pub remove_metadata: bool,
29
30    /// Minimum content length after cleaning (characters)
31    #[serde(default = "default_min_length")]
32    pub min_content_length: usize,
33
34    /// Threshold for deduplication (0.0-1.0, based on content hash similarity)
35    #[serde(default = "default_dedupe_threshold")]
36    pub dedupe_threshold: f32,
37
38    /// Remove empty/boilerplate content
39    #[serde(default = "default_true")]
40    pub remove_empty_content: bool,
41
42    /// Remove duplicate headers and repeated system prompts
43    #[serde(default = "default_true")]
44    pub remove_duplicate_headers: bool,
45}
46
/// Serde default helper for boolean flags that should be enabled when absent.
fn default_true() -> bool {
    true
}

/// Serde default for `min_content_length` (characters after cleaning).
fn default_min_length() -> usize {
    50
}

/// Serde default for `dedupe_threshold` (near-duplicate similarity cutoff).
fn default_dedupe_threshold() -> f32 {
    0.95
}
58
59impl Default for PreprocessingConfig {
60    fn default() -> Self {
61        Self {
62            remove_tool_artifacts: true,
63            remove_cli_output: true,
64            // CRITICAL: Preserve timestamps by default for temporal queries!
65            // Use --sanitize-metadata CLI flag for opt-in sanitization.
66            remove_metadata: false,
67            min_content_length: 50,
68            dedupe_threshold: 0.95,
69            remove_empty_content: true,
70            remove_duplicate_headers: true,
71        }
72    }
73}
74
/// Pre-compiled regexes used by the pipeline, grouped by filter category.
/// Compiled once via `preprocessing_regexes()` and shared for the process lifetime.
struct PreprocessingRegexes {
    // Tool-artifact blocks (assistant/MCP tool-call markup).
    function_calls_block: Regex,
    antml_invoke_block: Regex,
    antml_parameter_block: Regex,
    function_results_block: Regex,
    result_block: Regex,
    tool_output_tags: Regex,
    // CLI output lines (git, cargo, npm, ls -l, tree).
    git_status_output: Regex,
    git_diff_output: Regex,
    cargo_output: Regex,
    npm_output: Regex,
    file_listing: Regex,
    tree_output: Regex,
    // System metadata (identifiers and timestamps).
    uuid_pattern: Regex,
    timestamp_iso: Regex,
    unix_timestamp: Regex,
    session_id_pattern: Regex,
    file_path_metadata: Regex,
    // Empty/boilerplate JSON fragments and placeholder text.
    empty_content_json: Regex,
    empty_text_json: Regex,
    placeholder_message: Regex,
    // Whitespace normalization.
    multiple_newlines: Regex,
    multiple_spaces: Regex,
}
99
100fn compile_static_regex(pattern: &str, label: &str) -> Regex {
101    Regex::new(pattern).unwrap_or_else(|err| panic!("invalid preprocessing regex '{label}': {err}"))
102}
103
/// Lazily compile the full regex set once per process (via `OnceLock`).
///
/// NOTE(review): several tag patterns are assembled with `format!` instead of
/// being written as literals — presumably so this source file never contains
/// the tool-markup tags it is designed to strip (which could confuse the same
/// export/scan tooling). Confirm before "simplifying" these to plain literals.
fn preprocessing_regexes() -> &'static PreprocessingRegexes {
    static REGEXES: OnceLock<PreprocessingRegexes> = OnceLock::new();
    REGEXES.get_or_init(|| PreprocessingRegexes {
        // Builds: (?s)<function_calls>.*?</function_calls>  (dot matches newlines)
        function_calls_block: compile_static_regex(
            &format!(
            r"(?s)<{}>{}</{}>",
            "function_calls", r".*?", "function_calls"
        ),
            "function_calls_block",
        ),
        antml_invoke_block: compile_static_regex(
            &format!(
            r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
            "antml", "invoke", "antml", "invoke"
        ),
            "antml_invoke_block",
        ),
        antml_parameter_block: compile_static_regex(
            &format!(
            r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
            "antml", "parameter", "antml", "parameter"
        ),
            "antml_parameter_block",
        ),
        function_results_block: compile_static_regex(
            &format!(
            r"(?s)<{}>{}</{}>",
            "function_results", r".*?", "function_results"
        ),
            "function_results_block",
        ),
        result_block: compile_static_regex(
            &format!(r"(?s)<{}>{}</{}>", "result", r".*?", "result"),
            "result_block",
        ),
        tool_output_tags: compile_static_regex(
            r"(?s)<(output|name|value)>.*?</(output|name|value)>",
            "tool_output_tags",
        ),
        // (?m): each alternative anchors to a single line of CLI output.
        git_status_output: compile_static_regex(
            r"(?m)^\s*(On branch|Your branch|Changes (?:not staged|to be committed)|Untracked files|nothing to commit|modified:|new file:|deleted:).*$",
            "git_status_output",
        ),
        git_diff_output: compile_static_regex(
            r"(?m)^(diff --git|index [0-9a-f]+\.\.[0-9a-f]+|--- a/|--- /|\+\+\+ a/|\+\+\+ b/|@@\s*-\d+.*@@|Binary files).*$",
            "git_diff_output",
        ),
        cargo_output: compile_static_regex(
            r"(?m)^(\s*(Compiling|Finished|Running|warning:|error\[E|-->|note:|help:)).*$",
            "cargo_output",
        ),
        npm_output: compile_static_regex(
            r"(?m)^(npm (WARN|ERR!|notice)|added \d+ packages|up to date|audited \d+ packages).*$",
            "npm_output",
        ),
        // `ls -l` style lines: "total N" or a permission string + link count.
        file_listing: compile_static_regex(
            r"(?m)^(total \d+|[drwx-]{10}\s+\d+|[-lrwx]{10}\s+\d+).*$",
            "file_listing",
        ),
        tree_output: compile_static_regex(r"(?m)^[│├└─\s]+[\w.-]+/?$", "tree_output"),
        uuid_pattern: compile_static_regex(
            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            "uuid_pattern",
        ),
        timestamp_iso: compile_static_regex(
            r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?",
            "timestamp_iso",
        ),
        // 10-digit epoch seconds starting 16/17 — roughly 2020..2026.
        unix_timestamp: compile_static_regex(r"\b1[6-7]\d{8}\b", "unix_timestamp"),
        session_id_pattern: compile_static_regex(
            r#"(session_id|sessionId|session-id|conv_id|conversation_id)["']?\s*[:=]\s*["']?[\w-]+"#,
            "session_id_pattern",
        ),
        file_path_metadata: compile_static_regex(
            r#""(path|file_path|filepath)"\s*:\s*"[^"]+""#,
            "file_path_metadata",
        ),
        empty_content_json: compile_static_regex(
            r#""content"\s*:\s*\[\s*\]"#,
            "empty_content_json",
        ),
        empty_text_json: compile_static_regex(r#""text"\s*:\s*"""#, "empty_text_json"),
        placeholder_message: compile_static_regex(
            r"(?i)(placeholder|lorem ipsum|TODO:|FIXME:|XXX:)",
            "placeholder_message",
        ),
        multiple_newlines: compile_static_regex(r"\n{3,}", "multiple_newlines"),
        multiple_spaces: compile_static_regex(r" {2,}", "multiple_spaces"),
    })
}
194
/// A simple hash for deduplication based on normalized content.
///
/// Normalization: lowercase the whole string, then collapse every run of
/// whitespace to a single space, so formatting differences never change
/// the hash. Stable only within one process run (uses `DefaultHasher`).
fn content_hash(s: &str) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Lowercase first (matches the original normalization order), then
    // rebuild the string word by word with single-space separators.
    let lowered = s.to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    let mut words = lowered.split_whitespace();
    if let Some(first) = words.next() {
        normalized.push_str(first);
        for word in words {
            normalized.push(' ');
            normalized.push_str(word);
        }
    }

    let mut hasher = DefaultHasher::new();
    normalized.hash(&mut hasher);
    hasher.finish()
}
210
/// Jaccard similarity over whitespace-delimited word sets (0.0 - 1.0).
fn content_similarity(a: &str, b: &str) -> f32 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    // Two empty texts are considered identical.
    if words_a.is_empty() && words_b.is_empty() {
        return 1.0;
    }

    let shared = words_a.intersection(&words_b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    let total = words_a.len() + words_b.len() - shared;

    // Defensive: unreachable after the empty/empty check above, kept as a
    // guard against division by zero.
    if total == 0 {
        return 0.0;
    }

    shared as f32 / total as f32
}
229
/// Message structure for conversation filtering — one conversation turn.
#[derive(Debug, Clone)]
pub struct Message {
    /// Speaker role (free-form string, e.g. "user"/"assistant").
    pub role: String,
    /// Message text; raw on input, replaced with cleaned text by the filter.
    pub content: String,
    /// Arbitrary JSON side-channel, carried through filtering untouched.
    pub metadata: Option<serde_json::Value>,
}
237
/// Statistics from preprocessing: message counts per filter category.
/// Each `filtered_*` counter records messages dropped for that reason.
#[derive(Debug, Clone, Default)]
pub struct PreprocessingStats {
    pub total_input: usize,
    pub filtered_tool_artifacts: usize,
    pub filtered_cli_output: usize,
    // NOTE(review): never incremented in the visible pipeline — metadata is
    // rewritten in place rather than dropped; confirm intent.
    pub filtered_metadata: usize,
    pub filtered_empty: usize,
    pub filtered_duplicates: usize,
    pub filtered_below_min_length: usize,
    pub total_output: usize,
}
250
251impl PreprocessingStats {
252    pub fn total_filtered(&self) -> usize {
253        self.filtered_tool_artifacts
254            + self.filtered_cli_output
255            + self.filtered_metadata
256            + self.filtered_empty
257            + self.filtered_duplicates
258            + self.filtered_below_min_length
259    }
260
261    pub fn filter_rate(&self) -> f32 {
262        if self.total_input == 0 {
263            return 0.0;
264        }
265        self.total_filtered() as f32 / self.total_input as f32
266    }
267}
268
/// The main preprocessor for filtering conversation noise.
pub struct Preprocessor {
    /// Which filter stages are enabled and their thresholds.
    config: PreprocessingConfig,
    /// Exact-hash dedupe cache used by `filter_message`; grows until
    /// `reset_dedupe_cache` is called.
    seen_hashes: HashSet<u64>,
}
274
impl Preprocessor {
    /// Create a new preprocessor with the given configuration
    pub fn new(config: PreprocessingConfig) -> Self {
        Self {
            config,
            seen_hashes: HashSet::new(),
        }
    }

    /// Create a preprocessor with default configuration
    pub fn with_defaults() -> Self {
        Self::new(PreprocessingConfig::default())
    }

    /// Reset the deduplication cache (useful between conversations)
    pub fn reset_dedupe_cache(&mut self) {
        self.seen_hashes.clear();
    }

    /// Filter a single message content. Returns None if the content should be skipped entirely.
    ///
    /// Unlike [`Self::filter_conversation`], this dedupes by exact content hash
    /// (not pairwise similarity) and remembers hashes across calls until
    /// `reset_dedupe_cache` is invoked.
    pub fn filter_message(&mut self, content: &str) -> Option<String> {
        // Extract semantic content first
        let cleaned = self.extract_semantic_content(content);

        // Check minimum length
        // NOTE(review): `len()` counts bytes, not chars; config docs say
        // "characters". Close enough for a noise floor — confirm if exactness matters.
        if cleaned.len() < self.config.min_content_length {
            return None;
        }

        // Check for duplicates
        // A threshold of exactly 1.0 disables deduplication entirely.
        if self.config.dedupe_threshold < 1.0 {
            let hash = content_hash(&cleaned);
            if self.seen_hashes.contains(&hash) {
                return None;
            }
            self.seen_hashes.insert(hash);
        }

        Some(cleaned)
    }

    /// Filter a conversation (list of messages), returning only meaningful content
    ///
    /// Order of checks per message: mostly-tool-artifact → mostly-CLI → clean →
    /// empty → min-length → near-duplicate. Each rejection increments exactly
    /// one stats counter.
    ///
    /// NOTE(review): `config.remove_duplicate_headers` is not consulted in this
    /// visible code path — confirm whether it is handled elsewhere.
    pub fn filter_conversation(
        &mut self,
        messages: Vec<Message>,
    ) -> (Vec<Message>, PreprocessingStats) {
        let mut stats = PreprocessingStats {
            total_input: messages.len(),
            ..Default::default()
        };

        let mut result = Vec::new();
        let mut previous_contents: Vec<String> = Vec::new();

        for msg in messages {
            // Skip if content is mostly tool artifacts
            if self.config.remove_tool_artifacts && self.is_mostly_tool_artifact(&msg.content) {
                stats.filtered_tool_artifacts += 1;
                continue;
            }

            // Skip if content is mostly CLI output
            if self.config.remove_cli_output && self.is_mostly_cli_output(&msg.content) {
                stats.filtered_cli_output += 1;
                continue;
            }

            // Extract semantic content
            let cleaned = self.extract_semantic_content(&msg.content);

            // Skip empty content
            if self.config.remove_empty_content && cleaned.trim().is_empty() {
                stats.filtered_empty += 1;
                continue;
            }

            // Skip if below minimum length (byte length — see filter_message note)
            if cleaned.len() < self.config.min_content_length {
                stats.filtered_below_min_length += 1;
                continue;
            }

            // Check for duplicates/near-duplicates.
            // O(kept²) pairwise scan over all previously kept messages — fine
            // for conversation-sized inputs.
            if self.config.dedupe_threshold < 1.0 {
                let is_duplicate = previous_contents
                    .iter()
                    .any(|prev| content_similarity(prev, &cleaned) >= self.config.dedupe_threshold);

                if is_duplicate {
                    stats.filtered_duplicates += 1;
                    continue;
                }
            }

            previous_contents.push(cleaned.clone());

            result.push(Message {
                role: msg.role,
                content: cleaned,
                metadata: msg.metadata,
            });
        }

        stats.total_output = result.len();

        info!(
            "Preprocessing complete: {}/{} messages kept ({:.1}% filtered)",
            stats.total_output,
            stats.total_input,
            stats.filter_rate() * 100.0
        );

        (result, stats)
    }

    /// Extract semantic content from raw text, removing noise patterns.
    ///
    /// Stages run in a fixed order gated by config flags: tool artifacts →
    /// CLI output → metadata (rewritten to placeholders, not deleted) →
    /// empty/boilerplate → whitespace normalization. The result is trimmed.
    pub fn extract_semantic_content(&self, raw: &str) -> String {
        let regexes = preprocessing_regexes();
        let mut result = raw.to_string();

        // Remove tool artifacts
        if self.config.remove_tool_artifacts {
            result = regexes
                .function_calls_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .antml_invoke_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .antml_parameter_block
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .function_results_block
                .replace_all(&result, "")
                .to_string();
            result = regexes.result_block.replace_all(&result, "").to_string();
            result = regexes
                .tool_output_tags
                .replace_all(&result, "")
                .to_string();
        }

        // Remove CLI output (line-anchored patterns, deleted outright)
        if self.config.remove_cli_output {
            result = regexes
                .git_status_output
                .replace_all(&result, "")
                .to_string();
            result = regexes.git_diff_output.replace_all(&result, "").to_string();
            result = regexes.cargo_output.replace_all(&result, "").to_string();
            result = regexes.npm_output.replace_all(&result, "").to_string();
            result = regexes.file_listing.replace_all(&result, "").to_string();
            result = regexes.tree_output.replace_all(&result, "").to_string();
        }

        // Remove metadata — IDs/timestamps become placeholder tokens so the
        // surrounding sentence stays readable; session ids and path keys are
        // dropped entirely.
        if self.config.remove_metadata {
            result = regexes
                .uuid_pattern
                .replace_all(&result, "[UUID]")
                .to_string();
            result = regexes
                .timestamp_iso
                .replace_all(&result, "[TIMESTAMP]")
                .to_string();
            result = regexes
                .unix_timestamp
                .replace_all(&result, "[TIMESTAMP]")
                .to_string();
            result = regexes
                .session_id_pattern
                .replace_all(&result, "")
                .to_string();
            result = regexes
                .file_path_metadata
                .replace_all(&result, "")
                .to_string();
        }

        // Remove empty content patterns
        if self.config.remove_empty_content {
            result = regexes
                .empty_content_json
                .replace_all(&result, "")
                .to_string();
            result = regexes.empty_text_json.replace_all(&result, "").to_string();
            result = regexes
                .placeholder_message
                .replace_all(&result, "")
                .to_string();
        }

        // Normalize whitespace (always runs, regardless of flags)
        result = regexes
            .multiple_newlines
            .replace_all(&result, "\n\n")
            .to_string();
        result = regexes
            .multiple_spaces
            .replace_all(&result, " ")
            .to_string();

        result.trim().to_string()
    }

    /// Check if content is predominantly tool artifacts.
    ///
    /// Strips only the block-level artifact patterns and compares lengths.
    /// NOTE(review): the remainder is trimmed but the original is not, so the
    /// ratio slightly over-counts artifacts on whitespace-padded input.
    fn is_mostly_tool_artifact(&self, content: &str) -> bool {
        let regexes = preprocessing_regexes();
        let original_len = content.len();
        if original_len == 0 {
            return false;
        }

        let mut cleaned = content.to_string();
        cleaned = regexes
            .function_calls_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .antml_invoke_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .antml_parameter_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes
            .function_results_block
            .replace_all(&cleaned, "")
            .to_string();
        cleaned = regexes.result_block.replace_all(&cleaned, "").to_string();

        let remaining_len = cleaned.trim().len();
        let artifact_ratio = 1.0 - (remaining_len as f32 / original_len as f32);

        // If more than 80% was tool artifacts, consider it mostly artifacts
        artifact_ratio > 0.8
    }

    /// Check if content is predominantly CLI output
    /// (fraction of lines matching any CLI-output pattern).
    fn is_mostly_cli_output(&self, content: &str) -> bool {
        let regexes = preprocessing_regexes();
        let lines: Vec<&str> = content.lines().collect();
        if lines.is_empty() {
            return false;
        }

        let cli_lines = lines
            .iter()
            .filter(|line| {
                regexes.git_status_output.is_match(line)
                    || regexes.git_diff_output.is_match(line)
                    || regexes.cargo_output.is_match(line)
                    || regexes.npm_output.is_match(line)
                    || regexes.file_listing.is_match(line)
                    || regexes.tree_output.is_match(line)
            })
            .count();

        let cli_ratio = cli_lines as f32 / lines.len() as f32;

        // If more than 70% of lines are CLI output, consider it mostly CLI
        cli_ratio > 0.7
    }
}
543
// =============================================================================
// TEXT INTEGRITY METRICS - Quality assessment for embedding quality
// =============================================================================

/// Text integrity metrics for embedding quality assessment.
///
/// Target: >90% overall integrity score before indexing.
///
/// # Metrics
/// - **Sentence Integrity**: % of complete sentences preserved (not cut mid-sentence)
/// - **Word Integrity**: % of complete words (not truncated mid-word)
/// - **Chunk Quality**: How close chunks are to optimal size
///
/// # Example
/// ```rust,ignore
/// let metrics = TextIntegrityMetrics::compute(original_text, &chunks);
/// if !metrics.passes_threshold() {
///     warn!("Text integrity below 90%: {:.1}%", metrics.overall * 100.0);
/// }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextIntegrityMetrics {
    /// Percentage of complete sentences preserved (0.0 - 1.0)
    pub sentence_integrity: f32,

    /// Percentage of complete words (not truncated) (0.0 - 1.0)
    pub word_integrity: f32,

    /// Quality score based on chunk length distribution (0.0 - 1.0)
    pub chunk_quality: f32,

    /// Combined overall score (0.0 - 1.0); weighted blend of
    /// 50% sentence integrity + 30% word integrity + 20% chunk quality.
    pub overall: f32,

    /// Number of chunks analyzed
    pub chunk_count: usize,

    /// Average chunk length in characters (integer division of total bytes by count)
    pub avg_chunk_length: usize,
}
584
585impl TextIntegrityMetrics {
586    /// Minimum acceptable overall score (90%)
587    pub const THRESHOLD: f32 = 0.90;
588
589    /// Optimal chunk length range (characters)
590    pub const OPTIMAL_MIN: usize = 200;
591    pub const OPTIMAL_MAX: usize = 800;
592
593    /// Compute integrity metrics from original text and resulting chunks
594    pub fn compute(original: &str, chunks: &[String]) -> Self {
595        if chunks.is_empty() {
596            return Self {
597                sentence_integrity: 0.0,
598                word_integrity: 0.0,
599                chunk_quality: 0.0,
600                overall: 0.0,
601                chunk_count: 0,
602                avg_chunk_length: 0,
603            };
604        }
605
606        let sentence_integrity = Self::compute_sentence_integrity(original, chunks);
607        let word_integrity = Self::compute_word_integrity(chunks);
608        let chunk_quality = Self::compute_chunk_quality(chunks);
609
610        // Weighted average: sentence integrity is most important
611        let overall = sentence_integrity * 0.5 + word_integrity * 0.3 + chunk_quality * 0.2;
612
613        let total_chars: usize = chunks.iter().map(|c| c.len()).sum();
614        let avg_chunk_length = total_chars / chunks.len();
615
616        Self {
617            sentence_integrity,
618            word_integrity,
619            chunk_quality,
620            overall,
621            chunk_count: chunks.len(),
622            avg_chunk_length,
623        }
624    }
625
626    /// Check if metrics pass the minimum threshold (90%)
627    pub fn passes_threshold(&self) -> bool {
628        self.overall >= Self::THRESHOLD
629    }
630
631    /// Get recommendation based on metrics
632    pub fn recommendation(&self) -> IntegrityRecommendation {
633        if self.overall >= 0.95 {
634            IntegrityRecommendation::Excellent
635        } else if self.overall >= Self::THRESHOLD {
636            IntegrityRecommendation::Good
637        } else if self.overall >= 0.70 {
638            IntegrityRecommendation::Warn
639        } else {
640            IntegrityRecommendation::Purge
641        }
642    }
643
644    /// Compute sentence integrity score
645    fn compute_sentence_integrity(original: &str, chunks: &[String]) -> f32 {
646        let original_sentences = Self::count_sentences(original);
647        if original_sentences == 0 {
648            return 1.0; // No sentences to preserve
649        }
650
651        let preserved_sentences: usize = chunks
652            .iter()
653            .map(|c| Self::count_complete_sentences(c))
654            .sum();
655
656        // Ratio of preserved complete sentences
657        let ratio = preserved_sentences as f32 / original_sentences as f32;
658
659        // Cap at 1.0 (can exceed if chunks add sentence breaks)
660        ratio.min(1.0)
661    }
662
663    /// Compute word integrity (chunks not ending mid-word)
664    fn compute_word_integrity(chunks: &[String]) -> f32 {
665        if chunks.is_empty() {
666            return 1.0;
667        }
668
669        let complete_endings = chunks
670            .iter()
671            .filter(|c| Self::ends_at_word_boundary(c))
672            .count();
673
674        complete_endings as f32 / chunks.len() as f32
675    }
676
677    /// Compute chunk quality based on length distribution
678    fn compute_chunk_quality(chunks: &[String]) -> f32 {
679        if chunks.is_empty() {
680            return 0.0;
681        }
682
683        let optimal_count = chunks
684            .iter()
685            .filter(|c| {
686                let len = c.len();
687                (Self::OPTIMAL_MIN..=Self::OPTIMAL_MAX).contains(&len)
688            })
689            .count();
690
691        optimal_count as f32 / chunks.len() as f32
692    }
693
694    /// Count sentences in text (ending with . ! ?)
695    fn count_sentences(text: &str) -> usize {
696        text.chars()
697            .filter(|&c| c == '.' || c == '!' || c == '?')
698            .count()
699    }
700
701    /// Count complete sentences (not cut mid-sentence)
702    fn count_complete_sentences(chunk: &str) -> usize {
703        let trimmed = chunk.trim();
704        if trimmed.is_empty() {
705            return 0;
706        }
707
708        // A complete sentence ends with punctuation
709        let sentences = Self::count_sentences(trimmed);
710
711        // Check if chunk ends cleanly (with sentence terminator or at natural break)
712        if Self::ends_at_sentence_boundary(trimmed) {
713            sentences
714        } else {
715            // Last sentence is incomplete
716            sentences.saturating_sub(1)
717        }
718    }
719
720    /// Check if text ends at a sentence boundary
721    fn ends_at_sentence_boundary(text: &str) -> bool {
722        let trimmed = text.trim();
723        if trimmed.is_empty() {
724            return true;
725        }
726
727        let last_char = trimmed.chars().last().unwrap_or(' ');
728        matches!(last_char, '.' | '!' | '?' | ':' | '"' | '\'' | ')' | ']')
729    }
730
731    /// Check if text ends at a word boundary (punctuation or whitespace)
732    /// Note: We consider alphanumeric endings as incomplete if the original
733    /// context isn't available - caller should verify if needed
734    fn ends_at_word_boundary(text: &str) -> bool {
735        let trimmed = text.trim_end();
736        if trimmed.is_empty() {
737            return true;
738        }
739
740        let last_char = trimmed.chars().last().unwrap_or(' ');
741        // Only punctuation counts as clean word boundary
742        // Alphanumeric endings might be mid-word cuts
743        last_char.is_whitespace() || last_char.is_ascii_punctuation()
744    }
745}
746
/// Recommendation based on integrity metrics.
/// Thresholds applied by `TextIntegrityMetrics::recommendation`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IntegrityRecommendation {
    /// >95% - High quality, keep
    Excellent,
    /// 90-95% - Good quality, keep
    Good,
    /// 70-90% - Consider re-indexing with better chunking
    Warn,
    /// <70% - Low quality, purge and re-index
    Purge,
}
759
760impl IntegrityRecommendation {
761    pub fn as_str(&self) -> &'static str {
762        match self {
763            Self::Excellent => "EXCELLENT",
764            Self::Good => "GOOD",
765            Self::Warn => "WARN",
766            Self::Purge => "PURGE",
767        }
768    }
769
770    pub fn emoji(&self) -> &'static str {
771        match self {
772            Self::Excellent => "✅",
773            Self::Good => "✅",
774            Self::Warn => "⚠️",
775            Self::Purge => "❌",
776        }
777    }
778}
779
780impl std::fmt::Display for TextIntegrityMetrics {
781    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
782        let rec = self.recommendation();
783        write!(
784            f,
785            "{} {}: {:.1}% (sentence: {:.1}%, word: {:.1}%, chunk: {:.1}%) - {} chunks, avg {}ch",
786            rec.emoji(),
787            rec.as_str(),
788            self.overall * 100.0,
789            self.sentence_integrity * 100.0,
790            self.word_integrity * 100.0,
791            self.chunk_quality * 100.0,
792            self.chunk_count,
793            self.avg_chunk_length
794        )
795    }
796}
797
#[cfg(test)]
impl Message {
    /// Test-only convenience constructor; attaches no metadata.
    pub fn new(role: impl Into<String>, content: impl Into<String>) -> Self {
        let role = role.into();
        let content = content.into();
        Message {
            role,
            content,
            metadata: None,
        }
    }
}
808
#[cfg(test)]
mod integrity_tests {
    use super::*;

    // Clean chunking: every chunk ends on a sentence boundary.
    #[test]
    fn test_perfect_integrity() {
        // Create longer chunks to pass chunk_quality (OPTIMAL_MIN=200)
        let original = "This is the first sentence with some padding text to make it longer. \
                        Here is another sentence that continues the thought and adds context. \
                        The third sentence provides more information about the topic at hand. \
                        Finally we conclude with a fourth sentence that wraps everything up nicely.";
        let chunks = vec![
            "This is the first sentence with some padding text to make it longer. \
             Here is another sentence that continues the thought and adds context."
                .to_string(),
            "The third sentence provides more information about the topic at hand. \
             Finally we conclude with a fourth sentence that wraps everything up nicely."
                .to_string(),
        ];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert!(
            metrics.sentence_integrity >= 0.9,
            "sentence_integrity: {}",
            metrics.sentence_integrity
        );
        assert!(
            metrics.word_integrity >= 0.9,
            "word_integrity: {}",
            metrics.word_integrity
        );
        // overall = 0.5*sentence + 0.3*word + 0.2*chunk_quality
        // With perfect sentence/word but short chunks (< OPTIMAL_MIN), overall = 0.8
        assert!(metrics.overall >= 0.75, "overall: {}", metrics.overall);
    }

    // Mid-word and mid-sentence cuts must fail the threshold and recommend purge.
    #[test]
    fn test_poor_integrity() {
        let original = "This is a complete sentence with many words.";
        // Chunks cut mid-word and mid-sentence
        let chunks = vec![
            "This is a compl".to_string(), // Cut mid-word
            "ete sentence wi".to_string(), // Cut mid-word
            "th many words".to_string(),   // Missing period
        ];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert!(metrics.word_integrity < 0.9); // Mid-word cuts
        assert!(!metrics.passes_threshold());
        assert_eq!(metrics.recommendation(), IntegrityRecommendation::Purge);
    }

    // No chunks at all → zeroed metrics (the compute() early-return branch).
    #[test]
    fn test_empty_chunks() {
        let original = "Some text";
        let chunks: Vec<String> = vec![];

        let metrics = TextIntegrityMetrics::compute(original, &chunks);
        assert_eq!(metrics.chunk_count, 0);
        assert_eq!(metrics.overall, 0.0);
    }

    // Each `overall` band maps onto the expected recommendation.
    #[test]
    fn test_recommendation_levels() {
        // Excellent
        let m = TextIntegrityMetrics {
            sentence_integrity: 1.0,
            word_integrity: 1.0,
            chunk_quality: 0.9,
            overall: 0.97,
            chunk_count: 10,
            avg_chunk_length: 400,
        };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Excellent);

        // Good
        let m = TextIntegrityMetrics { overall: 0.92, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Good);

        // Warn
        let m = TextIntegrityMetrics { overall: 0.75, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Warn);

        // Purge
        let m = TextIntegrityMetrics { overall: 0.50, ..m };
        assert_eq!(m.recommendation(), IntegrityRecommendation::Purge);
    }
}