use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::sync::OnceLock;
use tracing::info;
#[cfg(test)]
mod tests;
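/// Tunable knobs for transcript preprocessing. Unset fields fall back to the
/// `serde` defaults below when deserialized from a partial config.
///
/// A minimal sketch of loading a partial config (assumes `serde_json` is
/// available; illustrative only):
///
/// ```ignore
/// let cfg: PreprocessingConfig =
///     serde_json::from_str(r#"{ "min_content_length": 100 }"#).unwrap();
/// assert!(cfg.remove_tool_artifacts); // unset fields take the defaults
/// ```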
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreprocessingConfig {
#[serde(default = "default_true")]
pub remove_tool_artifacts: bool,
#[serde(default = "default_true")]
pub remove_cli_output: bool,
#[serde(default = "default_true")]
pub remove_metadata: bool,
#[serde(default = "default_min_length")]
pub min_content_length: usize,
#[serde(default = "default_dedupe_threshold")]
pub dedupe_threshold: f32,
#[serde(default = "default_true")]
pub remove_empty_content: bool,
#[serde(default = "default_true")]
pub remove_duplicate_headers: bool,
}
fn default_true() -> bool {
true
}
fn default_min_length() -> usize {
50
}
fn default_dedupe_threshold() -> f32 {
0.95
}
impl Default for PreprocessingConfig {
fn default() -> Self {
Self {
remove_tool_artifacts: true,
remove_cli_output: true,
// Matches the serde default (default_true) so both default paths agree.
remove_metadata: true,
min_content_length: 50,
dedupe_threshold: 0.95,
remove_empty_content: true,
remove_duplicate_headers: true,
}
}
}
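/// The compiled pattern set shared by every `Preprocessor` instance.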
struct PreprocessingRegexes {
function_calls_block: Regex,
antml_invoke_block: Regex,
antml_parameter_block: Regex,
function_results_block: Regex,
result_block: Regex,
tool_output_tags: Regex,
git_status_output: Regex,
git_diff_output: Regex,
cargo_output: Regex,
npm_output: Regex,
file_listing: Regex,
tree_output: Regex,
uuid_pattern: Regex,
timestamp_iso: Regex,
unix_timestamp: Regex,
session_id_pattern: Regex,
file_path_metadata: Regex,
empty_content_json: Regex,
empty_text_json: Regex,
placeholder_message: Regex,
multiple_newlines: Regex,
multiple_spaces: Regex,
}
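/// Compiles a fixed pattern, panicking with `label` if it is invalid. All
/// patterns here are static, so a panic signals a programming error rather
/// than bad input.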
fn compile_static_regex(pattern: &str, label: &str) -> Regex {
Regex::new(pattern).unwrap_or_else(|err| panic!("invalid preprocessing regex '{label}': {err}"))
}
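/// Returns the process-wide regex set, compiled once on first use via
/// `OnceLock`.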
fn preprocessing_regexes() -> &'static PreprocessingRegexes {
static REGEXES: OnceLock<PreprocessingRegexes> = OnceLock::new();
REGEXES.get_or_init(|| PreprocessingRegexes {
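// Tag names are assembled with format! so the literal tool-call markers
// never appear verbatim in this source file, presumably so the file's own
// text can never be mistaken for real tool-call blocks.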
function_calls_block: compile_static_regex(
&format!(
r"(?s)<{}>{}</{}>",
"function_calls", r".*?", "function_calls"
),
"function_calls_block",
),
antml_invoke_block: compile_static_regex(
&format!(
r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
"antml", "invoke", "antml", "invoke"
),
"antml_invoke_block",
),
antml_parameter_block: compile_static_regex(
&format!(
r"(?s)<{}:{}[^>]*>.*?</{}:{}>",
"antml", "parameter", "antml", "parameter"
),
"antml_parameter_block",
),
function_results_block: compile_static_regex(
&format!(
r"(?s)<{}>{}</{}>",
"function_results", r".*?", "function_results"
),
"function_results_block",
),
result_block: compile_static_regex(
&format!(r"(?s)<{}>{}</{}>", "result", r".*?", "result"),
"result_block",
),
tool_output_tags: compile_static_regex(
// The regex crate has no backreferences, so spell out one alternation
// per tag rather than matching mismatched pairs like <output>...</name>.
r"(?s)<output>.*?</output>|<name>.*?</name>|<value>.*?</value>",
"tool_output_tags",
),
git_status_output: compile_static_regex(
r"(?m)^\s*(On branch|Your branch|Changes (?:not staged|to be committed)|Untracked files|nothing to commit|modified:|new file:|deleted:).*$",
"git_status_output",
),
git_diff_output: compile_static_regex(
// "--- /dev/null" and "+++ /dev/null" cover created and deleted files;
// git emits "+++ b/", never "+++ a/", for ordinary hunks.
r"(?m)^(diff --git|index [0-9a-f]+\.\.[0-9a-f]+|--- (?:a/|/dev/null)|\+\+\+ (?:b/|/dev/null)|@@\s*-\d+.*@@|Binary files).*$",
"git_diff_output",
),
cargo_output: compile_static_regex(
// error(?::|\[E) catches plain "error:" lines as well as coded "error[E0308]".
r"(?m)^(\s*(Compiling|Finished|Running|warning:|error(?::|\[E)|-->|note:|help:)).*$",
"cargo_output",
),
npm_output: compile_static_regex(
r"(?m)^(npm (WARN|ERR!|notice)|added \d+ packages|up to date|audited \d+ packages).*$",
"npm_output",
),
file_listing: compile_static_regex(
r"(?m)^(total \d+|[drwx-]{10}\s+\d+|[-lrwx]{10}\s+\d+).*$",
"file_listing",
),
tree_output: compile_static_regex(r"(?m)^[│├└─\s]+[\w.-]+/?$", "tree_output"),
uuid_pattern: compile_static_regex(
r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
"uuid_pattern",
),
timestamp_iso: compile_static_regex(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?",
"timestamp_iso",
),
// Ten-digit epoch seconds starting 16 or 17, i.e. roughly 2020 to early 2027.
unix_timestamp: compile_static_regex(r"\b1[67]\d{8}\b", "unix_timestamp"),
session_id_pattern: compile_static_regex(
r#"(session_id|sessionId|session-id|conv_id|conversation_id)["']?\s*[:=]\s*["']?[\w-]+"#,
"session_id_pattern",
),
file_path_metadata: compile_static_regex(
r#""(path|file_path|filepath)"\s*:\s*"[^"]+""#,
"file_path_metadata",
),
empty_content_json: compile_static_regex(
r#""content"\s*:\s*\[\s*\]"#,
"empty_content_json",
),
empty_text_json: compile_static_regex(r#""text"\s*:\s*"""#, "empty_text_json"),
placeholder_message: compile_static_regex(
r"(?i)(placeholder|lorem ipsum|TODO:|FIXME:|XXX:)",
"placeholder_message",
),
multiple_newlines: compile_static_regex(r"\n{3,}", "multiple_newlines"),
multiple_spaces: compile_static_regex(r" {2,}", "multiple_spaces"),
})
}
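/// Hashes a lowercased, whitespace-normalized copy of `s`, so strings that
/// differ only in casing or spacing intentionally collide.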
fn content_hash(s: &str) -> u64 {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let normalized = s
.to_lowercase()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
let mut hasher = DefaultHasher::new();
normalized.hash(&mut hasher);
hasher.finish()
}
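/// Jaccard similarity over word sets: |A ∩ B| / |A ∪ B|, in [0.0, 1.0].
/// For example, "a b c" vs. "b c d" shares 2 of 4 distinct words, scoring 0.5.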
fn content_similarity(a: &str, b: &str) -> f32 {
let words_a: HashSet<&str> = a.split_whitespace().collect();
let words_b: HashSet<&str> = b.split_whitespace().collect();
if words_a.is_empty() && words_b.is_empty() {
return 1.0;
}
let intersection = words_a.intersection(&words_b).count();
let union = words_a.union(&words_b).count();
if union == 0 {
return 0.0;
}
intersection as f32 / union as f32
}
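/// A single conversation turn: speaker role, text content, and any structured
/// metadata carried alongside it.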
#[derive(Debug, Clone)]
pub struct Message {
pub role: String,
pub content: String,
pub metadata: Option<serde_json::Value>,
}
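/// Counters for one preprocessing pass, broken down by the reason each
/// message was dropped.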
#[derive(Debug, Clone, Default)]
pub struct PreprocessingStats {
pub total_input: usize,
pub filtered_tool_artifacts: usize,
pub filtered_cli_output: usize,
pub filtered_metadata: usize,
pub filtered_empty: usize,
pub filtered_duplicates: usize,
pub filtered_below_min_length: usize,
pub total_output: usize,
}
impl PreprocessingStats {
pub fn total_filtered(&self) -> usize {
self.filtered_tool_artifacts
+ self.filtered_cli_output
+ self.filtered_metadata
+ self.filtered_empty
+ self.filtered_duplicates
+ self.filtered_below_min_length
}
pub fn filter_rate(&self) -> f32 {
if self.total_input == 0 {
return 0.0;
}
self.total_filtered() as f32 / self.total_input as f32
}
}
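/// Applies the configured filters to raw transcript text. The dedupe cache
/// persists across calls, so reuse one instance per logical corpus or call
/// `reset_dedupe_cache` between conversations.
///
/// A minimal usage sketch, assuming `messages: Vec<Message>` was built
/// upstream (illustrative only):
///
/// ```ignore
/// let mut pre = Preprocessor::with_defaults();
/// let (kept, stats) = pre.filter_conversation(messages);
/// println!("kept {} of {}", stats.total_output, stats.total_input);
/// ```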
pub struct Preprocessor {
config: PreprocessingConfig,
seen_hashes: HashSet<u64>,
}
impl Preprocessor {
pub fn new(config: PreprocessingConfig) -> Self {
Self {
config,
seen_hashes: HashSet::new(),
}
}
pub fn with_defaults() -> Self {
Self::new(PreprocessingConfig::default())
}
pub fn reset_dedupe_cache(&mut self) {
self.seen_hashes.clear();
}
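/// Cleans a single message and returns it if it survives the minimum-length
/// check and deduplication. Unlike `filter_conversation`, duplicates are
/// detected by exact normalized-hash equality rather than word-overlap
/// similarity.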
pub fn filter_message(&mut self, content: &str) -> Option<String> {
let cleaned = self.extract_semantic_content(content);
if cleaned.len() < self.config.min_content_length {
return None;
}
if self.config.dedupe_threshold < 1.0 {
// HashSet::insert returns false when the value was already present.
let hash = content_hash(&cleaned);
if !self.seen_hashes.insert(hash) {
return None;
}
}
Some(cleaned)
}
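/// Filters a whole conversation: drops messages that are mostly tool
/// artifacts or CLI noise, cleans the rest, and removes empty, too-short,
/// and near-duplicate messages (word-overlap similarity at or above
/// `dedupe_threshold`). The similarity pass compares each message against
/// every kept predecessor, so it is quadratic in conversation length.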
pub fn filter_conversation(
&mut self,
messages: Vec<Message>,
) -> (Vec<Message>, PreprocessingStats) {
let mut stats = PreprocessingStats {
total_input: messages.len(),
..Default::default()
};
let mut result = Vec::new();
let mut previous_contents: Vec<String> = Vec::new();
for msg in messages {
if self.config.remove_tool_artifacts && self.is_mostly_tool_artifact(&msg.content) {
stats.filtered_tool_artifacts += 1;
continue;
}
if self.config.remove_cli_output && self.is_mostly_cli_output(&msg.content) {
stats.filtered_cli_output += 1;
continue;
}
let cleaned = self.extract_semantic_content(&msg.content);
if self.config.remove_empty_content && cleaned.trim().is_empty() {
stats.filtered_empty += 1;
continue;
}
if cleaned.len() < self.config.min_content_length {
stats.filtered_below_min_length += 1;
continue;
}
if self.config.dedupe_threshold < 1.0 {
let is_duplicate = previous_contents
.iter()
.any(|prev| content_similarity(prev, &cleaned) >= self.config.dedupe_threshold);
if is_duplicate {
stats.filtered_duplicates += 1;
continue;
}
}
previous_contents.push(cleaned.clone());
result.push(Message {
role: msg.role,
content: cleaned,
metadata: msg.metadata,
});
}
stats.total_output = result.len();
info!(
"Preprocessing complete: {}/{} messages kept ({:.1}% filtered)",
stats.total_output,
stats.total_input,
stats.filter_rate() * 100.0
);
(result, stats)
}
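/// Strips tool-call markup, CLI output, and metadata from `raw` according to
/// the config, then collapses runs of blank lines and repeated spaces.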
pub fn extract_semantic_content(&self, raw: &str) -> String {
let regexes = preprocessing_regexes();
let mut result = raw.to_string();
if self.config.remove_tool_artifacts {
result = regexes
.function_calls_block
.replace_all(&result, "")
.to_string();
result = regexes
.antml_invoke_block
.replace_all(&result, "")
.to_string();
result = regexes
.antml_parameter_block
.replace_all(&result, "")
.to_string();
result = regexes
.function_results_block
.replace_all(&result, "")
.to_string();
result = regexes.result_block.replace_all(&result, "").to_string();
result = regexes
.tool_output_tags
.replace_all(&result, "")
.to_string();
}
if self.config.remove_cli_output {
result = regexes
.git_status_output
.replace_all(&result, "")
.to_string();
result = regexes.git_diff_output.replace_all(&result, "").to_string();
result = regexes.cargo_output.replace_all(&result, "").to_string();
result = regexes.npm_output.replace_all(&result, "").to_string();
result = regexes.file_listing.replace_all(&result, "").to_string();
result = regexes.tree_output.replace_all(&result, "").to_string();
}
if self.config.remove_metadata {
result = regexes
.uuid_pattern
.replace_all(&result, "[UUID]")
.to_string();
result = regexes
.timestamp_iso
.replace_all(&result, "[TIMESTAMP]")
.to_string();
result = regexes
.unix_timestamp
.replace_all(&result, "[TIMESTAMP]")
.to_string();
result = regexes
.session_id_pattern
.replace_all(&result, "")
.to_string();
result = regexes
.file_path_metadata
.replace_all(&result, "")
.to_string();
}
if self.config.remove_empty_content {
result = regexes
.empty_content_json
.replace_all(&result, "")
.to_string();
result = regexes.empty_text_json.replace_all(&result, "").to_string();
result = regexes
.placeholder_message
.replace_all(&result, "")
.to_string();
}
result = regexes
.multiple_newlines
.replace_all(&result, "\n\n")
.to_string();
result = regexes
.multiple_spaces
.replace_all(&result, " ")
.to_string();
result.trim().to_string()
}
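/// Returns true when stripping tool-call blocks removes more than 80% of the
/// message, i.e. the message is effectively all tool traffic.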
fn is_mostly_tool_artifact(&self, content: &str) -> bool {
let regexes = preprocessing_regexes();
let original_len = content.len();
if original_len == 0 {
return false;
}
let mut cleaned = content.to_string();
cleaned = regexes
.function_calls_block
.replace_all(&cleaned, "")
.to_string();
cleaned = regexes
.antml_invoke_block
.replace_all(&cleaned, "")
.to_string();
cleaned = regexes
.antml_parameter_block
.replace_all(&cleaned, "")
.to_string();
cleaned = regexes
.function_results_block
.replace_all(&cleaned, "")
.to_string();
cleaned = regexes.result_block.replace_all(&cleaned, "").to_string();
let remaining_len = cleaned.trim().len();
let artifact_ratio = 1.0 - (remaining_len as f32 / original_len as f32);
artifact_ratio > 0.8
}
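/// Returns true when more than 70% of the lines match a known CLI-output
/// pattern (git, cargo, npm, `ls -l`, or `tree`).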
fn is_mostly_cli_output(&self, content: &str) -> bool {
let regexes = preprocessing_regexes();
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
return false;
}
let cli_lines = lines
.iter()
.filter(|line| {
regexes.git_status_output.is_match(line)
|| regexes.git_diff_output.is_match(line)
|| regexes.cargo_output.is_match(line)
|| regexes.npm_output.is_match(line)
|| regexes.file_listing.is_match(line)
|| regexes.tree_output.is_match(line)
})
.count();
let cli_ratio = cli_lines as f32 / lines.len() as f32;
cli_ratio > 0.7
}
}
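/// Heuristic quality scores for a chunking pass. `overall` is a weighted
/// blend: 50% sentence integrity, 30% word integrity, 20% chunk-size quality.
///
/// A minimal sketch of gating a chunker on these metrics, given `original`
/// text and `chunks` produced elsewhere (illustrative only):
///
/// ```ignore
/// let metrics = TextIntegrityMetrics::compute(&original, &chunks);
/// if !metrics.passes_threshold() {
///     eprintln!("{metrics}"); // Display prints e.g. "⚠️ WARN: 82.0% (...)"
/// }
/// ```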
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextIntegrityMetrics {
pub sentence_integrity: f32,
pub word_integrity: f32,
pub chunk_quality: f32,
pub overall: f32,
pub chunk_count: usize,
pub avg_chunk_length: usize,
}
impl TextIntegrityMetrics {
pub const THRESHOLD: f32 = 0.90;
pub const OPTIMAL_MIN: usize = 200;
pub const OPTIMAL_MAX: usize = 800;
pub fn compute(original: &str, chunks: &[String]) -> Self {
if chunks.is_empty() {
return Self {
sentence_integrity: 0.0,
word_integrity: 0.0,
chunk_quality: 0.0,
overall: 0.0,
chunk_count: 0,
avg_chunk_length: 0,
};
}
let sentence_integrity = Self::compute_sentence_integrity(original, chunks);
let word_integrity = Self::compute_word_integrity(chunks);
let chunk_quality = Self::compute_chunk_quality(chunks);
let overall = sentence_integrity * 0.5 + word_integrity * 0.3 + chunk_quality * 0.2;
let total_chars: usize = chunks.iter().map(|c| c.len()).sum();
let avg_chunk_length = total_chars / chunks.len();
Self {
sentence_integrity,
word_integrity,
chunk_quality,
overall,
chunk_count: chunks.len(),
avg_chunk_length,
}
}
pub fn passes_threshold(&self) -> bool {
self.overall >= Self::THRESHOLD
}
pub fn recommendation(&self) -> IntegrityRecommendation {
if self.overall >= 0.95 {
IntegrityRecommendation::Excellent
} else if self.overall >= Self::THRESHOLD {
IntegrityRecommendation::Good
} else if self.overall >= 0.70 {
IntegrityRecommendation::Warn
} else {
IntegrityRecommendation::Purge
}
}
fn compute_sentence_integrity(original: &str, chunks: &[String]) -> f32 {
let original_sentences = Self::count_sentences(original);
if original_sentences == 0 {
return 1.0;
}
let preserved_sentences: usize = chunks
.iter()
.map(|c| Self::count_complete_sentences(c))
.sum();
let ratio = preserved_sentences as f32 / original_sentences as f32;
ratio.min(1.0)
}
fn compute_word_integrity(chunks: &[String]) -> f32 {
if chunks.is_empty() {
return 1.0;
}
let complete_endings = chunks
.iter()
.filter(|c| Self::ends_at_word_boundary(c))
.count();
complete_endings as f32 / chunks.len() as f32
}
fn compute_chunk_quality(chunks: &[String]) -> f32 {
if chunks.is_empty() {
return 0.0;
}
let optimal_count = chunks
.iter()
.filter(|c| {
let len = c.len();
(Self::OPTIMAL_MIN..=Self::OPTIMAL_MAX).contains(&len)
})
.count();
optimal_count as f32 / chunks.len() as f32
}
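/// Counts terminal punctuation as a sentence proxy; abbreviations and
/// decimal points inflate the count, which the callers tolerate (the
/// integrity ratio is clamped to 1.0).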
fn count_sentences(text: &str) -> usize {
text.chars()
.filter(|&c| c == '.' || c == '!' || c == '?')
.count()
}
fn count_complete_sentences(chunk: &str) -> usize {
let trimmed = chunk.trim();
if trimmed.is_empty() {
return 0;
}
let sentences = Self::count_sentences(trimmed);
if Self::ends_at_sentence_boundary(trimmed) {
sentences
} else {
sentences.saturating_sub(1)
}
}
fn ends_at_sentence_boundary(text: &str) -> bool {
let trimmed = text.trim();
if trimmed.is_empty() {
return true;
}
let last_char = trimmed.chars().last().unwrap_or(' ');
matches!(last_char, '.' | '!' | '?' | ':' | '"' | '\'' | ')' | ']')
}
fn ends_at_word_boundary(text: &str) -> bool {
let trimmed = text.trim_end();
if trimmed.is_empty() {
return true;
}
// After trim_end the final char can never be whitespace, so a chunk only
// counts as word-complete when it ends in punctuation.
let last_char = trimmed.chars().last().unwrap_or(' ');
last_char.is_ascii_punctuation()
}
}
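/// Coarse verdict derived from `overall`: >= 0.95 Excellent, >= 0.90 Good,
/// >= 0.70 Warn, below that Purge.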
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IntegrityRecommendation {
Excellent,
Good,
Warn,
Purge,
}
impl IntegrityRecommendation {
pub fn as_str(&self) -> &'static str {
match self {
Self::Excellent => "EXCELLENT",
Self::Good => "GOOD",
Self::Warn => "WARN",
Self::Purge => "PURGE",
}
}
pub fn emoji(&self) -> &'static str {
match self {
Self::Excellent => "✅",
Self::Good => "✅",
Self::Warn => "⚠️",
Self::Purge => "❌",
}
}
}
impl std::fmt::Display for TextIntegrityMetrics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let rec = self.recommendation();
write!(
f,
"{} {}: {:.1}% (sentence: {:.1}%, word: {:.1}%, chunk: {:.1}%) - {} chunks, avg {}ch",
rec.emoji(),
rec.as_str(),
self.overall * 100.0,
self.sentence_integrity * 100.0,
self.word_integrity * 100.0,
self.chunk_quality * 100.0,
self.chunk_count,
self.avg_chunk_length
)
}
}
#[cfg(test)]
impl Message {
pub fn new(role: impl Into<String>, content: impl Into<String>) -> Self {
Self {
role: role.into(),
content: content.into(),
metadata: None,
}
}
}
#[cfg(test)]
mod integrity_tests {
use super::*;
#[test]
fn test_perfect_integrity() {
let original = "This is the first sentence with some padding text to make it longer. \
Here is another sentence that continues the thought and adds context. \
The third sentence provides more information about the topic at hand. \
Finally we conclude with a fourth sentence that wraps everything up nicely.";
let chunks = vec![
"This is the first sentence with some padding text to make it longer. \
Here is another sentence that continues the thought and adds context."
.to_string(),
"The third sentence provides more information about the topic at hand. \
Finally we conclude with a fourth sentence that wraps everything up nicely."
.to_string(),
];
let metrics = TextIntegrityMetrics::compute(original, &chunks);
assert!(
metrics.sentence_integrity >= 0.9,
"sentence_integrity: {}",
metrics.sentence_integrity
);
assert!(
metrics.word_integrity >= 0.9,
"word_integrity: {}",
metrics.word_integrity
);
assert!(metrics.overall >= 0.75, "overall: {}", metrics.overall);
}
#[test]
fn test_poor_integrity() {
let original = "This is a complete sentence with many words.";
let chunks = vec![
"This is a compl".to_string(), "ete sentence wi".to_string(), "th many words".to_string(), ];
let metrics = TextIntegrityMetrics::compute(original, &chunks);
assert!(metrics.word_integrity < 0.9);
assert!(!metrics.passes_threshold());
assert_eq!(metrics.recommendation(), IntegrityRecommendation::Purge);
}
#[test]
fn test_empty_chunks() {
let original = "Some text";
let chunks: Vec<String> = vec![];
let metrics = TextIntegrityMetrics::compute(original, &chunks);
assert_eq!(metrics.chunk_count, 0);
assert_eq!(metrics.overall, 0.0);
}
#[test]
fn test_recommendation_levels() {
let m = TextIntegrityMetrics {
sentence_integrity: 1.0,
word_integrity: 1.0,
chunk_quality: 0.9,
overall: 0.97,
chunk_count: 10,
avg_chunk_length: 400,
};
assert_eq!(m.recommendation(), IntegrityRecommendation::Excellent);
let m = TextIntegrityMetrics { overall: 0.92, ..m };
assert_eq!(m.recommendation(), IntegrityRecommendation::Good);
let m = TextIntegrityMetrics { overall: 0.75, ..m };
assert_eq!(m.recommendation(), IntegrityRecommendation::Warn);
let m = TextIntegrityMetrics { overall: 0.50, ..m };
assert_eq!(m.recommendation(), IntegrityRecommendation::Purge);
}
}