use super::*;
/// Pins the values produced by `PreprocessingConfig::default()`.
#[test]
fn test_default_config() {
    let defaults = PreprocessingConfig::default();

    // Boolean switches: artifact and CLI stripping are on, metadata stripping is off.
    assert!(defaults.remove_tool_artifacts);
    assert!(defaults.remove_cli_output);
    assert!(!defaults.remove_metadata);

    // Numeric thresholds.
    assert_eq!(defaults.min_content_length, 50);
    assert_eq!(defaults.dedupe_threshold, 0.95);
}
/// With a default preprocessor, ISO timestamps and UUIDs are expected to pass
/// through `extract_semantic_content` untouched, with no placeholder inserted.
#[test]
fn test_timestamps_preserved_by_default() {
    let pre = Preprocessor::with_defaults();
    let raw = "Meeting at 2024-11-13T10:30:00Z with UUID 550e8400-e29b-41d4-a716-446655440000.";
    let cleaned = pre.extract_semantic_content(raw);

    // Timestamp: the literal stays and the placeholder is absent.
    assert!(
        cleaned.contains("2024-11-13T10:30:00Z"),
        "Timestamp should be preserved by default"
    );
    assert!(
        !cleaned.contains("[TIMESTAMP]"),
        "Timestamps should NOT be sanitized by default"
    );

    // UUID: the literal stays and the placeholder is absent.
    assert!(
        cleaned.contains("550e8400-e29b-41d4-a716-446655440000"),
        "UUID should be preserved by default"
    );
    assert!(
        !cleaned.contains("[UUID]"),
        "UUIDs should NOT be sanitized by default"
    );
}
/// A `function_calls` block embedded in prose should be stripped while the
/// sentences on either side of it are kept.
#[test]
fn test_remove_tool_artifacts_function_calls() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"Here is some context. <function_calls><invoke name="test"><parameter>value</parameter></invoke></function_calls> And here is the result."#;
    let cleaned = pre.extract_semantic_content(raw);

    // The tool-call markup must be gone...
    assert!(!cleaned.contains("function_calls"));
    assert!(!cleaned.contains("invoke"));

    // ...and the surrounding prose must remain.
    assert!(cleaned.contains("Here is some context"));
    assert!(cleaned.contains("And here is the result"));
}
/// Namespaced (`antml:`-prefixed) invoke tags should be removed while the text
/// before and after them is kept.
#[test]
fn test_remove_tool_artifacts_antml_tags() {
    let pre = Preprocessor::with_defaults();

    // The tags are assembled at runtime from their parts
    // (presumably to keep the literal tag text out of this source file).
    let opening = format!("<{}:{} name=\"test\">", "antml", "invoke");
    let closing = format!("</{}:{}>", "antml", "invoke");
    let raw = format!("Before {}content inside{} After", opening, closing);

    let cleaned = pre.extract_semantic_content(&raw);

    assert!(!cleaned.contains("antml"));
    assert!(cleaned.contains("Before"));
    assert!(cleaned.contains("After"));
}
/// `git status` lines embedded between two prose sentences should be stripped,
/// leaving only the prose.
#[test]
fn test_remove_git_status_output() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"Looking at the repo:
On branch main
Your branch is up to date with 'origin/main'.
Changes not staged for commit:
modified: src/lib.rs
modified: Cargo.toml
Now let me fix that."#;
    let cleaned = pre.extract_semantic_content(raw);

    // None of the git-status fragments may survive.
    for noise in [
        "On branch",
        "Your branch is",
        "Changes not staged",
        "modified:",
    ] {
        assert!(
            !cleaned.contains(noise),
            "git output fragment {:?} should have been removed",
            noise
        );
    }

    // The human-written lines around the output must still be there.
    assert!(cleaned.contains("Looking at the repo"));
    assert!(cleaned.contains("Now let me fix that"));
}
/// Cargo progress lines (`Compiling`, `Finished`, `Running`) should be stripped
/// while the surrounding prose lines are kept.
#[test]
fn test_remove_cargo_output() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"Building the project:
Compiling rust-memex v0.1.0
Compiling tokio v1.0.0
Finished release [optimized] target(s) in 2.34s
Running target/release/rust_memex
Build complete!"#;
    let cleaned = pre.extract_semantic_content(raw);

    // Cargo status keywords must no longer appear anywhere in the output.
    for noise in ["Compiling", "Finished", "Running"] {
        assert!(
            !cleaned.contains(noise),
            "cargo output fragment {:?} should have been removed",
            noise
        );
    }

    // First and last lines are prose and must be kept.
    assert!(cleaned.contains("Building the project"));
    assert!(cleaned.contains("Build complete"));
}
/// `ls -l`-style listing lines should be stripped while the surrounding prose
/// lines are kept.
#[test]
fn test_remove_file_listing() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"Directory contents:
total 42
drwxr-xr-x 5 user staff 160 Dec 24 10:00 src
-rw-r--r-- 1 user staff 1234 Dec 24 09:00 Cargo.toml
-rw-r--r-- 1 user staff 567 Dec 24 08:00 README.md
That's what we have."#;
    let cleaned = pre.extract_semantic_content(raw);

    // Listing header and directory entry are gone.
    assert!(!cleaned.contains("total 42"));
    assert!(!cleaned.contains("drwxr-xr-x"));

    // Prose before and after the listing remains.
    assert!(cleaned.contains("Directory contents"));
    assert!(cleaned.contains("That's what we have"));
}
/// With `remove_metadata` enabled, a UUID in the text should be replaced by the
/// `[UUID]` placeholder and the surrounding words kept.
#[test]
fn test_remove_uuid() {
    let config = PreprocessingConfig {
        remove_metadata: true,
        ..Default::default()
    };
    let pre = Preprocessor::new(config);

    let raw = "Session 550e8400-e29b-41d4-a716-446655440000 started. Working on the task.";
    let cleaned = pre.extract_semantic_content(raw);

    // The UUID is swapped for its placeholder.
    assert!(!cleaned.contains("550e8400-e29b-41d4-a716-446655440000"));
    assert!(cleaned.contains("[UUID]"));

    // Neighbouring words are untouched.
    assert!(cleaned.contains("Session"));
    assert!(cleaned.contains("started"));
}
/// With `remove_metadata` enabled, ISO-8601 timestamps (both `Z` and numeric
/// offset forms appear in the input) should be replaced by `[TIMESTAMP]`.
#[test]
fn test_remove_timestamps() {
    let config = PreprocessingConfig {
        remove_metadata: true,
        ..Default::default()
    };
    let pre = Preprocessor::new(config);

    let raw = "Created at 2024-12-24T10:30:00Z. Last modified 2024-12-24T11:00:00+01:00.";
    let cleaned = pre.extract_semantic_content(raw);

    // Neither timestamp's date prefix may remain; the placeholder must appear.
    assert!(!cleaned.contains("2024-12-24T"));
    assert!(cleaned.contains("[TIMESTAMP]"));

    // The labels in front of the timestamps are kept.
    assert!(cleaned.contains("Created at"));
    assert!(cleaned.contains("Last modified"));
}
/// With `remove_metadata` enabled, session identifiers written as
/// `session_id: value` or `sessionId="value"` should not survive verbatim.
#[test]
fn test_remove_session_id() {
    let config = PreprocessingConfig {
        remove_metadata: true,
        ..Default::default()
    };
    let pre = Preprocessor::new(config);

    let raw = r#"Request with session_id: abc123xyz. Also sessionId="def456"."#;
    let cleaned = pre.extract_semantic_content(raw);

    // Both spellings of the session id must be gone in their original form.
    assert!(!cleaned.contains("session_id: abc123xyz"));
    assert!(!cleaned.contains("sessionId=\"def456\""));

    // The leading prose is kept.
    assert!(cleaned.contains("Request with"));
}
/// An empty `"content": []` JSON fragment should be removed by the default
/// preprocessor while the surrounding words are kept.
#[test]
fn test_remove_empty_content_json() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"Response: {"content": [], "status": "ok"}. Done."#;
    let cleaned = pre.extract_semantic_content(raw);

    assert!(!cleaned.contains(r#""content": []"#));
    assert!(cleaned.contains("Response"));
    assert!(cleaned.contains("Done"));
}
/// Ordinary prose and bullet lists with no tool or CLI noise should keep their
/// key phrases after extraction.
#[test]
fn test_preserve_semantic_content() {
    let pre = Preprocessor::with_defaults();
    let raw = r#"The preprocessing module filters noise from conversation exports.
It removes tool artifacts, CLI output, and metadata while preserving
the actual semantic content that should be embedded for search.
Key features:
- Pattern-based filtering using regex
- Deduplication with configurable threshold
- Minimum content length requirement"#;
    let cleaned = pre.extract_semantic_content(raw);

    // Every one of these phrases comes from the input and must still be present.
    for phrase in [
        "preprocessing module",
        "filters noise",
        "conversation exports",
        "Key features",
        "Pattern-based filtering",
        "Deduplication",
    ] {
        assert!(
            cleaned.contains(phrase),
            "semantic phrase {:?} should have been preserved",
            phrase
        );
    }
}
/// `filter_message` with default settings should drop a two-character message
/// and accept a full sentence.
#[test]
fn test_filter_message_below_min_length() {
    let mut pre = Preprocessor::with_defaults();

    // Far too short to be worth indexing.
    let too_short = pre.filter_message("Hi");
    assert!(too_short.is_none());

    // A complete sentence should make it through.
    let sentence = "This is a meaningful message with enough content to be indexed properly.";
    let accepted = pre.filter_message(sentence);
    assert!(accepted.is_some());
}
/// Feeding the exact same message to `filter_message` twice should accept the
/// first occurrence and reject the second.
#[test]
fn test_filter_message_duplicates() {
    let config = PreprocessingConfig {
        min_content_length: 10,
        dedupe_threshold: 0.95,
        ..Default::default()
    };
    let mut pre = Preprocessor::new(config);

    let message = "This is a test message that we will see twice.";

    // First sighting is kept.
    let first = pre.filter_message(message);
    assert!(first.is_some());

    // Identical second sighting is filtered out.
    let second = pre.filter_message(message);
    assert!(second.is_none());
}
/// `filter_conversation` over a mix of useful, CLI-only and empty messages
/// should report all inputs in the stats, output fewer messages than it
/// received, and leave neither CLI noise nor empty content in the result.
#[test]
fn test_filter_conversation() {
    let config = PreprocessingConfig {
        min_content_length: 20,
        ..Default::default()
    };
    let mut pre = Preprocessor::new(config);

    let conversation = vec![
        Message::new("user", "How do I implement the preprocessing module?"),
        // Git status output only.
        Message::new("assistant", "On branch main\nmodified: src/lib.rs"),
        Message::new("assistant", "Let me explain the preprocessing approach..."),
        // Empty message.
        Message::new("user", ""),
        Message::new(
            "assistant",
            "Here's the implementation with detailed explanation of the patterns.",
        ),
    ];

    let (kept, stats) = pre.filter_conversation(conversation);

    // All five inputs are counted, and at least one was dropped.
    assert_eq!(stats.total_input, 5);
    assert!(stats.total_output < 5);

    // No surviving message carries git output or is empty.
    assert!(kept.iter().all(|m| !m.content.contains("On branch")));
    assert!(kept.iter().all(|m| !m.content.is_empty()));
}
/// Plain prose must not be classified as "mostly tool artifact".
#[test]
fn test_is_mostly_tool_artifact() {
    let pre = Preprocessor::with_defaults();

    // NOTE(review): this artifact-only fixture is built but never asserted on,
    // so the positive case of `is_mostly_tool_artifact` is not covered here.
    // TODO: confirm the expected result for it and add the assertion, or drop it.
    let _artifact_only = format!(
        "<{}>Some tool call with lots of data and parameters</{}>",
        "function_calls", "function_calls"
    );

    let prose = "Brief intro. Here's the actual content that matters.";
    assert!(!pre.is_mostly_tool_artifact(prose));
}
/// A message consisting only of `git status` lines should count as "mostly CLI
/// output"; one git line sandwiched between prose lines should not.
#[test]
fn test_is_mostly_cli_output() {
    let pre = Preprocessor::with_defaults();

    // Every line here is git output.
    let all_cli = r#"On branch main
Your branch is up to date with 'origin/main'.
Changes not staged for commit:
modified: src/lib.rs
modified: Cargo.toml
nothing to commit"#;

    // Only the middle line is git output.
    let some_cli = r#"Looking at the code:
On branch main
That's the current state."#;

    assert!(pre.is_mostly_cli_output(all_cli));
    assert!(!pre.is_mostly_cli_output(some_cli));
}
/// `content_similarity` should return 1.0 for identical strings, 0.0 for
/// strings sharing no words, and a value strictly in between for partial
/// overlap.
#[test]
fn test_content_similarity() {
    // Extremes: identical input and fully disjoint input.
    assert_eq!(content_similarity("hello world", "hello world"), 1.0);
    assert_eq!(content_similarity("hello world", "foo bar baz"), 0.0);

    // Two of three words shared: strictly between the extremes.
    let partial = content_similarity("hello world test", "hello world foo");
    assert!(0.0 < partial && partial < 1.0);
}
/// Checks the derived figures on a hand-built `PreprocessingStats`.
///
/// The per-reason counters add up to 10 + 15 + 5 + 8 + 2 + 5 = 45, which also
/// equals `total_input - total_output` (100 - 55), so 45 filtered and a rate of
/// about 0.45 are expected.
#[test]
fn test_preprocessing_stats() {
    let sample = PreprocessingStats {
        total_input: 100,
        filtered_tool_artifacts: 10,
        filtered_cli_output: 15,
        filtered_metadata: 5,
        filtered_empty: 8,
        filtered_duplicates: 2,
        filtered_below_min_length: 5,
        total_output: 55,
    };

    assert_eq!(sample.total_filtered(), 45);

    // The rate is a float, so compare with a tolerance.
    let rate = sample.filter_rate();
    assert!((rate - 0.45).abs() < 0.01);
}
/// Runs of spaces and runs of blank lines should be collapsed by
/// `extract_semantic_content`.
#[test]
fn test_whitespace_normalization() {
    let preprocessor = Preprocessor::with_defaults();

    // The fixture must really contain runs of spaces and 3+ consecutive
    // newlines; otherwise the assertions below would not exercise anything.
    let input = "Text   with    multiple   spaces\n\n\n\nand many newlines.";
    let result = preprocessor.extract_semantic_content(input);

    // Single spaces between words are legitimate output, so only a run of two
    // or more spaces indicates that normalization did not happen.
    assert!(
        !result.contains("  "),
        "runs of spaces should be collapsed"
    );
    assert!(
        !result.contains("\n\n\n"),
        "runs of three or more newlines should be collapsed"
    );
}
/// Round-trips the default `PreprocessingConfig` through TOML and checks both
/// the serialized text and the parsed-back values.
#[test]
fn test_config_serialization() {
    let original = PreprocessingConfig::default();

    // Serialize and spot-check two keys in the emitted TOML.
    let serialized = toml::to_string(&original).unwrap();
    assert!(serialized.contains("remove_tool_artifacts = true"));
    assert!(serialized.contains("remove_metadata = false"));

    // Parse it back and compare against the original.
    let restored: PreprocessingConfig = toml::from_str(&serialized).unwrap();
    assert_eq!(restored.min_content_length, original.min_content_length);
    assert!(
        !restored.remove_metadata,
        "remove_metadata should default to false"
    );
}
/// After `reset_dedupe_cache`, a message that was previously rejected as a
/// duplicate should be accepted again.
#[test]
fn test_reset_dedupe_cache() {
    let config = PreprocessingConfig {
        min_content_length: 10,
        dedupe_threshold: 0.95,
        ..Default::default()
    };
    let mut pre = Preprocessor::new(config);

    let message = "This is a test message that we will see again.";

    // Accepted first, then rejected as a repeat.
    assert!(pre.filter_message(message).is_some());
    assert!(pre.filter_message(message).is_none());

    // Resetting the cache makes the same message acceptable once more.
    pre.reset_dedupe_cache();
    assert!(pre.filter_message(message).is_some());
}