use crate::common::test_db_manager::TestDatabaseManager;
use codex_memory::chunking::{ChunkingStrategy, FileChunker};
use codex_memory::error::Result;
use codex_memory::mcp_server::handlers::MCPHandlers;
use codex_memory::storage::Storage;
use serde_json::json;
use std::sync::Arc;
use tokio::fs;
/// End-to-end check of the `store_file` tool on a file large enough to need
/// several chunks: validates the response fields and every stored chunk.
#[tokio::test]
async fn test_store_file_basic_functionality() -> Result<()> {
    let mut db_manager = TestDatabaseManager::new()?;
    let pool = db_manager.setup_test_database().await?;
    let storage = Arc::new(Storage::new(pool));
    let handlers = MCPHandlers::new(storage.clone());

    // The UUID goes into both the file name and the content, so every run
    // works with a distinct file and distinct text.
    let temp_dir = std::env::temp_dir();
    let unique_id = uuid::Uuid::new_v4();
    let test_file_path = temp_dir.join(format!("test_store_file_{}.txt", unique_id));
    let mut test_content = format!("Test file content for basic functionality test. Unique ID: {}\n", unique_id);
    test_content.push_str(&"This is sample content that should be split into multiple chunks when processed. ".repeat(50));
    test_content.push_str(&format!("\nEnd of test content. ID: {}", unique_id));
    fs::write(&test_file_path, &test_content).await.unwrap();

    let params = json!({
        "file_path": test_file_path.to_string_lossy(),
        "chunk_size": 1500,
        "overlap": 200,
        "chunking_strategy": "hybrid",
        "tags": ["test", "document"]
    });

    // The file is not read again after the tool call, so delete it before
    // propagating errors or asserting; otherwise any failure below would leave
    // it behind in the temp directory.
    let outcome = handlers.handle_tool_call("store_file", params).await;
    fs::remove_file(&test_file_path).await.ok();
    let result = outcome?;

    // Response shape.
    assert!(result["file_path"].is_string());
    assert!(result["file_size"].is_number());
    assert!(result["chunks_created"].is_number());
    assert!(result["chunk_ids"].is_array());
    assert!(result["chunking_strategy"].is_string());
    assert!(result["message"].is_string());

    let chunks_created = result["chunks_created"].as_u64().unwrap();
    assert!(chunks_created > 1, "Should create multiple chunks");
    let chunk_ids = result["chunk_ids"].as_array().unwrap();
    assert_eq!(chunk_ids.len() as u64, chunks_created);

    // Every reported id must resolve to a stored, non-empty, correctly tagged chunk.
    for chunk_id_value in chunk_ids {
        let chunk_id = uuid::Uuid::parse_str(chunk_id_value.as_str().expect("chunk id should be a JSON string"))
            .map_err(|e| codex_memory::error::Error::InternalError(format!("UUID parse error: {}", e)))?;
        let memory = storage
            .get(chunk_id)
            .await?
            .expect("Chunk should be stored in database");
        assert!(!memory.content.is_empty(), "Chunk content should not be empty");
        assert!(memory.tags.contains(&"test".to_string()), "Should contain test tag");
        assert!(memory.tags.contains(&"document".to_string()), "Should contain document tag");
        assert!(memory.tags.iter().any(|t| t.starts_with("chunk_")), "Should have chunk number tag");
        assert!(memory.tags.iter().any(|t| t.starts_with("strategy_")), "Should have strategy tag");
    }

    db_manager.cleanup().await?;
    Ok(())
}
#[tokio::test]
async fn test_chunking_strategies_comparison() -> Result<()> {
let mut db_manager = TestDatabaseManager::new()?;
let pool = db_manager.setup_test_database().await?;
let storage = Arc::new(Storage::new(pool));
let handlers = MCPHandlers::new(storage);
let temp_dir = std::env::temp_dir();
let test_file_path = temp_dir.join("test_chunking_strategies.md");
let test_content = "# Introduction\n\nThis is the introduction paragraph. It explains the purpose of the document.\n\n## First Section\n\nThis section contains detailed information. It has multiple sentences that provide context and examples.\n\n### Subsection\n\nA subsection with code:\n\n```python\ndef hello_world():\n print(\"Hello, world!\")\n return True\n```\n\n## Second Section\n\nThis is another major section. It provides additional information and concludes the document.\n\nThe end.";
fs::write(&test_file_path, test_content).await.unwrap();
let strategies = ["sentence", "paragraph", "semantic", "hybrid"];
let mut results = Vec::new();
for strategy in strategies {
let unique_content = format!("{}\n\n--- {} strategy test ---", test_content, strategy);
let strategy_file_path = temp_dir.join(format!("test_chunking_strategy_{}.md", strategy));
fs::write(&strategy_file_path, &unique_content).await.unwrap();
let params = json!({
"file_path": strategy_file_path.to_string_lossy(),
"chunk_size": 2000,
"overlap": 300,
"chunking_strategy": strategy,
"tags": [format!("test_{}", strategy)]
});
let result = handlers.handle_tool_call("store_file", params).await?;
let chunks_created = result["chunks_created"].as_u64().unwrap();
results.push((strategy, chunks_created));
let recorded_strategy = result["chunking_strategy"].as_str().unwrap().to_lowercase();
assert_eq!(recorded_strategy, strategy.to_lowercase());
fs::remove_file(&strategy_file_path).await.ok();
}
let unique_counts: std::collections::HashSet<_> = results.iter().map(|(_, count)| count).collect();
assert!(unique_counts.len() >= 2, "Different strategies should produce different chunk counts: {:?}", results);
fs::remove_file(&test_file_path).await.ok();
db_manager.cleanup().await?;
Ok(())
}
#[tokio::test]
async fn test_store_file_error_handling() -> Result<()> {
let mut db_manager = TestDatabaseManager::new()?;
let pool = db_manager.setup_test_database().await?;
let storage = Arc::new(Storage::new(pool));
let handlers = MCPHandlers::new(storage);
let params = json!({
"file_path": "/path/that/does/not/exist.txt"
});
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for non-existent file");
let temp_dir = std::env::temp_dir();
let test_file_path = temp_dir.join("test_error_handling.txt");
fs::write(&test_file_path, "test content").await.unwrap();
let params = json!({
"file_path": test_file_path.to_string_lossy(),
"chunk_size": 100 });
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for chunk size too small");
let params = json!({
"file_path": test_file_path.to_string_lossy(),
"chunk_size": 200000 });
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for chunk size too large");
let params = json!({
"file_path": test_file_path.to_string_lossy(),
"chunk_size": 2000,
"overlap": 1500 });
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for overlap too large");
let params = json!({
"chunk_size": 2000
});
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for missing file_path");
fs::remove_file(&test_file_path).await.ok();
db_manager.cleanup().await?;
Ok(())
}
/// Exercises `FileChunker` directly with each `ChunkingStrategy` on a small
/// mixed text (sentences, paragraphs, a fenced code block) and checks that
/// every strategy yields chunks and that the strategies do not all agree on
/// the chunk count.
#[tokio::test]
async fn test_chunking_algorithms_directly() -> Result<()> {
    let test_content = "First sentence. Second sentence! Third question? \n\nNew paragraph starts here. It continues with more text. And even more content.\n\n```code\nlet x = 42;\nprintln!(\"{}\", x);\n```\n\nFinal paragraph.";

    let sentence_chunker = FileChunker::with_strategy(100, 20, ChunkingStrategy::Sentence);
    let sentence_chunks = sentence_chunker.chunk_content(test_content)?;
    assert!(!sentence_chunks.is_empty(), "Sentence chunking should produce chunks");
    for chunk in &sentence_chunks {
        let content = &chunk.content;
        if content.len() > 20 {
            // A chunk with a trailing space must, once trimmed, end in
            // sentence punctuation. On failure, show the last 20 characters
            // in reading order (not reversed) so the message is legible.
            assert!(
                !content.ends_with(' ') || content.trim().ends_with(['.', '!', '?']),
                "Chunk should end at sentence boundary: '{}'",
                content
                    .chars()
                    .skip(content.chars().count().saturating_sub(20))
                    .collect::<String>()
            );
        }
    }

    let paragraph_chunker = FileChunker::with_strategy(200, 0, ChunkingStrategy::Paragraph);
    let paragraph_chunks = paragraph_chunker.chunk_content(test_content)?;
    assert!(!paragraph_chunks.is_empty(), "Paragraph chunking should produce chunks");

    let semantic_chunker = FileChunker::with_strategy(150, 30, ChunkingStrategy::Semantic);
    let semantic_chunks = semantic_chunker.chunk_content(test_content)?;
    assert!(!semantic_chunks.is_empty(), "Semantic chunking should produce chunks");

    let hybrid_chunker = FileChunker::with_strategy(120, 25, ChunkingStrategy::Hybrid);
    let hybrid_chunks = hybrid_chunker.chunk_content(test_content)?;
    assert!(!hybrid_chunks.is_empty(), "Hybrid chunking should produce chunks");

    let sentence_count = sentence_chunks.len();
    let paragraph_count = paragraph_chunks.len();
    let semantic_count = semantic_chunks.len();
    let hybrid_count = hybrid_chunks.len();
    println!("Chunk counts - Sentence: {}, Paragraph: {}, Semantic: {}, Hybrid: {}",
        sentence_count, paragraph_count, semantic_count, hybrid_count);

    // At least two strategies must disagree on the number of chunks.
    let unique_counts: std::collections::HashSet<usize> =
        [sentence_count, paragraph_count, semantic_count, hybrid_count]
            .into_iter()
            .collect();
    assert!(unique_counts.len() >= 2, "Different chunking strategies should produce different results");
    Ok(())
}
#[tokio::test]
async fn test_file_size_limits() -> Result<()> {
let mut db_manager = TestDatabaseManager::new()?;
let pool = db_manager.setup_test_database().await?;
let storage = Arc::new(Storage::new(pool));
let handlers = MCPHandlers::new(storage);
let temp_dir = std::env::temp_dir();
let large_file_path = temp_dir.join("large_test_file.txt");
let large_content = "a".repeat(11 * 1024 * 1024);
fs::write(&large_file_path, large_content).await.unwrap();
let params = json!({
"file_path": large_file_path.to_string_lossy(),
"chunk_size": 8000,
"overlap": 200
});
let result = handlers.handle_tool_call("store_file", params).await;
assert!(result.is_err(), "Should fail for files larger than 10MB");
let error_msg = format!("{}", result.unwrap_err());
assert!(error_msg.to_lowercase().contains("file size") || error_msg.to_lowercase().contains("exceeds"),
"Error should mention file size limit: {}", error_msg);
fs::remove_file(&large_file_path).await.ok();
db_manager.cleanup().await?;
Ok(())
}
/// Chunks a short run of numbered words with the hybrid strategy and prints,
/// for each pair of neighbouring chunks, their byte ranges, a 30-character
/// preview, and the words the two chunks have in common.
///
/// NOTE(review): this test only prints diagnostics; it contains no assertions,
/// so it fails only if `chunk_content` returns an error. The arguments
/// `50, 10` are presumably chunk size and overlap; confirm against
/// `FileChunker::with_strategy`.
#[tokio::test]
async fn test_chunk_overlap() -> Result<()> {
    let test_content = "Word1 Word2 Word3 Word4 Word5 Word6 Word7 Word8 Word9 Word10 Word11 Word12 Word13 Word14 Word15 Word16 Word17 Word18 Word19 Word20";
    let chunker = FileChunker::with_strategy(50, 10, ChunkingStrategy::Hybrid);
    let chunks = chunker.chunk_content(test_content)?;

    // Pair every chunk with its successor; with fewer than two chunks the
    // iterator is empty and nothing is printed.
    let neighbours = chunks.iter().zip(chunks.iter().skip(1));
    for (index, (left, right)) in neighbours.enumerate() {
        let left_words: std::collections::HashSet<_> = left.content.split_whitespace().collect();
        let right_words: std::collections::HashSet<_> = right.content.split_whitespace().collect();
        let shared: Vec<_> = left_words.intersection(&right_words).collect();

        let left_preview = left.content.chars().take(30).collect::<String>();
        let right_preview = right.content.chars().take(30).collect::<String>();

        println!("Chunk {} ({}..{}): '{}'", index, left.start_byte, left.end_byte, left_preview);
        println!("Chunk {} ({}..{}): '{}'", index + 1, right.start_byte, right.end_byte, right_preview);
        println!("Overlapping words: {:?}", shared);
    }

    Ok(())
}
/// Verifies that a file much smaller than the chunk size is stored as exactly
/// one chunk whose content matches the file and which carries the requested tags.
#[tokio::test]
async fn test_single_chunk_files() -> Result<()> {
    let mut db_manager = TestDatabaseManager::new()?;
    let pool = db_manager.setup_test_database().await?;
    let storage = Arc::new(Storage::new(pool));
    let handlers = MCPHandlers::new(storage.clone());

    // Unique name per run so concurrent runs sharing the temp directory do
    // not collide.
    let small_file_path = std::env::temp_dir()
        .join(format!("small_test_file_{}.txt", uuid::Uuid::new_v4()));
    let small_content = "This is a small file that should fit in a single chunk.";
    fs::write(&small_file_path, small_content).await.unwrap();

    let params = json!({
        "file_path": small_file_path.to_string_lossy(),
        "chunk_size": 8000,
        "overlap": 200,
        "chunking_strategy": "hybrid",
        "tags": ["small", "single_chunk"]
    });

    // The file is not read again after the tool call, so remove it before
    // propagating errors or asserting; no failure path can then leak it.
    let outcome = handlers.handle_tool_call("store_file", params).await;
    fs::remove_file(&small_file_path).await.ok();
    let result = outcome?;

    let chunks_created = result["chunks_created"].as_u64().unwrap();
    assert_eq!(chunks_created, 1, "Small file should create exactly one chunk");
    let chunk_ids = result["chunk_ids"].as_array().unwrap();
    assert_eq!(chunk_ids.len(), 1);

    let chunk_id = uuid::Uuid::parse_str(chunk_ids[0].as_str().expect("chunk id should be a JSON string"))
        .map_err(|e| codex_memory::error::Error::InternalError(format!("UUID parse error: {}", e)))?;
    let memory = storage
        .get(chunk_id)
        .await?
        .expect("Chunk should be stored in database");

    // Compare trimmed text on both sides, as the original check did.
    assert_eq!(memory.content.trim(), small_content.trim());
    assert!(memory.tags.contains(&"small".to_string()));
    assert!(memory.tags.contains(&"single_chunk".to_string()));

    db_manager.cleanup().await?;
    Ok(())
}