use crate::config::MempalaceConfig;
use crate::models::Wing;
use crate::storage::Storage;
use crate::vector_storage::VectorStorage;
use anyhow::{anyhow, Result};
use serde_json::json;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use walkdir::WalkDir;
/// File extensions (each with its leading dot) that mark a file as mineable.
///
/// `get_mineable_files` prepends a `.` to a file's extension and looks the
/// result up in this list with an exact, case-sensitive comparison.
pub const READABLE_EXTENSIONS: &[&str] = &[
".txt", ".md", ".py", ".js", ".ts", ".jsx", ".tsx", ".json", ".yaml", ".yml", ".html", ".css",
".java", ".go", ".rs", ".rb", ".sh", ".csv", ".sql", ".toml",
];
/// Entry names that `get_mineable_files` prunes from its directory walk.
///
/// Matching is on the exact entry name (no globbing, case-sensitive). A
/// pruned directory is not descended into, so nothing below it is visited.
pub const SKIP_DIRS: &[&str] = &[
".git",
"node_modules",
"__pycache__",
".venv",
"venv",
"env",
"dist",
"build",
".next",
"coverage",
".mempalace",
"target",
];
/// Upper bound, in bytes, on the window examined for a single chunk.
pub const CHUNK_SIZE: usize = 800;
/// Number of bytes by which consecutive chunks overlap.
pub const CHUNK_OVERLAP: usize = 100;
/// Chunks whose trimmed length is below this many bytes are discarded.
pub const MIN_CHUNK_SIZE: usize = 50;

/// Returns the largest char boundary of `text` that is `<= index`.
///
/// `index` values past the end of `text` are clamped to `text.len()`.
fn floor_to_char_boundary(text: &str, index: usize) -> usize {
    let mut boundary = index.min(text.len());
    // Offset 0 is always a boundary, so this loop terminates.
    while !text.is_char_boundary(boundary) {
        boundary -= 1;
    }
    boundary
}

/// Splits `content` into overlapping chunks of at most [`CHUNK_SIZE`] bytes.
///
/// The input is trimmed first. Each window prefers to end at the last blank
/// line (`"\n\n"`), or failing that the last newline, provided that break lies
/// in the second half of the window. Consecutive chunks overlap by roughly
/// [`CHUNK_OVERLAP`] bytes. Chunks are trimmed, and those shorter than
/// [`MIN_CHUNK_SIZE`] bytes are dropped.
///
/// All slice offsets are snapped down to UTF-8 char boundaries, so text
/// containing multi-byte characters never causes a slicing panic. For pure
/// ASCII input the offsets are untouched.
///
/// Returns an empty vector for empty or whitespace-only input.
pub fn chunk_text(content: &str) -> Vec<String> {
    let content = content.trim();
    if content.is_empty() {
        return Vec::new();
    }

    let mut chunks = Vec::new();
    let mut start = 0;
    while start < content.len() {
        // Clamp to the end of the text and never cut a character in half.
        let mut end = floor_to_char_boundary(content, start + CHUNK_SIZE);

        if end < content.len() {
            let window = &content[start..end];
            // A single newline is only considered when the window holds no
            // blank line at all (same precedence as before).
            if let Some(break_pos) = window.rfind("\n\n") {
                if break_pos > CHUNK_SIZE / 2 {
                    end = start + break_pos;
                }
            } else if let Some(break_pos) = window.rfind('\n') {
                if break_pos > CHUNK_SIZE / 2 {
                    end = start + break_pos;
                }
            }
        }

        let chunk = content[start..end].trim();
        if chunk.len() >= MIN_CHUNK_SIZE {
            chunks.push(chunk.to_string());
        }

        if end >= content.len() {
            break;
        }
        // `end` is always more than CHUNK_SIZE / 2 bytes past `start` here, so
        // stepping back by the overlap still makes forward progress.
        start = floor_to_char_boundary(content, end - CHUNK_OVERLAP);
    }
    chunks
}
/// Chooses the room a file belongs to.
///
/// Checks run in this order and the first hit wins:
///
/// 1. A directory component of the path (relative to `project_path`) that
///    contains, or is contained in, a name from `config.topic_wings`.
/// 2. The file stem, matched against `config.topic_wings` in the same way.
/// 3. The `config.hall_keywords` entry whose keywords occur most often in the
///    first 2000 characters of `content`.
/// 4. The fallback `"general"`.
///
/// All comparisons are case-insensitive. Empty wing names and empty keywords
/// are ignored, because an empty string is a substring of everything. When
/// several keyword groups share the highest score, the lexicographically
/// smallest name is returned so the result does not depend on hash-map
/// iteration order.
pub fn detect_room(
    filepath: &Path,
    content: &str,
    config: &MempalaceConfig,
    project_path: &Path,
) -> String {
    let relative = filepath
        .strip_prefix(project_path)
        .unwrap_or(filepath)
        .to_string_lossy()
        .to_lowercase();
    let filename = filepath
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("")
        .to_lowercase();
    let content_lower = content
        .chars()
        .take(2000)
        .collect::<String>()
        .to_lowercase();

    // Lowercase every wing once instead of once per comparison.
    let wings: Vec<(&String, String)> = config
        .topic_wings
        .iter()
        .filter(|wing| !wing.is_empty())
        .map(|wing| (wing, wing.to_lowercase()))
        .collect();

    // 1. Directory components; the last component is the file itself.
    let path_parts: Vec<&str> = relative.split(['/', '\\']).collect();
    if let Some((_file, dirs)) = path_parts.split_last() {
        for part in dirs.iter().filter(|part| !part.is_empty()) {
            for (wing, wing_lower) in &wings {
                if part.contains(wing_lower.as_str()) || wing_lower.contains(*part) {
                    return (*wing).clone();
                }
            }
        }
    }

    // 2. File stem.
    if !filename.is_empty() {
        for (wing, wing_lower) in &wings {
            if filename.contains(wing_lower.as_str()) || wing_lower.contains(filename.as_str()) {
                return (*wing).clone();
            }
        }
    }

    // 3. Keyword frequency in the leading part of the content.
    let mut scores: HashMap<&str, usize> = HashMap::new();
    for (wing, keywords) in &config.hall_keywords {
        let score: usize = keywords
            .iter()
            .filter(|kw| !kw.is_empty())
            .map(|kw| content_lower.matches(kw.to_lowercase().as_str()).count())
            .sum();
        if score > 0 {
            scores.insert(wing.as_str(), score);
        }
    }

    // Highest score wins; ties go to the smallest name for determinism.
    scores
        .into_iter()
        .max_by(|a, b| a.1.cmp(&b.1).then_with(|| b.0.cmp(a.0)))
        .map(|(best, _)| best.to_string())
        .unwrap_or_else(|| "general".to_string())
}
/// Recursively collects the files under `project_path` that should be mined.
///
/// A file is included when its extension is listed in
/// [`READABLE_EXTENSIONS`] and its name is not one of the excluded
/// configuration/lock files. Entries below `project_path` whose name appears
/// in [`SKIP_DIRS`] are pruned, so their contents are never visited.
///
/// The root entry itself is never pruned: a project that lives in a directory
/// called e.g. `build` or `target` is still walked.
///
/// Entries that the walker fails to read are skipped silently.
pub fn get_mineable_files(project_path: &Path) -> Vec<std::path::PathBuf> {
    const EXCLUDED_FILE_NAMES: &[&str] =
        &["mempalace.yaml", "mempalace.json", "package-lock.json"];

    let walker = WalkDir::new(project_path).into_iter().filter_entry(|e| {
        // Depth 0 is the directory the caller asked for; its own name must
        // not be able to veto the whole walk.
        if e.depth() == 0 {
            return true;
        }
        let name = e.file_name().to_string_lossy();
        !SKIP_DIRS.contains(&name.as_ref())
    });

    let mut files = Vec::new();
    for entry in walker.flatten() {
        let path = entry.path();
        if !path.is_file() {
            continue;
        }

        // Compare against the table without allocating a dotted copy.
        let readable = path
            .extension()
            .and_then(|s| s.to_str())
            .map_or(false, |ext| {
                READABLE_EXTENSIONS
                    .iter()
                    .any(|known| known.strip_prefix('.') == Some(ext))
            });
        if !readable {
            continue;
        }

        let file_name = entry.file_name().to_string_lossy();
        if EXCLUDED_FILE_NAMES.contains(&file_name.as_ref()) {
            continue;
        }
        files.push(path.to_path_buf());
    }
    files
}
/// Builds the parallel id / document / metadata vectors for a file's chunks.
///
/// Every chunk gets an id of the form
/// `drawer_<wing>_<room>_<hash of source_file>_<chunk index>` and a metadata
/// object with the keys `wing`, `room`, `source_file`, `chunk_index` and
/// `filed_at` (an RFC 3339 UTC timestamp taken when the entry is built).
///
/// The three returned vectors have the same length and share the order of
/// `chunks`.
pub fn prepare_documents(
    chunks: Vec<String>,
    wing_name: &str,
    room: &str,
    source_file: &str,
) -> (
    Vec<String>,
    Vec<String>,
    Vec<serde_json::Map<String, serde_json::Value>>,
) {
    let total = chunks.len();
    let mut ids = Vec::with_capacity(total);
    let mut documents = Vec::with_capacity(total);
    let mut metadatas = Vec::with_capacity(total);

    // The hash depends only on the source path, so compute it once.
    let source_hash = hash_string(source_file);

    for (chunk_index, text) in chunks.into_iter().enumerate() {
        ids.push(format!(
            "drawer_{}_{}_{}_{}",
            wing_name, room, source_hash, chunk_index
        ));
        documents.push(text);

        let entry = json!({
            "wing": wing_name,
            "room": room,
            "source_file": source_file,
            "chunk_index": chunk_index,
            "filed_at": chrono::Utc::now().to_rfc3339(),
        });
        let fields = match entry {
            serde_json::Value::Object(fields) => fields,
            _ => unreachable!("an object literal always produces Value::Object"),
        };
        metadatas.push(fields);
    }

    (ids, documents, metadatas)
}
/// Result of `process_project_file`: `(room, ids, documents, metadatas)`.
///
/// The three vectors are the ones produced by `prepare_documents`.
pub type ProjectFileResult = (
String,
Vec<String>,
Vec<String>,
Vec<serde_json::Map<String, serde_json::Value>>,
);
/// Chunks one file's content and prepares it for storage.
///
/// Returns `None` when `content` yields no chunks. Otherwise the room is
/// detected from `path` and `content`, and the result carries that room
/// together with the ids, documents and metadata built for each chunk.
pub fn process_project_file(
    content: &str,
    wing_name: &str,
    source_file: &str,
    path: &Path,
    config: &MempalaceConfig,
    project_path: &Path,
) -> Option<ProjectFileResult> {
    match chunk_text(content) {
        // Nothing worth filing; room detection is skipped entirely.
        pieces if pieces.is_empty() => None,
        pieces => {
            let room = detect_room(path, content, config, project_path);
            let (ids, documents, metadatas) =
                prepare_documents(pieces, wing_name, &room, source_file);
            Some((room, ids, documents, metadatas))
        }
    }
}
/// Mines every eligible file under `dir` into the vector store.
///
/// Steps:
/// 1. Validate that `dir` is an existing directory and canonicalize it.
/// 2. Collect mineable files; return `Ok(())` immediately if there are none.
/// 3. Register the wing (`wing_override`, or `"general"`) with `storage`. An
///    error whose message contains `"UNIQUE"` is ignored.
/// 4. Open the vector store at `vectors.db` / `vectors.usearch` inside
///    `config.config_dir`.
/// 5. For each file not already recorded by source path, read it, chunk it
///    and add each chunk as a memory. Files that cannot be read as text are
///    skipped, and an error from the "already recorded" check counts as "not
///    recorded".
/// 6. Save the index.
///
/// # Errors
///
/// Fails when `dir` is not a directory, when canonicalization fails, when the
/// wing cannot be added for a non-`UNIQUE` reason, or when opening the vector
/// store, adding a memory or saving the index fails.
///
/// The body contains no `.await` points; all work is done synchronously.
pub async fn mine_project(
    dir: &str,
    storage: &Storage,
    config: &MempalaceConfig,
    wing_override: Option<&str>,
) -> Result<()> {
    let requested = Path::new(dir);
    if !(requested.exists() && requested.is_dir()) {
        return Err(anyhow!(
            "Directory does not exist or is not a directory: {}",
            dir
        ));
    }
    let project_path = requested.canonicalize()?;

    let files = get_mineable_files(&project_path);
    if files.is_empty() {
        return Ok(());
    }

    let wing_name = String::from(wing_override.unwrap_or("general"));
    println!(
        "Mining project files in: {:?} into wing: {}",
        project_path, wing_name
    );

    let wing = Wing {
        name: wing_name.clone(),
        r#type: String::from("project"),
        keywords: Vec::new(),
    };
    // Re-mining into an existing wing trips a uniqueness error; tolerate it.
    if let Err(e) = storage.add_wing(&wing) {
        if !e.to_string().contains("UNIQUE") {
            return Err(e.into());
        }
    }

    let mut vs = VectorStorage::new(
        config.config_dir.join("vectors.db"),
        config.config_dir.join("vectors.usearch"),
    )?;

    for path in files {
        let source_file = path.to_string_lossy().to_string();

        let already_filed = vs.has_source_file(&source_file).unwrap_or(false);
        if already_filed {
            continue;
        }

        let content = match fs::read_to_string(&path) {
            Ok(text) => text,
            Err(_) => continue,
        };

        let processed = process_project_file(
            &content,
            &wing_name,
            &source_file,
            &path,
            config,
            &project_path,
        );
        let (room, documents) = match processed {
            Some((room, _ids, documents, _metadatas)) => (room, documents),
            None => continue,
        };

        for doc in &documents {
            vs.add_memory(doc, &wing_name, &room, Some(&source_file), None)?;
        }

        let filename = path.file_name().unwrap().to_string_lossy();
        println!(" ✓ Filed {} drawers from {}", documents.len(), filename);
    }

    vs.save_index(config.config_dir.join("vectors.usearch"))?;
    Ok(())
}
/// Hashes `s` with the standard library's `DefaultHasher` and returns the
/// 64-bit digest as lowercase hexadecimal (no padding, no prefix).
///
/// The standard library does not promise that `DefaultHasher`'s algorithm
/// stays the same across Rust releases, so the output should not be treated
/// as a stable identifier across toolchain upgrades.
fn hash_string(s: &str) -> String {
    use std::hash::{Hash as _, Hasher as _};

    let mut state = std::collections::hash_map::DefaultHasher::new();
    s.hash(&mut state);
    let digest: u64 = state.finish();
    format!("{:x}", digest)
}
// Unit tests for chunking, room detection, file discovery and mining.
#[cfg(test)]
mod tests {
use super::*;
use crate::storage::Storage;
use std::fs;
// 1000 bytes exceed CHUNK_SIZE, so the text must be split.
#[test]
fn test_chunk_text() {
let content = "A".repeat(1000);
let chunks = chunk_text(&content);
assert!(chunks.len() > 1);
}
// Empty and whitespace-only input produce no chunks.
#[test]
fn test_chunk_text_empty() {
assert!(chunk_text("").is_empty());
assert!(chunk_text(" ").is_empty());
}
// Text shorter than MIN_CHUNK_SIZE is dropped.
#[test]
fn test_chunk_text_short() {
let chunks = chunk_text("Hello world");
assert!(chunks.is_empty()); }
// Text of exactly MIN_CHUNK_SIZE bytes is kept as a single chunk.
#[test]
fn test_chunk_text_exact_min() {
let content = "A".repeat(50);
let chunks = chunk_text(&content);
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0], content);
}
// A blank line in the second half of the window is used as the cut point.
#[test]
fn test_chunk_text_newline_break() {
let part1 = "A".repeat(600);
let part2 = "B".repeat(600);
let content = format!("{}\n\n{}", part1, part2);
let chunks = chunk_text(&content);
assert_eq!(chunks.len(), 2);
assert!(chunks[0].contains('A'));
assert!(!chunks[0].contains('B'));
}
// Both a blank line and a single newline at offset 450 (> CHUNK_SIZE / 2)
// end the first chunk there.
#[test]
fn test_chunk_text_long_with_breaks() {
let part1 = "A".repeat(450);
let part2 = "B".repeat(450);
let content = format!("{}\n\n{}", part1, part2);
let chunks = chunk_text(&content);
assert!(chunks.len() >= 2);
assert_eq!(chunks[0].len(), 450);
let content_single = format!("{}\n{}", part1, part2);
let chunks2 = chunk_text(&content_single);
assert!(chunks2.len() >= 2);
assert_eq!(chunks2[0].len(), 450);
}
// Covers the directory, file-stem, keyword and fallback branches of
// detect_room.
#[test]
fn test_detect_room() {
let mut hall_keywords = std::collections::HashMap::new();
hall_keywords.insert(
"frontend".to_string(),
vec!["react".to_string(), "css".to_string()],
);
hall_keywords.insert("room2".to_string(), vec!["banana".to_string()]);
let config = MempalaceConfig {
topic_wings: vec![
"infra".to_string(),
"backend".to_string(),
"arch".to_string(),
],
hall_keywords,
..Default::default()
};
let project_path = std::path::Path::new("/project");
// Directory name equals a wing.
let path = std::path::Path::new("/project/infra/module.rs");
assert_eq!(
detect_room(path, "some code", &config, project_path),
"infra"
);
// Directory name contains a wing.
let path_sub = std::path::Path::new("/project/infrastructure/module.rs");
assert_eq!(detect_room(path_sub, "", &config, project_path), "infra");
// Directory name is contained in a wing.
let path_super = std::path::Path::new("/project/inf/module.rs");
assert_eq!(detect_room(path_super, "", &config, project_path), "infra");
let path_exact = std::path::Path::new("/project/backend/module.rs");
assert_eq!(
detect_room(path_exact, "", &config, project_path),
"backend"
);
let path2 = std::path::Path::new("/project/my_arch_folder/file.rs");
assert_eq!(detect_room(path2, "", &config, project_path), "arch");
// File stem matches when no directory does.
let path3 = std::path::Path::new("/project/src/infra.rs");
assert_eq!(detect_room(path3, "", &config, project_path), "infra");
let path4 = std::path::Path::new("/project/src/backend_utils.rs");
assert_eq!(
detect_room(path4, "some code", &config, project_path),
"backend"
);
// Neither path nor stem matches, so keywords in the content decide.
let path5 = std::path::Path::new("/project/src/ui.rs");
assert_eq!(
detect_room(path5, "import react; write css;", &config, project_path),
"frontend"
);
// Nothing matches at all: fallback room.
assert_eq!(
detect_room(std::path::Path::new("/"), "", &config, project_path),
"general"
);
}
// The keyword group with the most occurrences wins.
#[test]
fn test_detect_room_keyword_scoring() {
let mut hall_keywords = std::collections::HashMap::new();
hall_keywords.insert("roomA".to_string(), vec!["apple".to_string()]);
hall_keywords.insert("roomB".to_string(), vec!["banana".to_string()]);
let config = MempalaceConfig {
topic_wings: vec![],
hall_keywords,
..Default::default()
};
let path = std::path::Path::new("/project/file.txt");
let project_path = std::path::Path::new("/project");
assert_eq!(
detect_room(path, "apple apple", &config, project_path),
"roomA"
);
assert_eq!(
detect_room(path, "apple banana banana", &config, project_path),
"roomB"
);
}
// hash_string is deterministic and distinguishes the sampled inputs.
#[test]
fn test_hash_string() {
assert_eq!(hash_string("test"), hash_string("test"));
assert_ne!(hash_string("test1"), hash_string("test2"));
assert_ne!(hash_string("🦀"), hash_string("🦀🦀"));
assert_ne!(
hash_string(&"A".repeat(1000)),
hash_string(&"A".repeat(1001))
);
}
// One id, document and metadata entry is produced per chunk.
#[test]
fn test_prepare_documents() {
let chunks = vec!["chunk1".to_string(), "chunk2".to_string()];
let (ids, docs, metadatas) =
prepare_documents(chunks.clone(), "test_wing", "test_room", "test_file.rs");
assert_eq!(ids.len(), 2);
assert_eq!(docs.len(), 2);
assert_eq!(metadatas.len(), 2);
assert!(ids[0].starts_with("drawer_test_wing_test_room_"));
assert_eq!(docs[0], "chunk1");
assert_eq!(metadatas[0]["wing"].as_str().unwrap(), "test_wing");
assert_eq!(metadatas[0]["chunk_index"].as_u64().unwrap(), 0);
}
// Only test.rs qualifies: the others are inside .git, have an unlisted or
// missing extension, or are the excluded mempalace.yaml.
#[test]
fn test_get_mineable_files() {
let temp_dir = tempfile::tempdir().unwrap();
let path = temp_dir.path();
fs::write(path.join("test.rs"), "fn main() {}").unwrap();
let git_dir = path.join(".git");
fs::create_dir(&git_dir).unwrap();
fs::write(git_dir.join("test2.rs"), "fn main() {}").unwrap();
fs::write(path.join("test.bin"), "0101").unwrap();
fs::write(path.join("mempalace.yaml"), "").unwrap();
fs::write(path.join("no_extension"), "test").unwrap();
fs::write(path.join("test.xyz"), "test").unwrap();
let files = get_mineable_files(path);
assert_eq!(files.len(), 1);
assert!(files[0].to_string_lossy().ends_with("test.rs"));
}
// 50 bytes of content give one chunk in the fallback room; empty content
// gives None.
#[test]
fn test_process_project_file() {
let content = "A".repeat(50);
let temp_config_dir = tempfile::tempdir().unwrap();
let config = MempalaceConfig::new(Some(temp_config_dir.path().to_path_buf()));
let path = std::path::Path::new("/project/src/main.rs");
let project_path = std::path::Path::new("/project");
let result = process_project_file(
&content,
"test_wing",
"test_file.rs",
path,
&config,
project_path,
);
assert!(result.is_some());
let (room, ids, docs, metadatas) = result.unwrap();
assert_eq!(room, "general");
assert_eq!(ids.len(), 1);
assert_eq!(docs.len(), 1);
assert_eq!(metadatas.len(), 1);
let result_empty =
process_project_file("", "test_wing", "test_file.rs", path, &config, project_path);
assert!(result_empty.is_none());
}
// A non-existent directory is rejected.
// NOTE(review): the database path is relative, so the file presumably lands
// in the current working directory; it is removed only when the assertion
// above the cleanup line passes. Consider a tempdir-based path.
#[tokio::test]
async fn test_mine_project_invalid_dir() {
let storage = Storage::new("test_mine.db").unwrap();
let temp_config_dir = tempfile::tempdir().unwrap();
let config = MempalaceConfig::new(Some(temp_config_dir.path().to_path_buf()));
let result = mine_project("/nonexistent/dir", &storage, &config, None).await;
assert!(result.is_err());
let _ = fs::remove_file("test_mine.db");
}
// NOTE(review): despite its name this test asserts that mining succeeds; it
// does not provoke a storage error.
#[tokio::test]
async fn test_mine_project_storage_error() {
let storage = Storage::new("test_mine_storage.db").unwrap();
let temp_config_dir = tempfile::tempdir().unwrap();
let config = MempalaceConfig::new(Some(temp_config_dir.path().to_path_buf()));
let temp_dir = tempfile::tempdir().unwrap();
fs::write(temp_dir.path().join("test.rs"), "A".repeat(100)).unwrap();
let result = mine_project(temp_dir.path().to_str().unwrap(), &storage, &config, None).await;
assert!(result.is_ok());
let _ = fs::remove_file("test_mine_storage.db");
}
// Mining a directory that holds a single 100-byte file succeeds.
#[tokio::test]
async fn test_mine_project_with_file() {
let storage = Storage::new("test_mine_file.db").unwrap();
let temp_config_dir = tempfile::tempdir().unwrap();
let config = MempalaceConfig::new(Some(temp_config_dir.path().to_path_buf()));
let temp_dir = tempfile::tempdir().unwrap();
let file_path = temp_dir.path().join("main.rs");
fs::write(&file_path, "A".repeat(100)).unwrap();
let result = mine_project(temp_dir.path().to_str().unwrap(), &storage, &config, None).await;
assert!(result.is_ok());
let _ = fs::remove_file("test_mine_file.db");
}
// Files inside .git and target are not returned.
#[test]
fn test_get_mineable_files_with_skips() {
let temp_dir = tempfile::tempdir().unwrap();
let path = temp_dir.path();
fs::write(path.join("main.rs"), "fn main() {}").unwrap();
fs::create_dir(path.join(".git")).unwrap();
fs::write(path.join(".git").join("config"), "").unwrap();
fs::create_dir(path.join("target")).unwrap();
fs::write(path.join("target").join("debug"), "").unwrap();
let files = get_mineable_files(path);
assert_eq!(files.len(), 1);
assert!(files[0].to_string_lossy().ends_with("main.rs"));
}
// Deeply nested files are found; an extension-less file adds nothing.
#[test]
fn test_get_mineable_files_nested() {
let temp_dir = tempfile::tempdir().unwrap();
let path = temp_dir.path();
fs::create_dir_all(path.join("a/b/c")).unwrap();
fs::write(path.join("a/b/c/file.rs"), "").unwrap();
let files = get_mineable_files(path);
assert_eq!(files.len(), 1);
fs::write(path.join("LICENSE"), "").unwrap();
let files2 = get_mineable_files(path);
assert_eq!(files2.len(), 1); }
// Empty content yields None with a default configuration as well.
#[test]
fn test_process_project_file_empty() {
let config = MempalaceConfig::default();
let project_path = std::path::Path::new("/project");
let result = process_project_file(
"",
"wing",
"file.rs",
&project_path.join("file.rs"),
&config,
project_path,
);
assert!(result.is_none());
}
}