use anyhow::{Result, anyhow};
use rusqlite::{Connection, params};
use sha2::{Digest, Sha256};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use tracing::{debug, info, warn};
use uuid::Uuid;
use super::embeddings::{cosine_similarity, deserialize_embedding, serialize_embedding};
use super::search::MemoryChunk;
/// SQLite-backed memory index over workspace files.
///
/// Stores file metadata, line-aligned text chunks, FTS5 rows, and
/// (optionally) vector embeddings in an OpenClaw-compatible schema.
/// Cloning is cheap: clones share the same connection.
#[derive(Clone)]
pub struct MemoryIndex {
// Shared connection; all access is serialized through the mutex.
conn: Arc<Mutex<Connection>>,
// Workspace root used to relativize indexed file paths.
workspace: PathBuf,
// On-disk location of the SQLite database file.
db_path: PathBuf,
// True when the sqlite-vec extension loaded (enables fast vector search).
has_vec_extension: bool,
// Target chunk size in approximate tokens (4 chars ≈ 1 token).
chunk_size: usize,
// Overlap between consecutive chunks in approximate tokens.
chunk_overlap: usize,
}
/// Summary statistics produced by a reindex pass.
#[derive(Debug)]
pub struct ReindexStats {
// Number of files examined.
pub files_processed: usize,
// Number of files whose content changed and were re-indexed.
pub files_updated: usize,
// Total chunks written during the pass.
pub chunks_indexed: usize,
// Wall-clock time the pass took.
pub duration: Duration,
}
impl MemoryIndex {
/// Open (or create) the index database at `db_path` for `workspace`.
///
/// Setup order matters: the legacy-schema migration must run before the
/// `CREATE TABLE IF NOT EXISTS` batch below, otherwise the new-layout
/// tables would be created alongside the legacy ones and mask them.
/// After schema setup this probes for the sqlite-vec extension and
/// falls back to the in-memory vector scan when it is unavailable.
///
/// # Errors
/// Fails if the parent directory cannot be created, the database cannot
/// be opened, or any schema statement fails.
pub fn new_with_db_path(workspace: &Path, db_path: &Path) -> Result<Self> {
if let Some(parent) = db_path.parent() {
fs::create_dir_all(parent)?;
}
let conn = Connection::open(db_path)?;
// Detect and migrate a pre-OpenClaw schema before creating tables.
let needs_migration = Self::needs_schema_migration(&conn)?;
if needs_migration {
info!("Migrating database schema to OpenClaw-compatible format");
Self::migrate_to_openclaw_schema(&conn)?;
}
conn.execute_batch(
r#"
-- Metadata key/value store
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
-- File tracking (OpenClaw-compatible)
CREATE TABLE IF NOT EXISTS files (
path TEXT PRIMARY KEY,
source TEXT NOT NULL DEFAULT 'memory',
hash TEXT NOT NULL,
mtime INTEGER NOT NULL,
size INTEGER NOT NULL
);
-- Chunked content (OpenClaw-compatible)
CREATE TABLE IF NOT EXISTS chunks (
id TEXT PRIMARY KEY,
path TEXT NOT NULL,
source TEXT NOT NULL DEFAULT 'memory',
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
hash TEXT NOT NULL,
model TEXT NOT NULL DEFAULT '',
text TEXT NOT NULL,
embedding TEXT NOT NULL DEFAULT '',
updated_at INTEGER NOT NULL
);
-- Embedding cache (OpenClaw-compatible)
CREATE TABLE IF NOT EXISTS embedding_cache (
provider TEXT NOT NULL,
model TEXT NOT NULL,
provider_key TEXT NOT NULL,
hash TEXT NOT NULL,
embedding TEXT NOT NULL,
dims INTEGER,
updated_at INTEGER NOT NULL,
PRIMARY KEY (provider, model, provider_key, hash)
);
-- Indexes
CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source);
CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at ON embedding_cache(updated_at);
"#,
)?;
// FTS5 table creation is best-effort (the build may lack FTS5).
Self::ensure_fts_table(&conn)?;
// Backfill `source` on databases created before the column existed.
Self::ensure_column(&conn, "files", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
Self::ensure_column(&conn, "chunks", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
let has_vec_extension = Self::try_load_sqlite_vec(&conn);
if has_vec_extension {
debug!("sqlite-vec extension loaded successfully");
Self::ensure_vec_table(&conn)?;
} else {
debug!("sqlite-vec extension not available, using in-memory vector search");
}
Ok(Self {
conn: Arc::new(Mutex::new(conn)),
workspace: workspace.to_path_buf(),
db_path: db_path.to_path_buf(),
has_vec_extension,
// Default chunking: ~400 tokens per chunk, ~80 tokens of overlap.
chunk_size: 400,
chunk_overlap: 80,
})
}
pub fn with_chunk_config(mut self, chunk_size: usize, chunk_overlap: usize) -> Self {
self.chunk_size = chunk_size;
self.chunk_overlap = chunk_overlap;
self
}
/// Attempt to load the sqlite-vec (`vec0`) loadable extension.
///
/// Tries a few well-known locations and returns `true` on the first
/// success. Extension loading is always re-disabled before returning
/// so later SQL cannot load arbitrary extensions.
#[allow(unsafe_code)]
fn try_load_sqlite_vec(conn: &Connection) -> bool {
// SAFETY: rusqlite marks extension enabling unsafe because loaded
// extensions run arbitrary native code; we only load the known vec0
// library and disable loading again immediately afterwards.
if unsafe { conn.load_extension_enable() }.is_err() {
return false;
}
// Candidate locations; the bare names rely on the platform's default
// library search path.
let ext_paths = [
"vec0", "./vec0",
"/usr/local/lib/vec0",
"/usr/lib/vec0",
];
for path in ext_paths {
// SAFETY: same contract as above — loads native code from `path`.
if unsafe { conn.load_extension(path, None::<&str>) }.is_ok() {
let _ = conn.load_extension_disable();
return true;
}
}
let _ = conn.load_extension_disable();
false
}
/// Best-effort creation of the sqlite-vec virtual table.
///
/// Failure is logged and swallowed: the index degrades gracefully to the
/// in-memory vector scan when `vec0` is unavailable.
fn ensure_vec_table(conn: &Connection) -> Result<()> {
    // NOTE(review): dimension is fixed at 1536 — embeddings of other
    // sizes will fail the (ignored) chunks_vec mirror insert. Confirm
    // this matches the embedding model in use.
    if let Err(e) = conn.execute(
        "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0(id TEXT PRIMARY KEY, embedding float[1536])",
        [],
    ) {
        debug!("chunks_vec table creation skipped: {}", e);
    } else {
        debug!("chunks_vec table created/verified");
    }
    Ok(())
}
/// Open the default index database (`memory.sqlite`) inside `workspace`.
pub fn new(workspace: &Path) -> Result<Self> {
    Self::new_with_db_path(workspace, &workspace.join("memory.sqlite"))
}
/// Index one file's content into the chunk store.
///
/// Returns `Ok(false)` when `force` is false and the stored hash shows
/// the file is unchanged; returns `Ok(true)` after (re)indexing.
///
/// # Errors
/// Fails if the file cannot be read, the connection lock is poisoned,
/// or a database statement fails. On failure the database is left
/// unchanged (see transaction note below).
pub fn index_file(&self, path: &Path, force: bool) -> Result<bool> {
    let content = fs::read_to_string(path)?;
    let file_hash = hash_content(&content);
    let metadata = fs::metadata(path)?;
    let mtime = metadata
        .modified()?
        .duration_since(std::time::UNIX_EPOCH)?
        .as_secs() as i64;
    let size = metadata.len() as i64;
    // Paths are stored relative to the workspace so the DB is relocatable.
    let relative_path = path
        .strip_prefix(&self.workspace)
        .unwrap_or(path)
        .to_string_lossy()
        .to_string();
    let conn = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    if !force {
        let existing: Option<String> = conn
            .query_row(
                "SELECT hash FROM files WHERE path = ?1",
                params![&relative_path],
                |row| row.get(0),
            )
            .ok();
        if existing.as_deref() == Some(&file_hash) {
            debug!("File unchanged, skipping: {}", relative_path);
            return Ok(false);
        }
    }
    debug!("Indexing file: {}", relative_path);
    let now = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)?
        .as_secs() as i64;
    // Fix: run the file upsert + chunk delete + chunk inserts in a single
    // transaction. Previously each statement auto-committed, so a failure
    // midway left the `files` row updated with its chunks missing or
    // partial. Dropping `tx` on an early `?` return rolls everything back.
    let tx = conn.unchecked_transaction()?;
    tx.execute(
        "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) VALUES (?1, 'memory', ?2, ?3, ?4)",
        params![&relative_path, &file_hash, mtime, size],
    )?;
    Self::delete_chunks_for_path(&tx, &relative_path)?;
    let chunks = chunk_text(&content, self.chunk_size, self.chunk_overlap);
    for chunk in chunks.iter() {
        let chunk_id = Uuid::new_v4().to_string();
        let chunk_hash = hash_content(&chunk.content);
        tx.execute(
            r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)"#,
            params![&chunk_id, &relative_path, chunk.line_start, chunk.line_end, &chunk_hash, &chunk.content, now],
        )?;
        Self::insert_fts(
            &tx,
            &chunk_id,
            &relative_path,
            "memory",
            "",
            chunk.line_start,
            chunk.line_end,
            &chunk.content,
        )?;
    }
    tx.commit()?;
    Ok(true)
}
/// Remove every chunk (and its FTS row) belonging to a relative `path`.
fn delete_chunks_for_path(conn: &Connection, path: &str) -> Result<()> {
    // FTS rows are keyed by chunk id, so gather the ids before deleting.
    let mut stmt = conn.prepare("SELECT id FROM chunks WHERE path = ?1")?;
    let ids: Vec<String> = stmt
        .query_map(params![path], |row| row.get(0))?
        .filter_map(Result::ok)
        .collect();
    for id in &ids {
        // Best-effort: the FTS table may not exist on this build.
        let _ = conn.execute("DELETE FROM chunks_fts WHERE id = ?1", params![id]);
    }
    conn.execute("DELETE FROM chunks WHERE path = ?1", params![path])?;
    Ok(())
}
/// Drop a deleted file (workspace-relative path) and its chunks from the index.
pub fn remove_file(&self, relative_path: &str) -> Result<()> {
    let guard = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    Self::delete_chunks_for_path(&guard, relative_path)?;
    guard.execute("DELETE FROM files WHERE path = ?1", params![relative_path])?;
    debug!("Removed deleted file from index: {}", relative_path);
    Ok(())
}
pub fn indexed_files(&self) -> Result<Vec<String>> {
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let mut stmt = conn.prepare("SELECT path FROM files")?;
let rows = stmt.query_map([], |row| row.get(0))?;
let mut paths = Vec::new();
for row in rows {
paths.push(row?);
}
Ok(paths)
}
/// Best-effort insert of a chunk row into the FTS5 table.
///
/// The result of the INSERT is deliberately ignored: when the FTS5
/// table is unavailable the index still works via vector search.
#[allow(clippy::too_many_arguments)]
fn insert_fts(
conn: &Connection,
id: &str,
path: &str,
source: &str,
model: &str,
start_line: i32,
end_line: i32,
text: &str,
) -> Result<()> {
let _ = conn.execute(
"INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![text, id, path, source, model, start_line, end_line],
);
Ok(())
}
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
let fts_query = match build_fts_query(query) {
Some(q) => q,
None => return Ok(Vec::new()),
};
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let mut stmt = conn.prepare(
r#"
SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score
FROM chunks_fts fts
WHERE chunks_fts MATCH ?1
ORDER BY score
LIMIT ?2
"#,
)?;
let rows = stmt.query_map(params![&fts_query, limit as i64], |row| {
Ok(MemoryChunk {
file: row.get(0)?,
line_start: row.get(1)?,
line_end: row.get(2)?,
content: row.get(3)?,
score: row.get::<_, f64>(4)?.abs(), })
})?;
let mut results = Vec::new();
for row in rows {
results.push(row?);
}
Ok(results)
}
/// Total number of chunks stored in the index.
pub fn chunk_count(&self) -> Result<usize> {
    let conn = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    let count = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| {
        row.get::<_, i64>(0)
    })?;
    Ok(count as usize)
}
/// Number of chunks stored for one file (absolute or workspace-relative path).
pub fn file_chunk_count(&self, path: &Path) -> Result<usize> {
    // Normalize to the workspace-relative form used as the DB key.
    let relative_path = path
        .strip_prefix(&self.workspace)
        .unwrap_or(path)
        .to_string_lossy()
        .to_string();
    let conn = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    let count = conn.query_row(
        "SELECT COUNT(*) FROM chunks WHERE path = ?1",
        params![&relative_path],
        |row| row.get::<_, i64>(0),
    )?;
    Ok(count as usize)
}
/// On-disk size of the database file in bytes (0 when it doesn't exist yet).
pub fn size_bytes(&self) -> Result<u64> {
    if !self.db_path.exists() {
        return Ok(0);
    }
    Ok(fs::metadata(&self.db_path)?.len())
}
/// Location of the backing SQLite database file.
pub fn db_path(&self) -> &Path {
&self.db_path
}
/// Detect a legacy database schema that predates the OpenClaw layout.
///
/// The legacy `chunks` table used `file_path`/`content` columns; the
/// current one uses `path`/`text`. Columns are probed with no-row
/// SELECT statements instead of parsing PRAGMA output.
fn needs_schema_migration(conn: &Connection) -> Result<bool> {
// If `chunks` doesn't exist at all, PRAGMA yields no row and the
// query errors — a fresh database needs no migration.
let result: rusqlite::Result<String> =
conn.query_row("PRAGMA table_info(chunks)", [], |row| row.get(1));
if result.is_err() {
return Ok(false);
}
// Either legacy column being present means migration is required.
let has_file_path: bool = conn.prepare("SELECT file_path FROM chunks LIMIT 0").is_ok();
let has_content: bool = conn.prepare("SELECT content FROM chunks LIMIT 0").is_ok();
Ok(has_file_path || has_content)
}
/// Migrate a legacy (`file_path`/`content`) schema into the
/// OpenClaw-compatible `files`/`chunks` layout, preserving rows and
/// (where present) embeddings.
///
/// Fix: the whole migration now runs inside a `Transaction` that rolls
/// back when dropped. The previous version issued a bare
/// `BEGIN TRANSACTION`/`COMMIT` pair, so any `?` early return left the
/// connection inside an open transaction with half-renamed tables
/// (`chunks_old`, `files_old`) and no rollback.
fn migrate_to_openclaw_schema(conn: &Connection) -> Result<()> {
    // `unchecked_transaction` works through the shared `&Connection`;
    // dropping `tx` without `commit()` rolls everything back.
    let tx = conn.unchecked_transaction()?;
    // Renames/drops are best-effort — the objects may not all exist.
    let _ = tx.execute("ALTER TABLE chunks RENAME TO chunks_old", []);
    let _ = tx.execute("ALTER TABLE files RENAME TO files_old", []);
    let _ = tx.execute("DROP TABLE IF EXISTS chunks_fts", []);
    let _ = tx.execute("DROP TRIGGER IF EXISTS chunks_ai", []);
    let _ = tx.execute("DROP TRIGGER IF EXISTS chunks_ad", []);
    let _ = tx.execute("DROP TRIGGER IF EXISTS chunks_au", []);
    tx.execute(
        r#"
CREATE TABLE IF NOT EXISTS files (
path TEXT PRIMARY KEY,
source TEXT NOT NULL DEFAULT 'memory',
hash TEXT NOT NULL,
mtime INTEGER NOT NULL,
size INTEGER NOT NULL
)
"#,
        [],
    )?;
    tx.execute(
        r#"
CREATE TABLE IF NOT EXISTS chunks (
id TEXT PRIMARY KEY,
path TEXT NOT NULL,
source TEXT NOT NULL DEFAULT 'memory',
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
hash TEXT NOT NULL,
model TEXT NOT NULL DEFAULT '',
text TEXT NOT NULL,
embedding TEXT NOT NULL DEFAULT '',
updated_at INTEGER NOT NULL
)
"#,
        [],
    )?;
    let now = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs() as i64;
    // Best-effort copy: files_old may not exist.
    let _ = tx.execute(
        r#"
INSERT INTO files (path, source, hash, mtime, size)
SELECT path, 'memory', hash, mtime, size FROM files_old
"#,
        [],
    );
    // Older legacy databases stored embeddings alongside the content.
    let has_embedding_cols = tx
        .prepare("SELECT embedding FROM chunks_old LIMIT 0")
        .is_ok();
    if has_embedding_cols {
        let mut stmt = tx.prepare(
            "SELECT file_path, line_start, line_end, content, embedding, embedding_model FROM chunks_old",
        )?;
        let rows = stmt.query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, i32>(1)?,
                row.get::<_, i32>(2)?,
                row.get::<_, String>(3)?,
                row.get::<_, Option<String>>(4)?,
                row.get::<_, Option<String>>(5)?,
            ))
        })?;
        for row in rows {
            let (file_path, line_start, line_end, content, embedding, model) = row?;
            let new_id = Uuid::new_v4().to_string();
            let hash = hash_content(&content);
            let model = model.unwrap_or_default();
            let embedding = embedding.unwrap_or_default();
            tx.execute(
                r#"
INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
VALUES (?1, ?2, 'memory', ?3, ?4, ?5, ?6, ?7, ?8, ?9)
"#,
                params![&new_id, &file_path, line_start, line_end, &hash, &model, &content, &embedding, now],
            )?;
        }
    } else {
        let mut stmt =
            tx.prepare("SELECT file_path, line_start, line_end, content FROM chunks_old")?;
        let rows = stmt.query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, i32>(1)?,
                row.get::<_, i32>(2)?,
                row.get::<_, String>(3)?,
            ))
        })?;
        for row in rows {
            let (file_path, line_start, line_end, content) = row?;
            let new_id = Uuid::new_v4().to_string();
            let hash = hash_content(&content);
            tx.execute(
                r#"
INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)
"#,
                params![&new_id, &file_path, line_start, line_end, &hash, &content, now],
            )?;
        }
    }
    let _ = tx.execute("DROP TABLE IF EXISTS chunks_old", []);
    let _ = tx.execute("DROP TABLE IF EXISTS files_old", []);
    tx.commit()?;
    info!("Schema migration completed successfully");
    Ok(())
}
/// Best-effort creation of the FTS5 virtual table for keyword search.
///
/// Failure is logged and swallowed so builds without FTS5 still work
/// (vector search remains available).
fn ensure_fts_table(conn: &Connection) -> Result<()> {
    if let Err(e) = conn.execute(
        r#"
CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
text,
id UNINDEXED,
path UNINDEXED,
source UNINDEXED,
model UNINDEXED,
start_line UNINDEXED,
end_line UNINDEXED
)
"#,
        [],
    ) {
        debug!("FTS5 table creation skipped: {}", e);
    } else {
        debug!("FTS5 table created/verified");
    }
    Ok(())
}
/// Add `column` to `table` when it is missing (probed via a no-row SELECT).
fn ensure_column(conn: &Connection, table: &str, column: &str, definition: &str) -> Result<()> {
    let probe = format!("SELECT {} FROM {} LIMIT 0", column, table);
    if conn.prepare(&probe).is_ok() {
        // Column already present — nothing to do.
        return Ok(());
    }
    conn.execute(
        &format!("ALTER TABLE {} ADD COLUMN {} {}", table, column, definition),
        [],
    )?;
    debug!("Added column {} to table {}", column, table);
    Ok(())
}
pub fn chunks_without_embeddings(&self, limit: usize) -> Result<Vec<(String, String)>> {
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let mut stmt = conn.prepare(
"SELECT id, text FROM chunks WHERE embedding = '' OR embedding IS NULL LIMIT ?1",
)?;
let rows = stmt.query_map(params![limit as i64], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
})?;
let mut results = Vec::new();
for row in rows {
results.push(row?);
}
Ok(results)
}
pub fn store_embedding(&self, chunk_id: &str, embedding: &[f32], model: &str) -> Result<()> {
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let embedding_json = serialize_embedding(embedding);
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs() as i64;
conn.execute(
"UPDATE chunks SET embedding = ?1, model = ?2, updated_at = ?3 WHERE id = ?4",
params![&embedding_json, model, now, chunk_id],
)?;
if self.has_vec_extension {
let embedding_blob = embedding_to_blob(embedding);
let _ = conn.execute(
"INSERT OR REPLACE INTO chunks_vec (id, embedding) VALUES (?1, ?2)",
params![chunk_id, &embedding_blob],
);
}
Ok(())
}
/// Look up a cached embedding by provider, model, and text hash.
///
/// Returns `None` on a cache miss; query errors are also treated as
/// misses via `.ok()`.
/// NOTE(review): the lookup ignores `provider_key` even though it is
/// part of the cache table's primary key, so entries written under
/// different provider keys are interchangeable here — confirm this is
/// intended.
pub fn get_cached_embedding(
&self,
provider: &str,
model: &str,
text_hash: &str,
) -> Result<Option<Vec<f32>>> {
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let result: Option<String> = conn
.query_row(
"SELECT embedding FROM embedding_cache WHERE provider = ?1 AND model = ?2 AND hash = ?3",
params![provider, model, text_hash],
|row| row.get(0),
)
.ok();
Ok(result.map(|json| deserialize_embedding(&json)))
}
pub fn cache_embedding(
&self,
provider: &str,
model: &str,
provider_key: &str,
text_hash: &str,
embedding: &[f32],
) -> Result<()> {
let conn = self
.conn
.lock()
.map_err(|e| anyhow!("Lock poisoned: {}", e))?;
let embedding_json = serialize_embedding(embedding);
let dims = embedding.len() as i32;
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs() as i64;
conn.execute(
"INSERT OR REPLACE INTO embedding_cache (provider, model, provider_key, hash, embedding, dims, updated_at)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![provider, model, provider_key, text_hash, &embedding_json, dims, now],
)?;
Ok(())
}
/// Whether the sqlite-vec extension was loaded at startup.
pub fn has_vec_extension(&self) -> bool {
self.has_vec_extension
}
/// Vector similarity search over chunks embedded with `model`.
///
/// Uses the sqlite-vec fast path when the extension loaded; on its
/// failure (or when unavailable) falls back to an in-memory cosine scan.
pub fn search_vector(
    &self,
    query_embedding: &[f32],
    model: &str,
    limit: usize,
) -> Result<Vec<MemoryChunk>> {
    let conn = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    if self.has_vec_extension {
        match self.search_vector_fast(&conn, query_embedding, model, limit) {
            Ok(results) => return Ok(results),
            Err(_) => warn!("sqlite-vec search failed, falling back to in-memory scan"),
        }
    }
    self.search_vector_scan(&conn, query_embedding, model, limit)
}
fn search_vector_fast(
&self,
conn: &Connection,
query_embedding: &[f32],
model: &str,
limit: usize,
) -> Result<Vec<MemoryChunk>> {
let query_blob = embedding_to_blob(query_embedding);
let mut stmt = conn.prepare(
r#"
SELECT c.path, c.start_line, c.end_line, c.text,
1.0 - vec_distance_cosine(v.embedding, ?1) AS score
FROM chunks_vec v
JOIN chunks c ON c.id = v.id
WHERE c.model = ?2
ORDER BY score DESC
LIMIT ?3
"#,
)?;
let rows = stmt.query_map(params![&query_blob, model, limit as i64], |row| {
Ok(MemoryChunk {
file: row.get(0)?,
line_start: row.get(1)?,
line_end: row.get(2)?,
content: row.get(3)?,
score: row.get(4)?,
})
})?;
let mut results = Vec::new();
for row in rows {
results.push(row?);
}
Ok(results)
}
/// In-memory fallback: load every embedded chunk for `model`, compute
/// cosine similarity in Rust, and return the top `limit` matches.
fn search_vector_scan(
    &self,
    conn: &Connection,
    query_embedding: &[f32],
    model: &str,
    limit: usize,
) -> Result<Vec<MemoryChunk>> {
    let mut stmt = conn.prepare(
        "SELECT id, path, start_line, end_line, text, embedding
FROM chunks
WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
    )?;
    // The id column is selected but unused; only the payload matters here.
    let rows = stmt.query_map(params![model], |row| {
        Ok((
            row.get::<_, String>(1)?,
            row.get::<_, i32>(2)?,
            row.get::<_, i32>(3)?,
            row.get::<_, String>(4)?,
            row.get::<_, String>(5)?,
        ))
    })?;
    let mut scored: Vec<(f32, MemoryChunk)> = Vec::new();
    for row in rows {
        let (path, start_line, end_line, text, embedding_json) = row?;
        let embedding = deserialize_embedding(&embedding_json);
        // Skip dimension mismatches (e.g. embeddings from another model).
        if embedding.len() != query_embedding.len() {
            continue;
        }
        let similarity = cosine_similarity(query_embedding, &embedding);
        scored.push((
            similarity,
            MemoryChunk {
                file: path,
                line_start: start_line,
                line_end: end_line,
                content: text,
                score: similarity as f64,
            },
        ));
    }
    // Highest similarity first; NaNs compare equal so sort stays total.
    scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    scored.truncate(limit);
    Ok(scored.into_iter().map(|(_, chunk)| chunk).collect())
}
/// Hybrid search: merge keyword (FTS) and vector results using
/// reciprocal-rank fusion, weighted by `text_weight` / `vector_weight`.
///
/// When `query_embedding` is `None`, only the FTS leg runs. Returned
/// chunks carry their fused score in `score`.
pub fn search_hybrid(
    &self,
    query: &str,
    query_embedding: Option<&[f32]>,
    model: &str,
    limit: usize,
    text_weight: f32,
    vector_weight: f32,
) -> Result<Vec<MemoryChunk>> {
    // Over-fetch both legs so the fused ranking has candidates to trim.
    let fts_results = self.search(query, limit * 2)?;
    let vector_results = if let Some(embedding) = query_embedding {
        self.search_vector(embedding, model, limit * 2)?
    } else {
        Vec::new()
    };
    // Keyed by file:start:end so the same chunk from both legs merges.
    let mut merged: std::collections::HashMap<String, (f32, MemoryChunk)> =
        std::collections::HashMap::new();
    for (rank, mut result) in fts_results.into_iter().enumerate() {
        let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
        let rank_score = 1.0 / (1.0 + rank as f32);
        let weighted_score = rank_score * text_weight;
        // Fix: overwrite the raw BM25 score so every returned chunk
        // reports a rank-fusion score on the same scale. Previously
        // FTS-only hits kept their BM25 value while vector hits carried
        // fused scores, making the output `score` fields incomparable.
        result.score = weighted_score as f64;
        merged.insert(key, (weighted_score, result));
    }
    for (rank, result) in vector_results.into_iter().enumerate() {
        let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
        let rank_score = 1.0 / (1.0 + rank as f32);
        let weighted_score = rank_score * vector_weight;
        if let Some((existing_score, existing_chunk)) = merged.get_mut(&key) {
            // Chunk found by both legs: sum the weighted rank scores.
            *existing_score += weighted_score;
            existing_chunk.score = *existing_score as f64;
        } else {
            let mut chunk = result;
            chunk.score = weighted_score as f64;
            merged.insert(key, (weighted_score, chunk));
        }
    }
    let mut results: Vec<_> = merged.into_values().collect();
    results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
    Ok(results
        .into_iter()
        .take(limit)
        .map(|(_, chunk)| chunk)
        .collect())
}
/// Number of chunks that already have an embedding for `model`.
pub fn embedded_chunk_count(&self, model: &str) -> Result<usize> {
    let conn = self
        .conn
        .lock()
        .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
    let count = conn.query_row(
        "SELECT COUNT(*) FROM chunks WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
        params![model],
        |row| row.get::<_, i64>(0),
    )?;
    Ok(count as usize)
}
}
fn hash_content(content: &str) -> String {
let mut hasher = Sha256::new();
hasher.update(content.as_bytes());
format!("{:x}", hasher.finalize())
}
/// Encode an embedding as a contiguous little-endian f32 byte blob
/// (the format sqlite-vec expects for `float[N]` columns).
fn embedding_to_blob(embedding: &[f32]) -> Vec<u8> {
    embedding
        .iter()
        .flat_map(|value| value.to_le_bytes())
        .collect()
}
/// Convert a raw user query into an FTS5 MATCH expression.
///
/// Runs of alphanumeric/underscore characters become quoted terms
/// joined with AND; returns `None` when no usable token remains.
fn build_fts_query(raw: &str) -> Option<String> {
    let quoted: Vec<String> = raw
        .split(|c: char| !c.is_alphanumeric() && c != '_')
        .map(str::trim)
        .filter(|token| !token.is_empty())
        .map(|token| format!("\"{}\"", token.replace('"', "")))
        .collect();
    if quoted.is_empty() {
        None
    } else {
        Some(quoted.join(" AND "))
    }
}
/// A contiguous run of lines produced by `chunk_text`.
struct ChunkInfo {
    // 1-based first line of the chunk.
    line_start: i32,
    // 1-based last line of the chunk (inclusive).
    line_end: i32,
    // Chunk text: the lines joined with '\n'.
    content: String,
}
/// Split `text` into line-aligned chunks of roughly `target_tokens`
/// tokens, with consecutive chunks sharing roughly `overlap_tokens`
/// tokens of trailing context (tokens approximated as 4 characters).
///
/// Fix: when `overlap_tokens` is 0 chunks no longer share any lines.
/// Previously the reverse scan's `overlap_len >= overlap_chars` test was
/// trivially true on the first (last) line, so at least one line was
/// always carried into the next chunk even when zero overlap was asked
/// for.
fn chunk_text(text: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<ChunkInfo> {
    let lines: Vec<&str> = text.lines().collect();
    let mut chunks = Vec::new();
    if lines.is_empty() {
        return chunks;
    }
    // Rough chars-per-token heuristic (4 chars ≈ 1 token).
    let target_chars = target_tokens * 4;
    let overlap_chars = overlap_tokens * 4;
    let mut start_line = 0;
    let mut current_chars = 0;
    let mut chunk_lines = Vec::new();
    for (i, line) in lines.iter().enumerate() {
        chunk_lines.push(*line);
        current_chars += line.len() + 1; // +1 for the newline
        // Emit a chunk when it is full, or on the final line.
        if current_chars >= target_chars || i == lines.len() - 1 {
            chunks.push(ChunkInfo {
                line_start: (start_line + 1) as i32,
                line_end: (i + 1) as i32,
                content: chunk_lines.join("\n"),
            });
            // Decide which trailing lines to carry into the next chunk.
            let mut overlap_start = chunk_lines.len();
            if overlap_chars > 0 {
                let mut overlap_len = 0;
                for (j, line) in chunk_lines.iter().enumerate().rev() {
                    overlap_len += line.len() + 1;
                    if overlap_len >= overlap_chars {
                        overlap_start = j;
                        break;
                    }
                }
            }
            if overlap_start < chunk_lines.len() {
                // Keep the tail as the start of the next chunk.
                start_line += overlap_start;
                chunk_lines = chunk_lines[overlap_start..].to_vec();
                current_chars = chunk_lines.iter().map(|l| l.len() + 1).sum();
            } else {
                // No overlap: next chunk starts fresh after this line.
                start_line = i + 1;
                chunk_lines.clear();
                current_chars = 0;
            }
        }
    }
    chunks
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// Chunking of a tiny input produces at least one chunk with 1-based
// line numbering.
#[test]
fn test_chunk_text() {
let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
let chunks = chunk_text(text, 10, 2);
assert!(!chunks.is_empty());
assert_eq!(chunks[0].line_start, 1);
}
// End-to-end smoke test: index a markdown file into a temp workspace
// and confirm FTS search finds it.
#[test]
fn test_memory_index() -> Result<()> {
let temp_dir = TempDir::new()?;
let workspace = temp_dir.path();
let test_file = workspace.join("test.md");
fs::write(
&test_file,
"# Test\n\nThis is a test document.\n\nWith multiple lines.",
)?;
let index = MemoryIndex::new(workspace)?;
index.index_file(&test_file, false)?;
assert!(index.chunk_count()? > 0);
let results = index.search("test document", 10)?;
assert!(!results.is_empty());
Ok(())
}
}