use anyhow::{Context, Result};
use duckdb::{params, Connection};
use std::path::Path;
/// One row of the `files` table, as produced by `Storage::get_file`.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct FileRecord {
/// Value of the `id` column.
pub id: i64,
/// Value of the `path` column (declared `UNIQUE NOT NULL` in the schema).
pub path: String,
/// Value of the `content_hash` column, stored as given to `upsert_file`.
pub content_hash: String,
/// The `last_indexed` timestamp, rendered by the database via `CAST(... AS VARCHAR)`.
pub last_indexed: String,
}
/// Plain record whose fields mirror the non-embedding columns of the `chunks` table.
///
/// NOTE(review): nothing in the visible part of this file constructs this type;
/// confirm whether it is still used elsewhere.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct ChunkRecord {
/// Chunk row identifier.
pub id: i64,
/// Identifier of the owning `files` row.
pub file_id: i64,
/// Position of the chunk within its file.
pub chunk_index: i32,
/// Text content of the chunk.
pub content: String,
/// First source line covered by the chunk.
pub start_line: i32,
/// Last source line covered by the chunk.
pub end_line: i32,
}
/// A single hit returned by `Storage::search`.
#[derive(Debug, Clone)]
pub struct SearchResult {
/// Path of the file the matching chunk belongs to (`files.path`).
pub file_path: String,
/// Text of the matching chunk (`chunks.content`).
pub chunk_content: String,
/// `chunks.start_line` of the matching chunk.
pub start_line: i32,
/// `chunks.end_line` of the matching chunk.
pub end_line: i32,
/// Result of `array_cosine_similarity` between the chunk embedding and the query.
pub score: f64,
}
/// Handle to the DuckDB database that holds the `files` and `chunks` tables.
pub struct Storage {
/// Connection every query in this type runs on.
conn: Connection,
/// Embedding width; used as `N` in the `FLOAT[N]` column type and in query casts.
dimensions: usize,
}
impl Storage {
/// Opens the database at `db_path` and runs schema initialization on it.
///
/// `dimensions` is remembered as the embedding width for the `chunks` table.
///
/// # Errors
/// Returns an error if the database cannot be opened or if
/// initialization (extension loading, DDL) fails.
pub fn new(db_path: &Path, dimensions: usize) -> Result<Self> {
    let opened = Connection::open(db_path);
    let conn =
        opened.with_context(|| format!("Failed to open database: {}", db_path.display()))?;

    let this = Self { conn, dimensions };
    match this.initialize() {
        Ok(()) => Ok(this),
        Err(err) => Err(err),
    }
}
/// Opens the database at `db_path` in read-only access mode.
///
/// Unlike [`Storage::new`], no schema initialization is performed; only the
/// VSS extension is loaded on the new connection.
///
/// # Errors
/// Returns an error if the configuration cannot be built, the database
/// cannot be opened, or `LOAD vss` fails.
pub fn open_readonly(db_path: &Path, dimensions: usize) -> Result<Self> {
    use duckdb::{AccessMode, Config};

    let read_only = Config::default().access_mode(AccessMode::ReadOnly)?;
    let describe_failure =
        || format!("Failed to open database (read-only): {}", db_path.display());
    let conn = Connection::open_with_flags(db_path, read_only).with_context(describe_failure)?;

    let loaded = conn.execute_batch("LOAD vss;");
    loaded.context("Failed to load VSS extension")?;

    Ok(Self { conn, dimensions })
}
/// Creates a storage backed by an in-memory database and initializes its schema.
///
/// # Errors
/// Returns an error if the connection cannot be created or initialization fails.
#[allow(dead_code)]
pub fn in_memory(dimensions: usize) -> Result<Self> {
    let this = Self {
        conn: Connection::open_in_memory()?,
        dimensions,
    };
    match this.initialize() {
        Ok(()) => Ok(this),
        Err(err) => Err(err),
    }
}
/// Installs and loads the VSS extension, then creates sequences, tables and indexes.
///
/// Every DDL statement uses `IF NOT EXISTS`, so the method can run against an
/// already-initialized database. The outcome of the HNSW index creation is
/// discarded on purpose: a failure there does not fail initialization.
///
/// # Errors
/// Returns an error if the extension cannot be loaded or any statement other
/// than the HNSW index creation fails.
fn initialize(&self) -> Result<()> {
    let db = &self.conn;

    // The extension is loaded before any DDL runs.
    let load_vss = "INSTALL vss;
LOAD vss;";
    db.execute_batch(load_vss).context(
        "Failed to load VSS extension. Make sure DuckDB VSS extension is available.",
    )?;

    // Sequences feed the DEFAULT nextval(...) primary keys below.
    let sequences_ddl = "CREATE SEQUENCE IF NOT EXISTS files_id_seq;
CREATE SEQUENCE IF NOT EXISTS chunks_id_seq;";
    db.execute_batch(sequences_ddl)?;

    let files_ddl = "CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY DEFAULT nextval('files_id_seq'),
path TEXT UNIQUE NOT NULL,
content_hash TEXT NOT NULL,
last_indexed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)";
    db.execute(files_ddl, [])?;

    // The embedding column width is fixed by the configured dimension count.
    let chunks_ddl = format!(
        "CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY DEFAULT nextval('chunks_id_seq'),
file_id INTEGER NOT NULL REFERENCES files(id),
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
embedding FLOAT[{}] NOT NULL
)",
        self.dimensions
    );
    db.execute(&chunks_ddl, [])?;

    let file_id_index_ddl = "CREATE INDEX IF NOT EXISTS idx_chunks_file_id ON chunks(file_id)";
    db.execute(file_id_index_ddl, [])?;

    // Best effort: the result of creating the HNSW index is intentionally ignored.
    let hnsw_ddl = "CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks
USING HNSW (embedding) WITH (metric = 'cosine')";
    let _ = db.execute(hnsw_ddl, []);

    Ok(())
}
pub fn upsert_file(&self, path: &str, content_hash: &str) -> Result<i64> {
let existing: Option<i64> = self
.conn
.query_row(
"SELECT id FROM files WHERE path = ?",
params![path],
|row| row.get(0),
)
.ok();
if let Some(id) = existing {
self.conn.execute(
"UPDATE files SET content_hash = ?, last_indexed = CURRENT_TIMESTAMP WHERE id = ?",
params![content_hash, id],
)?;
self.conn
.execute("DELETE FROM chunks WHERE file_id = ?", params![id])?;
Ok(id)
} else {
let id: i64 = self.conn.query_row(
"INSERT INTO files (path, content_hash) VALUES (?, ?) RETURNING id",
params![path, content_hash],
|row| row.get(0),
)?;
Ok(id)
}
}
pub fn insert_chunk(
&self,
file_id: i64,
chunk_index: i32,
content: &str,
start_line: i32,
end_line: i32,
embedding: &[f32],
) -> Result<i64> {
let embedding_str = format!(
"[{}]",
embedding
.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", ")
);
let sql = format!(
"INSERT INTO chunks (file_id, chunk_index, content, start_line, end_line, embedding)
VALUES (?, ?, ?, ?, ?, {}::FLOAT[{}]) RETURNING id",
embedding_str, self.dimensions
);
let id: i64 = self.conn.query_row(
&sql,
params![file_id, chunk_index, content, start_line, end_line],
|row| row.get(0),
)?;
Ok(id)
}
/// Inserts every chunk in `chunks` for `file_id`, in order.
///
/// Each tuple is `(chunk_index, content, start_line, end_line, embedding)`.
/// Insertion stops at the first failure; chunks inserted before it remain.
///
/// # Errors
/// Returns the error of the first `insert_chunk` call that fails.
#[allow(dead_code)]
pub fn insert_chunks_batch(
    &self,
    file_id: i64,
    chunks: &[(i32, String, i32, i32, Vec<f32>)],
) -> Result<()> {
    chunks
        .iter()
        .try_for_each(|(position, text, first_line, last_line, vector)| {
            self.insert_chunk(file_id, *position, text, *first_line, *last_line, vector)
                .map(|_new_id| ())
        })
}
pub fn search(
&self,
query_embedding: &[f32],
limit: usize,
folder_filter: Option<&str>,
) -> Result<Vec<SearchResult>> {
let embedding_str = format!(
"[{}]",
query_embedding
.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", ")
);
let folder_clause = if let Some(folder) = folder_filter {
format!("AND f.path LIKE '{}%'", folder.replace('\'', "''"))
} else {
String::new()
};
let sql = format!(
"SELECT
f.path,
c.content,
c.start_line,
c.end_line,
array_cosine_similarity(c.embedding, {}::FLOAT[{}]) as score
FROM chunks c
JOIN files f ON c.file_id = f.id
WHERE 1=1 {}
ORDER BY score DESC
LIMIT ?",
embedding_str, self.dimensions, folder_clause
);
let mut stmt = self.conn.prepare(&sql)?;
let results = stmt
.query_map(params![limit as i64], |row| {
Ok(SearchResult {
file_path: row.get(0)?,
chunk_content: row.get(1)?,
start_line: row.get(2)?,
end_line: row.get(3)?,
score: row.get(4)?,
})
})?
.collect::<Result<Vec<_>, _>>()?;
Ok(results)
}
pub fn get_file(&self, path: &str) -> Result<Option<FileRecord>> {
let result = self
.conn
.query_row(
"SELECT id, path, content_hash, CAST(last_indexed AS VARCHAR) FROM files WHERE path = ?",
params![path],
|row| {
Ok(FileRecord {
id: row.get(0)?,
path: row.get(1)?,
content_hash: row.get(2)?,
last_indexed: row.get(3)?,
})
},
)
.ok();
Ok(result)
}
/// Returns the `path` of every row in the `files` table.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn get_all_file_paths(&self) -> Result<Vec<String>> {
    let mut stmt = self.conn.prepare("SELECT path FROM files")?;
    let mut collected: Vec<String> = Vec::new();
    for entry in stmt.query_map([], |row| row.get(0))? {
        collected.push(entry?);
    }
    Ok(collected)
}
/// Returns `(path, content_hash)` for every row in the `files` table.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn get_all_file_hashes(&self) -> Result<Vec<(String, String)>> {
    let mut stmt = self.conn.prepare("SELECT path, content_hash FROM files")?;
    let mut pairs: Vec<(String, String)> = Vec::new();
    for entry in stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))? {
        pairs.push(entry?);
    }
    Ok(pairs)
}
/// Deletes the file registered under `path` together with its chunks.
///
/// Chunks are removed first, then the `files` row. Returns `true` when a
/// `files` row was actually deleted.
///
/// # Errors
/// Returns an error if either delete statement fails.
pub fn delete_file(&self, path: &str) -> Result<bool> {
    let drop_chunks = "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE path = ?)";
    self.conn.execute(drop_chunks, params![path])?;

    let drop_file = "DELETE FROM files WHERE path = ?";
    let removed = self.conn.execute(drop_file, params![path])?;
    Ok(removed != 0)
}
/// Deletes each path in `paths` via `delete_file` and counts how many existed.
///
/// # Errors
/// Stops at and returns the first deletion error; earlier deletions remain.
pub fn delete_files(&self, paths: &[String]) -> Result<usize> {
    paths.iter().try_fold(0, |removed_so_far, path| {
        let was_present = self.delete_file(path)?;
        Ok(removed_so_far + usize::from(was_present))
    })
}
/// Collects aggregate statistics about the index.
///
/// Reports the number of files and chunks, the most recent `last_indexed`
/// timestamp (as text, `None` when no file has one) and the sum of
/// `LENGTH(content)` over all chunks.
///
/// # Errors
/// Returns an error if any of the four queries fails. A failure of the
/// `last_indexed` query is propagated rather than reported as `None`.
pub fn get_stats(&self) -> Result<IndexStats> {
    let file_count: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM files", [], |row| row.get(0))?;
    let chunk_count: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
    // MAX over an empty table yields a single NULL row; decode it as an
    // Option so "no data" and "query failed" stay distinguishable.
    let last_indexed: Option<String> = self.conn.query_row(
        "SELECT CAST(MAX(last_indexed) AS VARCHAR) FROM files",
        [],
        |row| row.get::<_, Option<String>>(0),
    )?;
    let total_content_size: i64 = self.conn.query_row(
        "SELECT COALESCE(SUM(LENGTH(content)), 0) FROM chunks",
        [],
        |row| row.get(0),
    )?;
    Ok(IndexStats {
        file_count: file_count as usize,
        chunk_count: chunk_count as usize,
        last_indexed,
        total_content_size: total_content_size as usize,
    })
}
/// Reports whether the index must be rebuilt to use `new_dimensions`.
///
/// A rebuild is needed only when chunks exist and the configured dimension
/// count differs from `new_dimensions`. If the chunk count cannot be read,
/// it is treated as zero and no rebuild is requested.
#[allow(dead_code)]
pub fn needs_rebuild(&self, new_dimensions: usize) -> Result<bool> {
    let counted = self
        .conn
        .query_row("SELECT COUNT(*) FROM chunks LIMIT 1", [], |row| {
            row.get::<_, i64>(0)
        });
    let chunk_total = match counted {
        Ok(total) => total,
        Err(_) => 0,
    };
    Ok(chunk_total != 0 && self.dimensions != new_dimensions)
}
/// Removes every row from `chunks` and then from `files`.
///
/// # Errors
/// Returns an error if either delete fails; chunks are deleted first.
pub fn clear(&self) -> Result<()> {
    for statement in ["DELETE FROM chunks", "DELETE FROM files"] {
        self.conn.execute(statement, [])?;
    }
    Ok(())
}
/// Lists every file with its chunk count and summed chunk content length.
///
/// Files without chunks are included (the query uses a `LEFT JOIN`) and
/// report zero for both figures. Rows are ordered by path.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn list_files_with_stats(&self) -> Result<Vec<FileWithStats>> {
    let mut stmt = self.conn.prepare(
        "SELECT
f.id,
f.path,
f.content_hash,
CAST(f.last_indexed AS VARCHAR),
COUNT(c.id) as chunk_count,
COALESCE(SUM(LENGTH(c.content)), 0) as total_size
FROM files f
LEFT JOIN chunks c ON f.id = c.file_id
GROUP BY f.id, f.path, f.content_hash, f.last_indexed
ORDER BY f.path",
    )?;

    let rows = stmt.query_map([], |row| {
        let chunk_count = row.get::<_, i64>(4)? as usize;
        let total_size = row.get::<_, i64>(5)? as usize;
        Ok(FileWithStats {
            id: row.get(0)?,
            path: row.get(1)?,
            content_hash: row.get(2)?,
            last_indexed: row.get(3)?,
            chunk_count,
            total_size,
        })
    })?;

    let mut listing = Vec::new();
    for entry in rows {
        listing.push(entry?);
    }
    Ok(listing)
}
/// Returns all chunks of the file stored under exactly `file_path`,
/// ordered by `chunk_index`.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn get_chunks_for_file(&self, file_path: &str) -> Result<Vec<ChunkWithMetadata>> {
    let mut stmt = self.conn.prepare(
        "SELECT
c.id,
c.file_id,
c.chunk_index,
c.content,
c.start_line,
c.end_line,
f.path
FROM chunks c
JOIN files f ON c.file_id = f.id
WHERE f.path = ?
ORDER BY c.chunk_index",
    )?;

    let rows = stmt.query_map(params![file_path], |row| {
        Ok(ChunkWithMetadata {
            id: row.get(0)?,
            file_id: row.get(1)?,
            chunk_index: row.get(2)?,
            content: row.get(3)?,
            start_line: row.get(4)?,
            end_line: row.get(5)?,
            file_path: row.get(6)?,
        })
    })?;

    let mut chunks = Vec::new();
    for entry in rows {
        chunks.push(entry?);
    }
    Ok(chunks)
}
/// Returns one page of chunks ordered by file path and chunk index.
///
/// `offset` and `limit` select the page. When `file_filter` is given, only
/// chunks whose file path contains it are returned. The filter is matched
/// with SQL `LIKE`, so `%` and `_` inside it act as wildcards, as before;
/// it is now bound as a parameter instead of being spliced into the SQL text.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn get_chunks_paginated(
    &self,
    offset: usize,
    limit: usize,
    file_filter: Option<&str>,
) -> Result<Vec<ChunkWithMetadata>> {
    // With no filter the pattern is "%%", which matches every path
    // (`files.path` is declared NOT NULL).
    let like_pattern = format!("%{}%", file_filter.unwrap_or(""));
    // Saturate instead of wrapping if the values do not fit in an i64.
    let row_limit = i64::try_from(limit).unwrap_or(i64::MAX);
    let row_offset = i64::try_from(offset).unwrap_or(i64::MAX);

    let mut stmt = self.conn.prepare(
        "SELECT
            c.id,
            c.file_id,
            c.chunk_index,
            c.content,
            c.start_line,
            c.end_line,
            f.path
        FROM chunks c
        JOIN files f ON c.file_id = f.id
        WHERE f.path LIKE ?
        ORDER BY f.path, c.chunk_index
        LIMIT ? OFFSET ?",
    )?;
    let results = stmt
        .query_map(params![like_pattern, row_limit, row_offset], |row| {
            Ok(ChunkWithMetadata {
                id: row.get(0)?,
                file_id: row.get(1)?,
                chunk_index: row.get(2)?,
                content: row.get(3)?,
                start_line: row.get(4)?,
                end_line: row.get(5)?,
                file_path: row.get(6)?,
            })
        })?
        .collect::<Result<Vec<_>, _>>()?;
    Ok(results)
}
pub fn get_chunk_by_id(&self, chunk_id: i64) -> Result<Option<ChunkWithMetadata>> {
let result = self
.conn
.query_row(
"SELECT
c.id,
c.file_id,
c.chunk_index,
c.content,
c.start_line,
c.end_line,
f.path
FROM chunks c
JOIN files f ON c.file_id = f.id
WHERE c.id = ?",
params![chunk_id],
|row| {
Ok(ChunkWithMetadata {
id: row.get(0)?,
file_id: row.get(1)?,
chunk_index: row.get(2)?,
content: row.get(3)?,
start_line: row.get(4)?,
end_line: row.get(5)?,
file_path: row.get(6)?,
})
},
)
.ok();
Ok(result)
}
/// Aggregates file count, chunk count and summed chunk content length per
/// file extension, ordered by descending file count.
///
/// The extension is the text after the last `.` of the final path component
/// only, so a dot in a directory name (for example `src.d/Makefile`) no longer
/// produces a bogus extension containing a path separator. Paths whose final
/// component has no dot, or ends with a dot, are grouped as `no extension`.
/// A leading-dot name such as `.gitignore` is still reported as `gitignore`.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row fails to decode.
pub fn get_stats_by_extension(&self) -> Result<Vec<ExtensionStats>> {
    // Raw string: the backslashes belong to the regular expression.
    // `regexp_extract` returns '' when nothing matches, which NULLIF/COALESCE
    // turn into the 'no extension' bucket.
    let mut stmt = self.conn.prepare(
        r"SELECT
            COALESCE(
                NULLIF(regexp_extract(f.path, '\.([^./\\]+)$', 1), ''),
                'no extension'
            ) as ext,
            COUNT(DISTINCT f.id) as file_count,
            COUNT(c.id) as chunk_count,
            COALESCE(SUM(LENGTH(c.content)), 0) as total_size
        FROM files f
        LEFT JOIN chunks c ON f.id = c.file_id
        GROUP BY ext
        ORDER BY file_count DESC",
    )?;
    let results = stmt
        .query_map([], |row| {
            Ok(ExtensionStats {
                extension: row.get(0)?,
                file_count: row.get::<_, i64>(1)? as usize,
                chunk_count: row.get::<_, i64>(2)? as usize,
                total_size: row.get::<_, i64>(3)? as usize,
            })
        })?
        .collect::<Result<Vec<_>, _>>()?;
    Ok(results)
}
}
/// A `files` row plus per-file chunk aggregates, as produced by
/// `Storage::list_files_with_stats`.
#[derive(Debug, Clone)]
pub struct FileWithStats {
/// Value of `files.id`.
#[allow(dead_code)]
pub id: i64,
/// Value of `files.path`.
pub path: String,
/// Value of `files.content_hash`.
pub content_hash: String,
/// `files.last_indexed` rendered by the database via `CAST(... AS VARCHAR)`.
pub last_indexed: String,
/// Number of chunks joined to this file (`COUNT(c.id)`).
pub chunk_count: usize,
/// Sum of `LENGTH(content)` over the file's chunks; zero when it has none.
pub total_size: usize,
}
/// A `chunks` row joined with the path of its owning file.
#[derive(Debug, Clone)]
pub struct ChunkWithMetadata {
/// Value of `chunks.id`.
pub id: i64,
/// Value of `chunks.file_id`.
#[allow(dead_code)]
pub file_id: i64,
/// Value of `chunks.chunk_index`; chunk listings are ordered by it.
pub chunk_index: i32,
/// Value of `chunks.content`.
pub content: String,
/// Value of `chunks.start_line`.
pub start_line: i32,
/// Value of `chunks.end_line`.
pub end_line: i32,
/// Value of `files.path` for the owning file.
pub file_path: String,
}
/// Per-extension aggregates, as produced by `Storage::get_stats_by_extension`.
#[derive(Debug, Clone)]
pub struct ExtensionStats {
/// Extension text derived from the file path, or `no extension`.
pub extension: String,
/// Number of distinct files in this group.
pub file_count: usize,
/// Number of chunks belonging to files in this group.
pub chunk_count: usize,
/// Sum of `LENGTH(content)` over those chunks.
pub total_size: usize,
}
/// Whole-index aggregates, as produced by `Storage::get_stats`.
#[derive(Debug, Clone)]
pub struct IndexStats {
/// Number of rows in `files`.
pub file_count: usize,
/// Number of rows in `chunks`.
pub chunk_count: usize,
/// Latest `files.last_indexed` value as text, if one could be read.
pub last_indexed: Option<String>,
/// Sum of `LENGTH(content)` over all chunks.
pub total_content_size: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created in-memory storage reports no files and no chunks.
    #[test]
    fn test_storage_creation() {
        let storage = match Storage::in_memory(384) {
            Ok(storage) => storage,
            Err(_) => {
                eprintln!("Skipping test: VSS extension not available");
                return;
            }
        };

        let stats = storage.get_stats().unwrap();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.chunk_count, 0);
    }

    /// Exercises insert, update-in-place and delete of a single file record.
    #[test]
    fn test_file_operations() {
        // Skipped silently when the storage cannot be created.
        let storage = match Storage::in_memory(384) {
            Ok(storage) => storage,
            Err(_) => return,
        };

        // First upsert creates the record.
        let file_id = storage.upsert_file("test.rs", "abc123").unwrap();
        assert!(file_id > 0);
        let file = storage.get_file("test.rs").unwrap();
        assert!(file.is_some());
        assert_eq!(file.unwrap().content_hash, "abc123");

        // Second upsert keeps the id and replaces the hash.
        let file_id2 = storage.upsert_file("test.rs", "def456").unwrap();
        assert_eq!(file_id, file_id2);
        let file = storage.get_file("test.rs").unwrap().unwrap();
        assert_eq!(file.content_hash, "def456");

        // Deleting removes the record.
        let deleted = storage.delete_file("test.rs").unwrap();
        assert!(deleted);
        let file = storage.get_file("test.rs").unwrap();
        assert!(file.is_none());
    }
}