// vyctor 0.1.0 - Docs.rs

//! DuckDB storage implementation with VSS extension

use anyhow::{Context, Result};
use duckdb::{params, Connection};
use std::path::Path;

/// A record representing an indexed file.
///
/// Populated from a row of the `files` table (see `Storage::get_file`).
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct FileRecord {
    /// Value of the `files.id` column.
    pub id: i64,
    /// Value of the `files.path` column (declared `UNIQUE NOT NULL` in the schema).
    pub path: String,
    /// Value of the `files.content_hash` column.
    pub content_hash: String,
    /// `files.last_indexed`, rendered to text by the query (`CAST(... AS VARCHAR)`).
    pub last_indexed: String,
}

/// A record representing a chunk of a file.
///
/// Field names match the non-embedding columns of the `chunks` table.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct ChunkRecord {
    /// Identifier of the chunk row.
    pub id: i64,
    /// Identifier of the owning row in `files`.
    pub file_id: i64,
    /// Position of the chunk within its file.
    pub chunk_index: i32,
    /// Text content of the chunk.
    pub content: String,
    /// First line covered by the chunk (presumably a line number in the source file; confirm with the chunker).
    pub start_line: i32,
    /// Last line covered by the chunk (same caveat as `start_line`).
    pub end_line: i32,
}

/// A search result with similarity score.
///
/// Produced by `Storage::search`, one value per matching chunk.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Path of the file the chunk belongs to (`files.path`).
    pub file_path: String,
    /// Text of the matching chunk (`chunks.content`).
    pub chunk_content: String,
    /// `chunks.start_line` of the matching chunk.
    pub start_line: i32,
    /// `chunks.end_line` of the matching chunk.
    pub end_line: i32,
    /// Result of `array_cosine_similarity` between the chunk embedding and the query embedding.
    pub score: f64,
}

/// Storage interface for DuckDB operations.
pub struct Storage {
    /// The DuckDB connection every query in this type goes through.
    conn: Connection,
    /// Embedding width; used as `N` in the `FLOAT[N]` column type and in the
    /// casts applied to embedding literals.
    dimensions: usize,
}

impl Storage {
    /// Open the database at `db_path` and run schema initialization on it.
    ///
    /// `dimensions` is stored on the instance and used as the embedding width.
    ///
    /// # Errors
    /// Returns an error if the database cannot be opened (the message includes
    /// the path) or if `initialize` fails.
    pub fn new(db_path: &Path, dimensions: usize) -> Result<Self> {
        let storage = Self {
            conn: Connection::open(db_path)
                .with_context(|| format!("Failed to open database: {}", db_path.display()))?,
            dimensions,
        };

        // Schema setup runs on every open; the DDL statements use IF NOT EXISTS.
        storage.initialize()?;
        Ok(storage)
    }

    /// Open an existing database in read-only mode.
    ///
    /// Read-only access allows multiple readers (e.g., browse commands while
    /// the daemon is running). No schema initialization is performed; only the
    /// VSS extension is loaded.
    ///
    /// # Errors
    /// Returns an error if the read-only configuration cannot be built, the
    /// database cannot be opened, or the VSS extension fails to load.
    pub fn open_readonly(db_path: &Path, dimensions: usize) -> Result<Self> {
        use duckdb::{AccessMode, Config};

        let read_only = Config::default().access_mode(AccessMode::ReadOnly)?;

        // Built lazily: the message is only formatted if opening fails.
        let open_failed =
            || format!("Failed to open database (read-only): {}", db_path.display());
        let conn = Connection::open_with_flags(db_path, read_only).with_context(open_failed)?;

        // The extension is only loaded here, never installed.
        conn.execute_batch("LOAD vss;")
            .context("Failed to load VSS extension")?;

        Ok(Self { conn, dimensions })
    }

    /// Create a storage instance backed by an in-memory database (for testing).
    ///
    /// The schema is initialized exactly as in `new`.
    ///
    /// # Errors
    /// Returns an error if the connection cannot be created or `initialize` fails.
    #[allow(dead_code)]
    pub fn in_memory(dimensions: usize) -> Result<Self> {
        let storage = Self {
            conn: Connection::open_in_memory()?,
            dimensions,
        };
        storage.initialize()?;
        Ok(storage)
    }

    /// Initialize the database schema.
    ///
    /// Runs, in order: install/load of the VSS extension, creation of the id
    /// sequences, the `files` table, the `chunks` table (with a
    /// `FLOAT[dimensions]` embedding column), an index on `chunks.file_id`,
    /// and finally a best-effort HNSW index on the embedding column.
    ///
    /// Every statement uses `IF NOT EXISTS`, so calling this on an already
    /// initialized database is expected to be a no-op.
    ///
    /// # Errors
    /// Returns an error if any step other than the HNSW index creation fails.
    fn initialize(&self) -> Result<()> {
        // Install and load the VSS extension
        self.conn
            .execute_batch(
                "INSTALL vss;
             LOAD vss;",
            )
            .context(
                "Failed to load VSS extension. Make sure DuckDB VSS extension is available.",
            )?;

        // Create sequences for auto-incrementing IDs
        // (must exist before the tables, whose id defaults call nextval on them)
        self.conn.execute_batch(
            "CREATE SEQUENCE IF NOT EXISTS files_id_seq;
             CREATE SEQUENCE IF NOT EXISTS chunks_id_seq;",
        )?;

        // Create the files table
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY DEFAULT nextval('files_id_seq'),
                path TEXT UNIQUE NOT NULL,
                content_hash TEXT NOT NULL,
                last_indexed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )",
            [],
        )?;

        // Create the chunks table with embedding column
        // Using a fixed-size FLOAT array for embeddings; its length is `self.dimensions`
        // Note: DuckDB doesn't support ON DELETE CASCADE, so we handle deletions manually
        // (chunks references files, so files must already exist at this point)
        let create_chunks = format!(
            "CREATE TABLE IF NOT EXISTS chunks (
                id INTEGER PRIMARY KEY DEFAULT nextval('chunks_id_seq'),
                file_id INTEGER NOT NULL REFERENCES files(id),
                chunk_index INTEGER NOT NULL,
                content TEXT NOT NULL,
                start_line INTEGER,
                end_line INTEGER,
                embedding FLOAT[{}] NOT NULL
            )",
            self.dimensions
        );
        self.conn.execute(&create_chunks, [])?;

        // Create index on file_id for faster lookups
        self.conn.execute(
            "CREATE INDEX IF NOT EXISTS idx_chunks_file_id ON chunks(file_id)",
            [],
        )?;

        // Create HNSW index for vector similarity search.
        // `IF NOT EXISTS` already covers the case where the index is present, so
        // the discarded result below hides any *other* failure to build the index;
        // initialization deliberately continues without it.
        // NOTE(review): `search` calls `array_cosine_similarity` directly, so it
        // presumably still works (unindexed) when this step fails - confirm, and
        // confirm whether file-backed databases need extra configuration for HNSW.
        let create_hnsw = "CREATE INDEX IF NOT EXISTS chunks_embedding_idx ON chunks 
             USING HNSW (embedding) WITH (metric = 'cosine')";
        let _ = self.conn.execute(create_hnsw, []);

        Ok(())
    }

    /// Insert or update a file record
    pub fn upsert_file(&self, path: &str, content_hash: &str) -> Result<i64> {
        // Try to get existing file
        let existing: Option<i64> = self
            .conn
            .query_row(
                "SELECT id FROM files WHERE path = ?",
                params![path],
                |row| row.get(0),
            )
            .ok();

        if let Some(id) = existing {
            // Update the hash and timestamp
            self.conn.execute(
                "UPDATE files SET content_hash = ?, last_indexed = CURRENT_TIMESTAMP WHERE id = ?",
                params![content_hash, id],
            )?;
            // Delete old chunks (will be re-inserted)
            self.conn
                .execute("DELETE FROM chunks WHERE file_id = ?", params![id])?;
            Ok(id)
        } else {
            // Insert new file and get the ID using RETURNING
            let id: i64 = self.conn.query_row(
                "INSERT INTO files (path, content_hash) VALUES (?, ?) RETURNING id",
                params![path, content_hash],
                |row| row.get(0),
            )?;
            Ok(id)
        }
    }

    /// Insert a single chunk, together with its embedding, and return the new row id.
    ///
    /// The embedding is rendered as a list literal and cast to
    /// `FLOAT[dimensions]` inside the SQL text; all other values are bound as
    /// parameters.
    ///
    /// # Errors
    /// Returns an error if the insert fails.
    pub fn insert_chunk(
        &self,
        file_id: i64,
        chunk_index: i32,
        content: &str,
        start_line: i32,
        end_line: i32,
        embedding: &[f32],
    ) -> Result<i64> {
        // Render the vector as `[v0, v1, ...]`
        let components: Vec<String> = embedding.iter().map(f32::to_string).collect();
        let vector_literal = format!("[{}]", components.join(", "));

        let statement = format!(
            "INSERT INTO chunks (file_id, chunk_index, content, start_line, end_line, embedding) 
             VALUES (?, ?, ?, ?, ?, {}::FLOAT[{}]) RETURNING id",
            vector_literal, self.dimensions
        );

        Ok(self.conn.query_row(
            &statement,
            params![file_id, chunk_index, content, start_line, end_line],
            |row| row.get(0),
        )?)
    }

    /// Insert several chunks belonging to `file_id`.
    ///
    /// Each tuple is `(chunk_index, content, start_line, end_line, embedding)`.
    /// Chunks are inserted one at a time, in order, via `insert_chunk`;
    /// insertion stops at the first failure.
    ///
    /// # Errors
    /// Returns the error of the first chunk that fails to insert.
    #[allow(dead_code)]
    pub fn insert_chunks_batch(
        &self,
        file_id: i64,
        chunks: &[(i32, String, i32, i32, Vec<f32>)],
    ) -> Result<()> {
        chunks
            .iter()
            .try_for_each(|(chunk_index, content, start_line, end_line, embedding)| {
                self.insert_chunk(
                    file_id,
                    *chunk_index,
                    content,
                    *start_line,
                    *end_line,
                    embedding,
                )
                // The generated row id is not needed here.
                .map(|_| ())
            })
    }

    /// Search for similar chunks using vector similarity.
    ///
    /// Returns at most `limit` chunks ordered by descending
    /// `array_cosine_similarity` against `query_embedding`. When
    /// `folder_filter` is given, only files whose path matches
    /// `LIKE '<folder>%'` are considered.
    ///
    /// # Errors
    /// Returns an error if the statement cannot be prepared or executed, or if
    /// a row cannot be decoded.
    pub fn search(
        &self,
        query_embedding: &[f32],
        limit: usize,
        folder_filter: Option<&str>,
    ) -> Result<Vec<SearchResult>> {
        let embedding_str = format!(
            "[{}]",
            query_embedding
                .iter()
                .map(|v| v.to_string())
                .collect::<Vec<_>>()
                .join(", ")
        );

        // The filter is bound as a parameter rather than spliced into the SQL
        // text, so no manual quote escaping is needed. Without a filter the
        // pattern is just `%`, which matches every path (`path` is NOT NULL).
        // NOTE(review): `%` and `_` inside the folder string still act as LIKE
        // wildcards, exactly as before - escape them if literal matching is wanted.
        let path_pattern = format!("{}%", folder_filter.unwrap_or(""));

        let sql = format!(
            "SELECT 
                f.path,
                c.content,
                c.start_line,
                c.end_line,
                array_cosine_similarity(c.embedding, {}::FLOAT[{}]) as score
             FROM chunks c
             JOIN files f ON c.file_id = f.id
             WHERE f.path LIKE ?
             ORDER BY score DESC
             LIMIT ?",
            embedding_str, self.dimensions
        );

        let mut stmt = self.conn.prepare(&sql)?;
        let results = stmt
            .query_map(params![path_pattern, limit as i64], |row| {
                Ok(SearchResult {
                    file_path: row.get(0)?,
                    chunk_content: row.get(1)?,
                    start_line: row.get(2)?,
                    end_line: row.get(3)?,
                    score: row.get(4)?,
                })
            })?
            .collect::<Result<Vec<_>, _>>()?;

        Ok(results)
    }

    /// Get a file by path
    pub fn get_file(&self, path: &str) -> Result<Option<FileRecord>> {
        let result = self
            .conn
            .query_row(
                "SELECT id, path, content_hash, CAST(last_indexed AS VARCHAR) FROM files WHERE path = ?",
                params![path],
                |row| {
                    Ok(FileRecord {
                        id: row.get(0)?,
                        path: row.get(1)?,
                        content_hash: row.get(2)?,
                        last_indexed: row.get(3)?,
                    })
                },
            )
            .ok();
        Ok(result)
    }

    /// Return the `path` of every row in the `files` table.
    ///
    /// # Errors
    /// Returns an error if the query fails or a row cannot be decoded.
    pub fn get_all_file_paths(&self) -> Result<Vec<String>> {
        let mut stmt = self.conn.prepare("SELECT path FROM files")?;
        let rows = stmt.query_map([], |row| row.get::<_, String>(0))?;

        let mut paths = Vec::new();
        for path in rows {
            paths.push(path?);
        }
        Ok(paths)
    }

    /// Return `(path, content_hash)` for every indexed file (for parallel skip detection).
    ///
    /// # Errors
    /// Returns an error if the query fails or a row cannot be decoded.
    pub fn get_all_file_hashes(&self) -> Result<Vec<(String, String)>> {
        let mut stmt = self.conn.prepare("SELECT path, content_hash FROM files")?;
        let rows = stmt.query_map([], |row| {
            let path: String = row.get(0)?;
            let content_hash: String = row.get(1)?;
            Ok((path, content_hash))
        })?;

        let mut pairs = Vec::new();
        for pair in rows {
            pairs.push(pair?);
        }
        Ok(pairs)
    }

    /// Delete the file with the given path along with all of its chunks.
    ///
    /// Returns `true` if a `files` row was removed, `false` if no file had
    /// this path.
    ///
    /// # Errors
    /// Returns an error if either delete statement fails.
    pub fn delete_file(&self, path: &str) -> Result<bool> {
        // Chunks go first: DuckDB doesn't support ON DELETE CASCADE, and
        // chunks.file_id references files(id).
        self.conn.execute(
            "DELETE FROM chunks WHERE file_id IN (SELECT id FROM files WHERE path = ?)",
            params![path],
        )?;

        // Now the file row itself; the affected-row count tells us whether it existed.
        let removed_files = self
            .conn
            .execute("DELETE FROM files WHERE path = ?", params![path])?;
        Ok(removed_files != 0)
    }

    /// Delete every file in `paths` and return how many actually existed.
    ///
    /// Files are deleted one by one via `delete_file`; processing stops at the
    /// first failure.
    ///
    /// # Errors
    /// Returns the error of the first deletion that fails.
    pub fn delete_files(&self, paths: &[String]) -> Result<usize> {
        paths
            .iter()
            .try_fold(0usize, |removed, path| -> Result<usize> {
                // `true` counts as 1, `false` as 0.
                Ok(removed + usize::from(self.delete_file(path)?))
            })
    }

    /// Get statistics about the index.
    ///
    /// Reports the number of files and chunks, the most recent `last_indexed`
    /// timestamp (as text, `None` when there are no files), and the summed
    /// `LENGTH(content)` over all chunks.
    ///
    /// # Errors
    /// Returns an error if any of the queries fails.
    pub fn get_stats(&self) -> Result<IndexStats> {
        let file_count: i64 = self
            .conn
            .query_row("SELECT COUNT(*) FROM files", [], |row| row.get(0))?;

        let chunk_count: i64 = self
            .conn
            .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;

        // MAX over an empty table yields one row holding NULL. Decoding it as
        // `Option<String>` maps that NULL to `None` without also hiding real
        // query failures, which the previous `.ok()` did.
        let last_indexed: Option<String> = self.conn.query_row(
            "SELECT CAST(MAX(last_indexed) AS VARCHAR) FROM files",
            [],
            |row| row.get(0),
        )?;

        let total_content_size: i64 = self.conn.query_row(
            "SELECT COALESCE(SUM(LENGTH(content)), 0) FROM chunks",
            [],
            |row| row.get(0),
        )?;

        // COUNT and SUM(LENGTH(..)) are never negative, so these casts do not wrap.
        Ok(IndexStats {
            file_count: file_count as usize,
            chunk_count: chunk_count as usize,
            last_indexed,
            total_content_size: total_content_size as usize,
        })
    }

    /// Check if the database needs rebuilding (e.g., dimension mismatch).
    ///
    /// Returns `false` when no chunks are stored. Otherwise returns whether
    /// `new_dimensions` differs from the dimension this instance was opened with.
    ///
    /// # Errors
    /// Returns an error if the chunk count cannot be queried (previously a
    /// failed query was silently treated as "no chunks").
    #[allow(dead_code)]
    pub fn needs_rebuild(&self, new_dimensions: usize) -> Result<bool> {
        // An aggregate without GROUP BY always yields exactly one row, so no
        // LIMIT is needed.
        let chunk_count: i64 = self
            .conn
            .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))
            .context("Failed to count chunks")?;

        if chunk_count == 0 {
            return Ok(false);
        }

        // We can't easily check array dimensions in DuckDB, so we trust the config
        Ok(self.dimensions != new_dimensions)
    }

    /// Remove every row from `chunks` and then from `files`.
    ///
    /// # Errors
    /// Returns an error if either delete fails; `files` is left untouched when
    /// deleting from `chunks` fails.
    pub fn clear(&self) -> Result<()> {
        // Order matters: chunks reference files, so they are removed first.
        for statement in ["DELETE FROM chunks", "DELETE FROM files"] {
            self.conn.execute(statement, [])?;
        }
        Ok(())
    }

    /// List every file together with its chunk count and summed chunk content length.
    ///
    /// Files without chunks are included (LEFT JOIN) with a count and size of
    /// zero. Results are ordered by path.
    ///
    /// # Errors
    /// Returns an error if the query fails or a row cannot be decoded.
    pub fn list_files_with_stats(&self) -> Result<Vec<FileWithStats>> {
        let mut stmt = self.conn.prepare(
            "SELECT 
                f.id,
                f.path,
                f.content_hash,
                CAST(f.last_indexed AS VARCHAR),
                COUNT(c.id) as chunk_count,
                COALESCE(SUM(LENGTH(c.content)), 0) as total_size
             FROM files f
             LEFT JOIN chunks c ON f.id = c.file_id
             GROUP BY f.id, f.path, f.content_hash, f.last_indexed
             ORDER BY f.path",
        )?;

        let rows = stmt.query_map([], |row| {
            let id = row.get(0)?;
            let path = row.get(1)?;
            let content_hash = row.get(2)?;
            let last_indexed = row.get(3)?;
            // Aggregates come back as 64-bit integers.
            let chunk_count: i64 = row.get(4)?;
            let total_size: i64 = row.get(5)?;
            Ok(FileWithStats {
                id,
                path,
                content_hash,
                last_indexed,
                chunk_count: chunk_count as usize,
                total_size: total_size as usize,
            })
        })?;

        let mut files = Vec::new();
        for file in rows {
            files.push(file?);
        }
        Ok(files)
    }

    /// Return all chunks of the file whose path equals `file_path`, ordered by chunk index.
    ///
    /// An unknown path yields an empty vector.
    ///
    /// # Errors
    /// Returns an error if the query fails or a row cannot be decoded.
    pub fn get_chunks_for_file(&self, file_path: &str) -> Result<Vec<ChunkWithMetadata>> {
        let mut stmt = self.conn.prepare(
            "SELECT 
                c.id,
                c.file_id,
                c.chunk_index,
                c.content,
                c.start_line,
                c.end_line,
                f.path
             FROM chunks c
             JOIN files f ON c.file_id = f.id
             WHERE f.path = ?
             ORDER BY c.chunk_index",
        )?;

        let rows = stmt.query_map(params![file_path], |row| {
            let id = row.get(0)?;
            let file_id = row.get(1)?;
            let chunk_index = row.get(2)?;
            let content = row.get(3)?;
            let start_line = row.get(4)?;
            let end_line = row.get(5)?;
            let file_path = row.get(6)?;
            Ok(ChunkWithMetadata {
                id,
                file_id,
                chunk_index,
                content,
                start_line,
                end_line,
                file_path,
            })
        })?;

        let mut chunks = Vec::new();
        for chunk in rows {
            chunks.push(chunk?);
        }
        Ok(chunks)
    }

    /// Get all chunks with pagination.
    ///
    /// Returns up to `limit` chunks starting at `offset`, ordered by file path
    /// and chunk index. When `file_filter` is given, only chunks whose file
    /// path matches `LIKE '%<filter>%'` are returned.
    ///
    /// # Errors
    /// Returns an error if the statement cannot be prepared or executed, or if
    /// a row cannot be decoded.
    pub fn get_chunks_paginated(
        &self,
        offset: usize,
        limit: usize,
        file_filter: Option<&str>,
    ) -> Result<Vec<ChunkWithMetadata>> {
        // The filter is bound as a parameter rather than spliced into the SQL
        // text, so no manual quote escaping is needed. Without a filter the
        // pattern is a bare `%`, which matches every path (`path` is NOT NULL).
        // LIKE wildcards inside the filter keep working as they did before.
        let path_pattern = match file_filter {
            Some(pattern) => format!("%{}%", pattern),
            None => String::from("%"),
        };

        let mut stmt = self.conn.prepare(
            "SELECT 
                c.id,
                c.file_id,
                c.chunk_index,
                c.content,
                c.start_line,
                c.end_line,
                f.path
             FROM chunks c
             JOIN files f ON c.file_id = f.id
             WHERE f.path LIKE ?
             ORDER BY f.path, c.chunk_index
             LIMIT ? OFFSET ?",
        )?;

        let results = stmt
            .query_map(params![path_pattern, limit as i64, offset as i64], |row| {
                Ok(ChunkWithMetadata {
                    id: row.get(0)?,
                    file_id: row.get(1)?,
                    chunk_index: row.get(2)?,
                    content: row.get(3)?,
                    start_line: row.get(4)?,
                    end_line: row.get(5)?,
                    file_path: row.get(6)?,
                })
            })?
            .collect::<Result<Vec<_>, _>>()?;

        Ok(results)
    }

    /// Get a single chunk by ID
    pub fn get_chunk_by_id(&self, chunk_id: i64) -> Result<Option<ChunkWithMetadata>> {
        let result = self
            .conn
            .query_row(
                "SELECT 
                    c.id,
                    c.file_id,
                    c.chunk_index,
                    c.content,
                    c.start_line,
                    c.end_line,
                    f.path
                 FROM chunks c
                 JOIN files f ON c.file_id = f.id
                 WHERE c.id = ?",
                params![chunk_id],
                |row| {
                    Ok(ChunkWithMetadata {
                        id: row.get(0)?,
                        file_id: row.get(1)?,
                        chunk_index: row.get(2)?,
                        content: row.get(3)?,
                        start_line: row.get(4)?,
                        end_line: row.get(5)?,
                        file_path: row.get(6)?,
                    })
                },
            )
            .ok();
        Ok(result)
    }

    /// Aggregate file, chunk, and content-length totals per file extension.
    ///
    /// The extension is whatever follows the last `.` anywhere in the stored
    /// path; paths without a `.` are grouped under `'no extension'`. Results
    /// are ordered by descending file count.
    ///
    /// NOTE(review): because the whole path is scanned, a dot in a directory
    /// name (with no dot in the file name) is treated as the extension
    /// separator - confirm whether that is acceptable.
    ///
    /// # Errors
    /// Returns an error if the query fails or a row cannot be decoded.
    pub fn get_stats_by_extension(&self) -> Result<Vec<ExtensionStats>> {
        let mut stmt = self.conn.prepare(
            "SELECT 
                COALESCE(
                    CASE 
                        WHEN POSITION('.' IN REVERSE(path)) > 0 
                        THEN SUBSTRING(path FROM LENGTH(path) - POSITION('.' IN REVERSE(path)) + 2)
                        ELSE 'no extension'
                    END,
                    'no extension'
                ) as ext,
                COUNT(DISTINCT f.id) as file_count,
                COUNT(c.id) as chunk_count,
                COALESCE(SUM(LENGTH(c.content)), 0) as total_size
             FROM files f
             LEFT JOIN chunks c ON f.id = c.file_id
             GROUP BY ext
             ORDER BY file_count DESC",
        )?;

        let rows = stmt.query_map([], |row| {
            let extension: String = row.get(0)?;
            // Aggregates come back as 64-bit integers.
            let file_count: i64 = row.get(1)?;
            let chunk_count: i64 = row.get(2)?;
            let total_size: i64 = row.get(3)?;
            Ok(ExtensionStats {
                extension,
                file_count: file_count as usize,
                chunk_count: chunk_count as usize,
                total_size: total_size as usize,
            })
        })?;

        let mut per_extension = Vec::new();
        for stats in rows {
            per_extension.push(stats?);
        }
        Ok(per_extension)
    }
}

/// A file record with additional statistics.
///
/// Produced by `Storage::list_files_with_stats`.
#[derive(Debug, Clone)]
pub struct FileWithStats {
    /// Value of the `files.id` column.
    #[allow(dead_code)]
    pub id: i64,
    /// Value of the `files.path` column.
    pub path: String,
    /// Value of the `files.content_hash` column.
    pub content_hash: String,
    /// `files.last_indexed`, rendered to text by the query.
    pub last_indexed: String,
    /// Number of chunks joined to this file (`COUNT(c.id)`); zero when the file has none.
    pub chunk_count: usize,
    /// Sum of `LENGTH(content)` over the file's chunks; zero when the file has none.
    pub total_size: usize,
}

/// A chunk record with file path included.
///
/// Produced by the chunk queries on `Storage`, which join `chunks` to `files`.
#[derive(Debug, Clone)]
pub struct ChunkWithMetadata {
    /// Value of the `chunks.id` column.
    pub id: i64,
    /// Value of the `chunks.file_id` column (id of the owning file).
    #[allow(dead_code)]
    pub file_id: i64,
    /// Value of the `chunks.chunk_index` column; chunk queries order by it.
    pub chunk_index: i32,
    /// Value of the `chunks.content` column.
    pub content: String,
    /// Value of the `chunks.start_line` column.
    pub start_line: i32,
    /// Value of the `chunks.end_line` column.
    pub end_line: i32,
    /// `files.path` of the owning file.
    pub file_path: String,
}

/// Statistics grouped by file extension.
///
/// Produced by `Storage::get_stats_by_extension`.
#[derive(Debug, Clone)]
pub struct ExtensionStats {
    /// Text after the last `.` in the stored path, or `"no extension"` when the path has no `.`.
    pub extension: String,
    /// Number of distinct files in this group.
    pub file_count: usize,
    /// Number of chunks belonging to files in this group.
    pub chunk_count: usize,
    /// Sum of `LENGTH(content)` over those chunks.
    pub total_size: usize,
}

/// Statistics about the index.
///
/// Produced by `Storage::get_stats`.
#[derive(Debug, Clone)]
pub struct IndexStats {
    /// Number of rows in the `files` table.
    pub file_count: usize,
    /// Number of rows in the `chunks` table.
    pub chunk_count: usize,
    /// Most recent `files.last_indexed` value rendered as text, if available.
    pub last_indexed: Option<String>,
    /// Sum of `LENGTH(content)` over all chunks.
    pub total_content_size: usize,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh in-memory store reports an empty index.
    #[test]
    fn test_storage_creation() {
        // This test requires the DuckDB VSS extension; it is skipped when
        // the store cannot be created.
        let storage = match Storage::in_memory(384) {
            Ok(storage) => storage,
            Err(_) => {
                eprintln!("Skipping test: VSS extension not available");
                return;
            }
        };

        let stats = storage.get_stats().unwrap();
        assert_eq!(stats.file_count, 0);
        assert_eq!(stats.chunk_count, 0);
    }

    /// Insert, read back, update, and delete a single file record.
    #[test]
    fn test_file_operations() {
        let storage = match Storage::in_memory(384) {
            Ok(storage) => storage,
            Err(_) => return, // Skip if VSS not available
        };

        // Insert a file
        let first_id = storage.upsert_file("test.rs", "abc123").unwrap();
        assert!(first_id > 0);

        // Get the file
        let fetched = storage.get_file("test.rs").unwrap();
        assert!(fetched.is_some());
        assert_eq!(fetched.unwrap().content_hash, "abc123");

        // Update the file: the id must be stable, the hash must change
        let second_id = storage.upsert_file("test.rs", "def456").unwrap();
        assert_eq!(first_id, second_id);

        let updated = storage.get_file("test.rs").unwrap().unwrap();
        assert_eq!(updated.content_hash, "def456");

        // Delete the file
        assert!(storage.delete_file("test.rs").unwrap());

        let after_delete = storage.get_file("test.rs").unwrap();
        assert!(after_delete.is_none());
    }
}