use anyhow::{Context, Result};
use std::fs;
use std::path::{Path, PathBuf};
use tracing::{debug, info, warn};
use crate::db::traits::StoreChunks;
use crate::db::traits::StoreCore;
use crate::db::SqliteStore;
use crate::indexer::SymbolChunk;
use std::sync::Arc;
use super::detector::ChangeType;
use super::edge_updater::EdgeUpdater;
use super::hash::ContentHash;
use super::path_utils::normalize_to_relpath;
use super::task::UpdateTask;
/// Upper bound (10 MiB) on file size the indexer will read; larger files are
/// skipped with a warning rather than indexed.
const MAX_FILE_SIZE_BYTES: u64 = 10 * 1024 * 1024;
/// Applies a single incremental index update (new / modified / deleted file)
/// to the SQLite store for one repository worktree at one commit.
pub struct IncrementalProcessor {
// Shared store handle; a clone is also held by `edge_updater`.
store: Arc<SqliteStore>,
// Recomputes cross-file edges after a file's chunks change.
edge_updater: EdgeUpdater,
// Absolute repository root used to normalize paths to repo-relative form.
repo_root: PathBuf,
repo_id: i64,
worktree_id: i64,
commit_id: i64,
}
impl IncrementalProcessor {
pub fn new(
store: Arc<SqliteStore>,
repo_root: PathBuf,
repo_id: i64,
worktree_id: i64,
commit_id: i64,
) -> Self {
Self {
edge_updater: EdgeUpdater::new(store.clone()),
store,
repo_root,
repo_id,
worktree_id,
commit_id,
}
}
pub async fn process(&self, task: UpdateTask) -> Result<()> {
let path_display = task.path.display().to_string();
debug!(
path = %path_display,
change_type = ?task.change_type,
priority = ?task.priority,
"Processing update task"
);
match &task.change_type {
ChangeType::New(hash) => {
self.index_new_file(&task.path, hash)
.await
.with_context(|| format!("Failed to index new file: {}", path_display))?;
info!(path = %path_display, "Indexed new file");
}
ChangeType::Modified { old: _, new } => {
self.update_file(&task.path, new)
.await
.with_context(|| format!("Failed to update modified file: {}", path_display))?;
info!(path = %path_display, "Updated modified file");
}
ChangeType::Deleted(_) => {
self.remove_file(&task.path)
.await
.with_context(|| format!("Failed to remove deleted file: {}", path_display))?;
info!(path = %path_display, "Removed deleted file");
}
ChangeType::None => {
debug!(path = %path_display, "No change detected, skipping");
return Ok(());
}
}
Ok(())
}
async fn index_new_file(&self, path: &Path, hash: &ContentHash) -> Result<()> {
let metadata = fs::metadata(path)
.with_context(|| format!("Failed to get file metadata: {}", path.display()))?;
if metadata.len() > MAX_FILE_SIZE_BYTES {
warn!(
path = %path.display(),
size_mb = metadata.len() / (1024 * 1024),
limit_mb = MAX_FILE_SIZE_BYTES / (1024 * 1024),
"File too large to index, skipping"
);
return Ok(()); }
let symlink_metadata = fs::symlink_metadata(path)
.with_context(|| format!("Failed to get symlink metadata: {}", path.display()))?;
if symlink_metadata.file_type().is_symlink() {
warn!(
path = %path.display(),
"Indexing symlink - resolved path may be outside repository"
);
}
let content = fs::read_to_string(path)
.with_context(|| format!("Failed to read file: {}", path.display()))?;
let language = detect_language_from_path(path);
let relpath = normalize_to_relpath(path, &self.repo_root)
.with_context(|| format!("Failed to normalize path: {}", path.display()))?;
let relpath_str = relpath
.to_str()
.ok_or_else(|| anyhow::anyhow!("Invalid UTF-8 in path: {}", relpath.display()))?;
let file_record = crate::db::FileRecord {
repo_id: self.repo_id,
worktree_id: self.worktree_id,
commit_id: self.commit_id,
relpath: relpath_str.to_string(),
language: language.map(|s| s.to_string()),
content_hash: hash.to_hex().to_string(),
size_bytes: metadata.len() as i32,
last_modified: Some(chrono::Utc::now()),
};
let file_id = self
.store
.upsert_file(&file_record)
.await
.with_context(|| format!("Failed to upsert file record: {}", path.display()))?;
let lang_str = language.unwrap_or("unknown");
let symbol_chunks = parse_file_chunks(&content, lang_str)
.with_context(|| format!("Failed to parse file: {}", path.display()))?;
let mut chunk_ids = Vec::new();
for chunk in &symbol_chunks {
let preview = content
.lines()
.skip((chunk.start_line - 1) as usize)
.take((chunk.end_line - chunk.start_line + 1) as usize)
.collect::<Vec<_>>()
.join("\n");
let ts_doc_text = build_ts_doc(
chunk.symbol_name.as_deref(),
chunk.signature.as_deref(),
chunk.docstring.as_deref(),
&preview,
);
let chunk_content = &preview;
let blob_sha = super::hash::FileHasher::hash_bytes(chunk_content.as_bytes())
.to_hex()
.to_string();
let chunk_record = crate::db::ChunkRecord {
file_id,
blob_sha,
symbol_name: chunk.symbol_name.clone(),
kind: chunk.kind.clone(),
signature: chunk.signature.clone(),
docstring: chunk.docstring.clone(),
start_line: chunk.start_line,
end_line: chunk.end_line,
preview,
ts_doc_text,
recency_score: 1.0, churn_score: 0.0, metadata: chunk.metadata.clone(),
worktree_id: self.worktree_id,
};
let chunk_id = self
.store
.insert_chunk(&chunk_record)
.await
.with_context(|| format!("Failed to insert chunk for file: {}", path.display()))?;
chunk_ids.push(chunk_id);
}
self.edge_updater
.update_edges(file_id)
.await
.with_context(|| format!("Failed to update edges for file: {}", path.display()))?;
debug!(
path = %path.display(),
file_id = file_id,
chunks = chunk_ids.len(),
"Indexed new file"
);
Ok(())
}
async fn update_file(&self, path: &Path, _new_hash: &ContentHash) -> Result<()> {
let metadata = fs::metadata(path)
.with_context(|| format!("Failed to get file metadata: {}", path.display()))?;
if metadata.len() > MAX_FILE_SIZE_BYTES {
warn!(
path = %path.display(),
size_mb = metadata.len() / (1024 * 1024),
limit_mb = MAX_FILE_SIZE_BYTES / (1024 * 1024),
"File too large to index, skipping"
);
return Ok(()); }
let symlink_metadata = fs::symlink_metadata(path)
.with_context(|| format!("Failed to get symlink metadata: {}", path.display()))?;
if symlink_metadata.file_type().is_symlink() {
warn!(
path = %path.display(),
"Indexing symlink - resolved path may be outside repository"
);
}
let content = fs::read_to_string(path)
.with_context(|| format!("Failed to read file: {}", path.display()))?;
let language = detect_language_from_path(path);
let relpath = normalize_to_relpath(path, &self.repo_root)
.with_context(|| format!("Failed to normalize path: {}", path.display()))?;
let relpath_str = relpath
.to_str()
.ok_or_else(|| anyhow::anyhow!("Invalid UTF-8 in path: {}", relpath.display()))?;
let file_id = self
.store
.get_file_id_by_relpath(relpath_str, self.worktree_id)
.await
.with_context(|| format!("Failed to look up file: {}", path.display()))?;
let file_id = match file_id {
Some(id) => id,
None => {
debug!(path = %path.display(), "File not found in DB, treating as new");
return self.index_new_file(path, _new_hash).await;
}
};
let chunks_deleted = self
.store
.delete_chunks_by_file(file_id)
.await
.with_context(|| format!("Failed to delete old chunks for file: {}", path.display()))?;
debug!(path = %path.display(), chunks_deleted = chunks_deleted, "Deleted old chunks");
let lang_str = language.unwrap_or("unknown");
let symbol_chunks = parse_file_chunks(&content, lang_str)
.with_context(|| format!("Failed to parse file: {}", path.display()))?;
let mut chunk_ids = Vec::new();
for chunk in &symbol_chunks {
let preview = content
.lines()
.skip((chunk.start_line - 1) as usize)
.take((chunk.end_line - chunk.start_line + 1) as usize)
.collect::<Vec<_>>()
.join("\n");
let ts_doc_text = build_ts_doc(
chunk.symbol_name.as_deref(),
chunk.signature.as_deref(),
chunk.docstring.as_deref(),
&preview,
);
let chunk_content = &preview;
let blob_sha = super::hash::FileHasher::hash_bytes(chunk_content.as_bytes())
.to_hex()
.to_string();
let chunk_record = crate::db::ChunkRecord {
file_id,
blob_sha,
symbol_name: chunk.symbol_name.clone(),
kind: chunk.kind.clone(),
signature: chunk.signature.clone(),
docstring: chunk.docstring.clone(),
start_line: chunk.start_line,
end_line: chunk.end_line,
preview,
ts_doc_text,
recency_score: 1.0, churn_score: 0.5, metadata: chunk.metadata.clone(),
worktree_id: self.worktree_id,
};
let chunk_id = self
.store
.insert_chunk(&chunk_record)
.await
.with_context(|| format!("Failed to insert chunk for file: {}", path.display()))?;
chunk_ids.push(chunk_id);
}
self.edge_updater
.update_edges(file_id)
.await
.with_context(|| format!("Failed to update edges for file: {}", path.display()))?;
debug!(
path = %path.display(),
file_id = file_id,
chunks = chunk_ids.len(),
"Updated file"
);
Ok(())
}
async fn remove_file(&self, path: &Path) -> Result<()> {
let relpath = normalize_to_relpath(path, &self.repo_root)
.with_context(|| format!("Failed to normalize path: {}", path.display()))?;
let relpath_str = relpath
.to_str()
.ok_or_else(|| anyhow::anyhow!("Invalid UTF-8 in path: {}", relpath.display()))?;
let file_id = self
.store
.get_file_id_by_relpath(relpath_str, self.worktree_id)
.await
.with_context(|| format!("Failed to look up file: {}", path.display()))?;
let file_id = match file_id {
Some(id) => id,
None => {
debug!(path = %path.display(), "File not found in DB, nothing to delete");
return Ok(());
}
};
let chunks_deleted = self
.store
.delete_chunks_by_file(file_id)
.await
.with_context(|| format!("Failed to delete chunks for file: {}", path.display()))?;
self.store
.delete_file(file_id)
.await
.with_context(|| format!("Failed to delete file record: {}", path.display()))?;
debug!(
path = %path.display(),
file_id = file_id,
chunks_deleted = chunks_deleted,
"Removed file"
);
Ok(())
}
}
/// Maps a file extension to the indexer's language tag.
///
/// Matching is ASCII case-insensitive (`main.RS` → `Some("rs")`) so files
/// from case-preserving filesystems are still recognized. Missing or unknown
/// extensions yield `None`.
fn detect_language_from_path(path: &Path) -> Option<&'static str> {
    // `?` short-circuits on no extension or non-UTF-8; lowercase normalizes
    // so the match arms only need the canonical spelling.
    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
    match ext.as_str() {
        "ts" => Some("ts"),
        "tsx" => Some("tsx"),
        "js" => Some("js"),
        "jsx" => Some("jsx"),
        "rs" => Some("rs"),
        "md" => Some("md"),
        "mdx" => Some("mdx"),
        "json" => Some("json"),
        "yaml" | "yml" => Some("yaml"),
        "toml" => Some("toml"),
        _ => None,
    }
}
/// Splits `content` into symbol chunks via the project parser.
///
/// When the parser yields nothing (unsupported language, empty file, or no
/// recognizable symbols), the whole file is wrapped in a single synthetic
/// "module" chunk so every indexed file has at least one chunk.
fn parse_file_chunks(content: &str, language: &str) -> Result<Vec<SymbolChunk>> {
    use crate::indexer::parser;
    let chunks = parser::extract_chunks(content, language);
    if chunks.is_empty() {
        Ok(vec![SymbolChunk {
            symbol_name: None,
            kind: "module".to_string(),
            signature: None,
            docstring: None,
            start_line: 1,
            // Clamp to 1 so an empty file does not yield the inverted range
            // start_line=1, end_line=0.
            end_line: (content.lines().count() as i32).max(1),
            metadata: None,
        }])
    } else {
        Ok(chunks)
    }
}
/// Builds the searchable text-search document for a chunk.
///
/// Joins whichever of `symbol_name`, `signature`, and `docstring` are
/// present, followed always by `preview`, separated by `" \n "`.
fn build_ts_doc(
    symbol_name: Option<&str>,
    signature: Option<&str>,
    docstring: Option<&str>,
    preview: &str,
) -> String {
    // Flatten drops the absent optional fields; the preview is always last.
    let doc_parts: Vec<String> = [symbol_name, signature, docstring]
        .into_iter()
        .flatten()
        .map(str::to_owned)
        .chain(std::iter::once(preview.to_owned()))
        .collect();
    doc_parts.join(" \n ")
}
#[cfg(test)]
mod tests {
use super::*;
// Known extensions map to their language tag; unknown extensions give None.
#[test]
fn test_detect_language_from_path() {
assert_eq!(
detect_language_from_path(Path::new("src/main.rs")),
Some("rs")
);
assert_eq!(
detect_language_from_path(Path::new("src/lib.ts")),
Some("ts")
);
assert_eq!(
detect_language_from_path(Path::new("README.md")),
Some("md")
);
assert_eq!(
detect_language_from_path(Path::new("config.yaml")),
Some("yaml")
);
assert_eq!(detect_language_from_path(Path::new("unknown.xyz")), None);
}
// All four parts present: each must appear in the joined document.
#[test]
fn test_build_ts_doc() {
let doc = build_ts_doc(
Some("myFunction"),
Some("fn myFunction(x: i32) -> i32"),
Some("Does something cool"),
"let x = 42;",
);
assert!(doc.contains("myFunction"));
assert!(doc.contains("fn myFunction"));
assert!(doc.contains("Does something cool"));
assert!(doc.contains("let x = 42;"));
}
// With no optional parts, the document is exactly the preview (no separators).
#[test]
fn test_build_ts_doc_minimal() {
let doc = build_ts_doc(None, None, None, "some code");
assert_eq!(doc, "some code");
}
// Empty/unparseable content falls back to a single synthetic "module" chunk.
#[test]
fn test_parse_file_chunks_creates_module_for_empty() {
let chunks = parse_file_chunks("", "unknown").unwrap();
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].kind, "module");
}
}