magellan 3.2.0 - Docs.rs

//! Graph schema definitions for Magellan
//!
//! Defines node and edge payloads for sqlitegraph persistence.

use serde::{Deserialize, Serialize};

use anyhow::Result;

/// File node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileNode {
    pub path: String,
    pub hash: String,
    /// Unix timestamp (seconds since epoch) when this file was last indexed
    pub last_indexed_at: i64,
    /// Unix timestamp (seconds since epoch) of filesystem mtime when indexed
    pub last_modified: i64,
}

/// Symbol node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolNode {
    /// Stable symbol ID derived from (language, fqn, span_id)
    ///
    /// Generated via SHA-256 hash of language:fqn:span_id.
    /// This ID is deterministic and stable across runs for the same symbol.
    #[serde(default)]
    pub symbol_id: Option<String>,

    /// Fully-qualified name for this symbol
    /// Format varies by language (crate::module::Name for Rust, package.Class.Name for Java)
    /// This is the primary key for symbol lookup, preventing name collisions
    #[serde(default)]
    pub fqn: Option<String>,

    /// Canonical fully-qualified name with file path for unambiguous identity
    /// Format: crate_name::file_path::kind symbol_name
    /// Example: my_crate::src/lib.rs::Function my_function
    #[serde(default)]
    pub canonical_fqn: Option<String>,

    /// Display fully-qualified name for human-readable output
    /// Shortened form without file path when possible
    /// Example: my_crate::my_function
    #[serde(default)]
    pub display_fqn: Option<String>,

    /// Simple symbol name (display name)
    /// For user-facing output only. Not used as a unique identifier.
    pub name: Option<String>,
    pub kind: String,
    #[serde(default)]
    pub kind_normalized: Option<String>,
    pub byte_start: usize,
    pub byte_end: usize,
    pub start_line: usize,
    pub start_col: usize,
    pub end_line: usize,
    pub end_col: usize,
}

/// Reference node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceNode {
    pub file: String,
    pub byte_start: u64,
    pub byte_end: u64,
    pub start_line: u64,
    pub start_col: u64,
    pub end_line: u64,
    pub end_col: u64,
}

/// Cross-file reference entry for efficient lookup
///
/// Stored separately from ReferenceNode to enable efficient cross-file
/// reference queries without traversing the entire graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileRef {
    /// Symbol ID of the referencing symbol (source)
    pub from_symbol_id: String,
    /// Symbol ID of the referenced symbol (target)
    pub to_symbol_id: String,
    /// File path where the reference occurs
    pub file_path: String,
    /// Line number where the reference occurs (1-indexed)
    pub line_number: usize,
    /// Byte start position
    pub byte_start: usize,
    /// Byte end position
    pub byte_end: usize,
}

/// Call node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CallNode {
    pub file: String,
    pub caller: String,
    pub callee: String,
    /// Stable symbol ID of the caller
    #[serde(default)]
    pub caller_symbol_id: Option<String>,
    /// Stable symbol ID of the callee
    #[serde(default)]
    pub callee_symbol_id: Option<String>,
    pub byte_start: u64,
    pub byte_end: u64,
    pub start_line: u64,
    pub start_col: u64,
    pub end_line: u64,
    pub end_col: u64,
}

/// A single edge endpoint row for orphan detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EdgeEndpoints {
    pub from_id: i64,
    pub to_id: i64,
}

/// Control Flow Basic Block node payload stored in database
///
/// Represents a single basic block in a function's control flow graph.
/// Basic blocks are sequences of statements with a single entry point
/// and single exit point (via terminator).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CfgBlock {
    /// Hash of the MIR/AST for cache invalidation
    pub cfg_hash: Option<String>,

    /// High-fidelity statements (MIR instructions or AST snippets)
    pub statements: Option<Vec<String>>,
    /// Function symbol ID this block belongs to
    pub function_id: i64,

    /// Block kind (entry, conditional, loop, match, return, etc.)
    pub kind: String,

    /// Terminator kind (how control exits this block)
    pub terminator: String,

    /// Byte offset where block starts
    pub byte_start: u64,

    /// Byte offset where block ends
    pub byte_end: u64,

    /// Line where block starts (1-indexed)
    pub start_line: u64,

    /// Column where block starts (0-indexed)
    pub start_col: u64,

    /// Line where block ends (1-indexed)
    pub end_line: u64,

    /// Column where block ends (0-indexed)
    pub end_col: u64,

    // --- 4D Spatial-Temporal Coordinates (Schema v10+) ---
    /// X coordinate: Dominator depth (structural hierarchy depth)
    /// 0 = entry block, increases with nesting depth
    #[serde(default)]
    pub coord_x: i64,

    /// Y coordinate: Loop nesting level (iterative complexity)
    /// 0 = no loops, increases with nested loop depth
    #[serde(default)]
    pub coord_y: i64,

    /// Z coordinate: Branch count (decision density)
    /// Number of branch decisions from entry to this block
    #[serde(default)]
    pub coord_z: i64,

    /// T coordinate: Time/version (git commit or trace timestamp)
    /// None for current version, Some(commit_hash) for historical queries
    #[serde(default)]
    pub coord_t: Option<String>,
}

/// Control Flow Edge payload stored in graph_edges table
///
/// Represents edges between basic blocks in a function's CFG.
/// Uses edge_type = "CFG_BLOCK" for graph_edges entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CfgEdge {
    /// Source block ID
    pub from_block_id: i64,

    /// Target block ID
    pub to_block_id: i64,

    /// Edge kind (unconditional, conditional_true, conditional_false, etc.)
    pub kind: String,
}

/// Import node payload stored in sqlitegraph
///
/// Represents an import/use/from statement in source code.
/// Stores metadata about what was imported and where.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportNode {
    /// File path containing this import
    pub file: String,
    /// Kind of import statement (use_crate, use_super, use_self, etc.)
    pub import_kind: String,
    /// Full import path as components (e.g., ["crate", "foo", "bar"] for crate::foo::bar)
    pub import_path: Vec<String>,
    /// Specific names imported (e.g., ["HashMap", "HashSet"] for use std::collections::* with specific items)
    pub imported_names: Vec<String>,
    /// Whether this is a glob import (e.g., use foo::* or from foo import *)
    pub is_glob: bool,
    /// Byte offset where import starts
    pub byte_start: u64,
    /// Byte offset where import ends
    pub byte_end: u64,
    /// Line where import starts (1-indexed)
    pub start_line: u64,
    /// Column where import starts (0-indexed)
    pub start_col: u64,
    /// Line where import ends (1-indexed)
    pub end_line: u64,
    /// Column where import ends (0-indexed)
    pub end_col: u64,
}

/// Module path cache for efficient module resolution
///
/// Maps module paths (e.g., "crate::foo::bar") to file IDs.
/// Used by ModuleResolver to convert relative paths to concrete file IDs.
#[derive(Debug, Clone, Default)]
pub struct ModulePathCache {
    /// Cache mapping module path -> file_id
    cache: std::collections::HashMap<String, i64>,
}

impl ModulePathCache {
    /// Create a new empty cache
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert a module path -> file_id mapping
    pub fn insert(&mut self, module_path: String, file_id: i64) {
        self.cache.insert(module_path, file_id);
    }

    /// Get file_id for a module path
    pub fn get(&self, module_path: &str) -> Option<i64> {
        self.cache.get(module_path).copied()
    }

    /// Get the number of entries in the cache
    pub fn len(&self) -> usize {
        self.cache.len()
    }

    /// Check if cache is empty
    pub fn is_empty(&self) -> bool {
        self.cache.is_empty()
    }

    /// Clear the cache
    pub fn clear(&mut self) {
        self.cache.clear();
    }

    /// Build the cache from indexed files in the database
    ///
    /// Scans all File nodes and extracts module declarations
    /// to build path -> file_id mappings.
    /// Normalizes paths to match how resolve_path() will query them.
    pub fn build_from_index(backend: &std::sync::Arc<dyn sqlitegraph::GraphBackend>) -> Self {
        let mut cache = Self::new();

        let snapshot = sqlitegraph::SnapshotId::current();
        let entity_ids = match backend.entity_ids() {
            Ok(ids) => ids,
            Err(_) => return cache,
        };

        for entity_id in entity_ids {
            let node = match backend.get_node(snapshot, entity_id) {
                Ok(n) => n,
                Err(_) => continue,
            };

            if node.kind != "File" {
                continue;
            }

            // Get file_path from node.file_path or extract from FileNode data
            let file_path = node.file_path.clone().or_else(|| {
                // For V3 backend, file_path might not be set directly
                // Extract from FileNode data
                serde_json::from_value::<FileNode>(node.data.clone())
                    .ok()
                    .map(|file_node| file_node.path)
            });

            let file_path_str = file_path.as_deref().unwrap_or("");

            // Normalize path to match how resolve_path() will query (absolute paths)
            let normalized_path = crate::graph::files::normalize_path_for_index(file_path_str);

            // Build module path from file path
            // For Rust: src/foo/bar.rs -> crate::foo::bar
            // For Rust: src/foo/mod.rs -> crate::foo
            // For Rust: src/lib.rs -> crate
            if normalized_path.ends_with(".rs") {
                let module_path = Self::file_path_to_module_path(&normalized_path);
                cache.insert(module_path, entity_id);
            }
        }

        cache
    }

    /// Convert file path to module path (public for use by ModuleResolver)
    ///
    /// Examples:
    /// - "/home/user/project/src/lib.rs" -> "crate"
    /// - "/home/user/project/src/main.rs" -> "crate"
    /// - "/home/user/project/src/foo.rs" -> "crate::foo"
    /// - "/home/user/project/src/foo/mod.rs" -> "crate::foo"
    /// - "/home/user/project/src/foo/bar.rs" -> "crate::foo::bar"
    /// - "src/lib.rs" -> "crate"
    /// - "src/foo.rs" -> "crate::foo"
    pub fn file_path_to_module_path(file_path: &str) -> String {
        // Find the "src/" component and extract everything from there
        // This handles both absolute paths (/home/user/project/src/lib.rs) and relative (src/lib.rs)
        let src_path = file_path
            .rsplit_once("/src/")
            .or_else(|| file_path.rsplit_once("\\src\\"))
            .map(|(_, rest)| format!("src/{}", rest))
            .or_else(|| {
                // Maybe it starts with src/
                if file_path.starts_with("src/") {
                    Some(file_path.to_string())
                } else {
                    None
                }
            })
            .unwrap_or_else(|| file_path.to_string());

        // Remove leading "src/" or trailing ".rs"
        let path = src_path.strip_prefix("src/").unwrap_or(&src_path);
        let path = path.strip_suffix(".rs").unwrap_or(path);

        // Split by "/"
        let parts: Vec<&str> = path.split('/').collect();

        // Filter out "mod" parts (from mod.rs files) but only if they're standalone
        // "foo/mod" -> remove "mod" -> "foo"
        // "foo/bar/mod" -> remove "mod" -> "foo/bar"
        let module_parts: Vec<&str> = parts
            .iter()
            .filter(|&&part| part != "mod")
            .copied()
            .collect();

        // If we have lib.rs or main.rs, this is the crate root
        if module_parts.len() == 1 && (module_parts[0] == "lib" || module_parts[0] == "main") {
            return "crate".to_string();
        }

        // Build module path with crate:: prefix
        let module_path = module_parts.join("::");
        format!("crate::{}", module_path)
    }
}

/// Delete edges whose from_id OR to_id is in the provided set of entity IDs.
///
/// Determinism: IDs must be pre-sorted by caller.
pub fn delete_edges_touching_entities(
    conn: &rusqlite::Connection,
    entity_ids_sorted: &[i64],
) -> Result<usize> {
    use rusqlite::params_from_iter;

    if entity_ids_sorted.is_empty() {
        return Ok(0);
    }

    // Build placeholders for IN list.
    let placeholders = std::iter::repeat_n("?", entity_ids_sorted.len())
        .collect::<Vec<_>>()
        .join(", ");

    let sql = format!(
        "DELETE FROM graph_edges WHERE from_id IN ({}) OR to_id IN ({})",
        placeholders, placeholders
    );

    // Params are duplicated (for from_id and to_id IN lists).
    let params = entity_ids_sorted
        .iter()
        .chain(entity_ids_sorted.iter())
        .copied();

    let affected = conn
        .execute(&sql, params_from_iter(params))
        .map_err(|e| anyhow::anyhow!("Failed to delete edges touching entities: {}", e))?;

    Ok(affected)
}

/// Metadata about a lazy-built geometric index (.geo file)
///
/// Stored in the `geo_index_meta` table to track when the .geo file
/// was last built from the SQLite database and whether it needs rebuilding.
#[derive(Debug, Clone)]
pub struct GeoIndexMeta {
    pub geo_path: String,
    pub built_at: i64,
    pub schema_version: i64,
    pub symbol_count: i64,
    pub call_count: i64,
    pub cfg_block_count: i64,
    pub checksum: String,
}

impl GeoIndexMeta {
    /// Record that a .geo index was built, overwriting any previous record.
    pub fn record_geo_index_built(
        conn: &rusqlite::Connection,
        geo_path: &str,
        symbol_count: i64,
        call_count: i64,
        cfg_block_count: i64,
        checksum: &str,
    ) -> Result<()> {
        let schema_version = Self::current_schema_version();
        let built_at = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)?
            .as_secs() as i64;

        conn.execute(
            "INSERT INTO geo_index_meta (id, geo_path, built_at, schema_version, symbol_count, call_count, cfg_block_count, checksum)
             VALUES (1, ?1, ?2, ?3, ?4, ?5, ?6, ?7)
             ON CONFLICT(id) DO UPDATE SET
               geo_path = excluded.geo_path,
               built_at = excluded.built_at,
               schema_version = excluded.schema_version,
               symbol_count = excluded.symbol_count,
               call_count = excluded.call_count,
               cfg_block_count = excluded.cfg_block_count,
               checksum = excluded.checksum",
            rusqlite::params![
                geo_path,
                built_at,
                schema_version,
                symbol_count,
                call_count,
                cfg_block_count,
                checksum
            ],
        )?;
        Ok(())
    }

    /// Get the last recorded geo index metadata.
    pub fn get_geo_index_meta(conn: &rusqlite::Connection) -> Result<Option<GeoIndexMeta>> {
        let mut stmt = conn.prepare(
            "SELECT geo_path, built_at, schema_version, symbol_count, call_count, cfg_block_count, checksum
             FROM geo_index_meta WHERE id = 1"
        )?;
        let result = stmt.query_row([], |row| {
            Ok(GeoIndexMeta {
                geo_path: row.get(0)?,
                built_at: row.get(1)?,
                schema_version: row.get(2)?,
                symbol_count: row.get(3)?,
                call_count: row.get(4)?,
                cfg_block_count: row.get(5)?,
                checksum: row.get(6)?,
            })
        });
        match result {
            Ok(meta) => Ok(Some(meta)),
            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
            Err(e) => Err(e.into()),
        }
    }

    /// Return the current Magellan schema version.
    pub fn current_schema_version() -> i64 {
        crate::migrate_cmd::MAGELLAN_SCHEMA_VERSION
    }
}