magellan 3.2.0

Deterministic codebase mapping tool for local development
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
//! Graph schema definitions for Magellan
//!
//! Defines node and edge payloads for sqlitegraph persistence.

use serde::{Deserialize, Serialize};

use anyhow::Result;

/// File node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileNode {
    pub path: String,
    pub hash: String,
    /// Unix timestamp (seconds since epoch) when this file was last indexed
    pub last_indexed_at: i64,
    /// Unix timestamp (seconds since epoch) of filesystem mtime when indexed
    pub last_modified: i64,
}

/// Symbol node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolNode {
    /// Stable symbol ID derived from (language, fqn, span_id)
    ///
    /// Generated via SHA-256 hash of language:fqn:span_id.
    /// This ID is deterministic and stable across runs for the same symbol.
    #[serde(default)]
    pub symbol_id: Option<String>,

    /// Fully-qualified name for this symbol
    /// Format varies by language (crate::module::Name for Rust, package.Class.Name for Java)
    /// This is the primary key for symbol lookup, preventing name collisions
    #[serde(default)]
    pub fqn: Option<String>,

    /// Canonical fully-qualified name with file path for unambiguous identity
    /// Format: crate_name::file_path::kind symbol_name
    /// Example: my_crate::src/lib.rs::Function my_function
    #[serde(default)]
    pub canonical_fqn: Option<String>,

    /// Display fully-qualified name for human-readable output
    /// Shortened form without file path when possible
    /// Example: my_crate::my_function
    #[serde(default)]
    pub display_fqn: Option<String>,

    /// Simple symbol name (display name)
    /// For user-facing output only. Not used as a unique identifier.
    pub name: Option<String>,
    pub kind: String,
    #[serde(default)]
    pub kind_normalized: Option<String>,
    pub byte_start: usize,
    pub byte_end: usize,
    pub start_line: usize,
    pub start_col: usize,
    pub end_line: usize,
    pub end_col: usize,
}

/// Reference node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceNode {
    pub file: String,
    pub byte_start: u64,
    pub byte_end: u64,
    pub start_line: u64,
    pub start_col: u64,
    pub end_line: u64,
    pub end_col: u64,
}

/// Cross-file reference entry for efficient lookup
///
/// Stored separately from ReferenceNode to enable efficient cross-file
/// reference queries without traversing the entire graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileRef {
    /// Symbol ID of the referencing symbol (source)
    pub from_symbol_id: String,
    /// Symbol ID of the referenced symbol (target)
    pub to_symbol_id: String,
    /// File path where the reference occurs
    pub file_path: String,
    /// Line number where the reference occurs (1-indexed)
    pub line_number: usize,
    /// Byte start position
    pub byte_start: usize,
    /// Byte end position
    pub byte_end: usize,
}

/// Call node payload stored in sqlitegraph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CallNode {
    pub file: String,
    pub caller: String,
    pub callee: String,
    /// Stable symbol ID of the caller
    #[serde(default)]
    pub caller_symbol_id: Option<String>,
    /// Stable symbol ID of the callee
    #[serde(default)]
    pub callee_symbol_id: Option<String>,
    pub byte_start: u64,
    pub byte_end: u64,
    pub start_line: u64,
    pub start_col: u64,
    pub end_line: u64,
    pub end_col: u64,
}

/// A single edge endpoint row for orphan detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EdgeEndpoints {
    pub from_id: i64,
    pub to_id: i64,
}

/// Control Flow Basic Block node payload stored in database
///
/// Represents a single basic block in a function's control flow graph.
/// Basic blocks are sequences of statements with a single entry point
/// and single exit point (via terminator).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CfgBlock {
    /// Hash of the MIR/AST for cache invalidation
    pub cfg_hash: Option<String>,

    /// High-fidelity statements (MIR instructions or AST snippets)
    pub statements: Option<Vec<String>>,
    /// Function symbol ID this block belongs to
    pub function_id: i64,

    /// Block kind (entry, conditional, loop, match, return, etc.)
    pub kind: String,

    /// Terminator kind (how control exits this block)
    pub terminator: String,

    /// Byte offset where block starts
    pub byte_start: u64,

    /// Byte offset where block ends
    pub byte_end: u64,

    /// Line where block starts (1-indexed)
    pub start_line: u64,

    /// Column where block starts (0-indexed)
    pub start_col: u64,

    /// Line where block ends (1-indexed)
    pub end_line: u64,

    /// Column where block ends (0-indexed)
    pub end_col: u64,

    // --- 4D Spatial-Temporal Coordinates (Schema v10+) ---
    /// X coordinate: Dominator depth (structural hierarchy depth)
    /// 0 = entry block, increases with nesting depth
    #[serde(default)]
    pub coord_x: i64,

    /// Y coordinate: Loop nesting level (iterative complexity)
    /// 0 = no loops, increases with nested loop depth
    #[serde(default)]
    pub coord_y: i64,

    /// Z coordinate: Branch count (decision density)
    /// Number of branch decisions from entry to this block
    #[serde(default)]
    pub coord_z: i64,

    /// T coordinate: Time/version (git commit or trace timestamp)
    /// None for current version, Some(commit_hash) for historical queries
    #[serde(default)]
    pub coord_t: Option<String>,
}

/// Control Flow Edge payload stored in graph_edges table
///
/// Represents edges between basic blocks in a function's CFG.
/// Uses edge_type = "CFG_BLOCK" for graph_edges entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CfgEdge {
    /// Source block ID
    pub from_block_id: i64,

    /// Target block ID
    pub to_block_id: i64,

    /// Edge kind (unconditional, conditional_true, conditional_false, etc.)
    pub kind: String,
}

/// Import node payload stored in sqlitegraph
///
/// Represents an import/use/from statement in source code.
/// Stores metadata about what was imported and where.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportNode {
    /// File path containing this import
    pub file: String,
    /// Kind of import statement (use_crate, use_super, use_self, etc.)
    pub import_kind: String,
    /// Full import path as components (e.g., ["crate", "foo", "bar"] for crate::foo::bar)
    pub import_path: Vec<String>,
    /// Specific names imported (e.g., ["HashMap", "HashSet"] for use std::collections::* with specific items)
    pub imported_names: Vec<String>,
    /// Whether this is a glob import (e.g., use foo::* or from foo import *)
    pub is_glob: bool,
    /// Byte offset where import starts
    pub byte_start: u64,
    /// Byte offset where import ends
    pub byte_end: u64,
    /// Line where import starts (1-indexed)
    pub start_line: u64,
    /// Column where import starts (0-indexed)
    pub start_col: u64,
    /// Line where import ends (1-indexed)
    pub end_line: u64,
    /// Column where import ends (0-indexed)
    pub end_col: u64,
}

/// Module path cache for efficient module resolution
///
/// Maps module paths (e.g., "crate::foo::bar") to file IDs.
/// Used by ModuleResolver to convert relative paths to concrete file IDs.
#[derive(Debug, Clone, Default)]
pub struct ModulePathCache {
    /// Cache mapping module path -> file_id
    cache: std::collections::HashMap<String, i64>,
}

impl ModulePathCache {
    /// Create a new empty cache
    pub fn new() -> Self {
        Self::default()
    }

    /// Insert a module path -> file_id mapping
    pub fn insert(&mut self, module_path: String, file_id: i64) {
        self.cache.insert(module_path, file_id);
    }

    /// Get file_id for a module path
    pub fn get(&self, module_path: &str) -> Option<i64> {
        self.cache.get(module_path).copied()
    }

    /// Get the number of entries in the cache
    pub fn len(&self) -> usize {
        self.cache.len()
    }

    /// Check if cache is empty
    pub fn is_empty(&self) -> bool {
        self.cache.is_empty()
    }

    /// Clear the cache
    pub fn clear(&mut self) {
        self.cache.clear();
    }

    /// Build the cache from indexed files in the database
    ///
    /// Scans all File nodes and extracts module declarations
    /// to build path -> file_id mappings.
    /// Normalizes paths to match how resolve_path() will query them.
    pub fn build_from_index(backend: &std::sync::Arc<dyn sqlitegraph::GraphBackend>) -> Self {
        let mut cache = Self::new();

        let snapshot = sqlitegraph::SnapshotId::current();
        let entity_ids = match backend.entity_ids() {
            Ok(ids) => ids,
            Err(_) => return cache,
        };

        for entity_id in entity_ids {
            let node = match backend.get_node(snapshot, entity_id) {
                Ok(n) => n,
                Err(_) => continue,
            };

            if node.kind != "File" {
                continue;
            }

            // Get file_path from node.file_path or extract from FileNode data
            let file_path = node.file_path.clone().or_else(|| {
                // For V3 backend, file_path might not be set directly
                // Extract from FileNode data
                serde_json::from_value::<FileNode>(node.data.clone())
                    .ok()
                    .map(|file_node| file_node.path)
            });

            let file_path_str = file_path.as_deref().unwrap_or("");

            // Normalize path to match how resolve_path() will query (absolute paths)
            let normalized_path = crate::graph::files::normalize_path_for_index(file_path_str);

            // Build module path from file path
            // For Rust: src/foo/bar.rs -> crate::foo::bar
            // For Rust: src/foo/mod.rs -> crate::foo
            // For Rust: src/lib.rs -> crate
            if normalized_path.ends_with(".rs") {
                let module_path = Self::file_path_to_module_path(&normalized_path);
                cache.insert(module_path, entity_id);
            }
        }

        cache
    }

    /// Convert file path to module path (public for use by ModuleResolver)
    ///
    /// Examples:
    /// - "/home/user/project/src/lib.rs" -> "crate"
    /// - "/home/user/project/src/main.rs" -> "crate"
    /// - "/home/user/project/src/foo.rs" -> "crate::foo"
    /// - "/home/user/project/src/foo/mod.rs" -> "crate::foo"
    /// - "/home/user/project/src/foo/bar.rs" -> "crate::foo::bar"
    /// - "src/lib.rs" -> "crate"
    /// - "src/foo.rs" -> "crate::foo"
    pub fn file_path_to_module_path(file_path: &str) -> String {
        // Find the "src/" component and extract everything from there
        // This handles both absolute paths (/home/user/project/src/lib.rs) and relative (src/lib.rs)
        let src_path = file_path
            .rsplit_once("/src/")
            .or_else(|| file_path.rsplit_once("\\src\\"))
            .map(|(_, rest)| format!("src/{}", rest))
            .or_else(|| {
                // Maybe it starts with src/
                if file_path.starts_with("src/") {
                    Some(file_path.to_string())
                } else {
                    None
                }
            })
            .unwrap_or_else(|| file_path.to_string());

        // Remove leading "src/" or trailing ".rs"
        let path = src_path.strip_prefix("src/").unwrap_or(&src_path);
        let path = path.strip_suffix(".rs").unwrap_or(path);

        // Split by "/"
        let parts: Vec<&str> = path.split('/').collect();

        // Filter out "mod" parts (from mod.rs files) but only if they're standalone
        // "foo/mod" -> remove "mod" -> "foo"
        // "foo/bar/mod" -> remove "mod" -> "foo/bar"
        let module_parts: Vec<&str> = parts
            .iter()
            .filter(|&&part| part != "mod")
            .copied()
            .collect();

        // If we have lib.rs or main.rs, this is the crate root
        if module_parts.len() == 1 && (module_parts[0] == "lib" || module_parts[0] == "main") {
            return "crate".to_string();
        }

        // Build module path with crate:: prefix
        let module_path = module_parts.join("::");
        format!("crate::{}", module_path)
    }
}

/// Delete edges whose from_id OR to_id is in the provided set of entity IDs.
///
/// Determinism: IDs must be pre-sorted by caller.
pub fn delete_edges_touching_entities(
    conn: &rusqlite::Connection,
    entity_ids_sorted: &[i64],
) -> Result<usize> {
    use rusqlite::params_from_iter;

    if entity_ids_sorted.is_empty() {
        return Ok(0);
    }

    // Build placeholders for IN list.
    let placeholders = std::iter::repeat_n("?", entity_ids_sorted.len())
        .collect::<Vec<_>>()
        .join(", ");

    let sql = format!(
        "DELETE FROM graph_edges WHERE from_id IN ({}) OR to_id IN ({})",
        placeholders, placeholders
    );

    // Params are duplicated (for from_id and to_id IN lists).
    let params = entity_ids_sorted
        .iter()
        .chain(entity_ids_sorted.iter())
        .copied();

    let affected = conn
        .execute(&sql, params_from_iter(params))
        .map_err(|e| anyhow::anyhow!("Failed to delete edges touching entities: {}", e))?;

    Ok(affected)
}

/// Metadata about a lazy-built geometric index (.geo file)
///
/// Stored in the `geo_index_meta` table to track when the .geo file
/// was last built from the SQLite database and whether it needs rebuilding.
#[derive(Debug, Clone)]
pub struct GeoIndexMeta {
    pub geo_path: String,
    pub built_at: i64,
    pub schema_version: i64,
    pub symbol_count: i64,
    pub call_count: i64,
    pub cfg_block_count: i64,
    pub checksum: String,
}

impl GeoIndexMeta {
    /// Record that a .geo index was built, overwriting any previous record.
    pub fn record_geo_index_built(
        conn: &rusqlite::Connection,
        geo_path: &str,
        symbol_count: i64,
        call_count: i64,
        cfg_block_count: i64,
        checksum: &str,
    ) -> Result<()> {
        let schema_version = Self::current_schema_version();
        let built_at = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)?
            .as_secs() as i64;

        conn.execute(
            "INSERT INTO geo_index_meta (id, geo_path, built_at, schema_version, symbol_count, call_count, cfg_block_count, checksum)
             VALUES (1, ?1, ?2, ?3, ?4, ?5, ?6, ?7)
             ON CONFLICT(id) DO UPDATE SET
               geo_path = excluded.geo_path,
               built_at = excluded.built_at,
               schema_version = excluded.schema_version,
               symbol_count = excluded.symbol_count,
               call_count = excluded.call_count,
               cfg_block_count = excluded.cfg_block_count,
               checksum = excluded.checksum",
            rusqlite::params![
                geo_path,
                built_at,
                schema_version,
                symbol_count,
                call_count,
                cfg_block_count,
                checksum
            ],
        )?;
        Ok(())
    }

    /// Get the last recorded geo index metadata.
    pub fn get_geo_index_meta(conn: &rusqlite::Connection) -> Result<Option<GeoIndexMeta>> {
        let mut stmt = conn.prepare(
            "SELECT geo_path, built_at, schema_version, symbol_count, call_count, cfg_block_count, checksum
             FROM geo_index_meta WHERE id = 1"
        )?;
        let result = stmt.query_row([], |row| {
            Ok(GeoIndexMeta {
                geo_path: row.get(0)?,
                built_at: row.get(1)?,
                schema_version: row.get(2)?,
                symbol_count: row.get(3)?,
                call_count: row.get(4)?,
                cfg_block_count: row.get(5)?,
                checksum: row.get(6)?,
            })
        });
        match result {
            Ok(meta) => Ok(Some(meta)),
            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
            Err(e) => Err(e.into()),
        }
    }

    /// Return the current Magellan schema version.
    pub fn current_schema_version() -> i64 {
        crate::migrate_cmd::MAGELLAN_SCHEMA_VERSION
    }
}