Skip to main content

llmgrep/
algorithm.rs

1//! Algorithm integration module for Magellan 2.0 graph algorithms.
2//!
3//! This module provides integration with Magellan's executable graph algorithms
4//! through a shell-out pattern. It supports:
5//!
6//! - Loading pre-computed SymbolSet files from JSON
7//! - Running Magellan algorithms (reachable, dead-code, slice, cycles) and extracting SymbolIds
8//! - Resolving simple symbol names to SymbolIds via `magellan find`
9//!
10//! The SymbolSet type is the core abstraction for algorithm-based filtering.
11//! It represents a collection of SymbolIds (32-char BLAKE3 hashes) that can be
12//! used to filter search results.
13
14use crate::error::LlmError;
15use serde::{Deserialize, Serialize};
16use serde_json::Value;
17use std::collections::HashMap;
18use std::path::Path;
19use std::process::Command;
20use rusqlite::Connection;
21
22/// SymbolSet - a collection of SymbolIds for filtering search results.
23///
24/// This type represents a set of symbols identified by their 32-character
25/// BLAKE3 hash SymbolIds. It can be loaded from JSON files or extracted
26/// from Magellan algorithm outputs.
27///
28/// # JSON Format
29///
30/// ```json
31/// {
32///   "symbol_ids": [
33///     "abc123def456789012345678901234ab",
34///     "def456789012345678901234abcd1234",
35///     ...
36///   ]
37/// }
38/// ```
39///
40/// # Example
41///
42/// ```no_run
43/// use llmgrep::algorithm::SymbolSet;
44/// use std::path::Path;
45///
46/// let symbol_set = SymbolSet::from_file(Path::new("symbols.json"))?;
47/// symbol_set.validate()?;
48/// # Ok::<(), llmgrep::error::LlmError>(())
49/// ```
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct SymbolSet {
52    /// List of SymbolIds (32-char BLAKE3 hashes)
53    pub symbol_ids: Vec<String>,
54}
55
56impl SymbolSet {
57    /// Load SymbolSet from a JSON file.
58    ///
59    /// # Arguments
60    ///
61    /// * `path` - Path to the JSON file containing SymbolSet data
62    ///
63    /// # Returns
64    ///
65    /// A validated SymbolSet if the file exists and contains valid JSON.
66    ///
67    /// # Errors
68    ///
69    /// Returns `LlmError::IoError` if the file cannot be read.
70    /// Returns `LlmError::JsonError` if the JSON is invalid or malformed.
71    ///
72    /// # Example
73    ///
74    /// ```no_run
75    /// use llmgrep::algorithm::SymbolSet;
76    /// use std::path::Path;
77    ///
78    /// let symbol_set = SymbolSet::from_file(Path::new("reachable.json"))?;
79    /// # Ok::<(), llmgrep::error::LlmError>(())
80    /// ```
81    pub fn from_file(path: &Path) -> Result<Self, LlmError> {
82        let content = std::fs::read_to_string(path).map_err(LlmError::IoError)?;
83        serde_json::from_str(&content).map_err(LlmError::JsonError)
84    }
85
86    /// Validate that all SymbolIds are in the correct format (32 hex characters).
87    ///
88    /// Magellan SymbolIds are 32-character BLAKE3 hashes represented as lowercase
89    /// hexadecimal strings. This method validates that format.
90    ///
91    /// # Returns
92    ///
93    /// `Ok(())` if all SymbolIds are valid.
94    ///
95    /// # Errors
96    ///
97    /// Returns `LlmError::InvalidQuery` if any SymbolId has an invalid format.
98    ///
99    /// # Example
100    ///
101    /// ```
102    /// use llmgrep::algorithm::SymbolSet;
103    ///
104    /// let symbol_set = SymbolSet {
105    ///     symbol_ids: vec![
106    ///         "abc123def456789012345678901234ab".to_string(),
107    ///     ],
108    /// };
109    /// assert!(symbol_set.validate().is_ok());
110    /// ```
111    pub fn validate(&self) -> Result<(), LlmError> {
112        for symbol_id in &self.symbol_ids {
113            if symbol_id.len() != 32 {
114                return Err(LlmError::InvalidQuery {
115                    query: format!(
116                        "Invalid SymbolId format: '{}'. Expected 32 hex characters.",
117                        symbol_id
118                    ),
119                });
120            }
121            if !symbol_id.chars().all(|c| c.is_ascii_hexdigit()) {
122                return Err(LlmError::InvalidQuery {
123                    query: format!(
124                        "Invalid SymbolId format: '{}'. Expected only hexadecimal characters.",
125                        symbol_id
126                    ),
127                });
128            }
129        }
130        Ok(())
131    }
132
133    /// Check if the SymbolSet is empty.
134    pub fn is_empty(&self) -> bool {
135        self.symbol_ids.is_empty()
136    }
137
138    /// Return the number of SymbolIds in the set.
139    pub fn len(&self) -> usize {
140        self.symbol_ids.len()
141    }
142}
143
144/// Check if Magellan CLI is available and at the correct version.
145///
146/// This function implements the fail-fast approach: check availability
147/// before attempting any algorithm shell-out. Returns Ok(()) if Magellan
148/// is available and version is acceptable, otherwise returns an error
149/// with helpful installation instructions.
150///
151/// # Errors
152///
153/// Returns `LlmError::MagellanNotFound` if magellan CLI is not in PATH.
154/// Returns `LlmError::MagellanVersionMismatch` if version is less than 2.1.0.
155///
156/// # Example
157///
158/// ```no_run
159/// use llmgrep::algorithm::check_magellan_available;
160///
161/// check_magellan_available()?;
162/// # Ok::<(), llmgrep::error::LlmError>(())
163/// ```
164pub fn check_magellan_available() -> Result<(), LlmError> {
165    check_magellan_version()
166}
167
168/// Check Magellan version is 2.1.0 or higher.
169///
170/// Parses `magellan --version` output and enforces minimum version.
171/// Magellan output format: "magellan VERSION (DATE) rustc RUSTC_VERSION"
172fn check_magellan_version() -> Result<(), LlmError> {
173    use std::thread;
174    use std::time::Duration;
175
176    let child = Command::new("magellan")
177        .arg("--version")
178        .spawn();
179
180    let output = match child {
181        Ok(mut child) => {
182            let timeout = Duration::from_secs(5);
183            let start = std::time::Instant::now();
184
185            loop {
186                if let Ok(status) = child.try_wait() {
187                    match status {
188                        Some(_status) => {
189                            // Process has exited, get output
190                            break child.wait_with_output();
191                        }
192                        None => {
193                            // Still running, check timeout
194                            if start.elapsed() >= timeout {
195                                child.kill().ok();
196                                return Err(LlmError::MagellanExecutionFailed {
197                                    algorithm: "version check".to_string(),
198                                    stderr: "Version check timed out after 5 seconds".to_string(),
199                                });
200                            }
201                            thread::sleep(Duration::from_millis(100));
202                        }
203                    }
204                }
205            }
206        }
207        Err(e) => return Err(match e.kind() {
208            std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
209            _ => LlmError::MagellanExecutionFailed {
210                algorithm: "version check".to_string(),
211                stderr: e.to_string(),
212            },
213        }),
214    };
215
216    let output = output.map_err(|e| match e.kind() {
217        std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
218        _ => LlmError::MagellanExecutionFailed {
219            algorithm: "version check".to_string(),
220            stderr: e.to_string(),
221        },
222    })?;
223
224    if !output.status.success() {
225        return Err(LlmError::MagellanExecutionFailed {
226            algorithm: "version check".to_string(),
227            stderr: String::from_utf8_lossy(&output.stderr).to_string(),
228        });
229    }
230
231    // Parse version from output: "magellan 2.1.0 (2024-01-15) rustc 1.75.0"
232    let version_str = String::from_utf8_lossy(&output.stdout);
233    let version = parse_magellan_version(&version_str)?;
234
235    // Require 2.1.0 or higher
236    if version < (2, 1, 0) {
237        return Err(LlmError::MagellanVersionMismatch {
238            current: format!("{}.{}.{}", version.0, version.1, version.2),
239            required: "2.1.0".to_string(),
240        });
241    }
242
243    Ok(())
244}
245
246/// Parse Magellan version string into (major, minor, patch) tuple.
247///
248/// Expected format: "magellan X.Y.Z (DATE) rustc VERSION"
249fn parse_magellan_version(output: &str) -> Result<(u32, u32, u32), LlmError> {
250    // Extract the version number after "magellan"
251    let first_line = output.lines().next().unwrap_or("");
252    let version_part = first_line
253        .strip_prefix("magellan")
254        .unwrap_or("")
255        .split_whitespace()
256        .next()
257        .unwrap_or("");
258
259    // Parse X.Y.Z format
260    let parts: Vec<&str> = version_part.split('.').collect();
261    if parts.len() != 3 {
262        return Err(LlmError::MagellanExecutionFailed {
263            algorithm: "version parsing".to_string(),
264            stderr: format!("Unable to parse version from: {}", first_line),
265        });
266    }
267
268    let major: u32 = parts[0].parse().map_err(|_| LlmError::MagellanExecutionFailed {
269        algorithm: "version parsing".to_string(),
270        stderr: format!("Invalid major version: {}", parts[0]),
271    })?;
272    let minor: u32 = parts[1].parse().map_err(|_| LlmError::MagellanExecutionFailed {
273        algorithm: "version parsing".to_string(),
274        stderr: format!("Invalid minor version: {}", parts[1]),
275    })?;
276    let patch: u32 = parts[2].parse().map_err(|_| LlmError::MagellanExecutionFailed {
277        algorithm: "version parsing".to_string(),
278        stderr: format!("Invalid patch version: {}", parts[2]),
279    })?;
280
281    Ok((major, minor, patch))
282}
283
284/// Run a Magellan algorithm and extract SymbolIds from its JSON output.
285///
286/// This function shells out to the Magellan CLI, executes the specified algorithm,
287/// and parses the JSON output to extract SymbolIds. The extraction logic handles
288/// the different JSON structures returned by each algorithm.
289///
290/// # Arguments
291///
292/// * `db_path` - Path to the Magellan code graph database
293/// * `algorithm` - Algorithm name: "reachable", "dead-code", "slice", "cycles"
294/// * `args` - Algorithm-specific arguments (e.g., ["--symbol-id", "abc123..."])
295///
296/// # Returns
297///
298/// A `SymbolSet` containing the SymbolIds extracted from the algorithm output.
299///
300/// # Errors
301///
302/// Returns `LlmError::SearchFailed` if:
303/// - Magellan CLI is not found in PATH
304/// - The algorithm execution fails (non-zero exit code)
305/// - The JSON output cannot be parsed
306///
307/// # Supported Algorithms
308///
309/// - **reachable**: Extracts from `result.symbols[].symbol_id`
310/// - **dead-code**: Extracts from `result.dead_symbols[].symbol.symbol_id`
311/// - **slice**: Extracts from `result.included_symbols[].symbol_id`
312/// - **cycles**: Extracts from `result.cycles[].members[].symbol_id`
313///
314/// # Example
315///
316/// ```no_run
317/// use llmgrep::algorithm::run_magellan_algorithm;
318/// use std::path::Path;
319///
320/// let db_path = Path::new(".codemcp/codegraph.db");
321/// let symbol_set = run_magellan_algorithm(
322///     db_path,
323///     "reachable",
324///     &["--symbol-id", "abc123def456789012345678901234ab"],
325/// )?;
326/// # Ok::<(), llmgrep::error::LlmError>(())
327/// ```
328pub fn run_magellan_algorithm(
329    db_path: &Path,
330    algorithm: &str,
331    args: &[&str],
332) -> Result<SymbolSet, LlmError> {
333    // Check Magellan availability and version
334    check_magellan_available()?;
335
336    // Build magellan command
337    let mut cmd = Command::new("magellan");
338    cmd.arg(algorithm)
339        .arg("--db")
340        .arg(db_path)
341        .arg("--output")
342        .arg("json")
343        .args(args);
344
345    // Execute and capture output
346    let output = cmd.output().map_err(|e| match e.kind() {
347        std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
348        _ => LlmError::MagellanExecutionFailed {
349            algorithm: algorithm.to_string(),
350            stderr: e.to_string(),
351        },
352    })?;
353
354    if !output.status.success() {
355        let stderr = String::from_utf8_lossy(&output.stderr);
356        return Err(LlmError::MagellanExecutionFailed {
357            algorithm: algorithm.to_string(),
358            stderr: stderr.to_string(),
359        });
360    }
361
362    // Parse JSON response and extract SymbolIds
363    let json_str = String::from_utf8_lossy(&output.stdout);
364    extract_symbol_ids_from_magellan_json(&json_str, algorithm)
365}
366
367/// Extract SymbolIds from Magellan algorithm JSON output.
368///
369/// Each Magellan algorithm has a different JSON structure. This function
370/// parses the algorithm-specific format and extracts all SymbolIds into
371/// a unified SymbolSet.
372///
373/// # JSON Structures by Algorithm
374///
375/// - **reachable**: `{"result": {"symbols": [{"symbol_id": "..."}, ...]}}`
376/// - **dead-code**: `{"result": {"dead_symbols": [{"symbol": {"symbol_id": "..."}}, ...]}}`
377/// - **slice**: `{"result": {"included_symbols": [{"symbol_id": "..."}, ...]}}`
378/// - **cycles**: `{"result": {"cycles": [{"members": [{"symbol_id": "..."}, ...]}]}}`
379///
380/// # Arguments
381///
382/// * `json` - The raw JSON string from Magellan's stdout
383/// * `algorithm` - The algorithm name (determines which parser to use)
384///
385/// # Returns
386///
387/// A `SymbolSet` containing all extracted SymbolIds.
388fn extract_symbol_ids_from_magellan_json(
389    json: &str,
390    algorithm: &str,
391) -> Result<SymbolSet, LlmError> {
392    let parsed: Value = serde_json::from_str(json).map_err(LlmError::JsonError)?;
393
394    // Extract symbol_ids based on algorithm type
395    let symbol_ids = match algorithm {
396        "reachable" => {
397            let symbols = parsed["result"]["symbols"]
398                .as_array()
399                .ok_or_else(|| LlmError::InvalidQuery {
400                    query: "Missing 'symbols' array in reachable output".to_string(),
401                })?;
402            symbols
403                .iter()
404                .filter_map(|s| s["symbol_id"].as_str().map(|id| id.to_string()))
405                .collect()
406        }
407        "dead-code" => {
408            let dead_symbols = parsed["result"]["dead_symbols"]
409                .as_array()
410                .ok_or_else(|| LlmError::InvalidQuery {
411                    query: "Missing 'dead_symbols' array in dead-code output".to_string(),
412                })?;
413            dead_symbols
414                .iter()
415                .filter_map(|s| s["symbol"]["symbol_id"].as_str().map(|id| id.to_string()))
416                .collect()
417        }
418        "slice" => {
419            let included_symbols = parsed["result"]["included_symbols"]
420                .as_array()
421                .ok_or_else(|| LlmError::InvalidQuery {
422                    query: "Missing 'included_symbols' array in slice output".to_string(),
423                })?;
424            included_symbols
425                .iter()
426                .filter_map(|s| s["symbol_id"].as_str().map(|id| id.to_string()))
427                .collect()
428        }
429        "cycles" => {
430            let cycles = parsed["result"]["cycles"]
431                .as_array()
432                .ok_or_else(|| LlmError::InvalidQuery {
433                    query: "Missing 'cycles' array in cycles output".to_string(),
434                })?;
435            let mut ids = Vec::new();
436            for cycle in cycles {
437                if let Some(members) = cycle["members"].as_array() {
438                    for member in members {
439                        if let Some(id) = member["symbol_id"].as_str() {
440                            ids.push(id.to_string());
441                        }
442                    }
443                }
444            }
445            ids
446        }
447        _ => {
448            return Err(LlmError::InvalidQuery {
449                query: format!("Unknown algorithm: {}", algorithm),
450            });
451        }
452    };
453
454    Ok(SymbolSet { symbol_ids })
455}
456
457/// Parse magellan condense output and extract SCC membership.
458///
459/// This function parses the JSON output from `magellan condense` and extracts:
460/// - All symbol_ids from SCC members
461/// - A mapping of symbol_id -> supernode_id for output decoration
462///
463/// # Condense JSON Structure
464///
465/// ```json
466/// {
467///   "supernodes": [
468///     {
469///       "id": "supernode_0",
470///       "members": [
471///         {"symbol_id": "abc123def456789012345678901234ab"},
472///         {"symbol_id": "def456789012345678901234abcd1234"}
473///       ]
474///     },
475///     ...
476///   ]
477/// }
478/// ```
479///
480/// # Arguments
481///
482/// * `json` - The raw JSON string from Magellan's stdout
483///
484/// # Returns
485///
486/// A tuple of:
487/// - `Vec<String>` of all symbol_ids from all SCC members
488/// - `HashMap<String, String>` mapping symbol_id -> supernode_id
489///
490/// # Errors
491///
492/// Returns `LlmError::MagellanExecutionFailed` (LLM-E002) if JSON structure is invalid.
493/// Returns empty Vec/HashMap if supernodes array is empty (not an error).
494///
495/// # Example
496///
497/// ```no_run
498/// use llmgrep::algorithm::parse_condense_output;
499///
500/// let json = r#"{"supernodes": [{"id": "supernode_0", "members": [{"symbol_id": "abc123..."}]}]}"#;
501/// let (symbol_ids, supernode_map) = parse_condense_output(json)?;
502/// # Ok::<(), llmgrep::error::LlmError>(())
503/// ```
504pub fn parse_condense_output(json: &str) -> Result<(Vec<String>, std::collections::HashMap<String, String>), LlmError> {
505    use std::collections::HashMap;
506
507    let parsed: Value = serde_json::from_str(json).map_err(LlmError::JsonError)?;
508
509    // Handle magellan wrapper structure: {"data": {"supernodes": [...]}}
510    // Fall back to direct structure for backward compatibility
511    let supernodes = if parsed["data"]["supernodes"].is_array() {
512        parsed["data"]["supernodes"].as_array()
513    } else {
514        parsed["supernodes"].as_array()
515    }
516    .ok_or_else(|| LlmError::MagellanExecutionFailed {
517        algorithm: "condense".to_string(),
518        stderr: "Missing 'supernodes' array in condense output".to_string(),
519    })?;
520
521    let mut all_symbol_ids = Vec::new();
522    let mut supernode_map = HashMap::new();
523
524    // Extract members from each supernode
525    for supernode in supernodes {
526        // Handle both numeric and string IDs (magellan uses numeric)
527        let supernode_id = if let Some(id_str) = supernode["id"].as_str() {
528            id_str.to_string()
529        } else if let Some(id_num) = supernode["id"].as_u64() {
530            format!("supernode_{}", id_num)
531        } else if let Some(id_i64) = supernode["id"].as_i64() {
532            format!("supernode_{}", id_i64)
533        } else {
534            return Err(LlmError::MagellanExecutionFailed {
535                algorithm: "condense".to_string(),
536                stderr: "Supernode missing 'id' field".to_string(),
537            });
538        };
539
540        let members = supernode["members"]
541            .as_array()
542            .ok_or_else(|| LlmError::MagellanExecutionFailed {
543                algorithm: "condense".to_string(),
544                stderr: "Supernode missing 'members' array".to_string(),
545            })?;
546
547        for member in members {
548            let symbol_id = member["symbol_id"]
549                .as_str()
550                .ok_or_else(|| LlmError::MagellanExecutionFailed {
551                    algorithm: "condense".to_string(),
552                    stderr: "Member missing 'symbol_id' field".to_string(),
553                })?;
554
555            all_symbol_ids.push(symbol_id.to_string());
556            supernode_map.insert(symbol_id.to_string(), supernode_id.to_string());
557        }
558    }
559
560    Ok((all_symbol_ids, supernode_map))
561}
562
563/// Parse magellan paths output and extract execution path symbols.
564///
565/// This function parses the JSON output from `magellan paths` command and extracts
566/// all unique SymbolIds from the returned execution paths. The paths command
567/// enumerates all possible execution paths between symbols using bounded depth-first
568/// search.
569///
570/// # Arguments
571///
572/// * `json` - The raw JSON string from Magellan's stdout
573///
574/// # Returns
575///
576/// A tuple of:
577/// - `Vec<String>` of all unique symbol_ids from all paths
578/// - `bool` indicating if bounded enumeration hit limits (bounded_hit)
579///
580/// # Errors
581///
582/// Returns `LlmError::MagellanExecutionFailed` (LLM-E002) if JSON structure is invalid.
583/// Returns empty Vec and bounded_flag=false if paths array is empty (not an error).
584///
585/// # Example
586///
587/// ```no_run
588/// use llmgrep::algorithm::parse_paths_output;
589///
590/// let json = r#"{"paths": [{"symbols": [{"symbol_id": "abc123..."}], "length": 1}], "bounded_hit": false}"#;
591/// let (symbol_ids, bounded_hit) = parse_paths_output(json)?;
592/// # Ok::<(), llmgrep::error::LlmError>(())
593/// ```
594pub fn parse_paths_output(json: &str) -> Result<(Vec<String>, bool), LlmError> {
595    use std::collections::HashSet;
596
597    let parsed: Value = serde_json::from_str(json).map_err(LlmError::JsonError)?;
598
599    // Handle magellan wrapper structure: {"data": {"paths": [...]}}
600    // Fall back to direct structure for backward compatibility
601    let paths = if parsed["data"]["paths"].is_array() {
602        parsed["data"]["paths"].as_array()
603    } else {
604        parsed["paths"].as_array()
605    }
606    .ok_or_else(|| LlmError::MagellanExecutionFailed {
607        algorithm: "paths".to_string(),
608        stderr: "Missing 'paths' array in output".to_string(),
609    })?;
610
611    let mut all_symbol_ids = HashSet::new();
612
613    // Extract symbol_ids from all paths
614    for path in paths {
615        let symbols = path["symbols"]
616            .as_array()
617            .ok_or_else(|| LlmError::MagellanExecutionFailed {
618                algorithm: "paths".to_string(),
619                stderr: "Path missing 'symbols' array".to_string(),
620            })?;
621
622        for symbol in symbols {
623            let symbol_id = symbol["symbol_id"]
624                .as_str()
625                .ok_or_else(|| LlmError::MagellanExecutionFailed {
626                    algorithm: "paths".to_string(),
627                    stderr: "Symbol missing 'symbol_id' field".to_string(),
628                })?;
629
630            all_symbol_ids.insert(symbol_id.to_string());
631        }
632    }
633
634    // Extract bounded_hit flag from JSON (indicates if max-depth or max-paths was hit)
635    let bounded_hit = parsed["bounded_hit"]
636        .as_bool()
637        .or_else(|| parsed["data"]["bounded_hit"].as_bool())
638        .unwrap_or(false);
639
640    // Convert HashSet to Vec for return
641    Ok((all_symbol_ids.into_iter().collect(), bounded_hit))
642}
643
644/// Parse a SymbolSet from a JSON file and validate its format.
645///
646/// This is a convenience function that combines `SymbolSet::from_file`
647/// and `SymbolSet::validate` into a single call.
648///
649/// # Arguments
650///
651/// * `path` - Path to the SymbolSet JSON file
652///
653/// # Returns
654///
655/// A validated `SymbolSet`.
656///
657/// # Errors
658///
659/// Returns `LlmError::IoError` if the file cannot be read.
660/// Returns `LlmError::JsonError` if the JSON is invalid.
661/// Returns `LlmError::InvalidQuery` if SymbolId format is invalid.
662///
663/// # Example
664///
665/// ```no_run
666/// use llmgrep::algorithm::parse_symbol_set_file;
667/// use std::path::Path;
668///
669/// let symbol_set = parse_symbol_set_file(Path::new("symbols.json"))?;
670/// # Ok::<(), llmgrep::error::LlmError>(())
671/// ```
672pub fn parse_symbol_set_file(path: &Path) -> Result<SymbolSet, LlmError> {
673    let symbol_set = SymbolSet::from_file(path)?;
674    symbol_set.validate()?;
675    Ok(symbol_set)
676}
677
678/// Algorithm-based filtering options for search
679#[derive(Debug, Clone, Default)]
680pub struct AlgorithmOptions<'a> {
681    /// Load pre-computed SymbolSet from JSON file
682    pub from_symbol_set: Option<&'a str>,
683    /// One-shot: reachable from symbol (shell-out to magellan reachable)
684    pub reachable_from: Option<&'a str>,
685    /// One-shot: dead code from entry point (shell-out to magellan dead-code)
686    pub dead_code_in: Option<&'a str>,
687    /// One-shot: symbols in cycle (shell-out to magellan cycles)
688    pub in_cycle: Option<&'a str>,
689    /// One-shot: backward slice from target (shell-out to magellan slice)
690    pub slice_backward_from: Option<&'a str>,
691    /// One-shot: forward slice from target (shell-out to magellan slice)
692    pub slice_forward_from: Option<&'a str>,
693    /// One-shot: condense SCC detection (shell-out to magellan condense)
694    pub condense: bool,
695    /// One-shot: paths from start symbol (shell-out to magellan paths)
696    pub paths_from: Option<&'a str>,
697    /// Optional end symbol for path enumeration (shell-out to magellan paths --end)
698    pub paths_to: Option<&'a str>,
699}
700
701impl<'a> AlgorithmOptions<'a> {
702    /// Check if any algorithm filter is active
703    pub fn is_active(&self) -> bool {
704        self.from_symbol_set.is_some()
705            || self.reachable_from.is_some()
706            || self.dead_code_in.is_some()
707            || self.in_cycle.is_some()
708            || self.slice_backward_from.is_some()
709            || self.slice_forward_from.is_some()
710            || self.condense
711            || self.paths_from.is_some()
712    }
713
714    /// Create empty AlgorithmOptions
715    pub fn new() -> Self {
716        Self::default()
717    }
718}
719
720/// Result type for algorithm filtering operations.
721///
722/// Contains:
723/// - `Vec<String>` of SymbolIds for filtering
724/// - `HashMap<String, String>` mapping symbol_id -> supernode_id for decoration
725/// - `bool` indicating if path enumeration hit bounds
726pub type AlgorithmFilterResult = Result<(Vec<String>, HashMap<String, String>, bool), LlmError>;
727
728/// Apply algorithm filters and return SymbolSet for search filtering
729///
730/// Handles:
731/// - Pre-computed SymbolSet from file (--from-symbol-set)
732/// - One-shot algorithm execution (--reachable-from, --dead-code-in, etc.)
733/// - FQN resolution for simple names (resolves to SymbolId before shelling out)
734///
735/// Returns: (`Vec<String>` of SymbolIds, `HashMap<String, String>` of symbol_id -> supernode_id, `bool` paths_bounded)
736///         All empty if no active filters
737pub fn apply_algorithm_filters(
738    db_path: &Path,
739    options: &AlgorithmOptions<'_>,
740) -> AlgorithmFilterResult {
741    // Priority 1: Pre-computed SymbolSet from file
742    if let Some(file_path) = options.from_symbol_set {
743        let symbol_set = parse_symbol_set_file(Path::new(file_path))?;
744        symbol_set.validate()?;
745        return Ok((symbol_set.symbol_ids, HashMap::new(), false));
746    }
747
748    // Priority 2: One-shot algorithm execution (only one allowed)
749    // Check for exactly one active one-shot filter
750    let active_count = [
751        options.reachable_from.is_some(),
752        options.dead_code_in.is_some(),
753        options.in_cycle.is_some(),
754        options.slice_backward_from.is_some(),
755        options.slice_forward_from.is_some(),
756        options.condense,
757        options.paths_from.is_some(),
758    ]
759    .iter()
760    .filter(|&&x| x)
761    .count();
762
763    if active_count > 1 {
764        return Err(LlmError::InvalidQuery {
765            query: "Only one one-shot algorithm filter may be specified. Use --from-symbol-set for composed filters.".to_string(),
766        });
767    }
768
769    // Execute the active one-shot algorithm
770    if let Some(symbol) = options.reachable_from {
771        let symbol_id = resolve_fqn_to_symbol_id(db_path, symbol)?;
772        let args = ["--from", &symbol_id];
773        return Ok((run_magellan_algorithm(db_path, "reachable", &args)?.symbol_ids, HashMap::new(), false));
774    }
775
776    if let Some(symbol) = options.dead_code_in {
777        let symbol_id = resolve_fqn_to_symbol_id(db_path, symbol)?;
778        let args = ["--entry", &symbol_id];
779        return Ok((run_magellan_algorithm(db_path, "dead-code", &args)?.symbol_ids, HashMap::new(), false));
780    }
781
782    if let Some(symbol) = options.in_cycle {
783        let symbol_id = resolve_fqn_to_symbol_id(db_path, symbol)?;
784        let args = ["--symbol", &symbol_id];
785        return Ok((run_magellan_algorithm(db_path, "cycles", &args)?.symbol_ids, HashMap::new(), false));
786    }
787
788    if let Some(symbol) = options.slice_backward_from {
789        let symbol_id = resolve_fqn_to_symbol_id(db_path, symbol)?;
790        let args = ["--target", &symbol_id, "--direction", "backward"];
791        return Ok((run_magellan_algorithm(db_path, "slice", &args)?.symbol_ids, HashMap::new(), false));
792    }
793
794    if let Some(symbol) = options.slice_forward_from {
795        let symbol_id = resolve_fqn_to_symbol_id(db_path, symbol)?;
796        let args = ["--target", &symbol_id, "--direction", "forward"];
797        return Ok((run_magellan_algorithm(db_path, "slice", &args)?.symbol_ids, HashMap::new(), false));
798    }
799
800    // Condense SCC detection
801    if options.condense {
802        let output = Command::new("magellan")
803            .args(["condense", "--db", &db_path.to_string_lossy(), "--output", "json"])
804            .output()
805            .map_err(|e| match e.kind() {
806                std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
807                _ => LlmError::MagellanExecutionFailed {
808                    algorithm: "condense".to_string(),
809                    stderr: format!("{}\n\nTry running: magellan condense --db {} for more details",
810                        e, db_path.to_string_lossy()),
811                },
812            })?;
813
814        if !output.status.success() {
815            let stderr = String::from_utf8_lossy(&output.stderr);
816            return Err(LlmError::MagellanExecutionFailed {
817                algorithm: "condense".to_string(),
818                stderr: format!("{}\n\nTry running: magellan condense --db {} for more details",
819                    stderr, db_path.to_string_lossy()),
820            });
821        }
822
823        let json = String::from_utf8_lossy(&output.stdout);
824        let (symbol_ids, supernode_map) = parse_condense_output(&json)?;
825
826        // Return both symbol_ids for filtering and supernode_map for output decoration
827        // Condense doesn't have bounded_hit (always false)
828        return Ok((symbol_ids, supernode_map, false));
829    }
830
831    // Path enumeration
832    if let Some(start_symbol) = options.paths_from {
833        let start_id = resolve_fqn_to_symbol_id(db_path, start_symbol)?;
834        let db_path_str = db_path.to_string_lossy().to_string();
835
836        // Build args: magellan paths --db <DB> --start <ID> [--end <ID>] --max-depth 100 --max-paths 1000 --output json
837        let mut args = vec![
838            "paths".to_string(),
839            "--db".to_string(),
840            db_path_str.clone(),
841            "--start".to_string(),
842            start_id.clone(),
843            "--max-depth".to_string(),
844            "100".to_string(),
845            "--max-paths".to_string(),
846            "1000".to_string(),
847            "--output".to_string(),
848            "json".to_string(),
849        ];
850
851        if let Some(end_symbol) = options.paths_to {
852            let end_id = resolve_fqn_to_symbol_id(db_path, end_symbol)?;
853            args.push("--end".to_string());
854            args.push(end_id);
855        }
856
857        let args_refs: Vec<&str> = args.iter().map(|s| s.as_str()).collect();
858
859        let output = Command::new("magellan")
860            .args(&args_refs)
861            .output()
862            .map_err(|e| match e.kind() {
863                std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
864                _ => LlmError::MagellanExecutionFailed {
865                    algorithm: "paths".to_string(),
866                    stderr: format!("{}\n\nTry running: magellan paths --db {} --start {} for more details",
867                        e, db_path_str, start_id),
868                },
869            })?;
870
871        if !output.status.success() {
872            let stderr = String::from_utf8_lossy(&output.stderr);
873            return Err(LlmError::MagellanExecutionFailed {
874                algorithm: "paths".to_string(),
875                stderr: format!("{}\n\nTry running: magellan paths --db {} --start {} for more details",
876                    stderr, db_path_str, start_id),
877            });
878        }
879
880        let json = String::from_utf8_lossy(&output.stdout);
881        let (symbol_ids, bounded_hit) = parse_paths_output(&json)?;
882
883        // bounded_hit is propagated to main.rs for warning display
884        // No decoration map needed for paths (unlike condense)
885        return Ok((symbol_ids, HashMap::new(), bounded_hit));
886    }
887
888    // No active filters
889    Ok((Vec::new(), HashMap::new(), false))
890}
891
892/// Threshold for using temporary table instead of IN clause
893const SYMBOL_SET_TEMP_TABLE_THRESHOLD: usize = 1000;
894
895/// Create temporary table for large SymbolSet filtering
896///
897/// For SymbolSets larger than SYMBOL_SET_TEMP_TABLE_THRESHOLD,
898/// creates a temp table and returns table name for JOIN.
899pub fn create_symbol_set_temp_table(
900    conn: &Connection,
901    symbol_ids: &[String],
902) -> Result<String, LlmError> {
903    let table_name = format!("symbol_set_filter_{}", std::process::id());
904
905    // Create temporary table
906    conn.execute(
907        &format!("CREATE TEMP TABLE {} (symbol_id TEXT PRIMARY KEY)", table_name),
908        [],
909    )
910    .map_err(LlmError::SqliteError)?;
911
912    // Insert all symbol_ids
913    let mut stmt = conn
914        .prepare(&format!("INSERT INTO {} (symbol_id) VALUES (?)", table_name))
915        .map_err(LlmError::SqliteError)?;
916
917    for symbol_id in symbol_ids {
918        stmt.execute([symbol_id]).map_err(LlmError::SqliteError)?;
919    }
920
921    Ok(table_name)
922}
923
924/// Get the appropriate filtering strategy for a SymbolSet
925pub fn symbol_set_filter_strategy(symbol_ids: &[String]) -> SymbolSetStrategy {
926    if symbol_ids.is_empty() {
927        SymbolSetStrategy::None
928    } else if symbol_ids.len() > SYMBOL_SET_TEMP_TABLE_THRESHOLD {
929        SymbolSetStrategy::TempTable
930    } else {
931        SymbolSetStrategy::InClause
932    }
933}
934
935/// Strategy for filtering by SymbolSet
936#[derive(Debug, Clone, PartialEq)]
937pub enum SymbolSetStrategy {
938    /// No filtering needed
939    None,
940    /// Use SQL IN clause (for <= 1000 items)
941    InClause,
942    /// Use temporary table with JOIN (for > 1000 items)
943    TempTable,
944}
945
946/// Resolve a simple symbol name to its SymbolId using `magellan find`.
947///
948/// When users provide a simple name (e.g., "main") instead of a full SymbolId,
949/// this function queries Magellan's database to find matching symbols.
950///
951/// # Ambiguity Detection
952///
953/// If multiple symbols match the given name, this function returns
954/// `LlmError::AmbiguousSymbolName` with the count of matches. The user must
955/// then provide a more specific identifier (full SymbolId or path-qualified name).
956///
957/// # Arguments
958///
959/// * `db_path` - Path to the Magellan code graph database
960/// * `name` - Simple symbol name to resolve (e.g., "main", "process_request")
961///
962/// # Returns
963///
964/// The SymbolId of the first matching symbol (if unique).
965///
966/// # Errors
967///
968/// Returns `LlmError::MagellanNotFound` if magellan CLI is not available.
969/// Returns `LlmError::AmbiguousSymbolName` if multiple symbols match the name.
970/// Returns `LlmError::InvalidQuery` (LLM-E011) if no symbols match.
971/// Returns `LlmError::SearchFailed` if the `magellan find` command fails.
972///
973/// # Example
974///
975/// ```no_run
976/// use llmgrep::algorithm::resolve_fqn_to_symbol_id;
977/// use std::path::Path;
978///
979/// let db_path = Path::new(".codemcp/codegraph.db");
980/// let symbol_id = resolve_fqn_to_symbol_id(db_path, "main")?;
981/// println!("Resolved to: {}", symbol_id);
982/// # Ok::<(), llmgrep::error::LlmError>(())
983/// ```
984pub fn resolve_fqn_to_symbol_id(db_path: &Path, name: &str) -> Result<String, LlmError> {
985    // Check Magellan availability and version
986    check_magellan_available()?;
987
988    // Run magellan find
989    let output = Command::new("magellan")
990        .args(["find", "--db", &db_path.to_string_lossy(), "--name", name, "--output", "json"])
991        .output()
992        .map_err(|e| match e.kind() {
993            std::io::ErrorKind::NotFound => LlmError::MagellanNotFound,
994            _ => LlmError::SearchFailed {
995                reason: format!("Failed to execute magellan find: {}", e),
996            },
997        })?;
998
999    if !output.status.success() {
1000        let stderr = String::from_utf8_lossy(&output.stderr);
1001        return Err(LlmError::SearchFailed {
1002            reason: format!("magellan find failed: {}", stderr),
1003        });
1004    }
1005
1006    // Parse JSON response
1007    let json: Value = serde_json::from_slice(&output.stdout).map_err(LlmError::JsonError)?;
1008
1009    // Check for matches array
1010    let matches = json["matches"]
1011        .as_array()
1012        .ok_or_else(|| LlmError::InvalidQuery {
1013            query: "Invalid magellan find output: missing 'matches' array".to_string(),
1014        })?;
1015
1016    // Handle ambiguity
1017    if matches.len() > 1 {
1018        return Err(LlmError::AmbiguousSymbolName {
1019            name: name.to_string(),
1020            count: matches.len(),
1021        });
1022    }
1023
1024    // Handle not found
1025    if matches.is_empty() {
1026        return Err(LlmError::InvalidQuery {
1027            query: format!("Symbol '{}' not found in database", name),
1028        });
1029    }
1030
1031    // Extract symbol_id from first match
1032    matches[0]["symbol_id"]
1033        .as_str()
1034        .map(|id| id.to_string())
1035        .ok_or_else(|| LlmError::InvalidQuery {
1036            query: format!(
1037                "Symbol '{}' found but missing symbol_id in magellan output",
1038                name
1039            ),
1040        })
1041}
1042
1043#[cfg(test)]
1044mod tests {
1045    use super::*;
1046
1047    #[test]
1048    fn test_symbol_set_validation_valid() {
1049        let symbol_set = SymbolSet {
1050            symbol_ids: vec![
1051                "abc123def456789012345678901234ab".to_string(),
1052                "0123456789abcdef0123456789abcdef".to_string(),
1053                "ffffffffffffffffffffffffffffffff".to_string(),
1054            ],
1055        };
1056        assert!(symbol_set.validate().is_ok());
1057        assert_eq!(symbol_set.len(), 3);
1058        assert!(!symbol_set.is_empty());
1059    }
1060
1061    #[test]
1062    fn test_symbol_set_validation_invalid_length() {
1063        let symbol_set = SymbolSet {
1064            symbol_ids: vec!["abc123".to_string()],
1065        };
1066        assert!(symbol_set.validate().is_err());
1067    }
1068
1069    #[test]
1070    fn test_symbol_set_validation_invalid_chars() {
1071        let symbol_set = SymbolSet {
1072            symbol_ids: vec!["abc123def456789012345678901234g!".to_string()],
1073        };
1074        assert!(symbol_set.validate().is_err());
1075    }
1076
1077    #[test]
1078    fn test_symbol_set_empty() {
1079        let symbol_set = SymbolSet {
1080            symbol_ids: vec![],
1081        };
1082        assert!(symbol_set.validate().is_ok());
1083        assert_eq!(symbol_set.len(), 0);
1084        assert!(symbol_set.is_empty());
1085    }
1086
1087    #[test]
1088    fn test_symbol_set_json_deserialize() {
1089        let json = r#"{"symbol_ids": ["abc123def456789012345678901234ab"]}"#;
1090        let symbol_set: SymbolSet = serde_json::from_str(json).unwrap();
1091        assert_eq!(symbol_set.symbol_ids.len(), 1);
1092        assert_eq!(
1093            symbol_set.symbol_ids[0],
1094            "abc123def456789012345678901234ab"
1095        );
1096    }
1097
1098    #[test]
1099    fn test_symbol_set_json_serialize() {
1100        let symbol_set = SymbolSet {
1101            symbol_ids: vec!["abc123def456789012345678901234ab".to_string()],
1102        };
1103        let json = serde_json::to_string(&symbol_set).unwrap();
1104        assert!(json.contains("symbol_ids"));
1105        assert!(json.contains("abc123def456789012345678901234ab"));
1106    }
1107}