Skip to main content

sqry_cli/
index_discovery.rs

1//! Index discovery module for finding unified graph in ancestor directories.
2//!
3//! This module implements git-like behavior where sqry walks up the directory
4//! tree to find the nearest graph index, enabling queries from subdirectories to
5//! automatically use a parent index with appropriate scope filtering.
6
7use sqry_core::workspace::{MAX_ANCESTOR_DEPTH, WorkspaceRootDiscovery, discover_workspace_root};
8use std::path::{Path, PathBuf};
9
/// File name of the legacy single-file index (deprecated).
///
/// Still probed by [`find_nearest_index`] as a fallback when no unified graph
/// is discovered, so that older layouts keep working.
pub const INDEX_FILE_NAME: &str = ".sqry-index";

/// Characters that need escaping in path patterns for sqry query language.
/// These are glob metacharacters that would be interpreted specially; the
/// backslash is listed because it is the escape character itself.
const PATH_ESCAPE_CHARS: &[char] = &['*', '?', '[', ']', '{', '}', '\\'];
16
/// Result of index discovery, containing location and scope information.
///
/// Two locations compare equal when every field matches, which lets callers
/// and tests compare discovery results directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexLocation {
    /// Directory that owns the discovered index: either the unified graph
    /// root reported by workspace discovery, or the directory holding a
    /// legacy `.sqry-index` file.
    pub index_root: PathBuf,

    /// Path the user requested (for scoping results)
    pub query_scope: PathBuf,

    /// True if index was found in an ancestor directory (relative to start dir)
    pub is_ancestor: bool,

    /// True if the query scope is a file (not a directory)
    pub is_file_query: bool,

    /// True if query augmentation/filtering is needed.
    /// This is true when:
    /// - Index is in ancestor directory (`is_ancestor`), OR
    /// - Query targets a specific file (`is_file_query`)
    ///
    /// Note: File queries always need filtering even when the index
    /// is in the file's parent directory (`is_ancestor` would be false
    /// because discovery starts from the file's parent directory).
    pub requires_scope_filter: bool,
}
42
43impl IndexLocation {
44    /// Get the relative path from `index_root` to `query_scope` for filtering.
45    ///
46    /// Returns:
47    /// - `Some(relative_path)` when scope filtering is needed and path is inside index root
48    /// - `None` when no filtering needed (`query_scope` == `index_root` and !`is_file_query`)
49    /// - `None` when `query_scope` is outside `index_root` (edge case, shouldn't happen)
50    ///
51    /// Note: Uses `requires_scope_filter` (not `is_ancestor`) to ensure file queries
52    /// in the index root still compute their relative scope for exact-match filtering.
53    #[must_use]
54    pub fn relative_scope(&self) -> Option<PathBuf> {
55        if self.requires_scope_filter {
56            self.query_scope
57                .strip_prefix(&self.index_root)
58                .ok()
59                .map(Path::to_path_buf)
60        } else {
61            None
62        }
63    }
64}
65
66/// Find the nearest unified graph (or legacy `.sqry-index` file) by
67/// walking up from the given path.
68///
69/// # Algorithm
70///
71/// 1. First consult [`discover_workspace_root`] (cluster-E §E.1). The walk
72///    is bounded by [`MAX_ANCESTOR_DEPTH`] and stops at the first project
73///    marker (`.git`, `Cargo.toml`, `package.json`, `pyproject.toml`,
74///    `go.mod`). A graph above the project boundary is discarded — this
75///    eliminates the "stray `~/.sqry/graph`" foot-gun where a leftover
76///    graph at `$HOME` was silently picked up for a brand-new project.
77/// 2. If the discovery returns `BoundaryOnly`, also walk for the legacy
78///    `.sqry-index` file from `start` up to (but not above) the project
79///    boundary, since `discover_workspace_root` already records legacy
80///    `.sqry-index` files but does so without producing an
81///    `IndexLocation`. We keep this fallback path for backward
82///    compatibility with v1 layouts.
83///
84/// # Returns
85///
86/// * `Some(IndexLocation)` if a unified graph (or legacy index) was found
87///   inside the project boundary.
88/// * `None` if no usable index exists in any ancestor below the project
89///   boundary, or if the walk hit [`MAX_ANCESTOR_DEPTH`] without finding
90///   either.
91#[must_use]
92pub fn find_nearest_index(start: &Path) -> Option<IndexLocation> {
93    let query_scope = start.to_path_buf();
94
95    // Canonicalize for consistent path matching; fall back to original if fails
96    // (e.g., permission denied, path doesn't exist yet)
97    let canonical_start = start.canonicalize().unwrap_or_else(|_| start.to_path_buf());
98
99    // Determine if input is a file or directory
100    let is_file_query = canonical_start.is_file();
101
102    // Step 1: bounded discovery via the shared workspace walker.
103    let (boundary, graph_root, depth_to_graph) = match discover_workspace_root(&canonical_start) {
104        WorkspaceRootDiscovery::GraphFound {
105            root,
106            boundary,
107            depth,
108            ..
109        } => (Some(boundary), Some(root), Some(depth)),
110        WorkspaceRootDiscovery::BoundaryOnly { boundary, .. } => (Some(boundary), None, None),
111        WorkspaceRootDiscovery::None => (None, None, None),
112    };
113
114    if let (Some(root), Some(depth)) = (graph_root, depth_to_graph) {
115        // `depth` is measured from the canonicalised start (or its parent for
116        // file inputs). The legacy `is_ancestor` flag fires whenever the
117        // index lives above the *original* request, including the file-input
118        // case (where we walked up from the parent directory).
119        let is_ancestor = depth > 0;
120        return Some(IndexLocation {
121            index_root: root,
122            query_scope: query_scope.canonicalize().unwrap_or(query_scope),
123            is_ancestor,
124            is_file_query,
125            requires_scope_filter: is_ancestor || is_file_query,
126        });
127    }
128
129    // Step 2: legacy `.sqry-index` fallback — walk up from `start` (or its
130    // parent for file inputs) but never above the project boundary.
131    let mut dir: PathBuf = if is_file_query {
132        canonical_start
133            .parent()
134            .map_or_else(|| canonical_start.clone(), Path::to_path_buf)
135    } else {
136        canonical_start.clone()
137    };
138    if dir.is_relative()
139        && let Ok(cwd) = std::env::current_dir()
140    {
141        dir = cwd.join(&dir);
142    }
143
144    for ancestor_depth in 0..MAX_ANCESTOR_DEPTH {
145        let legacy_index_path = dir.join(INDEX_FILE_NAME);
146        if legacy_index_path.exists() && legacy_index_path.is_file() {
147            let is_ancestor = ancestor_depth > 0;
148            return Some(IndexLocation {
149                index_root: dir,
150                query_scope: query_scope.canonicalize().unwrap_or(query_scope),
151                is_ancestor,
152                is_file_query,
153                requires_scope_filter: is_ancestor || is_file_query,
154            });
155        }
156        // Stop at the project boundary so a stray legacy index in $HOME is
157        // never picked up for a project that has its own marker.
158        if let Some(b) = boundary.as_ref()
159            && &dir == b
160        {
161            break;
162        }
163        if !dir.pop() {
164            break;
165        }
166    }
167
168    None
169}
170
171/// Escape special characters in a path component for safe use in path: predicate.
172/// Also normalizes Windows backslashes to forward slashes for consistent query syntax.
173///
174/// # Double Escaping for Glob Patterns
175/// Glob metacharacters need double-escaping because there are two parsing stages:
176/// 1. Query lexer: `\\[` → `\[` (consumes one level of escaping)
177/// 2. Globset matcher: `\[` → literal `[` (consumes second level)
178///
179/// Without double-escaping, `src/[test]` would become `path:"src/\[test\]/**"`,
180/// lexer would yield `src/[test]/**`, and globset would treat `[test]` as a
181/// character class instead of a literal directory name.
182fn escape_path_for_query(path: &Path) -> String {
183    let path_str = path.to_string_lossy();
184    let mut escaped = String::with_capacity(path_str.len() + 20);
185
186    for ch in path_str.chars() {
187        // Normalize Windows backslashes to forward slashes
188        if ch == '\\' && cfg!(windows) {
189            escaped.push('/');
190            continue;
191        }
192        if ch == '\\' {
193            // Backslash needs 4 chars: `\\\\` → lexer `\\` → globset `\`
194            escaped.push_str("\\\\\\\\");
195        } else if PATH_ESCAPE_CHARS.contains(&ch) {
196            // Other glob chars: `\\[` → lexer `\[` → globset literal `[`
197            escaped.push_str("\\\\");
198            escaped.push(ch);
199        } else {
200            escaped.push(ch);
201        }
202    }
203
204    escaped
205}
206
207/// Check if a path requires quoting due to special characters.
208/// Paths need quoting when they contain:
209/// - Spaces or double quotes (for tokenization)
210/// - Glob metacharacters with escapes (backslash escapes only work in quoted strings)
211/// - A leading character that the query lexer does not accept as a word start
212///   (the lexer's `is_word_start` permits only ASCII alphabetic + `_`, so paths
213///   beginning with `.`, `-`, a digit, `/`, etc. must be quoted).
214fn path_needs_quoting(path: &Path) -> bool {
215    let path_str = path.to_string_lossy();
216    let leading_requires_quoting = path_str
217        .chars()
218        .next()
219        .is_some_and(|c| !c.is_ascii_alphabetic() && c != '_');
220    leading_requires_quoting
221        || path_str
222            .chars()
223            .any(|c| c == ' ' || c == '"' || PATH_ESCAPE_CHARS.contains(&c))
224}
225
226/// Augment a query with an implicit path filter when using ancestor index.
227///
228/// # Arguments
229/// * `query` - Original query string
230/// * `relative_scope` - Path relative to index root to filter by
231/// * `is_file_query` - True if scope is a file, false if directory
232///
233/// # Returns
234/// Query string with path filter appended
235///
236/// # Path Handling
237/// - Paths with spaces, quotes, or glob metacharacters are quoted automatically
238/// - Inside quotes, glob metacharacters are escaped with backslashes
239/// - The implicit filter is `ANDed` with the original query
240/// - Parentheses ensure correct precedence
241/// - File queries use exact path match; directory queries use `/**` glob
242#[must_use]
243pub fn augment_query_with_scope(query: &str, relative_scope: &Path, is_file_query: bool) -> String {
244    // Empty scope means no filtering needed
245    if relative_scope.as_os_str().is_empty() {
246        return query.to_string();
247    }
248
249    // Build the path filter pattern
250    // - File query: exact match (no glob suffix)
251    // - Directory query: recursive glob (/**)
252    let scope_pattern = if path_needs_quoting(relative_scope) {
253        // Escape glob metacharacters (backslash escapes only work in quoted strings)
254        let escaped_path = escape_path_for_query(relative_scope);
255        // Also escape internal double quotes
256        let quoted = escaped_path.replace('"', "\\\"");
257        if is_file_query {
258            format!("\"{quoted}\"")
259        } else {
260            format!("\"{quoted}/**\"")
261        }
262    } else {
263        // Simple path without special characters - use unquoted
264        let path_str = relative_scope.to_string_lossy();
265        if is_file_query {
266            path_str.into_owned()
267        } else {
268            format!("{path_str}/**")
269        }
270    };
271
272    let path_filter = format!("path:{scope_pattern}");
273
274    if query.trim().is_empty() {
275        path_filter
276    } else {
277        // Wrap original query in parentheses to preserve precedence
278        // Example: "kind:fn OR kind:method" -> "(kind:fn OR kind:method) AND path:src/**"
279        format!("({query}) AND {path_filter}")
280    }
281}
282
#[cfg(test)]
mod tests {
    //! Unit tests for index discovery and query scope augmentation.
    //!
    //! Discovery tests build throwaway directory trees containing a legacy
    //! `.sqry-index` marker file; augmentation tests are pure string checks.

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Helper to create a minimal index file for discovery tests.
    fn create_test_index(path: &Path) {
        let index_path = path.join(INDEX_FILE_NAME);
        fs::write(&index_path, "test-index-marker").unwrap();
    }

    #[test]
    fn find_nearest_index_at_current_dir() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path());

        let loc = find_nearest_index(tmp.path()).expect("index in the start dir should be found");

        assert_eq!(loc.index_root, tmp.path().canonicalize().unwrap());
        assert!(!loc.is_ancestor);
        assert!(!loc.is_file_query);
        assert!(!loc.requires_scope_filter);
    }

    #[test]
    fn find_nearest_index_in_parent() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path());

        let subdir = tmp.path().join("src");
        fs::create_dir(&subdir).unwrap();

        let loc = find_nearest_index(&subdir).expect("index in the parent should be found");

        assert_eq!(loc.index_root, tmp.path().canonicalize().unwrap());
        assert!(loc.is_ancestor);
        assert!(!loc.is_file_query);
        assert!(loc.requires_scope_filter);
    }

    #[test]
    fn find_nearest_index_in_grandparent() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path());

        let deep = tmp.path().join("src").join("utils");
        fs::create_dir_all(&deep).unwrap();

        let loc = find_nearest_index(&deep).expect("index in the grandparent should be found");

        assert_eq!(loc.index_root, tmp.path().canonicalize().unwrap());
        assert!(loc.is_ancestor);
        assert!(loc.requires_scope_filter);
    }

    #[test]
    fn find_nearest_index_none_found() {
        let tmp = TempDir::new().unwrap();
        // No index created

        let result = find_nearest_index(tmp.path());

        // The search traverses ancestor directories, so an index that happens
        // to exist in an ancestor of the temp dir (e.g. left behind by a
        // previous run) may legitimately be found. We only assert that no
        // index was found *within* the temp dir itself.
        if let Some(loc) = &result {
            let tmp_canonical = tmp.path().canonicalize().unwrap();
            assert!(
                !loc.index_root.starts_with(&tmp_canonical),
                "found unexpected index inside temp dir: {:?}",
                loc.index_root
            );
        }
    }

    #[test]
    fn find_nearest_index_nested_repos() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path()); // Root index

        let inner = tmp.path().join("packages").join("web");
        fs::create_dir_all(&inner).unwrap();
        create_test_index(&inner); // Inner index

        let query_path = inner.join("src");
        fs::create_dir(&query_path).unwrap();

        // Should find the nearest (inner) index
        let loc = find_nearest_index(&query_path).expect("inner index should be found");

        assert_eq!(loc.index_root, inner.canonicalize().unwrap());
        assert!(loc.is_ancestor);
    }

    #[test]
    fn find_nearest_index_file_input() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path());

        let subdir = tmp.path().join("src");
        fs::create_dir(&subdir).unwrap();
        let file = subdir.join("main.rs");
        fs::write(&file, "fn main() {}").unwrap();

        let loc = find_nearest_index(&file).expect("index above the file should be found");

        assert!(loc.is_file_query);
        assert!(loc.is_ancestor); // Index is in grandparent
        assert!(loc.requires_scope_filter);
    }

    #[test]
    fn find_nearest_index_file_in_index_dir() {
        let tmp = TempDir::new().unwrap();
        create_test_index(tmp.path());

        let file = tmp.path().join("main.rs");
        fs::write(&file, "fn main() {}").unwrap();

        let loc = find_nearest_index(&file).expect("index next to the file should be found");

        assert!(!loc.is_ancestor); // Index is in file's parent
        assert!(loc.is_file_query);
        assert!(loc.requires_scope_filter); // File queries always need filtering
    }

    #[test]
    fn relative_scope_calculation() {
        let loc = IndexLocation {
            index_root: PathBuf::from("/project"),
            query_scope: PathBuf::from("/project/src/utils"),
            is_ancestor: true,
            is_file_query: false,
            requires_scope_filter: true,
        };

        let scope = loc.relative_scope();
        assert_eq!(scope, Some(PathBuf::from("src/utils")));
    }

    #[test]
    fn relative_scope_same_dir() {
        let loc = IndexLocation {
            index_root: PathBuf::from("/project"),
            query_scope: PathBuf::from("/project"),
            is_ancestor: false,
            is_file_query: false,
            requires_scope_filter: false,
        };

        let scope = loc.relative_scope();
        assert!(scope.is_none());
    }

    #[test]
    fn relative_scope_file_in_root() {
        let loc = IndexLocation {
            index_root: PathBuf::from("/project"),
            query_scope: PathBuf::from("/project/main.rs"),
            is_ancestor: false,
            is_file_query: true,
            requires_scope_filter: true,
        };

        let scope = loc.relative_scope();
        assert_eq!(scope, Some(PathBuf::from("main.rs")));
    }

    #[test]
    fn augment_query_with_scope_basic() {
        let result = augment_query_with_scope("kind:function", Path::new("src"), false);
        assert_eq!(result, "(kind:function) AND path:src/**");
    }

    #[test]
    fn augment_query_with_scope_empty_query() {
        let result = augment_query_with_scope("", Path::new("src"), false);
        assert_eq!(result, "path:src/**");
    }

    #[test]
    fn augment_query_with_scope_empty_path() {
        let result = augment_query_with_scope("kind:fn", Path::new(""), false);
        assert_eq!(result, "kind:fn");
    }

    #[test]
    fn augment_query_with_scope_file_query() {
        let result = augment_query_with_scope("kind:function", Path::new("src/main.rs"), true);
        assert_eq!(result, "(kind:function) AND path:src/main.rs");
    }

    #[test]
    fn augment_query_with_scope_directory_query() {
        // Nested directory scope: the recursive glob is appended to the full
        // relative path (the single-component case is covered by `_basic`).
        let result = augment_query_with_scope("kind:function", Path::new("src/utils"), false);
        assert_eq!(result, "(kind:function) AND path:src/utils/**");
    }

    #[test]
    fn augment_query_file_with_spaces() {
        let result =
            augment_query_with_scope("kind:function", Path::new("my project/main.rs"), true);
        assert_eq!(result, "(kind:function) AND path:\"my project/main.rs\"");
    }

    #[test]
    fn augment_query_with_scope_path_with_spaces() {
        let result = augment_query_with_scope("kind:function", Path::new("my project/src"), false);
        assert_eq!(result, "(kind:function) AND path:\"my project/src/**\"");
    }

    #[test]
    fn augment_query_with_scope_path_with_glob_chars() {
        // Paths with glob metacharacters must be quoted and double-escaped:
        // - CLI emits `\\[` so lexer returns `\[` for globset to interpret as literal `[`
        let result = augment_query_with_scope("kind:function", Path::new("src/[test]"), false);
        assert_eq!(result, "(kind:function) AND path:\"src/\\\\[test\\\\]/**\"");
    }

    #[test]
    fn augment_query_with_scope_path_with_double_quote() {
        // An embedded double quote forces quoting and is itself backslash-escaped
        // so it does not terminate the quoted value.
        let result = augment_query_with_scope("kind:function", Path::new("a\"b/src"), false);
        assert_eq!(result, "(kind:function) AND path:\"a\\\"b/src/**\"");
    }

    #[test]
    fn augment_query_preserves_precedence() {
        let result = augment_query_with_scope("kind:fn OR kind:method", Path::new("src"), false);
        assert_eq!(result, "(kind:fn OR kind:method) AND path:src/**");
    }

    #[test]
    fn augment_query_with_existing_path_predicate() {
        let result =
            augment_query_with_scope("kind:fn AND path:*.rs", Path::new("src/utils"), false);
        assert_eq!(result, "(kind:fn AND path:*.rs) AND path:src/utils/**");
    }

    #[test]
    fn augment_query_path_with_leading_dot() {
        // Paths starting with `.` (e.g. hidden directories or git worktrees under
        // `.worktrees/`) must be quoted because the query lexer's word-start rule
        // accepts only ASCII alpha + `_`. An unquoted `path:.worktrees/...` value
        // parses as `path:` followed by a stray `.` and fails.
        let result = augment_query_with_scope(
            "kind:function",
            Path::new(".worktrees/phase3a/test-fixtures/cli-basic"),
            false,
        );
        assert_eq!(
            result,
            "(kind:function) AND path:\".worktrees/phase3a/test-fixtures/cli-basic/**\""
        );
    }

    #[test]
    fn augment_query_path_with_leading_digit() {
        // Paths starting with a digit similarly violate the lexer's word-start rule.
        let result =
            augment_query_with_scope("kind:function", Path::new("2024-archive/src"), false);
        assert_eq!(result, "(kind:function) AND path:\"2024-archive/src/**\"");
    }

    #[test]
    #[cfg(unix)]
    fn escape_path_with_backslash_on_unix() {
        // Backslash in path gets double-escaped: `\` → `\\\\` (4 chars in raw string)
        // So lexer returns `\\` and globset matches literal backslash
        let result = escape_path_for_query(Path::new("src/file\\name"));
        assert_eq!(result, "src/file\\\\\\\\name");
    }

    /// Test that augmented queries with special characters can be parsed by the lexer.
    /// This ensures the escaping strategy produces valid query syntax.
    #[test]
    fn augmented_queries_are_parseable() {
        use sqry_core::query::Lexer;

        let test_cases = [
            // Simple path (no escaping needed)
            ("kind:fn", Path::new("src"), false),
            // Path with spaces (quoted)
            ("kind:fn", Path::new("my project/src"), false),
            // Path with glob metacharacters (quoted + escaped)
            ("kind:fn", Path::new("src/[test]"), false),
            ("kind:fn", Path::new("src/test*"), false),
            ("kind:fn", Path::new("src/test?"), false),
            ("kind:fn", Path::new("src/{a,b}"), false),
            // File queries
            ("kind:fn", Path::new("src/main.rs"), true),
            ("kind:fn", Path::new("src/[test]/main.rs"), true),
            // Complex query with special path
            ("kind:fn OR kind:method", Path::new("src/[utils]"), false),
        ];

        for (query, path, is_file) in test_cases {
            let augmented = augment_query_with_scope(query, path, is_file);
            let mut lexer = Lexer::new(&augmented);
            let result = lexer.tokenize();
            assert!(
                result.is_ok(),
                "Failed to parse augmented query for path {:?}: {:?}\nQuery: {}",
                path,
                result.err(),
                augmented
            );
        }
    }
}