claude_agent/context/
import_extractor.rs

//! CLI-compatible `@import` extraction using Markdown-aware parsing.
//!
//! This module extracts `@path` import references from Markdown files in a
//! CLI-compatible way. Fenced code blocks and inline code spans are skipped so that
//! `@paths` appearing in code examples are not extracted as imports.
5
6use regex::Regex;
7use std::collections::HashSet;
8use std::path::{Path, PathBuf};
9
10/// Extracts @import paths from Markdown content, CLI-compatible implementation.
11///
12/// # CLI Compatibility
13/// This implementation matches the Claude Code CLI 2.1.12 behavior:
14/// - Uses regex pattern: `(?:^|\s)@((?:[^\s\\]|\\ )+)` to match @paths
15/// - Skips content inside fenced code blocks (```)
16/// - Skips content inside inline code spans (backticks)
17/// - Supports escaped spaces in paths (`\ `)
18/// - Validates paths using the same rules as CLI
19pub struct ImportExtractor {
20    regex: Regex,
21    code_block_regex: Regex,
22    inline_code_regex: Regex,
23}
24
25impl ImportExtractor {
26    /// Creates a new ImportExtractor with CLI-compatible regex.
27    pub fn new() -> Self {
28        Self {
29            // CLI-compatible regex: matches @path at line start or after whitespace
30            // Captures: path with escaped spaces (\ )
31            regex: Regex::new(r"(?:^|\s)@((?:[^\s\\]|\\ )+)").expect("Invalid regex pattern"),
32            // Match fenced code blocks (``` or ~~~)
33            code_block_regex: Regex::new(r"(?s)(?:```|~~~).*?(?:```|~~~)").expect("Invalid regex"),
34            // Match inline code (`...`)
35            inline_code_regex: Regex::new(r"`[^`]+`").expect("Invalid regex"),
36        }
37    }
38
39    /// Extracts @import paths from Markdown content, skipping code blocks.
40    ///
41    /// # Arguments
42    /// * `content` - Markdown content to parse
43    /// * `base_dir` - Base directory for resolving relative paths
44    ///
45    /// # Returns
46    /// Vector of unique resolved PathBufs for valid import paths (duplicates removed)
47    pub fn extract(&self, content: &str, base_dir: &Path) -> Vec<PathBuf> {
48        // Remove fenced code blocks first
49        let without_fenced = self.code_block_regex.replace_all(content, " ");
50        // Remove inline code spans
51        let clean_content = self.inline_code_regex.replace_all(&without_fenced, " ");
52
53        // Extract paths from cleaned content with deduplication
54        let mut seen = HashSet::new();
55        let mut paths = Vec::new();
56        self.extract_from_text_dedup(&clean_content, base_dir, &mut seen, &mut paths);
57        paths
58    }
59
60    /// Extracts @import paths with deduplication.
61    fn extract_from_text_dedup(
62        &self,
63        text: &str,
64        base_dir: &Path,
65        seen: &mut HashSet<PathBuf>,
66        paths: &mut Vec<PathBuf>,
67    ) {
68        for cap in self.regex.captures_iter(text) {
69            if let Some(m) = cap.get(1) {
70                // Unescape spaces (CLI compatibility: `\ ` -> ` `)
71                let raw_path = m.as_str().replace("\\ ", " ");
72                if let Some(resolved) = self.resolve_path(&raw_path, base_dir) {
73                    // Only add if not seen before
74                    if seen.insert(resolved.clone()) {
75                        paths.push(resolved);
76                    }
77                }
78            }
79        }
80    }
81
82    /// Resolves a path string to an absolute PathBuf.
83    ///
84    /// # Path Resolution Rules (CLI-compatible)
85    /// - `~/...` -> Expands to home directory
86    /// - `/...` -> Absolute path (as-is)
87    /// - `./...` or relative -> Relative to base_dir
88    fn resolve_path(&self, path: &str, base_dir: &Path) -> Option<PathBuf> {
89        if !self.is_valid_path(path) {
90            return None;
91        }
92
93        Some(if let Some(rest) = path.strip_prefix("~/") {
94            crate::common::home_dir()?.join(rest)
95        } else if path.starts_with('/') {
96            PathBuf::from(path)
97        } else {
98            base_dir.join(path)
99        })
100    }
101
102    /// Validates a path string using CLI-compatible rules.
103    ///
104    /// # Valid Path Patterns (CLI: isValidPath)
105    /// - Starts with `./` (explicit relative)
106    /// - Starts with `~/` (home directory)
107    /// - Starts with `/` but not just `/` alone (absolute path)
108    /// - Starts with alphanumeric, `.`, `_`, or `-` (implicit relative)
109    /// - Does NOT start with `@` (escaped @)
110    /// - Does NOT start with special characters `#%^&*()`
111    fn is_valid_path(&self, path: &str) -> bool {
112        if path.is_empty() {
113            return false;
114        }
115
116        // CLI-compatible validation rules
117        path.starts_with("./")
118            || path.starts_with("~/")
119            || (path.starts_with('/') && path != "/")
120            || (!path.starts_with('@')
121                && !path.starts_with(|c| "#%^&*()".contains(c))
122                && path
123                    .starts_with(|c: char| c.is_alphanumeric() || c == '.' || c == '_' || c == '-'))
124    }
125}
126
127impl Default for ImportExtractor {
128    fn default() -> Self {
129        Self::new()
130    }
131}
132
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_line_start() {
        let extractor = ImportExtractor::new();
        let content = "@docs/api.md\n@config/settings.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 2);
        assert!(imports[0].ends_with("docs/api.md"));
        assert!(imports[1].ends_with("config/settings.md"));
    }

    #[test]
    fn test_extract_inline() {
        let extractor = ImportExtractor::new();
        let content = "Prerequisites: @docs/guide.md for details";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("docs/guide.md"));
    }

    #[test]
    fn test_skip_fenced_code_block() {
        let extractor = ImportExtractor::new();
        let content = "```\n@should/not/import.md\n```\n@should/import.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("should/import.md"));
    }

    #[test]
    fn test_skip_tilde_fenced_code_block() {
        let extractor = ImportExtractor::new();
        let content = "~~~\n@should/not/import.md\n~~~\n@should/import.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("should/import.md"));
    }

    #[test]
    fn test_indented_code_block_not_skipped() {
        let extractor = ImportExtractor::new();
        let content =
            "Normal text @real/import.md\n\n    @indented/code.md\n\nMore @another/import.md";
        let imports = extractor.extract(content, Path::new("/project"));
        // Indented code blocks are NOT skipped (CLI-compatible behavior): only fenced
        // code blocks (``` or ~~~) and inline code are skipped, so all three paths
        // must be extracted.
        assert_eq!(imports.len(), 3);
        assert!(imports.iter().any(|p| p.ends_with("real/import.md")));
        assert!(imports.iter().any(|p| p.ends_with("indented/code.md")));
        assert!(imports.iter().any(|p| p.ends_with("another/import.md")));
    }

    #[test]
    fn test_skip_inline_code() {
        let extractor = ImportExtractor::new();
        let content = "Use `@decorator` syntax and @real/import.md file";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("real/import.md"));
    }

    #[test]
    fn test_home_expansion() {
        let extractor = ImportExtractor::new();
        let content = "@~/shared/config.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(!imports[0].to_string_lossy().contains('~'));
    }

    #[test]
    fn test_relative_paths() {
        let extractor = ImportExtractor::new();
        let content = "@./local/file.md";
        let imports = extractor.extract(content, Path::new("/project/subdir"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].starts_with("/project/subdir"));
    }

    #[test]
    fn test_absolute_path() {
        let extractor = ImportExtractor::new();
        let content = "@/absolute/path/file.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert_eq!(imports[0], PathBuf::from("/absolute/path/file.md"));
    }

    #[test]
    fn test_invalid_paths_ignored() {
        let extractor = ImportExtractor::new();
        let content = "@#invalid @%also-invalid @^nope @&bad @*bad @(bad @)bad";
        let imports = extractor.extract(content, Path::new("/project"));
        assert!(imports.is_empty());
    }

    #[test]
    fn test_escaped_at_ignored() {
        let extractor = ImportExtractor::new();
        let content = "@@escaped @valid/path.md";
        let imports = extractor.extract(content, Path::new("/project"));
        // `@@escaped` captures `@escaped`, which starts with `@` and is rejected, so
        // only `@valid/path.md` may be returned.
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("valid/path.md"));
    }

    #[test]
    fn test_escaped_spaces_in_path() {
        let extractor = ImportExtractor::new();
        let content = r"@docs/my\ file.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1);
        assert!(imports[0].ends_with("docs/my file.md"));
    }

    #[test]
    fn test_root_slash_only_invalid() {
        let extractor = ImportExtractor::new();
        // "/" alone is not a valid import path
        assert!(!extractor.is_valid_path("/"));
    }

    #[test]
    fn test_implicit_relative_path() {
        let extractor = ImportExtractor::new();

        // Verify is_valid_path accepts each pattern
        assert!(
            extractor.is_valid_path("docs/file.md"),
            "alphanumeric start"
        );
        assert!(
            extractor.is_valid_path("_private/config.md"),
            "underscore start"
        );
        assert!(
            extractor.is_valid_path(".hidden/file.md"),
            "dot start (not ./)"
        );

        // Test extraction of each pattern individually
        let content1 = "@docs/file.md";
        let imports1 = extractor.extract(content1, Path::new("/project"));
        assert_eq!(imports1.len(), 1, "alphanumeric path");

        let content2 = "@_private/config.md";
        let imports2 = extractor.extract(content2, Path::new("/project"));
        assert_eq!(imports2.len(), 1, "underscore path");

        let content3 = "@.hidden/file.md";
        let imports3 = extractor.extract(content3, Path::new("/project"));
        assert_eq!(imports3.len(), 1, "dot path");
    }

    #[test]
    fn test_multiple_imports_same_line() {
        let extractor = ImportExtractor::new();
        let content = "Include @first.md and @second.md and @third.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 3);
    }

    #[test]
    fn test_empty_content() {
        let extractor = ImportExtractor::new();
        let imports = extractor.extract("", Path::new("/project"));
        assert!(imports.is_empty());
    }

    #[test]
    fn test_no_imports() {
        let extractor = ImportExtractor::new();
        let content = "# Title\n\nJust regular content without any imports.";
        let imports = extractor.extract(content, Path::new("/project"));
        assert!(imports.is_empty());
    }

    #[test]
    fn test_markdown_link_not_imported() {
        let extractor = ImportExtractor::new();
        // Markdown link format: [@text](@path) should NOT be extracted
        // because @ follows [ and ( which are not whitespace
        let content = "See [@.agents/docs.md](@.agents/docs.md) for details";
        let imports = extractor.extract(content, Path::new("/project"));
        assert!(
            imports.is_empty(),
            "Markdown links should not be extracted as imports"
        );
    }

    #[test]
    fn test_duplicate_import_paths_deduped() {
        let extractor = ImportExtractor::new();
        let content = "@docs/api.md\nSome text\n@docs/api.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1, "Duplicates should be removed");
        assert!(imports[0].ends_with("docs/api.md"));
    }

    #[test]
    fn test_same_file_inline_twice_deduped() {
        let extractor = ImportExtractor::new();
        let content = "See @docs/api.md and also @docs/api.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 1, "Duplicates should be removed");
    }

    #[test]
    fn test_different_paths_not_deduped() {
        let extractor = ImportExtractor::new();
        let content = "@docs/api.md\n@docs/guide.md\n@docs/api.md";
        let imports = extractor.extract(content, Path::new("/project"));
        assert_eq!(imports.len(), 2, "Different paths should be preserved");
        // First-seen order is kept when the later duplicate is dropped.
        assert!(imports[0].ends_with("docs/api.md"));
        assert!(imports[1].ends_with("docs/guide.md"));
    }
}