agpm_cli/markdown/
reference_extractor.rs

1//! File reference extraction and validation for markdown documents.
2//!
3//! This module provides utilities to extract and validate markdown file references
4//! within markdown content. It helps catch broken cross-references before
5//! installation by checking that referenced files actually exist.
6//!
7//! # Supported Reference Types
8//!
9//! - **Markdown links**: `[text](path.md)` - only `.md` files
10//! - **Direct file paths**: `.agpm/snippets/file.md`, `docs/guide.md` - only `.md` files
11//!
12//! # Extraction Rules
13//!
14//! The extractor intelligently filters references to avoid false positives:
15//! - Skips absolute URLs (http://, https://, etc.)
16//! - Skips absolute filesystem paths (starting with /)
17//! - Skips content inside YAML frontmatter (--- delimited)
18//! - Skips content inside code blocks (``` delimited)
//! - Does not skip inline code: paths wrapped in single backticks are still extracted
20//! - Only extracts relative markdown file paths (.md extension)
21//!
22//! # Usage
23//!
24//! ```rust,no_run
25//! use agpm_cli::markdown::reference_extractor::{extract_file_references, validate_file_references};
26//! use std::path::Path;
27//!
28//! # fn example() -> anyhow::Result<()> {
29//! let markdown = r#"
30//! See [documentation](../docs/guide.md) for details.
31//!
32//! Also check `.agpm/snippets/example.md` for examples.
33//! "#;
34//!
35//! let references = extract_file_references(markdown);
36//! // Returns: ["../docs/guide.md", ".agpm/snippets/example.md"]
37//!
38//! // Validate references exist
39//! let project_dir = Path::new("/path/to/project");
40//! let missing = validate_file_references(&references, project_dir)?;
41//! # Ok(())
42//! # }
43//! ```
44
45use anyhow::Result;
46use regex::Regex;
47use std::path::Path;
48
49use crate::markdown::frontmatter::FrontmatterParser;
50
/// Record of a reference that points at a file which could not be found.
///
/// Pairs the markdown file containing the reference with the path it
/// referenced, so a broken cross-reference can be reported with its origin.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MissingReference {
    /// Markdown file in which the broken reference appears.
    pub source_file: String,

    /// Path named by the reference that was not found.
    pub referenced_path: String,
}

impl MissingReference {
    /// Build a record from its two parts.
    ///
    /// # Arguments
    ///
    /// * `source_file` - File that contains the reference
    /// * `referenced_path` - Referenced path that does not exist
    #[must_use]
    pub fn new(source_file: String, referenced_path: String) -> Self {
        MissingReference { source_file, referenced_path }
    }
}
79
80/// Extract markdown file references from markdown content.
81///
82/// This function scans markdown content for markdown file path references and returns
83/// a deduplicated list of relative markdown file paths. It intelligently filters out
84/// URLs, absolute paths, non-markdown files, and references inside code blocks.
85///
86/// # Extracted Reference Types
87///
88/// - Markdown links: `[text](path.md)` → extracts `path.md` (only `.md` files)
89/// - Direct file paths: `.agpm/snippets/file.md` → extracts `.agpm/snippets/file.md` (only `.md` files)
90///
91/// # Filtering Rules
92///
93/// References are excluded if they:
94/// - Start with URL schemes (http://, https://, ftp://, etc.)
95/// - Are absolute paths (starting with /)
96/// - Appear inside YAML frontmatter (--- delimited at file start)
97/// - Appear inside code blocks (``` delimited)
98/// - Appear inside inline code (` delimited)
99/// - Don't have the .md extension
100/// - Contain URL-like patterns (://)
101///
102/// # Arguments
103///
104/// * `content` - The markdown content to scan
105///
106/// # Returns
107///
108/// A vector of unique relative file paths found in the content
109///
110/// # Examples
111///
112/// ```rust,no_run
113/// # use agpm_cli::markdown::reference_extractor::extract_file_references;
114/// let markdown = r#"
115/// Check [docs](./guide.md) and `.agpm/snippets/example.md`.
116///
117/// But not this [external link](https://example.com) or `inline code .md`.
118/// "#;
119///
120/// let refs = extract_file_references(markdown);
121/// assert_eq!(refs.len(), 2);
122/// assert!(refs.contains(&"./guide.md".to_string()));
123/// assert!(refs.contains(&".agpm/snippets/example.md".to_string()));
124/// ```
125#[must_use]
126pub fn extract_file_references(content: &str) -> Vec<String> {
127    let mut references = Vec::new();
128
129    // Remove frontmatter and code blocks to avoid extracting paths from metadata
130    let content_without_frontmatter = remove_frontmatter(content);
131    let content_without_code = remove_code_blocks(&content_without_frontmatter);
132
133    // Extract markdown links: [text](path.md) - only .md files
134    if let Ok(link_regex) = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)") {
135        for cap in link_regex.captures_iter(&content_without_code) {
136            if let Some(path) = cap.get(2) {
137                let path_str = path.as_str();
138                // Only include markdown files
139                if path_str.ends_with(".md") && is_valid_file_reference(path_str) {
140                    references.push(path_str.to_string());
141                }
142            }
143        }
144    }
145
146    // Extract direct file paths with markdown extensions
147    // Pattern: paths containing / with .md extension only
148    if let Ok(path_regex) = Regex::new(r#"(?:^|\s|["'`])([./a-zA-Z_][\w./-]*\.md)(?:\s|["'`]|$)"#) {
149        for cap in path_regex.captures_iter(&content_without_code) {
150            if let Some(path) = cap.get(1) {
151                let path_str = path.as_str();
152                if is_valid_file_reference(path_str) {
153                    references.push(path_str.to_string());
154                }
155            }
156        }
157    }
158
159    // Deduplicate while preserving order
160    let mut seen = std::collections::HashSet::new();
161    references.retain(|r| seen.insert(r.clone()));
162
163    references
164}
165
166/// Remove YAML frontmatter from markdown content.
167///
168/// This prevents extracting dependency paths from frontmatter metadata,
169/// which are transitive dependencies rather than actual file references in
170/// the content.
171///
172/// # Arguments
173///
174/// * `content` - The markdown content
175///
176/// # Returns
177///
178/// Content with frontmatter removed (--- delimited at the start)
179fn remove_frontmatter(content: &str) -> String {
180    let parser = FrontmatterParser::new();
181    parser.strip_frontmatter(content)
182}
183
/// Blank out fenced code blocks in markdown content.
///
/// File paths inside code examples must not be validated as real references,
/// so everything from an opening fence through its closing fence (fences
/// included) is replaced by spaces, one space per original character.
/// Inline code (runs of one or two backticks outside a block) is left
/// untouched because it may hold legitimate file path references.
///
/// A fence is any run of three or more backticks. A block is closed only by
/// a run at least as long as the one that opened it, so a shorter fence
/// nested inside a longer one (for example a ``` example shown inside a
/// ```` block) stays hidden instead of ending the block early.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// Content with fenced code blocks replaced by spaces; the number of
/// characters is unchanged
fn remove_code_blocks(content: &str) -> String {
    let mut result = String::with_capacity(content.len());
    // Length of the fence that opened the current block; `None` outside a block.
    let mut open_fence: Option<usize> = None;
    let mut chars = content.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '`' {
            // Ordinary character: hidden inside a block, copied outside.
            result.push(if open_fence.is_some() { ' ' } else { ch });
            continue;
        }

        // Measure the full run of consecutive backticks.
        let mut run = 1;
        while chars.peek() == Some(&'`') {
            run += 1;
            chars.next();
        }

        let blank = match open_fence {
            // Outside a block: three or more backticks open one.
            None if run >= 3 => {
                open_fence = Some(run);
                true
            }
            // Outside a block: one or two backticks are inline code, keep them.
            None => false,
            // Inside a block: only a run at least as long as the opener closes
            // it; shorter runs are block content and are hidden like the rest.
            Some(opener) => {
                if run >= opener {
                    open_fence = None;
                }
                true
            }
        };

        let fill = if blank { ' ' } else { '`' };
        for _ in 0..run {
            result.push(fill);
        }
    }

    result
}
239
/// Check if a path string is a valid file reference to validate.
///
/// Filters out URLs, absolute paths, anchors and strings that do not look
/// like a relative path to a file. Surrounding whitespace is ignored.
///
/// # Valid References
///
/// - Relative paths: `./file.md`, `../docs/guide.md`
/// - Dot-prefixed paths: `.agpm/snippets/file.md`
/// - Simple paths: `docs/guide.md`
///
/// # Invalid References (Filtered Out)
///
/// - URLs and anything containing a scheme separator: `http://example.com`,
///   `file://...`, `ftp://...`
/// - Absolute paths: `/usr/local/file.md`
/// - Anchor links: `#section`
/// - Bare file names without a `/`: `example.md`
/// - Paths whose final component has no extension: `./docs/guide`, `../..`
/// - Empty or whitespace-only strings
///
/// # Arguments
///
/// * `path` - The path string to validate
///
/// # Returns
///
/// `true` if the path should be validated, `false` otherwise
#[must_use]
pub fn is_valid_file_reference(path: &str) -> bool {
    let trimmed = path.trim();

    // Nothing to validate.
    if trimmed.is_empty() {
        return false;
    }

    // URLs (any scheme://...), absolute paths and in-page anchors are not
    // local relative file references.
    if trimmed.contains("://") || trimmed.starts_with('/') || trimmed.starts_with('#') {
        return false;
    }

    // Must contain a path separator (/) to be considered a file path.
    // This filters out simple filenames like "example.md" that aren't paths.
    if !trimmed.contains('/') {
        return false;
    }

    // Must have a file extension. The check looks at the final path component
    // only: a plain "contains a dot" test would be satisfied by the dots of a
    // leading `./` or `../` and accept extension-less paths.
    matches!(Path::new(trimmed).extension(), Some(ext) if !ext.is_empty())
}
302
303/// Validate that file references exist on the filesystem.
304///
305/// This function takes a list of relative file paths and checks if they
306/// exist relative to the given project directory. It returns a list of
307/// missing references for error reporting.
308///
309/// # Arguments
310///
311/// * `references` - List of relative file paths to validate
312/// * `project_dir` - Base directory to resolve relative paths against
313///
314/// # Returns
315///
316/// A list of references that were not found
317///
318/// # Errors
319///
320/// Returns an error if the project directory cannot be accessed
321///
322/// # Examples
323///
324/// ```rust,no_run
325/// # use agpm_cli::markdown::reference_extractor::validate_file_references;
326/// # use std::path::Path;
327/// # fn example() -> anyhow::Result<()> {
328/// let references = vec![
329///     ".agpm/snippets/existing.md".to_string(),
330///     ".agpm/snippets/missing.md".to_string(),
331/// ];
332///
333/// let project_dir = Path::new("/path/to/project");
334/// let missing = validate_file_references(&references, project_dir)?;
335/// // Returns only the missing.md entry
336/// # Ok(())
337/// # }
338/// ```
339pub fn validate_file_references(references: &[String], project_dir: &Path) -> Result<Vec<String>> {
340    let mut missing = Vec::new();
341
342    for reference in references {
343        let full_path = project_dir.join(reference);
344
345        if !full_path.exists() {
346            missing.push(reference.clone());
347        }
348    }
349
350    Ok(missing)
351}
352
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Returns `true` when `refs` holds an entry exactly equal to `path`.
    fn has(refs: &[String], path: &str) -> bool {
        refs.iter().any(|candidate| candidate == path)
    }

    /// Targets of markdown links ending in `.md` are collected.
    #[test]
    fn test_extract_markdown_links() {
        let content = r#"
Check the [documentation](./docs/guide.md) for more info.
Also see [examples](../examples/demo.md).
"#;

        let refs = extract_file_references(content);
        assert_eq!(refs.len(), 2);
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, "../examples/demo.md"));
    }

    /// Backtick-delimited paths in running text are collected.
    #[test]
    fn test_extract_direct_file_paths() {
        let content = r#"
See `.agpm/snippets/example.md` for the implementation.
Check `./docs/overview.md` and `.claude/agents/test.md`.
"#;

        let refs = extract_file_references(content);
        assert!(has(&refs, ".agpm/snippets/example.md"));
        assert!(has(&refs, ".claude/agents/test.md"));
        assert!(has(&refs, "./docs/overview.md"));
    }

    /// URLs never show up as file references.
    #[test]
    fn test_skip_urls() {
        let content = r#"
Visit [GitHub](https://github.com/user/repo) for source.
Or check http://example.com/page.html.
"#;

        let refs = extract_file_references(content);
        assert_eq!(refs.len(), 0);
    }

    /// Paths inside a fenced code block are ignored; those around it are kept.
    #[test]
    fn test_skip_code_blocks() {
        let content = r#"
Normal reference: `.agpm/snippets/real.md`

```bash
# This should be skipped: `.agpm/snippets/code.md`
cat .agpm/snippets/example.md
```

Another real reference: `docs/guide.md`
"#;

        let refs = extract_file_references(content);
        assert!(has(&refs, ".agpm/snippets/real.md"));
        assert!(has(&refs, "docs/guide.md"));
        // Nothing from inside the fenced block may leak out
        assert!(!refs.iter().any(|r| r.contains("code.md")));
    }

    /// A path wrapped in inline code is still treated as a reference.
    #[test]
    fn test_inline_code_path_extraction() {
        let content = "Check `.agpm/real.md` for details.";

        let refs = extract_file_references(content);
        // Inline code is not stripped, so a real-looking path inside it counts
        assert!(has(&refs, ".agpm/real.md"));
    }

    /// The same path mentioned twice is reported once.
    #[test]
    fn test_deduplication() {
        let content = r#"
See `.agpm/snippets/example.md` for details.
Also check `.agpm/snippets/example.md` again.
"#;

        let refs = extract_file_references(content);
        assert_eq!(refs.len(), 1);
    }

    /// Accept/reject table for `is_valid_file_reference`.
    #[test]
    fn test_is_valid_file_reference() {
        // Accepted
        for accepted in ["./docs/guide.md", ".agpm/snippets/file.md", "../parent/file.json"] {
            assert!(is_valid_file_reference(accepted));
        }

        // Rejected
        for rejected in [
            "https://example.com",
            "http://test.com/file.md",
            "/absolute/path.md",
            "#anchor",
            "",
            "no-extension",
        ] {
            assert!(!is_valid_file_reference(rejected));
        }
    }

    /// Only references without a file on disk are reported as missing.
    #[test]
    fn test_validate_file_references() -> Result<()> {
        let temp_dir = tempdir()?;
        let project_dir = temp_dir.path();

        // Lay down the single file that is expected to be found
        let snippets_dir = project_dir.join(".agpm").join("snippets");
        fs::create_dir_all(&snippets_dir)?;
        fs::write(snippets_dir.join("existing.md"), "content")?;

        let references = vec![
            ".agpm/snippets/existing.md".to_string(),
            ".agpm/snippets/missing.md".to_string(),
            "nonexistent/file.md".to_string(),
        ];

        let missing = validate_file_references(&references, project_dir)?;

        assert_eq!(missing.len(), 2);
        assert!(has(&missing, ".agpm/snippets/missing.md"));
        assert!(has(&missing, "nonexistent/file.md"));
        assert!(!has(&missing, ".agpm/snippets/existing.md"));

        Ok(())
    }

    /// Fenced block content is blanked while surrounding text survives.
    #[test]
    fn test_remove_code_blocks() {
        let content = r#"
Normal text with `.agpm/file.md`

```rust
let path = ".agpm/in_code.md";
```

More normal text `.agpm/another.md`
"#;

        let cleaned = remove_code_blocks(content);
        assert!(cleaned.contains(".agpm/file.md"));
        assert!(cleaned.contains(".agpm/another.md"));
        // The block body must have been turned into spaces
        let leaked = cleaned.contains("in_code.md");
        let no_word_has_it = cleaned.split_whitespace().all(|word| !word.contains("in_code.md"));
        assert!(!leaked || no_word_has_it);
    }

    /// Frontmatter is stripped and the document body is kept.
    #[test]
    fn test_remove_frontmatter() {
        let content = r#"---
dependencies:
  agents:
    - path: agents/helper.md
  snippets:
    - path: snippets/utils.md
---

# Main Content

See [documentation](./docs/guide.md) for details.
"#;

        let cleaned = remove_frontmatter(content);
        // Gone: everything that lived in the frontmatter
        for removed in ["dependencies:", "agents/helper.md", "snippets/utils.md"] {
            assert!(!cleaned.contains(removed));
        }
        // Kept: the body
        for kept in ["# Main Content", "./docs/guide.md"] {
            assert!(cleaned.contains(kept));
        }
    }

    /// Dependency paths in frontmatter are not reported as references.
    #[test]
    fn test_extract_with_frontmatter_dependencies() {
        let content = r#"---
dependencies:
  agents:
    - path: agents/helper.md
      version: v1.0.0
  snippets:
    - path: .agpm/snippets/utils.md
---

# Command

See [real reference](./docs/guide.md) for details.
Check `.claude/agents/example.md` for the implementation.
"#;

        let refs = extract_file_references(content);

        // References made by the body are found
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, ".claude/agents/example.md"));

        // Frontmatter dependencies are not
        assert!(!has(&refs, "agents/helper.md"));
        assert!(!has(&refs, ".agpm/snippets/utils.md"));
    }

    /// End-to-end check over a document mixing every supported construct.
    #[test]
    fn test_complex_markdown_with_mixed_references() {
        let content = r#"
# Documentation

See the [main guide](./docs/guide.md) for details.

## Implementation

The core logic is in `.agpm/snippets/core.md` file.

```rust
// This code reference should be ignored
let path = ".agpm/snippets/ignored.md";
```

Also check:
- [Examples](../examples/demo.md)
- External: https://github.com/user/repo
- `.claude/agents/helper.md`

Inline code like `example.md` should be skipped.
"#;

        let refs = extract_file_references(content);

        // Expected to be present
        for expected in [
            "./docs/guide.md",
            ".agpm/snippets/core.md",
            "../examples/demo.md",
            ".claude/agents/helper.md",
        ] {
            assert!(has(&refs, expected));
        }

        // Expected to be absent
        assert!(!refs.iter().any(|r| r.contains("github.com")));
        assert!(!refs.iter().any(|r| r.contains("ignored.md")));
        assert!(!has(&refs, "example.md")); // Bare file name without a path separator
    }
}