agpm_cli/markdown/
reference_extractor.rs

//! File reference extraction and validation for markdown documents.
//!
//! This module provides utilities to extract and validate markdown file references
//! within markdown content. It helps catch broken cross-references before
//! installation by checking that referenced files actually exist.
//!
//! # Supported Reference Types
//!
//! - **Markdown links**: `[text](path.md)` - only `.md` files
//! - **Direct file paths**: `.agpm/snippets/file.md`, `docs/guide.md` - only `.md` files
//!
//! # Extraction Rules
//!
//! The extractor filters references to avoid false positives:
//! - Skips absolute URLs (http://, https://, etc.)
//! - Skips absolute filesystem paths (starting with /)
//! - Skips content inside YAML frontmatter (--- delimited)
//! - Skips content inside fenced code blocks (``` delimited)
//! - Still scans inline code (` delimited), because inline code is where file
//!   paths are usually written; bare file names without a `/` are ignored
//! - Only extracts relative markdown file paths (.md extension)
//!
//! # Usage
//!
//! ```rust,no_run
//! use agpm_cli::markdown::reference_extractor::{extract_file_references, validate_file_references};
//! use std::path::Path;
//!
//! # fn example() -> anyhow::Result<()> {
//! let markdown = r#"
//! See [documentation](../docs/guide.md) for details.
//!
//! Also check `.agpm/snippets/example.md` for examples.
//! "#;
//!
//! let references = extract_file_references(markdown);
//! // Returns: ["../docs/guide.md", ".agpm/snippets/example.md"]
//!
//! // Validate references exist
//! let project_dir = Path::new("/path/to/project");
//! let missing = validate_file_references(&references, project_dir)?;
//! # Ok(())
//! # }
//! ```
44
45use anyhow::Result;
46use regex::Regex;
47use std::path::Path;
48
/// Record of a reference that points at a file which does not exist.
///
/// Pairs the markdown file in which the reference was written with the path
/// that could not be found on the filesystem.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MissingReference {
    /// Markdown file in which the broken reference appears
    pub source_file: String,

    /// Path that was referenced but could not be found
    pub referenced_path: String,
}

impl MissingReference {
    /// Build a missing-reference record from its two parts.
    ///
    /// # Arguments
    ///
    /// * `source_file` - The file containing the reference
    /// * `referenced_path` - The path that was referenced but not found
    #[must_use]
    pub fn new(source_file: String, referenced_path: String) -> Self {
        MissingReference {
            source_file,
            referenced_path,
        }
    }
}
77
78/// Extract markdown file references from markdown content.
79///
80/// This function scans markdown content for markdown file path references and returns
81/// a deduplicated list of relative markdown file paths. It intelligently filters out
82/// URLs, absolute paths, non-markdown files, and references inside code blocks.
83///
84/// # Extracted Reference Types
85///
86/// - Markdown links: `[text](path.md)` → extracts `path.md` (only `.md` files)
87/// - Direct file paths: `.agpm/snippets/file.md` → extracts `.agpm/snippets/file.md` (only `.md` files)
88///
89/// # Filtering Rules
90///
91/// References are excluded if they:
92/// - Start with URL schemes (http://, https://, ftp://, etc.)
93/// - Are absolute paths (starting with /)
94/// - Appear inside YAML frontmatter (--- delimited at file start)
95/// - Appear inside code blocks (``` delimited)
96/// - Appear inside inline code (` delimited)
97/// - Don't have the .md extension
98/// - Contain URL-like patterns (://)
99///
100/// # Arguments
101///
102/// * `content` - The markdown content to scan
103///
104/// # Returns
105///
106/// A vector of unique relative file paths found in the content
107///
108/// # Examples
109///
110/// ```rust,no_run
111/// # use agpm_cli::markdown::reference_extractor::extract_file_references;
112/// let markdown = r#"
113/// Check [docs](./guide.md) and `.agpm/snippets/example.md`.
114///
115/// But not this [external link](https://example.com) or `inline code .md`.
116/// "#;
117///
118/// let refs = extract_file_references(markdown);
119/// assert_eq!(refs.len(), 2);
120/// assert!(refs.contains(&"./guide.md".to_string()));
121/// assert!(refs.contains(&".agpm/snippets/example.md".to_string()));
122/// ```
123#[must_use]
124pub fn extract_file_references(content: &str) -> Vec<String> {
125    let mut references = Vec::new();
126
127    // Remove frontmatter and code blocks to avoid extracting paths from metadata
128    let content_without_frontmatter = remove_frontmatter(content);
129    let content_without_code = remove_code_blocks(&content_without_frontmatter);
130
131    // Extract markdown links: [text](path.md) - only .md files
132    if let Ok(link_regex) = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)") {
133        for cap in link_regex.captures_iter(&content_without_code) {
134            if let Some(path) = cap.get(2) {
135                let path_str = path.as_str();
136                // Only include markdown files
137                if path_str.ends_with(".md") && is_valid_file_reference(path_str) {
138                    references.push(path_str.to_string());
139                }
140            }
141        }
142    }
143
144    // Extract direct file paths with markdown extensions
145    // Pattern: paths containing / with .md extension only
146    if let Ok(path_regex) = Regex::new(r#"(?:^|\s|["'`])([./a-zA-Z_][\w./-]*\.md)(?:\s|["'`]|$)"#) {
147        for cap in path_regex.captures_iter(&content_without_code) {
148            if let Some(path) = cap.get(1) {
149                let path_str = path.as_str();
150                if is_valid_file_reference(path_str) {
151                    references.push(path_str.to_string());
152                }
153            }
154        }
155    }
156
157    // Deduplicate while preserving order
158    let mut seen = std::collections::HashSet::new();
159    references.retain(|r| seen.insert(r.clone()));
160
161    references
162}
163
/// Remove YAML frontmatter from markdown content.
///
/// This prevents extracting dependency paths from frontmatter metadata,
/// which are transitive dependencies rather than actual file references in
/// the content.
///
/// Frontmatter is recognized only when the very first line is `---`. It ends
/// at the next line consisting solely of `---`. Each line may end in `\n` or
/// `\r\n` independently, so files with mixed line endings are handled, as are
/// an empty frontmatter block and a closing delimiter that is the last line of
/// the file without a trailing newline.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// The content following the closing `---` line. If the content does not start
/// with frontmatter, or the frontmatter is never closed, the original content
/// is returned unchanged.
fn remove_frontmatter(content: &str) -> String {
    // The opening delimiter must be the first line of the file.
    let body = match content
        .strip_prefix("---\n")
        .or_else(|| content.strip_prefix("---\r\n"))
    {
        Some(rest) => rest,
        None => return content.to_string(),
    };

    // Walk line by line, tracking the byte offset just past each line so the
    // remainder can be sliced out once the closing delimiter is found.
    let mut offset = 0;
    for line in body.split_inclusive('\n') {
        offset += line.len();

        // Compare without the line terminator, whichever style it uses.
        let bare = line.strip_suffix('\n').unwrap_or(line);
        let bare = bare.strip_suffix('\r').unwrap_or(bare);

        if bare == "---" {
            return body[offset..].to_string();
        }
    }

    // No closing delimiter found, return original content
    content.to_string()
}
205
/// Blank out fenced code blocks in markdown content.
///
/// Paths that only appear in code examples should not be treated as real file
/// references, so everything between fences is overwritten with spaces. A
/// fence is any run of three or more backticks, wherever it occurs; each such
/// run flips the "inside a block" state and is itself replaced by the same
/// number of spaces. Runs of one or two backticks (inline code) are always
/// copied through unchanged, because inline code may hold legitimate paths.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// A copy of the content in which every character inside a fenced block,
/// newlines included, has been replaced by a single space.
fn remove_code_blocks(content: &str) -> String {
    let mut cleaned = String::with_capacity(content.len());
    let mut inside_fence = false;
    let mut rest = content;

    while let Some(first) = rest.chars().next() {
        if first != '`' {
            // Ordinary character: keep it outside a fence, blank it inside.
            cleaned.push(if inside_fence { ' ' } else { first });
            rest = &rest[first.len_utf8()..];
            continue;
        }

        // Length of the backtick run starting here (a backtick is one byte).
        let run = rest.len() - rest.trim_start_matches('`').len();

        if run >= 3 {
            // Fence delimiter: toggle state and blank the delimiter itself.
            inside_fence = !inside_fence;
            cleaned.extend(std::iter::repeat(' ').take(run));
        } else {
            // Inline-code marker: passed through even inside a fence.
            cleaned.push_str(&rest[..run]);
        }

        rest = &rest[run..];
    }

    cleaned
}
262
/// Check if a path string is a valid file reference to validate.
///
/// This function filters out URLs, absolute paths, and other patterns
/// that should not be validated as local file references. Leading and
/// trailing whitespace is ignored.
///
/// # Valid References
///
/// - Relative paths: `./file.md`, `../docs/guide.md`
/// - Dot-prefixed paths: `.agpm/snippets/file.md`
/// - Simple paths: `docs/guide.md`
///
/// # Invalid References (Filtered Out)
///
/// - URLs: `http://example.com`, `https://github.com/...`
/// - Absolute paths: `/usr/local/file.md`
/// - Paths with URL schemes: `file://...`, `ftp://...`
/// - Anchor links: `#section`
/// - Bare file names without a `/`: `example.md`
/// - Paths whose final component has no extension: `../parent/file`, `./docs`
/// - Empty or whitespace-only strings
///
/// # Arguments
///
/// * `path` - The path string to validate
///
/// # Returns
///
/// `true` if the path should be validated, `false` otherwise
#[must_use]
pub fn is_valid_file_reference(path: &str) -> bool {
    let trimmed = path.trim();

    // Skip empty strings
    if trimmed.is_empty() {
        return false;
    }

    // Skip URLs (any scheme://...)
    if trimmed.contains("://") {
        return false;
    }

    // Skip absolute paths and anchor links
    if trimmed.starts_with('/') || trimmed.starts_with('#') {
        return false;
    }

    // Must contain a path separator (/) to be considered a file path
    // This filters out simple filenames like "example.md" that aren't paths
    if !trimmed.contains('/') {
        return false;
    }

    // Must have a file extension on the final component. Merely looking for a
    // '.' anywhere would be satisfied by `./`, `../` or a dot-directory such
    // as `.agpm/`, letting extension-less paths through.
    Path::new(trimmed)
        .extension()
        .map_or(false, |ext| !ext.is_empty())
}
325
326/// Validate that file references exist on the filesystem.
327///
328/// This function takes a list of relative file paths and checks if they
329/// exist relative to the given project directory. It returns a list of
330/// missing references for error reporting.
331///
332/// # Arguments
333///
334/// * `references` - List of relative file paths to validate
335/// * `project_dir` - Base directory to resolve relative paths against
336///
337/// # Returns
338///
339/// A list of references that were not found
340///
341/// # Errors
342///
343/// Returns an error if the project directory cannot be accessed
344///
345/// # Examples
346///
347/// ```rust,no_run
348/// # use agpm_cli::markdown::reference_extractor::validate_file_references;
349/// # use std::path::Path;
350/// # fn example() -> anyhow::Result<()> {
351/// let references = vec![
352///     ".agpm/snippets/existing.md".to_string(),
353///     ".agpm/snippets/missing.md".to_string(),
354/// ];
355///
356/// let project_dir = Path::new("/path/to/project");
357/// let missing = validate_file_references(&references, project_dir)?;
358/// // Returns only the missing.md entry
359/// # Ok(())
360/// # }
361/// ```
362pub fn validate_file_references(references: &[String], project_dir: &Path) -> Result<Vec<String>> {
363    let mut missing = Vec::new();
364
365    for reference in references {
366        let full_path = project_dir.join(reference);
367
368        if !full_path.exists() {
369            missing.push(reference.clone());
370        }
371    }
372
373    Ok(missing)
374}
375
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// True when `refs` holds an entry exactly equal to `path`.
    fn has(refs: &[String], path: &str) -> bool {
        refs.iter().any(|entry| entry == path)
    }

    /// True when any entry of `refs` contains `needle` as a substring.
    fn mentions(refs: &[String], needle: &str) -> bool {
        refs.iter().any(|entry| entry.contains(needle))
    }

    // Link targets ending in .md are picked up from `[text](target)` syntax.
    #[test]
    fn test_extract_markdown_links() {
        let refs = extract_file_references(
            r#"
Check the [documentation](./docs/guide.md) for more info.
Also see [examples](../examples/demo.md).
"#,
        );

        assert_eq!(refs.len(), 2);
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, "../examples/demo.md"));
    }

    // Bare paths wrapped in backticks are picked up as direct file paths.
    #[test]
    fn test_extract_direct_file_paths() {
        let refs = extract_file_references(
            r#"
See `.agpm/snippets/example.md` for the implementation.
Check `./docs/overview.md` and `.claude/agents/test.md`.
"#,
        );

        for expected in [
            ".agpm/snippets/example.md",
            ".claude/agents/test.md",
            "./docs/overview.md",
        ] {
            assert!(has(&refs, expected));
        }
    }

    // URLs never count as file references.
    #[test]
    fn test_skip_urls() {
        let refs = extract_file_references(
            r#"
Visit [GitHub](https://github.com/user/repo) for source.
Or check http://example.com/page.html.
"#,
        );

        assert_eq!(refs.len(), 0);
    }

    // Paths inside a fenced block are ignored; those around it are kept.
    #[test]
    fn test_skip_code_blocks() {
        let refs = extract_file_references(
            r#"
Normal reference: `.agpm/snippets/real.md`

```bash
# This should be skipped: `.agpm/snippets/code.md`
cat .agpm/snippets/example.md
```

Another real reference: `docs/guide.md`
"#,
        );

        assert!(has(&refs, ".agpm/snippets/real.md"));
        assert!(has(&refs, "docs/guide.md"));
        // Should not contain references from code block
        assert!(!mentions(&refs, "code.md"));
    }

    // File paths in inline code are still extracted if they look like actual paths.
    #[test]
    fn test_inline_code_path_extraction() {
        let refs = extract_file_references("Check `.agpm/real.md` for details.");

        assert!(has(&refs, ".agpm/real.md"));
    }

    // The same path mentioned twice is reported once.
    #[test]
    fn test_deduplication() {
        let refs = extract_file_references(
            r#"
See `.agpm/snippets/example.md` for details.
Also check `.agpm/snippets/example.md` again.
"#,
        );

        assert_eq!(refs.len(), 1);
    }

    #[test]
    fn test_is_valid_file_reference() {
        // Valid references
        for accepted in [
            "./docs/guide.md",
            ".agpm/snippets/file.md",
            "../parent/file.json",
        ] {
            assert!(is_valid_file_reference(accepted));
        }

        // Invalid references
        for rejected in [
            "https://example.com",
            "http://test.com/file.md",
            "/absolute/path.md",
            "#anchor",
            "",
            "no-extension",
        ] {
            assert!(!is_valid_file_reference(rejected));
        }
    }

    // Only references with no file under the project directory are reported.
    #[test]
    fn test_validate_file_references() -> Result<()> {
        let workspace = tempdir()?;
        let root = workspace.path();

        // Create some test files
        let snippets = root.join(".agpm").join("snippets");
        fs::create_dir_all(&snippets)?;
        fs::write(snippets.join("existing.md"), "content")?;

        let references: Vec<String> = [
            ".agpm/snippets/existing.md",
            ".agpm/snippets/missing.md",
            "nonexistent/file.md",
        ]
        .iter()
        .map(|path| path.to_string())
        .collect();

        let missing = validate_file_references(&references, root)?;

        assert_eq!(missing.len(), 2);
        assert!(has(&missing, ".agpm/snippets/missing.md"));
        assert!(has(&missing, "nonexistent/file.md"));
        assert!(!has(&missing, ".agpm/snippets/existing.md"));

        Ok(())
    }

    #[test]
    fn test_remove_code_blocks() {
        let cleaned = remove_code_blocks(
            r#"
Normal text with `.agpm/file.md`

```rust
let path = ".agpm/in_code.md";
```

More normal text `.agpm/another.md`
"#,
        );

        assert!(cleaned.contains(".agpm/file.md"));
        assert!(cleaned.contains(".agpm/another.md"));
        // Code block content should be replaced with spaces
        let leaked_into_text = cleaned.contains("in_code.md")
            && cleaned.split_whitespace().any(|word| word.contains("in_code.md"));
        assert!(!leaked_into_text);
    }

    #[test]
    fn test_remove_frontmatter() {
        let cleaned = remove_frontmatter(
            r#"---
dependencies:
  agents:
    - path: agents/helper.md
  snippets:
    - path: snippets/utils.md
---

# Main Content

See [documentation](./docs/guide.md) for details.
"#,
        );

        // Frontmatter should be removed
        for stripped in ["dependencies:", "agents/helper.md", "snippets/utils.md"] {
            assert!(!cleaned.contains(stripped));
        }
        // Content should remain
        for kept in ["# Main Content", "./docs/guide.md"] {
            assert!(cleaned.contains(kept));
        }
    }

    #[test]
    fn test_extract_with_frontmatter_dependencies() {
        let refs = extract_file_references(
            r#"---
dependencies:
  agents:
    - path: agents/helper.md
      version: v1.0.0
  snippets:
    - path: .agpm/snippets/utils.md
---

# Command

See [real reference](./docs/guide.md) for details.
Check `.claude/agents/example.md` for the implementation.
"#,
        );

        // Should extract content references
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, ".claude/agents/example.md"));

        // Should NOT extract frontmatter dependencies
        assert!(!has(&refs, "agents/helper.md"));
        assert!(!has(&refs, ".agpm/snippets/utils.md"));
    }

    #[test]
    fn test_complex_markdown_with_mixed_references() {
        let refs = extract_file_references(
            r#"
# Documentation

See the [main guide](./docs/guide.md) for details.

## Implementation

The core logic is in `.agpm/snippets/core.md` file.

```rust
// This code reference should be ignored
let path = ".agpm/snippets/ignored.md";
```

Also check:
- [Examples](../examples/demo.md)
- External: https://github.com/user/repo
- `.claude/agents/helper.md`

Inline code like `example.md` should be skipped.
"#,
        );

        // Should extract these
        for expected in [
            "./docs/guide.md",
            ".agpm/snippets/core.md",
            "../examples/demo.md",
            ".claude/agents/helper.md",
        ] {
            assert!(has(&refs, expected));
        }

        // Should NOT extract these
        assert!(!mentions(&refs, "github.com"));
        assert!(!mentions(&refs, "ignored.md"));
        assert!(!has(&refs, "example.md")); // Bare file name without a path separator
    }
}