agpm_cli/markdown/reference_extractor.rs
1//! File reference extraction and validation for markdown documents.
2//!
3//! This module provides utilities to extract and validate markdown file references
4//! within markdown content. It helps catch broken cross-references before
5//! installation by checking that referenced files actually exist.
6//!
7//! # Supported Reference Types
8//!
9//! - **Markdown links**: `[text](path.md)` - only `.md` files
10//! - **Direct file paths**: `.agpm/snippets/file.md`, `docs/guide.md` - only `.md` files
11//!
12//! # Extraction Rules
13//!
14//! The extractor intelligently filters references to avoid false positives:
15//! - Skips absolute URLs (http://, https://, etc.)
16//! - Skips absolute filesystem paths (starting with /)
17//! - Skips content inside YAML frontmatter (--- delimited)
18//! - Skips content inside code blocks (``` delimited)
19//! - Scans inline code (` delimited) for paths, but skips bare filenames without a `/` such as `example.md`
20//! - Only extracts relative markdown file paths (.md extension)
21//!
22//! # Usage
23//!
24//! ```rust,no_run
25//! use agpm_cli::markdown::reference_extractor::{extract_file_references, validate_file_references};
26//! use std::path::Path;
27//!
28//! # fn example() -> anyhow::Result<()> {
29//! let markdown = r#"
30//! See [documentation](../docs/guide.md) for details.
31//!
32//! Also check `.agpm/snippets/example.md` for examples.
33//! "#;
34//!
35//! let references = extract_file_references(markdown);
36//! // Returns: ["../docs/guide.md", ".agpm/snippets/example.md"]
37//!
38//! // Validate references exist
39//! let project_dir = Path::new("/path/to/project");
40//! let missing = validate_file_references(&references, project_dir)?;
41//! # Ok(())
42//! # }
43//! ```
44
45use anyhow::Result;
46use regex::Regex;
47use std::path::Path;
48
49use crate::markdown::frontmatter::FrontmatterParser;
50
/// Record of a markdown file reference whose target does not exist.
///
/// Pairs the file in which the reference was written with the path it
/// points at, so a broken cross-reference can be reported with both ends.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MissingReference {
    /// Markdown file in which the broken reference appears.
    pub source_file: String,

    /// Path, as written in the reference, that could not be found.
    pub referenced_path: String,
}

impl MissingReference {
    /// Build a record from the two ends of a broken reference.
    ///
    /// # Arguments
    ///
    /// * `source_file` - The file containing the reference
    /// * `referenced_path` - The path that was referenced but not found
    #[must_use]
    pub fn new(source_file: String, referenced_path: String) -> Self {
        Self { source_file, referenced_path }
    }
}
79
80/// Extract markdown file references from markdown content.
81///
82/// This function scans markdown content for markdown file path references and returns
83/// a deduplicated list of relative markdown file paths. It intelligently filters out
84/// URLs, absolute paths, non-markdown files, and references inside code blocks.
85///
86/// # Extracted Reference Types
87///
88/// - Markdown links: `[text](path.md)` → extracts `path.md` (only `.md` files)
89/// - Direct file paths: `.agpm/snippets/file.md` → extracts `.agpm/snippets/file.md` (only `.md` files)
90///
91/// # Filtering Rules
92///
93/// References are excluded if they:
94/// - Start with URL schemes (http://, https://, ftp://, etc.)
95/// - Are absolute paths (starting with /)
96/// - Appear inside YAML frontmatter (--- delimited at file start)
97/// - Appear inside code blocks (``` delimited)
98/// - Appear inside inline code (` delimited)
99/// - Don't have the .md extension
100/// - Contain URL-like patterns (://)
101///
102/// # Arguments
103///
104/// * `content` - The markdown content to scan
105///
106/// # Returns
107///
108/// A vector of unique relative file paths found in the content
109///
110/// # Examples
111///
112/// ```rust,no_run
113/// # use agpm_cli::markdown::reference_extractor::extract_file_references;
114/// let markdown = r#"
115/// Check [docs](./guide.md) and `.agpm/snippets/example.md`.
116///
117/// But not this [external link](https://example.com) or `inline code .md`.
118/// "#;
119///
120/// let refs = extract_file_references(markdown);
121/// assert_eq!(refs.len(), 2);
122/// assert!(refs.contains(&"./guide.md".to_string()));
123/// assert!(refs.contains(&".agpm/snippets/example.md".to_string()));
124/// ```
125#[must_use]
126pub fn extract_file_references(content: &str) -> Vec<String> {
127 let mut references = Vec::new();
128
129 // Remove frontmatter and code blocks to avoid extracting paths from metadata
130 let content_without_frontmatter = remove_frontmatter(content);
131 let content_without_code = remove_code_blocks(&content_without_frontmatter);
132
133 // Extract markdown links: [text](path.md) - only .md files
134 if let Ok(link_regex) = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)") {
135 for cap in link_regex.captures_iter(&content_without_code) {
136 if let Some(path) = cap.get(2) {
137 let path_str = path.as_str();
138 // Only include markdown files
139 if path_str.ends_with(".md") && is_valid_file_reference(path_str) {
140 references.push(path_str.to_string());
141 }
142 }
143 }
144 }
145
146 // Extract direct file paths with markdown extensions
147 // Pattern: paths containing / with .md extension only
148 if let Ok(path_regex) = Regex::new(r#"(?:^|\s|["'`])([./a-zA-Z_][\w./-]*\.md)(?:\s|["'`]|$)"#) {
149 for cap in path_regex.captures_iter(&content_without_code) {
150 if let Some(path) = cap.get(1) {
151 let path_str = path.as_str();
152 if is_valid_file_reference(path_str) {
153 references.push(path_str.to_string());
154 }
155 }
156 }
157 }
158
159 // Deduplicate while preserving order
160 let mut seen = std::collections::HashSet::new();
161 references.retain(|r| seen.insert(r.clone()));
162
163 references
164}
165
166/// Remove YAML frontmatter from markdown content.
167///
168/// This prevents extracting dependency paths from frontmatter metadata,
169/// which are transitive dependencies rather than actual file references in
170/// the content.
171///
172/// # Arguments
173///
174/// * `content` - The markdown content
175///
176/// # Returns
177///
178/// Content with frontmatter removed (--- delimited at the start)
179fn remove_frontmatter(content: &str) -> String {
180 let parser = FrontmatterParser::new();
181 parser.strip_frontmatter(content)
182}
183
/// Blank out fenced code blocks in markdown content.
///
/// Every character that belongs to a fenced block — the fence markers and
/// everything between them, newlines included — is replaced by a single
/// space, so paths that only appear in code samples are never seen by the
/// extraction patterns. Inline code (a run of one or two backticks outside a
/// fence) is copied through unchanged, since it may contain legitimate file
/// path references.
///
/// A fence is any run of three or more backticks. A block is closed only by
/// a run at least as long as the one that opened it, so a four-backtick
/// fence can wrap an example that itself contains a three-backtick fence
/// without exposing the inner content. An unclosed fence hides everything up
/// to the end of the input.
///
/// Fences are recognised anywhere in a line, not only at line start, and
/// `~~~` fences are not recognised.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// Content with fenced code blocks replaced by spaces (``` delimited)
fn remove_code_blocks(content: &str) -> String {
    let mut result = String::with_capacity(content.len());
    // Length of the fence that opened the current block; `None` outside a block.
    let mut open_fence: Option<usize> = None;
    let mut chars = content.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '`' {
            // Ordinary character: hidden inside a block, kept outside.
            result.push(if open_fence.is_some() { ' ' } else { ch });
            continue;
        }

        // Measure the whole run of consecutive backticks.
        let mut run = 1usize;
        while chars.next_if_eq(&'`').is_some() {
            run += 1;
        }

        let replacement = match open_fence {
            // Outside a block, a long enough run opens one.
            None if run >= 3 => {
                open_fence = Some(run);
                ' '
            }
            // Outside a block, a short run is inline code: keep it.
            None => '`',
            // Inside a block, only a run at least as long as the opener
            // closes it; shorter runs are just code and get blanked.
            Some(opening) => {
                if run >= opening {
                    open_fence = None;
                }
                ' '
            }
        };
        for _ in 0..run {
            result.push(replacement);
        }
    }

    result
}
239
/// Check if a path string is a valid file reference to validate.
///
/// This function filters out URLs, absolute paths, and other patterns
/// that should not be validated as local file references. Surrounding
/// whitespace is ignored.
///
/// # Valid References
///
/// - Relative paths: `./file.md`, `../docs/guide.md`
/// - Dot-prefixed paths: `.agpm/snippets/file.md`
/// - Simple paths: `docs/guide.md`
///
/// # Invalid References (Filtered Out)
///
/// - Anything containing `://`: `http://example.com`, `file://...`, `ftp://...`
/// - Absolute paths: `/usr/local/file.md`
/// - Anchor links: `#section`
/// - Bare filenames without a path separator: `example.md`
/// - Paths whose final component has no extension: `./docs/guide`, `../docs/..`
/// - Empty or whitespace-only strings
///
/// # Arguments
///
/// * `path` - The path string to validate
///
/// # Returns
///
/// `true` if the path should be validated, `false` otherwise
#[must_use]
pub fn is_valid_file_reference(path: &str) -> bool {
    let trimmed = path.trim();

    // Skip empty strings
    if trimmed.is_empty() {
        return false;
    }

    // Skip URLs (any scheme://...)
    if trimmed.contains("://") {
        return false;
    }

    // Skip absolute paths and in-page anchor links
    if trimmed.starts_with('/') || trimmed.starts_with('#') {
        return false;
    }

    // Must contain a path separator (/) to be considered a file path.
    // This filters out simple filenames like "example.md" that aren't paths.
    if !trimmed.contains('/') {
        return false;
    }

    // Must have a file extension. The extension is taken from the final path
    // component: a plain `contains('.')` would be satisfied by the leading
    // dots of `./docs/guide` or `../docs/..`, which have no extension at all.
    Path::new(trimmed)
        .extension()
        .map_or(false, |ext| !ext.is_empty())
}
302
303/// Validate that file references exist on the filesystem.
304///
305/// This function takes a list of relative file paths and checks if they
306/// exist relative to the given project directory. It returns a list of
307/// missing references for error reporting.
308///
309/// # Arguments
310///
311/// * `references` - List of relative file paths to validate
312/// * `project_dir` - Base directory to resolve relative paths against
313///
314/// # Returns
315///
316/// A list of references that were not found
317///
318/// # Errors
319///
320/// Returns an error if the project directory cannot be accessed
321///
322/// # Examples
323///
324/// ```rust,no_run
325/// # use agpm_cli::markdown::reference_extractor::validate_file_references;
326/// # use std::path::Path;
327/// # fn example() -> anyhow::Result<()> {
328/// let references = vec![
329/// ".agpm/snippets/existing.md".to_string(),
330/// ".agpm/snippets/missing.md".to_string(),
331/// ];
332///
333/// let project_dir = Path::new("/path/to/project");
334/// let missing = validate_file_references(&references, project_dir)?;
335/// // Returns only the missing.md entry
336/// # Ok(())
337/// # }
338/// ```
339pub fn validate_file_references(references: &[String], project_dir: &Path) -> Result<Vec<String>> {
340 let mut missing = Vec::new();
341
342 for reference in references {
343 let full_path = project_dir.join(reference);
344
345 if !full_path.exists() {
346 missing.push(reference.clone());
347 }
348 }
349
350 Ok(missing)
351}
352
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Returns `true` when `refs` holds an entry equal to `path`.
    fn has(refs: &[String], path: &str) -> bool {
        refs.iter().any(|r| r == path)
    }

    /// Markdown link targets ending in `.md` are extracted.
    #[test]
    fn test_extract_markdown_links() {
        let content = r#"
Check the [documentation](./docs/guide.md) for more info.
Also see [examples](../examples/demo.md).
"#;

        let refs = extract_file_references(content);
        assert_eq!(refs.len(), 2);
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, "../examples/demo.md"));
    }

    /// Paths written as inline code are picked up by the direct-path pattern.
    #[test]
    fn test_extract_direct_file_paths() {
        let content = r#"
See `.agpm/snippets/example.md` for the implementation.
Check `./docs/overview.md` and `.claude/agents/test.md`.
"#;

        let refs = extract_file_references(content);
        assert!(has(&refs, ".agpm/snippets/example.md"));
        assert!(has(&refs, ".claude/agents/test.md"));
        assert!(has(&refs, "./docs/overview.md"));
    }

    /// URLs never count as file references.
    #[test]
    fn test_skip_urls() {
        let content = r#"
Visit [GitHub](https://github.com/user/repo) for source.
Or check http://example.com/page.html.
"#;

        let refs = extract_file_references(content);
        assert!(refs.is_empty());
    }

    /// Paths inside a fenced code block are ignored; those around it are kept.
    #[test]
    fn test_skip_code_blocks() {
        let content = r#"
Normal reference: `.agpm/snippets/real.md`

```bash
# This should be skipped: `.agpm/snippets/code.md`
cat .agpm/snippets/example.md
```

Another real reference: `docs/guide.md`
"#;

        let refs = extract_file_references(content);
        assert!(has(&refs, ".agpm/snippets/real.md"));
        assert!(has(&refs, "docs/guide.md"));
        // Nothing from inside the fenced block may leak through.
        assert!(refs.iter().all(|r| !r.contains("code.md")));
    }

    /// A path in inline code is still extracted when it looks like a real path.
    #[test]
    fn test_inline_code_path_extraction() {
        let content = "Check `.agpm/real.md` for details.";

        let refs = extract_file_references(content);
        assert!(has(&refs, ".agpm/real.md"));
    }

    /// The same path mentioned twice is reported once.
    #[test]
    fn test_deduplication() {
        let content = r#"
See `.agpm/snippets/example.md` for details.
Also check `.agpm/snippets/example.md` again.
"#;

        let refs = extract_file_references(content);
        assert_eq!(refs.len(), 1);
    }

    /// Spot checks for the accept/reject rules of `is_valid_file_reference`.
    #[test]
    fn test_is_valid_file_reference() {
        // Accepted
        for path in ["./docs/guide.md", ".agpm/snippets/file.md", "../parent/file.json"] {
            assert!(is_valid_file_reference(path));
        }

        // Rejected
        for path in [
            "https://example.com",
            "http://test.com/file.md",
            "/absolute/path.md",
            "#anchor",
            "",
            "no-extension",
        ] {
            assert!(!is_valid_file_reference(path));
        }
    }

    /// Only references whose target is absent from disk are reported.
    #[test]
    fn test_validate_file_references() -> Result<()> {
        let temp_dir = tempdir()?;
        let project_dir = temp_dir.path();

        // Lay down one real file so that exactly one reference resolves.
        let snippets_dir = project_dir.join(".agpm").join("snippets");
        fs::create_dir_all(&snippets_dir)?;
        fs::write(snippets_dir.join("existing.md"), "content")?;

        let references = vec![
            ".agpm/snippets/existing.md".to_string(),
            ".agpm/snippets/missing.md".to_string(),
            "nonexistent/file.md".to_string(),
        ];

        let missing = validate_file_references(&references, project_dir)?;

        assert_eq!(missing.len(), 2);
        assert!(has(&missing, ".agpm/snippets/missing.md"));
        assert!(has(&missing, "nonexistent/file.md"));
        assert!(!has(&missing, ".agpm/snippets/existing.md"));

        Ok(())
    }

    /// Fenced content disappears while inline code around it survives.
    #[test]
    fn test_remove_code_blocks() {
        let content = r#"
Normal text with `.agpm/file.md`

```rust
let path = ".agpm/in_code.md";
```

More normal text `.agpm/another.md`
"#;

        let cleaned = remove_code_blocks(content);
        assert!(cleaned.contains(".agpm/file.md"));
        assert!(cleaned.contains(".agpm/another.md"));
        // The needle has no whitespace, so a substring check is equivalent to
        // checking every whitespace-separated word.
        assert!(!cleaned.contains("in_code.md"));
    }

    /// Frontmatter is dropped; the document body is left in place.
    #[test]
    fn test_remove_frontmatter() {
        let content = r#"---
dependencies:
  agents:
    - path: agents/helper.md
  snippets:
    - path: snippets/utils.md
---

# Main Content

See [documentation](./docs/guide.md) for details.
"#;

        let cleaned = remove_frontmatter(content);
        // Frontmatter is gone...
        for gone in ["dependencies:", "agents/helper.md", "snippets/utils.md"] {
            assert!(!cleaned.contains(gone));
        }
        // ...and the body is intact.
        for kept in ["# Main Content", "./docs/guide.md"] {
            assert!(cleaned.contains(kept));
        }
    }

    /// Dependency paths declared in frontmatter are not reported as references.
    #[test]
    fn test_extract_with_frontmatter_dependencies() {
        let content = r#"---
dependencies:
  agents:
    - path: agents/helper.md
      version: v1.0.0
  snippets:
    - path: .agpm/snippets/utils.md
---

# Command

See [real reference](./docs/guide.md) for details.
Check `.claude/agents/example.md` for the implementation.
"#;

        let refs = extract_file_references(content);

        // References from the body are found.
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, ".claude/agents/example.md"));

        // Frontmatter dependencies are not.
        assert!(!has(&refs, "agents/helper.md"));
        assert!(!has(&refs, ".agpm/snippets/utils.md"));
    }

    /// End-to-end check on a document mixing every kind of reference.
    #[test]
    fn test_complex_markdown_with_mixed_references() {
        let content = r#"
# Documentation

See the [main guide](./docs/guide.md) for details.

## Implementation

The core logic is in `.agpm/snippets/core.md` file.

```rust
// This code reference should be ignored
let path = ".agpm/snippets/ignored.md";
```

Also check:
- [Examples](../examples/demo.md)
- External: https://github.com/user/repo
- `.claude/agents/helper.md`

Inline code like `example.md` should be skipped.
"#;

        let refs = extract_file_references(content);

        // Should extract these
        assert!(has(&refs, "./docs/guide.md"));
        assert!(has(&refs, ".agpm/snippets/core.md"));
        assert!(has(&refs, "../examples/demo.md"));
        assert!(has(&refs, ".claude/agents/helper.md"));

        // Should NOT extract these
        assert!(refs.iter().all(|r| !r.contains("github.com")));
        assert!(refs.iter().all(|r| !r.contains("ignored.md")));
        // Bare filename with no path separator, so it is not treated as a path.
        assert!(!has(&refs, "example.md"));
    }
}