agpm_cli/markdown/reference_extractor.rs
1//! File reference extraction and validation for markdown documents.
2//!
3//! This module provides utilities to extract and validate markdown file references
4//! within markdown content. It helps catch broken cross-references before
5//! installation by checking that referenced files actually exist.
6//!
7//! # Supported Reference Types
8//!
9//! - **Markdown links**: `[text](path.md)` - only `.md` files
10//! - **Direct file paths**: `.agpm/snippets/file.md`, `docs/guide.md` - only `.md` files
11//!
12//! # Extraction Rules
13//!
14//! The extractor intelligently filters references to avoid false positives:
15//! - Skips absolute URLs (http://, https://, etc.)
16//! - Skips absolute filesystem paths (starting with /)
17//! - Skips content inside YAML frontmatter (--- delimited)
18//! - Skips content inside code blocks (``` delimited)
//! - Does not skip inline code (` delimited): paths written as inline code are still extracted
20//! - Only extracts relative markdown file paths (.md extension)
21//!
22//! # Usage
23//!
24//! ```rust,no_run
25//! use agpm_cli::markdown::reference_extractor::{extract_file_references, validate_file_references};
26//! use std::path::Path;
27//!
28//! # fn example() -> anyhow::Result<()> {
29//! let markdown = r#"
30//! See [documentation](../docs/guide.md) for details.
31//!
32//! Also check `.agpm/snippets/example.md` for examples.
33//! "#;
34//!
35//! let references = extract_file_references(markdown);
36//! // Returns: ["../docs/guide.md", ".agpm/snippets/example.md"]
37//!
38//! // Validate references exist
39//! let project_dir = Path::new("/path/to/project");
40//! let missing = validate_file_references(&references, project_dir)?;
41//! # Ok(())
42//! # }
43//! ```
44
45use anyhow::Result;
46use regex::Regex;
47use std::path::Path;
48
/// Record of a file reference whose target could not be found.
///
/// Pairs the markdown file that contains the reference with the path the
/// reference points at, for a target that does not exist on the filesystem.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MissingReference {
    /// Markdown file in which the broken reference appears.
    pub source_file: String,

    /// Path named by the reference that was not found.
    pub referenced_path: String,
}

impl MissingReference {
    /// Build a missing-reference record from its two parts.
    ///
    /// # Arguments
    ///
    /// * `source_file` - File that contains the reference
    /// * `referenced_path` - Referenced path that was not found
    #[must_use]
    pub fn new(source_file: String, referenced_path: String) -> Self {
        Self { source_file, referenced_path }
    }
}
77
78/// Extract markdown file references from markdown content.
79///
80/// This function scans markdown content for markdown file path references and returns
81/// a deduplicated list of relative markdown file paths. It intelligently filters out
82/// URLs, absolute paths, non-markdown files, and references inside code blocks.
83///
84/// # Extracted Reference Types
85///
86/// - Markdown links: `[text](path.md)` → extracts `path.md` (only `.md` files)
87/// - Direct file paths: `.agpm/snippets/file.md` → extracts `.agpm/snippets/file.md` (only `.md` files)
88///
89/// # Filtering Rules
90///
91/// References are excluded if they:
92/// - Start with URL schemes (http://, https://, ftp://, etc.)
93/// - Are absolute paths (starting with /)
94/// - Appear inside YAML frontmatter (--- delimited at file start)
95/// - Appear inside code blocks (``` delimited)
96/// - Appear inside inline code (` delimited)
97/// - Don't have the .md extension
98/// - Contain URL-like patterns (://)
99///
100/// # Arguments
101///
102/// * `content` - The markdown content to scan
103///
104/// # Returns
105///
106/// A vector of unique relative file paths found in the content
107///
108/// # Examples
109///
110/// ```rust,no_run
111/// # use agpm_cli::markdown::reference_extractor::extract_file_references;
112/// let markdown = r#"
113/// Check [docs](./guide.md) and `.agpm/snippets/example.md`.
114///
115/// But not this [external link](https://example.com) or `inline code .md`.
116/// "#;
117///
118/// let refs = extract_file_references(markdown);
119/// assert_eq!(refs.len(), 2);
120/// assert!(refs.contains(&"./guide.md".to_string()));
121/// assert!(refs.contains(&".agpm/snippets/example.md".to_string()));
122/// ```
123#[must_use]
124pub fn extract_file_references(content: &str) -> Vec<String> {
125 let mut references = Vec::new();
126
127 // Remove frontmatter and code blocks to avoid extracting paths from metadata
128 let content_without_frontmatter = remove_frontmatter(content);
129 let content_without_code = remove_code_blocks(&content_without_frontmatter);
130
131 // Extract markdown links: [text](path.md) - only .md files
132 if let Ok(link_regex) = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)") {
133 for cap in link_regex.captures_iter(&content_without_code) {
134 if let Some(path) = cap.get(2) {
135 let path_str = path.as_str();
136 // Only include markdown files
137 if path_str.ends_with(".md") && is_valid_file_reference(path_str) {
138 references.push(path_str.to_string());
139 }
140 }
141 }
142 }
143
144 // Extract direct file paths with markdown extensions
145 // Pattern: paths containing / with .md extension only
146 if let Ok(path_regex) = Regex::new(r#"(?:^|\s|["'`])([./a-zA-Z_][\w./-]*\.md)(?:\s|["'`]|$)"#) {
147 for cap in path_regex.captures_iter(&content_without_code) {
148 if let Some(path) = cap.get(1) {
149 let path_str = path.as_str();
150 if is_valid_file_reference(path_str) {
151 references.push(path_str.to_string());
152 }
153 }
154 }
155 }
156
157 // Deduplicate while preserving order
158 let mut seen = std::collections::HashSet::new();
159 references.retain(|r| seen.insert(r.clone()));
160
161 references
162}
163
/// Remove YAML frontmatter from markdown content.
///
/// This prevents extracting dependency paths from frontmatter metadata,
/// which are transitive dependencies rather than actual file references in
/// the content.
///
/// Frontmatter is recognised only when the very first line is exactly `---`.
/// It ends at the next line that is exactly `---`. Each line may end in
/// `\n`, `\r\n`, or (for the last line) nothing at all, so mixed line endings,
/// a closing delimiter at end of file, and empty frontmatter are all handled.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// Content with frontmatter removed (--- delimited at the start). The content
/// is returned unchanged when it does not start with frontmatter or when no
/// closing delimiter is found.
fn remove_frontmatter(content: &str) -> String {
    // Text following the opening delimiter line, if there is one.
    let after_open = match content
        .strip_prefix("---\n")
        .or_else(|| content.strip_prefix("---\r\n"))
    {
        Some(rest) => rest,
        None => return content.to_string(),
    };

    // Walk line by line so the line-ending style of each line is judged on
    // its own instead of being guessed once for the whole document.
    let mut consumed = 0;
    for line in after_open.split_inclusive('\n') {
        consumed += line.len();

        let text = line.strip_suffix('\n').unwrap_or(line);
        let text = text.strip_suffix('\r').unwrap_or(text);
        if text == "---" {
            // Everything after the closing delimiter line is real content.
            return after_open[consumed..].to_string();
        }
    }

    // No closing delimiter found, return original content
    content.to_string()
}
205
/// Remove code blocks from markdown content.
///
/// This helps prevent extracting file paths that appear in code block examples,
/// which should not be validated as actual file references. Inline code (one or
/// two backticks outside a code block) is preserved since it may contain
/// legitimate file path references.
///
/// A run of three or more backticks opens a code block. The block is closed
/// only by a run that is at least as long as the one that opened it, so a
/// four-backtick fence can wrap three-backtick examples without the inner
/// fences ending the block early.
///
/// # Arguments
///
/// * `content` - The markdown content
///
/// # Returns
///
/// Content with code blocks removed (``` delimited). Every character of a
/// block, including its fences, is replaced by a single space so the
/// surrounding text keeps its character positions.
fn remove_code_blocks(content: &str) -> String {
    let mut result = String::with_capacity(content.len());
    // Length of the backtick run that opened the current block; `None` when
    // outside a block.
    let mut open_fence: Option<usize> = None;
    let mut chars = content.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '`' {
            // Blank out block content, keep everything else verbatim.
            result.push(if open_fence.is_some() { ' ' } else { ch });
            continue;
        }

        // Count consecutive backticks
        let mut run_len = 1;
        while chars.peek() == Some(&'`') {
            run_len += 1;
            chars.next();
        }

        let current = open_fence;
        let fill = match current {
            // Three or more backticks outside a block open one.
            None if run_len >= 3 => {
                open_fence = Some(run_len);
                ' '
            }
            // Inline code (1-2 backticks) outside a block is preserved.
            None => '`',
            // Inside a block every backtick is blanked; only a run at least
            // as long as the opening fence closes the block.
            Some(open_len) => {
                if run_len >= open_len {
                    open_fence = None;
                }
                ' '
            }
        };
        result.extend(std::iter::repeat(fill).take(run_len));
    }

    result
}
262
/// Check if a path string is a valid file reference to validate.
///
/// This function filters out URLs, absolute paths, and other patterns
/// that should not be validated as local file references. Leading and
/// trailing whitespace is ignored.
///
/// # Valid References
///
/// - Relative paths: `./file.md`, `../docs/guide.md`
/// - Dot-prefixed paths: `.agpm/snippets/file.md`
/// - Simple paths: `docs/guide.md`
///
/// # Invalid References (Filtered Out)
///
/// - URLs: `http://example.com`, `https://github.com/...`
/// - Absolute paths: `/usr/local/file.md`
/// - Paths with URL schemes: `file://...`, `ftp://...`
/// - Anchor links: `#section`
/// - Bare filenames without a directory part: `example.md`
/// - Paths whose final component has no extension: `./docs/guide`,
///   `.agpm/snippets/file`
/// - Empty or whitespace-only strings
///
/// # Arguments
///
/// * `path` - The path string to validate
///
/// # Returns
///
/// `true` if the path should be validated, `false` otherwise
#[must_use]
pub fn is_valid_file_reference(path: &str) -> bool {
    let trimmed = path.trim();

    // Skip empty strings
    if trimmed.is_empty() {
        return false;
    }

    // Skip URLs (any scheme://...)
    if trimmed.contains("://") {
        return false;
    }

    // Skip absolute paths and anchor links
    if trimmed.starts_with('/') || trimmed.starts_with('#') {
        return false;
    }

    // Must contain a path separator (/) to be considered a file path
    // This filters out simple filenames like "example.md" that aren't paths
    if !trimmed.contains('/') {
        return false;
    }

    // Must have a file extension. The check looks at the final component
    // only: a dot elsewhere (`./`, `../`, `.agpm/`) is not an extension.
    Path::new(trimmed).extension().is_some()
}
325
326/// Validate that file references exist on the filesystem.
327///
328/// This function takes a list of relative file paths and checks if they
329/// exist relative to the given project directory. It returns a list of
330/// missing references for error reporting.
331///
332/// # Arguments
333///
334/// * `references` - List of relative file paths to validate
335/// * `project_dir` - Base directory to resolve relative paths against
336///
337/// # Returns
338///
339/// A list of references that were not found
340///
341/// # Errors
342///
343/// Returns an error if the project directory cannot be accessed
344///
345/// # Examples
346///
347/// ```rust,no_run
348/// # use agpm_cli::markdown::reference_extractor::validate_file_references;
349/// # use std::path::Path;
350/// # fn example() -> anyhow::Result<()> {
351/// let references = vec![
352/// ".agpm/snippets/existing.md".to_string(),
353/// ".agpm/snippets/missing.md".to_string(),
354/// ];
355///
356/// let project_dir = Path::new("/path/to/project");
357/// let missing = validate_file_references(&references, project_dir)?;
358/// // Returns only the missing.md entry
359/// # Ok(())
360/// # }
361/// ```
362pub fn validate_file_references(references: &[String], project_dir: &Path) -> Result<Vec<String>> {
363 let mut missing = Vec::new();
364
365 for reference in references {
366 let full_path = project_dir.join(reference);
367
368 if !full_path.exists() {
369 missing.push(reference.clone());
370 }
371 }
372
373 Ok(missing)
374}
375
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// True when `refs` contains an entry equal to `expected`.
    fn has(refs: &[String], expected: &str) -> bool {
        refs.iter().any(|found| found.as_str() == expected)
    }

    /// Markdown link targets ending in `.md` are extracted.
    #[test]
    fn test_extract_markdown_links() {
        let markdown = r#"
Check the [documentation](./docs/guide.md) for more info.
Also see [examples](../examples/demo.md).
"#;

        let found = extract_file_references(markdown);
        assert_eq!(found.len(), 2);
        assert!(has(&found, "./docs/guide.md"));
        assert!(has(&found, "../examples/demo.md"));
    }

    /// Bare paths wrapped in backticks are extracted.
    #[test]
    fn test_extract_direct_file_paths() {
        let markdown = r#"
See `.agpm/snippets/example.md` for the implementation.
Check `./docs/overview.md` and `.claude/agents/test.md`.
"#;

        let found = extract_file_references(markdown);
        assert!(has(&found, ".agpm/snippets/example.md"));
        assert!(has(&found, ".claude/agents/test.md"));
        assert!(has(&found, "./docs/overview.md"));
    }

    /// URLs never count as file references.
    #[test]
    fn test_skip_urls() {
        let markdown = r#"
Visit [GitHub](https://github.com/user/repo) for source.
Or check http://example.com/page.html.
"#;

        let found = extract_file_references(markdown);
        assert!(found.is_empty());
    }

    /// Paths inside fenced code blocks are ignored; paths around them are kept.
    #[test]
    fn test_skip_code_blocks() {
        let markdown = r#"
Normal reference: `.agpm/snippets/real.md`

```bash
# This should be skipped: `.agpm/snippets/code.md`
cat .agpm/snippets/example.md
```

Another real reference: `docs/guide.md`
"#;

        let found = extract_file_references(markdown);
        assert!(has(&found, ".agpm/snippets/real.md"));
        assert!(has(&found, "docs/guide.md"));
        // Nothing from inside the code block may be reported
        assert!(found.iter().all(|r| !r.contains("code.md")));
    }

    /// A path written as inline code is still extracted.
    #[test]
    fn test_inline_code_path_extraction() {
        let markdown = "Check `.agpm/real.md` for details.";

        let found = extract_file_references(markdown);
        // File paths in inline code are still extracted if they look like actual paths
        assert!(has(&found, ".agpm/real.md"));
    }

    /// A path mentioned twice is reported once.
    #[test]
    fn test_deduplication() {
        let markdown = r#"
See `.agpm/snippets/example.md` for details.
Also check `.agpm/snippets/example.md` again.
"#;

        let found = extract_file_references(markdown);
        assert_eq!(found.len(), 1);
    }

    /// Accept/reject table for `is_valid_file_reference`.
    #[test]
    fn test_is_valid_file_reference() {
        // Valid references
        for accepted in ["./docs/guide.md", ".agpm/snippets/file.md", "../parent/file.json"] {
            assert!(is_valid_file_reference(accepted));
        }

        // Invalid references
        for rejected in [
            "https://example.com",
            "http://test.com/file.md",
            "/absolute/path.md",
            "#anchor",
            "",
            "no-extension",
        ] {
            assert!(!is_valid_file_reference(rejected));
        }
    }

    /// Only references without a file on disk are reported as missing.
    #[test]
    fn test_validate_file_references() -> Result<()> {
        let temp_dir = tempdir()?;
        let project_dir = temp_dir.path();

        // Create the one file that is expected to be found
        let snippets_dir = project_dir.join(".agpm").join("snippets");
        fs::create_dir_all(&snippets_dir)?;
        fs::write(snippets_dir.join("existing.md"), "content")?;

        let references = vec![
            ".agpm/snippets/existing.md".to_string(),
            ".agpm/snippets/missing.md".to_string(),
            "nonexistent/file.md".to_string(),
        ];

        let missing = validate_file_references(&references, project_dir)?;

        assert_eq!(missing.len(), 2);
        assert!(has(&missing, ".agpm/snippets/missing.md"));
        assert!(has(&missing, "nonexistent/file.md"));
        assert!(!has(&missing, ".agpm/snippets/existing.md"));

        Ok(())
    }

    /// Fenced code is blanked while the surrounding text survives.
    #[test]
    fn test_remove_code_blocks() {
        let markdown = r#"
Normal text with `.agpm/file.md`

```rust
let path = ".agpm/in_code.md";
```

More normal text `.agpm/another.md`
"#;

        let cleaned = remove_code_blocks(markdown);
        assert!(cleaned.contains(".agpm/file.md"));
        assert!(cleaned.contains(".agpm/another.md"));
        // Code block content should be replaced with spaces
        assert!(!cleaned.contains("in_code.md"));
    }

    /// Frontmatter is dropped and the body is kept.
    #[test]
    fn test_remove_frontmatter() {
        let markdown = r#"---
dependencies:
  agents:
    - path: agents/helper.md
  snippets:
    - path: snippets/utils.md
---

# Main Content

See [documentation](./docs/guide.md) for details.
"#;

        let cleaned = remove_frontmatter(markdown);
        // Frontmatter should be removed
        for dropped in ["dependencies:", "agents/helper.md", "snippets/utils.md"] {
            assert!(!cleaned.contains(dropped));
        }
        // Content should remain
        for kept in ["# Main Content", "./docs/guide.md"] {
            assert!(cleaned.contains(kept));
        }
    }

    /// Dependency paths in frontmatter are not reported as references.
    #[test]
    fn test_extract_with_frontmatter_dependencies() {
        let markdown = r#"---
dependencies:
  agents:
    - path: agents/helper.md
      version: v1.0.0
  snippets:
    - path: .agpm/snippets/utils.md
---

# Command

See [real reference](./docs/guide.md) for details.
Check `.claude/agents/example.md` for the implementation.
"#;

        let found = extract_file_references(markdown);

        // Should extract content references
        assert!(has(&found, "./docs/guide.md"));
        assert!(has(&found, ".claude/agents/example.md"));

        // Should NOT extract frontmatter dependencies
        assert!(!has(&found, "agents/helper.md"));
        assert!(!has(&found, ".agpm/snippets/utils.md"));
    }

    /// End-to-end check over a document mixing every reference form.
    #[test]
    fn test_complex_markdown_with_mixed_references() {
        let markdown = r#"
# Documentation

See the [main guide](./docs/guide.md) for details.

## Implementation

The core logic is in `.agpm/snippets/core.md` file.

```rust
// This code reference should be ignored
let path = ".agpm/snippets/ignored.md";
```

Also check:
- [Examples](../examples/demo.md)
- External: https://github.com/user/repo
- `.claude/agents/helper.md`

Inline code like `example.md` should be skipped.
"#;

        let found = extract_file_references(markdown);

        // Should extract these
        for expected in [
            "./docs/guide.md",
            ".agpm/snippets/core.md",
            "../examples/demo.md",
            ".claude/agents/helper.md",
        ] {
            assert!(has(&found, expected));
        }

        // Should NOT extract these
        assert!(found.iter().all(|r| !r.contains("github.com")));
        assert!(found.iter().all(|r| !r.contains("ignored.md")));
        assert!(!has(&found, "example.md")); // Bare filename without a directory part
    }
}