Skip to main content

drft/parsers/
frontmatter.rs

1use super::{ParseResult, Parser};
2
3/// Check whether a frontmatter value looks like a link target (file path or URI).
4fn is_link_candidate(value: &str) -> bool {
5    // URIs are always candidates — graph builder creates External nodes
6    if crate::graph::is_uri(value) {
7        return true;
8    }
9    // Explicit path prefixes are always candidates.
10    // The graph builder gates all filesystem access for out-of-root targets.
11    if value.starts_with("./") || value.starts_with("../") || value.starts_with('/') {
12        return true;
13    }
14    // Prose contains spaces — file paths don't
15    if value.contains(' ') {
16        return false;
17    }
18    // Must have a plausible file extension: dot followed by 1-4 alphanumeric
19    // chars that aren't all digits (rejects v2.0, e.g., Dr.)
20    let basename = value.rsplit('/').next().unwrap_or(value);
21    if let Some(dot_pos) = basename.rfind('.') {
22        let ext = &basename[dot_pos + 1..];
23        !ext.is_empty()
24            && ext.len() <= 4
25            && ext.chars().all(|c| c.is_ascii_alphanumeric())
26            && !ext.chars().all(|c| c.is_ascii_digit())
27    } else {
28        false
29    }
30}
31
/// Blank out all code content (fenced blocks and inline backtick spans)
/// while preserving byte offsets.
///
/// Every byte that belongs to code is replaced with an ASCII space, and line
/// terminators (`\n` and `\r\n`) are always copied through verbatim. The
/// returned string therefore has exactly the same byte length and the same
/// line structure as `content`, so an offset into the result refers to the
/// same position in the original text.
///
/// Fenced blocks open on a line whose first non-whitespace characters are
/// three backticks or three tildes, and close on a line consisting solely of
/// that same three-character marker. An unclosed fence runs to the end of the
/// input. Inline spans are delimited by backtick runs of equal length; a run
/// without a closing counterpart is left untouched.
fn strip_code(content: &str) -> String {
    blank_inline_spans(&blank_fenced_blocks(content))
}

/// Replace every line of a fenced code block (fence lines included) with
/// spaces, leaving each line terminator exactly as it was.
fn blank_fenced_blocks(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    // Marker of the fence we are currently inside, if any.
    let mut open_fence: Option<&str> = None;

    // `split_inclusive` keeps terminators attached, so nothing is lost for
    // CRLF input and nothing is invented for a final line without a newline.
    for raw in content.split_inclusive('\n') {
        let (line, ending) = match raw.strip_suffix('\n') {
            Some(rest) => match rest.strip_suffix('\r') {
                Some(body) => (body, "\r\n"),
                None => (rest, "\n"),
            },
            None => (raw, ""),
        };
        let trimmed = line.trim_start();

        let is_code = match open_fence {
            None => {
                open_fence = if trimmed.starts_with("```") {
                    Some("```")
                } else if trimmed.starts_with("~~~") {
                    Some("~~~")
                } else {
                    None
                };
                open_fence.is_some()
            }
            Some(marker) => {
                // The closing fence line is itself blanked.
                if trimmed.trim_end() == marker {
                    open_fence = None;
                }
                true
            }
        };

        if is_code {
            // One space per byte keeps multi-byte code from shifting offsets.
            out.extend(std::iter::repeat(' ').take(line.len()));
        } else {
            out.push_str(line);
        }
        out.push_str(ending);
    }

    out
}

/// Replace inline backtick spans (delimiters and content) with spaces, byte
/// for byte, keeping any `\n` / `\r` that falls inside a span.
fn blank_inline_spans(text: &str) -> String {
    // A backtick is a single ASCII byte and can never occur inside a
    // multi-byte UTF-8 sequence, so scanning bytes is safe and every index
    // we slice at is a char boundary.
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    let mut copied_to = 0; // start of the pending verbatim segment
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'`' {
            i += 1;
            continue;
        }

        // Length of the opening backtick run.
        let ticks = bytes[i..].iter().take_while(|&&b| b == b'`').count();
        let after = i + ticks;
        // First position at or after the opener where `ticks` backticks occur.
        let close = (after..=bytes.len() - ticks)
            .find(|&j| bytes[j..j + ticks].iter().all(|&b| b == b'`'));

        match close {
            Some(close_start) => {
                let end = close_start + ticks;
                out.push_str(&text[copied_to..i]);
                for &b in &bytes[i..end] {
                    // Line breaks survive so the line structure stays intact.
                    out.push(if b == b'\n' || b == b'\r' { b as char } else { ' ' });
                }
                i = end;
                copied_to = end;
            }
            // No closing run: keep this backtick and retry from the next byte.
            None => i += 1,
        }
    }

    out.push_str(&text[copied_to..]);
    out
}
105
/// Built-in frontmatter parser. Extracts YAML frontmatter as links and metadata.
pub struct FrontmatterParser {
    /// File routing filter consulted by `matches`. When `None`, every path is
    /// accepted; when `Some`, only paths matched by the glob set are.
    pub file_filter: Option<globset::GlobSet>,
}
111
112impl Parser for FrontmatterParser {
113    fn name(&self) -> &str {
114        "frontmatter"
115    }
116
117    fn matches(&self, path: &str) -> bool {
118        match &self.file_filter {
119            Some(set) => set.is_match(path),
120            None => true,
121        }
122    }
123
124    fn parse(&self, _path: &str, content: &str) -> ParseResult {
125        let links = extract_frontmatter_links(content);
126        let metadata = extract_frontmatter_metadata(content);
127
128        ParseResult { links, metadata }
129    }
130}
131
132/// Extract file path references from YAML frontmatter.
133/// Operates on code-block-stripped content to avoid parsing frontmatter
134/// inside fenced code block examples.
135fn extract_frontmatter_links(content: &str) -> Vec<String> {
136    let content = &strip_code(content);
137    let mut links = Vec::new();
138
139    if !content.starts_with("---") {
140        return links;
141    }
142
143    let rest = &content[3..];
144    let end = match rest.find("\n---") {
145        Some(idx) => idx,
146        None => return links,
147    };
148
149    let frontmatter = &rest[..end];
150
151    for line in frontmatter.lines() {
152        let line = line.trim();
153
154        let value = if let Some(stripped) = line.strip_prefix("- ") {
155            stripped.trim()
156        } else if let Some((_key, val)) = line.split_once(':') {
157            val.trim()
158        } else {
159            continue;
160        };
161
162        if value.is_empty() {
163            continue;
164        }
165
166        if value.starts_with('{')
167            || value.starts_with('[')
168            || value.starts_with('"')
169            || value.starts_with('\'')
170        {
171            continue;
172        }
173
174        // Skip numeric values (e.g., "1.0") — not file paths
175        if value.parse::<f64>().is_ok() {
176            continue;
177        }
178
179        if !is_link_candidate(value) {
180            continue;
181        }
182
183        links.push(value.to_string());
184    }
185
186    links
187}
188
189/// Parse YAML frontmatter into a JSON value for node metadata.
190/// Returns None if no valid frontmatter is found.
191fn extract_frontmatter_metadata(content: &str) -> Option<serde_json::Value> {
192    let content = &strip_code(content);
193
194    if !content.starts_with("---") {
195        return None;
196    }
197
198    let rest = &content[3..];
199    let end = rest.find("\n---")?;
200    let yaml_str = &rest[..end];
201
202    if yaml_str.trim().is_empty() {
203        return None;
204    }
205
206    match serde_yml::from_str::<serde_yml::Value>(yaml_str) {
207        Ok(yaml_val) => Some(yaml_to_json(yaml_val)),
208        Err(e) => {
209            eprintln!("warn: frontmatter parser: invalid YAML: {e}");
210            None
211        }
212    }
213}
214
215/// Convert serde_yml::Value to serde_json::Value.
216fn yaml_to_json(yaml: serde_yml::Value) -> serde_json::Value {
217    match yaml {
218        serde_yml::Value::Null => serde_json::Value::Null,
219        serde_yml::Value::Bool(b) => serde_json::Value::Bool(b),
220        serde_yml::Value::Number(n) => {
221            if let Some(i) = n.as_i64() {
222                serde_json::Value::Number(i.into())
223            } else if let Some(f) = n.as_f64() {
224                serde_json::Number::from_f64(f)
225                    .map(serde_json::Value::Number)
226                    .unwrap_or(serde_json::Value::Null)
227            } else {
228                serde_json::Value::Null
229            }
230        }
231        serde_yml::Value::String(s) => serde_json::Value::String(s),
232        serde_yml::Value::Sequence(seq) => {
233            serde_json::Value::Array(seq.into_iter().map(yaml_to_json).collect())
234        }
235        serde_yml::Value::Mapping(map) => {
236            let obj: serde_json::Map<String, serde_json::Value> = map
237                .into_iter()
238                .filter_map(|(k, v)| {
239                    let key = match k {
240                        serde_yml::Value::String(s) => s,
241                        other => serde_json::to_string(&yaml_to_json(other)).ok()?,
242                    };
243                    Some((key, yaml_to_json(v)))
244                })
245                .collect();
246            serde_json::Value::Object(obj)
247        }
248        serde_yml::Value::Tagged(tagged) => yaml_to_json(tagged.value),
249    }
250}
251
#[cfg(test)]
mod tests {
    use super::*;

    /// Run an unfiltered `FrontmatterParser` over `content`.
    fn parse(content: &str) -> ParseResult {
        FrontmatterParser { file_filter: None }.parse("test.md", content)
    }

    #[test]
    fn parser_name() {
        assert_eq!(FrontmatterParser { file_filter: None }.name(), "frontmatter");
    }

    // --- link extraction -------------------------------------------------

    #[test]
    fn extracts_frontmatter_links() {
        let result = parse(
            "---\nsources:\n  - ../shared/glossary.md\n  - ./prior-art.md\n---\n\n# Hello\n",
        );
        assert_eq!(result.links, ["../shared/glossary.md", "./prior-art.md"]);
    }

    #[test]
    fn extracts_same_directory_links() {
        let result = parse("---\nsources:\n  - setup.md\n  - config.rs\n---\n");
        assert_eq!(result.links, ["setup.md", "config.rs"]);
    }

    #[test]
    fn frontmatter_skips_non_paths() {
        let result = parse("---\ntitle: My Document\nversion: 1.0\ntags:\n  - rust\n  - cli\n---\n");
        assert!(result.links.is_empty());
    }

    #[test]
    fn frontmatter_skips_code_block_examples() {
        let result = parse("# Doc\n\n```markdown\n---\nsources:\n  - ./fake.md\n---\n```\n");
        assert!(
            result.links.is_empty(),
            "frontmatter inside code block should be ignored"
        );
        assert!(result.metadata.is_none());
    }

    #[test]
    fn extracts_uris() {
        let result = parse("---\nsources:\n  - https://example.com\n  - ./local.md\n---\n");
        assert_eq!(result.links, ["https://example.com", "./local.md"]);
    }

    #[test]
    fn skips_prose_with_spaces() {
        let result = parse("---\npurpose: configuration reference\nstatus: needs review\n---\n");
        assert!(result.links.is_empty());
    }

    #[test]
    fn skips_abbreviations_and_versions() {
        let result = parse("---\nnote: e.g.\nversion: v2.0\nauthor: Dr.\n---\n");
        assert!(result.links.is_empty());
    }

    #[test]
    fn accepts_paths_without_prefix() {
        let result = parse("---\nsources:\n  - config.rs\n  - docs/setup.md\n---\n");
        assert_eq!(result.links, ["config.rs", "docs/setup.md"]);
    }

    #[test]
    fn emits_absolute_paths() {
        let result = parse("---\nsource: /usr/local/config.toml\n---\n");
        assert_eq!(result.links, ["/usr/local/config.toml"]);
    }

    // --- metadata extraction ---------------------------------------------

    #[test]
    fn extracts_metadata() {
        let meta = parse(
            "---\ntitle: My Doc\nstatus: draft\ntags:\n  - rust\n  - cli\n---\n\n# Hello\n",
        )
        .metadata
        .unwrap();
        assert_eq!(meta["title"], "My Doc");
        assert_eq!(meta["status"], "draft");
        assert_eq!(meta["tags"], serde_json::json!(["rust", "cli"]));
    }

    #[test]
    fn no_metadata_without_frontmatter() {
        assert!(parse("# Just a heading\n").metadata.is_none());
    }

    #[test]
    fn metadata_handles_nested_yaml() {
        let meta = parse("---\ntitle: Test\nauthor:\n  name: Alice\n  role: dev\n---\n")
            .metadata
            .unwrap();
        assert_eq!(meta["author"]["name"], "Alice");
        assert_eq!(meta["author"]["role"], "dev");
    }

    // --- file routing ----------------------------------------------------

    #[test]
    fn no_filter_matches_everything() {
        let parser = FrontmatterParser { file_filter: None };
        for path in ["index.md", "main.rs"].iter() {
            assert!(parser.matches(path));
        }
    }

    #[test]
    fn file_filter_restricts_matching() {
        let mut globs = globset::GlobSetBuilder::new();
        globs.add(globset::Glob::new("*.md").unwrap());
        let file_filter = Some(globs.build().unwrap());

        let parser = FrontmatterParser { file_filter };
        assert!(parser.matches("index.md"));
        assert!(!parser.matches("main.rs"));
    }
}