Skip to main content

drft/parsers/
frontmatter.rs

1use super::{ParseResult, Parser};
2
3/// Check whether a frontmatter value looks like a link target (file path or URI).
4fn is_link_candidate(value: &str) -> bool {
5    // URIs are always candidates — graph builder creates External nodes
6    if crate::graph::is_uri(value) {
7        return true;
8    }
9    // Explicit path prefixes are always candidates.
10    // The graph builder gates all filesystem access for out-of-root targets.
11    if value.starts_with("./") || value.starts_with("../") || value.starts_with('/') {
12        return true;
13    }
14    // Prose contains spaces — file paths don't
15    if value.contains(' ') {
16        return false;
17    }
18    // Must have a plausible file extension: dot followed by 1-6 alphanumeric
19    // chars that aren't all digits (rejects v2.0, e.g., Dr.)
20    let basename = value.rsplit('/').next().unwrap_or(value);
21    if let Some(dot_pos) = basename.rfind('.') {
22        let ext = &basename[dot_pos + 1..];
23        !ext.is_empty()
24            && ext.len() <= 6
25            && ext.chars().all(|c| c.is_ascii_alphanumeric())
26            && !ext.chars().all(|c| c.is_ascii_digit())
27    } else {
28        false
29    }
30}
31
/// Blank out all code content (fenced blocks and inline backtick spans),
/// replacing it with spaces so that byte offsets are preserved.
///
/// The returned string always has exactly the same byte length as `content`:
///
/// * Every line of a fenced code block, including both fence lines, becomes
///   one space per byte. Line terminators (`\n` or `\r\n`) are copied through
///   unchanged, and no terminator is added after a final unterminated line.
/// * Every inline code span (opening backticks, body, closing backticks)
///   becomes one space per byte, so multi-byte characters inside a span do
///   not shift later offsets. A span may cross line breaks; those line breaks
///   are blanked together with the rest of the span.
///
/// Fence recognition follows the CommonMark rules that matter here:
///
/// * a fence is a run of at least three `` ` `` or `~` after optional
///   leading whitespace;
/// * the info string of a backtick fence may not contain a backtick, so a
///   line such as "```code```" is inline code, not an opening fence;
/// * a closing fence uses the same character as the opening one, is at least
///   as long, and is followed only by whitespace.
///
/// An unclosed fence blanks everything up to the end of the input. A backtick
/// run with no matching closing run of the same length is left in place.
fn strip_code(content: &str) -> String {
    let without_fences = blank_fenced_blocks(content);
    blank_inline_spans(&without_fences)
}

/// If `trimmed` starts with a run of three or more `` ` `` or `~`, return the
/// fence character and the length of that run.
fn leading_fence(trimmed: &str) -> Option<(char, usize)> {
    let marker = trimmed.chars().next().filter(|&c| c == '`' || c == '~')?;
    let run = trimmed.chars().take_while(|&c| c == marker).count();
    if run >= 3 {
        Some((marker, run))
    } else {
        None
    }
}

/// Replace every line belonging to a fenced code block with spaces, keeping
/// line terminators and overall byte length unchanged.
fn blank_fenced_blocks(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    // `Some((marker, run))` while inside a block opened by `run` markers.
    let mut open_fence: Option<(char, usize)> = None;

    for raw in content.split_inclusive('\n') {
        // Split the terminator off so it can be copied through verbatim;
        // `str::lines` would silently drop `\r` and the final-newline state.
        let body = raw
            .strip_suffix('\n')
            .map_or(raw, |b| b.strip_suffix('\r').unwrap_or(b));
        let terminator = &raw[body.len()..];

        let trimmed = body.trim_start();
        let fence = leading_fence(trimmed);

        let blank = match open_fence {
            None => match fence {
                // Fence markers are ASCII, so `run` is also a valid byte index.
                Some((marker, run)) if marker != '`' || !trimmed[run..].contains('`') => {
                    open_fence = Some((marker, run));
                    true
                }
                _ => false,
            },
            Some((marker, open_run)) => {
                if let Some((candidate, run)) = fence {
                    let closes = candidate == marker
                        && run >= open_run
                        && trimmed[run..].trim().is_empty();
                    if closes {
                        open_fence = None;
                    }
                }
                // Content lines and the closing fence line are all blanked.
                true
            }
        };

        if blank {
            out.extend(std::iter::repeat(' ').take(body.len()));
        } else {
            out.push_str(body);
        }
        out.push_str(terminator);
    }

    out
}

/// Replace every inline backtick span with one space per byte.
fn blank_inline_spans(text: &str) -> String {
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    // Everything before `copied_to` has already been written to `out`.
    let mut copied_to = 0;
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'`' {
            i += 1;
            continue;
        }

        // Length of the opening run. Backticks are ASCII, so `i` and
        // `body_start` are always char boundaries and safe to slice at.
        let ticks = bytes[i..].iter().take_while(|&&b| b == b'`').count();
        let body_start = i + ticks;

        // The opening run itself is the needle for the closing run.
        match text[body_start..].find(&text[i..body_start]) {
            Some(rel) => {
                let span_end = body_start + rel + ticks;
                out.push_str(&text[copied_to..i]);
                out.extend(std::iter::repeat(' ').take(span_end - i));
                i = span_end;
                copied_to = span_end;
            }
            None => {
                // No closing run: keep this backtick and retry from the next
                // one, which may still pair up as a shorter run.
                i += 1;
            }
        }
    }

    out.push_str(&text[copied_to..]);
    out
}
105
/// Built-in frontmatter parser. Extracts YAML frontmatter as links and metadata.
///
/// The `Parser` implementation in this file reports the name `"frontmatter"`
/// and produces a `ParseResult` holding the link candidates found among the
/// frontmatter's string values plus the whole frontmatter converted to JSON.
pub struct FrontmatterParser {
    /// File routing filter consulted by `matches`.
    /// `None` means every path is accepted; `Some(set)` accepts only paths
    /// matched by the glob set.
    pub file_filter: Option<globset::GlobSet>,
}
111
112impl Parser for FrontmatterParser {
113    fn name(&self) -> &str {
114        "frontmatter"
115    }
116
117    fn matches(&self, path: &str) -> bool {
118        match &self.file_filter {
119            Some(set) => set.is_match(path),
120            None => true,
121        }
122    }
123
124    fn parse(&self, _path: &str, content: &str) -> ParseResult {
125        let links = extract_frontmatter_links(content);
126        let metadata = extract_frontmatter_metadata(content);
127
128        ParseResult { links, metadata }
129    }
130}
131
132/// Extract file path references from YAML frontmatter.
133/// Operates on code-block-stripped content to avoid parsing frontmatter
134/// inside fenced code block examples.
135fn extract_frontmatter_links(content: &str) -> Vec<String> {
136    let content = &strip_code(content);
137
138    if !content.starts_with("---") {
139        return Vec::new();
140    }
141
142    let rest = &content[3..];
143    let end = match rest.find("\n---") {
144        Some(idx) => idx,
145        None => return Vec::new(),
146    };
147
148    let yaml_str = &rest[..end];
149    if yaml_str.trim().is_empty() {
150        return Vec::new();
151    }
152
153    let yaml: serde_yml::Value = match serde_yml::from_str(yaml_str) {
154        Ok(v) => v,
155        Err(e) => {
156            eprintln!("warn: frontmatter parser: invalid YAML: {e}");
157            return Vec::new();
158        }
159    };
160
161    let mut links = Vec::new();
162    collect_string_leaves(&yaml, &mut links);
163    links.retain(|v| is_link_candidate(v));
164    links
165}
166
167/// Recursively collect all string leaf values from a YAML structure.
168/// Skips keys (only visits values) and non-string types (numbers, bools, null).
169fn collect_string_leaves(value: &serde_yml::Value, out: &mut Vec<String>) {
170    match value {
171        serde_yml::Value::String(s) => out.push(s.clone()),
172        serde_yml::Value::Sequence(seq) => {
173            for item in seq {
174                collect_string_leaves(item, out);
175            }
176        }
177        serde_yml::Value::Mapping(map) => {
178            for (_key, val) in map {
179                collect_string_leaves(val, out);
180            }
181        }
182        serde_yml::Value::Tagged(tagged) => collect_string_leaves(&tagged.value, out),
183        _ => {}
184    }
185}
186
187/// Parse YAML frontmatter into a JSON value for node metadata.
188/// Returns None if no valid frontmatter is found.
189fn extract_frontmatter_metadata(content: &str) -> Option<serde_json::Value> {
190    let content = &strip_code(content);
191
192    if !content.starts_with("---") {
193        return None;
194    }
195
196    let rest = &content[3..];
197    let end = rest.find("\n---")?;
198    let yaml_str = &rest[..end];
199
200    if yaml_str.trim().is_empty() {
201        return None;
202    }
203
204    match serde_yml::from_str::<serde_yml::Value>(yaml_str) {
205        Ok(yaml_val) => Some(yaml_to_json(yaml_val)),
206        Err(e) => {
207            eprintln!("warn: frontmatter parser: invalid YAML: {e}");
208            None
209        }
210    }
211}
212
213/// Convert serde_yml::Value to serde_json::Value.
214fn yaml_to_json(yaml: serde_yml::Value) -> serde_json::Value {
215    match yaml {
216        serde_yml::Value::Null => serde_json::Value::Null,
217        serde_yml::Value::Bool(b) => serde_json::Value::Bool(b),
218        serde_yml::Value::Number(n) => {
219            if let Some(i) = n.as_i64() {
220                serde_json::Value::Number(i.into())
221            } else if let Some(f) = n.as_f64() {
222                serde_json::Number::from_f64(f)
223                    .map(serde_json::Value::Number)
224                    .unwrap_or(serde_json::Value::Null)
225            } else {
226                serde_json::Value::Null
227            }
228        }
229        serde_yml::Value::String(s) => serde_json::Value::String(s),
230        serde_yml::Value::Sequence(seq) => {
231            serde_json::Value::Array(seq.into_iter().map(yaml_to_json).collect())
232        }
233        serde_yml::Value::Mapping(map) => {
234            let obj: serde_json::Map<String, serde_json::Value> = map
235                .into_iter()
236                .filter_map(|(k, v)| {
237                    let key = match k {
238                        serde_yml::Value::String(s) => s,
239                        other => serde_json::to_string(&yaml_to_json(other)).ok()?,
240                    };
241                    Some((key, yaml_to_json(v)))
242                })
243                .collect();
244            serde_json::Value::Object(obj)
245        }
246        serde_yml::Value::Tagged(tagged) => yaml_to_json(tagged.value),
247    }
248}
249
/// Unit tests for the frontmatter parser: link extraction, metadata
/// conversion, file filtering and the `is_link_candidate` heuristics.
#[cfg(test)]
mod tests {
    use super::*;

    /// Run an unfiltered `FrontmatterParser` over `content`.
    fn parse(content: &str) -> ParseResult {
        let parser = FrontmatterParser { file_filter: None };
        parser.parse("test.md", content)
    }

    /// The parser identifies itself as "frontmatter".
    #[test]
    fn parser_name() {
        let parser = FrontmatterParser { file_filter: None };
        assert_eq!(parser.name(), "frontmatter");
    }

    /// Explicitly prefixed relative paths (`../`, `./`) are emitted in order.
    #[test]
    fn extracts_frontmatter_links() {
        let content =
            "---\nsources:\n  - ../shared/glossary.md\n  - ./prior-art.md\n---\n\n# Hello\n";
        let result = parse(content);
        assert_eq!(result.links.len(), 2);
        assert_eq!(result.links[0], "../shared/glossary.md");
        assert_eq!(result.links[1], "./prior-art.md");
    }

    /// Bare file names with a plausible extension count as links.
    #[test]
    fn extracts_same_directory_links() {
        let content = "---\nsources:\n  - setup.md\n  - config.rs\n---\n";
        let result = parse(content);
        assert_eq!(result.links.len(), 2);
        assert_eq!(result.links[0], "setup.md");
        assert_eq!(result.links[1], "config.rs");
    }

    /// Prose titles, numeric versions and plain tags are not links.
    #[test]
    fn frontmatter_skips_non_paths() {
        let content = "---\ntitle: My Document\nversion: 1.0\ntags:\n  - rust\n  - cli\n---\n";
        let result = parse(content);
        assert!(result.links.is_empty());
    }

    /// A frontmatter example inside a fenced code block yields neither links
    /// nor metadata.
    #[test]
    fn frontmatter_skips_code_block_examples() {
        let content = "# Doc\n\n```markdown\n---\nsources:\n  - ./fake.md\n---\n```\n";
        let result = parse(content);
        assert!(
            result.links.is_empty(),
            "frontmatter inside code block should be ignored"
        );
        assert!(result.metadata.is_none());
    }

    /// Scalar and list values are exposed as JSON metadata.
    #[test]
    fn extracts_metadata() {
        let content =
            "---\ntitle: My Doc\nstatus: draft\ntags:\n  - rust\n  - cli\n---\n\n# Hello\n";
        let result = parse(content);
        let meta = result.metadata.unwrap();
        assert_eq!(meta["title"], "My Doc");
        assert_eq!(meta["status"], "draft");
        assert_eq!(meta["tags"], serde_json::json!(["rust", "cli"]));
    }

    /// A document without frontmatter has no metadata.
    #[test]
    fn no_metadata_without_frontmatter() {
        let result = parse("# Just a heading\n");
        assert!(result.metadata.is_none());
    }

    /// Nested mappings become nested JSON objects.
    #[test]
    fn metadata_handles_nested_yaml() {
        let content = "---\ntitle: Test\nauthor:\n  name: Alice\n  role: dev\n---\n";
        let result = parse(content);
        let meta = result.metadata.unwrap();
        assert_eq!(meta["author"]["name"], "Alice");
        assert_eq!(meta["author"]["role"], "dev");
    }

    /// Without a file filter every path matches.
    #[test]
    fn no_filter_matches_everything() {
        let parser = FrontmatterParser { file_filter: None };
        assert!(parser.matches("index.md"));
        assert!(parser.matches("main.rs"));
    }

    /// With a `*.md` filter only Markdown paths match.
    #[test]
    fn file_filter_restricts_matching() {
        let mut builder = globset::GlobSetBuilder::new();
        builder.add(globset::Glob::new("*.md").unwrap());
        let parser = FrontmatterParser {
            file_filter: Some(builder.build().unwrap()),
        };
        assert!(parser.matches("index.md"));
        assert!(!parser.matches("main.rs"));
    }

    /// URIs are emitted alongside local paths, in document order.
    #[test]
    fn extracts_uris() {
        let content = "---\nsources:\n  - https://example.com\n  - ./local.md\n---\n";
        let result = parse(content);
        assert_eq!(result.links.len(), 2);
        assert_eq!(result.links[0], "https://example.com");
        assert_eq!(result.links[1], "./local.md");
    }

    /// Values containing spaces are treated as prose.
    #[test]
    fn skips_prose_with_spaces() {
        let content = "---\npurpose: configuration reference\nstatus: needs review\n---\n";
        let result = parse(content);
        assert!(result.links.is_empty());
    }

    /// Trailing-dot abbreviations and all-digit "extensions" are rejected.
    #[test]
    fn skips_abbreviations_and_versions() {
        let content = "---\nnote: e.g.\nversion: v2.0\nauthor: Dr.\n---\n";
        let result = parse(content);
        assert!(result.links.is_empty());
    }

    /// Unprefixed paths, with or without a directory part, are accepted.
    #[test]
    fn accepts_paths_without_prefix() {
        let content = "---\nsources:\n  - config.rs\n  - docs/setup.md\n---\n";
        let result = parse(content);
        assert_eq!(result.links.len(), 2);
        assert_eq!(result.links[0], "config.rs");
        assert_eq!(result.links[1], "docs/setup.md");
    }

    /// Absolute paths are emitted unchanged by the parser.
    #[test]
    fn emits_absolute_paths() {
        let content = "---\nsource: /usr/local/config.toml\n---\n";
        let result = parse(content);
        assert_eq!(result.links.len(), 1);
        assert_eq!(result.links[0], "/usr/local/config.toml");
    }

    /// Mapping entries nested in a list must not be mistaken for URIs.
    #[test]
    fn yaml_list_values_not_parsed_as_uris() {
        // Regression: `- name: foo bar bazz` was split on `- ` to get
        // `name: foo bar bazz`, which the old is_uri matched as scheme `name:`
        let content = "---\ntags:\n  - name: foo bar bazz\n  - status: draft\n---\n";
        let result = parse(content);
        assert!(result.links.is_empty());
    }
}