// drft/parsers/mod.rs

1pub mod custom;
2pub mod frontmatter;
3pub mod markdown;
4
5use crate::config::ParserConfig;
6use std::collections::HashMap;
7
8/// Combined output from parsing a single file: links + optional metadata.
9/// Links are raw strings as they appear in the source — the graph builder handles
10/// normalization (fragment stripping, anchor filtering, URI detection).
11///
12/// See [`docs/parsers`](../../docs/parsers/README.md) for details.
13#[derive(Debug, Clone, Default)]
14pub struct ParseResult {
15    pub links: Vec<String>,
16    /// Structured metadata extracted from the file, namespaced by parser on the node.
17    pub metadata: Option<serde_json::Value>,
18}
19
20/// Trait implemented by all parsers (built-in and custom).
21pub trait Parser {
22    /// Parser name — used as provenance on edges.
23    fn name(&self) -> &str;
24    /// Check if this parser should run on a given file path.
25    fn matches(&self, path: &str) -> bool;
26    /// Parse a file's content and return discovered links + optional metadata.
27    fn parse(&self, path: &str, content: &str) -> ParseResult;
28    /// Parse multiple files in one call. Default falls back to per-file parsing.
29    /// Custom parsers override this to spawn one process for all files.
30    fn parse_batch(&self, files: &[(&str, &str)]) -> HashMap<String, ParseResult> {
31        files
32            .iter()
33            .map(|(path, content)| (path.to_string(), self.parse(path, content)))
34            .collect()
35    }
36}
37
38/// Build a GlobSet from file patterns (for parser routing).
39/// Returns None if no patterns → parser receives all File nodes.
40fn build_file_filter(patterns: &Option<Vec<String>>, name: &str) -> Option<globset::GlobSet> {
41    let patterns = patterns.as_ref()?;
42    match crate::config::compile_globs(patterns) {
43        Ok(set) => set,
44        Err(e) => {
45            eprintln!("warn: invalid glob in parser {name}.files: {e}");
46            None
47        }
48    }
49}
50
/// Check whether the last component of a `/`-separated path contains a `.`.
///
/// Only the text after the final `/` is inspected, so a dot in a directory
/// name (`my.dir/readme`) does not count. A path ending in `/` has an empty
/// last component and therefore yields `false`.
///
/// NOTE: any dot in the basename counts, so dotfiles such as `.gitignore` and
/// names with a trailing dot such as `notes.` are reported as having an
/// extension.
pub(crate) fn has_file_extension(path: &str) -> bool {
    // Without a separator the whole path is the basename. The previous
    // `rsplit('/').next()` form always produced `Some`, which left its
    // fallback branch unreachable.
    let basename = path.rsplit_once('/').map_or(path, |(_, name)| name);
    basename.contains('.')
}
59
/// Strip all code content (fenced blocks and inline backtick spans),
/// replacing it with spaces so that offsets are preserved.
///
/// The returned string has exactly the same byte length as `content`, and
/// every line terminator (`\n` or `\r\n`) stays at its original position, so
/// both byte offsets and line numbers computed on the result are valid for
/// the input.
///
/// Fenced blocks:
/// * a fence opens on a line whose first non-whitespace characters are three
///   or more backticks or tildes;
/// * it closes on a line that consists only of the same marker character,
///   repeated at least as many times as in the opening fence;
/// * an unclosed fence blanks everything up to the end of the input.
///
/// Inline spans:
/// * a run of N backticks opens a span that is closed by the next N
///   consecutive backticks;
/// * the search for the closer stops at a blank line, so a stray backtick
///   cannot swallow the following paragraphs;
/// * a backtick without a closer is kept as-is.
pub(crate) fn strip_code(content: &str) -> String {
    blank_inline_spans(&blank_fenced_blocks(content))
}

/// Split one `split_inclusive('\n')` chunk into its text and its terminator
/// (`"\r\n"`, `"\n"`, or `""` for a final line without a newline).
fn split_line_ending(raw: &str) -> (&str, &str) {
    match raw.strip_suffix('\n') {
        Some(rest) => match rest.strip_suffix('\r') {
            Some(body) => (body, "\r\n"),
            None => (rest, "\n"),
        },
        None => (raw, ""),
    }
}

/// If `trimmed` starts with a run of at least three backticks or tildes,
/// return the marker character and the length of that run.
fn opening_fence(trimmed: &str) -> Option<(char, usize)> {
    let marker = trimmed
        .chars()
        .next()
        .filter(|&c| matches!(c, '`' | '~'))?;
    let run = trimmed.chars().take_while(|&c| c == marker).count();
    (run >= 3).then_some((marker, run))
}

/// Whether `trimmed` closes a fence opened with `min_len` copies of `marker`:
/// nothing but marker characters (ignoring trailing whitespace), and at least
/// as many of them as the opening fence had.
fn is_closing_fence(trimmed: &str, marker: char, min_len: usize) -> bool {
    let candidate = trimmed.trim_end();
    // `marker` is ASCII, so once every char matches, byte length == run length.
    candidate.len() >= min_len && candidate.chars().all(|c| c == marker)
}

/// Append `count` spaces to `out`.
fn push_spaces(out: &mut String, count: usize) {
    out.extend(std::iter::repeat(' ').take(count));
}

/// Replace every fenced code block (fence lines included) with spaces, one
/// space per byte, leaving line terminators untouched.
fn blank_fenced_blocks(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    // `Some((marker, opening_len))` while inside a fenced block.
    let mut open_fence: Option<(char, usize)> = None;

    for raw in content.split_inclusive('\n') {
        let (line, eol) = split_line_ending(raw);
        let trimmed = line.trim_start();

        match open_fence {
            None => match opening_fence(trimmed) {
                Some(fence) => {
                    open_fence = Some(fence);
                    push_spaces(&mut out, line.len());
                }
                None => out.push_str(line),
            },
            Some((marker, min_len)) => {
                if is_closing_fence(trimmed, marker, min_len) {
                    open_fence = None;
                }
                push_spaces(&mut out, line.len());
            }
        }

        // Re-emit the original terminator verbatim so offsets stay aligned.
        out.push_str(eol);
    }

    out
}

/// Find the start of the first run of `ticks` consecutive backticks at or
/// after `from`. Gives up at a blank line, which ends the paragraph and with
/// it any inline span.
fn find_closing_run(bytes: &[u8], from: usize, ticks: usize) -> Option<usize> {
    // True from a '\n' until the first non-whitespace byte of the next line.
    let mut line_is_blank = false;
    let mut j = from;

    while j + ticks <= bytes.len() {
        match bytes[j] {
            b'\n' => {
                if line_is_blank {
                    return None;
                }
                line_is_blank = true;
            }
            b' ' | b'\t' | b'\r' => {}
            b'`' => {
                if bytes[j..j + ticks].iter().all(|&b| b == b'`') {
                    return Some(j);
                }
                line_is_blank = false;
            }
            _ => line_is_blank = false,
        }
        j += 1;
    }

    None
}

/// Append one blank per byte of `span`, keeping `\n` and `\r` so that line
/// structure survives when a span crosses a line break.
fn blank_into(out: &mut String, span: &[u8]) {
    out.extend(span.iter().map(|&b| match b {
        b'\n' => '\n',
        b'\r' => '\r',
        _ => ' ',
    }));
}

/// Replace every inline backtick span (delimiters included) with blanks.
fn blank_inline_spans(text: &str) -> String {
    // A backtick is ASCII and never occurs inside a multi-byte UTF-8
    // sequence, so scanning bytes is safe and every slice boundary used below
    // is a char boundary.
    let bytes = text.as_bytes();
    let mut out = String::with_capacity(text.len());
    let mut kept_from = 0; // start of the pending run of text copied verbatim
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'`' {
            i += 1;
            continue;
        }

        let ticks = bytes[i..].iter().take_while(|&&b| b == b'`').count();
        match find_closing_run(bytes, i + ticks, ticks) {
            Some(close_start) => {
                let end = close_start + ticks;
                out.push_str(&text[kept_from..i]);
                blank_into(&mut out, &bytes[i..end]);
                kept_from = end;
                i = end;
            }
            // No closer: this backtick stays literal; retry from the next byte.
            None => i += 1,
        }
    }

    out.push_str(&text[kept_from..]);
    out
}
133
134/// Build the parser registry from config.
135/// Returns a list of boxed parsers ready to run.
136pub fn build_parsers(
137    parsers_config: &HashMap<String, ParserConfig>,
138    config_dir: Option<&std::path::Path>,
139    root: &std::path::Path,
140) -> Vec<Box<dyn Parser>> {
141    let mut parsers: Vec<Box<dyn Parser>> = Vec::new();
142
143    for (name, config) in parsers_config {
144        let file_filter = build_file_filter(&config.files, name);
145
146        if let Some(ref command) = config.command {
147            // Custom parser
148            let resolved_command = if let Some(dir) = config_dir {
149                let cmd_path = dir.join(command);
150                if cmd_path.exists() {
151                    cmd_path.to_string_lossy().to_string()
152                } else {
153                    command.clone()
154                }
155            } else {
156                command.clone()
157            };
158
159            parsers.push(Box::new(custom::CustomParser {
160                parser_name: name.clone(),
161                file_filter,
162                command: resolved_command,
163                timeout_ms: config.timeout.unwrap_or(5000),
164                scope_dir: root.to_path_buf(),
165                options: config.options.clone(),
166            }));
167        } else {
168            // Built-in parser
169            match name.as_str() {
170                "markdown" => {
171                    parsers.push(Box::new(markdown::MarkdownParser { file_filter }));
172                }
173                "frontmatter" => {
174                    parsers.push(Box::new(frontmatter::FrontmatterParser { file_filter }));
175                }
176                _ => {
177                    eprintln!(
178                        "warn: unknown built-in parser \"{name}\" (use 'command' field for custom parsers)"
179                    );
180                }
181            }
182        }
183    }
184
185    parsers
186}