Skip to main content

mdvault_core/vault/
extractor.rs

1//! Note content extraction: links, title, type, frontmatter.
2
3use std::path::Path;
4use std::sync::LazyLock;
5
6use regex::Regex;
7
8use crate::frontmatter::{self, Frontmatter};
9use crate::index::types::{LinkType, NoteType};
10
11/// Extracted information from a note file.
12#[derive(Debug, Clone)]
13pub struct ExtractedNote {
14    /// Note title (from frontmatter, first heading, or filename).
15    pub title: String,
16    /// Note type from frontmatter `type:` field.
17    pub note_type: NoteType,
18    /// Frontmatter as JSON string (if present).
19    pub frontmatter_json: Option<String>,
20    /// All links found in the document.
21    pub links: Vec<ExtractedLink>,
22}
23
24/// A link extracted from a note.
25#[derive(Debug, Clone)]
26pub struct ExtractedLink {
27    /// Target path/name (raw, as written in the link).
28    pub target: String,
29    /// Display text (alias for wikilinks, text for markdown links).
30    pub text: Option<String>,
31    /// Type of link.
32    pub link_type: LinkType,
33    /// Line number where link appears (1-based).
34    pub line_number: u32,
35    /// Context text (surrounding content).
36    pub context: Option<String>,
37}
38
39// Regex patterns for link extraction
40static WIKILINK_RE: LazyLock<Regex> = LazyLock::new(|| {
41    // Matches [[target]] or [[target|alias]]
42    // Also handles [[target#section]] and [[target#section|alias]]
43    Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").unwrap()
44});
45
46static MARKDOWN_LINK_RE: LazyLock<Regex> = LazyLock::new(|| {
47    // Matches [text](url) - captures .md files and relative paths
48    // Excludes http:// and https:// URLs
49    Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap()
50});
51
52/// Extract note information from file content.
53pub fn extract_note(content: &str, file_path: &Path) -> ExtractedNote {
54    // Parse frontmatter
55    let parsed = frontmatter::parse(content).unwrap_or_else(|_| {
56        crate::frontmatter::ParsedDocument {
57            frontmatter: None,
58            body: content.to_string(),
59        }
60    });
61
62    // Extract note type from frontmatter
63    let note_type = parsed
64        .frontmatter
65        .as_ref()
66        .and_then(|fm| fm.fields.get("type"))
67        .and_then(|v| v.as_str())
68        .map(|s| s.parse().unwrap_or_default())
69        .unwrap_or_default();
70
71    // Extract title: frontmatter > first heading > filename
72    let title = extract_title(&parsed.frontmatter, &parsed.body, file_path);
73
74    // Serialize frontmatter to JSON
75    let frontmatter_json = parsed
76        .frontmatter
77        .as_ref()
78        .map(|fm| serde_json::to_string(&fm.fields).unwrap_or_default());
79
80    // Extract links from body
81    let mut links = extract_links(&parsed.body);
82
83    // Extract frontmatter references (project:, parent:, etc.)
84    let fm_links = extract_frontmatter_links(&parsed.frontmatter);
85    links.extend(fm_links);
86
87    ExtractedNote { title, note_type, frontmatter_json, links }
88}
89
90fn extract_title(fm: &Option<Frontmatter>, body: &str, file_path: &Path) -> String {
91    // Try frontmatter title
92    if let Some(fm) = fm
93        && let Some(title) = fm.fields.get("title").and_then(|v| v.as_str())
94    {
95        return title.to_string();
96    }
97
98    // Try first heading
99    for line in body.lines() {
100        let trimmed = line.trim();
101        if let Some(heading) = trimmed.strip_prefix('#') {
102            let heading = heading.trim_start_matches('#').trim();
103            if !heading.is_empty() {
104                return heading.to_string();
105            }
106        }
107    }
108
109    // Fall back to filename without extension
110    file_path.file_stem().and_then(|s| s.to_str()).unwrap_or("Untitled").to_string()
111}
112
113fn extract_links(body: &str) -> Vec<ExtractedLink> {
114    let mut links = Vec::new();
115
116    for (line_num, line) in body.lines().enumerate() {
117        let line_number = (line_num + 1) as u32;
118
119        // Extract wikilinks
120        for cap in WIKILINK_RE.captures_iter(line) {
121            let target = cap.get(1).map(|m| m.as_str()).unwrap_or("");
122            let alias = cap.get(2).map(|m| m.as_str().to_string());
123
124            links.push(ExtractedLink {
125                target: target.to_string(),
126                text: alias,
127                link_type: LinkType::Wikilink,
128                line_number,
129                context: Some(truncate_context(line, 100)),
130            });
131        }
132
133        // Extract markdown links to local files
134        for cap in MARKDOWN_LINK_RE.captures_iter(line) {
135            let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
136            let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
137
138            // Skip external URLs
139            if url.starts_with("http://") || url.starts_with("https://") {
140                continue;
141            }
142
143            // Skip non-markdown links (images, etc.) unless they're relative paths
144            if !url.ends_with(".md") && !is_likely_note_reference(url) {
145                continue;
146            }
147
148            links.push(ExtractedLink {
149                target: url.to_string(),
150                text: Some(text.to_string()),
151                link_type: LinkType::Markdown,
152                line_number,
153                context: Some(truncate_context(line, 100)),
154            });
155        }
156    }
157
158    links
159}
160
/// Decide whether a markdown-link URL without a `.md` suffix plausibly names
/// a note.
///
/// A URL qualifies when its final path segment has no file extension, e.g.
/// `note`, `folder/note`, `./note` or `../projects/alpha`. Anything whose
/// final segment contains a dot (images, PDFs, other assets, and also `.` /
/// `..` themselves) is rejected, as is any URL containing a `://` scheme
/// separator. `.md` targets are not handled here; `extract_links` accepts
/// them before consulting this function.
fn is_likely_note_reference(url: &str) -> bool {
    // `ftp://host/file`, `file:///x/y` and friends are never vault notes, even
    // though their last segment may lack an extension.
    if url.contains("://") {
        return false;
    }

    // Only the last path segment can carry a file extension. Dots in `./`,
    // `../` or in directory names say nothing about the target's type, so
    // they must not disqualify an extension-less relative path.
    let last_segment = url.trim_end_matches('/').rsplit('/').next().unwrap_or("");

    // No dot in the final segment means no extension: treat it as a note name.
    !last_segment.contains('.')
}
182
183fn extract_frontmatter_links(fm: &Option<Frontmatter>) -> Vec<ExtractedLink> {
184    let mut links = Vec::new();
185
186    let fm = match fm {
187        Some(fm) => fm,
188        None => return links,
189    };
190
191    // Known reference fields
192    let ref_fields = ["project", "parent", "related", "blocks", "blocked_by"];
193
194    for field in &ref_fields {
195        if let Some(value) = fm.fields.get(*field) {
196            // Handle single string value
197            if let Some(s) = value.as_str() {
198                links.push(ExtractedLink {
199                    target: s.to_string(),
200                    text: Some(format!("{}: {}", field, s)),
201                    link_type: LinkType::Frontmatter,
202                    line_number: 0, // Frontmatter doesn't have meaningful line numbers
203                    context: None,
204                });
205            }
206            // Handle array of strings
207            if let Some(arr) = value.as_sequence() {
208                for item in arr {
209                    if let Some(s) = item.as_str() {
210                        links.push(ExtractedLink {
211                            target: s.to_string(),
212                            text: Some(format!("{}: {}", field, s)),
213                            link_type: LinkType::Frontmatter,
214                            line_number: 0,
215                            context: None,
216                        });
217                    }
218                }
219            }
220        }
221    }
222
223    links
224}
225
/// Shorten `line` for use as link context.
///
/// Returns `line` unchanged when it is at most `max_len` bytes long. Otherwise
/// returns the longest prefix of at most `max_len` bytes that ends on a
/// character boundary, followed by `...`.
fn truncate_context(line: &str, max_len: usize) -> String {
    if line.len() <= max_len {
        return line.to_string();
    }

    // Slicing a `str` in the middle of a multi-byte character panics, so back
    // up to the nearest boundary at or below `max_len`. Index 0 is always a
    // boundary, which guarantees the loop terminates without underflow.
    let mut end = max_len;
    while !line.is_char_boundary(end) {
        end -= 1;
    }

    format!("{}...", &line[..end])
}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the extractor on `src` as if it had been read from `file`.
    fn extract(src: &str, file: &str) -> ExtractedNote {
        extract_note(src, Path::new(file))
    }

    /// Plain, aliased and path-style wikilinks are all found, in order.
    #[test]
    fn test_extract_wikilinks() {
        let src = concat!(
            "---\n",
            "title: Test Note\n",
            "---\n",
            "# Heading\n",
            "\n",
            "This links to [[other-note]] and [[another|with alias]].\n",
            "Also [[path/to/note]] works.\n",
        );
        let extracted = extract(src, "test.md");

        let targets: Vec<&str> = extracted.links.iter().map(|l| l.target.as_str()).collect();
        assert_eq!(targets, ["other-note", "another", "path/to/note"]);

        let plain = &extracted.links[0];
        assert!(plain.text.is_none());
        assert_eq!(plain.link_type, LinkType::Wikilink);

        let aliased = &extracted.links[1];
        assert_eq!(aliased.text.as_deref(), Some("with alias"));
    }

    /// Only the local `.md` link survives; external URLs and images are dropped.
    #[test]
    fn test_extract_markdown_links() {
        let src = concat!(
            "# Note\n",
            "\n",
            "See [this note](./other.md) for details.\n",
            "Also [external](https://example.com) should be skipped.\n",
            "And [image](./pic.png) should be skipped too.\n",
        );
        let extracted = extract(src, "test.md");

        assert_eq!(extracted.links.len(), 1);
        let only = &extracted.links[0];
        assert_eq!(only.target, "./other.md");
        assert_eq!(only.text.as_deref(), Some("this note"));
        assert_eq!(only.link_type, LinkType::Markdown);
    }

    /// Scalar and list-valued reference fields both become frontmatter links.
    #[test]
    fn test_extract_frontmatter_links() {
        let src = concat!(
            "---\n",
            "title: Task\n",
            "type: task\n",
            "project: my-project\n",
            "related:\n",
            "  - note-a\n",
            "  - note-b\n",
            "---\n",
            "# Task content\n",
        );
        let extracted = extract(src, "task.md");

        let targets: Vec<&str> = extracted
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Frontmatter)
            .map(|l| l.target.as_str())
            .collect();

        assert_eq!(targets.len(), 3);
        for expected in ["my-project", "note-a", "note-b"] {
            assert!(targets.contains(&expected));
        }
    }

    /// A frontmatter title takes precedence over the first heading.
    #[test]
    fn test_extract_title_from_frontmatter() {
        let src = concat!("---\n", "title: My Title\n", "---\n", "# Heading\n");
        assert_eq!(extract(src, "file.md").title, "My Title");
    }

    /// Without frontmatter the first heading supplies the title.
    #[test]
    fn test_extract_title_from_heading() {
        let extracted = extract("# First Heading\n\nContent here.", "file.md");
        assert_eq!(extracted.title, "First Heading");
    }

    /// With neither frontmatter nor heading the file stem is used.
    #[test]
    fn test_extract_title_from_filename() {
        let extracted = extract("No frontmatter, no heading.", "my-note.md");
        assert_eq!(extracted.title, "my-note");
    }

    /// The `type:` field is parsed into the matching `NoteType`.
    #[test]
    fn test_extract_note_type() {
        let src = concat!("---\n", "type: task\n", "---\n", "# Task\n");
        assert_eq!(extract(src, "task.md").note_type, NoteType::Task);
    }

    /// A note without frontmatter gets the default note type.
    #[test]
    fn test_extract_note_type_default() {
        let extracted = extract("# Just a note", "note.md");
        assert_eq!(extracted.note_type, NoteType::None);
    }

    /// Line numbers are 1-based and follow the link's position in the body.
    #[test]
    fn test_line_numbers() {
        let src = concat!(
            "Line 1\n",
            "Line 2 with [[link1]]\n",
            "Line 3\n",
            "Line 4 with [[link2]]\n",
        );
        let extracted = extract(src, "test.md");

        let line_numbers: Vec<u32> = extracted.links.iter().map(|l| l.line_number).collect();
        assert_eq!(line_numbers, [2, 4]);
    }

    /// A `#section` suffix stays part of the wikilink target.
    #[test]
    fn test_wikilink_with_section() {
        let extracted = extract("Link to [[note#section]] here.", "test.md");

        assert_eq!(extracted.links.len(), 1);
        assert_eq!(extracted.links[0].target, "note#section");
    }
}