Skip to main content

turbovault_parser/
lib.rs

1//! # TurboVault Parser
2//!
3//! Obsidian Flavored Markdown (OFM) parser built on `pulldown-cmark`.
4//!
5//! This crate provides:
6//! - Fast markdown parsing via `pulldown-cmark` (CommonMark foundation)
7//! - Frontmatter extraction (YAML via pulldown-cmark metadata blocks)
8//! - Obsidian-specific syntax: wikilinks, embeds, callouts, tags
9//! - **Code block awareness**: patterns inside code blocks/inline code are excluded
10//! - Link extraction and resolution
11//! - **Standalone parsing without vault context** (for tools like treemd)
12//!
13//! ## Architecture
14//!
15//! The parser uses a hybrid two-phase approach via unified `ParseEngine`:
16//!
17//! ### Phase 1: pulldown-cmark pass
18//! - Extracts CommonMark elements: headings, markdown links, tasks, frontmatter
19//! - Builds excluded ranges (code blocks, inline code, HTML) for Phase 2
20//!
21//! ### Phase 2: Regex pass (OFM extensions)
22//! - Parses Obsidian-specific syntax: wikilinks `[[]]`, embeds `![[]]`, tags `#tag`, callouts
23//! - **Skips excluded ranges** to avoid matching inside code blocks
24//!
25//! ### Performance optimizations
26//! - Builds a `LineIndex` once for O(log n) position lookups
27//! - Uses fast pre-filters to skip regex when patterns aren't present
28//!
29//! ## Quick Start
30//!
31//! ### With Vault Context
32//!
33//! ```
34//! use turbovault_parser::Parser;
35//! use std::path::PathBuf;
36//!
37//! let content = r#"---
38//! title: My Note
39//! tags: [important, review]
40//! ---
41//!
42//! # Heading
43//!
44//! [[WikiLink]] and [[Other Note#Heading]].
45//!
46//! - [x] Completed task
47//! - [ ] Pending task
48//! "#;
49//!
50//! let vault_path = PathBuf::from("/vault");
51//! let parser = Parser::new(vault_path);
52//!
53//! let path = PathBuf::from("my-note.md");
54//! if let Ok(result) = parser.parse_file(&path, content) {
55//!     // Access parsed components
56//!     if let Some(frontmatter) = &result.frontmatter {
57//!         println!("Frontmatter data: {:?}", frontmatter.data);
58//!     }
59//!     println!("Links: {}", result.links.len());
60//!     println!("Tasks: {}", result.tasks.len());
61//! }
62//! ```
63//!
64//! ### Standalone Parsing (No Vault Required)
65//!
66//! ```
67//! use turbovault_parser::{ParsedContent, ParseOptions};
68//!
69//! let content = "# Title\n\n[[WikiLink]] and [markdown](url) with #tag";
70//!
71//! // Parse everything
72//! let parsed = ParsedContent::parse(content);
73//! assert_eq!(parsed.wikilinks.len(), 1);
74//! assert_eq!(parsed.markdown_links.len(), 1);
75//! assert_eq!(parsed.tags.len(), 1);
76//!
77//! // Or parse selectively for better performance
78//! let parsed = ParsedContent::parse_with_options(content, ParseOptions::links_only());
79//! ```
80//!
81//! ### Individual Parsers (Granular Control)
82//!
83//! ```
84//! use turbovault_parser::{parse_wikilinks, parse_tags, parse_callouts};
85//!
//! let content = "[[Link]] with #tag\n\n> [!NOTE] callout";
87//!
88//! let wikilinks = parse_wikilinks(content);
89//! let tags = parse_tags(content);
90//! let callouts = parse_callouts(content);
91//! ```
92//!
93//! ## Supported OFM Features
94//!
95//! ### Links
96//! - Wikilinks: `[[Note]]`
97//! - Aliases: `[[Note|Alias]]`
98//! - Block references: `[[Note#^blockid]]`
99//! - Heading references: `[[Note#Heading]]`
100//! - Embeds: `![[Note]]`
101//! - Markdown links: `[text](url)`
102//!
103//! ### Frontmatter
104//! YAML frontmatter between `---` delimiters is extracted and parsed.
105//!
106//! ### Elements
107//! - **Headings**: H1-H6 with level tracking
108//! - **Tasks**: Markdown checkboxes with completion status
109//! - **Tags**: Inline tags like `#important`
110//! - **Callouts**: Obsidian callout syntax `> [!TYPE]` with multi-line content
111//!
112//! ## Performance
113//!
114//! The parser uses:
115//! - `pulldown-cmark` for CommonMark parsing + code block detection (O(n) linear time)
116//! - `std::sync::LazyLock` for compiled regex patterns (Rust 1.80+)
117//! - `LineIndex` for O(log n) position lookups via binary search
118//! - Fast pre-filters to skip regex when patterns aren't present
119//! - Excluded range tracking to avoid parsing inside code blocks
120
121// Core modules
122mod blocks;
123mod engine;
124pub mod models;
125pub mod parsers;
126mod standalone;
127
128// Main exports
129pub use models::TaskStatus;
130pub use parsers::Parser;
131pub use standalone::{ParseOptions, ParsedContent};
132
133// Re-export frontmatter extraction (deprecated but kept for backwards compatibility)
134#[allow(deprecated)]
135pub use parsers::frontmatter_parser::extract_frontmatter;
136
137// Block-level parsing (for treemd integration)
138pub use blocks::{parse_blocks, parse_blocks_from_line, slugify, to_plain_text};
139
140// Re-export core types for consumers (no need to depend on turbovault-core separately)
141pub use turbovault_core::{
142    ContentBlock, InlineElement, LineIndex, LinkType, ListItem, SourcePosition, TableAlignment,
143};
144
145// ============================================================================
146// Simplified Public API - Individual Parser Functions
147// ============================================================================
148//
149// These functions provide granular parsing when you only need specific elements.
150// They all use the unified engine internally with LineIndex for efficient position tracking.
151
152/// Parse wikilinks from content.
153///
154/// Returns links with empty `source_file`. Use `Parser::parse_file()` for vault-aware parsing.
155///
156/// # Example
157/// ```
158/// use turbovault_parser::parse_wikilinks;
159///
160/// let links = parse_wikilinks("See [[Note]] and [[Other|alias]]");
161/// assert_eq!(links.len(), 2);
162/// assert_eq!(links[0].target, "Note");
163/// ```
164pub fn parse_wikilinks(content: &str) -> Vec<turbovault_core::Link> {
165    let engine = engine::ParseEngine::new(content);
166    let opts = ParseOptions {
167        parse_wikilinks: true,
168        ..ParseOptions::none()
169    };
170    engine.parse(&opts).wikilinks
171}
172
173/// Parse embeds from content.
174///
175/// # Example
176/// ```
177/// use turbovault_parser::parse_embeds;
178///
179/// let embeds = parse_embeds("![[image.png]] and ![[Note]]");
180/// assert_eq!(embeds.len(), 2);
181/// ```
182pub fn parse_embeds(content: &str) -> Vec<turbovault_core::Link> {
183    let engine = engine::ParseEngine::new(content);
184    let opts = ParseOptions {
185        parse_wikilinks: true, // Embeds are parsed with wikilinks
186        ..ParseOptions::none()
187    };
188    engine.parse(&opts).embeds
189}
190
191/// Parse markdown links from content.
192///
193/// # Example
194/// ```
195/// use turbovault_parser::parse_markdown_links;
196///
197/// let links = parse_markdown_links("[text](url) and [other](http://example.com)");
198/// assert_eq!(links.len(), 2);
199/// ```
200pub fn parse_markdown_links(content: &str) -> Vec<turbovault_core::Link> {
201    let engine = engine::ParseEngine::new(content);
202    let opts = ParseOptions {
203        parse_markdown_links: true,
204        ..ParseOptions::none()
205    };
206    engine.parse(&opts).markdown_links
207}
208
209/// Parse tags from content.
210///
211/// # Example
212/// ```
213/// use turbovault_parser::parse_tags;
214///
215/// let tags = parse_tags("Has #tag and #nested/tag");
216/// assert_eq!(tags.len(), 2);
217/// assert!(tags[1].is_nested);
218/// ```
219pub fn parse_tags(content: &str) -> Vec<turbovault_core::Tag> {
220    let engine = engine::ParseEngine::new(content);
221    let opts = ParseOptions {
222        parse_tags: true,
223        ..ParseOptions::none()
224    };
225    engine.parse(&opts).tags
226}
227
228/// Parse headings from content.
229///
230/// # Example
231/// ```
232/// use turbovault_parser::parse_headings;
233///
234/// let headings = parse_headings("# H1\n## H2\n### H3");
235/// assert_eq!(headings.len(), 3);
236/// assert_eq!(headings[0].level, 1);
237/// ```
238pub fn parse_headings(content: &str) -> Vec<turbovault_core::Heading> {
239    let engine = engine::ParseEngine::new(content);
240    let opts = ParseOptions {
241        parse_headings: true,
242        ..ParseOptions::none()
243    };
244    engine.parse(&opts).headings
245}
246
247/// Parse tasks from content.
248///
249/// # Example
250/// ```
251/// use turbovault_parser::parse_tasks;
252///
253/// let tasks = parse_tasks("- [ ] Todo\n- [x] Done");
254/// assert_eq!(tasks.len(), 2);
255/// assert!(!tasks[0].is_completed);
256/// assert!(tasks[1].is_completed);
257/// ```
258pub fn parse_tasks(content: &str) -> Vec<turbovault_core::TaskItem> {
259    let engine = engine::ParseEngine::new(content);
260    let opts = ParseOptions {
261        parse_tasks: true,
262        ..ParseOptions::none()
263    };
264    engine.parse(&opts).tasks
265}
266
267/// Parse callouts from content (header only, no multi-line content).
268///
269/// # Example
270/// ```
271/// use turbovault_parser::parse_callouts;
272///
273/// let callouts = parse_callouts("> [!NOTE] Title\n> Content");
274/// assert_eq!(callouts.len(), 1);
275/// ```
276pub fn parse_callouts(content: &str) -> Vec<turbovault_core::Callout> {
277    let engine = engine::ParseEngine::new(content);
278    let opts = ParseOptions {
279        parse_callouts: true,
280        full_callouts: false,
281        ..ParseOptions::none()
282    };
283    engine.parse(&opts).callouts
284}
285
286/// Parse callouts with full multi-line content extraction.
287///
288/// # Example
289/// ```
290/// use turbovault_parser::parse_callouts_full;
291///
292/// let callouts = parse_callouts_full("> [!NOTE] Title\n> Line 1\n> Line 2");
293/// assert_eq!(callouts[0].content, "Line 1\nLine 2");
294/// ```
295pub fn parse_callouts_full(content: &str) -> Vec<turbovault_core::Callout> {
296    let engine = engine::ParseEngine::new(content);
297    let opts = ParseOptions {
298        parse_callouts: true,
299        full_callouts: true,
300        ..ParseOptions::none()
301    };
302    engine.parse(&opts).callouts
303}
304
305/// Convenient prelude for common imports.
306///
307/// Includes core types, the main parser, standalone parsing API, and all parser functions.
308pub mod prelude {
309    // Core types from turbovault-core
310    pub use turbovault_core::{
311        Callout, CalloutType, ContentBlock, Frontmatter, Heading, InlineElement, LineIndex, Link,
312        LinkType, ListItem, SourcePosition, TableAlignment, Tag, TaskItem,
313    };
314
315    // Main parser
316    pub use crate::Parser;
317
318    // Standalone parsing API
319    pub use crate::{ParseOptions, ParsedContent};
320
321    // Individual parsers
322    #[allow(deprecated)]
323    pub use crate::{
324        extract_frontmatter, parse_blocks, parse_blocks_from_line, parse_callouts,
325        parse_callouts_full, parse_embeds, parse_headings, parse_markdown_links, parse_tags,
326        parse_tasks, parse_wikilinks, slugify, to_plain_text,
327    };
328}