turbovault_parser/
lib.rs

1//! # TurboVault Parser
2//!
3//! Obsidian Flavored Markdown (OFM) parser built on `pulldown-cmark`.
4//!
5//! This crate provides:
6//! - Fast markdown parsing via `pulldown-cmark` (CommonMark foundation)
7//! - Frontmatter extraction (YAML via pulldown-cmark metadata blocks)
8//! - Obsidian-specific syntax: wikilinks, embeds, callouts, tags
9//! - **Code block awareness**: patterns inside code blocks/inline code are excluded
10//! - Link extraction and resolution
11//! - **Standalone parsing without vault context** (for tools like treemd)
12//!
13//! ## Architecture
14//!
//! The parser uses a hybrid two-phase approach via a unified `ParseEngine`:
16//!
17//! ### Phase 1: pulldown-cmark pass
18//! - Extracts CommonMark elements: headings, markdown links, tasks, frontmatter
19//! - Builds excluded ranges (code blocks, inline code, HTML) for Phase 2
20//!
21//! ### Phase 2: Regex pass (OFM extensions)
22//! - Parses Obsidian-specific syntax: wikilinks `[[]]`, embeds `![[]]`, tags `#tag`, callouts
23//! - **Skips excluded ranges** to avoid matching inside code blocks
24//!
25//! ### Performance optimizations
26//! - Builds a `LineIndex` once for O(log n) position lookups
27//! - Uses fast pre-filters to skip regex when patterns aren't present
28//!
29//! ## Quick Start
30//!
31//! ### With Vault Context
32//!
33//! ```
34//! use turbovault_parser::Parser;
35//! use std::path::PathBuf;
36//!
37//! let content = r#"---
38//! title: My Note
39//! tags: [important, review]
40//! ---
41//!
42//! # Heading
43//!
44//! [[WikiLink]] and [[Other Note#Heading]].
45//!
46//! - [x] Completed task
47//! - [ ] Pending task
48//! "#;
49//!
50//! let vault_path = PathBuf::from("/vault");
51//! let parser = Parser::new(vault_path);
52//!
53//! let path = PathBuf::from("my-note.md");
54//! if let Ok(result) = parser.parse_file(&path, content) {
55//!     // Access parsed components
56//!     if let Some(frontmatter) = &result.frontmatter {
57//!         println!("Frontmatter data: {:?}", frontmatter.data);
58//!     }
59//!     println!("Links: {}", result.links.len());
60//!     println!("Tasks: {}", result.tasks.len());
61//! }
62//! ```
63//!
64//! ### Standalone Parsing (No Vault Required)
65//!
66//! ```
67//! use turbovault_parser::{ParsedContent, ParseOptions};
68//!
69//! let content = "# Title\n\n[[WikiLink]] and [markdown](url) with #tag";
70//!
71//! // Parse everything
72//! let parsed = ParsedContent::parse(content);
73//! assert_eq!(parsed.wikilinks.len(), 1);
74//! assert_eq!(parsed.markdown_links.len(), 1);
75//! assert_eq!(parsed.tags.len(), 1);
76//!
77//! // Or parse selectively for better performance
78//! let parsed = ParsedContent::parse_with_options(content, ParseOptions::links_only());
79//! ```
80//!
81//! ### Individual Parsers (Granular Control)
82//!
83//! ```
84//! use turbovault_parser::{parse_wikilinks, parse_tags, parse_callouts};
85//!
//! let content = "[[Link]] with #tag\n\n> [!NOTE] callout";
87//!
88//! let wikilinks = parse_wikilinks(content);
89//! let tags = parse_tags(content);
90//! let callouts = parse_callouts(content);
91//! ```
92//!
93//! ## Supported OFM Features
94//!
95//! ### Links
96//! - Wikilinks: `[[Note]]`
97//! - Aliases: `[[Note|Alias]]`
98//! - Block references: `[[Note#^blockid]]`
99//! - Heading references: `[[Note#Heading]]`
100//! - Embeds: `![[Note]]`
101//! - Markdown links: `[text](url)`
102//!
103//! ### Frontmatter
104//! YAML frontmatter between `---` delimiters is extracted and parsed.
105//!
106//! ### Elements
107//! - **Headings**: H1-H6 with level tracking
108//! - **Tasks**: Markdown checkboxes with completion status
109//! - **Tags**: Inline tags like `#important`
110//! - **Callouts**: Obsidian callout syntax `> [!TYPE]` with multi-line content
111//!
112//! ## Performance
113//!
114//! The parser uses:
115//! - `pulldown-cmark` for CommonMark parsing + code block detection (O(n) linear time)
116//! - `std::sync::LazyLock` for compiled regex patterns (Rust 1.80+)
117//! - `LineIndex` for O(log n) position lookups via binary search
118//! - Fast pre-filters to skip regex when patterns aren't present
119//! - Excluded range tracking to avoid parsing inside code blocks
120
121// Core modules
122mod blocks;
123mod engine;
124pub mod parsers;
125mod standalone;
126
127// Main exports
128pub use parsers::Parser;
129pub use standalone::{ParseOptions, ParsedContent};
130
131// Re-export frontmatter extraction
132pub use parsers::frontmatter_parser::extract_frontmatter;
133
134// Block-level parsing (for treemd integration)
135pub use blocks::{parse_blocks, parse_blocks_from_line, slugify};
136
137// Re-export core types for consumers (no need to depend on turbovault-core separately)
138pub use turbovault_core::{
139    ContentBlock, InlineElement, LineIndex, LinkType, ListItem, SourcePosition, TableAlignment,
140};
141
142// ============================================================================
143// Simplified Public API - Individual Parser Functions
144// ============================================================================
145//
146// These functions provide granular parsing when you only need specific elements.
147// They all use the unified engine internally with LineIndex for efficient position tracking.
148
149/// Parse wikilinks from content.
150///
151/// Returns links with empty `source_file`. Use `Parser::parse_file()` for vault-aware parsing.
152///
153/// # Example
154/// ```
155/// use turbovault_parser::parse_wikilinks;
156///
157/// let links = parse_wikilinks("See [[Note]] and [[Other|alias]]");
158/// assert_eq!(links.len(), 2);
159/// assert_eq!(links[0].target, "Note");
160/// ```
161pub fn parse_wikilinks(content: &str) -> Vec<turbovault_core::Link> {
162    let engine = engine::ParseEngine::new(content);
163    let opts = ParseOptions {
164        parse_wikilinks: true,
165        ..ParseOptions::none()
166    };
167    engine.parse(&opts).wikilinks
168}
169
170/// Parse embeds from content.
171///
172/// # Example
173/// ```
174/// use turbovault_parser::parse_embeds;
175///
176/// let embeds = parse_embeds("![[image.png]] and ![[Note]]");
177/// assert_eq!(embeds.len(), 2);
178/// ```
179pub fn parse_embeds(content: &str) -> Vec<turbovault_core::Link> {
180    let engine = engine::ParseEngine::new(content);
181    let opts = ParseOptions {
182        parse_wikilinks: true, // Embeds are parsed with wikilinks
183        ..ParseOptions::none()
184    };
185    engine.parse(&opts).embeds
186}
187
188/// Parse markdown links from content.
189///
190/// # Example
191/// ```
192/// use turbovault_parser::parse_markdown_links;
193///
194/// let links = parse_markdown_links("[text](url) and [other](http://example.com)");
195/// assert_eq!(links.len(), 2);
196/// ```
197pub fn parse_markdown_links(content: &str) -> Vec<turbovault_core::Link> {
198    let engine = engine::ParseEngine::new(content);
199    let opts = ParseOptions {
200        parse_markdown_links: true,
201        ..ParseOptions::none()
202    };
203    engine.parse(&opts).markdown_links
204}
205
206/// Parse tags from content.
207///
208/// # Example
209/// ```
210/// use turbovault_parser::parse_tags;
211///
212/// let tags = parse_tags("Has #tag and #nested/tag");
213/// assert_eq!(tags.len(), 2);
214/// assert!(tags[1].is_nested);
215/// ```
216pub fn parse_tags(content: &str) -> Vec<turbovault_core::Tag> {
217    let engine = engine::ParseEngine::new(content);
218    let opts = ParseOptions {
219        parse_tags: true,
220        ..ParseOptions::none()
221    };
222    engine.parse(&opts).tags
223}
224
225/// Parse headings from content.
226///
227/// # Example
228/// ```
229/// use turbovault_parser::parse_headings;
230///
231/// let headings = parse_headings("# H1\n## H2\n### H3");
232/// assert_eq!(headings.len(), 3);
233/// assert_eq!(headings[0].level, 1);
234/// ```
235pub fn parse_headings(content: &str) -> Vec<turbovault_core::Heading> {
236    let engine = engine::ParseEngine::new(content);
237    let opts = ParseOptions {
238        parse_headings: true,
239        ..ParseOptions::none()
240    };
241    engine.parse(&opts).headings
242}
243
244/// Parse tasks from content.
245///
246/// # Example
247/// ```
248/// use turbovault_parser::parse_tasks;
249///
250/// let tasks = parse_tasks("- [ ] Todo\n- [x] Done");
251/// assert_eq!(tasks.len(), 2);
252/// assert!(!tasks[0].is_completed);
253/// assert!(tasks[1].is_completed);
254/// ```
255pub fn parse_tasks(content: &str) -> Vec<turbovault_core::TaskItem> {
256    let engine = engine::ParseEngine::new(content);
257    let opts = ParseOptions {
258        parse_tasks: true,
259        ..ParseOptions::none()
260    };
261    engine.parse(&opts).tasks
262}
263
264/// Parse callouts from content (header only, no multi-line content).
265///
266/// # Example
267/// ```
268/// use turbovault_parser::parse_callouts;
269///
270/// let callouts = parse_callouts("> [!NOTE] Title\n> Content");
271/// assert_eq!(callouts.len(), 1);
272/// ```
273pub fn parse_callouts(content: &str) -> Vec<turbovault_core::Callout> {
274    let engine = engine::ParseEngine::new(content);
275    let opts = ParseOptions {
276        parse_callouts: true,
277        full_callouts: false,
278        ..ParseOptions::none()
279    };
280    engine.parse(&opts).callouts
281}
282
283/// Parse callouts with full multi-line content extraction.
284///
285/// # Example
286/// ```
287/// use turbovault_parser::parse_callouts_full;
288///
289/// let callouts = parse_callouts_full("> [!NOTE] Title\n> Line 1\n> Line 2");
290/// assert_eq!(callouts[0].content, "Line 1\nLine 2");
291/// ```
292pub fn parse_callouts_full(content: &str) -> Vec<turbovault_core::Callout> {
293    let engine = engine::ParseEngine::new(content);
294    let opts = ParseOptions {
295        parse_callouts: true,
296        full_callouts: true,
297        ..ParseOptions::none()
298    };
299    engine.parse(&opts).callouts
300}
301
302/// Convenient prelude for common imports.
303///
304/// Includes core types, the main parser, standalone parsing API, and all parser functions.
305pub mod prelude {
306    // Core types from turbovault-core
307    pub use turbovault_core::{
308        Callout, CalloutType, ContentBlock, Frontmatter, Heading, InlineElement, LineIndex, Link,
309        LinkType, ListItem, SourcePosition, TableAlignment, Tag, TaskItem,
310    };
311
312    // Main parser
313    pub use crate::Parser;
314
315    // Standalone parsing API
316    pub use crate::{ParseOptions, ParsedContent};
317
318    // Individual parsers
319    pub use crate::{
320        extract_frontmatter, parse_blocks, parse_blocks_from_line, parse_callouts,
321        parse_callouts_full, parse_embeds, parse_headings, parse_markdown_links, parse_tags,
322        parse_tasks, parse_wikilinks, slugify,
323    };
324}