turbovault_parser/lib.rs
1//! # TurboVault Parser
2//!
3//! Obsidian Flavored Markdown (OFM) parser built on `pulldown-cmark`.
4//!
5//! This crate provides:
6//! - Fast markdown parsing via `pulldown-cmark` (CommonMark foundation)
7//! - Frontmatter extraction (YAML via pulldown-cmark metadata blocks)
8//! - Obsidian-specific syntax: wikilinks, embeds, callouts, tags
9//! - **Code block awareness**: patterns inside code blocks/inline code are excluded
10//! - Link extraction and resolution
11//! - **Standalone parsing without vault context** (for tools like treemd)
12//!
13//! ## Architecture
14//!
//! The parser uses a hybrid two-phase approach via a unified `ParseEngine`:
16//!
17//! ### Phase 1: pulldown-cmark pass
18//! - Extracts CommonMark elements: headings, markdown links, tasks, frontmatter
19//! - Builds excluded ranges (code blocks, inline code, HTML) for Phase 2
20//!
21//! ### Phase 2: Regex pass (OFM extensions)
22//! - Parses Obsidian-specific syntax: wikilinks `[[]]`, embeds `![[]]`, tags `#tag`, callouts
23//! - **Skips excluded ranges** to avoid matching inside code blocks
24//!
25//! ### Performance optimizations
26//! - Builds a `LineIndex` once for O(log n) position lookups
27//! - Uses fast pre-filters to skip regex when patterns aren't present
28//!
29//! ## Quick Start
30//!
31//! ### With Vault Context
32//!
33//! ```
34//! use turbovault_parser::Parser;
35//! use std::path::PathBuf;
36//!
37//! let content = r#"---
38//! title: My Note
39//! tags: [important, review]
40//! ---
41//!
42//! # Heading
43//!
44//! [[WikiLink]] and [[Other Note#Heading]].
45//!
46//! - [x] Completed task
47//! - [ ] Pending task
48//! "#;
49//!
50//! let vault_path = PathBuf::from("/vault");
51//! let parser = Parser::new(vault_path);
52//!
53//! let path = PathBuf::from("my-note.md");
54//! if let Ok(result) = parser.parse_file(&path, content) {
55//! // Access parsed components
56//! if let Some(frontmatter) = &result.frontmatter {
57//! println!("Frontmatter data: {:?}", frontmatter.data);
58//! }
59//! println!("Links: {}", result.links.len());
60//! println!("Tasks: {}", result.tasks.len());
61//! }
62//! ```
63//!
64//! ### Standalone Parsing (No Vault Required)
65//!
66//! ```
67//! use turbovault_parser::{ParsedContent, ParseOptions};
68//!
69//! let content = "# Title\n\n[[WikiLink]] and [markdown](url) with #tag";
70//!
71//! // Parse everything
72//! let parsed = ParsedContent::parse(content);
73//! assert_eq!(parsed.wikilinks.len(), 1);
74//! assert_eq!(parsed.markdown_links.len(), 1);
75//! assert_eq!(parsed.tags.len(), 1);
76//!
77//! // Or parse selectively for better performance
78//! let parsed = ParsedContent::parse_with_options(content, ParseOptions::links_only());
79//! ```
80//!
81//! ### Individual Parsers (Granular Control)
82//!
83//! ```
84//! use turbovault_parser::{parse_wikilinks, parse_tags, parse_callouts};
85//!
86//! let content = "[[Link]] with #tag and > [!NOTE] callout";
87//!
88//! let wikilinks = parse_wikilinks(content);
89//! let tags = parse_tags(content);
90//! let callouts = parse_callouts(content);
91//! ```
92//!
93//! ## Supported OFM Features
94//!
95//! ### Links
96//! - Wikilinks: `[[Note]]`
97//! - Aliases: `[[Note|Alias]]`
98//! - Block references: `[[Note#^blockid]]`
99//! - Heading references: `[[Note#Heading]]`
100//! - Embeds: `![[Note]]`
101//! - Markdown links: `[text](url)`
102//!
103//! ### Frontmatter
104//! YAML frontmatter between `---` delimiters is extracted and parsed.
105//!
106//! ### Elements
107//! - **Headings**: H1-H6 with level tracking
108//! - **Tasks**: Markdown checkboxes with completion status
109//! - **Tags**: Inline tags like `#important`
110//! - **Callouts**: Obsidian callout syntax `> [!TYPE]` with multi-line content
111//!
112//! ## Performance
113//!
114//! The parser uses:
115//! - `pulldown-cmark` for CommonMark parsing + code block detection (O(n) linear time)
116//! - `std::sync::LazyLock` for compiled regex patterns (Rust 1.80+)
117//! - `LineIndex` for O(log n) position lookups via binary search
118//! - Fast pre-filters to skip regex when patterns aren't present
119//! - Excluded range tracking to avoid parsing inside code blocks
120
121// Core modules
122mod blocks;
123mod engine;
124pub mod parsers;
125mod standalone;
126
127// Main exports
128pub use parsers::Parser;
129pub use standalone::{ParseOptions, ParsedContent};
130
131// Re-export frontmatter extraction (deprecated but kept for backwards compatibility)
132#[allow(deprecated)]
133pub use parsers::frontmatter_parser::extract_frontmatter;
134
135// Block-level parsing (for treemd integration)
136pub use blocks::{parse_blocks, parse_blocks_from_line, slugify, to_plain_text};
137
138// Re-export core types for consumers (no need to depend on turbovault-core separately)
139pub use turbovault_core::{
140 ContentBlock, InlineElement, LineIndex, LinkType, ListItem, SourcePosition, TableAlignment,
141};
142
143// ============================================================================
144// Simplified Public API - Individual Parser Functions
145// ============================================================================
146//
147// These functions provide granular parsing when you only need specific elements.
148// They all use the unified engine internally with LineIndex for efficient position tracking.
149
150/// Parse wikilinks from content.
151///
152/// Returns links with empty `source_file`. Use `Parser::parse_file()` for vault-aware parsing.
153///
154/// # Example
155/// ```
156/// use turbovault_parser::parse_wikilinks;
157///
158/// let links = parse_wikilinks("See [[Note]] and [[Other|alias]]");
159/// assert_eq!(links.len(), 2);
160/// assert_eq!(links[0].target, "Note");
161/// ```
162pub fn parse_wikilinks(content: &str) -> Vec<turbovault_core::Link> {
163 let engine = engine::ParseEngine::new(content);
164 let opts = ParseOptions {
165 parse_wikilinks: true,
166 ..ParseOptions::none()
167 };
168 engine.parse(&opts).wikilinks
169}
170
171/// Parse embeds from content.
172///
173/// # Example
174/// ```
175/// use turbovault_parser::parse_embeds;
176///
177/// let embeds = parse_embeds("![[image.png]] and ![[Note]]");
178/// assert_eq!(embeds.len(), 2);
179/// ```
180pub fn parse_embeds(content: &str) -> Vec<turbovault_core::Link> {
181 let engine = engine::ParseEngine::new(content);
182 let opts = ParseOptions {
183 parse_wikilinks: true, // Embeds are parsed with wikilinks
184 ..ParseOptions::none()
185 };
186 engine.parse(&opts).embeds
187}
188
189/// Parse markdown links from content.
190///
191/// # Example
192/// ```
193/// use turbovault_parser::parse_markdown_links;
194///
195/// let links = parse_markdown_links("[text](url) and [other](http://example.com)");
196/// assert_eq!(links.len(), 2);
197/// ```
198pub fn parse_markdown_links(content: &str) -> Vec<turbovault_core::Link> {
199 let engine = engine::ParseEngine::new(content);
200 let opts = ParseOptions {
201 parse_markdown_links: true,
202 ..ParseOptions::none()
203 };
204 engine.parse(&opts).markdown_links
205}
206
207/// Parse tags from content.
208///
209/// # Example
210/// ```
211/// use turbovault_parser::parse_tags;
212///
213/// let tags = parse_tags("Has #tag and #nested/tag");
214/// assert_eq!(tags.len(), 2);
215/// assert!(tags[1].is_nested);
216/// ```
217pub fn parse_tags(content: &str) -> Vec<turbovault_core::Tag> {
218 let engine = engine::ParseEngine::new(content);
219 let opts = ParseOptions {
220 parse_tags: true,
221 ..ParseOptions::none()
222 };
223 engine.parse(&opts).tags
224}
225
226/// Parse headings from content.
227///
228/// # Example
229/// ```
230/// use turbovault_parser::parse_headings;
231///
232/// let headings = parse_headings("# H1\n## H2\n### H3");
233/// assert_eq!(headings.len(), 3);
234/// assert_eq!(headings[0].level, 1);
235/// ```
236pub fn parse_headings(content: &str) -> Vec<turbovault_core::Heading> {
237 let engine = engine::ParseEngine::new(content);
238 let opts = ParseOptions {
239 parse_headings: true,
240 ..ParseOptions::none()
241 };
242 engine.parse(&opts).headings
243}
244
245/// Parse tasks from content.
246///
247/// # Example
248/// ```
249/// use turbovault_parser::parse_tasks;
250///
251/// let tasks = parse_tasks("- [ ] Todo\n- [x] Done");
252/// assert_eq!(tasks.len(), 2);
253/// assert!(!tasks[0].is_completed);
254/// assert!(tasks[1].is_completed);
255/// ```
256pub fn parse_tasks(content: &str) -> Vec<turbovault_core::TaskItem> {
257 let engine = engine::ParseEngine::new(content);
258 let opts = ParseOptions {
259 parse_tasks: true,
260 ..ParseOptions::none()
261 };
262 engine.parse(&opts).tasks
263}
264
265/// Parse callouts from content (header only, no multi-line content).
266///
267/// # Example
268/// ```
269/// use turbovault_parser::parse_callouts;
270///
271/// let callouts = parse_callouts("> [!NOTE] Title\n> Content");
272/// assert_eq!(callouts.len(), 1);
273/// ```
274pub fn parse_callouts(content: &str) -> Vec<turbovault_core::Callout> {
275 let engine = engine::ParseEngine::new(content);
276 let opts = ParseOptions {
277 parse_callouts: true,
278 full_callouts: false,
279 ..ParseOptions::none()
280 };
281 engine.parse(&opts).callouts
282}
283
284/// Parse callouts with full multi-line content extraction.
285///
286/// # Example
287/// ```
288/// use turbovault_parser::parse_callouts_full;
289///
290/// let callouts = parse_callouts_full("> [!NOTE] Title\n> Line 1\n> Line 2");
291/// assert_eq!(callouts[0].content, "Line 1\nLine 2");
292/// ```
293pub fn parse_callouts_full(content: &str) -> Vec<turbovault_core::Callout> {
294 let engine = engine::ParseEngine::new(content);
295 let opts = ParseOptions {
296 parse_callouts: true,
297 full_callouts: true,
298 ..ParseOptions::none()
299 };
300 engine.parse(&opts).callouts
301}
302
303/// Convenient prelude for common imports.
304///
305/// Includes core types, the main parser, standalone parsing API, and all parser functions.
306pub mod prelude {
307 // Core types from turbovault-core
308 pub use turbovault_core::{
309 Callout, CalloutType, ContentBlock, Frontmatter, Heading, InlineElement, LineIndex, Link,
310 LinkType, ListItem, SourcePosition, TableAlignment, Tag, TaskItem,
311 };
312
313 // Main parser
314 pub use crate::Parser;
315
316 // Standalone parsing API
317 pub use crate::{ParseOptions, ParsedContent};
318
319 // Individual parsers
320 #[allow(deprecated)]
321 pub use crate::{
322 extract_frontmatter, parse_blocks, parse_blocks_from_line, parse_callouts,
323 parse_callouts_full, parse_embeds, parse_headings, parse_markdown_links, parse_tags,
324 parse_tasks, parse_wikilinks, slugify, to_plain_text,
325 };
326}