arinamcnulty_markdown_parser/
lib.rs

1//! # Markdown Parser Library
2//!
3//! This library provides comprehensive Markdown parsing capabilities with HTML conversion.
//! It uses a Pest grammar for efficient parsing and supports various Markdown elements.
5//!
6//! ## Features
7//!
8//! - Full Markdown syntax support (headings, paragraphs, links, images, formatting)
9//! - Robust error handling with custom error types
10//! - HTML output generation with proper escaping
11//! - Command-line interface integration
12//!
13//! ## Example
14//!
15//! ```rust
16//! use arinamcnulty_markdown_parser::{parse_markdown, str_to_html};
17//!
18//! let markdown = "# Hello World\n\nThis is **bold** text.";
19//! let html = str_to_html(markdown).unwrap();
20//! println!("{}", html.join("\n"));
21//! ```
22
23use std::{
24    fs::{File, OpenOptions},
25    io::{BufRead, BufReader, Write},
26    path::Path,
27};
28
29use pest::{
30    Parser,
31    iterators::{Pair, Pairs},
32};
33use pest_derive::Parser;
34
/// Custom error type for markdown parsing operations.
/// Provides detailed error information for different failure scenarios.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum MarkdownError {
    /// Parsing or HTML conversion failed; the payload is a human-readable message.
    #[error("Parsing failed: {0}")]
    ParseError(String),

    /// An I/O operation failed. `#[from]` lets `?` convert a `std::io::Error`
    /// into this variant automatically.
    #[error("File operation failed: {0}")]
    IoError(#[from] std::io::Error),
}
45
/// Markdown parser whose implementation is derived by `pest_derive` from the
/// grammar file `grammar.pest`.
///
/// The derive also produces the `Rule` enum that the conversion functions in
/// this file match on.
#[derive(Parser)]
#[grammar = "grammar.pest"]
pub struct MarkdownParser;
49
50/// Main parsing function that processes markdown input.
51/// Returns parsed syntax tree or error if parsing fails.
52///
53/// # Arguments
54/// * `input` - Raw markdown text as string slice
55///
56/// # Returns
57/// Result containing parsed pairs or MarkdownError
58pub fn parse_markdown(input: &str) -> Result<Pairs<'_, Rule>, MarkdownError> {
59    MarkdownParser::parse(Rule::document_structure, input)
60        .map_err(|e| MarkdownError::ParseError(e.to_string()))
61}
62
63/// Convert markdown string to vector of HTML strings.
64/// Each element represents one HTML line/tag.
65///
66/// # Arguments
67/// * `input` - Markdown text to convert
68///
69/// # Returns
70/// Vector of HTML strings or MarkdownError
71pub fn str_to_html(input: &str) -> Result<Vec<String>, MarkdownError> {
72    let mut parsed = parse_markdown(input)?;
73    let document = parsed
74        .next()
75        .ok_or_else(|| MarkdownError::ParseError("Empty document".to_string()))?;
76
77    let results: Result<Vec<String>, MarkdownError> = document
78        .into_inner()
79        .filter(|pair| !matches!(pair.as_rule(), Rule::EOI))
80        .map(convert_to_html)
81        .collect();
82
83    results
84}
85
86/// Convert a single parsed rule to HTML representation.
87/// This is the core conversion dispatcher for different markdown elements.
88///
89/// # Arguments
90/// * `pair` - Pest Pair representing parsed rule
91///
92/// # Returns
93/// HTML string or MarkdownError
94fn convert_to_html(pair: Pair<Rule>) -> Result<String, MarkdownError> {
95    match pair.as_rule() {
96        Rule::document_block => {
97            let inner = pair.into_inner().next().unwrap();
98            convert_to_html(inner)
99        }
100        Rule::document_heading => process_document_heading(pair),
101        Rule::h1_heading | Rule::h2_heading | Rule::h3_heading => process_heading(pair),
102        Rule::document_paragraph => process_document_paragraph(pair),
103        Rule::document_quote => process_document_quote(pair),
104        Rule::quote_line => process_quote_line(pair),
105        Rule::paragraph_text => process_paragraph_text(pair),
106        Rule::document_unordered_list => process_unordered_list(pair),
107        Rule::document_ordered_list => process_ordered_list(pair),
108        Rule::unordered_list_item => process_list_item(pair),
109        Rule::ordered_list_item => process_list_item(pair),
110        Rule::code_fence => process_code_fence(pair),
111        Rule::thematic_break => Ok("<hr>".to_string()),
112        Rule::blank_line => Ok("<br>".to_string()),
113        Rule::EOI => Ok(String::new()),
114        _ => Err(MarkdownError::ParseError(format!(
115            "Unknown rule: {:?}",
116            pair.as_rule()
117        ))),
118    }
119}
120
121/// Process document heading container.
122fn process_document_heading(pair: Pair<Rule>) -> Result<String, MarkdownError> {
123    let inner = pair.into_inner().next().unwrap();
124    process_heading(inner)
125}
126
127/// Process heading elements (H1, H2, H3).
128fn process_heading(pair: Pair<Rule>) -> Result<String, MarkdownError> {
129    let level = match pair.as_rule() {
130        Rule::h1_heading => 1,
131        Rule::h2_heading => 2,
132        Rule::h3_heading => 3,
133        _ => return Err(MarkdownError::ParseError("Invalid heading".to_string())),
134    };
135
136    let content = pair.as_str();
137    let text = content
138        .trim_start_matches('#')
139        .trim_start_matches(char::is_whitespace)
140        .trim_end_matches('\n')
141        .trim();
142
143    Ok(format!(
144        "<h{level}>{}</h{level}>",
145        html_escape::encode_text(text)
146    ))
147}
148
/// Renders a `document_paragraph` pair by delegating to `process_paragraph`.
fn process_document_paragraph(pair: Pair<Rule>) -> Result<String, MarkdownError> {
    process_paragraph(pair)
}
152
153fn process_paragraph(pair: Pair<Rule>) -> Result<String, MarkdownError> {
154    let content: Result<String, MarkdownError> = pair
155        .into_inner()
156        .map(|line| process_paragraph_line(line))
157        .collect();
158
159    Ok(format!("<p>{}</p>", content?))
160}
161
162fn process_paragraph_text(pair: Pair<Rule>) -> Result<String, MarkdownError> {
163    pair.into_inner()
164        .map(|inline| process_inline_element(inline))
165        .collect()
166}
167
168fn process_paragraph_line(pair: Pair<Rule>) -> Result<String, MarkdownError> {
169    pair.into_inner()
170        .map(|inline| process_inline_element(inline))
171        .collect()
172}
173
174/// Process inline elements (text, formatting, links, images).
175fn process_inline_element(pair: Pair<Rule>) -> Result<String, MarkdownError> {
176    match pair.as_rule() {
177        Rule::plain_text => Ok(html_escape::encode_text(pair.as_str()).to_string()),
178        Rule::inline_code => {
179            let full = pair.as_str();
180            let code = full
181                .strip_prefix('`')
182                .and_then(|s| s.strip_suffix('`'))
183                .unwrap_or("");
184            Ok(format!("<code>{}</code>", html_escape::encode_text(code)))
185        }
186        Rule::link => process_link(pair),
187        Rule::image => process_image(pair),
188        Rule::bold_formatting => {
189            let content = process_bold_content(pair)?;
190            Ok(format!("<strong>{content}</strong>"))
191        }
192        Rule::italic_formatting => {
193            let content = process_italic_content(pair)?;
194            Ok(format!("<em>{content}</em>"))
195        }
196        Rule::strikethrough_formatting => {
197            let content = process_strikethrough_content(pair)?;
198            Ok(format!("<del>{content}</del>"))
199        }
200        Rule::underline_formatting => {
201            let content = process_underline_content(pair)?;
202            Ok(format!("<u>{content}</u>"))
203        }
204        Rule::text_formatting => process_text_formatting(pair),
205        Rule::escape_sequence => process_escape_sequence(pair),
206        _ => Ok(html_escape::encode_text(pair.as_str()).to_string()),
207    }
208}
209
210/// Process text formatting (bold, italic, strikethrough, underline).
211fn process_text_formatting(pair: Pair<Rule>) -> Result<String, MarkdownError> {
212    let rule = pair.as_rule();
213    match rule {
214        Rule::bold_formatting => {
215            let content = process_bold_content(pair)?;
216            Ok(format!("<strong>{content}</strong>"))
217        }
218        Rule::italic_formatting => {
219            let content = process_italic_content(pair)?;
220            Ok(format!("<em>{content}</em>"))
221        }
222        Rule::strikethrough_formatting => {
223            let content = process_strikethrough_content(pair)?;
224            Ok(format!("<del>{content}</del>"))
225        }
226        Rule::underline_formatting => {
227            let content = process_underline_content(pair)?;
228            Ok(format!("<u>{content}</u>"))
229        }
230        _ => Ok(html_escape::encode_text(pair.as_str()).to_string()),
231    }
232}
233
234fn process_bold_content(pair: Pair<Rule>) -> Result<String, MarkdownError> {
235    pair.into_inner()
236        .next()
237        .map(|p| html_escape::encode_text(p.as_str()).to_string())
238        .ok_or_else(|| MarkdownError::ParseError("Empty bold content".to_string()))
239}
240
241fn process_italic_content(pair: Pair<Rule>) -> Result<String, MarkdownError> {
242    pair.into_inner()
243        .next()
244        .map(|p| html_escape::encode_text(p.as_str()).to_string())
245        .ok_or_else(|| MarkdownError::ParseError("Empty italic content".to_string()))
246}
247
248fn process_strikethrough_content(pair: Pair<Rule>) -> Result<String, MarkdownError> {
249    pair.into_inner()
250        .next()
251        .map(|p| html_escape::encode_text(p.as_str()).to_string())
252        .ok_or_else(|| MarkdownError::ParseError("Empty strikethrough content".to_string()))
253}
254
255fn process_underline_content(pair: Pair<Rule>) -> Result<String, MarkdownError> {
256    pair.into_inner()
257        .next()
258        .map(|p| html_escape::encode_text(p.as_str()).to_string())
259        .ok_or_else(|| MarkdownError::ParseError("Empty underline content".to_string()))
260}
261
262/// Process markdown links [text](url).
263fn process_link(pair: Pair<Rule>) -> Result<String, MarkdownError> {
264    let mut inner = pair.into_inner();
265    let text = inner
266        .next()
267        .map(|p| p.into_inner().as_str())
268        .ok_or_else(|| MarkdownError::ParseError("Missing link text".to_string()))?;
269    let url = inner
270        .next()
271        .map(|p| p.as_str())
272        .ok_or_else(|| MarkdownError::ParseError("Missing link URL".to_string()))?;
273
274    Ok(format!(
275        "<a href=\"{}\">{}</a>",
276        url,
277        html_escape::encode_text(text)
278    ))
279}
280
281/// Process markdown images ![alt](url).
282fn process_image(pair: Pair<Rule>) -> Result<String, MarkdownError> {
283    let mut inner = pair.into_inner();
284    let alt = inner
285        .next()
286        .map(|p| p.into_inner().as_str())
287        .ok_or_else(|| MarkdownError::ParseError("Missing image alt text".to_string()))?;
288    let url = inner
289        .next()
290        .map(|p| p.as_str())
291        .ok_or_else(|| MarkdownError::ParseError("Missing image URL".to_string()))?;
292
293    Ok(format!(
294        "<img src=\"{}\" alt=\"{}\">",
295        url,
296        html_escape::encode_text(alt)
297    ))
298}
299
/// Renders a `document_quote` pair by delegating to `process_quote`.
fn process_document_quote(pair: Pair<Rule>) -> Result<String, MarkdownError> {
    process_quote(pair)
}
303
304fn process_quote_line(pair: Pair<Rule>) -> Result<String, MarkdownError> {
305    let inner = pair.into_inner().next();
306    match inner {
307        Some(content) => {
308            let html = convert_to_html(content)?;
309            Ok(format!("<p>{}</p>", html))
310        }
311        None => Ok("<p></p>".to_string()),
312    }
313}
314
315fn process_quote(pair: Pair<Rule>) -> Result<String, MarkdownError> {
316    let mut lines: Vec<String> = Vec::new();
317
318    for line in pair.into_inner() {
319        let processed = process_quote_line(line)?;
320        if !processed.is_empty() {
321            lines.push(processed);
322        }
323    }
324
325    Ok(format!("<blockquote>\n{}\n</blockquote>", lines.join("\n")))
326}
327
/// Renders a `code_fence` pair by delegating to `process_code_block`.
fn process_code_fence(pair: Pair<Rule>) -> Result<String, MarkdownError> {
    process_code_block(pair)
}
331
332fn process_unordered_list(pair: Pair<Rule>) -> Result<String, MarkdownError> {
333    let items: Result<Vec<String>, MarkdownError> =
334        pair.into_inner().map(process_list_item).collect();
335
336    Ok(format!("<ul>\n{}\n</ul>", items?.join("\n")))
337}
338
339fn process_ordered_list(pair: Pair<Rule>) -> Result<String, MarkdownError> {
340    let items: Result<Vec<String>, MarkdownError> =
341        pair.into_inner().map(process_list_item).collect();
342
343    Ok(format!("<ol>\n{}\n</ol>", items?.join("\n")))
344}
345
346fn process_list_item(pair: Pair<Rule>) -> Result<String, MarkdownError> {
347    let content = pair.as_str();
348    let text = content
349        .find(char::is_whitespace)
350        .map(|pos| &content[pos + 1..])
351        .unwrap_or("")
352        .trim_end_matches('\n')
353        .trim();
354
355    Ok(format!("<li>{}</li>", html_escape::encode_text(text)))
356}
357
358/// Process fenced code blocks with optional language specification.
359/// Supports syntax like ```rust\ncode here\n```
360fn process_code_block(pair: Pair<Rule>) -> Result<String, MarkdownError> {
361    let mut language = String::new();
362    let mut code = String::new();
363
364    for inner_pair in pair.into_inner() {
365        match inner_pair.as_rule() {
366            Rule::language_spec => {
367                language = inner_pair.as_str().trim().to_string();
368            }
369            Rule::code_body => {
370                code = html_escape::encode_text(inner_pair.as_str()).to_string();
371            }
372            _ => {} // Skip other elements like whitespace, newlines, fences
373        }
374    }
375
376    let lang_attr = if language.is_empty() {
377        String::new()
378    } else {
379        format!(" class=\"language-{}\"", language)
380    };
381
382    Ok(format!("<pre><code{lang_attr}>{code}</code></pre>"))
383}
384
385fn process_escape_sequence(pair: Pair<Rule>) -> Result<String, MarkdownError> {
386    let escaped = pair.into_inner().next().map(|p| p.as_str()).unwrap_or("");
387    Ok(html_escape::encode_text(escaped).to_string())
388}
389
390/// Convert markdown file to HTML file.
391/// Reads markdown from input path and writes HTML to output path.
392///
393/// # Arguments
394/// * `input_path` - Path to markdown file
395/// * `output_path` - Path where HTML will be written
396///
397/// # Returns
398/// Ok(()) on success or MarkdownError
399pub fn convert_file_to_html(input_path: &Path, output_path: &Path) -> Result<(), MarkdownError> {
400    let file = File::open(input_path)?;
401    let reader = BufReader::new(file);
402
403    let mut content = String::new();
404    for line in reader.lines() {
405        content.push_str(&line?);
406        content.push('\n');
407    }
408
409    let html_lines = str_to_html(&content)?;
410
411    let mut output = OpenOptions::new()
412        .create(true)
413        .write(true)
414        .truncate(true)
415        .open(output_path)?;
416
417    for line in html_lines {
418        writeln!(output, "{}", line)?;
419    }
420
421    Ok(())
422}
423
424/// Print HTML conversion result to console.
425/// Useful for command-line usage.
426///
427/// # Arguments
428/// * `input` - Markdown text to convert and print
429///
430/// # Returns
431/// Ok(()) on success or MarkdownError
432pub fn print_html_to_console(input: &str) -> Result<(), MarkdownError> {
433    let html_lines = str_to_html(input)?;
434    for line in html_lines {
435        println!("{}", line);
436    }
437    Ok(())
438}