ndg_commonmark/
utils.rs

1use std::collections::HashMap;
2
3use comrak::{
4    Arena, ComrakOptions,
5    nodes::{AstNode, NodeHeading, NodeValue},
6    parse_document,
7};
8use regex::Regex;
9
10// Re-export from processor module
11pub use crate::processor::collect_markdown_files;
12
/// Turn arbitrary text into a slug suitable for use as an anchor ID.
///
/// The text is lowercased, then every character that is not alphanumeric,
/// `-` or `_` is replaced by a dash. Dashes at either end of the result are
/// stripped; runs of dashes in the middle are left untouched.
#[must_use]
pub fn slugify(text: &str) -> String {
    let dashed: String = text
        .to_lowercase()
        .chars()
        .map(|ch| {
            if ch.is_alphanumeric() || ch == '-' || ch == '_' {
                ch
            } else {
                '-'
            }
        })
        .collect();

    dashed.trim_matches('-').to_owned()
}
23
24/// Extract the first heading from markdown content as the page title.
25/// Returns None if no heading is found.
26#[must_use]
27pub fn extract_markdown_title(content: &str) -> Option<String> {
28    let arena = Arena::new();
29    let mut options = ComrakOptions::default();
30    options.extension.table = true;
31    options.extension.footnotes = true;
32    options.extension.strikethrough = true;
33    options.extension.tasklist = true;
34    options.extension.superscript = true;
35    options.render.unsafe_ = true;
36
37    let root = parse_document(&arena, content, &options);
38
39    for node in root.descendants() {
40        if let NodeValue::Heading(_) = &node.data.borrow().value {
41            let mut text = String::new();
42            for child in node.children() {
43                if let NodeValue::Text(t) = &child.data.borrow().value {
44                    text.push_str(t);
45                }
46                // Optionally handle inline formatting, code, etc.
47                if let NodeValue::Code(t) = &child.data.borrow().value {
48                    text.push_str(&t.literal);
49                }
50            }
51            if !text.trim().is_empty() {
52                return Some(text.trim().to_string());
53            }
54        }
55    }
56    None
57}
58
59/// Extract the first H1 heading from markdown content as the document title.
60/// Removes inline anchors and other markup from the title text.
61///
62/// # Returns
63///
64/// `None` if no H1 heading is found.
65#[must_use]
66pub fn extract_title_from_markdown(content: &str) -> Option<String> {
67    let arena = Arena::new();
68    let mut options = ComrakOptions::default();
69    options.extension.table = true;
70    options.extension.footnotes = true;
71    options.extension.strikethrough = true;
72    options.extension.tasklist = true;
73    options.render.unsafe_ = true;
74
75    let root = parse_document(&arena, content, &options);
76
77    // Regex to match {#id} and []{#id} anchors
78    let anchor_re = Regex::new(r"(\[\]\{#.*?\}|\{#.*?\})").unwrap();
79
80    for node in root.descendants() {
81        if let NodeValue::Heading(NodeHeading { level, .. }) = &node.data.borrow().value {
82            if *level == 1 {
83                let mut text = String::new();
84                for child in node.children() {
85                    if let NodeValue::Text(ref t) = child.data.borrow().value {
86                        text.push_str(t);
87                    }
88                }
89                // Clean the title by removing inline anchors and other NDG markup
90                let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
91                if !clean_title.is_empty() {
92                    return Some(clean_title);
93                }
94            }
95        }
96    }
97    None
98}
99
100/// Clean anchor patterns from text (removes {#anchor-id} patterns).
101/// This is useful for cleaning titles and navigation text.
102#[must_use]
103pub fn clean_anchor_patterns(text: &str) -> String {
104    let anchor_pattern = Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap();
105    anchor_pattern.replace_all(text.trim(), "").to_string()
106}
107
108/// Apply a regex transformation to HTML elements using the provided function.
109/// Used by the markdown processor for HTML element transformations.
110pub fn process_html_elements<F>(html: &str, regex: &Regex, transform: F) -> String
111where
112    F: Fn(&regex::Captures) -> String,
113{
114    match regex.replace_all(html, transform) {
115        std::borrow::Cow::Borrowed(_) => html.to_string(),
116        std::borrow::Cow::Owned(s) => s,
117    }
118}
119
120/// Strip markdown formatting and return plain text.
121/// This processes the markdown through the AST and extracts only text content,
122/// excluding code blocks and other formatting.
123#[must_use]
124pub fn strip_markdown(content: &str) -> String {
125    let arena = Arena::new();
126    let mut options = ComrakOptions::default();
127    options.extension.table = true;
128    options.extension.footnotes = true;
129    options.extension.strikethrough = true;
130    options.extension.tasklist = true;
131    options.render.unsafe_ = true;
132
133    let root = parse_document(&arena, content, &options);
134
135    let mut plain_text = String::new();
136    fn extract_text<'a>(node: &'a AstNode<'a>, plain_text: &mut String, in_code_block: &mut bool) {
137        match &node.data.borrow().value {
138            NodeValue::Text(t) => {
139                if !*in_code_block {
140                    plain_text.push_str(t);
141                    plain_text.push(' ');
142                }
143            }
144            NodeValue::CodeBlock(_) => {
145                *in_code_block = true;
146            }
147            NodeValue::SoftBreak => {
148                plain_text.push(' ');
149            }
150            NodeValue::LineBreak => {
151                plain_text.push('\n');
152            }
153            _ => {}
154        }
155        for child in node.children() {
156            extract_text(child, plain_text, in_code_block);
157        }
158        if let NodeValue::CodeBlock(_) = &node.data.borrow().value {
159            *in_code_block = false;
160        }
161    }
162    let mut in_code_block = false;
163    extract_text(root, &mut plain_text, &mut in_code_block);
164    plain_text
165}
166
/// Uppercase the first character of a string, leaving the rest unchanged.
///
/// Returns an empty string for empty input. The uppercase form of the first
/// character may be longer than one character (for example `ß` becomes `SS`).
pub fn capitalize_first(s: &str) -> String {
    match s.chars().next() {
        None => String::new(),
        Some(first) => {
            // Everything after the first character is copied verbatim.
            let rest = &s[first.len_utf8()..];
            let mut capitalized: String = first.to_uppercase().collect();
            capitalized.push_str(rest);
            capitalized
        }
    }
}
174
/// Report whether a line looks like a markdown header.
///
/// A line qualifies when its first non-whitespace character is `#`.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
    matches!(line.trim_start().chars().next(), Some('#'))
}
180
181/// Load manpage URL mappings from a JSON file.
182pub fn load_manpage_urls(
183    path: &str,
184) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
185    let content = std::fs::read_to_string(path)?;
186    let mappings: HashMap<String, String> = serde_json::from_str(&content)?;
187    Ok(mappings)
188}
189
190/// Safely process markup with panic recovery and graceful error handling.
191///
192/// Wraps potentially panicking operations and provides
193/// graceful degradation when processing fails, ensuring that malformed
194/// input doesn't crash the entire document processor.
195pub fn safely_process_markup<F>(text: &str, process_fn: F, default_on_error: &str) -> String
196where
197    F: FnOnce(&str) -> String,
198{
199    // Avoid processing empty strings
200    if text.is_empty() {
201        return String::new();
202    }
203
204    // Catch any potential panics caused by malformed input or processing errors
205    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| process_fn(text)));
206
207    match result {
208        Ok(processed_text) => processed_text,
209        Err(e) => {
210            // Log the error but allow the program to continue
211            if let Some(error_msg) = e.downcast_ref::<String>() {
212                log::error!("Error processing markup: {error_msg}");
213            } else if let Some(error_msg) = e.downcast_ref::<&str>() {
214                log::error!("Error processing markup: {error_msg}");
215            } else {
216                log::error!("Unknown error occurred while processing markup");
217            }
218
219            // Return the original text or default value to prevent breaking the entire document
220            if default_on_error.is_empty() {
221                text.to_string()
222            } else {
223                default_on_error.to_string()
224            }
225        }
226    }
227}
228
229/// Create a regex that never matches anything.
230///
231/// This is used as a fallback pattern when a regex fails to compile.
232/// It will never match any input, which is safer than using a trivial regex
233/// like `^$` which would match empty strings.
234#[must_use]
235pub fn never_matching_regex() -> regex::Regex {
236    // Use a pattern that will never match anything because it asserts something impossible
237    regex::Regex::new(r"[^\s\S]").expect("Failed to compile never-matching regex")
238}
239
240/// Process text with error recovery, converting any processing errors to log messages.
241///
242/// This provides a more lightweight alternative to `safely_process_markup` for operations
243/// that are unlikely to panic but may have logical errors.
244pub fn process_with_error_recovery<F, T>(
245    operation_name: &str,
246    input: T,
247    process_fn: F,
248) -> Result<String, String>
249where
250    F: FnOnce(T) -> Result<String, Box<dyn std::error::Error>>,
251{
252    match process_fn(input) {
253        Ok(result) => Ok(result),
254        Err(e) => {
255            let error_msg = format!("Error in {operation_name}: {e}");
256            log::error!("{error_msg}");
257            Err(error_msg)
258        }
259    }
260}
ndg_commonmark/utils.rs

ndg_commonmark/
utils.rs