ndg_commonmark/utils/
mod.rs

1use std::{collections::HashMap, sync::OnceLock};
2pub mod codeblock;
3
4use comrak::{
5  Arena,
6  nodes::{AstNode, NodeHeading, NodeValue},
7  options::Options,
8  parse_document,
9};
10use regex::Regex;
11
12/// Error type for utility operations.
13#[derive(Debug, thiserror::Error)]
14pub enum UtilError {
15  #[error("Regex compilation failed: {0}")]
16  RegexError(#[from] regex::Error),
17}
18
19/// Result type for utility operations.
20pub type UtilResult<T> = Result<T, UtilError>;
21
/// Turn arbitrary text into a slug suitable for use as an anchor ID.
///
/// The text is lowercased, every character that is neither alphanumeric nor
/// one of `-` / `_` is replaced by a dash, and dashes at either end of the
/// result are stripped. Runs of dashes in the middle are left untouched.
#[must_use]
pub fn slugify(text: &str) -> String {
  let lowered = text.to_lowercase();
  let dashed: String = lowered
    .chars()
    .map(|c| {
      if c.is_alphanumeric() || c == '-' || c == '_' {
        c
      } else {
        '-'
      }
    })
    .collect();
  dashed.trim_matches('-').to_string()
}
33
34/// Extract the first heading from markdown content as the page title.
35/// Returns [`None`] if no heading is found.
36#[must_use]
37pub fn extract_markdown_title(content: &str) -> Option<String> {
38  let arena = Arena::new();
39  let mut options = Options::default();
40  options.extension.table = true;
41  options.extension.footnotes = true;
42  options.extension.strikethrough = true;
43  options.extension.tasklist = true;
44  options.extension.superscript = true;
45  options.render.r#unsafe = true;
46
47  let root = parse_document(&arena, content, &options);
48
49  for node in root.descendants() {
50    if let NodeValue::Heading(_) = &node.data.borrow().value {
51      let text = extract_inline_text_from_node(node);
52      if !text.trim().is_empty() {
53        return Some(text.trim().to_string());
54      }
55    }
56  }
57  None
58}
59
60/// Extract all inline text from a node, recursively handling all inline
61/// elements.
62fn extract_inline_text_from_node<'a>(node: &'a AstNode<'a>) -> String {
63  let mut text = String::new();
64  for child in node.children() {
65    match &child.data.borrow().value {
66      NodeValue::Text(t) => text.push_str(t),
67      NodeValue::Code(t) => text.push_str(&t.literal),
68      NodeValue::Link(..)
69      | NodeValue::Emph
70      | NodeValue::Strong
71      | NodeValue::Strikethrough
72      | NodeValue::Superscript
73      | NodeValue::FootnoteReference(..) => {
74        text.push_str(&extract_inline_text_from_node(child));
75      },
76      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
77      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
78      _ => {},
79    }
80  }
81  text
82}
83
84/// Extract the first H1 heading from markdown content as the document title.
85/// Removes inline anchors and other markup from the title text.
86///
87/// # Returns
88///
89/// [`None`] if no H1 heading is found.
90///
91/// `Some(title, id)` if a H1 heading is found. id can be None if inline anchor
92/// does not exist.
93///
94/// # Panics
95///
96/// Panics if the fallback regex pattern fails to compile, which should never
97/// happen with the hardcoded pattern.
98#[must_use]
99pub fn extract_markdown_title_and_id(
100  content: &str,
101) -> Option<(String, Option<String>)> {
102  let arena = Arena::new();
103  let mut options = Options::default();
104  options.extension.table = true;
105  options.extension.footnotes = true;
106  options.extension.strikethrough = true;
107  options.extension.tasklist = true;
108  options.render.r#unsafe = true;
109
110  let root = parse_document(&arena, content, &options);
111
112  // Use a static regex to avoid compilation failures at runtime
113  #[allow(
114    clippy::items_after_statements,
115    reason = "Static is Scoped to function for clarity"
116  )]
117  static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();
118  let anchor_re = ANCHOR_RE.get_or_init(|| {
119    Regex::new(r"(\[\])?\{#(.*?)\}").unwrap_or_else(|e| {
120      log::error!(
121        "Failed to compile ANCHOR_RE regex in extract_h1_title: {e}\n Falling \
122         back to never matching regex."
123      );
124      never_matching_regex().unwrap_or_else(|_| {
125        // As a last resort, create a regex that matches nothing
126        #[allow(
127          clippy::expect_used,
128          reason = "This pattern is guaranteed to be valid"
129        )]
130        Regex::new(r"[^\s\S]")
131          .expect("regex pattern [^\\s\\S] should always compile")
132      })
133    })
134  });
135
136  for node in root.descendants() {
137    if let NodeValue::Heading(NodeHeading { level, .. }) =
138      &node.data.borrow().value
139      && *level == 1
140    {
141      let text = extract_inline_text_from_node(node);
142      // Clean the title by removing inline anchors and other NDG markup
143      let anchor_id = anchor_re
144        .captures(&text)
145        .and_then(|caps| caps.get(2).map(|m| m.as_str().to_string()));
146      let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
147      if !clean_title.is_empty() {
148        return Some((clean_title, anchor_id));
149      }
150    }
151  }
152  None
153}
154
155/// Clean anchor patterns from text (removes `{#anchor-id}` patterns).
156/// This is useful for cleaning titles and navigation text.
157///
158/// # Panics
159///
160/// Panics if fallback regex pattern fails to compile, which should never happen
161/// with hardcoded pattern.
162#[must_use]
163pub fn clean_anchor_patterns(text: &str) -> String {
164  static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();
165  let anchor_pattern = ANCHOR_PATTERN.get_or_init(|| {
166    Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap_or_else(|e| {
167      log::error!(
168        "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
169         {e}\n Falling back to never matching regex."
170      );
171      never_matching_regex().unwrap_or_else(|_| {
172        // As a last resort, create a regex that matches nothing
173        #[allow(
174          clippy::expect_used,
175          reason = "This pattern is guaranteed to be valid"
176        )]
177        Regex::new(r"[^\s\S]")
178          .expect("regex pattern [^\\s\\S] should always compile")
179      })
180    })
181  });
182  anchor_pattern.replace_all(text.trim(), "").to_string()
183}
184
185/// Apply a regex transformation to HTML elements using the provided function.
186/// Used by the markdown processor for HTML element transformations.
187pub fn process_html_elements<F>(
188  html: &str,
189  regex: &Regex,
190  transform: F,
191) -> String
192where
193  F: Fn(&regex::Captures) -> String,
194{
195  match regex.replace_all(html, transform) {
196    std::borrow::Cow::Borrowed(_) => html.to_string(),
197    std::borrow::Cow::Owned(s) => s,
198  }
199}
200
201/// Strip markdown formatting and return plain text.
202///
203/// This processes the markdown through the AST and extracts only text content,
204/// excluding code blocks and other formatting.
205#[must_use]
206pub fn strip_markdown(content: &str) -> String {
207  let arena = Arena::new();
208  let mut options = Options::default();
209  options.extension.table = true;
210  options.extension.footnotes = true;
211  options.extension.strikethrough = true;
212  options.extension.tasklist = true;
213  options.render.r#unsafe = true;
214
215  let root = parse_document(&arena, content, &options);
216
217  let mut plain_text = String::new();
218
219  #[allow(clippy::items_after_statements, reason = "Helper scoped for clarity")]
220  fn extract_text<'a>(
221    node: &'a AstNode<'a>,
222    plain_text: &mut String,
223    in_code_block: &mut bool,
224  ) {
225    match &node.data.borrow().value {
226      NodeValue::Document => {
227        for child in node.children() {
228          extract_text(child, plain_text, in_code_block);
229        }
230      },
231      NodeValue::Paragraph => {
232        for child in node.children() {
233          extract_text(child, plain_text, in_code_block);
234        }
235        // Add paragraph break after each paragraph
236        plain_text.push('\n');
237      },
238      NodeValue::Heading(_) => {
239        for child in node.children() {
240          extract_text(child, plain_text, in_code_block);
241        }
242        // Add line break after heading
243        plain_text.push('\n');
244      },
245      NodeValue::Text(t) => {
246        if !*in_code_block {
247          plain_text.push_str(t);
248        }
249      },
250      NodeValue::CodeBlock(_) => {
251        *in_code_block = true;
252        for child in node.children() {
253          extract_text(child, plain_text, in_code_block);
254        }
255        *in_code_block = false;
256      },
257      NodeValue::SoftBreak => {
258        plain_text.push(' ');
259      },
260      NodeValue::LineBreak => {
261        plain_text.push('\n');
262      },
263      NodeValue::List(_) => {
264        for child in node.children() {
265          extract_text(child, plain_text, in_code_block);
266        }
267        plain_text.push('\n');
268      },
269      NodeValue::Item(_) => {
270        for child in node.children() {
271          extract_text(child, plain_text, in_code_block);
272        }
273      },
274      NodeValue::Code(c) => {
275        if !*in_code_block {
276          plain_text.push_str(&c.literal);
277        }
278      },
279      _ => {
280        for child in node.children() {
281          extract_text(child, plain_text, in_code_block);
282        }
283      },
284    }
285  }
286
287  let mut in_code_block = false;
288  extract_text(root, &mut plain_text, &mut in_code_block);
289  plain_text
290}
291
/// Return `s` with its first character converted to uppercase.
///
/// The rest of the string is copied unchanged. An empty input yields an empty
/// string. The uppercase form of the first character may be more than one
/// character long.
pub fn capitalize_first(s: &str) -> String {
  let mut rest = s.chars();
  let Some(first) = rest.next() else {
    return String::new();
  };

  let mut capitalized = String::with_capacity(s.len());
  capitalized.extend(first.to_uppercase());
  capitalized.push_str(rest.as_str());
  capitalized
}
299
/// Report whether a line looks like a markdown header, i.e. its first
/// non-whitespace character is `#`.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  let without_indent = line.trim_start();
  without_indent.strip_prefix('#').is_some()
}
305
306/// Load manpage URL mappings from a JSON file.
307///
308/// # Errors
309///
310/// Returns an error if the file cannot be read or if the JSON is invalid.
311pub fn load_manpage_urls(
312  path: &str,
313) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
314  let content = std::fs::read_to_string(path)?;
315  let mappings: HashMap<String, String> = serde_json::from_str(&content)?;
316  Ok(mappings)
317}
318
319/// Create a regex that never matches anything.
320///
321/// This is used as a fallback pattern when a regex fails to compile.
322/// It will never match any input, which is safer than using a trivial regex
323/// like `^$` which would match empty strings.
324///
325/// # Errors
326///
327/// Returns an error if both primary and fallback regex patterns fail to
328/// compile, which should never happen with hardcoded patterns.
329pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
330  // Use a pattern that will never match anything because it asserts something
331  // impossible - this pattern is guaranteed to be valid
332  regex::Regex::new(r"[^\s\S]").or_else(|_| {
333    // As an ultimate fallback, use an empty pattern that matches nothing
334    // This SHOULD NOT happen.
335    regex::Regex::new(r"^\b$")
336  })
337}