// ndg_commonmark/utils/mod.rs

1use std::{collections::HashMap, sync::OnceLock};
2pub mod codeblock;
3
4use comrak::{
5  Arena,
6  nodes::{AstNode, NodeHeading, NodeValue},
7  options::Options,
8  parse_document,
9};
10use regex::Regex;
11
12/// Error type for utility operations.
13#[derive(Debug, thiserror::Error)]
14pub enum UtilError {
15  #[error("Regex compilation failed: {0}")]
16  RegexError(#[from] regex::Error),
17}
18
19/// Result type for utility operations.
20pub type UtilResult<T> = Result<T, UtilError>;
21
/// Slugify a string for use as an anchor ID.
///
/// The text is lowercased, then every character that is not alphanumeric,
/// `-`, or `_` is replaced with a dash, and finally leading/trailing dashes
/// are trimmed.
///
/// Note that underscores are preserved (and never trimmed), and that runs of
/// dashes in the middle of the slug are kept as-is rather than collapsed, so
/// `"a  b"` becomes `"a--b"`.
#[must_use]
pub fn slugify(text: &str) -> String {
  text
    .to_lowercase()
    // Keep alphanumerics, dashes, and underscores; everything else -> '-'.
    .replace(|c: char| !(c.is_alphanumeric() || c == '-' || c == '_'), "-")
    .trim_matches('-')
    .to_string()
}
33
34/// Extract the first heading from markdown content as the page title.
35/// Returns [`None`] if no heading is found.
36#[must_use]
37pub fn extract_markdown_title(content: &str) -> Option<String> {
38  let arena = Arena::new();
39  let mut options = Options::default();
40  options.extension.table = true;
41  options.extension.footnotes = true;
42  options.extension.strikethrough = true;
43  options.extension.tasklist = true;
44  options.extension.superscript = true;
45  options.render.r#unsafe = true;
46
47  let root = parse_document(&arena, content, &options);
48
49  for node in root.descendants() {
50    if let NodeValue::Heading(_) = &node.data.borrow().value {
51      let mut text = String::new();
52      for child in node.children() {
53        if let NodeValue::Text(t) = &child.data.borrow().value {
54          text.push_str(t);
55        }
56        // Optionally handle inline formatting, code, etc.
57        if let NodeValue::Code(t) = &child.data.borrow().value {
58          text.push_str(&t.literal);
59        }
60      }
61      if !text.trim().is_empty() {
62        return Some(text.trim().to_string());
63      }
64    }
65  }
66  None
67}
68
69/// Extract the first H1 heading from markdown content as the document title.
70/// Removes inline anchors and other markup from the title text.
71///
72/// # Returns
73///
74/// [`None`] if no H1 heading is found.
75///
76/// # Panics
77///
78/// Panics if the fallback regex pattern fails to compile, which should never
79/// happen with the hardcoded pattern.
80#[must_use]
81pub fn extract_title_from_markdown(content: &str) -> Option<String> {
82  let arena = Arena::new();
83  let mut options = Options::default();
84  options.extension.table = true;
85  options.extension.footnotes = true;
86  options.extension.strikethrough = true;
87  options.extension.tasklist = true;
88  options.render.r#unsafe = true;
89
90  let root = parse_document(&arena, content, &options);
91
92  // Use a static regex to avoid compilation failures at runtime
93  #[allow(
94    clippy::items_after_statements,
95    reason = "Static is Scoped to function for clarity"
96  )]
97  static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();
98  let anchor_re = ANCHOR_RE.get_or_init(|| {
99    Regex::new(r"(\[\]\{#.*?\}|\{#.*?\})").unwrap_or_else(|e| {
100      log::error!(
101        "Failed to compile ANCHOR_RE regex in extract_h1_title: {e}\n Falling \
102         back to never matching regex."
103      );
104      never_matching_regex().unwrap_or_else(|_| {
105        // As a last resort, create a regex that matches nothing
106        #[allow(
107          clippy::expect_used,
108          reason = "This pattern is guaranteed to be valid"
109        )]
110        Regex::new(r"[^\s\S]")
111          .expect("regex pattern [^\\s\\S] should always compile")
112      })
113    })
114  });
115
116  for node in root.descendants() {
117    if let NodeValue::Heading(NodeHeading { level, .. }) =
118      &node.data.borrow().value
119    {
120      if *level == 1 {
121        let mut text = String::new();
122        for child in node.children() {
123          if let NodeValue::Text(ref t) = child.data.borrow().value {
124            text.push_str(t);
125          }
126        }
127        // Clean the title by removing inline anchors and other NDG markup
128        let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
129        if !clean_title.is_empty() {
130          return Some(clean_title);
131        }
132      }
133    }
134  }
135  None
136}
137
138/// Clean anchor patterns from text (removes `{#anchor-id}` patterns).
139/// This is useful for cleaning titles and navigation text.
140///
141/// # Panics
142///
143/// Panics if fallback regex pattern fails to compile, which should never happen
144/// with hardcoded pattern.
145#[must_use]
146pub fn clean_anchor_patterns(text: &str) -> String {
147  static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();
148  let anchor_pattern = ANCHOR_PATTERN.get_or_init(|| {
149    Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap_or_else(|e| {
150      log::error!(
151        "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
152         {e}\n Falling back to never matching regex."
153      );
154      never_matching_regex().unwrap_or_else(|_| {
155        // As a last resort, create a regex that matches nothing
156        #[allow(
157          clippy::expect_used,
158          reason = "This pattern is guaranteed to be valid"
159        )]
160        Regex::new(r"[^\s\S]")
161          .expect("regex pattern [^\\s\\S] should always compile")
162      })
163    })
164  });
165  anchor_pattern.replace_all(text.trim(), "").to_string()
166}
167
168/// Apply a regex transformation to HTML elements using the provided function.
169/// Used by the markdown processor for HTML element transformations.
170pub fn process_html_elements<F>(
171  html: &str,
172  regex: &Regex,
173  transform: F,
174) -> String
175where
176  F: Fn(&regex::Captures) -> String,
177{
178  match regex.replace_all(html, transform) {
179    std::borrow::Cow::Borrowed(_) => html.to_string(),
180    std::borrow::Cow::Owned(s) => s,
181  }
182}
183
184/// Strip markdown formatting and return plain text.
185///
186/// This processes the markdown through the AST and extracts only text content,
187/// excluding code blocks and other formatting.
188#[must_use]
189pub fn strip_markdown(content: &str) -> String {
190  let arena = Arena::new();
191  let mut options = Options::default();
192  options.extension.table = true;
193  options.extension.footnotes = true;
194  options.extension.strikethrough = true;
195  options.extension.tasklist = true;
196  options.render.r#unsafe = true;
197
198  let root = parse_document(&arena, content, &options);
199
200  let mut plain_text = String::new();
201  #[allow(clippy::items_after_statements, reason = "Helper scoped for clarity")]
202  fn extract_text<'a>(
203    node: &'a AstNode<'a>,
204    plain_text: &mut String,
205    in_code_block: &mut bool,
206  ) {
207    match &node.data.borrow().value {
208      NodeValue::Text(t) => {
209        if !*in_code_block {
210          plain_text.push_str(t);
211          plain_text.push(' ');
212        }
213      },
214      NodeValue::CodeBlock(_) => {
215        *in_code_block = true;
216      },
217      NodeValue::SoftBreak => {
218        plain_text.push(' ');
219      },
220      NodeValue::LineBreak => {
221        plain_text.push('\n');
222      },
223      _ => {},
224    }
225    for child in node.children() {
226      extract_text(child, plain_text, in_code_block);
227    }
228    if let NodeValue::CodeBlock(_) = &node.data.borrow().value {
229      *in_code_block = false;
230    }
231  }
232  let mut in_code_block = false;
233  extract_text(root, &mut plain_text, &mut in_code_block);
234  plain_text
235}
236
/// Capitalize the first letter of a string.
///
/// The first character is uppercased using Unicode rules, which may expand
/// it to several characters (e.g. `ß` becomes `SS`); the remainder of the
/// string is copied unchanged. An empty input yields an empty string.
#[must_use]
pub fn capitalize_first(s: &str) -> String {
  let mut chars = s.chars();
  match chars.next() {
    None => String::new(),
    Some(first) => {
      let mut out: String = first.to_uppercase().collect();
      // `as_str` is the untouched tail after the first character.
      out.push_str(chars.as_str());
      out
    },
  }
}
244
/// Return true if the string looks like a markdown header.
///
/// Leading whitespace is ignored; the line counts as a header when its first
/// remaining character is `#`. No further validation (heading level, the
/// space after the hashes) is performed.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  line.trim_start().starts_with('#')
}
250
251/// Load manpage URL mappings from a JSON file.
252///
253/// # Errors
254///
255/// Returns an error if the file cannot be read or if the JSON is invalid.
256pub fn load_manpage_urls(
257  path: &str,
258) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
259  let content = std::fs::read_to_string(path)?;
260  let mappings: HashMap<String, String> = serde_json::from_str(&content)?;
261  Ok(mappings)
262}
263
264/// Create a regex that never matches anything.
265///
266/// This is used as a fallback pattern when a regex fails to compile.
267/// It will never match any input, which is safer than using a trivial regex
268/// like `^$` which would match empty strings.
269///
270/// # Errors
271///
272/// Returns an error if both primary and fallback regex patterns fail to
273/// compile, which should never happen with hardcoded patterns.
274pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
275  // Use a pattern that will never match anything because it asserts something
276  // impossible - this pattern is guaranteed to be valid
277  regex::Regex::new(r"[^\s\S]").or_else(|_| {
278    // As an ultimate fallback, use an empty pattern that matches nothing
279    // This SHOULD NOT happen.
280    regex::Regex::new(r"^\b$")
281  })
282}