Skip to main content

ndg_commonmark/utils/
mod.rs

1use std::sync::{LazyLock, OnceLock, RwLock};
2
3use rustc_hash::FxHashMap;
4pub mod codeblock;
5
6use comrak::{
7  Arena,
8  nodes::{AstNode, NodeHeading, NodeValue},
9  options::Options,
10  parse_document,
11};
12use regex::Regex;
13
14/// Error type for utility operations.
15#[derive(Debug, thiserror::Error)]
16pub enum UtilError {
17  #[error("Regex compilation failed: {0}")]
18  RegexError(#[from] regex::Error),
19}
20
21/// Result type for utility operations.
22pub type UtilResult<T> = Result<T, UtilError>;
23
24/// Slugify a string for use as an anchor ID. Converts to lowercase, replaces
25/// non-alphanumeric characters with dashes, and trims leading/trailing dashes.
26#[must_use]
27pub fn slugify(text: &str) -> String {
28  static CACHE: LazyLock<RwLock<FxHashMap<String, String>>> =
29    LazyLock::new(|| RwLock::new(FxHashMap::default()));
30
31  {
32    let cache = CACHE
33      .read()
34      .unwrap_or_else(std::sync::PoisonError::into_inner);
35    if let Some(cached) = cache.get(text) {
36      return cached.clone();
37    }
38  }
39
40  let result = text
41    .to_lowercase()
42    .replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "-")
43    .trim_matches('-')
44    .to_string();
45
46  let mut cache = CACHE
47    .write()
48    .unwrap_or_else(std::sync::PoisonError::into_inner);
49  if cache.len() < 2048 {
50    cache.insert(text.to_string(), result.clone());
51  }
52
53  result
54}
55
56/// Extract the first heading from markdown content as the page title.
57/// Returns [`None`] if no heading is found.
58#[must_use]
59pub fn extract_markdown_title(content: &str) -> Option<String> {
60  let arena = Arena::new();
61  let mut options = Options::default();
62  options.extension.table = true;
63  options.extension.footnotes = true;
64  options.extension.strikethrough = true;
65  options.extension.tasklist = true;
66  options.extension.superscript = true;
67  options.render.r#unsafe = true;
68
69  let root = parse_document(&arena, content, &options);
70
71  for node in root.descendants() {
72    if let NodeValue::Heading(NodeHeading { level, .. }) =
73      &node.data.borrow().value
74      && *level == 1
75    {
76      let text = extract_inline_text_from_node(node);
77      if !text.trim().is_empty() {
78        return Some(text.trim().to_string());
79      }
80    }
81  }
82  None
83}
84
85/// Extract all inline text from a node, recursively handling all inline
86/// elements.
87fn extract_inline_text_from_node<'a>(node: &'a AstNode<'a>) -> String {
88  let mut text = String::new();
89  for child in node.children() {
90    match &child.data.borrow().value {
91      NodeValue::Text(t) => text.push_str(t),
92      NodeValue::Code(t) => text.push_str(&t.literal),
93      NodeValue::Link(..)
94      | NodeValue::Emph
95      | NodeValue::Strong
96      | NodeValue::Strikethrough
97      | NodeValue::Superscript
98      | NodeValue::FootnoteReference(..) => {
99        text.push_str(&extract_inline_text_from_node(child));
100      },
101      #[expect(clippy::match_same_arms, reason = "Explicit for clarity")]
102      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
103      _ => {},
104    }
105  }
106  text
107}
108
109/// Extract the first H1 heading from markdown content as the document title.
110/// Removes inline anchors and other markup from the title text.
111///
112/// # Returns
113///
114/// [`None`] if no H1 heading is found.
115///
116/// `Some(title, id)` if a H1 heading is found. id can be None if inline anchor
117/// does not exist.
118///
119/// # Panics
120///
121/// Panics if the fallback regex pattern fails to compile, which should never
122/// happen with the hardcoded pattern.
123#[must_use]
124pub fn extract_markdown_title_and_id(
125  content: &str,
126) -> Option<(String, Option<String>)> {
127  let arena = Arena::new();
128  let mut options = Options::default();
129  options.extension.table = true;
130  options.extension.footnotes = true;
131  options.extension.strikethrough = true;
132  options.extension.tasklist = true;
133  options.render.r#unsafe = true;
134
135  let root = parse_document(&arena, content, &options);
136
137  // Use a static regex to avoid compilation failures at runtime
138  #[expect(
139    clippy::items_after_statements,
140    reason = "Static is Scoped to function for clarity"
141  )]
142  static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();
143  let anchor_re = ANCHOR_RE.get_or_init(|| {
144    Regex::new(r"(\[\])?\{#(.*?)\}").unwrap_or_else(|e| {
145      log::error!(
146        "Failed to compile ANCHOR_RE regex in extract_h1_title: {e}\n Falling \
147         back to never matching regex."
148      );
149      never_matching_regex().unwrap_or_else(|_| {
150        // As a last resort, create a regex that matches nothing
151        #[expect(
152          clippy::expect_used,
153          reason = "This pattern is guaranteed to be valid"
154        )]
155        Regex::new(r"[^\s\S]")
156          .expect("regex pattern [^\\s\\S] should always compile")
157      })
158    })
159  });
160
161  for node in root.descendants() {
162    if let NodeValue::Heading(NodeHeading { level, .. }) =
163      &node.data.borrow().value
164      && *level == 1
165    {
166      let text = extract_inline_text_from_node(node);
167      // Clean the title by removing inline anchors and other NDG markup
168      let anchor_id = anchor_re
169        .captures(&text)
170        .and_then(|caps| caps.get(2).map(|m| m.as_str().to_string()));
171      let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
172      if !clean_title.is_empty() {
173        return Some((clean_title, anchor_id));
174      }
175    }
176  }
177  None
178}
179
180/// Clean anchor patterns from text (removes `{#anchor-id}` patterns).
181/// This is useful for cleaning titles and navigation text.
182///
183/// # Panics
184///
185/// Panics if fallback regex pattern fails to compile, which should never happen
186/// with hardcoded pattern.
187#[must_use]
188pub fn clean_anchor_patterns(text: &str) -> String {
189  static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();
190  let anchor_pattern = ANCHOR_PATTERN.get_or_init(|| {
191    Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap_or_else(|e| {
192      log::error!(
193        "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
194         {e}\n Falling back to never matching regex."
195      );
196      never_matching_regex().unwrap_or_else(|_| {
197        // As a last resort, create a regex that matches nothing
198        #[expect(
199          clippy::expect_used,
200          reason = "This pattern is guaranteed to be valid"
201        )]
202        Regex::new(r"[^\s\S]")
203          .expect("regex pattern [^\\s\\S] should always compile")
204      })
205    })
206  });
207  anchor_pattern.replace_all(text.trim(), "").to_string()
208}
209
210/// Strip markdown formatting and return plain text.
211///
212/// This processes the markdown through the AST and extracts only text content,
213/// excluding code blocks and other formatting.
214#[must_use]
215pub fn strip_markdown(content: &str) -> String {
216  let arena = Arena::new();
217  let mut options = Options::default();
218  options.extension.table = true;
219  options.extension.footnotes = true;
220  options.extension.strikethrough = true;
221  options.extension.tasklist = true;
222  options.render.r#unsafe = true;
223
224  let root = parse_document(&arena, content, &options);
225
226  let mut plain_text = String::new();
227
228  #[expect(
229    clippy::items_after_statements,
230    reason = "Helper scoped for clarity"
231  )]
232  fn extract_text<'a>(
233    node: &'a AstNode<'a>,
234    plain_text: &mut String,
235    in_code_block: &mut bool,
236  ) {
237    match &node.data.borrow().value {
238      NodeValue::Paragraph => {
239        for child in node.children() {
240          extract_text(child, plain_text, in_code_block);
241        }
242        // Add paragraph break after each paragraph
243        plain_text.push('\n');
244      },
245      NodeValue::Heading(_) => {
246        for child in node.children() {
247          extract_text(child, plain_text, in_code_block);
248        }
249        // Add line break after heading
250        plain_text.push('\n');
251      },
252      NodeValue::Text(t) => {
253        if !*in_code_block {
254          plain_text.push_str(t);
255        }
256      },
257      NodeValue::CodeBlock(_) => {
258        *in_code_block = true;
259        for child in node.children() {
260          extract_text(child, plain_text, in_code_block);
261        }
262        *in_code_block = false;
263      },
264      NodeValue::SoftBreak => {
265        plain_text.push(' ');
266      },
267      NodeValue::LineBreak => {
268        plain_text.push('\n');
269      },
270      NodeValue::List(_) => {
271        for child in node.children() {
272          extract_text(child, plain_text, in_code_block);
273        }
274        plain_text.push('\n');
275      },
276      NodeValue::Code(c) => {
277        if !*in_code_block {
278          plain_text.push_str(&c.literal);
279        }
280      },
281      _ => {
282        for child in node.children() {
283          extract_text(child, plain_text, in_code_block);
284        }
285      },
286    }
287  }
288
289  let mut in_code_block = false;
290  extract_text(root, &mut plain_text, &mut in_code_block);
291  plain_text
292}
293
/// Capitalize the first character of a string.
///
/// The first `char` is replaced by its Unicode uppercase mapping, which may
/// expand to several characters (for example `ß` becomes `SS`); the remainder
/// of the string is copied unchanged. An empty input yields an empty string.
#[must_use]
pub fn capitalize_first(s: &str) -> String {
  let mut chars = s.chars();
  let Some(first) = chars.next() else {
    return String::new();
  };

  // Build the result in a single buffer instead of concatenating two strings.
  let mut capitalized = String::with_capacity(s.len());
  capitalized.extend(first.to_uppercase());
  capitalized.push_str(chars.as_str());
  capitalized
}
301
/// Report whether a line looks like a markdown header, i.e. whether its first
/// non-whitespace character is `#`.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  let unindented = line.trim_start();
  unindented.strip_prefix('#').is_some()
}
307
308/// Load manpage URL mappings from a JSON file.
309///
310/// # Errors
311///
312/// Returns an error if the file cannot be read or if the JSON is invalid.
313pub fn load_manpage_urls(
314  path: &str,
315) -> Result<FxHashMap<String, String>, Box<dyn std::error::Error>> {
316  let content = std::fs::read_to_string(path)?;
317  let mappings: FxHashMap<String, String> = serde_json::from_str(&content)?;
318  Ok(mappings)
319}
320
321/// Create a regex that never matches anything.
322///
323/// This is used as a fallback pattern when a regex fails to compile.
324/// It will never match any input, which is safer than using a trivial regex
325/// like `^$` which would match empty strings.
326///
327/// # Errors
328///
329/// Returns an error if both primary and fallback regex patterns fail to
330/// compile, which should never happen with hardcoded patterns.
331pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
332  // Use a pattern that will never match anything because it asserts something
333  // impossible - this pattern is guaranteed to be valid
334  regex::Regex::new(r"[^\s\S]").or_else(|_| {
335    // As an ultimate fallback, use an empty pattern that matches nothing
336    // This SHOULD NOT happen.
337    regex::Regex::new(r"^\b$")
338  })
339}