ndg_commonmark/utils/
mod.rs

1use std::{
2  collections::HashMap,
3  sync::{LazyLock, OnceLock, RwLock},
4};
5pub mod codeblock;
6
7use comrak::{
8  Arena,
9  nodes::{AstNode, NodeHeading, NodeValue},
10  options::Options,
11  parse_document,
12};
13use regex::Regex;
14
/// Error type for utility operations.
///
/// The only failure currently modelled is a regular expression that does not
/// compile. Thanks to `#[from]`, a [`regex::Error`] converts into this type
/// automatically, so it can be propagated with `?`.
#[derive(Debug, thiserror::Error)]
pub enum UtilError {
  /// A regex pattern failed to compile. Wraps the underlying
  /// [`regex::Error`], which is also interpolated into the display message.
  #[error("Regex compilation failed: {0}")]
  RegexError(#[from] regex::Error),
}
21
/// Result type for utility operations.
///
/// Shorthand for [`Result`] with [`UtilError`] fixed as the error type.
pub type UtilResult<T> = Result<T, UtilError>;
24
/// Turn arbitrary text into a slug suitable for use as an anchor ID.
///
/// The text is lowercased, every character that is not alphanumeric, `-` or
/// `_` is turned into a dash, and dashes at either end are stripped. Runs of
/// dashes in the middle of the slug are left as they are.
///
/// Results are memoised in a process-wide cache that stores at most 2048
/// entries; once it is full, further inputs are still slugified but no longer
/// remembered. A poisoned cache lock is recovered instead of panicking.
#[must_use]
pub fn slugify(text: &str) -> String {
  static CACHE: LazyLock<RwLock<HashMap<String, String>>> =
    LazyLock::new(|| RwLock::new(HashMap::new()));

  // Fast path: the read guard is a temporary, so it is released at the end of
  // this statement, well before the write lock is requested further down.
  let hit = CACHE
    .read()
    .unwrap_or_else(std::sync::PoisonError::into_inner)
    .get(text)
    .cloned();
  if let Some(slug) = hit {
    return slug;
  }

  // Slow path: map each disallowed character to a dash, then trim the ends.
  let dashed: String = text
    .to_lowercase()
    .chars()
    .map(|c| {
      if c.is_alphanumeric() || c == '-' || c == '_' {
        c
      } else {
        '-'
      }
    })
    .collect();
  let slug = dashed.trim_matches('-').to_owned();

  let mut entries = CACHE
    .write()
    .unwrap_or_else(std::sync::PoisonError::into_inner);
  // Bound the cache so unusual workloads cannot grow it without limit.
  if entries.len() < 2048 {
    entries.insert(text.to_owned(), slug.clone());
  }

  slug
}
56
57/// Extract the first heading from markdown content as the page title.
58/// Returns [`None`] if no heading is found.
59#[must_use]
60pub fn extract_markdown_title(content: &str) -> Option<String> {
61  let arena = Arena::new();
62  let mut options = Options::default();
63  options.extension.table = true;
64  options.extension.footnotes = true;
65  options.extension.strikethrough = true;
66  options.extension.tasklist = true;
67  options.extension.superscript = true;
68  options.render.r#unsafe = true;
69
70  let root = parse_document(&arena, content, &options);
71
72  for node in root.descendants() {
73    if let NodeValue::Heading(NodeHeading { level, .. }) =
74      &node.data.borrow().value
75      && *level == 1
76    {
77      let text = extract_inline_text_from_node(node);
78      if !text.trim().is_empty() {
79        return Some(text.trim().to_string());
80      }
81    }
82  }
83  None
84}
85
86/// Extract all inline text from a node, recursively handling all inline
87/// elements.
88fn extract_inline_text_from_node<'a>(node: &'a AstNode<'a>) -> String {
89  let mut text = String::new();
90  for child in node.children() {
91    match &child.data.borrow().value {
92      NodeValue::Text(t) => text.push_str(t),
93      NodeValue::Code(t) => text.push_str(&t.literal),
94      NodeValue::Link(..)
95      | NodeValue::Emph
96      | NodeValue::Strong
97      | NodeValue::Strikethrough
98      | NodeValue::Superscript
99      | NodeValue::FootnoteReference(..) => {
100        text.push_str(&extract_inline_text_from_node(child));
101      },
102      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
103      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
104      _ => {},
105    }
106  }
107  text
108}
109
110/// Extract the first H1 heading from markdown content as the document title.
111/// Removes inline anchors and other markup from the title text.
112///
113/// # Returns
114///
115/// [`None`] if no H1 heading is found.
116///
117/// `Some(title, id)` if a H1 heading is found. id can be None if inline anchor
118/// does not exist.
119///
120/// # Panics
121///
122/// Panics if the fallback regex pattern fails to compile, which should never
123/// happen with the hardcoded pattern.
124#[must_use]
125pub fn extract_markdown_title_and_id(
126  content: &str,
127) -> Option<(String, Option<String>)> {
128  let arena = Arena::new();
129  let mut options = Options::default();
130  options.extension.table = true;
131  options.extension.footnotes = true;
132  options.extension.strikethrough = true;
133  options.extension.tasklist = true;
134  options.render.r#unsafe = true;
135
136  let root = parse_document(&arena, content, &options);
137
138  // Use a static regex to avoid compilation failures at runtime
139  #[allow(
140    clippy::items_after_statements,
141    reason = "Static is Scoped to function for clarity"
142  )]
143  static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();
144  let anchor_re = ANCHOR_RE.get_or_init(|| {
145    Regex::new(r"(\[\])?\{#(.*?)\}").unwrap_or_else(|e| {
146      log::error!(
147        "Failed to compile ANCHOR_RE regex in extract_h1_title: {e}\n Falling \
148         back to never matching regex."
149      );
150      never_matching_regex().unwrap_or_else(|_| {
151        // As a last resort, create a regex that matches nothing
152        #[allow(
153          clippy::expect_used,
154          reason = "This pattern is guaranteed to be valid"
155        )]
156        Regex::new(r"[^\s\S]")
157          .expect("regex pattern [^\\s\\S] should always compile")
158      })
159    })
160  });
161
162  for node in root.descendants() {
163    if let NodeValue::Heading(NodeHeading { level, .. }) =
164      &node.data.borrow().value
165      && *level == 1
166    {
167      let text = extract_inline_text_from_node(node);
168      // Clean the title by removing inline anchors and other NDG markup
169      let anchor_id = anchor_re
170        .captures(&text)
171        .and_then(|caps| caps.get(2).map(|m| m.as_str().to_string()));
172      let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
173      if !clean_title.is_empty() {
174        return Some((clean_title, anchor_id));
175      }
176    }
177  }
178  None
179}
180
181/// Clean anchor patterns from text (removes `{#anchor-id}` patterns).
182/// This is useful for cleaning titles and navigation text.
183///
184/// # Panics
185///
186/// Panics if fallback regex pattern fails to compile, which should never happen
187/// with hardcoded pattern.
188#[must_use]
189pub fn clean_anchor_patterns(text: &str) -> String {
190  static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();
191  let anchor_pattern = ANCHOR_PATTERN.get_or_init(|| {
192    Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap_or_else(|e| {
193      log::error!(
194        "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
195         {e}\n Falling back to never matching regex."
196      );
197      never_matching_regex().unwrap_or_else(|_| {
198        // As a last resort, create a regex that matches nothing
199        #[allow(
200          clippy::expect_used,
201          reason = "This pattern is guaranteed to be valid"
202        )]
203        Regex::new(r"[^\s\S]")
204          .expect("regex pattern [^\\s\\S] should always compile")
205      })
206    })
207  });
208  anchor_pattern.replace_all(text.trim(), "").to_string()
209}
210
211/// Strip markdown formatting and return plain text.
212///
213/// This processes the markdown through the AST and extracts only text content,
214/// excluding code blocks and other formatting.
215#[must_use]
216pub fn strip_markdown(content: &str) -> String {
217  let arena = Arena::new();
218  let mut options = Options::default();
219  options.extension.table = true;
220  options.extension.footnotes = true;
221  options.extension.strikethrough = true;
222  options.extension.tasklist = true;
223  options.render.r#unsafe = true;
224
225  let root = parse_document(&arena, content, &options);
226
227  let mut plain_text = String::new();
228
229  #[allow(clippy::items_after_statements, reason = "Helper scoped for clarity")]
230  fn extract_text<'a>(
231    node: &'a AstNode<'a>,
232    plain_text: &mut String,
233    in_code_block: &mut bool,
234  ) {
235    match &node.data.borrow().value {
236      NodeValue::Document => {
237        for child in node.children() {
238          extract_text(child, plain_text, in_code_block);
239        }
240      },
241      NodeValue::Paragraph => {
242        for child in node.children() {
243          extract_text(child, plain_text, in_code_block);
244        }
245        // Add paragraph break after each paragraph
246        plain_text.push('\n');
247      },
248      NodeValue::Heading(_) => {
249        for child in node.children() {
250          extract_text(child, plain_text, in_code_block);
251        }
252        // Add line break after heading
253        plain_text.push('\n');
254      },
255      NodeValue::Text(t) => {
256        if !*in_code_block {
257          plain_text.push_str(t);
258        }
259      },
260      NodeValue::CodeBlock(_) => {
261        *in_code_block = true;
262        for child in node.children() {
263          extract_text(child, plain_text, in_code_block);
264        }
265        *in_code_block = false;
266      },
267      NodeValue::SoftBreak => {
268        plain_text.push(' ');
269      },
270      NodeValue::LineBreak => {
271        plain_text.push('\n');
272      },
273      NodeValue::List(_) => {
274        for child in node.children() {
275          extract_text(child, plain_text, in_code_block);
276        }
277        plain_text.push('\n');
278      },
279      NodeValue::Item(_) => {
280        for child in node.children() {
281          extract_text(child, plain_text, in_code_block);
282        }
283      },
284      NodeValue::Code(c) => {
285        if !*in_code_block {
286          plain_text.push_str(&c.literal);
287        }
288      },
289      _ => {
290        for child in node.children() {
291          extract_text(child, plain_text, in_code_block);
292        }
293      },
294    }
295  }
296
297  let mut in_code_block = false;
298  extract_text(root, &mut plain_text, &mut in_code_block);
299  plain_text
300}
301
/// Capitalize the first letter of a string.
///
/// Only the first character is changed; the rest of the string is copied
/// unmodified. Uppercasing follows Unicode rules, so a single character may
/// expand to several (for example `ß` becomes `SS`). An empty input yields an
/// empty string.
#[must_use]
pub fn capitalize_first(s: &str) -> String {
  let mut chars = s.chars();
  match chars.next() {
    None => String::new(),
    Some(first) => {
      // Build the result in one buffer rather than collecting the uppercase
      // prefix and concatenating afterwards.
      let mut capitalized = String::with_capacity(s.len());
      capitalized.extend(first.to_uppercase());
      capitalized.push_str(chars.as_str());
      capitalized
    },
  }
}
309
/// Report whether a line looks like a markdown header.
///
/// A line qualifies when its first non-whitespace character is `#`. Blank
/// and empty lines do not qualify.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  let first_visible = line.chars().find(|c| !c.is_whitespace());
  first_visible == Some('#')
}
315
316/// Load manpage URL mappings from a JSON file.
317///
318/// # Errors
319///
320/// Returns an error if the file cannot be read or if the JSON is invalid.
321pub fn load_manpage_urls(
322  path: &str,
323) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
324  let content = std::fs::read_to_string(path)?;
325  let mappings: HashMap<String, String> = serde_json::from_str(&content)?;
326  Ok(mappings)
327}
328
329/// Create a regex that never matches anything.
330///
331/// This is used as a fallback pattern when a regex fails to compile.
332/// It will never match any input, which is safer than using a trivial regex
333/// like `^$` which would match empty strings.
334///
335/// # Errors
336///
337/// Returns an error if both primary and fallback regex patterns fail to
338/// compile, which should never happen with hardcoded patterns.
339pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
340  // Use a pattern that will never match anything because it asserts something
341  // impossible - this pattern is guaranteed to be valid
342  regex::Regex::new(r"[^\s\S]").or_else(|_| {
343    // As an ultimate fallback, use an empty pattern that matches nothing
344    // This SHOULD NOT happen.
345    regex::Regex::new(r"^\b$")
346  })
347}
ndg_commonmark/utils/mod.rs

ndg_commonmark/utils/
mod.rs