ndg-commonmark 2.6.0

Flavored CommonMark processor for Nix-related projects, with support for CommonMark, GFM, and Nixpkgs extensions.
Documentation
use std::{collections::HashMap, sync::OnceLock};
pub mod codeblock;

use comrak::{
  Arena,
  nodes::{AstNode, NodeHeading, NodeValue},
  options::Options,
  parse_document,
};
use regex::Regex;

/// Error type for utility operations.
///
/// The only failure currently represented is a regular expression that could
/// not be compiled.
#[derive(Debug, thiserror::Error)]
pub enum UtilError {
  /// A regex pattern failed to compile. Wraps the underlying
  /// [`regex::Error`], which is interpolated into the display message and can
  /// be converted into this variant through the derived `From` impl.
  #[error("Regex compilation failed: {0}")]
  RegexError(#[from] regex::Error),
}

/// Result type for utility operations.
///
/// Shorthand for [`Result`] with the error type fixed to [`UtilError`].
pub type UtilResult<T> = Result<T, UtilError>;

/// Turn arbitrary text into a slug suitable for use as an anchor ID.
///
/// The input is lowercased, then every character that is not alphanumeric,
/// `-` or `_` is replaced with a dash. Dashes at either end of the result are
/// stripped; runs of dashes in the middle are left untouched.
#[must_use]
pub fn slugify(text: &str) -> String {
  let dashed: String = text
    .to_lowercase()
    .chars()
    .map(|ch| {
      if ch.is_alphanumeric() || ch == '-' || ch == '_' {
        ch
      } else {
        '-'
      }
    })
    .collect();

  dashed.trim_matches('-').to_owned()
}

/// Determine the page title from markdown content.
///
/// The document is parsed and its nodes are visited in document order. The
/// title is the trimmed inline text of the first heading (of any level) whose
/// text is not empty. Returns [`None`] if no such heading exists.
#[must_use]
pub fn extract_markdown_title(content: &str) -> Option<String> {
  let arena = Arena::new();

  let mut options = Options::default();
  options.render.r#unsafe = true;
  options.extension.table = true;
  options.extension.footnotes = true;
  options.extension.strikethrough = true;
  options.extension.tasklist = true;
  options.extension.superscript = true;

  parse_document(&arena, content, &options)
    .descendants()
    .filter(|node| matches!(node.data.borrow().value, NodeValue::Heading(_)))
    .map(|heading| extract_inline_text_from_node(heading).trim().to_string())
    // Headings that carry no visible text are skipped, not returned.
    .find(|candidate| !candidate.is_empty())
}

/// Collect the plain text carried by a node's inline children.
///
/// * Text nodes and inline code literals are appended verbatim.
/// * Links, emphasis, strong, strikethrough, superscript and footnote
///   references are descended into recursively.
/// * Soft and hard line breaks are rendered as a single space, so words that
///   sit on adjacent source lines (for example in a multi-line setext
///   heading) do not fuse together.
/// * Raw inline HTML, images and every other node kind contribute nothing.
fn extract_inline_text_from_node<'a>(node: &'a AstNode<'a>) -> String {
  let mut text = String::new();
  for child in node.children() {
    match &child.data.borrow().value {
      NodeValue::Text(t) => text.push_str(t),
      NodeValue::Code(code) => text.push_str(&code.literal),
      // Without this, "Foo\nbar" inside one heading would become "Foobar".
      NodeValue::SoftBreak | NodeValue::LineBreak => text.push(' '),
      NodeValue::Link(..)
      | NodeValue::Emph
      | NodeValue::Strong
      | NodeValue::Strikethrough
      | NodeValue::Superscript
      | NodeValue::FootnoteReference(..) => {
        text.push_str(&extract_inline_text_from_node(child));
      },
      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
      _ => {},
    }
  }
  text
}

/// Extract the first H1 heading from markdown content as the document title.
///
/// Inline anchors of the form `{#id}` (optionally preceded by `[]`) are
/// removed from the title text. The ID of the first such anchor, if any, is
/// returned alongside the cleaned title.
///
/// # Returns
///
/// [`None`] if no H1 heading with non-empty title text is found.
///
/// `Some((title, id))` if an H1 heading is found. `id` is [`None`] when the
/// heading carries no inline anchor.
///
/// # Panics
///
/// Panics if the fallback regex pattern fails to compile, which should never
/// happen with the hardcoded pattern.
#[must_use]
pub fn extract_markdown_title_and_id(
  content: &str,
) -> Option<(String, Option<String>)> {
  // Compiled lazily on first use and then shared, so the pattern is not
  // rebuilt on every call.
  static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();

  let arena = Arena::new();
  let mut options = Options::default();
  options.extension.table = true;
  options.extension.footnotes = true;
  options.extension.strikethrough = true;
  options.extension.tasklist = true;
  options.render.r#unsafe = true;

  let root = parse_document(&arena, content, &options);

  let anchor_re = ANCHOR_RE.get_or_init(|| {
    Regex::new(r"(\[\])?\{#(.*?)\}").unwrap_or_else(|e| {
      log::error!(
        "Failed to compile ANCHOR_RE regex in \
         extract_markdown_title_and_id: {e}\n Falling back to never matching \
         regex."
      );
      never_matching_regex().unwrap_or_else(|_| {
        // As a last resort, create a regex that matches nothing
        #[allow(
          clippy::expect_used,
          reason = "This pattern is guaranteed to be valid"
        )]
        Regex::new(r"[^\s\S]")
          .expect("regex pattern [^\\s\\S] should always compile")
      })
    })
  });

  for node in root.descendants() {
    let is_h1 = matches!(
      &node.data.borrow().value,
      NodeValue::Heading(NodeHeading { level: 1, .. })
    );
    if !is_h1 {
      continue;
    }

    let raw_title = extract_inline_text_from_node(node);

    // Strip every inline anchor; an H1 that consists of nothing but anchors
    // is skipped in favour of the next H1.
    let without_anchors = anchor_re.replace_all(&raw_title, "");
    let clean_title = without_anchors.trim();
    if clean_title.is_empty() {
      continue;
    }

    // Capture group 2 holds the text between `{#` and `}` of the first
    // anchor in the heading.
    let anchor_id = anchor_re
      .captures(&raw_title)
      .and_then(|caps| caps.get(2))
      .map(|m| m.as_str().to_string());

    return Some((clean_title.to_string(), anchor_id));
  }
  None
}

/// Remove a trailing `{#anchor-id}` marker from text.
///
/// The input is trimmed first. A single anchor at the very end, together
/// with the whitespace around it, is then removed. Anchor IDs may contain
/// ASCII letters, digits, `_` and `-`. Anchors that are not at the end of the
/// text are left in place. Useful for cleaning titles and navigation text.
///
/// # Panics
///
/// Panics if the fallback regex pattern fails to compile, which should never
/// happen with the hardcoded pattern.
#[must_use]
pub fn clean_anchor_patterns(text: &str) -> String {
  static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();

  // Builds the pattern, degrading to a regex that never matches if
  // compilation fails.
  let build_pattern = || match Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$") {
    Ok(compiled) => compiled,
    Err(e) => {
      log::error!(
        "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
         {e}\n Falling back to never matching regex."
      );
      match never_matching_regex() {
        Ok(fallback) => fallback,
        Err(_) => {
          // As a last resort, create a regex that matches nothing
          #[allow(
            clippy::expect_used,
            reason = "This pattern is guaranteed to be valid"
          )]
          Regex::new(r"[^\s\S]")
            .expect("regex pattern [^\\s\\S] should always compile")
        },
      }
    },
  };

  let trimmed = text.trim();
  ANCHOR_PATTERN
    .get_or_init(build_pattern)
    .replace_all(trimmed, "")
    .into_owned()
}

/// Rewrite every match of `regex` in `html` using `transform`.
///
/// `transform` receives the captures of each match and returns the text that
/// replaces it. The result is always an owned string; when nothing matches
/// it is a copy of `html`. Used by the markdown processor for HTML element
/// transformations.
pub fn process_html_elements<F>(
  html: &str,
  regex: &Regex,
  transform: F,
) -> String
where
  F: Fn(&regex::Captures) -> String,
{
  regex.replace_all(html, transform).into_owned()
}

/// Reduce markdown to plain text.
///
/// The content is parsed into an AST, which is then walked to collect text
/// and inline code literals only. Paragraphs, headings and lists are each
/// followed by a newline, soft breaks become a space and hard breaks a
/// newline. Text nested under a code block node is skipped.
#[must_use]
pub fn strip_markdown(content: &str) -> String {
  // Visit each child of `node` in document order.
  fn walk_children<'a>(
    node: &'a AstNode<'a>,
    out: &mut String,
    in_code_block: &mut bool,
  ) {
    for child in node.children() {
      walk(child, out, in_code_block);
    }
  }

  // Append the plain-text rendering of `node` and its subtree to `out`.
  fn walk<'a>(
    node: &'a AstNode<'a>,
    out: &mut String,
    in_code_block: &mut bool,
  ) {
    match &node.data.borrow().value {
      // Leaves that carry text; suppressed while inside a code block.
      NodeValue::Text(t) => {
        if !*in_code_block {
          out.push_str(t);
        }
      },
      NodeValue::Code(code) => {
        if !*in_code_block {
          out.push_str(&code.literal);
        }
      },
      // Breaks map to their plain-text whitespace.
      NodeValue::SoftBreak => out.push(' '),
      NodeValue::LineBreak => out.push('\n'),
      // Anything below a code block is walked with the flag raised.
      NodeValue::CodeBlock(_) => {
        *in_code_block = true;
        walk_children(node, out, in_code_block);
        *in_code_block = false;
      },
      // Block containers that end with a line break.
      NodeValue::Paragraph | NodeValue::Heading(_) | NodeValue::List(_) => {
        walk_children(node, out, in_code_block);
        out.push('\n');
      },
      // Document, list items and every other node: just descend.
      _ => walk_children(node, out, in_code_block),
    }
  }

  let arena = Arena::new();
  let mut options = Options::default();
  options.extension.table = true;
  options.extension.footnotes = true;
  options.extension.strikethrough = true;
  options.extension.tasklist = true;
  options.render.r#unsafe = true;

  let root = parse_document(&arena, content, &options);

  let mut plain_text = String::new();
  let mut in_code_block = false;
  walk(root, &mut plain_text, &mut in_code_block);
  plain_text
}

/// Return `s` with its first character converted to uppercase.
///
/// The rest of the string is copied unchanged and an empty input yields an
/// empty string. The uppercase form of a character may be longer than one
/// character (for example `ß` becomes `SS`).
pub fn capitalize_first(s: &str) -> String {
  let mut rest = s.chars();
  match rest.next() {
    None => String::new(),
    Some(first) => {
      let mut out = String::with_capacity(s.len());
      out.extend(first.to_uppercase());
      out.push_str(rest.as_str());
      out
    },
  }
}

/// Report whether a line looks like a markdown header.
///
/// A line qualifies when its first non-whitespace character is `#`. Empty
/// and whitespace-only lines do not qualify.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  let first_visible = line.chars().find(|ch| !ch.is_whitespace());
  first_visible == Some('#')
}

/// Read manpage URL mappings from a JSON file.
///
/// The file at `path` is expected to hold a JSON object whose keys and
/// values are all strings.
///
/// # Errors
///
/// Returns an error if the file cannot be read or if its contents cannot be
/// deserialized into a string-to-string map.
pub fn load_manpage_urls(
  path: &str,
) -> Result<HashMap<String, String>, Box<dyn std::error::Error>> {
  let raw_json = std::fs::read_to_string(path)?;
  Ok(serde_json::from_str::<HashMap<String, String>>(&raw_json)?)
}

/// Build a regex that cannot match any input.
///
/// Intended as a fallback when another pattern fails to compile. Unlike a
/// trivial pattern such as `^$`, which matches the empty string, the
/// patterns used here are meant to match nothing at all.
///
/// # Errors
///
/// Returns an error if both the primary and the fallback pattern fail to
/// compile, which should never happen with hardcoded patterns.
pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
  // Primary choice: a character class that excludes both whitespace and
  // non-whitespace, leaving nothing it could match.
  match regex::Regex::new(r"[^\s\S]") {
    Ok(never) => Ok(never),
    // Ultimate fallback, which SHOULD NOT be reached: start and end of input
    // with a word boundary in between.
    Err(_) => regex::Regex::new(r"^\b$"),
  }
}