ndg-commonmark 2.6.0

//! Core implementation of the Markdown processor.
//!
//! This module contains the main implementation of `MarkdownProcessor` and its
//! methods, focused on the core rendering pipeline and configuration
//! management.
use std::{
  collections::HashMap,
  path::{Path, PathBuf},
};

use comrak::{
  Arena,
  nodes::{AstNode, NodeHeading, NodeValue},
  options::Options,
  parse_document,
};
use log::trace;
use markup5ever::local_name;
use walkdir::WalkDir;

/// Error type for DOM operations.
///
/// Each variant carries a human-readable description that is embedded in the
/// `Display` output generated by `thiserror`.
#[derive(Debug, thiserror::Error)]
pub enum DomError {
  /// A CSS selector could not be applied; the payload describes the failure.
  #[error("CSS selector failed: {0}")]
  SelectorError(String),
  /// The DOM could not be serialized; the payload describes the failure.
  #[error("DOM serialization failed: {0}")]
  SerializationError(String),
}

/// Result type for DOM operations: `Ok(T)` on success, [`DomError`] on
/// failure.
pub type DomResult<T> = Result<T, DomError>;

/// Run a CSS `selector` against `document` and return the matching nodes.
///
/// A selector that fails to compile is not treated as fatal: a warning is
/// logged and an empty list is returned, so callers can simply iterate over
/// the result.
fn safe_select(
  document: &kuchikikiki::NodeRef,
  selector: &str,
) -> Vec<kuchikikiki::NodeRef> {
  document.select(selector).map_or_else(
    |e| {
      log::warn!("DOM selector '{selector}' failed: {e:?}");
      Vec::new()
    },
    |matches| matches.map(|hit| hit.as_node().clone()).collect(),
  )
}

use super::{
  process::process_safe,
  types::{
    AstTransformer,
    MarkdownOptions,
    MarkdownProcessor,
    PromptTransformer,
  },
};
use crate::{
  syntax::create_default_manager,
  types::{Header, MarkdownResult},
  utils,
};

impl MarkdownProcessor {
  /// Create a new `MarkdownProcessor` with the given options.
  #[must_use]
  pub fn new(options: MarkdownOptions) -> Self {
    let manpage_urls = options
      .manpage_urls_path
      .as_ref()
      .and_then(|path| crate::utils::load_manpage_urls(path).ok());

    let syntax_manager = if options.highlight_code {
      match create_default_manager() {
        Ok(manager) => {
          log::info!("Syntax highlighting initialized successfully");
          Some(manager)
        },
        Err(e) => {
          log::error!("Failed to initialize syntax highlighting: {e}");
          log::warn!(
            "Continuing without syntax highlighting - code blocks will not be \
             highlighted"
          );
          None
        },
      }
    } else {
      None
    };

    Self {
      options,
      manpage_urls,
      syntax_manager,
      base_dir: std::path::PathBuf::from("."),
    }
  }

  /// Access processor options.
  ///
  /// Returns a shared reference to the `MarkdownOptions` this processor was
  /// constructed with.
  #[must_use]
  pub const fn options(&self) -> &MarkdownOptions {
    &self.options
  }

  /// Set the base directory for resolving relative file paths.
  #[must_use]
  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
    self.base_dir = base_dir.to_path_buf();
    self
  }

  /// Check if a specific feature is enabled.
  ///
  /// `Gfm`, `Nixpkgs` and `SyntaxHighlighting` report the corresponding
  /// option flag; `ManpageUrls` reports whether a manpage URL mapping is
  /// actually loaded.
  ///
  /// NOTE(review): `SyntaxHighlighting` reflects only the `highlight_code`
  /// option, not whether a syntax manager is present - confirm this asymmetry
  /// with `ManpageUrls` is intended.
  #[must_use]
  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
    match feature {
      ProcessorFeature::Gfm => self.options.gfm,
      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
    }
  }

  /// Get the manpage URLs mapping for use with standalone functions.
  ///
  /// Returns `None` when no mapping is stored on this processor.
  #[must_use]
  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
    self.manpage_urls.as_ref()
  }

  /// Highlight all code blocks in HTML using the configured syntax
  /// highlighter.
  ///
  /// Every `<pre><code>` pair is replaced by
  /// `<pre class="highlight"><code class="language-…">…</code></pre>`
  /// containing the highlighter's output. The language is taken from a
  /// `language-` prefix on the `class` attribute and defaults to `text`.
  /// Blocks the highlighter cannot handle are left untouched.
  ///
  /// The input is returned unchanged when highlighting is disabled, no syntax
  /// manager is available, or the modified DOM cannot be serialized. Otherwise
  /// the result is the serialization of the whole parsed document.
  #[must_use]
  pub fn highlight_codeblocks(&self, html: &str) -> String {
    use kuchikikiki::parse_html;
    use tendril::TendrilSink;

    if !self.options.highlight_code || self.syntax_manager.is_none() {
      return html.to_string();
    }

    let document = parse_html().one(html);

    // Collect all code blocks first to avoid DOM modification during iteration
    let mut code_blocks = Vec::new();
    for code_node in safe_select(&document, "pre > code") {
      let Some(element) = code_node.as_element() else {
        continue;
      };
      let Some(pre_node) = code_node.parent() else {
        continue;
      };
      let language = element
        .attributes
        .borrow()
        .get("class")
        .and_then(|class| class.strip_prefix("language-"))
        .unwrap_or("text")
        .to_string();
      code_blocks.push((pre_node, code_node.text_contents(), language));
    }

    for (pre_node, code_text, language) in code_blocks {
      // Leave the block untouched (no highlight/language-* classes) when the
      // highlighter produces nothing.
      let Some(highlighted) = self.highlight_code_html(&code_text, &language)
      else {
        continue;
      };

      // NOTE(review): `language` is interpolated into the attribute without
      // escaping.
      let wrapped_html = format!(
        r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
      );

      // `parse_html` always yields a complete document (html/head/body).
      // Splice in only the body's children; inserting the document node
      // itself would serialize a nested <html><head><body> inside the body.
      let fragment = parse_html().one(wrapped_html.as_str());
      let replacement = safe_select(&fragment, "body")
        .into_iter()
        .next()
        .map(|body| body.children().collect::<Vec<_>>())
        .unwrap_or_default();
      if replacement.is_empty() {
        // Nothing usable was parsed; keep the original block.
        continue;
      }

      for node in replacement {
        pre_node.insert_before(node);
      }
      pre_node.detach();
    }

    let mut buf = Vec::new();
    if let Err(e) = document.serialize(&mut buf) {
      log::warn!("DOM serialization failed: {e:?}");
      return html.to_string(); // Return original HTML if serialization fails
    }
    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
  }

  /// Apply the configured `tab_style` policy to a piece of code.
  ///
  /// Text without tab characters is returned as-is. Otherwise:
  /// `None` leaves the text alone, `Warn` logs a warning and leaves the text
  /// alone, and `Normalize` replaces every tab with two spaces.
  fn handle_hardtabs(&self, code: &str) -> String {
    use super::types::TabStyle;

    if code.contains('\t') {
      match self.options.tab_style {
        // Only the normalizing policy changes the text.
        TabStyle::Normalize => {
          log::debug!("Replacing hard tabs with spaces");
          return code.replace('\t', "  ");
        },
        TabStyle::Warn => {
          log::warn!(
            "Hard tabs detected in code block. Consider using spaces for \
             consistency. Tools like editorconfig may help you normalize \
             spaces in your documents."
          );
        },
        TabStyle::None => {},
      }
    }

    code.to_string()
  }

  /// Process hard tabs in code blocks within markdown content.
  ///
  /// Walks the document line by line, tracking fenced code blocks opened by
  /// three or more backticks or tildes (optionally indented). A block is
  /// closed by a fence of the same character that is at least as long as the
  /// opening fence. Every line seen while a block is open (including the
  /// opening fence line) that contains a tab is passed to
  /// `handle_hardtabs`; all other text is copied verbatim.
  ///
  /// Line terminators, including a trailing newline and `\r\n` sequences,
  /// are preserved exactly, so content without tabs round-trips unchanged.
  /// When `tab_style` is `TabStyle::None` the input is returned as-is.
  fn process_hardtabs(&self, markdown: &str) -> String {
    use super::types::TabStyle;

    // If no tab handling is needed, return as-is
    if self.options.tab_style == TabStyle::None {
      return markdown.to_string();
    }

    let mut result = String::with_capacity(markdown.len());
    // Fence character and fence length of the currently open code block.
    let mut open_fence: Option<(char, usize)> = None;

    // `split_inclusive` keeps each line's own terminator attached, so the
    // output reproduces the input's line endings byte for byte.
    for segment in markdown.split_inclusive('\n') {
      let trimmed = segment.trim_start();
      let fence_char =
        trimmed.chars().next().filter(|c| matches!(c, '`' | '~'));

      if let Some(fence_char) = fence_char {
        let fence_count =
          trimmed.chars().take_while(|&c| c == fence_char).count();

        if fence_count >= 3 {
          match open_fence {
            // Starting a code block
            None => open_fence = Some((fence_char, fence_count)),
            // Ending a code block
            Some((open_char, open_count))
              if open_char == fence_char && fence_count >= open_count =>
            {
              open_fence = None;
            },
            // A different or shorter fence inside a block is plain content
            Some(_) => {},
          }
        }
      }

      if open_fence.is_some() && segment.contains('\t') {
        result.push_str(&self.handle_hardtabs(segment));
      } else {
        result.push_str(segment);
      }
    }

    result
  }

  /// Run the syntax highlighter over `code` for the given `language` and
  /// return the resulting HTML.
  ///
  /// Yields `None` when highlighting is disabled, when no syntax manager is
  /// available, or when the highlighter reports an error.
  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
    let manager = self
      .syntax_manager
      .as_ref()
      .filter(|_| self.options.highlight_code)?;
    let theme = self.options.highlight_theme.as_deref();

    manager.highlight_code(code, language, theme).ok()
  }

  /// Render Markdown to HTML and collect document metadata.
  ///
  /// The input is preprocessed once; headers and title are extracted from
  /// the preprocessed text, and the same text is fed through the HTML
  /// pipeline. The files reported by preprocessing are passed through in
  /// `included_files`.
  #[must_use]
  pub fn render(&self, markdown: &str) -> MarkdownResult {
    let (source, included_files) = self.preprocess(markdown);
    let (headers, title) = self.extract_headers(&source);

    MarkdownResult {
      html: self.process_html_pipeline(&source),
      headers,
      title,
      included_files,
    }
  }

  /// Turn preprocessed Markdown into final HTML.
  ///
  /// Stages, in order: Markdown to HTML conversion, option reference
  /// processing (only compiled with the `ndg-flavored` feature), manpage
  /// reference processing (when `nixpkgs` is enabled), code block
  /// highlighting (when `highlight_code` is enabled), and finally the DOM
  /// post-processing pass.
  fn process_html_pipeline(&self, content: &str) -> String {
    let mut html = self.convert_to_html(content);

    #[cfg(feature = "ndg-flavored")]
    {
      html = super::extensions::process_option_references(
        &html,
        self.options.valid_options.as_ref(),
      );
    }

    if self.options.nixpkgs {
      html = self.process_manpage_references_html(&html);
    }
    if self.options.highlight_code {
      html = self.highlight_codeblocks(&html);
    }

    self.kuchiki_postprocess(&html)
  }

  /// Run every enabled Markdown-level transformation over `content`.
  ///
  /// Order: MyST-style autolinks, hard-tab handling, Nixpkgs preprocessing
  /// (when `nixpkgs` is enabled), then role markup (when `nixpkgs` is enabled
  /// or the `ndg-flavored` feature is compiled in).
  ///
  /// Returns the transformed text together with the files reported by the
  /// Nixpkgs preprocessing step (empty when that step did not run).
  fn preprocess(
    &self,
    content: &str,
  ) -> (String, Vec<crate::types::IncludedFile>) {
    let source = content.to_string();
    let autolinked = super::extensions::process_myst_autolinks(&source);
    let detabbed = self.process_hardtabs(&autolinked);

    let (expanded, included_files) = if self.options.nixpkgs {
      self.apply_nixpkgs_preprocessing(&detabbed)
    } else {
      (detabbed, Vec::new())
    };

    let processed = if self.options.nixpkgs || cfg!(feature = "ndg-flavored")
    {
      super::extensions::process_role_markup(
        &expanded,
        self.manpage_urls.as_ref(),
        self.options.auto_link_options,
        self.options.valid_options.as_ref(),
      )
    } else {
      expanded
    };

    (processed, included_files)
  }

  /// Nixpkgs-flavoured preprocessing: file includes, block elements, then
  /// inline anchors.
  ///
  /// Includes are resolved relative to `base_dir`. If include processing
  /// fails, a warning is logged and the remaining steps run on the original
  /// content with an empty list of included files.
  #[cfg(feature = "nixpkgs")]
  fn apply_nixpkgs_preprocessing(
    &self,
    content: &str,
  ) -> (String, Vec<crate::types::IncludedFile>) {
    let (expanded, included_files) =
      super::extensions::process_file_includes(content, &self.base_dir, 0)
        .unwrap_or_else(|e| {
          log::warn!(
            "File include processing failed: {e}. Continuing without includes."
          );
          (content.to_string(), Vec::new())
        });

    let blocks_done = super::extensions::process_block_elements(&expanded);
    (
      super::extensions::process_inline_anchors(&blocks_done),
      included_files,
    )
  }

  /// Stand-in used when the `nixpkgs` feature is not compiled in: returns an
  /// owned copy of `content` and no included files.
  #[cfg(not(feature = "nixpkgs"))]
  fn apply_nixpkgs_preprocessing(
    &self,
    content: &str,
  ) -> (String, Vec<crate::types::IncludedFile>) {
    let unchanged = content.to_owned();
    (unchanged, Vec::new())
  }

  /// Collect all headings of `content` and determine the document title.
  ///
  /// Before parsing, any line that does not start with `#` but contains a
  /// `{#id}` anchor is rewritten to a level-2 heading carrying that anchor.
  /// Each heading's text is assembled from its direct inline children. Its id
  /// is, in order of precedence: a trailing `{#id}` in the collected text, an
  /// `{#id}` found in an inline HTML child, or the slugified text.
  ///
  /// Returns the headings in document order and the text of the first
  /// level-1 heading, if any.
  #[must_use]
  pub fn extract_headers(
    &self,
    content: &str,
  ) -> (Vec<Header>, Option<String>) {
    use std::fmt::Write;

    let arena = Arena::new();
    let options = self.comrak_options();

    // Promote bare `text {#id}` lines to h2 headings.
    let mut normalized = String::with_capacity(content.len());
    for line in content.lines() {
      let candidate = line.trim_end();
      let bare_anchor = if candidate.starts_with('#') {
        None
      } else {
        candidate.rfind("{#").and_then(|open| {
          candidate[open..].find('}').map(|close| (open, close))
        })
      };

      match bare_anchor {
        Some((open, close)) => {
          let text = candidate[..open].trim_end();
          let id = &candidate[open + 2..open + close];
          let _ = writeln!(normalized, "## {text} {{#{id}}}");
        },
        None => {
          normalized.push_str(line);
          normalized.push('\n');
        },
      }
    }

    let root = parse_document(&arena, &normalized, &options);

    let mut headers = Vec::new();
    let mut found_title = None;

    for node in root.descendants() {
      let ast = node.data.borrow();
      let NodeValue::Heading(NodeHeading { level, .. }) = &ast.value else {
        continue;
      };

      let mut text = String::new();
      let mut explicit_id = None;

      for child in node.children() {
        match &child.data.borrow().value {
          NodeValue::Text(t) => text.push_str(t),
          NodeValue::Code(t) => text.push_str(&t.literal),
          NodeValue::Link(..)
          | NodeValue::Emph
          | NodeValue::Strong
          | NodeValue::Subscript
          | NodeValue::Strikethrough
          | NodeValue::Superscript
          | NodeValue::FootnoteReference(..) => {
            text.push_str(&extract_inline_text(child));
          },
          NodeValue::HtmlInline(html) => {
            // An inline HTML node may carry the explicit anchor: {#id}
            let html_str = html.as_str();
            if let Some(start) = html_str.find("{#")
              && let Some(end) = html_str[start..].find('}')
            {
              explicit_id = Some(html_str[start + 2..start + end].to_string());
            }
          },
          // Images and every other node kind contribute no heading text.
          _ => {},
        }
      }

      // A trailing {#id} in the collected text wins over everything else.
      let trimmed = text.trim_end();
      let trailing_anchor = trimmed.rfind("{#").and_then(|start| {
        trimmed[start..].find('}').map(|end| (start, end))
      });
      let (final_text, id) = match trailing_anchor {
        Some((start, end)) => {
          (
            trimmed[..start].trim_end().to_string(),
            trimmed[start + 2..start + end].to_string(),
          )
        },
        None => {
          (
            text.clone(),
            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
          )
        },
      };

      if *level == 1 && found_title.is_none() {
        found_title = Some(final_text.clone());
      }
      headers.push(Header {
        text: final_text,
        level: *level,
        id,
      });
    }

    (headers, found_title)
  }

  /// Parse `content` with comrak, apply the prompt AST transformer, render
  /// to HTML and resolve header anchors.
  ///
  /// A formatting error from comrak is ignored; whatever was written to the
  /// output buffer up to that point is used.
  fn convert_to_html(&self, content: &str) -> String {
    // Process directly without panic catching for better performance
    let options = self.comrak_options();
    let arena = Arena::new();
    let root = parse_document(&arena, content, &options);

    // AST-level transformations run before rendering.
    PromptTransformer.transform(root);

    let mut rendered = String::new();
    let _ = comrak::format_html(root, &options, &mut rendered);

    // Header anchors are handled on the rendered HTML.
    Self::process_header_anchors_html(&rendered)
  }

  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
  /// proper id attributes. Also adds auto-generated IDs to headers without
  /// explicit anchors.
  fn process_header_anchors_html(html: &str) -> String {
    use std::sync::LazyLock;

    use regex::Regex;

    // First pass: handle explicit {#id} syntax
    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
        .unwrap_or_else(|e| {
          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
          utils::never_matching_regex().unwrap_or_else(|_| {
            #[allow(
              clippy::expect_used,
              reason = "This pattern is guaranteed to be valid"
            )]
            Regex::new(r"[^\s\S]")
              .expect("regex pattern [^\\s\\S] should always compile")
          })
        })
    });

    // Second pass: add IDs to headers without attributes (no id yet)
    // Matches <h1>content</h1> but not <h1 id="...">content</h1>
    static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
      Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
        log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
        utils::never_matching_regex().unwrap_or_else(|_| {
          #[allow(
            clippy::expect_used,
            reason = "This pattern is guaranteed to be valid"
          )]
          Regex::new(r"[^\s\S]")
            .expect("regex pattern [^\\s\\S] should always compile")
        })
      })
    });

    // Regex to strip HTML tags for slugification
    static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
      Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
        log::error!("Failed to compile HTML_TAG_RE regex: {e}");
        utils::never_matching_regex().unwrap_or_else(|_| {
          #[allow(
            clippy::expect_used,
            reason = "This pattern is guaranteed to be valid"
          )]
          Regex::new(r"[^\s\S]")
            .expect("regex pattern [^\\s\\S] should always compile")
        })
      })
    });

    // First pass: explicit {#id} syntax
    let result = HEADER_ANCHOR_RE
      .replace_all(html, |caps: &regex::Captures| {
        let level = &caps[1];
        let prefix = &caps[2];
        let id = &caps[3];
        let suffix = &caps[4];
        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
      })
      .to_string();

    // Second pass: add auto-generated IDs to headers without id attribute
    HEADER_NO_ID_RE
      .replace_all(&result, |caps: &regex::Captures| {
        let level = &caps[1];
        let content = &caps[2];
        // Strip HTML tags and slugify the text content
        let text_only = HTML_TAG_RE.replace_all(content, "");
        let id = utils::slugify(&text_only);
        if id.is_empty() {
          // If slugify produces empty string, keep header without id
          format!("<h{level}>{content}</h{level}>")
        } else {
          format!("<h{level} id=\"{id}\">{content}</h{level}>")
        }
      })
      .to_string()
  }

  /// Translate this processor's settings into comrak `Options`.
  ///
  /// With `gfm` enabled, tables, footnotes, strikethrough, task lists,
  /// superscript and autolinks are switched on. Independently of `gfm`,
  /// description lists are enabled, comrak's own header id generation is
  /// turned off (header ids are handled by this crate), and unsafe rendering
  /// is enabled.
  fn comrak_options(&self) -> Options<'_> {
    let mut options = Options::default();

    let extension = &mut options.extension;
    if self.options.gfm {
      extension.table = true;
      extension.footnotes = true;
      extension.strikethrough = true;
      extension.tasklist = true;
      extension.superscript = true;
      extension.autolink = true;
    }
    // Enable description lists but keep custom header processing
    extension.description_lists = true;
    extension.header_ids = None;

    options.render.r#unsafe = true;
    options
  }

  /// Hand the rendered HTML and the loaded manpage URL mapping (if any) to
  /// the manpage reference extension and return its output.
  #[cfg(feature = "nixpkgs")]
  fn process_manpage_references_html(&self, html: &str) -> String {
    let urls = self.manpage_urls.as_ref();
    super::extensions::process_manpage_references(html, urls)
  }

  /// Stand-in used when the `nixpkgs` feature is not compiled in: returns an
  /// owned copy of `html` without modification.
  #[cfg(not(feature = "nixpkgs"))]
  fn process_manpage_references_html(&self, html: &str) -> String {
    html.to_owned()
  }

  /// HTML post-processing using kuchiki DOM manipulation.
  ///
  /// Delegates to `kuchiki_postprocess_html`, supplying a callback that runs
  /// `apply_dom_transformations` on the document it is given, and returns
  /// that function's result.
  #[allow(
    clippy::unused_self,
    reason = "Method signature matches processor pattern"
  )]
  fn kuchiki_postprocess(&self, html: &str) -> String {
    // Use a standalone function to avoid borrowing issues
    kuchiki_postprocess_html(html, |document| {
      Self::apply_dom_transformations(document);
    })
  }

  /// Run every DOM pass over `document`, in a fixed order.
  ///
  /// The list below is the execution order; the passes mutate the document
  /// in place.
  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
    let passes: [fn(&kuchikikiki::NodeRef); 8] = [
      Self::process_list_item_id_markers,
      Self::process_header_anchor_comments,
      Self::process_list_item_inline_anchors,
      Self::process_paragraph_inline_anchors,
      Self::process_remaining_inline_anchors,
      Self::process_option_anchor_links,
      Self::process_empty_auto_links,
      Self::process_empty_html_links,
    ];

    for pass in passes {
      pass(document);
    }
  }

  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
    let mut to_modify = Vec::new();

    for comment in document.inclusive_descendants() {
      if let Some(comment_node) = comment.as_comment() {
        let comment_text = comment_node.borrow();
        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
          let id = comment_text[id_start + 16..].trim();
          if !id.is_empty()
            && id
              .chars()
              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
          {
            // Check if this comment is inside an <li> element
            if let Some(parent) = comment.parent()
              && let Some(element) = parent.as_element()
              && element.name.local.as_ref() == "li"
            {
              to_modify.push((comment.clone(), id.to_string()));
            }
          }
        }
      }
    }

    for (comment_node, id) in to_modify {
      let span = kuchikikiki::NodeRef::new_element(
        markup5ever::QualName::new(
          None,
          markup5ever::ns!(html),
          local_name!("span"),
        ),
        vec![
          (
            kuchikikiki::ExpandedName::new("", "id"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  id,
            },
          ),
          (
            kuchikikiki::ExpandedName::new("", "class"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  "nixos-anchor".into(),
            },
          ),
        ],
      );
      comment_node.insert_after(span);
      comment_node.detach();
    }
  }

  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
    let mut to_modify = Vec::new();

    for comment in document.inclusive_descendants() {
      if let Some(comment_node) = comment.as_comment() {
        let comment_text = comment_node.borrow();
        if let Some(anchor_start) = comment_text.find("anchor:") {
          let id = comment_text[anchor_start + 7..].trim();
          if !id.is_empty()
            && id
              .chars()
              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
          {
            // Check if this comment is inside a header element
            if let Some(parent) = comment.parent()
              && let Some(element) = parent.as_element()
            {
              let tag_name = element.name.local.as_ref();
              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
                to_modify.push((
                  parent.clone(),
                  comment.clone(),
                  id.to_string(),
                ));
              }
            }
          }
        }
      }
    }

    for (header_element, comment_node, id) in to_modify {
      if let Some(element) = header_element.as_element() {
        element
          .attributes
          .borrow_mut()
          .insert(local_name!("id"), id);
        comment_node.detach();
      }
    }
  }

  /// Process remaining inline anchors in list items: `<li>[]{#id}content</li>`
  ///
  /// For list items whose children are all text nodes, the first `[]{#id}`
  /// marker is replaced by `<span id="id" class="nixos-anchor"></span>`,
  /// keeping the text on both sides of the marker. The id must be non-empty
  /// and consist of alphanumerics, `-` or `_`.
  ///
  /// List items containing any non-text child (inline markup, code, nested
  /// lists, paragraphs, …) are deliberately left untouched here: rebuilding
  /// them from flattened text would destroy that markup. Their text nodes
  /// are handled individually by `process_remaining_inline_anchors`.
  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
    for li_element in safe_select(document, "li") {
      // Rebuilding from text is only lossless when there is nothing but
      // text. This also covers the code/pre case, since those are elements.
      if li_element.children().any(|child| child.as_text().is_none()) {
        continue;
      }

      let text_content = li_element.text_contents();
      let Some(anchor_start) = text_content.find("[]{#") else {
        continue;
      };
      let Some(anchor_end) = text_content[anchor_start..].find('}') else {
        continue;
      };

      let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
      if id.is_empty()
        || !id
          .chars()
          .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
      {
        continue;
      }

      let leading_content = &text_content[..anchor_start];
      let remaining_content = &text_content[anchor_start + anchor_end + 1..];

      // Clear current content and rebuild
      for child in li_element.children() {
        child.detach();
      }

      let span = kuchikikiki::NodeRef::new_element(
        markup5ever::QualName::new(
          None,
          markup5ever::ns!(html),
          local_name!("span"),
        ),
        vec![
          (
            kuchikikiki::ExpandedName::new("", "id"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  id.into(),
            },
          ),
          (
            kuchikikiki::ExpandedName::new("", "class"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  "nixos-anchor".into(),
            },
          ),
        ],
      );

      // Text in front of the marker must survive the rebuild.
      if !leading_content.is_empty() {
        li_element.append(kuchikikiki::NodeRef::new_text(leading_content));
      }
      li_element.append(span);
      if !remaining_content.is_empty() {
        li_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
      }
    }
  }

  /// Process inline anchors in paragraphs: `<p>[]{#id}content</p>`
  ///
  /// For paragraphs whose children are all text nodes, the first `[]{#id}`
  /// marker is replaced by `<span id="id" class="nixos-anchor"></span>`,
  /// keeping the text on both sides of the marker. The id must be non-empty
  /// and consist of alphanumerics, `-` or `_`.
  ///
  /// Paragraphs containing any non-text child (links, emphasis, code, …) are
  /// deliberately left untouched here: rebuilding them from flattened text
  /// would destroy that markup. Their text nodes are handled individually by
  /// `process_remaining_inline_anchors`.
  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
    for p_element in safe_select(document, "p") {
      // Rebuilding from text is only lossless when there is nothing but
      // text. This also covers the code/pre case, since those are elements.
      if p_element.children().any(|child| child.as_text().is_none()) {
        continue;
      }

      let text_content = p_element.text_contents();
      let Some(anchor_start) = text_content.find("[]{#") else {
        continue;
      };
      let Some(anchor_end) = text_content[anchor_start..].find('}') else {
        continue;
      };

      let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
      if id.is_empty()
        || !id
          .chars()
          .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
      {
        continue;
      }

      let leading_content = &text_content[..anchor_start];
      let remaining_content = &text_content[anchor_start + anchor_end + 1..];

      // Clear current content and rebuild
      for child in p_element.children() {
        child.detach();
      }

      let span = kuchikikiki::NodeRef::new_element(
        markup5ever::QualName::new(
          None,
          markup5ever::ns!(html),
          local_name!("span"),
        ),
        vec![
          (
            kuchikikiki::ExpandedName::new("", "id"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  id.into(),
            },
          ),
          (
            kuchikikiki::ExpandedName::new("", "class"),
            kuchikikiki::Attribute {
              prefix: None,
              value:  "nixos-anchor".into(),
            },
          ),
        ],
      );

      // Text in front of the marker must survive the rebuild.
      if !leading_content.is_empty() {
        p_element.append(kuchikikiki::NodeRef::new_text(leading_content));
      }
      p_element.append(span);
      if !remaining_content.is_empty() {
        p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
      }
    }
  }

  /// Process remaining standalone inline anchors throughout the document.
  ///
  /// Every text node that is not nested inside `<code>` or `<pre>` is scanned
  /// for `[]{#id}` markers, where `id` is a non-empty run of alphanumerics,
  /// `-` or `_`. Each marker is replaced by
  /// `<span id="id" class="nixos-anchor"></span>`; the surrounding text is
  /// kept as separate text nodes. Malformed markers stay in the text, and
  /// text nodes without any valid marker are not touched.
  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
    const OPENER: [char; 4] = ['[', ']', '{', '#'];
    let is_id_char = |c: char| c.is_alphanumeric() || c == '-' || c == '_';

    // Collect candidates first so the tree is not mutated while traversing.
    let mut text_nodes_to_process = Vec::new();
    for node in document.inclusive_descendants() {
      let Some(text_node) = node.as_text() else {
        continue;
      };

      // Check if this text node is inside a code block
      let mut parent = node.parent();
      let mut in_code = false;
      while let Some(p) = parent {
        if let Some(element) = p.as_element()
          && (element.name.local == local_name!("code")
            || element.name.local == local_name!("pre"))
        {
          in_code = true;
          break;
        }
        parent = p.parent();
      }
      if in_code {
        continue;
      }

      let text_content = text_node.borrow().clone();
      if text_content.contains("[]{#") {
        text_nodes_to_process.push((node.clone(), text_content));
      }
    }

    for (text_node, text_content) in text_nodes_to_process {
      // Work on chars so slicing can never split a multi-byte character.
      let chars: Vec<char> = text_content.chars().collect();
      let mut new_children = Vec::new();
      let mut found_anchor = false;
      let mut last_end = 0;
      let mut i = 0;

      while i < chars.len() {
        if !chars[i..].starts_with(&OPENER) {
          i += 1;
          continue;
        }

        let anchor_start = i;
        let id_start = anchor_start + OPENER.len();
        let id_len = chars[id_start..]
          .iter()
          .take_while(|&&c| is_id_char(c))
          .count();
        let id_end = id_start + id_len;

        if id_len == 0 || chars.get(id_end) != Some(&'}') {
          // Not a valid anchor. Resume at the first character that was not
          // consumed as part of the id: it may itself open a valid anchor
          // and must not be skipped. The id characters in between can never
          // start the pattern.
          i = id_end;
          continue;
        }

        // Add text before anchor
        if anchor_start > last_end {
          let before_text: String =
            chars[last_end..anchor_start].iter().collect();
          new_children.push(kuchikikiki::NodeRef::new_text(before_text));
        }

        // Add span element
        let id: String = chars[id_start..id_end].iter().collect();
        let span = kuchikikiki::NodeRef::new_element(
          markup5ever::QualName::new(
            None,
            markup5ever::ns!(html),
            local_name!("span"),
          ),
          vec![
            (
              kuchikikiki::ExpandedName::new("", "id"),
              kuchikikiki::Attribute {
                prefix: None,
                value:  id,
              },
            ),
            (
              kuchikikiki::ExpandedName::new("", "class"),
              kuchikikiki::Attribute {
                prefix: None,
                value:  "nixos-anchor".into(),
              },
            ),
          ],
        );
        new_children.push(span);
        found_anchor = true;

        // Continue right after the closing '}'.
        last_end = id_end + 1;
        i = last_end;
      }

      // Nothing valid found: leave the original text node in place.
      if !found_anchor {
        continue;
      }

      // Add remaining text
      if last_end < chars.len() {
        let after_text: String = chars[last_end..].iter().collect();
        new_children.push(kuchikikiki::NodeRef::new_text(after_text));
      }

      // Replace the text node with the pieces, preserving their order.
      for child in new_children {
        text_node.insert_before(child);
      }
      text_node.detach();
    }
  }

  /// Give text to same-page links that have none:
  /// `[](#anchor)` -> `<a href="#anchor">Anchor</a>`
  ///
  /// Applies to `<a>` elements whose `href` starts with `#` and whose text is
  /// blank or the `{{ANCHOR}}` placeholder. A placeholder is removed first;
  /// then the humanized form of the href is appended as link text.
  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
    for link in safe_select(document, "a") {
      let Some(element) = link.as_element() else {
        continue;
      };

      let href = element
        .attributes
        .borrow()
        .get(local_name!("href"))
        .map(std::string::ToString::to_string);
      let Some(href) = href.filter(|target| target.starts_with('#')) else {
        continue;
      };

      let text = link.text_contents();
      let label = text.trim();
      let is_placeholder = label == "{{ANCHOR}}";
      if !label.is_empty() && !is_placeholder {
        continue;
      }

      // Drop the placeholder so only the generated text remains.
      if is_placeholder {
        for child in link.children() {
          child.detach();
        }
      }

      let display_text = Self::humanize_anchor_id(&href);
      link.append(kuchikikiki::NodeRef::new_text(display_text));
    }
  }

  /// Fill in `<a href="#...">` elements that have no visible content.
  ///
  /// Links matched by the `a[href^='#']` selector whose text is blank or the
  /// literal `{{ANCHOR}}` placeholder get a text node produced by
  /// [`Self::humanize_anchor_id`] from their `href`.
  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
    for anchor in safe_select(document, "a[href^='#']") {
      let contents = anchor.text_contents();
      let label = contents.trim();
      let is_placeholder = label == "{{ANCHOR}}";

      if !label.is_empty() && !is_placeholder {
        // Link already has real text.
        continue;
      }

      // Drop the placeholder children before inserting the generated label.
      if is_placeholder {
        for child in anchor.children() {
          child.detach();
        }
      }

      let Some(element) = anchor.as_element() else {
        continue;
      };
      let attributes = element.attributes.borrow();
      if let Some(href) = attributes.get(local_name!("href")) {
        let generated = Self::humanize_anchor_id(href);
        anchor.append(kuchikikiki::NodeRef::new_text(generated));
      }
    }
  }

  /// Redirect option anchors to the options page.
  ///
  /// Links whose `href` starts with `#opt-` are rewritten to
  /// `options.html#opt-...`. When such a link has no text (or only the
  /// `{{ANCHOR}}` placeholder), its children are replaced by the option name
  /// obtained from the anchor by dropping `opt-` and turning every `-` into
  /// `.` (`opt-services-nginx-enable` -> `services.nginx.enable`).
  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
    // Phase 1: inspect every candidate and record what has to change, before
    // any node is touched.
    let pending: Vec<_> = safe_select(document, "a[href^='#opt-']")
      .into_iter()
      .filter_map(|anchor| {
        let href = anchor
          .as_element()?
          .attributes
          .borrow()
          .get(local_name!("href"))
          .map(String::from)?;

        // Anchor without the leading '#', still carrying the "opt-" prefix.
        let option_anchor = href
          .strip_prefix('#')
          .filter(|rest| rest.starts_with("opt-"))?
          .to_string();

        let contents = anchor.text_contents();
        let label = contents.trim();
        let wants_label = label.is_empty() || label == "{{ANCHOR}}";

        Some((anchor, option_anchor, wants_label))
      })
      .collect();

    // Phase 2: rewrite hrefs and, where requested, the link text.
    for (anchor, option_anchor, wants_label) in pending {
      let Some(element) = anchor.as_element() else {
        continue;
      };

      element.attributes.borrow_mut().insert(
        local_name!("href"),
        format!("options.html#{option_anchor}"),
      );

      if !wants_label {
        continue;
      }

      // Remove whatever the link contained (blank text or the placeholder).
      for child in anchor.children() {
        child.detach();
      }

      if let Some(option_path) = option_anchor.strip_prefix("opt-") {
        let dotted = option_path.replace('-', ".");
        anchor.append(kuchikikiki::NodeRef::new_text(dotted));
      }
    }
  }

  /// Convert an anchor ID to human-readable text.
  ///
  /// Steps:
  /// 1. Leading `#` characters are removed.
  /// 2. At most ONE of the well-known prefixes `sec-`, `ssec-` or `opt-` is
  ///    stripped. Only a single leading occurrence is removed, so an anchor
  ///    such as `#sec-opt-in` yields "Opt In" rather than losing the `opt-`
  ///    that belongs to the actual title.
  /// 3. The remainder is split on `-`, `_` and whitespace, each word gets its
  ///    first character upper-cased, and the words are joined with single
  ///    spaces.
  ///
  /// Returns an empty string when nothing is left after stripping.
  fn humanize_anchor_id(anchor: &str) -> String {
    // Namespace prefixes that carry no meaning for a reader. None of them is
    // a prefix of another, so the lookup order does not matter.
    const KNOWN_PREFIXES: [&str; 3] = ["ssec-", "sec-", "opt-"];

    // Strip the leading #
    let id = anchor.trim_start_matches('#');

    // `strip_prefix` removes a single occurrence, unlike
    // `trim_start_matches`, which would keep eating repeated prefixes.
    let stem = KNOWN_PREFIXES
      .iter()
      .find_map(|prefix| id.strip_prefix(*prefix))
      .unwrap_or(id);

    // Split on separators and capitalize each word
    stem
      .split(|c: char| c == '-' || c == '_' || c.is_whitespace())
      .filter(|word| !word.is_empty())
      .map(|word| {
        let mut chars = word.chars();
        chars.next().map_or_else(String::new, |first| {
          first.to_uppercase().collect::<String>() + chars.as_str()
        })
      })
      .collect::<Vec<String>>()
      .join(" ")
  }
}

/// Gather the plain text carried by the inline children of `node`.
///
/// Text nodes and inline code contribute their literal content. Inline
/// containers (links, emphasis, strong, strikethrough, superscript,
/// subscript, footnote references) are descended into recursively. Inline
/// HTML, images and every other node kind contribute nothing.
pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
  node.children().fold(String::new(), |mut collected, child| {
    match &child.data.borrow().value {
      NodeValue::Text(literal) => collected.push_str(literal),
      NodeValue::Code(code) => collected.push_str(&code.literal),
      NodeValue::Link(..)
      | NodeValue::Emph
      | NodeValue::Strong
      | NodeValue::Strikethrough
      | NodeValue::Superscript
      | NodeValue::Subscript
      | NodeValue::FootnoteReference(..) => {
        let nested = extract_inline_text(child);
        collected.push_str(&nested);
      },
      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
      _ => {},
    }
    collected
  })
}

/// Recursively gather every `.md` file below `input_dir`.
///
/// Symbolic links are followed. Directory entries that cannot be read are
/// skipped, and only regular files whose extension is exactly `md` are
/// returned.
pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
  let mut markdown_paths = Vec::with_capacity(100);

  markdown_paths.extend(
    WalkDir::new(input_dir)
      .follow_links(true)
      .into_iter()
      .filter_map(Result::ok)
      .filter(|entry| {
        let candidate = entry.path();
        candidate.is_file()
          && candidate.extension().is_some_and(|ext| ext == "md")
      })
      .map(|entry| entry.path().to_owned()),
  );

  trace!("Found {} markdown files to process", markdown_paths.len());
  markdown_paths
}

/// Features that can be queried on a processor instance.
///
/// Each variant names one optional capability. The enum is a plain,
/// field-less value type: it is `Copy` and can be compared for equality.
/// NOTE(review): the method that accepts this enum is not part of this
/// excerpt — confirm how each variant maps onto the processor options.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown support
  Gfm,
  /// Nixpkgs documentation extensions
  Nixpkgs,
  /// Syntax highlighting for code blocks
  SyntaxHighlighting,
  /// Manpage URL mapping support
  ManpageUrls,
}

/// Parse `html`, let `transform_fn` mutate the resulting DOM, and serialize
/// the document back to a string.
///
/// Kept as a free function (rather than a method) to avoid borrowing issues.
/// The whole round trip runs inside `process_safe`, with the untouched input
/// passed as its last argument.
/// NOTE(review): presumably `process_safe` returns that last argument when
/// the closure fails — confirm against its definition.
///
/// If the serialized bytes are not valid UTF-8 the closure yields an empty
/// string.
fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
where
  F: FnOnce(&kuchikikiki::NodeRef),
{
  use tendril::TendrilSink;

  process_safe(
    html,
    |markup| {
      let dom = kuchikikiki::parse_html().one(markup);
      transform_fn(&dom);

      let mut buffer = Vec::new();
      // The serialization result is deliberately discarded.
      let _ = dom.serialize(&mut buffer);
      String::from_utf8(buffer).unwrap_or_else(|_| String::new())
    },
    html,
  )
}