ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5use std::{
6  collections::HashMap,
7  path::{Path, PathBuf},
8  sync::LazyLock,
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use regex::Regex;
20use walkdir::WalkDir;
21
22use super::{
23  dom::safe_select,
24  process::process_safe,
25  types::{
26    AstTransformer,
27    MarkdownOptions,
28    MarkdownProcessor,
29    PromptTransformer,
30  },
31};
32use crate::{
33  syntax::create_default_manager,
34  types::{Header, MarkdownResult},
35  utils,
36};
37
38static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
39  Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
40    .unwrap_or_else(|e| {
41      log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
42      utils::never_matching_regex().unwrap_or_else(|_| {
43        #[allow(
44          clippy::expect_used,
45          reason = "This pattern is guaranteed to be valid"
46        )]
47        Regex::new(r"[^\s\S]")
48          .expect("regex pattern [^\\s\\S] should always compile")
49      })
50    })
51});
52
53static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
54  Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
55    log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
56    utils::never_matching_regex().unwrap_or_else(|_| {
57      #[allow(
58        clippy::expect_used,
59        reason = "This pattern is guaranteed to be valid"
60      )]
61      Regex::new(r"[^\s\S]")
62        .expect("regex pattern [^\\s\\S] should always compile")
63    })
64  })
65});
66
67static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
68  Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
69    log::error!("Failed to compile HTML_TAG_RE regex: {e}");
70    utils::never_matching_regex().unwrap_or_else(|_| {
71      #[allow(
72        clippy::expect_used,
73        reason = "This pattern is guaranteed to be valid"
74      )]
75      Regex::new(r"[^\s\S]")
76        .expect("regex pattern [^\\s\\S] should always compile")
77    })
78  })
79});
80
81impl MarkdownProcessor {
82  /// Create a new `MarkdownProcessor` with the given options.
83  #[must_use]
84  pub fn new(options: MarkdownOptions) -> Self {
85    let manpage_urls = options
86      .manpage_urls_path
87      .as_ref()
88      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
89
90    let syntax_manager = if options.highlight_code {
91      match create_default_manager(
92        options
93          .syntax_queries_path
94          .as_deref()
95          .map(std::path::Path::new),
96      ) {
97        Ok(manager) => {
98          log::info!("Syntax highlighting initialized successfully");
99          Some(manager)
100        },
101        Err(e) => {
102          log::error!("Failed to initialize syntax highlighting: {e}");
103          log::warn!(
104            "Continuing without syntax highlighting - code blocks will not be \
105             highlighted"
106          );
107          None
108        },
109      }
110    } else {
111      None
112    };
113
114    Self {
115      options,
116      manpage_urls,
117      syntax_manager,
118      base_dir: std::path::PathBuf::from("."),
119    }
120  }
121
122  /// Access processor options.
123  #[must_use]
124  pub const fn options(&self) -> &MarkdownOptions {
125    &self.options
126  }
127
128  /// Set the base directory for resolving relative file paths.
129  #[must_use]
130  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
131    self.base_dir = base_dir.to_path_buf();
132    self
133  }
134
135  /// Check if a specific feature is enabled.
136  #[must_use]
137  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
138    match feature {
139      ProcessorFeature::Gfm => self.options.gfm,
140      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
141      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
142      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
143    }
144  }
145
146  /// Get the manpage URLs mapping for use with standalone functions.
147  #[must_use]
148  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
149    self.manpage_urls.as_ref()
150  }
151
152  /// Highlight all code blocks in HTML using the configured syntax highlighter
153  #[must_use]
154  pub fn highlight_codeblocks(&self, html: &str) -> String {
155    use kuchikikiki::parse_html;
156    use tendril::TendrilSink;
157
158    if !self.options.highlight_code || self.syntax_manager.is_none() {
159      return html.to_string();
160    }
161
162    let document = parse_html().one(html);
163
164    // Collect all code blocks first to avoid DOM modification during iteration
165    let mut code_blocks = Vec::new();
166    for pre_node in safe_select(&document, "pre > code") {
167      let code_node = pre_node;
168      if let Some(element) = code_node.as_element() {
169        let language = element
170          .attributes
171          .borrow()
172          .get("class")
173          .and_then(|class| class.strip_prefix("language-"))
174          .unwrap_or("text")
175          .to_string();
176        let code_text = code_node.text_contents();
177
178        if let Some(pre_parent) = code_node.parent() {
179          code_blocks.push((
180            pre_parent.clone(),
181            code_node.clone(),
182            code_text,
183            language,
184          ));
185        }
186      }
187    }
188
189    // Process each code block
190    for (pre_element, _code_node, code_text, language) in code_blocks {
191      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
192      {
193        // Wrap highlighted HTML in <pre><code> with appropriate classes
194        let wrapped_html = format!(
195          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
196        );
197        let fragment = parse_html().one(wrapped_html.as_str());
198        pre_element.insert_after(fragment);
199        pre_element.detach();
200      }
201      // Do not add highlight/language-* classes if not highlighted
202    }
203
204    let mut buf = Vec::new();
205    if let Err(e) = document.serialize(&mut buf) {
206      log::warn!("DOM serialization failed: {e:?}");
207      return html.to_string(); // Return original HTML if serialization fails
208    }
209    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
210  }
211
212  /// Handle hard tabs in code blocks according to configuration
213  fn handle_hardtabs(&self, code: &str) -> String {
214    use super::types::TabStyle;
215
216    // Check if there are any hard tabs
217    if !code.contains('\t') {
218      return code.to_string();
219    }
220
221    match self.options.tab_style {
222      // Do nothing
223      TabStyle::None => code.to_string(),
224
225      // Warn, but do nothing.
226      TabStyle::Warn => {
227        log::warn!(
228          "Hard tabs detected in code block. Consider using spaces for \
229           consistency. Tools like editorconfig may help you normalize spaces \
230           in your documents."
231        );
232        code.to_string()
233      },
234
235      // Do not warn, only inform in debug mode. Then return
236      // the updated code.
237      TabStyle::Normalize => {
238        log::debug!("Replacing hard tabs with spaces");
239        code.replace('\t', "  ")
240      },
241    }
242  }
243
244  /// Process hard tabs in code blocks within markdown content
245  fn process_hardtabs(&self, markdown: &str) -> String {
246    use super::types::TabStyle;
247    use crate::utils::codeblock::FenceTracker;
248
249    // If no tab handling is needed, return as-is
250    if self.options.tab_style == TabStyle::None {
251      return markdown.to_string();
252    }
253
254    let mut result = String::with_capacity(markdown.len());
255    let mut lines = markdown.lines().peekable();
256    let mut tracker = FenceTracker::new();
257
258    while let Some(line) = lines.next() {
259      tracker = tracker.process_line(line);
260
261      // Only replace tabs inside fenced code blocks
262      let processed_line = if tracker.in_code_block() && line.contains('\t') {
263        self.handle_hardtabs(line)
264      } else {
265        line.to_string()
266      };
267
268      result.push_str(&processed_line);
269
270      // Add newline unless this is the last line
271      if lines.peek().is_some() {
272        result.push('\n');
273      }
274    }
275
276    result
277  }
278
279  /// Highlight code using the configured syntax highlighter, returns HTML
280  /// string
281  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
282    if !self.options.highlight_code {
283      return None;
284    }
285
286    let syntax_manager = self.syntax_manager.as_ref()?;
287
288    syntax_manager
289      .highlight_code(code, language, self.options.highlight_theme.as_deref())
290      .ok()
291  }
292
293  /// Render Markdown to HTML, extracting headers and title.
294  #[must_use]
295  pub fn render(&self, markdown: &str) -> MarkdownResult {
296    let (preprocessed, included_files) = self.preprocess(markdown);
297    let (headers, title) = self.extract_headers(&preprocessed);
298    let html = self.process_html_pipeline(&preprocessed);
299
300    MarkdownResult {
301      html,
302      headers,
303      title,
304      included_files,
305    }
306  }
307
308  /// Process the HTML generation and post-processing pipeline.
309  fn process_html_pipeline(&self, content: &str) -> String {
310    let mut html = self.convert_to_html(content);
311
312    // Apply feature-specific post-processing
313    if cfg!(feature = "ndg-flavored") {
314      #[cfg(feature = "ndg-flavored")]
315      {
316        html = super::extensions::process_option_references(
317          &html,
318          self.options.valid_options.as_ref(),
319        );
320      }
321    }
322
323    if self.options.nixpkgs {
324      html = self.process_manpage_references_html(&html);
325    }
326
327    if self.options.highlight_code {
328      html = self.highlight_codeblocks(&html);
329    }
330
331    self.kuchiki_postprocess(&html)
332  }
333
334  /// Preprocess the markdown content with all enabled transformations.
335  fn preprocess(
336    &self,
337    content: &str,
338  ) -> (String, Vec<crate::types::IncludedFile>) {
339    let mut processed = content.to_string();
340    let mut included_files = Vec::new();
341
342    // Process MyST-style autolinks first
343    processed = super::extensions::process_myst_autolinks(&processed);
344
345    // Handle hard tabs in code blocks
346    processed = self.process_hardtabs(&processed);
347
348    if self.options.nixpkgs {
349      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
350      processed = content;
351      included_files = files;
352    }
353
354    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
355      processed = super::extensions::process_role_markup(
356        &processed,
357        self.manpage_urls.as_ref(),
358        self.options.auto_link_options,
359        self.options.valid_options.as_ref(),
360      );
361    }
362
363    #[cfg(feature = "wiki")]
364    {
365      processed = super::extensions::process_wikilinks(&processed);
366    }
367
368    (processed, included_files)
369  }
370
371  /// Apply Nixpkgs-specific preprocessing steps.
372  #[cfg(feature = "nixpkgs")]
373  fn apply_nixpkgs_preprocessing(
374    &self,
375    content: &str,
376  ) -> (String, Vec<crate::types::IncludedFile>) {
377    let (with_includes, included_files) =
378      match super::extensions::process_file_includes(content, &self.base_dir, 0)
379      {
380        Ok(result) => result,
381        Err(e) => {
382          log::warn!(
383            "File include processing failed: {e}. Continuing without includes."
384          );
385          (content.to_string(), Vec::new())
386        },
387      };
388    let with_blocks = super::extensions::process_block_elements(&with_includes);
389    let with_spans = super::extensions::process_bracketed_spans(&with_blocks);
390    let processed = super::extensions::process_inline_anchors(&with_spans);
391    (processed, included_files)
392  }
393
394  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
395  #[cfg(not(feature = "nixpkgs"))]
396  fn apply_nixpkgs_preprocessing(
397    &self,
398    content: &str,
399  ) -> (String, Vec<crate::types::IncludedFile>) {
400    (content.to_string(), Vec::new())
401  }
402
403  /// Extract headers and title from the markdown content.
404  #[must_use]
405  pub fn extract_headers(
406    &self,
407    content: &str,
408  ) -> (Vec<Header>, Option<String>) {
409    use std::fmt::Write;
410
411    let arena = Arena::new();
412    let options = self.comrak_options();
413
414    let content = remove_admonition_blocks_for_headers(content);
415
416    // Normalize custom anchors with no heading level to h2
417    let mut normalized = String::with_capacity(content.len());
418    let mut lines = content.lines().peekable();
419    while let Some(line) = lines.next() {
420      let trimmed = line.trim();
421      if !trimmed.starts_with('#')
422        && !lines
423          .peek()
424          .is_some_and(|next| is_setext_heading_underline(next.trim()))
425        && let Some(anchor_start) = trimmed.rfind("{#")
426        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
427      {
428        let text = trimmed[..anchor_start].trim_end();
429        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
430        let _ = writeln!(normalized, "## {text} {{#{id}}}");
431        continue;
432      }
433      normalized.push_str(line);
434      normalized.push('\n');
435    }
436
437    let root = parse_document(&arena, &normalized, &options);
438
439    let mut headers = Vec::new();
440    let mut found_title = None;
441
442    for node in root.descendants() {
443      if let NodeValue::Heading(NodeHeading { level, .. }) =
444        &node.data.borrow().value
445      {
446        let mut text = String::new();
447        let mut explicit_id = None;
448
449        for child in node.children() {
450          match &child.data.borrow().value {
451            NodeValue::Text(t) => text.push_str(t),
452            NodeValue::Code(t) => text.push_str(&t.literal),
453            NodeValue::Link(..)
454            | NodeValue::Emph
455            | NodeValue::Strong
456            | NodeValue::Subscript
457            | NodeValue::Strikethrough
458            | NodeValue::Superscript
459            | NodeValue::FootnoteReference(..) => {
460              text.push_str(&extract_inline_text(child));
461            },
462            NodeValue::HtmlInline(html) => {
463              // Look for explicit anchor in HTML inline node: {#id}
464              let html_str = html.as_str();
465              if let Some(start) = html_str.find("{#")
466                && let Some(end) = html_str[start..].find('}')
467              {
468                let anchor = &html_str[start + 2..start + end];
469                explicit_id = Some(anchor.to_string());
470              }
471            },
472            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
473            NodeValue::Image(..) => {},
474            _ => {},
475          }
476        }
477
478        // Check for trailing {#id} in heading text
479        let trimmed = text.trim_end();
480        #[allow(clippy::option_if_let_else)]
481        // Nested options clearer with if-let
482        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
483          if let Some(end) = trimmed[start..].find('}') {
484            let anchor = &trimmed[start + 2..start + end];
485            (trimmed[..start].trim_end().to_string(), anchor.to_string())
486          } else {
487            (text.clone(), explicit_id.unwrap_or_else(|| slugify_heading(&text)))
488          }
489        } else {
490          (text.clone(), explicit_id.unwrap_or_else(|| slugify_heading(&text)))
491        };
492        if *level == 1 && found_title.is_none() {
493          found_title = Some(final_text.clone());
494        }
495        headers.push(Header {
496          text: final_text,
497          level: *level,
498          id,
499        });
500      }
501    }
502
503    (headers, found_title)
504  }
505
506  /// Convert markdown to HTML using comrak and configured options.
507  fn convert_to_html(&self, content: &str) -> String {
508    // Process directly without panic catching for better performance
509    let arena = Arena::new();
510    let options = self.comrak_options();
511    let root = parse_document(&arena, content, &options);
512
513    // Apply AST transformations
514    let prompt_transformer = PromptTransformer;
515    prompt_transformer.transform(root);
516
517    let mut html_output = String::new();
518    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
519      log::error!("Failed to format HTML: {e}");
520    }
521
522    // Post-process HTML to handle header anchors
523    Self::process_header_anchors_html(&html_output)
524  }
525
526  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
527  /// proper id attributes. Also adds auto-generated IDs to headers without
528  /// explicit anchors.
529  fn process_header_anchors_html(html: &str) -> String {
530    // First pass: explicit {#id} syntax
531    let result = HEADER_ANCHOR_RE
532      .replace_all(html, |caps: &regex::Captures| {
533        let level = &caps[1];
534        let prefix = &caps[2];
535        let id = &caps[3];
536        let suffix = &caps[4];
537        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
538      })
539      .to_string();
540
541    // Second pass: add auto-generated IDs to headers without id attribute
542    HEADER_NO_ID_RE
543      .replace_all(&result, |caps: &regex::Captures| {
544        let level = &caps[1];
545        let content = &caps[2];
546        // Strip HTML tags and slugify the text content
547        let text_only = HTML_TAG_RE.replace_all(content, "");
548        let id = utils::slugify(&text_only);
549        if id.is_empty() {
550          // If slugify produces empty string, keep header without id
551          format!("<h{level}>{content}</h{level}>")
552        } else {
553          format!("<h{level} id=\"{id}\">{content}</h{level}>")
554        }
555      })
556      .to_string()
557  }
558
559  /// Build comrak options from `MarkdownOptions` and feature flags.
560  fn comrak_options(&self) -> Options<'_> {
561    let mut options = Options::default();
562    // Markdown features present in GFM.
563    if self.options.gfm {
564      options.extension.table = true;
565      options.extension.footnotes = true;
566      options.extension.strikethrough = true;
567      options.extension.tasklist = true;
568      options.extension.superscript = true;
569      options.extension.autolink = true;
570    }
571
572    // Enable unsafe HTML references. This is not a security concern
573    // as all input is assumed to be trusted.
574    options.render.r#unsafe = true;
575
576    // Enable description lists but keep custom header processing
577    options.extension.header_id_prefix = None;
578    options.extension.description_lists = true;
579    options
580  }
581
582  /// Post-process HTML to enhance manpage references with URL links.
583  #[cfg(feature = "nixpkgs")]
584  fn process_manpage_references_html(&self, html: &str) -> String {
585    super::extensions::process_manpage_references(
586      html,
587      self.manpage_urls.as_ref(),
588    )
589  }
590
591  /// Post-process HTML to enhance manpage references (no-op when feature
592  /// disabled).
593  #[cfg(not(feature = "nixpkgs"))]
594  fn process_manpage_references_html(&self, html: &str) -> String {
595    html.to_string()
596  }
597
598  /// HTML post-processing using kuchiki DOM manipulation.
599  #[allow(
600    clippy::unused_self,
601    reason = "Method signature matches processor pattern"
602  )]
603  fn kuchiki_postprocess(&self, html: &str) -> String {
604    // Use a standalone function to avoid borrowing issues
605    kuchiki_postprocess_html(html, |document| {
606      Self::apply_dom_transformations(document);
607    })
608  }
609
610  /// Apply all DOM transformations to the parsed HTML document.
611  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
612    Self::process_list_item_id_markers(document);
613    Self::process_header_anchor_comments(document);
614    Self::process_list_item_inline_anchors(document);
615    Self::process_paragraph_inline_anchors(document);
616    Self::process_remaining_inline_anchors(document);
617    Self::process_markdown_links(document);
618    Self::process_option_anchor_links(document);
619    Self::process_empty_auto_links(document);
620    Self::process_empty_html_links(document);
621  }
622
623  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
624  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
625    let mut to_modify = Vec::new();
626
627    for comment in document.inclusive_descendants() {
628      if let Some(comment_node) = comment.as_comment() {
629        let comment_text = comment_node.borrow();
630        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
631          let id = comment_text[id_start + 16..].trim();
632          if !id.is_empty()
633            && id
634              .chars()
635              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
636          {
637            // Check if this comment is inside an <li> element
638            if let Some(parent) = comment.parent()
639              && let Some(element) = parent.as_element()
640              && element.name.local.as_ref() == "li"
641            {
642              to_modify.push((comment.clone(), id.to_string()));
643            }
644          }
645        }
646      }
647    }
648
649    for (comment_node, id) in to_modify {
650      let span = kuchikikiki::NodeRef::new_element(
651        markup5ever::QualName::new(
652          None,
653          markup5ever::ns!(html),
654          local_name!("span"),
655        ),
656        vec![
657          (
658            kuchikikiki::ExpandedName::new("", "id"),
659            kuchikikiki::Attribute {
660              prefix: None,
661              value:  id,
662            },
663          ),
664          (
665            kuchikikiki::ExpandedName::new("", "class"),
666            kuchikikiki::Attribute {
667              prefix: None,
668              value:  "nixos-anchor".into(),
669            },
670          ),
671        ],
672      );
673      comment_node.insert_after(span);
674      comment_node.detach();
675    }
676  }
677
678  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
679  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
680    let mut to_modify = Vec::new();
681
682    for comment in document.inclusive_descendants() {
683      if let Some(comment_node) = comment.as_comment() {
684        let comment_text = comment_node.borrow();
685        if let Some(anchor_start) = comment_text.find("anchor:") {
686          let id = comment_text[anchor_start + 7..].trim();
687          if !id.is_empty()
688            && id
689              .chars()
690              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
691          {
692            // Check if this comment is inside a header element
693            if let Some(parent) = comment.parent()
694              && let Some(element) = parent.as_element()
695            {
696              let tag_name = element.name.local.as_ref();
697              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
698                to_modify.push((
699                  parent.clone(),
700                  comment.clone(),
701                  id.to_string(),
702                ));
703              }
704            }
705          }
706        }
707      }
708    }
709
710    for (header_element, comment_node, id) in to_modify {
711      if let Some(element) = header_element.as_element() {
712        element
713          .attributes
714          .borrow_mut()
715          .insert(local_name!("id"), id);
716        comment_node.detach();
717      }
718    }
719  }
720
721  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
722  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
723    for li_node in safe_select(document, "li") {
724      let li_element = li_node;
725
726      // Check if this list item contains code elements
727      let has_code = !safe_select(&li_element, "code, pre").is_empty();
728      if has_code {
729        continue; // Skip list items with code blocks
730      }
731
732      let text_content = li_element.text_contents();
733
734      if let Some(anchor_start) = text_content.find("[]{#")
735        && let Some(anchor_end) = text_content[anchor_start..].find('}')
736      {
737        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
738        if !id.is_empty()
739          && id
740            .chars()
741            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
742        {
743          let remaining_content =
744            &text_content[anchor_start + anchor_end + 1..];
745
746          // Clear current content and rebuild
747          for child in li_element.children() {
748            child.detach();
749          }
750
751          let span = kuchikikiki::NodeRef::new_element(
752            markup5ever::QualName::new(
753              None,
754              markup5ever::ns!(html),
755              local_name!("span"),
756            ),
757            vec![
758              (
759                kuchikikiki::ExpandedName::new("", "id"),
760                kuchikikiki::Attribute {
761                  prefix: None,
762                  value:  id.into(),
763                },
764              ),
765              (
766                kuchikikiki::ExpandedName::new("", "class"),
767                kuchikikiki::Attribute {
768                  prefix: None,
769                  value:  "nixos-anchor".into(),
770                },
771              ),
772            ],
773          );
774          li_element.append(span);
775          if !remaining_content.is_empty() {
776            li_element
777              .append(kuchikikiki::NodeRef::new_text(remaining_content));
778          }
779        }
780      }
781    }
782  }
783
784  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
785  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
786    for p_node in safe_select(document, "p") {
787      let p_element = p_node;
788
789      // Check if this paragraph contains code elements
790      let has_code = !safe_select(&p_element, "code, pre").is_empty();
791      if has_code {
792        continue; // Skip paragraphs with code blocks
793      }
794
795      let text_content = p_element.text_contents();
796
797      if let Some(anchor_start) = text_content.find("[]{#")
798        && let Some(anchor_end) = text_content[anchor_start..].find('}')
799      {
800        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
801        if !id.is_empty()
802          && id
803            .chars()
804            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
805        {
806          let remaining_content =
807            &text_content[anchor_start + anchor_end + 1..];
808
809          // Clear current content and rebuild
810          for child in p_element.children() {
811            child.detach();
812          }
813
814          let span = kuchikikiki::NodeRef::new_element(
815            markup5ever::QualName::new(
816              None,
817              markup5ever::ns!(html),
818              local_name!("span"),
819            ),
820            vec![
821              (
822                kuchikikiki::ExpandedName::new("", "id"),
823                kuchikikiki::Attribute {
824                  prefix: None,
825                  value:  id.into(),
826                },
827              ),
828              (
829                kuchikikiki::ExpandedName::new("", "class"),
830                kuchikikiki::Attribute {
831                  prefix: None,
832                  value:  "nixos-anchor".into(),
833                },
834              ),
835            ],
836          );
837          p_element.append(span);
838          if !remaining_content.is_empty() {
839            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
840          }
841        }
842      }
843    }
844  }
845
846  /// Process remaining standalone inline anchors throughout the document
847  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
848    let mut text_nodes_to_process = Vec::new();
849
850    for node in document.inclusive_descendants() {
851      if let Some(text_node) = node.as_text() {
852        // Check if this text node is inside a code block
853        let mut parent = node.parent();
854        let mut in_code = false;
855        while let Some(p) = parent {
856          if let Some(element) = p.as_element()
857            && (element.name.local == local_name!("code")
858              || element.name.local == local_name!("pre"))
859          {
860            in_code = true;
861            break;
862          }
863          parent = p.parent();
864        }
865
866        // Only process if not in code
867        if !in_code {
868          let text_content = text_node.borrow().clone();
869          if text_content.contains("[]{#") {
870            text_nodes_to_process.push((node.clone(), text_content));
871          }
872        }
873      }
874    }
875
876    for (text_node, text_content) in text_nodes_to_process {
877      let mut last_end = 0;
878      let mut new_children = Vec::new();
879
880      // Simple pattern matching for []{#id}
881      let chars = text_content.chars().collect::<Vec<_>>();
882      let mut i = 0;
883      while i < chars.len() {
884        if i + 4 < chars.len()
885          && chars[i] == '['
886          && chars[i + 1] == ']'
887          && chars[i + 2] == '{'
888          && chars[i + 3] == '#'
889        {
890          // Found start of anchor pattern
891          let anchor_start = i;
892          i += 4; // skip "[]{#"
893
894          let mut id = String::new();
895          while i < chars.len() && chars[i] != '}' {
896            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
897            {
898              id.push(chars[i]);
899              i += 1;
900            } else {
901              break;
902            }
903          }
904
905          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
906            // Valid anchor found
907            let anchor_end = i + 1;
908
909            // Add text before anchor
910            if anchor_start > last_end {
911              let before_text: String =
912                chars[last_end..anchor_start].iter().collect();
913              if !before_text.is_empty() {
914                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
915              }
916            }
917
918            // Add span element
919            let span = kuchikikiki::NodeRef::new_element(
920              markup5ever::QualName::new(
921                None,
922                markup5ever::ns!(html),
923                local_name!("span"),
924              ),
925              vec![
926                (
927                  kuchikikiki::ExpandedName::new("", "id"),
928                  kuchikikiki::Attribute {
929                    prefix: None,
930                    value:  id,
931                  },
932                ),
933                (
934                  kuchikikiki::ExpandedName::new("", "class"),
935                  kuchikikiki::Attribute {
936                    prefix: None,
937                    value:  "nixos-anchor".into(),
938                  },
939                ),
940              ],
941            );
942            new_children.push(span);
943
944            last_end = anchor_end;
945            i = anchor_end;
946          } else {
947            i += 1;
948          }
949        } else {
950          i += 1;
951        }
952      }
953
954      // Add remaining text
955      if last_end < chars.len() {
956        let after_text: String = chars[last_end..].iter().collect();
957        if !after_text.is_empty() {
958          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
959        }
960      }
961
962      // Replace text node if we found anchors
963      if !new_children.is_empty() {
964        for child in new_children {
965          text_node.insert_before(child);
966        }
967        text_node.detach();
968      }
969    }
970  }
971
972  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
973  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
974    for link_node in safe_select(document, "a") {
975      let link_element = link_node;
976      if let Some(element) = link_element.as_element() {
977        let href = element
978          .attributes
979          .borrow()
980          .get(local_name!("href"))
981          .map(std::string::ToString::to_string);
982        let text_content = link_element.text_contents();
983
984        if let Some(href_value) = href
985          && href_value.starts_with('#')
986          && (text_content.trim().is_empty()
987            || text_content.trim() == "{{ANCHOR}}")
988        {
989          // Clear placeholder text if present
990          if text_content.trim() == "{{ANCHOR}}" {
991            for child in link_element.children() {
992              child.detach();
993            }
994          }
995          // Empty link with anchor - add humanized text
996          let display_text = Self::humanize_anchor_id(&href_value);
997          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
998        }
999      }
1000    }
1001  }
1002
1003  /// Process empty HTML links that have no content
1004  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1005    for link_node in safe_select(document, "a[href^='#']") {
1006      let link_element = link_node;
1007      let text_content = link_element.text_contents();
1008
1009      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1010        // Clear placeholder text if present
1011        if text_content.trim() == "{{ANCHOR}}" {
1012          for child in link_element.children() {
1013            child.detach();
1014          }
1015        }
1016        if let Some(element) = link_element.as_element()
1017          && let Some(href) =
1018            element.attributes.borrow().get(local_name!("href"))
1019        {
1020          let display_text = Self::humanize_anchor_id(href);
1021          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1022        }
1023      }
1024    }
1025  }
1026
1027  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1028  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1029    let mut to_modify = Vec::new();
1030
1031    // Collect all option anchor links first
1032    for link_node in safe_select(document, "a[href^='#opt-']") {
1033      let link_element = link_node;
1034      if let Some(element) = link_element.as_element() {
1035        let href = element
1036          .attributes
1037          .borrow()
1038          .get(local_name!("href"))
1039          .map(std::string::ToString::to_string);
1040        let text_content = link_element.text_contents();
1041
1042        if let Some(href_value) = href
1043          && href_value.starts_with("#opt-")
1044        {
1045          let option_anchor = href_value[1..].to_string(); // remove the leading #
1046          let needs_text_replacement = text_content.trim().is_empty()
1047            || text_content.trim() == "{{ANCHOR}}";
1048          to_modify.push((
1049            link_element.clone(),
1050            option_anchor,
1051            needs_text_replacement,
1052          ));
1053        }
1054      }
1055    }
1056
1057    // Apply modifications
1058    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1059      if let Some(element) = link_element.as_element() {
1060        let new_href = format!("options.html#{option_anchor}");
1061        element
1062          .attributes
1063          .borrow_mut()
1064          .insert(local_name!("href"), new_href);
1065
1066        if needs_text_replacement {
1067          // Clear existing content
1068          for child in link_element.children() {
1069            child.detach();
1070          }
1071
1072          // Extract option name from anchor
1073          // opt-services-nginx-enable -> services.nginx.enable
1074          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1075            let option_name = option_path.replace('-', ".");
1076            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1077          }
1078        }
1079      }
1080    }
1081  }
1082
1083  /// Process markdown file links: convert .md hrefs to .html
1084  fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1085    for link_node in safe_select(document, "a") {
1086      let link_element = link_node;
1087      if let Some(element) = link_element.as_element() {
1088        let href = element
1089          .attributes
1090          .borrow()
1091          .get(local_name!("href"))
1092          .map(std::string::ToString::to_string);
1093
1094        if let Some(href_value) = href {
1095          // Only process relative links ending in .md (not absolute URLs, not
1096          // anchors)
1097          if !href_value.starts_with("http://")
1098            && !href_value.starts_with("https://")
1099            && !href_value.starts_with('#')
1100            && !href_value.starts_with("mailto:")
1101          {
1102            // Split off fragment (#) and query (?) to check the path extension
1103            let (path_part, suffix) = href_value
1104              .find(['#', '?'])
1105              .map_or((href_value.as_str(), ""), |idx| {
1106                href_value.split_at(idx)
1107              });
1108
1109            if std::path::Path::new(path_part)
1110              .extension()
1111              .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1112            {
1113              let new_href =
1114                format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1115              element
1116                .attributes
1117                .borrow_mut()
1118                .insert(local_name!("href"), new_href);
1119            }
1120          }
1121        }
1122      }
1123    }
1124  }
1125
1126  /// Convert an anchor ID to human-readable text
1127  fn humanize_anchor_id(anchor: &str) -> String {
1128    // Strip the leading #
1129    let cleaned = anchor.trim_start_matches('#');
1130
1131    // Remove common prefixes
1132    let without_prefix = cleaned
1133      .trim_start_matches("sec-")
1134      .trim_start_matches("ssec-")
1135      .trim_start_matches("opt-");
1136
1137    // Replace separators with spaces
1138    let spaced = without_prefix.replace(['-', '_'], " ");
1139
1140    // Capitalize each word
1141    spaced
1142      .split_whitespace()
1143      .map(|word| {
1144        let mut chars = word.chars();
1145        chars.next().map_or_else(String::new, |c| {
1146          c.to_uppercase().collect::<String>() + chars.as_str()
1147        })
1148      })
1149      .collect::<Vec<String>>()
1150      .join(" ")
1151  }
1152}
1153
1154/// Extract all inline text from a heading node.
1155pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1156  fn inner<'a>(node: &'a AstNode<'a>) -> String {
1157    let mut text = String::new();
1158    for child in node.children() {
1159      match &child.data.borrow().value {
1160        NodeValue::Text(t) => text.push_str(t),
1161        NodeValue::Code(t) => text.push_str(&t.literal),
1162        NodeValue::Link(..)
1163        | NodeValue::Emph
1164        | NodeValue::Strong
1165        | NodeValue::Strikethrough
1166        | NodeValue::Superscript
1167        | NodeValue::Subscript
1168        | NodeValue::FootnoteReference(..) => {
1169          text.push_str(&inner(child));
1170        },
1171        #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1172        NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1173        _ => {},
1174      }
1175    }
1176    text
1177  }
1178  inner(node)
1179}
1180
1181/// Slugify heading text for use as a table-of-contents anchor.
1182///
1183/// The auto-generated heading `id` is produced by slugifying the *rendered*
1184/// HTML (see `process_header_anchors_html`), in which comrak has escaped any
1185/// markup characters: an option heading like `environments.<name>.deployment`
1186/// becomes `environments.&lt;name&gt;.deployment` before slugifying. The
1187/// table-of-contents, by contrast, slugifies the raw inline text (`<name>`),
1188/// which would yield a different slug and break "jump to header" for every
1189/// heading containing such characters.
1190///
1191/// To keep both in sync, escape the text the same way comrak does before
1192/// slugifying, so the TOC href matches the on-page heading `id`. The heading
1193/// `id` itself is intentionally left unchanged to preserve existing deep links.
1194#[must_use]
1195pub(crate) fn slugify_heading(text: &str) -> String {
1196  utils::slugify(&html_escape::encode_text(text))
1197}
1198
1199/// Collect all markdown files from the input directory
1200pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1201  let mut files = Vec::with_capacity(100);
1202
1203  for entry in WalkDir::new(input_dir)
1204    .follow_links(true)
1205    .into_iter()
1206    .filter_map(Result::ok)
1207  {
1208    let path = entry.path();
1209    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1210      files.push(path.to_owned());
1211    }
1212  }
1213
1214  trace!("Found {} markdown files to process", files.len());
1215  files
1216}
1217
/// Features that can be queried on a processor instance.
///
/// Each variant names one optional capability. The enum is `Copy` and
/// supports `==`, so values can be passed by value and compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown support
  Gfm,
  /// Nixpkgs documentation extensions
  Nixpkgs,
  /// Syntax highlighting for code blocks
  SyntaxHighlighting,
  /// Manpage URL mapping support
  ManpageUrls,
}
1230
/// Blank out admonition `<div>` blocks while preserving the line count.
///
/// A line whose left-trimmed text starts with `<div class="admonition `
/// opens a block; blocks may nest. While at least one block is open, a line
/// that is exactly `</div>` after left-trimming closes one level. Opening
/// lines, closing lines and everything between them are emitted as empty
/// lines; all other lines are copied unchanged. Every output line ends with
/// `\n`, including the last one.
///
/// NOTE(review): judging by the name, this runs before header extraction so
/// that headings inside admonitions are ignored; confirm against the caller.
fn remove_admonition_blocks_for_headers(content: &str) -> String {
  let mut open_blocks = 0usize;
  let mut kept = String::with_capacity(content.len());

  for raw_line in content.lines() {
    let body = raw_line.trim_start();

    if body.starts_with("<div class=\"admonition ") {
      open_blocks += 1;
    } else if open_blocks > 0 {
      if body == "</div>" {
        open_blocks -= 1;
      }
    } else {
      kept.push_str(raw_line);
    }

    // Suppressed lines still contribute their newline.
    kept.push('\n');
  }

  kept
}
1257
/// Report whether `line` is a setext heading underline.
///
/// After trimming surrounding whitespace the line must be a non-empty run
/// made up solely of `=` or solely of `-`. Blank or whitespace-only lines,
/// mixed runs such as `=-=`, and runs with interior spaces such as `- - -`
/// (a thematic break in CommonMark) are rejected.
fn is_setext_heading_underline(line: &str) -> bool {
  let mut run = line.trim().chars();
  match run.next() {
    // The first character picks the marker; the rest must repeat it.
    Some(marker @ ('=' | '-')) => run.all(|ch| ch == marker),
    _ => false,
  }
}
1263
1264/// Standalone HTML post-processing function to avoid borrowing issues.
1265fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1266where
1267  F: FnOnce(&kuchikikiki::NodeRef),
1268{
1269  process_safe(
1270    html,
1271    |html| {
1272      use tendril::TendrilSink;
1273
1274      let document = kuchikikiki::parse_html().one(html);
1275      transform_fn(&document);
1276
1277      let mut out = Vec::new();
1278      let _ = document.serialize(&mut out);
1279      String::from_utf8_lossy(&out).into_owned()
1280    },
1281    html,
1282  )
1283}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs