ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5#[expect(
6  clippy::disallowed_types,
7  reason = "Required for generic hasher abstraction"
8)]
9use std::collections::HashMap;
10use std::{
11  path::{Component, Path, PathBuf},
12  sync::LazyLock,
13};
14
15use comrak::{
16  Arena,
17  nodes::{AstNode, NodeHeading, NodeValue},
18  options::Options,
19  parse_document,
20};
21use log::trace;
22use markup5ever::local_name;
23use regex::Regex;
24use rustc_hash::FxHashMap;
25use walkdir::WalkDir;
26
27use super::{
28  dom::safe_select,
29  process::process_safe,
30  types::{
31    AstTransformer,
32    MarkdownOptions,
33    MarkdownProcessor,
34    PromptTransformer,
35  },
36};
37use crate::{
38  syntax::create_default_manager,
39  types::{Header, MarkdownResult},
40  utils,
41};
42
43static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
44  Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_.-]+)\}(.*?)</h[1-6]>")
45    .unwrap_or_else(|e| {
46      log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
47      utils::never_matching_regex().unwrap_or_else(|_| {
48        #[expect(
49          clippy::expect_used,
50          reason = "This pattern is guaranteed to be valid"
51        )]
52        Regex::new(r"[^\s\S]")
53          .expect("regex pattern [^\\s\\S] should always compile")
54      })
55    })
56});
57
58static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
59  Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
60    log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
61    utils::never_matching_regex().unwrap_or_else(|_| {
62      #[expect(
63        clippy::expect_used,
64        reason = "This pattern is guaranteed to be valid"
65      )]
66      Regex::new(r"[^\s\S]")
67        .expect("regex pattern [^\\s\\S] should always compile")
68    })
69  })
70});
71
72static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
73  Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
74    log::error!("Failed to compile HTML_TAG_RE regex: {e}");
75    utils::never_matching_regex().unwrap_or_else(|_| {
76      #[expect(
77        clippy::expect_used,
78        reason = "This pattern is guaranteed to be valid"
79      )]
80      Regex::new(r"[^\s\S]")
81        .expect("regex pattern [^\\s\\S] should always compile")
82    })
83  })
84});
85
86impl MarkdownProcessor {
87  /// Create a new `MarkdownProcessor` with the given options.
88  #[must_use]
89  pub fn new(options: MarkdownOptions) -> Self {
90    let manpage_urls = options
91      .manpage_urls_path
92      .as_ref()
93      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
94
95    let syntax_manager = if options.highlight_code {
96      match create_default_manager(
97        options
98          .syntax_queries_path
99          .as_deref()
100          .map(std::path::Path::new),
101      ) {
102        Ok(manager) => {
103          log::info!("Syntax highlighting initialized successfully");
104          Some(manager)
105        },
106        Err(e) => {
107          log::error!("Failed to initialize syntax highlighting: {e}");
108          log::warn!(
109            "Continuing without syntax highlighting - code blocks will not be \
110             highlighted"
111          );
112          None
113        },
114      }
115    } else {
116      None
117    };
118
119    Self {
120      options,
121      manpage_urls,
122      syntax_manager,
123      base_dir: std::path::PathBuf::from("."),
124    }
125  }
126
127  /// Access processor options.
128  #[must_use]
129  pub const fn options(&self) -> &MarkdownOptions {
130    &self.options
131  }
132
133  /// Set the base directory for resolving relative file paths.
134  #[must_use]
135  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
136    self.base_dir = base_dir.to_path_buf();
137    self
138  }
139
140  /// Check if a specific feature is enabled.
141  #[must_use]
142  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
143    match feature {
144      ProcessorFeature::Gfm => self.options.gfm,
145      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
146      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
147      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
148    }
149  }
150
151  /// Get the manpage URLs mapping for use with standalone functions.
152  #[must_use]
153  pub const fn manpage_urls(&self) -> Option<&FxHashMap<String, String>> {
154    self.manpage_urls.as_ref()
155  }
156
157  /// Highlight all code blocks in HTML using the configured syntax highlighter
158  #[must_use]
159  pub fn highlight_codeblocks(&self, html: &str) -> String {
160    use kuchikikiki::parse_html;
161    use tendril::TendrilSink;
162
163    if !self.options.highlight_code || self.syntax_manager.is_none() {
164      return html.to_string();
165    }
166
167    let document = parse_html().one(html);
168
169    // Collect all code blocks first to avoid DOM modification during iteration
170    let mut code_blocks = Vec::new();
171    for pre_node in safe_select(&document, "pre > code") {
172      let code_node = pre_node;
173      if let Some(element) = code_node.as_element() {
174        let language = element
175          .attributes
176          .borrow()
177          .get("class")
178          .and_then(|class| class.strip_prefix("language-"))
179          .unwrap_or("text")
180          .to_string();
181        let code_text = code_node.text_contents();
182
183        if let Some(pre_parent) = code_node.parent() {
184          code_blocks.push((
185            pre_parent.clone(),
186            code_node.clone(),
187            code_text,
188            language,
189          ));
190        }
191      }
192    }
193
194    // Process each code block
195    for (pre_element, _code_node, code_text, language) in code_blocks {
196      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
197      {
198        // Wrap highlighted HTML in <pre><code> with appropriate classes
199        let wrapped_html = format!(
200          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
201        );
202        let fragment = parse_html().one(wrapped_html.as_str());
203        pre_element.insert_after(fragment);
204        pre_element.detach();
205      }
206      // Do not add highlight/language-* classes if not highlighted
207    }
208
209    let mut buf = Vec::new();
210    if let Err(e) = document.serialize(&mut buf) {
211      log::warn!("DOM serialization failed: {e:?}");
212      return html.to_string(); // Return original HTML if serialization fails
213    }
214    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
215  }
216
217  /// Handle hard tabs in code blocks according to configuration
218  fn handle_hardtabs(&self, code: &str) -> String {
219    use super::types::TabStyle;
220
221    // Check if there are any hard tabs
222    if !code.contains('\t') {
223      return code.to_string();
224    }
225
226    match self.options.tab_style {
227      // Do nothing
228      TabStyle::None => code.to_string(),
229
230      // Warn, but do nothing.
231      TabStyle::Warn => {
232        log::warn!(
233          "Hard tabs detected in code block. Consider using spaces for \
234           consistency. Tools like editorconfig may help you normalize spaces \
235           in your documents."
236        );
237        code.to_string()
238      },
239
240      // Do not warn, only inform in debug mode. Then return
241      // the updated code.
242      TabStyle::Normalize => {
243        log::debug!("Replacing hard tabs with spaces");
244        code.replace('\t', "  ")
245      },
246    }
247  }
248
249  /// Process hard tabs in code blocks within markdown content
250  fn process_hardtabs(&self, markdown: &str) -> String {
251    use super::types::TabStyle;
252    use crate::utils::codeblock::FenceTracker;
253
254    // If no tab handling is needed, return as-is
255    if self.options.tab_style == TabStyle::None {
256      return markdown.to_string();
257    }
258
259    let mut result = String::with_capacity(markdown.len());
260    let mut lines = markdown.lines().peekable();
261    let mut tracker = FenceTracker::new();
262
263    while let Some(line) = lines.next() {
264      tracker = tracker.process_line(line);
265
266      // Only replace tabs inside fenced code blocks
267      let processed_line = if tracker.in_code_block() && line.contains('\t') {
268        self.handle_hardtabs(line)
269      } else {
270        line.to_string()
271      };
272
273      result.push_str(&processed_line);
274
275      // Add newline unless this is the last line
276      if lines.peek().is_some() {
277        result.push('\n');
278      }
279    }
280
281    result
282  }
283
284  /// Highlight code using the configured syntax highlighter, returns HTML
285  /// string
286  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
287    if !self.options.highlight_code {
288      return None;
289    }
290
291    let syntax_manager = self.syntax_manager.as_ref()?;
292
293    syntax_manager
294      .highlight_code(code, language, self.options.highlight_theme.as_deref())
295      .ok()
296  }
297
298  /// Render Markdown to HTML, extracting headers and title.
299  #[must_use]
300  pub fn render(&self, markdown: &str) -> MarkdownResult {
301    let (preprocessed, included_files) = self.preprocess(markdown);
302    let (headers, title) = self.extract_headers(&preprocessed);
303    let html = self.process_html_pipeline(&preprocessed);
304
305    MarkdownResult {
306      html,
307      headers,
308      title,
309      included_files,
310    }
311  }
312
313  /// Process the HTML generation and post-processing pipeline.
314  fn process_html_pipeline(&self, content: &str) -> String {
315    let mut html = self.convert_to_html(content);
316
317    // Apply feature-specific post-processing
318    if cfg!(feature = "ndg-flavored") {
319      #[cfg(feature = "ndg-flavored")]
320      {
321        html = super::extensions::process_option_references(
322          &html,
323          self.options.valid_options.as_ref(),
324        );
325      }
326    }
327
328    if self.options.nixpkgs {
329      html = self.process_manpage_references_html(&html);
330    }
331
332    if self.options.highlight_code {
333      html = self.highlight_codeblocks(&html);
334    }
335
336    self.kuchiki_postprocess(&html)
337  }
338
339  /// Preprocess the markdown content with all enabled transformations.
340  fn preprocess(
341    &self,
342    content: &str,
343  ) -> (String, Vec<crate::types::IncludedFile>) {
344    let mut processed = content.to_string();
345    let mut included_files = Vec::new();
346
347    // Process MyST-style autolinks first
348    processed = super::extensions::process_myst_autolinks(&processed);
349
350    // Handle hard tabs in code blocks
351    processed = self.process_hardtabs(&processed);
352
353    if self.options.nixpkgs {
354      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
355      processed = content;
356      included_files = files;
357    }
358
359    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
360      processed = super::extensions::process_role_markup(
361        &processed,
362        self.manpage_urls.as_ref(),
363        self.options.auto_link_options,
364        self.options.valid_options.as_ref(),
365      );
366    }
367
368    #[cfg(feature = "wiki")]
369    {
370      processed = super::extensions::process_wikilinks(&processed);
371    }
372
373    (processed, included_files)
374  }
375
376  /// Apply Nixpkgs-specific preprocessing steps.
377  #[cfg(feature = "nixpkgs")]
378  fn apply_nixpkgs_preprocessing(
379    &self,
380    content: &str,
381  ) -> (String, Vec<crate::types::IncludedFile>) {
382    let (with_includes, included_files) =
383      match super::extensions::process_file_includes(content, &self.base_dir, 0)
384      {
385        Ok(result) => result,
386        Err(e) => {
387          log::warn!(
388            "File include processing failed: {e}. Continuing without includes."
389          );
390          (content.to_string(), Vec::new())
391        },
392      };
393    let with_blocks = super::extensions::process_block_elements(&with_includes);
394    let with_spans = super::extensions::process_bracketed_spans(&with_blocks);
395    let processed = super::extensions::process_inline_anchors(&with_spans);
396    (processed, included_files)
397  }
398
399  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
400  #[cfg(not(feature = "nixpkgs"))]
401  fn apply_nixpkgs_preprocessing(
402    &self,
403    content: &str,
404  ) -> (String, Vec<crate::types::IncludedFile>) {
405    (content.to_string(), Vec::new())
406  }
407
408  /// Extract headers and title from the markdown content.
409  #[must_use]
410  pub fn extract_headers(
411    &self,
412    content: &str,
413  ) -> (Vec<Header>, Option<String>) {
414    use std::fmt::Write;
415
416    let arena = Arena::new();
417    let options = self.comrak_options();
418
419    let content = remove_admonition_blocks_for_headers(content);
420
421    // Normalize custom anchors with no heading level to h2
422    let mut normalized = String::with_capacity(content.len());
423    let mut lines = content.lines().peekable();
424    while let Some(line) = lines.next() {
425      let trimmed = line.trim();
426      if !trimmed.starts_with('#')
427        && !lines
428          .peek()
429          .is_some_and(|next| is_setext_heading_underline(next.trim()))
430        && let Some(anchor_start) = trimmed.rfind("{#")
431        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
432      {
433        let text = trimmed[..anchor_start].trim_end();
434        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
435        let _ = writeln!(normalized, "## {text} {{#{id}}}");
436        continue;
437      }
438      normalized.push_str(line);
439      normalized.push('\n');
440    }
441
442    let root = parse_document(&arena, &normalized, &options);
443
444    let mut headers = Vec::new();
445    let mut found_title = None;
446
447    for node in root.descendants() {
448      if let NodeValue::Heading(NodeHeading { level, .. }) =
449        &node.data.borrow().value
450      {
451        let mut text = String::new();
452        let mut explicit_id = None;
453
454        for child in node.children() {
455          match &child.data.borrow().value {
456            NodeValue::Text(t) => text.push_str(t),
457            NodeValue::Code(t) => text.push_str(&t.literal),
458            NodeValue::Link(..)
459            | NodeValue::Emph
460            | NodeValue::Strong
461            | NodeValue::Subscript
462            | NodeValue::Strikethrough
463            | NodeValue::Superscript
464            | NodeValue::FootnoteReference(..) => {
465              text.push_str(&extract_inline_text(child));
466            },
467            NodeValue::HtmlInline(html) => {
468              // Look for explicit anchor in HTML inline node: {#id}
469              let html_str = html.as_str();
470              if let Some(start) = html_str.find("{#")
471                && let Some(end) = html_str[start..].find('}')
472              {
473                let anchor = &html_str[start + 2..start + end];
474                explicit_id = Some(anchor.to_string());
475              }
476            },
477            #[expect(clippy::match_same_arms, reason = "Explicit for clarity")]
478            NodeValue::Image(..) => {},
479            _ => {},
480          }
481        }
482
483        // Check for trailing {#id} in heading text
484        let trimmed = text.trim_end();
485        #[expect(
486          clippy::option_if_let_else,
487          reason = "nested options clearer with if-let"
488        )]
489        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
490          if let Some(end) = trimmed[start..].find('}') {
491            let anchor = &trimmed[start + 2..start + end];
492            (trimmed[..start].trim_end().to_string(), anchor.to_string())
493          } else {
494            (
495              text.clone(),
496              explicit_id.unwrap_or_else(|| slugify_heading(&text)),
497            )
498          }
499        } else {
500          (
501            text.clone(),
502            explicit_id.unwrap_or_else(|| slugify_heading(&text)),
503          )
504        };
505        if *level == 1 && found_title.is_none() {
506          found_title = Some(final_text.clone());
507        }
508        headers.push(Header {
509          text: final_text,
510          level: *level,
511          id,
512        });
513      }
514    }
515
516    (headers, found_title)
517  }
518
519  /// Convert markdown to HTML using comrak and configured options.
520  fn convert_to_html(&self, content: &str) -> String {
521    // Process directly without panic catching for better performance
522    let arena = Arena::new();
523    let options = self.comrak_options();
524    let root = parse_document(&arena, content, &options);
525
526    // Apply AST transformations
527    let prompt_transformer = PromptTransformer;
528    prompt_transformer.transform(root);
529
530    let mut html_output = String::new();
531    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
532      log::error!("Failed to format HTML: {e}");
533    }
534
535    // Post-process HTML to handle header anchors
536    Self::process_header_anchors_html(&html_output)
537  }
538
539  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
540  /// proper id attributes. Also adds auto-generated IDs to headers without
541  /// explicit anchors.
542  fn process_header_anchors_html(html: &str) -> String {
543    // First pass: explicit {#id} syntax
544    let result = HEADER_ANCHOR_RE
545      .replace_all(html, |caps: &regex::Captures| {
546        let level = &caps[1];
547        let prefix = &caps[2];
548        let id = &caps[3];
549        let suffix = &caps[4];
550        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
551      })
552      .to_string();
553
554    // Second pass: add auto-generated IDs to headers without id attribute
555    HEADER_NO_ID_RE
556      .replace_all(&result, |caps: &regex::Captures| {
557        let level = &caps[1];
558        let content = &caps[2];
559        // Strip HTML tags and slugify the text content
560        let text_only = HTML_TAG_RE.replace_all(content, "");
561        let id = utils::slugify(&text_only);
562        if id.is_empty() {
563          // If slugify produces empty string, keep header without id
564          format!("<h{level}>{content}</h{level}>")
565        } else {
566          format!("<h{level} id=\"{id}\">{content}</h{level}>")
567        }
568      })
569      .to_string()
570  }
571
572  /// Build comrak options from `MarkdownOptions` and feature flags.
573  fn comrak_options(&self) -> Options<'_> {
574    let mut options = Options::default();
575    // Markdown features present in GFM.
576    if self.options.gfm {
577      options.extension.table = true;
578      options.extension.footnotes = true;
579      options.extension.strikethrough = true;
580      options.extension.tasklist = true;
581      options.extension.superscript = true;
582      options.extension.autolink = true;
583    }
584
585    // Enable unsafe HTML references. This is not a security concern
586    // as all input is assumed to be trusted.
587    options.render.r#unsafe = true;
588
589    // Enable description lists but keep custom header processing
590    options.extension.header_id_prefix = None;
591    options.extension.description_lists = true;
592    options
593  }
594
595  /// Post-process HTML to enhance manpage references with URL links.
596  #[cfg(feature = "nixpkgs")]
597  fn process_manpage_references_html(&self, html: &str) -> String {
598    super::extensions::process_manpage_references(
599      html,
600      self.manpage_urls.as_ref(),
601    )
602  }
603
604  /// Post-process HTML to enhance manpage references (no-op when feature
605  /// disabled).
606  #[cfg(not(feature = "nixpkgs"))]
607  fn process_manpage_references_html(&self, html: &str) -> String {
608    html.to_string()
609  }
610
611  /// HTML post-processing using kuchiki DOM manipulation.
612  #[expect(
613    clippy::unused_self,
614    reason = "Method signature matches processor pattern"
615  )]
616  fn kuchiki_postprocess(&self, html: &str) -> String {
617    // Use a standalone function to avoid borrowing issues
618    kuchiki_postprocess_html(html, |document| {
619      Self::apply_dom_transformations(document);
620    })
621  }
622
623  /// Apply all DOM transformations to the parsed HTML document.
624  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
625    Self::process_list_item_id_markers(document);
626    Self::process_header_anchor_comments(document);
627    Self::process_list_item_inline_anchors(document);
628    Self::process_paragraph_inline_anchors(document);
629    Self::process_remaining_inline_anchors(document);
630    Self::process_markdown_links(document);
631    Self::process_option_anchor_links(document);
632    Self::process_empty_auto_links(document);
633  }
634
635  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
636  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
637    let mut to_modify = Vec::new();
638
639    for comment in document.inclusive_descendants() {
640      if let Some(comment_node) = comment.as_comment() {
641        let comment_text = comment_node.borrow();
642        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
643          let id = comment_text[id_start + 16..].trim();
644          if !id.is_empty()
645            && id
646              .chars()
647              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
648          {
649            // Check if this comment is inside an <li> element
650            if let Some(parent) = comment.parent()
651              && let Some(element) = parent.as_element()
652              && element.name.local.as_ref() == "li"
653            {
654              to_modify.push((comment.clone(), id.to_string()));
655            }
656          }
657        }
658      }
659    }
660
661    for (comment_node, id) in to_modify {
662      let span = kuchikikiki::NodeRef::new_element(
663        markup5ever::QualName::new(
664          None,
665          markup5ever::ns!(html),
666          local_name!("span"),
667        ),
668        vec![
669          (
670            kuchikikiki::ExpandedName::new("", "id"),
671            kuchikikiki::Attribute {
672              prefix: None,
673              value:  id,
674            },
675          ),
676          (
677            kuchikikiki::ExpandedName::new("", "class"),
678            kuchikikiki::Attribute {
679              prefix: None,
680              value:  "nixos-anchor".into(),
681            },
682          ),
683        ],
684      );
685      comment_node.insert_after(span);
686      comment_node.detach();
687    }
688  }
689
690  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
691  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
692    let mut to_modify = Vec::new();
693
694    for comment in document.inclusive_descendants() {
695      if let Some(comment_node) = comment.as_comment() {
696        let comment_text = comment_node.borrow();
697        if let Some(anchor_start) = comment_text.find("anchor:") {
698          let id = comment_text[anchor_start + 7..].trim();
699          if !id.is_empty()
700            && id
701              .chars()
702              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
703          {
704            // Check if this comment is inside a header element
705            if let Some(parent) = comment.parent()
706              && let Some(element) = parent.as_element()
707              && matches!(
708                element.name.local.as_ref(),
709                "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
710              )
711            {
712              to_modify.push((parent.clone(), comment.clone(), id.to_string()));
713            }
714          }
715        }
716      }
717    }
718
719    for (header_element, comment_node, id) in to_modify {
720      if let Some(element) = header_element.as_element() {
721        element
722          .attributes
723          .borrow_mut()
724          .insert(local_name!("id"), id);
725        comment_node.detach();
726      }
727    }
728  }
729
730  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
731  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
732    for li_node in safe_select(document, "li") {
733      let li_element = li_node;
734
735      // Check if this list item contains code elements
736      let has_code = !safe_select(&li_element, "code, pre").is_empty();
737      if has_code {
738        continue; // Skip list items with code blocks
739      }
740
741      let text_content = li_element.text_contents();
742
743      if let Some(anchor_start) = text_content.find("[]{#")
744        && let Some(anchor_end) = text_content[anchor_start..].find('}')
745      {
746        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
747        if !id.is_empty()
748          && id
749            .chars()
750            .all(|c| c.is_alphanumeric() || c == '-' || c == '_' || c == '.')
751        {
752          let remaining_content =
753            &text_content[anchor_start + anchor_end + 1..];
754
755          // Clear current content and rebuild
756          for child in li_element.children() {
757            child.detach();
758          }
759
760          let span = kuchikikiki::NodeRef::new_element(
761            markup5ever::QualName::new(
762              None,
763              markup5ever::ns!(html),
764              local_name!("span"),
765            ),
766            vec![
767              (
768                kuchikikiki::ExpandedName::new("", "id"),
769                kuchikikiki::Attribute {
770                  prefix: None,
771                  value:  id.into(),
772                },
773              ),
774              (
775                kuchikikiki::ExpandedName::new("", "class"),
776                kuchikikiki::Attribute {
777                  prefix: None,
778                  value:  "nixos-anchor".into(),
779                },
780              ),
781            ],
782          );
783          li_element.append(span);
784          if !remaining_content.is_empty() {
785            li_element
786              .append(kuchikikiki::NodeRef::new_text(remaining_content));
787          }
788        }
789      }
790    }
791  }
792
793  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
794  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
795    for p_node in safe_select(document, "p") {
796      let p_element = p_node;
797
798      // Check if this paragraph contains code elements
799      let has_code = !safe_select(&p_element, "code, pre").is_empty();
800      if has_code {
801        continue; // Skip paragraphs with code blocks
802      }
803
804      let text_content = p_element.text_contents();
805
806      if let Some(anchor_start) = text_content.find("[]{#")
807        && let Some(anchor_end) = text_content[anchor_start..].find('}')
808      {
809        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
810        if !id.is_empty()
811          && id
812            .chars()
813            .all(|c| c.is_alphanumeric() || c == '-' || c == '_' || c == '.')
814        {
815          let remaining_content =
816            &text_content[anchor_start + anchor_end + 1..];
817
818          // Clear current content and rebuild
819          for child in p_element.children() {
820            child.detach();
821          }
822
823          let span = kuchikikiki::NodeRef::new_element(
824            markup5ever::QualName::new(
825              None,
826              markup5ever::ns!(html),
827              local_name!("span"),
828            ),
829            vec![
830              (
831                kuchikikiki::ExpandedName::new("", "id"),
832                kuchikikiki::Attribute {
833                  prefix: None,
834                  value:  id.into(),
835                },
836              ),
837              (
838                kuchikikiki::ExpandedName::new("", "class"),
839                kuchikikiki::Attribute {
840                  prefix: None,
841                  value:  "nixos-anchor".into(),
842                },
843              ),
844            ],
845          );
846          p_element.append(span);
847          if !remaining_content.is_empty() {
848            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
849          }
850        }
851      }
852    }
853  }
854
855  /// Process remaining standalone inline anchors throughout the document
856  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
857    let mut text_nodes_to_process = Vec::new();
858
859    for node in document.inclusive_descendants() {
860      if let Some(text_node) = node.as_text() {
861        // Check if this text node is inside a code block
862        let mut parent = node.parent();
863        let mut in_code = false;
864        while let Some(p) = parent {
865          if let Some(element) = p.as_element()
866            && (element.name.local == local_name!("code")
867              || element.name.local == local_name!("pre"))
868          {
869            in_code = true;
870            break;
871          }
872          parent = p.parent();
873        }
874
875        // Only process if not in code
876        if !in_code {
877          let text_content = text_node.borrow().clone();
878          if text_content.contains("[]{#") {
879            text_nodes_to_process.push((node.clone(), text_content));
880          }
881        }
882      }
883    }
884
885    for (text_node, text_content) in text_nodes_to_process {
886      let mut last_end = 0;
887      let mut new_children = Vec::new();
888
889      // Simple pattern matching for []{#id}
890      let chars = text_content.chars().collect::<Vec<_>>();
891      let mut i = 0;
892      while i < chars.len() {
893        if i + 4 < chars.len()
894          && chars[i] == '['
895          && chars[i + 1] == ']'
896          && chars[i + 2] == '{'
897          && chars[i + 3] == '#'
898        {
899          // Found start of anchor pattern
900          let anchor_start = i;
901          i += 4; // skip "[]{#"
902
903          let mut id = String::new();
904          while i < chars.len() && chars[i] != '}' {
905            if chars[i].is_alphanumeric()
906              || chars[i] == '-'
907              || chars[i] == '_'
908              || chars[i] == '.'
909            {
910              id.push(chars[i]);
911              i += 1;
912            } else {
913              break;
914            }
915          }
916
917          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
918            // Valid anchor found
919            let anchor_end = i + 1;
920
921            // Add text before anchor
922            if anchor_start > last_end {
923              new_children.push(kuchikikiki::NodeRef::new_text(
924                chars[last_end..anchor_start].iter().collect::<String>(),
925              ));
926            }
927
928            // Add span element
929            let span = kuchikikiki::NodeRef::new_element(
930              markup5ever::QualName::new(
931                None,
932                markup5ever::ns!(html),
933                local_name!("span"),
934              ),
935              vec![
936                (
937                  kuchikikiki::ExpandedName::new("", "id"),
938                  kuchikikiki::Attribute {
939                    prefix: None,
940                    value:  id,
941                  },
942                ),
943                (
944                  kuchikikiki::ExpandedName::new("", "class"),
945                  kuchikikiki::Attribute {
946                    prefix: None,
947                    value:  "nixos-anchor".into(),
948                  },
949                ),
950              ],
951            );
952            new_children.push(span);
953
954            last_end = anchor_end;
955            i = anchor_end;
956          } else {
957            i += 1;
958          }
959        } else {
960          i += 1;
961        }
962      }
963
964      // Add remaining text
965      if last_end < chars.len() {
966        let after_text: String = chars[last_end..].iter().collect();
967        if !after_text.is_empty() {
968          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
969        }
970      }
971
972      // Replace text node if we found anchors
973      if !new_children.is_empty() {
974        for child in new_children {
975          text_node.insert_before(child);
976        }
977        text_node.detach();
978      }
979    }
980  }
981
982  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
983  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
984    for link_node in safe_select(document, "a") {
985      let link_element = link_node;
986      if let Some(element) = link_element.as_element() {
987        let href = element
988          .attributes
989          .borrow()
990          .get(local_name!("href"))
991          .map(std::string::ToString::to_string);
992        let text_content = link_element.text_contents();
993
994        if let Some(href_value) = href
995          && href_value.starts_with('#')
996          && (text_content.trim().is_empty()
997            || text_content.trim() == "{{ANCHOR}}")
998        {
999          // Clear placeholder text if present
1000          if text_content.trim() == "{{ANCHOR}}" {
1001            for child in link_element.children() {
1002              child.detach();
1003            }
1004          }
1005          // Empty link with anchor - add humanized text
1006          let display_text = humanize_anchor(&href_value);
1007          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1008        }
1009      }
1010    }
1011  }
1012
1013  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1014  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1015    let mut to_modify = Vec::new();
1016
1017    // Collect all option anchor links first
1018    for link_node in safe_select(document, "a[href^='#opt-']") {
1019      let link_element = link_node;
1020      if let Some(element) = link_element.as_element() {
1021        let href = element
1022          .attributes
1023          .borrow()
1024          .get(local_name!("href"))
1025          .map(std::string::ToString::to_string);
1026        let text_content = link_element.text_contents();
1027
1028        if let Some(href_value) = href
1029          && href_value.starts_with("#opt-")
1030        {
1031          let option_anchor = href_value[1..].to_string(); // remove the leading #
1032          let needs_text_replacement = text_content.trim().is_empty()
1033            || text_content.trim() == "{{ANCHOR}}";
1034          to_modify.push((
1035            link_element.clone(),
1036            option_anchor,
1037            needs_text_replacement,
1038          ));
1039        }
1040      }
1041    }
1042
1043    // Apply modifications
1044    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1045      if let Some(element) = link_element.as_element() {
1046        let new_href = format!("options.html#{option_anchor}");
1047        element
1048          .attributes
1049          .borrow_mut()
1050          .insert(local_name!("href"), new_href);
1051
1052        if needs_text_replacement {
1053          // Clear existing content
1054          for child in link_element.children() {
1055            child.detach();
1056          }
1057
1058          // Extract option name from anchor
1059          // opt-services-nginx-enable -> services.nginx.enable
1060          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1061            let option_name = option_path.replace('-', ".");
1062            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1063          }
1064        }
1065      }
1066    }
1067  }
1068
1069  /// Process markdown file links: convert .md hrefs to .html
1070  fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1071    for link_node in safe_select(document, "a") {
1072      let link_element = link_node;
1073      if let Some(element) = link_element.as_element() {
1074        let href = element
1075          .attributes
1076          .borrow()
1077          .get(local_name!("href"))
1078          .map(std::string::ToString::to_string);
1079
1080        if let Some(href_value) = href {
1081          // Only process relative links ending in .md (not absolute URLs, not
1082          // anchors)
1083          if !href_value.starts_with("http://")
1084            && !href_value.starts_with("https://")
1085            && !href_value.starts_with('#')
1086            && !href_value.starts_with("mailto:")
1087          {
1088            // Split off fragment (#) and query (?) to check the path extension
1089            let (path_part, suffix) = href_value
1090              .find(['#', '?'])
1091              .map_or((href_value.as_str(), ""), |idx| {
1092                href_value.split_at(idx)
1093              });
1094
1095            if std::path::Path::new(path_part)
1096              .extension()
1097              .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1098            {
1099              let new_href =
1100                format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1101              element
1102                .attributes
1103                .borrow_mut()
1104                .insert(local_name!("href"), new_href);
1105            }
1106          }
1107        }
1108      }
1109    }
1110  }
1111}
1112
/// Convert an anchor ID to human-readable text.
///
/// Leading `#` characters are dropped, then at most one of the well-known
/// prefixes `sec-`, `ssec-` or `opt-` is removed. The remainder is split on
/// `-`, `_` and whitespace, and each word gets its first character
/// upper-cased: `#sec-getting-started` -> `Getting Started`.
///
/// Only a single leading prefix is stripped, so an anchor such as
/// `#sec-opt-out` yields `Opt Out` rather than losing the word `opt`.
fn humanize_anchor(anchor: &str) -> String {
  const KNOWN_PREFIXES: [&str; 3] = ["sec-", "ssec-", "opt-"];

  let cleaned = anchor.trim_start_matches('#');
  // `strip_prefix` removes one occurrence; `trim_start_matches` would keep
  // stripping repeated prefixes and eat real words of the title.
  let without_prefix = KNOWN_PREFIXES
    .iter()
    .find_map(|prefix| cleaned.strip_prefix(*prefix))
    .unwrap_or(cleaned);

  let spaced = without_prefix.replace(['-', '_'], " ");
  spaced
    .split_whitespace()
    .map(|word| {
      let mut chars = word.chars();
      chars.next().map_or_else(String::new, |c| {
        c.to_uppercase().collect::<String>() + chars.as_str()
      })
    })
    .collect::<Vec<String>>()
    .join(" ")
}
1132
/// Compute the relative URL from one output page to another.
///
/// E.g. `relative_page_path("guide/intro.html", "install.html")` →
/// `"../install.html"`.
///
/// The directory of `from_page` and the full `to_page` are compared
/// component by component (ignoring `.`); one `..` is emitted for every
/// directory of `from_page` that is not shared, followed by the unshared
/// remainder of `to_page`. The result always uses `/` as separator so it is
/// usable as a URL on every platform. If the computed path is empty,
/// `to_page` is returned unchanged.
fn relative_page_path(from_page: &str, to_page: &str) -> String {
  let from_dir = Path::new(from_page)
    .parent()
    .unwrap_or_else(|| Path::new(""));
  let to_path = Path::new(to_page);

  let from_parts: Vec<_> = from_dir
    .components()
    .filter(|c| !matches!(c, Component::CurDir))
    .collect();
  let to_parts: Vec<_> = to_path
    .components()
    .filter(|c| !matches!(c, Component::CurDir))
    .collect();

  // Length of the shared leading directory prefix
  let common = from_parts
    .iter()
    .zip(to_parts.iter())
    .take_while(|(a, b)| a == b)
    .count();

  let ups = from_parts.len() - common;
  let remainder = &to_parts[common..];

  let mut result = PathBuf::new();
  for _ in 0..ups {
    result.push("..");
  }
  for part in remainder {
    result.push(part);
  }

  // `PathBuf` joins with the platform separator, which is `\` on Windows;
  // hrefs must use `/`. On other platforms `\` is an ordinary file-name
  // character and is left alone.
  let joined = result.to_string_lossy();
  let s = if cfg!(windows) {
    joined.replace('\\', "/")
  } else {
    joined.into_owned()
  };

  if s.is_empty() { to_page.to_string() } else { s }
}
1172
1173/// Rewrite cross-page anchor links in rendered HTML.
1174///
1175/// For every `<a href="#anchor">` element:
1176/// - If `anchor` is found in `registry` and its owning page differs from
1177///   `current_page`, rewrites the `href` to the relative path to that page plus
1178///   the fragment (`other.html#anchor`).
1179/// - If the link text is empty, the `{{ANCHOR}}` placeholder, or the
1180///   auto-generated humanized slug, replaces it with the registry title.
1181/// - Same-page anchors and anchors absent from the registry are left as-is.
1182///
1183/// The `registry` maps anchor ID → `(owning_output_page, heading_title)`.
1184#[must_use]
1185#[expect(
1186  clippy::disallowed_types,
1187  reason = "Uses generic HashMap for hasher flexibility"
1188)]
1189pub fn rewrite_cross_page_anchor_links<S: std::hash::BuildHasher>(
1190  html: &str,
1191  current_page: &str,
1192  registry: &HashMap<String, (String, String), S>,
1193) -> String {
1194  if registry.is_empty() {
1195    return html.to_string();
1196  }
1197
1198  kuchiki_postprocess_html(html, |document| {
1199    // Collect modifications first to avoid borrow conflicts.
1200    let mut modifications: Vec<(kuchikikiki::NodeRef, String, Option<String>)> =
1201      Vec::new();
1202
1203    for link_node in safe_select(document, "a[href^='#']") {
1204      let Some(element) = link_node.as_element() else {
1205        continue;
1206      };
1207
1208      let href = element
1209        .attributes
1210        .borrow()
1211        .get(local_name!("href"))
1212        .map(std::string::ToString::to_string);
1213      let Some(href_val) = href else { continue };
1214
1215      let anchor_id = href_val.trim_start_matches('#');
1216      let Some((target_page, target_title)) = registry.get(anchor_id) else {
1217        continue;
1218      };
1219
1220      if target_page == current_page {
1221        continue;
1222      }
1223
1224      let rel = relative_page_path(current_page, target_page);
1225      let new_href = format!("{rel}#{anchor_id}");
1226
1227      let current_text = link_node.text_contents();
1228      let humanized = humanize_anchor(&href_val);
1229      let replace_text = current_text.trim().is_empty()
1230        || current_text.trim() == "{{ANCHOR}}"
1231        || current_text.trim() == humanized.trim();
1232
1233      let new_text = if replace_text {
1234        Some(target_title.clone())
1235      } else {
1236        None
1237      };
1238
1239      modifications.push((link_node, new_href, new_text));
1240    }
1241
1242    for (link_node, new_href, new_text) in modifications {
1243      if let Some(element) = link_node.as_element() {
1244        element
1245          .attributes
1246          .borrow_mut()
1247          .insert(local_name!("href"), new_href);
1248      }
1249      if let Some(text) = new_text {
1250        for child in link_node.children() {
1251          child.detach();
1252        }
1253        link_node.append(kuchikikiki::NodeRef::new_text(text));
1254      }
1255    }
1256  })
1257}
1258
1259/// Extract all inline text from a heading node.
1260pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1261  fn inner<'a>(node: &'a AstNode<'a>) -> String {
1262    let mut text = String::new();
1263    for child in node.children() {
1264      match &child.data.borrow().value {
1265        NodeValue::Text(t) => text.push_str(t),
1266        NodeValue::Code(t) => text.push_str(&t.literal),
1267        NodeValue::Link(..)
1268        | NodeValue::Emph
1269        | NodeValue::Strong
1270        | NodeValue::Strikethrough
1271        | NodeValue::Superscript
1272        | NodeValue::Subscript
1273        | NodeValue::FootnoteReference(..) => {
1274          text.push_str(&inner(child));
1275        },
1276        #[expect(clippy::match_same_arms, reason = "Explicit for clarity")]
1277        NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1278        _ => {},
1279      }
1280    }
1281    text
1282  }
1283  inner(node)
1284}
1285
1286/// Slugify heading text for use as a table-of-contents anchor.
1287///
1288/// The auto-generated heading `id` is produced by slugifying the *rendered*
1289/// HTML (see `process_header_anchors_html`), in which comrak has escaped any
1290/// markup characters: an option heading like `environments.<name>.deployment`
1291/// becomes `environments.&lt;name&gt;.deployment` before slugifying. The
1292/// table-of-contents, by contrast, slugifies the raw inline text (`<name>`),
1293/// which would yield a different slug and break "jump to header" for every
1294/// heading containing such characters.
1295///
1296/// To keep both in sync, escape the text the same way comrak does before
1297/// slugifying, so the TOC href matches the on-page heading `id`. The heading
1298/// `id` itself is intentionally left unchanged to preserve existing deep links.
1299#[must_use]
1300pub(crate) fn slugify_heading(text: &str) -> String {
1301  utils::slugify(&html_escape::encode_text(text))
1302}
1303
1304/// Collect all markdown files from the input directory
1305pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1306  let mut files = Vec::with_capacity(100);
1307
1308  for entry in WalkDir::new(input_dir)
1309    .follow_links(true)
1310    .into_iter()
1311    .filter_map(Result::ok)
1312  {
1313    let path = entry.path();
1314    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1315      files.push(path.to_owned());
1316    }
1317  }
1318
1319  trace!("Found {} markdown files to process", files.len());
1320  files
1321}
1322
/// Features that can be queried on a processor instance.
///
/// A plain, copyable tag type: variants carry no data and compare by
/// identity.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// Support for GitHub Flavored Markdown.
  Gfm,
  /// Support for the Nixpkgs documentation extensions.
  Nixpkgs,
  /// Syntax highlighting of code blocks.
  SyntaxHighlighting,
  /// Support for mapping manpage references to URLs.
  ManpageUrls,
}
1335
/// Blank out admonition `<div>` blocks while keeping the line count intact.
///
/// A line whose trimmed start begins with `<div class="admonition ` opens a
/// block (blocks may nest), and a line that, after removing leading
/// whitespace, is exactly `</div>` closes the innermost open block. Every
/// line belonging to a block, including its opening and closing lines, is
/// emitted as an empty line. All other lines are copied verbatim. Each output
/// line is terminated with `\n`.
fn remove_admonition_blocks_for_headers(content: &str) -> String {
  const ADMONITION_OPEN: &str = "<div class=\"admonition ";
  const DIV_CLOSE: &str = "</div>";

  let mut nesting = 0usize;
  let mut stripped = String::with_capacity(content.len());

  for raw_line in content.lines() {
    let body = raw_line.trim_start();

    if body.starts_with(ADMONITION_OPEN) {
      nesting += 1;
    } else if nesting == 0 {
      // Outside of any admonition: keep the line as written
      stripped.push_str(raw_line);
    } else if body == DIV_CLOSE {
      nesting -= 1;
    }

    // Suppressed lines still contribute their newline
    stripped.push('\n');
  }

  stripped
}
1362
/// Returns `true` when `line` looks like a setext heading underline.
///
/// Ignoring whitespace, the line must contain at least one marker and all
/// of its characters must be `=` or all must be `-`. Empty and
/// whitespace-only lines are rejected: they contain no marker at all, so they
/// cannot underline a heading.
fn is_setext_heading_underline(line: &str) -> bool {
  let mut markers = line.chars().filter(|ch| !ch.is_whitespace());
  match markers.next() {
    // Every remaining marker has to repeat the first one; `=` and `-` may
    // not be mixed.
    Some(first @ ('=' | '-')) => markers.all(|ch| ch == first),
    _ => false,
  }
}
1368
1369/// Standalone HTML post-processing function to avoid borrowing issues.
1370fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1371where
1372  F: FnOnce(&kuchikikiki::NodeRef),
1373{
1374  process_safe(
1375    html,
1376    |html| {
1377      use tendril::TendrilSink;
1378
1379      let document = kuchikikiki::parse_html().one(html);
1380      transform_fn(&document);
1381
1382      let mut out = Vec::new();
1383      let _ = document.serialize(&mut out);
1384      String::from_utf8_lossy(&out).into_owned()
1385    },
1386    html,
1387  )
1388}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs