ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! This module contains the main implementation of `MarkdownProcessor` and its
4//! methods, focused on the core rendering pipeline and configuration
5//! management.
6use std::{
7  collections::HashMap,
8  path::{Path, PathBuf},
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use walkdir::WalkDir;
20
21/// Error type for DOM operations.
22#[derive(Debug, thiserror::Error)]
23pub enum DomError {
24  #[error("CSS selector failed: {0}")]
25  SelectorError(String),
26  #[error("DOM serialization failed: {0}")]
27  SerializationError(String),
28}
29
30/// Result type for DOM operations.
31pub type DomResult<T> = Result<T, DomError>;
32
33/// Safely select DOM elements with graceful error handling.
34fn safe_select(
35  document: &kuchikikiki::NodeRef,
36  selector: &str,
37) -> Vec<kuchikikiki::NodeRef> {
38  match document.select(selector) {
39    Ok(selections) => selections.map(|sel| sel.as_node().clone()).collect(),
40    Err(e) => {
41      log::warn!("DOM selector '{selector}' failed: {e:?}");
42      Vec::new()
43    },
44  }
45}
46
47use super::{
48  process::process_safe,
49  types::{
50    AstTransformer,
51    MarkdownOptions,
52    MarkdownProcessor,
53    PromptTransformer,
54  },
55};
56use crate::{
57  syntax::create_default_manager,
58  types::{Header, MarkdownResult},
59  utils,
60};
61
62impl MarkdownProcessor {
63  /// Create a new `MarkdownProcessor` with the given options.
64  #[must_use]
65  pub fn new(options: MarkdownOptions) -> Self {
66    let manpage_urls = options
67      .manpage_urls_path
68      .as_ref()
69      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
70
71    let syntax_manager = if options.highlight_code {
72      match create_default_manager() {
73        Ok(manager) => {
74          log::info!("Syntax highlighting initialized successfully");
75          Some(manager)
76        },
77        Err(e) => {
78          log::error!("Failed to initialize syntax highlighting: {e}");
79          log::warn!(
80            "Continuing without syntax highlighting - code blocks will not be \
81             highlighted"
82          );
83          None
84        },
85      }
86    } else {
87      None
88    };
89
90    Self {
91      options,
92      manpage_urls,
93      syntax_manager,
94      base_dir: std::path::PathBuf::from("."),
95    }
96  }
97
98  /// Access processor options.
99  #[must_use]
100  pub const fn options(&self) -> &MarkdownOptions {
101    &self.options
102  }
103
104  /// Set the base directory for resolving relative file paths.
105  #[must_use]
106  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
107    self.base_dir = base_dir.to_path_buf();
108    self
109  }
110
111  /// Check if a specific feature is enabled.
112  #[must_use]
113  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
114    match feature {
115      ProcessorFeature::Gfm => self.options.gfm,
116      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
117      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
118      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
119    }
120  }
121
122  /// Get the manpage URLs mapping for use with standalone functions.
123  #[must_use]
124  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
125    self.manpage_urls.as_ref()
126  }
127
128  /// Highlight all code blocks in HTML using the configured syntax highlighter
129  #[must_use]
130  pub fn highlight_codeblocks(&self, html: &str) -> String {
131    use kuchikikiki::parse_html;
132    use tendril::TendrilSink;
133
134    if !self.options.highlight_code || self.syntax_manager.is_none() {
135      return html.to_string();
136    }
137
138    let document = parse_html().one(html);
139
140    // Collect all code blocks first to avoid DOM modification during iteration
141    let mut code_blocks = Vec::new();
142    for pre_node in safe_select(&document, "pre > code") {
143      let code_node = pre_node;
144      if let Some(element) = code_node.as_element() {
145        let language = element
146          .attributes
147          .borrow()
148          .get("class")
149          .and_then(|class| class.strip_prefix("language-"))
150          .unwrap_or("text")
151          .to_string();
152        let code_text = code_node.text_contents();
153
154        if let Some(pre_parent) = code_node.parent() {
155          code_blocks.push((
156            pre_parent.clone(),
157            code_node.clone(),
158            code_text,
159            language,
160          ));
161        }
162      }
163    }
164
165    // Process each code block
166    for (pre_element, _code_node, code_text, language) in code_blocks {
167      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
168      {
169        // Wrap highlighted HTML in <pre><code> with appropriate classes
170        let wrapped_html = format!(
171          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
172        );
173        let fragment = parse_html().one(wrapped_html.as_str());
174        pre_element.insert_after(fragment);
175        pre_element.detach();
176      }
177      // Do not add highlight/language-* classes if not highlighted
178    }
179
180    let mut buf = Vec::new();
181    if let Err(e) = document.serialize(&mut buf) {
182      log::warn!("DOM serialization failed: {e:?}");
183      return html.to_string(); // Return original HTML if serialization fails
184    }
185    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
186  }
187
188  /// Handle hard tabs in code blocks according to configuration
189  fn handle_hardtabs(&self, code: &str) -> String {
190    use super::types::TabStyle;
191
192    // Check if there are any hard tabs
193    if !code.contains('\t') {
194      return code.to_string();
195    }
196
197    match self.options.tab_style {
198      // Do nothing
199      TabStyle::None => code.to_string(),
200
201      // Warn, but do nothing.
202      TabStyle::Warn => {
203        log::warn!(
204          "Hard tabs detected in code block. Consider using spaces for \
205           consistency. Tools like editorconfig may help you normalize spaces \
206           in your documents."
207        );
208        code.to_string()
209      },
210
211      // Do not warn, only inform in debug mode. Then return
212      // the updated code.
213      TabStyle::Normalize => {
214        log::debug!("Replacing hard tabs with spaces");
215        code.replace('\t', "  ")
216      },
217    }
218  }
219
220  /// Process hard tabs in code blocks within markdown content
221  fn process_hardtabs(&self, markdown: &str) -> String {
222    use super::types::TabStyle;
223
224    // If no tab handling is needed, return as-is
225    if self.options.tab_style == TabStyle::None {
226      return markdown.to_string();
227    }
228
229    let mut result = String::with_capacity(markdown.len());
230    let mut lines = markdown.lines().peekable();
231    let mut in_code_block = false;
232    let mut code_fence_char = None;
233    let mut code_fence_count = 0;
234
235    while let Some(line) = lines.next() {
236      let trimmed = line.trim_start();
237
238      // Check for code fences
239      if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
240        let Some(fence_char) = trimmed.chars().next() else {
241          // If the line is empty after trimming, it can't be a valid code fence
242          // Just continue processing the line normally
243          result.push_str(line);
244          result.push('\n');
245          continue;
246        };
247        let fence_count =
248          trimmed.chars().take_while(|&c| c == fence_char).count();
249
250        if fence_count >= 3 {
251          if !in_code_block {
252            // Starting a code block
253            in_code_block = true;
254            code_fence_char = Some(fence_char);
255            code_fence_count = fence_count;
256          } else if code_fence_char == Some(fence_char)
257            && fence_count >= code_fence_count
258          {
259            // Ending a code block
260            in_code_block = false;
261            code_fence_char = None;
262            code_fence_count = 0;
263          }
264        }
265      }
266
267      // Process line based on whether we're in a code block
268      let processed_line = if in_code_block && line.contains('\t') {
269        self.handle_hardtabs(line)
270      } else {
271        line.to_string()
272      };
273
274      result.push_str(&processed_line);
275
276      // Add newline unless this is the last line
277      if lines.peek().is_some() {
278        result.push('\n');
279      }
280    }
281
282    result
283  }
284
285  /// Highlight code using the configured syntax highlighter, returns HTML
286  /// string
287  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
288    if !self.options.highlight_code {
289      return None;
290    }
291
292    let syntax_manager = self.syntax_manager.as_ref()?;
293
294    syntax_manager
295      .highlight_code(code, language, self.options.highlight_theme.as_deref())
296      .ok()
297  }
298
299  /// Render Markdown to HTML, extracting headers and title.
300  #[must_use]
301  pub fn render(&self, markdown: &str) -> MarkdownResult {
302    let (preprocessed, included_files) = self.preprocess(markdown);
303    let (headers, title) = self.extract_headers(&preprocessed);
304    let html = self.process_html_pipeline(&preprocessed);
305
306    MarkdownResult {
307      html,
308      headers,
309      title,
310      included_files,
311    }
312  }
313
314  /// Process the HTML generation and post-processing pipeline.
315  fn process_html_pipeline(&self, content: &str) -> String {
316    let mut html = self.convert_to_html(content);
317
318    // Apply feature-specific post-processing
319    if cfg!(feature = "ndg-flavored") {
320      #[cfg(feature = "ndg-flavored")]
321      {
322        html = super::extensions::process_option_references(
323          &html,
324          self.options.valid_options.as_ref(),
325        );
326      }
327    }
328
329    if self.options.nixpkgs {
330      html = self.process_manpage_references_html(&html);
331    }
332
333    if self.options.highlight_code {
334      html = self.highlight_codeblocks(&html);
335    }
336
337    self.kuchiki_postprocess(&html)
338  }
339
340  /// Preprocess the markdown content with all enabled transformations.
341  fn preprocess(
342    &self,
343    content: &str,
344  ) -> (String, Vec<crate::types::IncludedFile>) {
345    let mut processed = content.to_string();
346    let mut included_files = Vec::new();
347
348    // Process MyST-style autolinks first
349    processed = super::extensions::process_myst_autolinks(&processed);
350
351    // Handle hard tabs in code blocks
352    processed = self.process_hardtabs(&processed);
353
354    if self.options.nixpkgs {
355      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
356      processed = content;
357      included_files = files;
358    }
359
360    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
361      processed = super::extensions::process_role_markup(
362        &processed,
363        self.manpage_urls.as_ref(),
364        self.options.auto_link_options,
365        self.options.valid_options.as_ref(),
366      );
367    }
368
369    (processed, included_files)
370  }
371
372  /// Apply Nixpkgs-specific preprocessing steps.
373  #[cfg(feature = "nixpkgs")]
374  fn apply_nixpkgs_preprocessing(
375    &self,
376    content: &str,
377  ) -> (String, Vec<crate::types::IncludedFile>) {
378    let (with_includes, included_files) =
379      match super::extensions::process_file_includes(content, &self.base_dir, 0)
380      {
381        Ok(result) => result,
382        Err(e) => {
383          log::warn!(
384            "File include processing failed: {e}. Continuing without includes."
385          );
386          (content.to_string(), Vec::new())
387        },
388      };
389    let with_blocks = super::extensions::process_block_elements(&with_includes);
390    let processed = super::extensions::process_inline_anchors(&with_blocks);
391    (processed, included_files)
392  }
393
394  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
395  #[cfg(not(feature = "nixpkgs"))]
396  fn apply_nixpkgs_preprocessing(
397    &self,
398    content: &str,
399  ) -> (String, Vec<crate::types::IncludedFile>) {
400    (content.to_string(), Vec::new())
401  }
402
403  /// Extract headers and title from the markdown content.
404  #[must_use]
405  pub fn extract_headers(
406    &self,
407    content: &str,
408  ) -> (Vec<Header>, Option<String>) {
409    use std::fmt::Write;
410
411    let arena = Arena::new();
412    let options = self.comrak_options();
413
414    // Normalize custom anchors with no heading level to h2
415    let mut normalized = String::with_capacity(content.len());
416    for line in content.lines() {
417      let trimmed = line.trim_end();
418      if !trimmed.starts_with('#')
419        && let Some(anchor_start) = trimmed.rfind("{#")
420        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
421      {
422        let text = trimmed[..anchor_start].trim_end();
423        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
424        let _ = writeln!(normalized, "## {text} {{#{id}}}");
425        continue;
426      }
427      normalized.push_str(line);
428      normalized.push('\n');
429    }
430
431    let root = parse_document(&arena, &normalized, &options);
432
433    let mut headers = Vec::new();
434    let mut found_title = None;
435
436    for node in root.descendants() {
437      if let NodeValue::Heading(NodeHeading { level, .. }) =
438        &node.data.borrow().value
439      {
440        let mut text = String::new();
441        let mut explicit_id = None;
442
443        for child in node.children() {
444          match &child.data.borrow().value {
445            NodeValue::Text(t) => text.push_str(t),
446            NodeValue::Code(t) => text.push_str(&t.literal),
447            NodeValue::Link(..)
448            | NodeValue::Emph
449            | NodeValue::Strong
450            | NodeValue::Subscript
451            | NodeValue::Strikethrough
452            | NodeValue::Superscript
453            | NodeValue::FootnoteReference(..) => {
454              text.push_str(&extract_inline_text(child));
455            },
456            NodeValue::HtmlInline(html) => {
457              // Look for explicit anchor in HTML inline node: {#id}
458              let html_str = html.as_str();
459              if let Some(start) = html_str.find("{#")
460                && let Some(end) = html_str[start..].find('}')
461              {
462                let anchor = &html_str[start + 2..start + end];
463                explicit_id = Some(anchor.to_string());
464              }
465            },
466            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
467            NodeValue::Image(..) => {},
468            _ => {},
469          }
470        }
471
472        // Check for trailing {#id} in heading text
473        let trimmed = text.trim_end();
474        #[allow(clippy::option_if_let_else)]
475        // Nested options clearer with if-let
476        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
477          if let Some(end) = trimmed[start..].find('}') {
478            let anchor = &trimmed[start + 2..start + end];
479            (trimmed[..start].trim_end().to_string(), anchor.to_string())
480          } else {
481            (
482              text.clone(),
483              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
484            )
485          }
486        } else {
487          (
488            text.clone(),
489            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
490          )
491        };
492        if *level == 1 && found_title.is_none() {
493          found_title = Some(final_text.clone());
494        }
495        headers.push(Header {
496          text: final_text,
497          level: *level,
498          id,
499        });
500      }
501    }
502
503    (headers, found_title)
504  }
505
506  /// Convert markdown to HTML using comrak and configured options.
507  fn convert_to_html(&self, content: &str) -> String {
508    // Process directly without panic catching for better performance
509    let arena = Arena::new();
510    let options = self.comrak_options();
511    let root = parse_document(&arena, content, &options);
512
513    // Apply AST transformations
514    let prompt_transformer = PromptTransformer;
515    prompt_transformer.transform(root);
516
517    let mut html_output = String::new();
518    comrak::format_html(root, &options, &mut html_output).unwrap_or_default();
519
520    // Post-process HTML to handle header anchors
521    Self::process_header_anchors_html(&html_output)
522  }
523
524  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
525  /// proper id attributes. Also adds auto-generated IDs to headers without
526  /// explicit anchors.
527  fn process_header_anchors_html(html: &str) -> String {
528    use std::sync::LazyLock;
529
530    use regex::Regex;
531
532    // First pass: handle explicit {#id} syntax
533    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
534      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
535        .unwrap_or_else(|e| {
536          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
537          utils::never_matching_regex().unwrap_or_else(|_| {
538            #[allow(
539              clippy::expect_used,
540              reason = "This pattern is guaranteed to be valid"
541            )]
542            Regex::new(r"[^\s\S]")
543              .expect("regex pattern [^\\s\\S] should always compile")
544          })
545        })
546    });
547
548    // Second pass: add IDs to headers without attributes (no id yet)
549    // Matches <h1>content</h1> but not <h1 id="...">content</h1>
550    static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
551      Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
552        log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
553        utils::never_matching_regex().unwrap_or_else(|_| {
554          #[allow(
555            clippy::expect_used,
556            reason = "This pattern is guaranteed to be valid"
557          )]
558          Regex::new(r"[^\s\S]")
559            .expect("regex pattern [^\\s\\S] should always compile")
560        })
561      })
562    });
563
564    // Regex to strip HTML tags for slugification
565    static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
566      Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
567        log::error!("Failed to compile HTML_TAG_RE regex: {e}");
568        utils::never_matching_regex().unwrap_or_else(|_| {
569          #[allow(
570            clippy::expect_used,
571            reason = "This pattern is guaranteed to be valid"
572          )]
573          Regex::new(r"[^\s\S]")
574            .expect("regex pattern [^\\s\\S] should always compile")
575        })
576      })
577    });
578
579    // First pass: explicit {#id} syntax
580    let result = HEADER_ANCHOR_RE
581      .replace_all(html, |caps: &regex::Captures| {
582        let level = &caps[1];
583        let prefix = &caps[2];
584        let id = &caps[3];
585        let suffix = &caps[4];
586        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
587      })
588      .to_string();
589
590    // Second pass: add auto-generated IDs to headers without id attribute
591    HEADER_NO_ID_RE
592      .replace_all(&result, |caps: &regex::Captures| {
593        let level = &caps[1];
594        let content = &caps[2];
595        // Strip HTML tags and slugify the text content
596        let text_only = HTML_TAG_RE.replace_all(content, "");
597        let id = utils::slugify(&text_only);
598        if id.is_empty() {
599          // If slugify produces empty string, keep header without id
600          format!("<h{level}>{content}</h{level}>")
601        } else {
602          format!("<h{level} id=\"{id}\">{content}</h{level}>")
603        }
604      })
605      .to_string()
606  }
607
608  /// Build comrak options from `MarkdownOptions` and feature flags.
609  fn comrak_options(&self) -> Options<'_> {
610    let mut options = Options::default();
611    if self.options.gfm {
612      options.extension.table = true;
613      options.extension.footnotes = true;
614      options.extension.strikethrough = true;
615      options.extension.tasklist = true;
616      options.extension.superscript = true;
617      options.extension.autolink = true;
618    }
619    options.render.r#unsafe = true;
620    // Enable description lists but keep custom header processing
621    options.extension.header_ids = None;
622    options.extension.description_lists = true;
623    options
624  }
625
626  /// Post-process HTML to enhance manpage references with URL links.
627  #[cfg(feature = "nixpkgs")]
628  fn process_manpage_references_html(&self, html: &str) -> String {
629    super::extensions::process_manpage_references(
630      html,
631      self.manpage_urls.as_ref(),
632    )
633  }
634
635  /// Post-process HTML to enhance manpage references (no-op when feature
636  /// disabled).
637  #[cfg(not(feature = "nixpkgs"))]
638  fn process_manpage_references_html(&self, html: &str) -> String {
639    html.to_string()
640  }
641
642  /// HTML post-processing using kuchiki DOM manipulation.
643  #[allow(
644    clippy::unused_self,
645    reason = "Method signature matches processor pattern"
646  )]
647  fn kuchiki_postprocess(&self, html: &str) -> String {
648    // Use a standalone function to avoid borrowing issues
649    kuchiki_postprocess_html(html, |document| {
650      Self::apply_dom_transformations(document);
651    })
652  }
653
654  /// Apply all DOM transformations to the parsed HTML document.
655  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
656    Self::process_list_item_id_markers(document);
657    Self::process_header_anchor_comments(document);
658    Self::process_list_item_inline_anchors(document);
659    Self::process_paragraph_inline_anchors(document);
660    Self::process_remaining_inline_anchors(document);
661    Self::process_option_anchor_links(document);
662    Self::process_empty_auto_links(document);
663    Self::process_empty_html_links(document);
664  }
665
666  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
667  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
668    let mut to_modify = Vec::new();
669
670    for comment in document.inclusive_descendants() {
671      if let Some(comment_node) = comment.as_comment() {
672        let comment_text = comment_node.borrow();
673        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
674          let id = comment_text[id_start + 16..].trim();
675          if !id.is_empty()
676            && id
677              .chars()
678              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
679          {
680            // Check if this comment is inside an <li> element
681            if let Some(parent) = comment.parent()
682              && let Some(element) = parent.as_element()
683              && element.name.local.as_ref() == "li"
684            {
685              to_modify.push((comment.clone(), id.to_string()));
686            }
687          }
688        }
689      }
690    }
691
692    for (comment_node, id) in to_modify {
693      let span = kuchikikiki::NodeRef::new_element(
694        markup5ever::QualName::new(
695          None,
696          markup5ever::ns!(html),
697          local_name!("span"),
698        ),
699        vec![
700          (
701            kuchikikiki::ExpandedName::new("", "id"),
702            kuchikikiki::Attribute {
703              prefix: None,
704              value:  id,
705            },
706          ),
707          (
708            kuchikikiki::ExpandedName::new("", "class"),
709            kuchikikiki::Attribute {
710              prefix: None,
711              value:  "nixos-anchor".into(),
712            },
713          ),
714        ],
715      );
716      comment_node.insert_after(span);
717      comment_node.detach();
718    }
719  }
720
721  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
722  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
723    let mut to_modify = Vec::new();
724
725    for comment in document.inclusive_descendants() {
726      if let Some(comment_node) = comment.as_comment() {
727        let comment_text = comment_node.borrow();
728        if let Some(anchor_start) = comment_text.find("anchor:") {
729          let id = comment_text[anchor_start + 7..].trim();
730          if !id.is_empty()
731            && id
732              .chars()
733              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
734          {
735            // Check if this comment is inside a header element
736            if let Some(parent) = comment.parent()
737              && let Some(element) = parent.as_element()
738            {
739              let tag_name = element.name.local.as_ref();
740              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
741                to_modify.push((
742                  parent.clone(),
743                  comment.clone(),
744                  id.to_string(),
745                ));
746              }
747            }
748          }
749        }
750      }
751    }
752
753    for (header_element, comment_node, id) in to_modify {
754      if let Some(element) = header_element.as_element() {
755        element
756          .attributes
757          .borrow_mut()
758          .insert(local_name!("id"), id);
759        comment_node.detach();
760      }
761    }
762  }
763
764  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
765  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
766    for li_node in safe_select(document, "li") {
767      let li_element = li_node;
768
769      // Check if this list item contains code elements
770      let has_code = !safe_select(&li_element, "code, pre").is_empty();
771      if has_code {
772        continue; // Skip list items with code blocks
773      }
774
775      let text_content = li_element.text_contents();
776
777      if let Some(anchor_start) = text_content.find("[]{#")
778        && let Some(anchor_end) = text_content[anchor_start..].find('}')
779      {
780        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
781        if !id.is_empty()
782          && id
783            .chars()
784            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
785        {
786          let remaining_content =
787            &text_content[anchor_start + anchor_end + 1..];
788
789          // Clear current content and rebuild
790          for child in li_element.children() {
791            child.detach();
792          }
793
794          let span = kuchikikiki::NodeRef::new_element(
795            markup5ever::QualName::new(
796              None,
797              markup5ever::ns!(html),
798              local_name!("span"),
799            ),
800            vec![
801              (
802                kuchikikiki::ExpandedName::new("", "id"),
803                kuchikikiki::Attribute {
804                  prefix: None,
805                  value:  id.into(),
806                },
807              ),
808              (
809                kuchikikiki::ExpandedName::new("", "class"),
810                kuchikikiki::Attribute {
811                  prefix: None,
812                  value:  "nixos-anchor".into(),
813                },
814              ),
815            ],
816          );
817          li_element.append(span);
818          if !remaining_content.is_empty() {
819            li_element
820              .append(kuchikikiki::NodeRef::new_text(remaining_content));
821          }
822        }
823      }
824    }
825  }
826
827  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
828  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
829    for p_node in safe_select(document, "p") {
830      let p_element = p_node;
831
832      // Check if this paragraph contains code elements
833      let has_code = !safe_select(&p_element, "code, pre").is_empty();
834      if has_code {
835        continue; // Skip paragraphs with code blocks
836      }
837
838      let text_content = p_element.text_contents();
839
840      if let Some(anchor_start) = text_content.find("[]{#")
841        && let Some(anchor_end) = text_content[anchor_start..].find('}')
842      {
843        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
844        if !id.is_empty()
845          && id
846            .chars()
847            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
848        {
849          let remaining_content =
850            &text_content[anchor_start + anchor_end + 1..];
851
852          // Clear current content and rebuild
853          for child in p_element.children() {
854            child.detach();
855          }
856
857          let span = kuchikikiki::NodeRef::new_element(
858            markup5ever::QualName::new(
859              None,
860              markup5ever::ns!(html),
861              local_name!("span"),
862            ),
863            vec![
864              (
865                kuchikikiki::ExpandedName::new("", "id"),
866                kuchikikiki::Attribute {
867                  prefix: None,
868                  value:  id.into(),
869                },
870              ),
871              (
872                kuchikikiki::ExpandedName::new("", "class"),
873                kuchikikiki::Attribute {
874                  prefix: None,
875                  value:  "nixos-anchor".into(),
876                },
877              ),
878            ],
879          );
880          p_element.append(span);
881          if !remaining_content.is_empty() {
882            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
883          }
884        }
885      }
886    }
887  }
888
889  /// Process remaining standalone inline anchors throughout the document
890  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
891    let mut text_nodes_to_process = Vec::new();
892
893    for node in document.inclusive_descendants() {
894      if let Some(text_node) = node.as_text() {
895        // Check if this text node is inside a code block
896        let mut parent = node.parent();
897        let mut in_code = false;
898        while let Some(p) = parent {
899          if let Some(element) = p.as_element()
900            && (element.name.local == local_name!("code")
901              || element.name.local == local_name!("pre"))
902          {
903            in_code = true;
904            break;
905          }
906          parent = p.parent();
907        }
908
909        // Only process if not in code
910        if !in_code {
911          let text_content = text_node.borrow().clone();
912          if text_content.contains("[]{#") {
913            text_nodes_to_process.push((node.clone(), text_content));
914          }
915        }
916      }
917    }
918
919    for (text_node, text_content) in text_nodes_to_process {
920      let mut last_end = 0;
921      let mut new_children = Vec::new();
922
923      // Simple pattern matching for []{#id}
924      let chars = text_content.chars().collect::<Vec<_>>();
925      let mut i = 0;
926      while i < chars.len() {
927        if i + 4 < chars.len()
928          && chars[i] == '['
929          && chars[i + 1] == ']'
930          && chars[i + 2] == '{'
931          && chars[i + 3] == '#'
932        {
933          // Found start of anchor pattern
934          let anchor_start = i;
935          i += 4; // skip "[]{#"
936
937          let mut id = String::new();
938          while i < chars.len() && chars[i] != '}' {
939            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
940            {
941              id.push(chars[i]);
942              i += 1;
943            } else {
944              break;
945            }
946          }
947
948          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
949            // Valid anchor found
950            let anchor_end = i + 1;
951
952            // Add text before anchor
953            if anchor_start > last_end {
954              let before_text: String =
955                chars[last_end..anchor_start].iter().collect();
956              if !before_text.is_empty() {
957                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
958              }
959            }
960
961            // Add span element
962            let span = kuchikikiki::NodeRef::new_element(
963              markup5ever::QualName::new(
964                None,
965                markup5ever::ns!(html),
966                local_name!("span"),
967              ),
968              vec![
969                (
970                  kuchikikiki::ExpandedName::new("", "id"),
971                  kuchikikiki::Attribute {
972                    prefix: None,
973                    value:  id,
974                  },
975                ),
976                (
977                  kuchikikiki::ExpandedName::new("", "class"),
978                  kuchikikiki::Attribute {
979                    prefix: None,
980                    value:  "nixos-anchor".into(),
981                  },
982                ),
983              ],
984            );
985            new_children.push(span);
986
987            last_end = anchor_end;
988            i = anchor_end;
989          } else {
990            i += 1;
991          }
992        } else {
993          i += 1;
994        }
995      }
996
997      // Add remaining text
998      if last_end < chars.len() {
999        let after_text: String = chars[last_end..].iter().collect();
1000        if !after_text.is_empty() {
1001          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
1002        }
1003      }
1004
1005      // Replace text node if we found anchors
1006      if !new_children.is_empty() {
1007        for child in new_children {
1008          text_node.insert_before(child);
1009        }
1010        text_node.detach();
1011      }
1012    }
1013  }
1014
1015  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
1016  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
1017    for link_node in safe_select(document, "a") {
1018      let link_element = link_node;
1019      if let Some(element) = link_element.as_element() {
1020        let href = element
1021          .attributes
1022          .borrow()
1023          .get(local_name!("href"))
1024          .map(std::string::ToString::to_string);
1025        let text_content = link_element.text_contents();
1026
1027        if let Some(href_value) = href
1028          && href_value.starts_with('#')
1029          && (text_content.trim().is_empty()
1030            || text_content.trim() == "{{ANCHOR}}")
1031        {
1032          // Clear placeholder text if present
1033          if text_content.trim() == "{{ANCHOR}}" {
1034            for child in link_element.children() {
1035              child.detach();
1036            }
1037          }
1038          // Empty link with anchor - add humanized text
1039          let display_text = Self::humanize_anchor_id(&href_value);
1040          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1041        }
1042      }
1043    }
1044  }
1045
1046  /// Process empty HTML links that have no content
1047  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1048    for link_node in safe_select(document, "a[href^='#']") {
1049      let link_element = link_node;
1050      let text_content = link_element.text_contents();
1051
1052      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1053        // Clear placeholder text if present
1054        if text_content.trim() == "{{ANCHOR}}" {
1055          for child in link_element.children() {
1056            child.detach();
1057          }
1058        }
1059        if let Some(element) = link_element.as_element()
1060          && let Some(href) =
1061            element.attributes.borrow().get(local_name!("href"))
1062        {
1063          let display_text = Self::humanize_anchor_id(href);
1064          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1065        }
1066      }
1067    }
1068  }
1069
1070  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1071  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1072    let mut to_modify = Vec::new();
1073
1074    // Collect all option anchor links first
1075    for link_node in safe_select(document, "a[href^='#opt-']") {
1076      let link_element = link_node;
1077      if let Some(element) = link_element.as_element() {
1078        let href = element
1079          .attributes
1080          .borrow()
1081          .get(local_name!("href"))
1082          .map(std::string::ToString::to_string);
1083        let text_content = link_element.text_contents();
1084
1085        if let Some(href_value) = href
1086          && href_value.starts_with("#opt-")
1087        {
1088          let option_anchor = href_value[1..].to_string(); // remove the leading #
1089          let needs_text_replacement = text_content.trim().is_empty()
1090            || text_content.trim() == "{{ANCHOR}}";
1091          to_modify.push((
1092            link_element.clone(),
1093            option_anchor,
1094            needs_text_replacement,
1095          ));
1096        }
1097      }
1098    }
1099
1100    // Apply modifications
1101    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1102      if let Some(element) = link_element.as_element() {
1103        let new_href = format!("options.html#{option_anchor}");
1104        element
1105          .attributes
1106          .borrow_mut()
1107          .insert(local_name!("href"), new_href);
1108
1109        if needs_text_replacement {
1110          // Clear existing content
1111          for child in link_element.children() {
1112            child.detach();
1113          }
1114
1115          // Extract option name from anchor
1116          // opt-services-nginx-enable -> services.nginx.enable
1117          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1118            let option_name = option_path.replace('-', ".");
1119            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1120          }
1121        }
1122      }
1123    }
1124  }
1125
1126  /// Convert an anchor ID to human-readable text
1127  fn humanize_anchor_id(anchor: &str) -> String {
1128    // Strip the leading #
1129    let cleaned = anchor.trim_start_matches('#');
1130
1131    // Remove common prefixes
1132    let without_prefix = cleaned
1133      .trim_start_matches("sec-")
1134      .trim_start_matches("ssec-")
1135      .trim_start_matches("opt-");
1136
1137    // Replace separators with spaces
1138    let spaced = without_prefix.replace(['-', '_'], " ");
1139
1140    // Capitalize each word
1141    spaced
1142      .split_whitespace()
1143      .map(|word| {
1144        let mut chars = word.chars();
1145        chars.next().map_or_else(String::new, |c| {
1146          c.to_uppercase().collect::<String>() + chars.as_str()
1147        })
1148      })
1149      .collect::<Vec<String>>()
1150      .join(" ")
1151  }
1152}
1153
1154/// Extract all inline text from a heading node.
1155pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1156  let mut text = String::new();
1157  for child in node.children() {
1158    match &child.data.borrow().value {
1159      NodeValue::Text(t) => text.push_str(t),
1160      NodeValue::Code(t) => text.push_str(&t.literal),
1161      NodeValue::Link(..)
1162      | NodeValue::Emph
1163      | NodeValue::Strong
1164      | NodeValue::Strikethrough
1165      | NodeValue::Superscript
1166      | NodeValue::Subscript
1167      | NodeValue::FootnoteReference(..) => {
1168        text.push_str(&extract_inline_text(child));
1169      },
1170      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1171      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1172      _ => {},
1173    }
1174  }
1175  text
1176}
1177
1178/// Collect all markdown files from the input directory
1179pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1180  let mut files = Vec::with_capacity(100);
1181
1182  for entry in WalkDir::new(input_dir)
1183    .follow_links(true)
1184    .into_iter()
1185    .filter_map(Result::ok)
1186  {
1187    let path = entry.path();
1188    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1189      files.push(path.to_owned());
1190    }
1191  }
1192
1193  trace!("Found {} markdown files to process", files.len());
1194  files
1195}
1196
/// Features that can be queried on a processor instance.
///
/// The type is a plain fieldless enum; it is `Copy`, comparable and
/// hashable, so it can be stored in sets or used as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown support
  Gfm,
  /// Nixpkgs documentation extensions
  Nixpkgs,
  /// Syntax highlighting for code blocks
  SyntaxHighlighting,
  /// Manpage URL mapping support
  ManpageUrls,
}
1209
1210/// Standalone HTML post-processing function to avoid borrowing issues.
1211fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1212where
1213  F: FnOnce(&kuchikikiki::NodeRef),
1214{
1215  process_safe(
1216    html,
1217    |html| {
1218      use tendril::TendrilSink;
1219
1220      let document = kuchikikiki::parse_html().one(html);
1221      transform_fn(&document);
1222
1223      let mut out = Vec::new();
1224      let _ = document.serialize(&mut out);
1225      String::from_utf8(out).unwrap_or_default()
1226    },
1227    html,
1228  )
1229}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs