ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! This module contains the main implementation of `MarkdownProcessor` and its
4//! methods, focused on the core rendering pipeline and configuration
5//! management.
6use std::{
7  collections::HashMap,
8  path::{Path, PathBuf},
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use walkdir::WalkDir;
20
/// Error type for DOM operations.
///
/// Both variants carry a `String` that is interpolated into the error
/// message produced by the `thiserror` derive.
#[derive(Debug, thiserror::Error)]
pub enum DomError {
  /// A CSS selector could not be applied; the payload is appended to the
  /// message.
  #[error("CSS selector failed: {0}")]
  SelectorError(String),
  /// Serializing the DOM failed; the payload is appended to the message.
  #[error("DOM serialization failed: {0}")]
  SerializationError(String),
}
29
/// Result type for DOM operations.
///
/// Shorthand for `Result<T, DomError>`.
pub type DomResult<T> = Result<T, DomError>;
32
33/// Safely select DOM elements with graceful error handling.
34fn safe_select(
35  document: &kuchikikiki::NodeRef,
36  selector: &str,
37) -> Vec<kuchikikiki::NodeRef> {
38  match document.select(selector) {
39    Ok(selections) => selections.map(|sel| sel.as_node().clone()).collect(),
40    Err(e) => {
41      log::warn!("DOM selector '{selector}' failed: {e:?}");
42      Vec::new()
43    },
44  }
45}
46
47use super::{
48  process::process_safe,
49  types::{
50    AstTransformer,
51    MarkdownOptions,
52    MarkdownProcessor,
53    PromptTransformer,
54  },
55};
56use crate::{
57  syntax::create_default_manager,
58  types::{Header, MarkdownResult},
59  utils,
60};
61
62impl MarkdownProcessor {
63  /// Create a new `MarkdownProcessor` with the given options.
64  #[must_use]
65  pub fn new(options: MarkdownOptions) -> Self {
66    let manpage_urls = options
67      .manpage_urls_path
68      .as_ref()
69      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
70
71    let syntax_manager = if options.highlight_code {
72      match create_default_manager() {
73        Ok(manager) => {
74          log::info!("Syntax highlighting initialized successfully");
75          Some(manager)
76        },
77        Err(e) => {
78          log::error!("Failed to initialize syntax highlighting: {e}");
79          log::warn!(
80            "Continuing without syntax highlighting - code blocks will not be \
81             highlighted"
82          );
83          None
84        },
85      }
86    } else {
87      None
88    };
89
90    Self {
91      options,
92      manpage_urls,
93      syntax_manager,
94      base_dir: std::path::PathBuf::from("."),
95    }
96  }
97
  /// Access processor options.
  ///
  /// Returns a shared reference to the [`MarkdownOptions`] stored in this
  /// processor.
  #[must_use]
  pub const fn options(&self) -> &MarkdownOptions {
    &self.options
  }
103
104  /// Set the base directory for resolving relative file paths.
105  #[must_use]
106  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
107    self.base_dir = base_dir.to_path_buf();
108    self
109  }
110
  /// Check if a specific feature is enabled.
  ///
  /// `Gfm`, `Nixpkgs` and `SyntaxHighlighting` report the corresponding
  /// boolean in the processor options. Note that `SyntaxHighlighting`
  /// therefore reflects the requested option, not whether a syntax manager
  /// is actually present. `ManpageUrls` reports whether a manpage URL
  /// mapping is currently loaded.
  #[must_use]
  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
    match feature {
      ProcessorFeature::Gfm => self.options.gfm,
      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
    }
  }
121
  /// Get the manpage URLs mapping for use with standalone functions.
  ///
  /// Returns `None` when no mapping is loaded.
  #[must_use]
  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
    self.manpage_urls.as_ref()
  }
127
128  /// Highlight all code blocks in HTML using the configured syntax highlighter
129  #[must_use]
130  pub fn highlight_codeblocks(&self, html: &str) -> String {
131    use kuchikikiki::parse_html;
132    use tendril::TendrilSink;
133
134    if !self.options.highlight_code || self.syntax_manager.is_none() {
135      return html.to_string();
136    }
137
138    let document = parse_html().one(html);
139
140    // Collect all code blocks first to avoid DOM modification during iteration
141    let mut code_blocks = Vec::new();
142    for pre_node in safe_select(&document, "pre > code") {
143      let code_node = pre_node;
144      if let Some(element) = code_node.as_element() {
145        let language = element
146          .attributes
147          .borrow()
148          .get("class")
149          .and_then(|class| class.strip_prefix("language-"))
150          .unwrap_or("text")
151          .to_string();
152        let code_text = code_node.text_contents();
153
154        if let Some(pre_parent) = code_node.parent() {
155          code_blocks.push((
156            pre_parent.clone(),
157            code_node.clone(),
158            code_text,
159            language,
160          ));
161        }
162      }
163    }
164
165    // Process each code block
166    for (pre_element, _code_node, code_text, language) in code_blocks {
167      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
168      {
169        // Wrap highlighted HTML in <pre><code> with appropriate classes
170        let wrapped_html = format!(
171          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
172        );
173        let fragment = parse_html().one(wrapped_html.as_str());
174        pre_element.insert_after(fragment);
175        pre_element.detach();
176      }
177      // Do not add highlight/language-* classes if not highlighted
178    }
179
180    let mut buf = Vec::new();
181    if let Err(e) = document.serialize(&mut buf) {
182      log::warn!("DOM serialization failed: {e:?}");
183      return html.to_string(); // Return original HTML if serialization fails
184    }
185    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
186  }
187
188  /// Handle hard tabs in code blocks according to configuration
189  fn handle_hardtabs(&self, code: &str) -> String {
190    use super::types::TabStyle;
191
192    // Check if there are any hard tabs
193    if !code.contains('\t') {
194      return code.to_string();
195    }
196
197    match self.options.tab_style {
198      // Do nothing
199      TabStyle::None => code.to_string(),
200
201      // Warn, but do nothing.
202      TabStyle::Warn => {
203        log::warn!(
204          "Hard tabs detected in code block. Consider using spaces for \
205           consistency. Tools like editorconfig may help you normalize spaces \
206           in your documents."
207        );
208        code.to_string()
209      },
210
211      // Do not warn, only inform in debug mode. Then return
212      // the updated code.
213      TabStyle::Normalize => {
214        log::debug!("Replacing hard tabs with spaces");
215        code.replace('\t', "  ")
216      },
217    }
218  }
219
220  /// Process hard tabs in code blocks within markdown content
221  fn process_hardtabs(&self, markdown: &str) -> String {
222    use super::types::TabStyle;
223
224    // If no tab handling is needed, return as-is
225    if self.options.tab_style == TabStyle::None {
226      return markdown.to_string();
227    }
228
229    let mut result = String::with_capacity(markdown.len());
230    let mut lines = markdown.lines().peekable();
231    let mut in_code_block = false;
232    let mut code_fence_char = None;
233    let mut code_fence_count = 0;
234
235    while let Some(line) = lines.next() {
236      let trimmed = line.trim_start();
237
238      // Check for code fences
239      if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
240        let Some(fence_char) = trimmed.chars().next() else {
241          // If the line is empty after trimming, it can't be a valid code fence
242          // Just continue processing the line normally
243          result.push_str(line);
244          result.push('\n');
245          continue;
246        };
247        let fence_count =
248          trimmed.chars().take_while(|&c| c == fence_char).count();
249
250        if fence_count >= 3 {
251          if !in_code_block {
252            // Starting a code block
253            in_code_block = true;
254            code_fence_char = Some(fence_char);
255            code_fence_count = fence_count;
256          } else if code_fence_char == Some(fence_char)
257            && fence_count >= code_fence_count
258          {
259            // Ending a code block
260            in_code_block = false;
261            code_fence_char = None;
262            code_fence_count = 0;
263          }
264        }
265      }
266
267      // Process line based on whether we're in a code block
268      let processed_line = if in_code_block && line.contains('\t') {
269        self.handle_hardtabs(line)
270      } else {
271        line.to_string()
272      };
273
274      result.push_str(&processed_line);
275
276      // Add newline unless this is the last line
277      if lines.peek().is_some() {
278        result.push('\n');
279      }
280    }
281
282    result
283  }
284
285  /// Highlight code using the configured syntax highlighter, returns HTML
286  /// string
287  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
288    if !self.options.highlight_code {
289      return None;
290    }
291
292    let syntax_manager = self.syntax_manager.as_ref()?;
293
294    syntax_manager
295      .highlight_code(code, language, self.options.highlight_theme.as_deref())
296      .ok()
297  }
298
299  /// Render Markdown to HTML, extracting headers and title.
300  #[must_use]
301  pub fn render(&self, markdown: &str) -> MarkdownResult {
302    let (preprocessed, included_files) = self.preprocess(markdown);
303    let (headers, title) = self.extract_headers(&preprocessed);
304    let html = self.process_html_pipeline(&preprocessed);
305
306    MarkdownResult {
307      html,
308      headers,
309      title,
310      included_files,
311    }
312  }
313
314  /// Process the HTML generation and post-processing pipeline.
315  fn process_html_pipeline(&self, content: &str) -> String {
316    let mut html = self.convert_to_html(content);
317
318    // Apply feature-specific post-processing
319    if cfg!(feature = "ndg-flavored") {
320      #[cfg(feature = "ndg-flavored")]
321      {
322        html = super::extensions::process_option_references(
323          &html,
324          self.options.valid_options.as_ref(),
325        );
326      }
327    }
328
329    if self.options.nixpkgs {
330      html = self.process_manpage_references_html(&html);
331    }
332
333    if self.options.highlight_code {
334      html = self.highlight_codeblocks(&html);
335    }
336
337    self.kuchiki_postprocess(&html)
338  }
339
340  /// Preprocess the markdown content with all enabled transformations.
341  fn preprocess(
342    &self,
343    content: &str,
344  ) -> (String, Vec<crate::types::IncludedFile>) {
345    let mut processed = content.to_string();
346    let mut included_files = Vec::new();
347
348    // Process MyST-style autolinks first
349    processed = super::extensions::process_myst_autolinks(&processed);
350
351    // Handle hard tabs in code blocks
352    processed = self.process_hardtabs(&processed);
353
354    if self.options.nixpkgs {
355      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
356      processed = content;
357      included_files = files;
358    }
359
360    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
361      processed = super::extensions::process_role_markup(
362        &processed,
363        self.manpage_urls.as_ref(),
364        self.options.auto_link_options,
365        self.options.valid_options.as_ref(),
366      );
367    }
368
369    (processed, included_files)
370  }
371
372  /// Apply Nixpkgs-specific preprocessing steps.
373  #[cfg(feature = "nixpkgs")]
374  fn apply_nixpkgs_preprocessing(
375    &self,
376    content: &str,
377  ) -> (String, Vec<crate::types::IncludedFile>) {
378    let (with_includes, included_files) =
379      match super::extensions::process_file_includes(content, &self.base_dir, 0)
380      {
381        Ok(result) => result,
382        Err(e) => {
383          log::warn!(
384            "File include processing failed: {e}. Continuing without includes."
385          );
386          (content.to_string(), Vec::new())
387        },
388      };
389    let with_blocks = super::extensions::process_block_elements(&with_includes);
390    let processed = super::extensions::process_inline_anchors(&with_blocks);
391    (processed, included_files)
392  }
393
  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
  ///
  /// Compiled only without the `nixpkgs` feature: returns an owned copy of
  /// `content` and an empty list of included files.
  #[cfg(not(feature = "nixpkgs"))]
  fn apply_nixpkgs_preprocessing(
    &self,
    content: &str,
  ) -> (String, Vec<crate::types::IncludedFile>) {
    (content.to_string(), Vec::new())
  }
402
403  /// Extract headers and title from the markdown content.
404  #[must_use]
405  pub fn extract_headers(
406    &self,
407    content: &str,
408  ) -> (Vec<Header>, Option<String>) {
409    use std::fmt::Write;
410
411    let arena = Arena::new();
412    let options = self.comrak_options();
413
414    // Normalize custom anchors with no heading level to h2
415    let mut normalized = String::with_capacity(content.len());
416    for line in content.lines() {
417      let trimmed = line.trim_end();
418      if !trimmed.starts_with('#')
419        && let Some(anchor_start) = trimmed.rfind("{#")
420        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
421      {
422        let text = trimmed[..anchor_start].trim_end();
423        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
424        let _ = writeln!(normalized, "## {text} {{#{id}}}");
425        continue;
426      }
427      normalized.push_str(line);
428      normalized.push('\n');
429    }
430
431    let root = parse_document(&arena, &normalized, &options);
432
433    let mut headers = Vec::new();
434    let mut found_title = None;
435
436    for node in root.descendants() {
437      if let NodeValue::Heading(NodeHeading { level, .. }) =
438        &node.data.borrow().value
439      {
440        let mut text = String::new();
441        let mut explicit_id = None;
442
443        for child in node.children() {
444          match &child.data.borrow().value {
445            NodeValue::Text(t) => text.push_str(t),
446            NodeValue::Code(t) => text.push_str(&t.literal),
447            NodeValue::Link(..)
448            | NodeValue::Emph
449            | NodeValue::Strong
450            | NodeValue::Subscript
451            | NodeValue::Strikethrough
452            | NodeValue::Superscript
453            | NodeValue::FootnoteReference(..) => {
454              text.push_str(&extract_inline_text(child));
455            },
456            NodeValue::HtmlInline(html) => {
457              // Look for explicit anchor in HTML inline node: {#id}
458              let html_str = html.as_str();
459              if let Some(start) = html_str.find("{#")
460                && let Some(end) = html_str[start..].find('}')
461              {
462                let anchor = &html_str[start + 2..start + end];
463                explicit_id = Some(anchor.to_string());
464              }
465            },
466            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
467            NodeValue::Image(..) => {},
468            _ => {},
469          }
470        }
471
472        // Check for trailing {#id} in heading text
473        let trimmed = text.trim_end();
474        #[allow(clippy::option_if_let_else)]
475        // Nested options clearer with if-let
476        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
477          if let Some(end) = trimmed[start..].find('}') {
478            let anchor = &trimmed[start + 2..start + end];
479            (trimmed[..start].trim_end().to_string(), anchor.to_string())
480          } else {
481            (
482              text.clone(),
483              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
484            )
485          }
486        } else {
487          (
488            text.clone(),
489            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
490          )
491        };
492        if *level == 1 && found_title.is_none() {
493          found_title = Some(final_text.clone());
494        }
495        headers.push(Header {
496          text: final_text,
497          level: *level,
498          id,
499        });
500      }
501    }
502
503    (headers, found_title)
504  }
505
506  /// Convert markdown to HTML using comrak and configured options.
507  fn convert_to_html(&self, content: &str) -> String {
508    // Process directly without panic catching for better performance
509    let arena = Arena::new();
510    let options = self.comrak_options();
511    let root = parse_document(&arena, content, &options);
512
513    // Apply AST transformations
514    let prompt_transformer = PromptTransformer;
515    prompt_transformer.transform(root);
516
517    let mut html_output = String::new();
518    comrak::format_html(root, &options, &mut html_output).unwrap_or_default();
519
520    // Post-process HTML to handle header anchors
521    Self::process_header_anchors_html(&html_output)
522  }
523
524  /// Process header anchors in HTML by finding {#id} syntax and converting to
525  /// proper id attributes
526  fn process_header_anchors_html(html: &str) -> String {
527    use std::sync::LazyLock;
528
529    use regex::Regex;
530
531    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
532      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
533        .unwrap_or_else(|e| {
534          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
535          utils::never_matching_regex().unwrap_or_else(|_| {
536            // As a last resort, create a regex that matches nothing
537            #[allow(
538              clippy::expect_used,
539              reason = "This pattern is guaranteed to be valid"
540            )]
541            Regex::new(r"[^\s\S]")
542              .expect("regex pattern [^\\s\\S] should always compile")
543          })
544        })
545    });
546
547    HEADER_ANCHOR_RE
548      .replace_all(html, |caps: &regex::Captures| {
549        let level = &caps[1];
550        let prefix = &caps[2];
551        let id = &caps[3];
552        let suffix = &caps[4];
553        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
554      })
555      .to_string()
556  }
557
558  /// Build comrak options from `MarkdownOptions` and feature flags.
559  fn comrak_options(&self) -> Options<'_> {
560    let mut options = Options::default();
561    if self.options.gfm {
562      options.extension.table = true;
563      options.extension.footnotes = true;
564      options.extension.strikethrough = true;
565      options.extension.tasklist = true;
566      options.extension.superscript = true;
567      options.extension.autolink = true;
568    }
569    options.render.r#unsafe = true;
570    // Enable description lists but keep custom header processing
571    options.extension.header_ids = None;
572    options.extension.description_lists = true;
573    options
574  }
575
  /// Post-process HTML to enhance manpage references with URL links.
  ///
  /// Compiled only with the `nixpkgs` feature. Delegates to
  /// `super::extensions::process_manpage_references`, passing the
  /// processor's manpage URL mapping (which may be `None`).
  #[cfg(feature = "nixpkgs")]
  fn process_manpage_references_html(&self, html: &str) -> String {
    super::extensions::process_manpage_references(
      html,
      self.manpage_urls.as_ref(),
    )
  }
584
  /// Post-process HTML to enhance manpage references (no-op when feature
  /// disabled).
  ///
  /// Compiled only without the `nixpkgs` feature: returns an owned copy of
  /// `html`.
  #[cfg(not(feature = "nixpkgs"))]
  fn process_manpage_references_html(&self, html: &str) -> String {
    html.to_string()
  }
591
  /// HTML post-processing using kuchiki DOM manipulation.
  ///
  /// Hands `html` to `kuchiki_postprocess_html` together with a callback
  /// that runs `apply_dom_transformations` on the document it is given.
  /// NOTE(review): parsing and serialization presumably happen inside
  /// `kuchiki_postprocess_html`, which is defined outside this view —
  /// confirm there.
  #[allow(
    clippy::unused_self,
    reason = "Method signature matches processor pattern"
  )]
  fn kuchiki_postprocess(&self, html: &str) -> String {
    // Use a standalone function to avoid borrowing issues
    kuchiki_postprocess_html(html, |document| {
      Self::apply_dom_transformations(document);
    })
  }
603
  /// Apply all DOM transformations to the parsed HTML document.
  ///
  /// The passes mutate `document` in place and run in the order listed.
  /// The element-level anchor passes (list items, paragraphs) run before
  /// the generic text-node pass, so the latter only sees markers the former
  /// left behind.
  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
    Self::process_list_item_id_markers(document);
    Self::process_header_anchor_comments(document);
    Self::process_list_item_inline_anchors(document);
    Self::process_paragraph_inline_anchors(document);
    Self::process_remaining_inline_anchors(document);
    Self::process_option_anchor_links(document);
    Self::process_empty_auto_links(document);
    Self::process_empty_html_links(document);
  }
615
616  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
617  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
618    let mut to_modify = Vec::new();
619
620    for comment in document.inclusive_descendants() {
621      if let Some(comment_node) = comment.as_comment() {
622        let comment_text = comment_node.borrow();
623        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
624          let id = comment_text[id_start + 16..].trim();
625          if !id.is_empty()
626            && id
627              .chars()
628              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
629          {
630            // Check if this comment is inside an <li> element
631            if let Some(parent) = comment.parent()
632              && let Some(element) = parent.as_element()
633              && element.name.local.as_ref() == "li"
634            {
635              to_modify.push((comment.clone(), id.to_string()));
636            }
637          }
638        }
639      }
640    }
641
642    for (comment_node, id) in to_modify {
643      let span = kuchikikiki::NodeRef::new_element(
644        markup5ever::QualName::new(
645          None,
646          markup5ever::ns!(html),
647          local_name!("span"),
648        ),
649        vec![
650          (
651            kuchikikiki::ExpandedName::new("", "id"),
652            kuchikikiki::Attribute {
653              prefix: None,
654              value:  id,
655            },
656          ),
657          (
658            kuchikikiki::ExpandedName::new("", "class"),
659            kuchikikiki::Attribute {
660              prefix: None,
661              value:  "nixos-anchor".into(),
662            },
663          ),
664        ],
665      );
666      comment_node.insert_after(span);
667      comment_node.detach();
668    }
669  }
670
671  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
672  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
673    let mut to_modify = Vec::new();
674
675    for comment in document.inclusive_descendants() {
676      if let Some(comment_node) = comment.as_comment() {
677        let comment_text = comment_node.borrow();
678        if let Some(anchor_start) = comment_text.find("anchor:") {
679          let id = comment_text[anchor_start + 7..].trim();
680          if !id.is_empty()
681            && id
682              .chars()
683              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
684          {
685            // Check if this comment is inside a header element
686            if let Some(parent) = comment.parent()
687              && let Some(element) = parent.as_element()
688            {
689              let tag_name = element.name.local.as_ref();
690              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
691                to_modify.push((
692                  parent.clone(),
693                  comment.clone(),
694                  id.to_string(),
695                ));
696              }
697            }
698          }
699        }
700      }
701    }
702
703    for (header_element, comment_node, id) in to_modify {
704      if let Some(element) = header_element.as_element() {
705        element
706          .attributes
707          .borrow_mut()
708          .insert(local_name!("id"), id);
709        comment_node.detach();
710      }
711    }
712  }
713
714  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
715  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
716    for li_node in safe_select(document, "li") {
717      let li_element = li_node;
718
719      // Check if this list item contains code elements
720      let has_code = !safe_select(&li_element, "code, pre").is_empty();
721      if has_code {
722        continue; // Skip list items with code blocks
723      }
724
725      let text_content = li_element.text_contents();
726
727      if let Some(anchor_start) = text_content.find("[]{#")
728        && let Some(anchor_end) = text_content[anchor_start..].find('}')
729      {
730        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
731        if !id.is_empty()
732          && id
733            .chars()
734            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
735        {
736          let remaining_content =
737            &text_content[anchor_start + anchor_end + 1..];
738
739          // Clear current content and rebuild
740          for child in li_element.children() {
741            child.detach();
742          }
743
744          let span = kuchikikiki::NodeRef::new_element(
745            markup5ever::QualName::new(
746              None,
747              markup5ever::ns!(html),
748              local_name!("span"),
749            ),
750            vec![
751              (
752                kuchikikiki::ExpandedName::new("", "id"),
753                kuchikikiki::Attribute {
754                  prefix: None,
755                  value:  id.into(),
756                },
757              ),
758              (
759                kuchikikiki::ExpandedName::new("", "class"),
760                kuchikikiki::Attribute {
761                  prefix: None,
762                  value:  "nixos-anchor".into(),
763                },
764              ),
765            ],
766          );
767          li_element.append(span);
768          if !remaining_content.is_empty() {
769            li_element
770              .append(kuchikikiki::NodeRef::new_text(remaining_content));
771          }
772        }
773      }
774    }
775  }
776
777  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
778  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
779    for p_node in safe_select(document, "p") {
780      let p_element = p_node;
781
782      // Check if this paragraph contains code elements
783      let has_code = !safe_select(&p_element, "code, pre").is_empty();
784      if has_code {
785        continue; // Skip paragraphs with code blocks
786      }
787
788      let text_content = p_element.text_contents();
789
790      if let Some(anchor_start) = text_content.find("[]{#")
791        && let Some(anchor_end) = text_content[anchor_start..].find('}')
792      {
793        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
794        if !id.is_empty()
795          && id
796            .chars()
797            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
798        {
799          let remaining_content =
800            &text_content[anchor_start + anchor_end + 1..];
801
802          // Clear current content and rebuild
803          for child in p_element.children() {
804            child.detach();
805          }
806
807          let span = kuchikikiki::NodeRef::new_element(
808            markup5ever::QualName::new(
809              None,
810              markup5ever::ns!(html),
811              local_name!("span"),
812            ),
813            vec![
814              (
815                kuchikikiki::ExpandedName::new("", "id"),
816                kuchikikiki::Attribute {
817                  prefix: None,
818                  value:  id.into(),
819                },
820              ),
821              (
822                kuchikikiki::ExpandedName::new("", "class"),
823                kuchikikiki::Attribute {
824                  prefix: None,
825                  value:  "nixos-anchor".into(),
826                },
827              ),
828            ],
829          );
830          p_element.append(span);
831          if !remaining_content.is_empty() {
832            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
833          }
834        }
835      }
836    }
837  }
838
839  /// Process remaining standalone inline anchors throughout the document
840  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
841    let mut text_nodes_to_process = Vec::new();
842
843    for node in document.inclusive_descendants() {
844      if let Some(text_node) = node.as_text() {
845        // Check if this text node is inside a code block
846        let mut parent = node.parent();
847        let mut in_code = false;
848        while let Some(p) = parent {
849          if let Some(element) = p.as_element()
850            && (element.name.local == local_name!("code")
851              || element.name.local == local_name!("pre"))
852          {
853            in_code = true;
854            break;
855          }
856          parent = p.parent();
857        }
858
859        // Only process if not in code
860        if !in_code {
861          let text_content = text_node.borrow().clone();
862          if text_content.contains("[]{#") {
863            text_nodes_to_process.push((node.clone(), text_content));
864          }
865        }
866      }
867    }
868
869    for (text_node, text_content) in text_nodes_to_process {
870      let mut last_end = 0;
871      let mut new_children = Vec::new();
872
873      // Simple pattern matching for []{#id}
874      let chars = text_content.chars().collect::<Vec<_>>();
875      let mut i = 0;
876      while i < chars.len() {
877        if i + 4 < chars.len()
878          && chars[i] == '['
879          && chars[i + 1] == ']'
880          && chars[i + 2] == '{'
881          && chars[i + 3] == '#'
882        {
883          // Found start of anchor pattern
884          let anchor_start = i;
885          i += 4; // skip "[]{#"
886
887          let mut id = String::new();
888          while i < chars.len() && chars[i] != '}' {
889            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
890            {
891              id.push(chars[i]);
892              i += 1;
893            } else {
894              break;
895            }
896          }
897
898          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
899            // Valid anchor found
900            let anchor_end = i + 1;
901
902            // Add text before anchor
903            if anchor_start > last_end {
904              let before_text: String =
905                chars[last_end..anchor_start].iter().collect();
906              if !before_text.is_empty() {
907                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
908              }
909            }
910
911            // Add span element
912            let span = kuchikikiki::NodeRef::new_element(
913              markup5ever::QualName::new(
914                None,
915                markup5ever::ns!(html),
916                local_name!("span"),
917              ),
918              vec![
919                (
920                  kuchikikiki::ExpandedName::new("", "id"),
921                  kuchikikiki::Attribute {
922                    prefix: None,
923                    value:  id,
924                  },
925                ),
926                (
927                  kuchikikiki::ExpandedName::new("", "class"),
928                  kuchikikiki::Attribute {
929                    prefix: None,
930                    value:  "nixos-anchor".into(),
931                  },
932                ),
933              ],
934            );
935            new_children.push(span);
936
937            last_end = anchor_end;
938            i = anchor_end;
939          } else {
940            i += 1;
941          }
942        } else {
943          i += 1;
944        }
945      }
946
947      // Add remaining text
948      if last_end < chars.len() {
949        let after_text: String = chars[last_end..].iter().collect();
950        if !after_text.is_empty() {
951          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
952        }
953      }
954
955      // Replace text node if we found anchors
956      if !new_children.is_empty() {
957        for child in new_children {
958          text_node.insert_before(child);
959        }
960        text_node.detach();
961      }
962    }
963  }
964
965  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
966  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
967    for link_node in safe_select(document, "a") {
968      let link_element = link_node;
969      if let Some(element) = link_element.as_element() {
970        let href = element
971          .attributes
972          .borrow()
973          .get(local_name!("href"))
974          .map(std::string::ToString::to_string);
975        let text_content = link_element.text_contents();
976
977        if let Some(href_value) = href
978          && href_value.starts_with('#')
979          && (text_content.trim().is_empty()
980            || text_content.trim() == "{{ANCHOR}}")
981        {
982          // Clear placeholder text if present
983          if text_content.trim() == "{{ANCHOR}}" {
984            for child in link_element.children() {
985              child.detach();
986            }
987          }
988          // Empty link with anchor - add humanized text
989          let display_text = Self::humanize_anchor_id(&href_value);
990          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
991        }
992      }
993    }
994  }
995
996  /// Process empty HTML links that have no content
997  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
998    for link_node in safe_select(document, "a[href^='#']") {
999      let link_element = link_node;
1000      let text_content = link_element.text_contents();
1001
1002      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1003        // Clear placeholder text if present
1004        if text_content.trim() == "{{ANCHOR}}" {
1005          for child in link_element.children() {
1006            child.detach();
1007          }
1008        }
1009        if let Some(element) = link_element.as_element()
1010          && let Some(href) =
1011            element.attributes.borrow().get(local_name!("href"))
1012        {
1013          let display_text = Self::humanize_anchor_id(href);
1014          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1015        }
1016      }
1017    }
1018  }
1019
1020  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1021  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1022    let mut to_modify = Vec::new();
1023
1024    // Collect all option anchor links first
1025    for link_node in safe_select(document, "a[href^='#opt-']") {
1026      let link_element = link_node;
1027      if let Some(element) = link_element.as_element() {
1028        let href = element
1029          .attributes
1030          .borrow()
1031          .get(local_name!("href"))
1032          .map(std::string::ToString::to_string);
1033        let text_content = link_element.text_contents();
1034
1035        if let Some(href_value) = href
1036          && href_value.starts_with("#opt-")
1037        {
1038          let option_anchor = href_value[1..].to_string(); // remove the leading #
1039          let needs_text_replacement = text_content.trim().is_empty()
1040            || text_content.trim() == "{{ANCHOR}}";
1041          to_modify.push((
1042            link_element.clone(),
1043            option_anchor,
1044            needs_text_replacement,
1045          ));
1046        }
1047      }
1048    }
1049
1050    // Apply modifications
1051    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1052      if let Some(element) = link_element.as_element() {
1053        let new_href = format!("options.html#{option_anchor}");
1054        element
1055          .attributes
1056          .borrow_mut()
1057          .insert(local_name!("href"), new_href);
1058
1059        if needs_text_replacement {
1060          // Clear existing content
1061          for child in link_element.children() {
1062            child.detach();
1063          }
1064
1065          // Extract option name from anchor
1066          // opt-services-nginx-enable -> services.nginx.enable
1067          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1068            let option_name = option_path.replace('-', ".");
1069            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1070          }
1071        }
1072      }
1073    }
1074  }
1075
/// Convert an anchor ID to human-readable text.
///
/// The transformation is:
///
/// 1. Leading `#` characters are removed.
/// 2. At most one of the well-known prefixes `sec-`, `ssec-` or `opt-` is
///    removed. Only the first prefix is dropped, so an ID such as
///    `sec-opt-in-features` keeps the word "opt" and becomes
///    "Opt In Features" instead of losing part of its name.
/// 3. `-` and `_` are treated as word separators.
/// 4. The first character of every word is upper-cased; the rest of the
///    word is kept as written.
///
/// Returns an empty string when nothing but `#`, a prefix, or separators is
/// left.
fn humanize_anchor_id(anchor: &str) -> String {
  // Prefixes that only classify the anchor and carry no readable meaning.
  const KNOWN_PREFIXES: [&str; 3] = ["sec-", "ssec-", "opt-"];

  // Strip the leading #
  let cleaned = anchor.trim_start_matches('#');

  // Remove a single classification prefix. `strip_prefix` removes it once,
  // unlike `trim_start_matches`, which would keep eating repeated or
  // chained prefixes that are really part of the name.
  let without_prefix = KNOWN_PREFIXES
    .iter()
    .find_map(|&prefix| cleaned.strip_prefix(prefix))
    .unwrap_or(cleaned);

  // Replace separators with spaces
  let spaced = without_prefix.replace(['-', '_'], " ");

  // Capitalize each word
  spaced
    .split_whitespace()
    .map(|word| {
      let mut chars = word.chars();
      chars.next().map_or_else(String::new, |first| {
        first.to_uppercase().collect::<String>() + chars.as_str()
      })
    })
    .collect::<Vec<String>>()
    .join(" ")
}
1102}
1103
1104/// Extract all inline text from a heading node.
1105pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1106  let mut text = String::new();
1107  for child in node.children() {
1108    match &child.data.borrow().value {
1109      NodeValue::Text(t) => text.push_str(t),
1110      NodeValue::Code(t) => text.push_str(&t.literal),
1111      NodeValue::Link(..)
1112      | NodeValue::Emph
1113      | NodeValue::Strong
1114      | NodeValue::Strikethrough
1115      | NodeValue::Superscript
1116      | NodeValue::Subscript
1117      | NodeValue::FootnoteReference(..) => {
1118        text.push_str(&extract_inline_text(child));
1119      },
1120      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1121      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1122      _ => {},
1123    }
1124  }
1125  text
1126}
1127
1128/// Collect all markdown files from the input directory
1129pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1130  let mut files = Vec::with_capacity(100);
1131
1132  for entry in WalkDir::new(input_dir)
1133    .follow_links(true)
1134    .into_iter()
1135    .filter_map(Result::ok)
1136  {
1137    let path = entry.path();
1138    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1139      files.push(path.to_owned());
1140    }
1141  }
1142
1143  trace!("Found {} markdown files to process", files.len());
1144  files
1145}
1146
/// Capabilities a processor instance can be asked about.
///
/// The type is a plain, copyable tag: it carries no data and compares by
/// variant.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// Support for GitHub Flavored Markdown.
  Gfm,
  /// Extensions used by Nixpkgs documentation.
  Nixpkgs,
  /// Syntax highlighting of code blocks.
  SyntaxHighlighting,
  /// Mapping of manpage references to URLs.
  ManpageUrls,
}
1159
1160/// Standalone HTML post-processing function to avoid borrowing issues.
1161fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1162where
1163  F: FnOnce(&kuchikikiki::NodeRef),
1164{
1165  process_safe(
1166    html,
1167    |html| {
1168      use tendril::TendrilSink;
1169
1170      let document = kuchikikiki::parse_html().one(html);
1171      transform_fn(&document);
1172
1173      let mut out = Vec::new();
1174      let _ = document.serialize(&mut out);
1175      String::from_utf8(out).unwrap_or_default()
1176    },
1177    html,
1178  )
1179}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs