ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! This module contains the main implementation of `MarkdownProcessor` and its
4//! methods, focused on the core rendering pipeline and configuration
5//! management.
6use std::{
7  collections::HashMap,
8  path::{Path, PathBuf},
9};
10
11use comrak::{
12  Arena,
13  ComrakOptions,
14  nodes::{AstNode, NodeHeading, NodeValue},
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use walkdir::WalkDir;
20
21use super::{
22  process::process_safe,
23  types::{
24    AstTransformer,
25    MarkdownOptions,
26    MarkdownProcessor,
27    PromptTransformer,
28  },
29};
30use crate::{
31  syntax::create_default_manager,
32  types::{Header, MarkdownResult},
33  utils,
34};
35
36impl MarkdownProcessor {
37  /// Create a new `MarkdownProcessor` with the given options.
38  #[must_use]
39  pub fn new(options: MarkdownOptions) -> Self {
40    let manpage_urls = options
41      .manpage_urls_path
42      .as_ref()
43      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
44
45    let syntax_manager = if options.highlight_code {
46      create_default_manager().ok()
47    } else {
48      None
49    };
50
51    Self {
52      options,
53      manpage_urls,
54      syntax_manager,
55      included_files: std::cell::RefCell::new(Vec::new()),
56      base_dir: std::path::PathBuf::from("."),
57    }
58  }
59
60  /// Access processor options.
61  #[must_use]
62  pub const fn options(&self) -> &MarkdownOptions {
63    &self.options
64  }
65
66  /// Set the base directory for resolving relative file paths.
67  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
68    self.base_dir = base_dir.to_path_buf();
69    self
70  }
71
72  /// Check if a specific feature is enabled.
73  #[must_use]
74  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
75    match feature {
76      ProcessorFeature::Gfm => self.options.gfm,
77      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
78      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
79      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
80    }
81  }
82
83  /// Get the manpage URLs mapping for use with standalone functions.
84  #[must_use]
85  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
86    self.manpage_urls.as_ref()
87  }
88
89  /// Highlight all code blocks in HTML using the configured syntax highlighter
90  #[must_use]
91  pub fn highlight_codeblocks(&self, html: &str) -> String {
92    if !self.options.highlight_code || self.syntax_manager.is_none() {
93      return html.to_string();
94    }
95
96    use kuchikikiki::parse_html;
97    use tendril::TendrilSink;
98
99    let document = parse_html().one(html);
100
101    // Collect all code blocks first to avoid DOM modification during iteration
102    let mut code_blocks = Vec::new();
103    for pre_node in document.select("pre > code").unwrap() {
104      let code_node = pre_node.as_node();
105      if let Some(element) = code_node.as_element() {
106        let class_attr = element
107          .attributes
108          .borrow()
109          .get("class")
110          .map(std::string::ToString::to_string);
111        let language = class_attr
112          .as_deref()
113          .and_then(|s| s.strip_prefix("language-"))
114          .unwrap_or("text");
115        let code_text = code_node.text_contents();
116
117        if let Some(pre_parent) = code_node.parent() {
118          code_blocks.push((
119            pre_parent.clone(),
120            code_node.clone(),
121            code_text,
122            language.to_string(),
123          ));
124        }
125      }
126    }
127
128    // Process each code block
129    for (pre_element, _code_node, code_text, language) in code_blocks {
130      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
131      {
132        // Wrap highlighted HTML in <pre><code> with appropriate classes
133        let wrapped_html = format!(
134          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
135        );
136        let fragment = parse_html().one(wrapped_html.as_str());
137        pre_element.insert_after(fragment);
138        pre_element.detach();
139      }
140      // Do not add highlight/language-* classes if not highlighted
141    }
142
143    let mut buf = Vec::new();
144    document.serialize(&mut buf).unwrap();
145    String::from_utf8(buf).unwrap_or_default()
146  }
147
148  /// Highlight code using the configured syntax highlighter, returns HTML
149  /// string
150  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
151    if !self.options.highlight_code {
152      return None;
153    }
154
155    let syntax_manager = self.syntax_manager.as_ref()?;
156
157    syntax_manager
158      .highlight_code(code, language, self.options.highlight_theme.as_deref())
159      .ok()
160  }
161
162  /// Render Markdown to HTML, extracting headers and title.
163  #[must_use]
164  pub fn render(&self, markdown: &str) -> MarkdownResult {
165    // Clear previous includes
166    self.included_files.borrow_mut().clear();
167
168    let preprocessed = self.preprocess(markdown);
169    let (headers, title) = self.extract_headers(&preprocessed);
170    let html = self.process_html_pipeline(&preprocessed);
171
172    MarkdownResult {
173      html,
174      headers,
175      title,
176      included_files: self.included_files.borrow().clone(),
177    }
178  }
179
180  /// Process the HTML generation and post-processing pipeline.
181  fn process_html_pipeline(&self, content: &str) -> String {
182    let mut html = self.convert_to_html(content);
183
184    // Apply feature-specific post-processing
185    if cfg!(feature = "ndg-flavored") {
186      #[cfg(feature = "ndg-flavored")]
187      {
188        html = super::extensions::process_option_references(&html);
189      }
190    }
191
192    if self.options.nixpkgs {
193      html = self.process_manpage_references_html(&html);
194    }
195
196    if self.options.highlight_code {
197      html = self.highlight_codeblocks(&html);
198    }
199
200    self.kuchiki_postprocess(&html)
201  }
202
203  /// Preprocess the markdown content with all enabled transformations.
204  fn preprocess(&self, content: &str) -> String {
205    let mut processed = content.to_string();
206
207    // Process MyST-style autolinks first
208    processed = super::extensions::process_myst_autolinks(&processed);
209
210    if self.options.nixpkgs {
211      processed = self.apply_nixpkgs_preprocessing(&processed);
212    }
213
214    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
215      processed = super::extensions::process_role_markup(
216        &processed,
217        self.manpage_urls.as_ref(),
218        self.options.auto_link_options,
219      );
220    }
221
222    processed
223  }
224
225  /// Apply Nixpkgs-specific preprocessing steps.
226  #[cfg(feature = "nixpkgs")]
227  fn apply_nixpkgs_preprocessing(&self, content: &str) -> String {
228    let (with_includes, included_files) =
229      super::extensions::process_file_includes(content, &self.base_dir);
230    self.included_files.borrow_mut().extend(included_files);
231    let with_blocks = super::extensions::process_block_elements(&with_includes);
232    super::extensions::process_inline_anchors(&with_blocks)
233  }
234
235  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
236  #[cfg(not(feature = "nixpkgs"))]
237  fn apply_nixpkgs_preprocessing(&self, content: &str) -> String {
238    content.to_string()
239  }
240
241  /// Extract headers and title from the markdown content.
242  #[must_use]
243  pub fn extract_headers(
244    &self,
245    content: &str,
246  ) -> (Vec<Header>, Option<String>) {
247    let arena = Arena::new();
248    let options = self.comrak_options();
249
250    // Normalize custom anchors with no heading level to h2
251    let mut normalized = String::with_capacity(content.len());
252    for line in content.lines() {
253      let trimmed = line.trim_end();
254      if !trimmed.starts_with('#') {
255        if let Some(anchor_start) = trimmed.rfind("{#") {
256          if let Some(anchor_end) = trimmed[anchor_start..].find('}') {
257            let text = trimmed[..anchor_start].trim_end();
258            let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
259            normalized.push_str(&format!("## {text} {{#{id}}}\n"));
260            continue;
261          }
262        }
263      }
264      normalized.push_str(line);
265      normalized.push('\n');
266    }
267
268    let root = parse_document(&arena, &normalized, &options);
269
270    let mut headers = Vec::new();
271    let mut found_title = None;
272
273    for node in root.descendants() {
274      if let NodeValue::Heading(NodeHeading { level, .. }) =
275        &node.data.borrow().value
276      {
277        let mut text = String::new();
278        let mut explicit_id = None;
279
280        for child in node.children() {
281          match &child.data.borrow().value {
282            NodeValue::Text(t) => text.push_str(t),
283            NodeValue::Code(t) => text.push_str(&t.literal),
284            NodeValue::Link(..) => text.push_str(&extract_inline_text(child)),
285            NodeValue::Emph => text.push_str(&extract_inline_text(child)),
286            NodeValue::Strong => text.push_str(&extract_inline_text(child)),
287            NodeValue::Strikethrough => {
288              text.push_str(&extract_inline_text(child));
289            },
290            NodeValue::Superscript => {
291              text.push_str(&extract_inline_text(child));
292            },
293            NodeValue::Subscript => text.push_str(&extract_inline_text(child)),
294            NodeValue::FootnoteReference(..) => {
295              text.push_str(&extract_inline_text(child));
296            },
297            NodeValue::HtmlInline(html) => {
298              // Look for explicit anchor in HTML inline node: {#id}
299              let html_str = html.as_str();
300              if let Some(start) = html_str.find("{#") {
301                if let Some(end) = html_str[start..].find('}') {
302                  let anchor = &html_str[start + 2..start + end];
303                  explicit_id = Some(anchor.to_string());
304                }
305              }
306            },
307            NodeValue::Image(..) => {},
308            _ => {},
309          }
310        }
311
312        // Check for trailing {#id} in heading text
313        let trimmed = text.trim_end();
314        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
315          if let Some(end) = trimmed[start..].find('}') {
316            let anchor = &trimmed[start + 2..start + end];
317            (trimmed[..start].trim_end().to_string(), anchor.to_string())
318          } else {
319            (
320              text.clone(),
321              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
322            )
323          }
324        } else {
325          (
326            text.clone(),
327            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
328          )
329        };
330        if *level == 1 && found_title.is_none() {
331          found_title = Some(final_text.clone());
332        }
333        headers.push(Header {
334          text: final_text,
335          level: *level,
336          id,
337        });
338      }
339    }
340
341    (headers, found_title)
342  }
343
344  /// Convert markdown to HTML using comrak and configured options.
345  fn convert_to_html(&self, content: &str) -> String {
346    // Process directly without panic catching for better performance
347    let arena = Arena::new();
348    let options = self.comrak_options();
349    let root = parse_document(&arena, content, &options);
350
351    // Apply AST transformations
352    let prompt_transformer = PromptTransformer;
353    prompt_transformer.transform(root);
354
355    let mut html_output = String::new();
356    comrak::format_html(root, &options, &mut html_output).unwrap_or_default();
357
358    // Post-process HTML to handle header anchors
359    self.process_header_anchors_html(&html_output)
360  }
361
362  /// Process header anchors in HTML by finding {#id} syntax and converting to
363  /// proper id attributes
364  fn process_header_anchors_html(&self, html: &str) -> String {
365    use std::sync::LazyLock;
366
367    use regex::Regex;
368
369    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
370      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
371        .unwrap_or_else(|e| {
372          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
373          utils::never_matching_regex()
374        })
375    });
376
377    HEADER_ANCHOR_RE
378      .replace_all(html, |caps: &regex::Captures| {
379        let level = &caps[1];
380        let prefix = &caps[2];
381        let id = &caps[3];
382        let suffix = &caps[4];
383        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
384      })
385      .to_string()
386  }
387
388  /// Build comrak options from `MarkdownOptions` and feature flags.
389  fn comrak_options(&self) -> ComrakOptions<'_> {
390    let mut options = ComrakOptions::default();
391    if self.options.gfm {
392      options.extension.table = true;
393      options.extension.footnotes = true;
394      options.extension.strikethrough = true;
395      options.extension.tasklist = true;
396      options.extension.superscript = true;
397      options.extension.autolink = true;
398    }
399    options.render.unsafe_ = true;
400    // Enable description lists but keep custom header processing
401    options.extension.header_ids = None;
402    options.extension.description_lists = true;
403    options
404  }
405
406  /// Post-process HTML to enhance manpage references with URL links.
407  #[cfg(feature = "nixpkgs")]
408  fn process_manpage_references_html(&self, html: &str) -> String {
409    super::extensions::process_manpage_references(
410      html,
411      self.manpage_urls.as_ref(),
412    )
413  }
414
415  /// Post-process HTML to enhance manpage references (no-op when feature
416  /// disabled).
417  #[cfg(not(feature = "nixpkgs"))]
418  fn process_manpage_references_html(&self, html: &str) -> String {
419    html.to_string()
420  }
421
422  /// HTML post-processing using kuchiki DOM manipulation.
423  fn kuchiki_postprocess(&self, html: &str) -> String {
424    // Use a standalone function to avoid borrowing issues
425    kuchiki_postprocess_html(html, |document| {
426      self.apply_dom_transformations(document);
427    })
428  }
429
430  /// Apply all DOM transformations to the parsed HTML document.
431  fn apply_dom_transformations(&self, document: &kuchikikiki::NodeRef) {
432    self.process_list_item_id_markers(document);
433    self.process_header_anchor_comments(document);
434    self.process_list_item_inline_anchors(document);
435    self.process_paragraph_inline_anchors(document);
436    self.process_remaining_inline_anchors(document);
437    self.process_option_anchor_links(document);
438    self.process_empty_auto_links(document);
439    self.process_empty_html_links(document);
440  }
441
442  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
443  fn process_list_item_id_markers(&self, document: &kuchikikiki::NodeRef) {
444    let mut to_modify = Vec::new();
445
446    for comment in document.inclusive_descendants() {
447      if let Some(comment_node) = comment.as_comment() {
448        let comment_text = comment_node.borrow();
449        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
450          let id = comment_text[id_start + 16..].trim();
451          if !id.is_empty()
452            && id
453              .chars()
454              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
455          {
456            // Check if this comment is inside an <li> element
457            if let Some(parent) = comment.parent() {
458              if let Some(element) = parent.as_element() {
459                if element.name.local.as_ref() == "li" {
460                  to_modify.push((comment.clone(), id.to_string()));
461                }
462              }
463            }
464          }
465        }
466      }
467    }
468
469    for (comment_node, id) in to_modify {
470      let span = kuchikikiki::NodeRef::new_element(
471        markup5ever::QualName::new(
472          None,
473          markup5ever::ns!(html),
474          local_name!("span"),
475        ),
476        vec![
477          (
478            kuchikikiki::ExpandedName::new("", "id"),
479            kuchikikiki::Attribute {
480              prefix: None,
481              value:  id,
482            },
483          ),
484          (
485            kuchikikiki::ExpandedName::new("", "class"),
486            kuchikikiki::Attribute {
487              prefix: None,
488              value:  "nixos-anchor".into(),
489            },
490          ),
491        ],
492      );
493      comment_node.insert_after(span);
494      comment_node.detach();
495    }
496  }
497
498  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
499  fn process_header_anchor_comments(&self, document: &kuchikikiki::NodeRef) {
500    let mut to_modify = Vec::new();
501
502    for comment in document.inclusive_descendants() {
503      if let Some(comment_node) = comment.as_comment() {
504        let comment_text = comment_node.borrow();
505        if let Some(anchor_start) = comment_text.find("anchor:") {
506          let id = comment_text[anchor_start + 7..].trim();
507          if !id.is_empty()
508            && id
509              .chars()
510              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
511          {
512            // Check if this comment is inside a header element
513            if let Some(parent) = comment.parent() {
514              if let Some(element) = parent.as_element() {
515                let tag_name = element.name.local.as_ref();
516                if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
517                  to_modify.push((
518                    parent.clone(),
519                    comment.clone(),
520                    id.to_string(),
521                  ));
522                }
523              }
524            }
525          }
526        }
527      }
528    }
529
530    for (header_element, comment_node, id) in to_modify {
531      if let Some(element) = header_element.as_element() {
532        element
533          .attributes
534          .borrow_mut()
535          .insert(local_name!("id"), id);
536        comment_node.detach();
537      }
538    }
539  }
540
541  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
542  fn process_list_item_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
543    for li_node in document.select("li").unwrap() {
544      let li_element = li_node.as_node();
545
546      // Check if this list item contains code elements
547      let has_code = li_element.select("code, pre").is_ok()
548        && li_element.select("code, pre").unwrap().next().is_some();
549      if has_code {
550        continue; // Skip list items with code blocks
551      }
552
553      let text_content = li_element.text_contents();
554
555      if let Some(anchor_start) = text_content.find("[]{#") {
556        if let Some(anchor_end) = text_content[anchor_start..].find('}') {
557          let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
558          if !id.is_empty()
559            && id
560              .chars()
561              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
562          {
563            let remaining_content =
564              &text_content[anchor_start + anchor_end + 1..];
565
566            // Clear current content and rebuild
567            for child in li_element.children() {
568              child.detach();
569            }
570
571            let span = kuchikikiki::NodeRef::new_element(
572              markup5ever::QualName::new(
573                None,
574                markup5ever::ns!(html),
575                local_name!("span"),
576              ),
577              vec![
578                (
579                  kuchikikiki::ExpandedName::new("", "id"),
580                  kuchikikiki::Attribute {
581                    prefix: None,
582                    value:  id.into(),
583                  },
584                ),
585                (
586                  kuchikikiki::ExpandedName::new("", "class"),
587                  kuchikikiki::Attribute {
588                    prefix: None,
589                    value:  "nixos-anchor".into(),
590                  },
591                ),
592              ],
593            );
594            li_element.append(span);
595            if !remaining_content.is_empty() {
596              li_element
597                .append(kuchikikiki::NodeRef::new_text(remaining_content));
598            }
599          }
600        }
601      }
602    }
603  }
604
605  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
606  fn process_paragraph_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
607    for p_node in document.select("p").unwrap() {
608      let p_element = p_node.as_node();
609
610      // Check if this paragraph contains code elements
611      let has_code = p_element.select("code, pre").is_ok()
612        && p_element.select("code, pre").unwrap().next().is_some();
613      if has_code {
614        continue; // Skip paragraphs with code blocks
615      }
616
617      let text_content = p_element.text_contents();
618
619      if let Some(anchor_start) = text_content.find("[]{#") {
620        if let Some(anchor_end) = text_content[anchor_start..].find('}') {
621          let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
622          if !id.is_empty()
623            && id
624              .chars()
625              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
626          {
627            let remaining_content =
628              &text_content[anchor_start + anchor_end + 1..];
629
630            // Clear current content and rebuild
631            for child in p_element.children() {
632              child.detach();
633            }
634
635            let span = kuchikikiki::NodeRef::new_element(
636              markup5ever::QualName::new(
637                None,
638                markup5ever::ns!(html),
639                local_name!("span"),
640              ),
641              vec![
642                (
643                  kuchikikiki::ExpandedName::new("", "id"),
644                  kuchikikiki::Attribute {
645                    prefix: None,
646                    value:  id.into(),
647                  },
648                ),
649                (
650                  kuchikikiki::ExpandedName::new("", "class"),
651                  kuchikikiki::Attribute {
652                    prefix: None,
653                    value:  "nixos-anchor".into(),
654                  },
655                ),
656              ],
657            );
658            p_element.append(span);
659            if !remaining_content.is_empty() {
660              p_element
661                .append(kuchikikiki::NodeRef::new_text(remaining_content));
662            }
663          }
664        }
665      }
666    }
667  }
668
669  /// Process remaining standalone inline anchors throughout the document
670  fn process_remaining_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
671    let mut text_nodes_to_process = Vec::new();
672
673    for node in document.inclusive_descendants() {
674      if let Some(text_node) = node.as_text() {
675        // Check if this text node is inside a code block
676        let mut parent = node.parent();
677        let mut in_code = false;
678        while let Some(p) = parent {
679          if let Some(element) = p.as_element() {
680            if element.name.local == local_name!("code")
681              || element.name.local == local_name!("pre")
682            {
683              in_code = true;
684              break;
685            }
686          }
687          parent = p.parent();
688        }
689
690        // Only process if not in code
691        if !in_code {
692          let text_content = text_node.borrow().clone();
693          if text_content.contains("[]{#") {
694            text_nodes_to_process.push((node.clone(), text_content));
695          }
696        }
697      }
698    }
699
700    for (text_node, text_content) in text_nodes_to_process {
701      let mut last_end = 0;
702      let mut new_children = Vec::new();
703
704      // Simple pattern matching for []{#id}
705      let chars = text_content.chars().collect::<Vec<_>>();
706      let mut i = 0;
707      while i < chars.len() {
708        if i + 4 < chars.len()
709          && chars[i] == '['
710          && chars[i + 1] == ']'
711          && chars[i + 2] == '{'
712          && chars[i + 3] == '#'
713        {
714          // Found start of anchor pattern
715          let anchor_start = i;
716          i += 4; // skip "[]{#"
717
718          let mut id = String::new();
719          while i < chars.len() && chars[i] != '}' {
720            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
721            {
722              id.push(chars[i]);
723              i += 1;
724            } else {
725              break;
726            }
727          }
728
729          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
730            // Valid anchor found
731            let anchor_end = i + 1;
732
733            // Add text before anchor
734            if anchor_start > last_end {
735              let before_text: String =
736                chars[last_end..anchor_start].iter().collect();
737              if !before_text.is_empty() {
738                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
739              }
740            }
741
742            // Add span element
743            let span = kuchikikiki::NodeRef::new_element(
744              markup5ever::QualName::new(
745                None,
746                markup5ever::ns!(html),
747                local_name!("span"),
748              ),
749              vec![
750                (
751                  kuchikikiki::ExpandedName::new("", "id"),
752                  kuchikikiki::Attribute {
753                    prefix: None,
754                    value:  id,
755                  },
756                ),
757                (
758                  kuchikikiki::ExpandedName::new("", "class"),
759                  kuchikikiki::Attribute {
760                    prefix: None,
761                    value:  "nixos-anchor".into(),
762                  },
763                ),
764              ],
765            );
766            new_children.push(span);
767
768            last_end = anchor_end;
769            i = anchor_end;
770          } else {
771            i += 1;
772          }
773        } else {
774          i += 1;
775        }
776      }
777
778      // Add remaining text
779      if last_end < chars.len() {
780        let after_text: String = chars[last_end..].iter().collect();
781        if !after_text.is_empty() {
782          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
783        }
784      }
785
786      // Replace text node if we found anchors
787      if !new_children.is_empty() {
788        for child in new_children {
789          text_node.insert_before(child);
790        }
791        text_node.detach();
792      }
793    }
794  }
795
796  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
797  fn process_empty_auto_links(&self, document: &kuchikikiki::NodeRef) {
798    for link_node in document.select("a").unwrap() {
799      let link_element = link_node.as_node();
800      if let Some(element) = link_element.as_element() {
801        let href = element
802          .attributes
803          .borrow()
804          .get(local_name!("href"))
805          .map(std::string::ToString::to_string);
806        let text_content = link_element.text_contents();
807
808        if let Some(href_value) = href {
809          if href_value.starts_with('#')
810            && (text_content.trim().is_empty()
811              || text_content.trim() == "{{ANCHOR}}")
812          {
813            // Clear placeholder text if present
814            if text_content.trim() == "{{ANCHOR}}" {
815              for child in link_element.children() {
816                child.detach();
817              }
818            }
819            // Empty link with anchor - add humanized text
820            let display_text = self.humanize_anchor_id(&href_value);
821            link_element.append(kuchikikiki::NodeRef::new_text(display_text));
822          }
823        }
824      }
825    }
826  }
827
828  /// Process empty HTML links that have no content
829  fn process_empty_html_links(&self, document: &kuchikikiki::NodeRef) {
830    for link_node in document.select("a[href^='#']").unwrap() {
831      let link_element = link_node.as_node();
832      let text_content = link_element.text_contents();
833
834      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
835        // Clear placeholder text if present
836        if text_content.trim() == "{{ANCHOR}}" {
837          for child in link_element.children() {
838            child.detach();
839          }
840        }
841        if let Some(element) = link_element.as_element() {
842          if let Some(href) =
843            element.attributes.borrow().get(local_name!("href"))
844          {
845            let display_text = self.humanize_anchor_id(href);
846            link_element.append(kuchikikiki::NodeRef::new_text(display_text));
847          }
848        }
849      }
850    }
851  }
852
853  /// Process option anchor links: [](#opt-option.path) -> link to options.html
854  fn process_option_anchor_links(&self, document: &kuchikikiki::NodeRef) {
855    let mut to_modify = Vec::new();
856
857    // Collect all option anchor links first
858    for link_node in document.select("a[href^='#opt-']").unwrap() {
859      let link_element = link_node.as_node();
860      if let Some(element) = link_element.as_element() {
861        let href = element
862          .attributes
863          .borrow()
864          .get(local_name!("href"))
865          .map(std::string::ToString::to_string);
866        let text_content = link_element.text_contents();
867
868        if let Some(href_value) = href {
869          if href_value.starts_with("#opt-") {
870            let option_anchor = href_value[1..].to_string(); // remove the leading #
871            let needs_text_replacement = text_content.trim().is_empty()
872              || text_content.trim() == "{{ANCHOR}}";
873            to_modify.push((
874              link_element.clone(),
875              option_anchor,
876              needs_text_replacement,
877            ));
878          }
879        }
880      }
881    }
882
883    // Apply modifications
884    for (link_element, option_anchor, needs_text_replacement) in to_modify {
885      if let Some(element) = link_element.as_element() {
886        let new_href = format!("options.html#{option_anchor}");
887        element
888          .attributes
889          .borrow_mut()
890          .insert(local_name!("href"), new_href);
891
892        if needs_text_replacement {
893          // Clear existing content
894          for child in link_element.children() {
895            child.detach();
896          }
897
898          // Extract option name from anchor
899          // opt-services-nginx-enable -> services.nginx.enable
900          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
901            let option_name = option_path.replace('-', ".");
902            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
903          }
904        }
905      }
906    }
907  }
908
909  /// Convert an anchor ID to human-readable text
910  fn humanize_anchor_id(&self, anchor: &str) -> String {
911    // Strip the leading #
912    let cleaned = anchor.trim_start_matches('#');
913
914    // Remove common prefixes
915    let without_prefix = cleaned
916      .trim_start_matches("sec-")
917      .trim_start_matches("ssec-")
918      .trim_start_matches("opt-");
919
920    // Replace separators with spaces
921    let spaced = without_prefix.replace(['-', '_'], " ");
922
923    // Capitalize each word
924    spaced
925      .split_whitespace()
926      .map(|word| {
927        let mut chars = word.chars();
928        chars.next().map_or_else(String::new, |c| {
929          c.to_uppercase().collect::<String>() + chars.as_str()
930        })
931      })
932      .collect::<Vec<String>>()
933      .join(" ")
934  }
935}
936
937/// Extract all inline text from a heading node.
938pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
939  let mut text = String::new();
940  for child in node.children() {
941    match &child.data.borrow().value {
942      NodeValue::Text(t) => text.push_str(t),
943      NodeValue::Code(t) => text.push_str(&t.literal),
944      NodeValue::Link(..) => text.push_str(&extract_inline_text(child)),
945      NodeValue::Emph => text.push_str(&extract_inline_text(child)),
946      NodeValue::Strong => text.push_str(&extract_inline_text(child)),
947      NodeValue::Strikethrough => text.push_str(&extract_inline_text(child)),
948      NodeValue::Superscript => text.push_str(&extract_inline_text(child)),
949      NodeValue::Subscript => text.push_str(&extract_inline_text(child)),
950      NodeValue::FootnoteReference(..) => {
951        text.push_str(&extract_inline_text(child));
952      },
953      NodeValue::HtmlInline(_) => {},
954      NodeValue::Image(..) => {},
955      _ => {},
956    }
957  }
958  text
959}
960
961/// Collect all markdown files from the input directory
962pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
963  let mut files = Vec::with_capacity(100);
964
965  for entry in WalkDir::new(input_dir)
966    .follow_links(true)
967    .into_iter()
968    .filter_map(Result::ok)
969  {
970    let path = entry.path();
971    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
972      files.push(path.to_owned());
973    }
974  }
975
976  trace!("Found {} markdown files to process", files.len());
977  files
978}
979
/// Capabilities a caller can ask a processor about.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown extensions.
  Gfm,
  /// Nixpkgs documentation extensions.
  Nixpkgs,
  /// Syntax highlighting of code blocks.
  SyntaxHighlighting,
  /// Manpage URL mapping.
  ManpageUrls,
}
992
993/// Standalone HTML post-processing function to avoid borrowing issues.
994fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
995where
996  F: FnOnce(&kuchikikiki::NodeRef),
997{
998  process_safe(
999    html,
1000    |html| {
1001      use tendril::TendrilSink;
1002
1003      let document = kuchikikiki::parse_html().one(html);
1004      transform_fn(&document);
1005
1006      let mut out = Vec::new();
1007      document.serialize(&mut out).ok();
1008      String::from_utf8(out).unwrap_or_default()
1009    },
1010    html,
1011  )
1012}