ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5use std::{
6  collections::HashMap,
7  path::{Path, PathBuf},
8};
9
10use comrak::{
11  Arena,
12  nodes::{AstNode, NodeHeading, NodeValue},
13  options::Options,
14  parse_document,
15};
16use log::trace;
17use markup5ever::local_name;
18use walkdir::WalkDir;
19
20use super::{
21  dom::safe_select,
22  process::process_safe,
23  types::{
24    AstTransformer,
25    MarkdownOptions,
26    MarkdownProcessor,
27    PromptTransformer,
28  },
29};
30use crate::{
31  syntax::create_default_manager,
32  types::{Header, MarkdownResult},
33  utils,
34};
35
36impl MarkdownProcessor {
37  /// Create a new `MarkdownProcessor` with the given options.
38  #[must_use]
39  pub fn new(options: MarkdownOptions) -> Self {
40    let manpage_urls = options
41      .manpage_urls_path
42      .as_ref()
43      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
44
45    let syntax_manager = if options.highlight_code {
46      match create_default_manager() {
47        Ok(manager) => {
48          log::info!("Syntax highlighting initialized successfully");
49          Some(manager)
50        },
51        Err(e) => {
52          log::error!("Failed to initialize syntax highlighting: {e}");
53          log::warn!(
54            "Continuing without syntax highlighting - code blocks will not be \
55             highlighted"
56          );
57          None
58        },
59      }
60    } else {
61      None
62    };
63
64    Self {
65      options,
66      manpage_urls,
67      syntax_manager,
68      base_dir: std::path::PathBuf::from("."),
69    }
70  }
71
72  /// Access processor options.
73  #[must_use]
74  pub const fn options(&self) -> &MarkdownOptions {
75    &self.options
76  }
77
78  /// Set the base directory for resolving relative file paths.
79  #[must_use]
80  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
81    self.base_dir = base_dir.to_path_buf();
82    self
83  }
84
85  /// Check if a specific feature is enabled.
86  #[must_use]
87  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
88    match feature {
89      ProcessorFeature::Gfm => self.options.gfm,
90      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
91      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
92      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
93    }
94  }
95
96  /// Get the manpage URLs mapping for use with standalone functions.
97  #[must_use]
98  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
99    self.manpage_urls.as_ref()
100  }
101
102  /// Highlight all code blocks in HTML using the configured syntax highlighter
103  #[must_use]
104  pub fn highlight_codeblocks(&self, html: &str) -> String {
105    use kuchikikiki::parse_html;
106    use tendril::TendrilSink;
107
108    if !self.options.highlight_code || self.syntax_manager.is_none() {
109      return html.to_string();
110    }
111
112    let document = parse_html().one(html);
113
114    // Collect all code blocks first to avoid DOM modification during iteration
115    let mut code_blocks = Vec::new();
116    for pre_node in safe_select(&document, "pre > code") {
117      let code_node = pre_node;
118      if let Some(element) = code_node.as_element() {
119        let language = element
120          .attributes
121          .borrow()
122          .get("class")
123          .and_then(|class| class.strip_prefix("language-"))
124          .unwrap_or("text")
125          .to_string();
126        let code_text = code_node.text_contents();
127
128        if let Some(pre_parent) = code_node.parent() {
129          code_blocks.push((
130            pre_parent.clone(),
131            code_node.clone(),
132            code_text,
133            language,
134          ));
135        }
136      }
137    }
138
139    // Process each code block
140    for (pre_element, _code_node, code_text, language) in code_blocks {
141      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
142      {
143        // Wrap highlighted HTML in <pre><code> with appropriate classes
144        let wrapped_html = format!(
145          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
146        );
147        let fragment = parse_html().one(wrapped_html.as_str());
148        pre_element.insert_after(fragment);
149        pre_element.detach();
150      }
151      // Do not add highlight/language-* classes if not highlighted
152    }
153
154    let mut buf = Vec::new();
155    if let Err(e) = document.serialize(&mut buf) {
156      log::warn!("DOM serialization failed: {e:?}");
157      return html.to_string(); // Return original HTML if serialization fails
158    }
159    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
160  }
161
162  /// Handle hard tabs in code blocks according to configuration
163  fn handle_hardtabs(&self, code: &str) -> String {
164    use super::types::TabStyle;
165
166    // Check if there are any hard tabs
167    if !code.contains('\t') {
168      return code.to_string();
169    }
170
171    match self.options.tab_style {
172      // Do nothing
173      TabStyle::None => code.to_string(),
174
175      // Warn, but do nothing.
176      TabStyle::Warn => {
177        log::warn!(
178          "Hard tabs detected in code block. Consider using spaces for \
179           consistency. Tools like editorconfig may help you normalize spaces \
180           in your documents."
181        );
182        code.to_string()
183      },
184
185      // Do not warn, only inform in debug mode. Then return
186      // the updated code.
187      TabStyle::Normalize => {
188        log::debug!("Replacing hard tabs with spaces");
189        code.replace('\t', "  ")
190      },
191    }
192  }
193
194  /// Process hard tabs in code blocks within markdown content
195  fn process_hardtabs(&self, markdown: &str) -> String {
196    use super::types::TabStyle;
197    use crate::utils::codeblock::FenceTracker;
198
199    // If no tab handling is needed, return as-is
200    if self.options.tab_style == TabStyle::None {
201      return markdown.to_string();
202    }
203
204    let mut result = String::with_capacity(markdown.len());
205    let mut lines = markdown.lines().peekable();
206    let mut tracker = FenceTracker::new();
207
208    while let Some(line) = lines.next() {
209      tracker = tracker.process_line(line);
210
211      // Only replace tabs inside fenced code blocks
212      let processed_line = if tracker.in_code_block() && line.contains('\t') {
213        self.handle_hardtabs(line)
214      } else {
215        line.to_string()
216      };
217
218      result.push_str(&processed_line);
219
220      // Add newline unless this is the last line
221      if lines.peek().is_some() {
222        result.push('\n');
223      }
224    }
225
226    result
227  }
228
229  /// Highlight code using the configured syntax highlighter, returns HTML
230  /// string
231  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
232    if !self.options.highlight_code {
233      return None;
234    }
235
236    let syntax_manager = self.syntax_manager.as_ref()?;
237
238    syntax_manager
239      .highlight_code(code, language, self.options.highlight_theme.as_deref())
240      .ok()
241  }
242
243  /// Render Markdown to HTML, extracting headers and title.
244  #[must_use]
245  pub fn render(&self, markdown: &str) -> MarkdownResult {
246    let (preprocessed, included_files) = self.preprocess(markdown);
247    let (headers, title) = self.extract_headers(&preprocessed);
248    let html = self.process_html_pipeline(&preprocessed);
249
250    MarkdownResult {
251      html,
252      headers,
253      title,
254      included_files,
255    }
256  }
257
258  /// Process the HTML generation and post-processing pipeline.
259  fn process_html_pipeline(&self, content: &str) -> String {
260    let mut html = self.convert_to_html(content);
261
262    // Apply feature-specific post-processing
263    if cfg!(feature = "ndg-flavored") {
264      #[cfg(feature = "ndg-flavored")]
265      {
266        html = super::extensions::process_option_references(
267          &html,
268          self.options.valid_options.as_ref(),
269        );
270      }
271    }
272
273    if self.options.nixpkgs {
274      html = self.process_manpage_references_html(&html);
275    }
276
277    if self.options.highlight_code {
278      html = self.highlight_codeblocks(&html);
279    }
280
281    self.kuchiki_postprocess(&html)
282  }
283
284  /// Preprocess the markdown content with all enabled transformations.
285  fn preprocess(
286    &self,
287    content: &str,
288  ) -> (String, Vec<crate::types::IncludedFile>) {
289    let mut processed = content.to_string();
290    let mut included_files = Vec::new();
291
292    // Process MyST-style autolinks first
293    processed = super::extensions::process_myst_autolinks(&processed);
294
295    // Handle hard tabs in code blocks
296    processed = self.process_hardtabs(&processed);
297
298    if self.options.nixpkgs {
299      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
300      processed = content;
301      included_files = files;
302    }
303
304    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
305      processed = super::extensions::process_role_markup(
306        &processed,
307        self.manpage_urls.as_ref(),
308        self.options.auto_link_options,
309        self.options.valid_options.as_ref(),
310      );
311    }
312
313    if cfg!(feature = "wiki") {
314      processed = super::extensions::process_wikilinks(&processed);
315    }
316
317    (processed, included_files)
318  }
319
320  /// Apply Nixpkgs-specific preprocessing steps.
321  #[cfg(feature = "nixpkgs")]
322  fn apply_nixpkgs_preprocessing(
323    &self,
324    content: &str,
325  ) -> (String, Vec<crate::types::IncludedFile>) {
326    let (with_includes, included_files) =
327      match super::extensions::process_file_includes(content, &self.base_dir, 0)
328      {
329        Ok(result) => result,
330        Err(e) => {
331          log::warn!(
332            "File include processing failed: {e}. Continuing without includes."
333          );
334          (content.to_string(), Vec::new())
335        },
336      };
337    let with_blocks = super::extensions::process_block_elements(&with_includes);
338    let processed = super::extensions::process_inline_anchors(&with_blocks);
339    (processed, included_files)
340  }
341
342  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
343  #[cfg(not(feature = "nixpkgs"))]
344  fn apply_nixpkgs_preprocessing(
345    &self,
346    content: &str,
347  ) -> (String, Vec<crate::types::IncludedFile>) {
348    (content.to_string(), Vec::new())
349  }
350
351  /// Extract headers and title from the markdown content.
352  #[must_use]
353  pub fn extract_headers(
354    &self,
355    content: &str,
356  ) -> (Vec<Header>, Option<String>) {
357    use std::fmt::Write;
358
359    let arena = Arena::new();
360    let options = self.comrak_options();
361
362    // Normalize custom anchors with no heading level to h2
363    let mut normalized = String::with_capacity(content.len());
364    for line in content.lines() {
365      let trimmed = line.trim_end();
366      if !trimmed.starts_with('#')
367        && let Some(anchor_start) = trimmed.rfind("{#")
368        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
369      {
370        let text = trimmed[..anchor_start].trim_end();
371        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
372        let _ = writeln!(normalized, "## {text} {{#{id}}}");
373        continue;
374      }
375      normalized.push_str(line);
376      normalized.push('\n');
377    }
378
379    let root = parse_document(&arena, &normalized, &options);
380
381    let mut headers = Vec::new();
382    let mut found_title = None;
383
384    for node in root.descendants() {
385      if let NodeValue::Heading(NodeHeading { level, .. }) =
386        &node.data.borrow().value
387      {
388        let mut text = String::new();
389        let mut explicit_id = None;
390
391        for child in node.children() {
392          match &child.data.borrow().value {
393            NodeValue::Text(t) => text.push_str(t),
394            NodeValue::Code(t) => text.push_str(&t.literal),
395            NodeValue::Link(..)
396            | NodeValue::Emph
397            | NodeValue::Strong
398            | NodeValue::Subscript
399            | NodeValue::Strikethrough
400            | NodeValue::Superscript
401            | NodeValue::FootnoteReference(..) => {
402              text.push_str(&extract_inline_text(child));
403            },
404            NodeValue::HtmlInline(html) => {
405              // Look for explicit anchor in HTML inline node: {#id}
406              let html_str = html.as_str();
407              if let Some(start) = html_str.find("{#")
408                && let Some(end) = html_str[start..].find('}')
409              {
410                let anchor = &html_str[start + 2..start + end];
411                explicit_id = Some(anchor.to_string());
412              }
413            },
414            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
415            NodeValue::Image(..) => {},
416            _ => {},
417          }
418        }
419
420        // Check for trailing {#id} in heading text
421        let trimmed = text.trim_end();
422        #[allow(clippy::option_if_let_else)]
423        // Nested options clearer with if-let
424        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
425          if let Some(end) = trimmed[start..].find('}') {
426            let anchor = &trimmed[start + 2..start + end];
427            (trimmed[..start].trim_end().to_string(), anchor.to_string())
428          } else {
429            (
430              text.clone(),
431              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
432            )
433          }
434        } else {
435          (
436            text.clone(),
437            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
438          )
439        };
440        if *level == 1 && found_title.is_none() {
441          found_title = Some(final_text.clone());
442        }
443        headers.push(Header {
444          text: final_text,
445          level: *level,
446          id,
447        });
448      }
449    }
450
451    (headers, found_title)
452  }
453
454  /// Convert markdown to HTML using comrak and configured options.
455  fn convert_to_html(&self, content: &str) -> String {
456    // Process directly without panic catching for better performance
457    let arena = Arena::new();
458    let options = self.comrak_options();
459    let root = parse_document(&arena, content, &options);
460
461    // Apply AST transformations
462    let prompt_transformer = PromptTransformer;
463    prompt_transformer.transform(root);
464
465    let mut html_output = String::new();
466    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
467      log::error!("Failed to format HTML: {e}");
468    }
469
470    // Post-process HTML to handle header anchors
471    Self::process_header_anchors_html(&html_output)
472  }
473
474  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
475  /// proper id attributes. Also adds auto-generated IDs to headers without
476  /// explicit anchors.
477  fn process_header_anchors_html(html: &str) -> String {
478    use std::sync::LazyLock;
479
480    use regex::Regex;
481
482    // First pass: handle explicit {#id} syntax
483    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
484      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
485        .unwrap_or_else(|e| {
486          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
487          utils::never_matching_regex().unwrap_or_else(|_| {
488            #[allow(
489              clippy::expect_used,
490              reason = "This pattern is guaranteed to be valid"
491            )]
492            Regex::new(r"[^\s\S]")
493              .expect("regex pattern [^\\s\\S] should always compile")
494          })
495        })
496    });
497
498    // Second pass: add IDs to headers without attributes (no id yet)
499    // Matches <h1>content</h1> but not <h1 id="...">content</h1>
500    static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
501      Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
502        log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
503        utils::never_matching_regex().unwrap_or_else(|_| {
504          #[allow(
505            clippy::expect_used,
506            reason = "This pattern is guaranteed to be valid"
507          )]
508          Regex::new(r"[^\s\S]")
509            .expect("regex pattern [^\\s\\S] should always compile")
510        })
511      })
512    });
513
514    // Regex to strip HTML tags for slugification
515    static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
516      Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
517        log::error!("Failed to compile HTML_TAG_RE regex: {e}");
518        utils::never_matching_regex().unwrap_or_else(|_| {
519          #[allow(
520            clippy::expect_used,
521            reason = "This pattern is guaranteed to be valid"
522          )]
523          Regex::new(r"[^\s\S]")
524            .expect("regex pattern [^\\s\\S] should always compile")
525        })
526      })
527    });
528
529    // First pass: explicit {#id} syntax
530    let result = HEADER_ANCHOR_RE
531      .replace_all(html, |caps: &regex::Captures| {
532        let level = &caps[1];
533        let prefix = &caps[2];
534        let id = &caps[3];
535        let suffix = &caps[4];
536        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
537      })
538      .to_string();
539
540    // Second pass: add auto-generated IDs to headers without id attribute
541    HEADER_NO_ID_RE
542      .replace_all(&result, |caps: &regex::Captures| {
543        let level = &caps[1];
544        let content = &caps[2];
545        // Strip HTML tags and slugify the text content
546        let text_only = HTML_TAG_RE.replace_all(content, "");
547        let id = utils::slugify(&text_only);
548        if id.is_empty() {
549          // If slugify produces empty string, keep header without id
550          format!("<h{level}>{content}</h{level}>")
551        } else {
552          format!("<h{level} id=\"{id}\">{content}</h{level}>")
553        }
554      })
555      .to_string()
556  }
557
558  /// Build comrak options from `MarkdownOptions` and feature flags.
559  fn comrak_options(&self) -> Options<'_> {
560    let mut options = Options::default();
561    // Markdown features present in GFM.
562    if self.options.gfm {
563      options.extension.table = true;
564      options.extension.footnotes = true;
565      options.extension.strikethrough = true;
566      options.extension.tasklist = true;
567      options.extension.superscript = true;
568      options.extension.autolink = true;
569    }
570
571    // Enable unsafe HTML references. This is not a security concern
572    // as all input is assumed to be trusted.
573    options.render.r#unsafe = true;
574
575    // Enable description lists but keep custom header processing
576    options.extension.header_id_prefix = None;
577    options.extension.description_lists = true;
578    options
579  }
580
581  /// Post-process HTML to enhance manpage references with URL links.
582  #[cfg(feature = "nixpkgs")]
583  fn process_manpage_references_html(&self, html: &str) -> String {
584    super::extensions::process_manpage_references(
585      html,
586      self.manpage_urls.as_ref(),
587    )
588  }
589
590  /// Post-process HTML to enhance manpage references (no-op when feature
591  /// disabled).
592  #[cfg(not(feature = "nixpkgs"))]
593  fn process_manpage_references_html(&self, html: &str) -> String {
594    html.to_string()
595  }
596
597  /// HTML post-processing using kuchiki DOM manipulation.
598  #[allow(
599    clippy::unused_self,
600    reason = "Method signature matches processor pattern"
601  )]
602  fn kuchiki_postprocess(&self, html: &str) -> String {
603    // Use a standalone function to avoid borrowing issues
604    kuchiki_postprocess_html(html, |document| {
605      Self::apply_dom_transformations(document);
606    })
607  }
608
609  /// Apply all DOM transformations to the parsed HTML document.
610  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
611    Self::process_list_item_id_markers(document);
612    Self::process_header_anchor_comments(document);
613    Self::process_list_item_inline_anchors(document);
614    Self::process_paragraph_inline_anchors(document);
615    Self::process_remaining_inline_anchors(document);
616    Self::process_option_anchor_links(document);
617    Self::process_empty_auto_links(document);
618    Self::process_empty_html_links(document);
619  }
620
621  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
622  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
623    let mut to_modify = Vec::new();
624
625    for comment in document.inclusive_descendants() {
626      if let Some(comment_node) = comment.as_comment() {
627        let comment_text = comment_node.borrow();
628        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
629          let id = comment_text[id_start + 16..].trim();
630          if !id.is_empty()
631            && id
632              .chars()
633              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
634          {
635            // Check if this comment is inside an <li> element
636            if let Some(parent) = comment.parent()
637              && let Some(element) = parent.as_element()
638              && element.name.local.as_ref() == "li"
639            {
640              to_modify.push((comment.clone(), id.to_string()));
641            }
642          }
643        }
644      }
645    }
646
647    for (comment_node, id) in to_modify {
648      let span = kuchikikiki::NodeRef::new_element(
649        markup5ever::QualName::new(
650          None,
651          markup5ever::ns!(html),
652          local_name!("span"),
653        ),
654        vec![
655          (
656            kuchikikiki::ExpandedName::new("", "id"),
657            kuchikikiki::Attribute {
658              prefix: None,
659              value:  id,
660            },
661          ),
662          (
663            kuchikikiki::ExpandedName::new("", "class"),
664            kuchikikiki::Attribute {
665              prefix: None,
666              value:  "nixos-anchor".into(),
667            },
668          ),
669        ],
670      );
671      comment_node.insert_after(span);
672      comment_node.detach();
673    }
674  }
675
676  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
677  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
678    let mut to_modify = Vec::new();
679
680    for comment in document.inclusive_descendants() {
681      if let Some(comment_node) = comment.as_comment() {
682        let comment_text = comment_node.borrow();
683        if let Some(anchor_start) = comment_text.find("anchor:") {
684          let id = comment_text[anchor_start + 7..].trim();
685          if !id.is_empty()
686            && id
687              .chars()
688              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
689          {
690            // Check if this comment is inside a header element
691            if let Some(parent) = comment.parent()
692              && let Some(element) = parent.as_element()
693            {
694              let tag_name = element.name.local.as_ref();
695              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
696                to_modify.push((
697                  parent.clone(),
698                  comment.clone(),
699                  id.to_string(),
700                ));
701              }
702            }
703          }
704        }
705      }
706    }
707
708    for (header_element, comment_node, id) in to_modify {
709      if let Some(element) = header_element.as_element() {
710        element
711          .attributes
712          .borrow_mut()
713          .insert(local_name!("id"), id);
714        comment_node.detach();
715      }
716    }
717  }
718
719  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
720  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
721    for li_node in safe_select(document, "li") {
722      let li_element = li_node;
723
724      // Check if this list item contains code elements
725      let has_code = !safe_select(&li_element, "code, pre").is_empty();
726      if has_code {
727        continue; // Skip list items with code blocks
728      }
729
730      let text_content = li_element.text_contents();
731
732      if let Some(anchor_start) = text_content.find("[]{#")
733        && let Some(anchor_end) = text_content[anchor_start..].find('}')
734      {
735        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
736        if !id.is_empty()
737          && id
738            .chars()
739            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
740        {
741          let remaining_content =
742            &text_content[anchor_start + anchor_end + 1..];
743
744          // Clear current content and rebuild
745          for child in li_element.children() {
746            child.detach();
747          }
748
749          let span = kuchikikiki::NodeRef::new_element(
750            markup5ever::QualName::new(
751              None,
752              markup5ever::ns!(html),
753              local_name!("span"),
754            ),
755            vec![
756              (
757                kuchikikiki::ExpandedName::new("", "id"),
758                kuchikikiki::Attribute {
759                  prefix: None,
760                  value:  id.into(),
761                },
762              ),
763              (
764                kuchikikiki::ExpandedName::new("", "class"),
765                kuchikikiki::Attribute {
766                  prefix: None,
767                  value:  "nixos-anchor".into(),
768                },
769              ),
770            ],
771          );
772          li_element.append(span);
773          if !remaining_content.is_empty() {
774            li_element
775              .append(kuchikikiki::NodeRef::new_text(remaining_content));
776          }
777        }
778      }
779    }
780  }
781
782  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
783  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
784    for p_node in safe_select(document, "p") {
785      let p_element = p_node;
786
787      // Check if this paragraph contains code elements
788      let has_code = !safe_select(&p_element, "code, pre").is_empty();
789      if has_code {
790        continue; // Skip paragraphs with code blocks
791      }
792
793      let text_content = p_element.text_contents();
794
795      if let Some(anchor_start) = text_content.find("[]{#")
796        && let Some(anchor_end) = text_content[anchor_start..].find('}')
797      {
798        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
799        if !id.is_empty()
800          && id
801            .chars()
802            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
803        {
804          let remaining_content =
805            &text_content[anchor_start + anchor_end + 1..];
806
807          // Clear current content and rebuild
808          for child in p_element.children() {
809            child.detach();
810          }
811
812          let span = kuchikikiki::NodeRef::new_element(
813            markup5ever::QualName::new(
814              None,
815              markup5ever::ns!(html),
816              local_name!("span"),
817            ),
818            vec![
819              (
820                kuchikikiki::ExpandedName::new("", "id"),
821                kuchikikiki::Attribute {
822                  prefix: None,
823                  value:  id.into(),
824                },
825              ),
826              (
827                kuchikikiki::ExpandedName::new("", "class"),
828                kuchikikiki::Attribute {
829                  prefix: None,
830                  value:  "nixos-anchor".into(),
831                },
832              ),
833            ],
834          );
835          p_element.append(span);
836          if !remaining_content.is_empty() {
837            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
838          }
839        }
840      }
841    }
842  }
843
844  /// Process remaining standalone inline anchors throughout the document
845  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
846    let mut text_nodes_to_process = Vec::new();
847
848    for node in document.inclusive_descendants() {
849      if let Some(text_node) = node.as_text() {
850        // Check if this text node is inside a code block
851        let mut parent = node.parent();
852        let mut in_code = false;
853        while let Some(p) = parent {
854          if let Some(element) = p.as_element()
855            && (element.name.local == local_name!("code")
856              || element.name.local == local_name!("pre"))
857          {
858            in_code = true;
859            break;
860          }
861          parent = p.parent();
862        }
863
864        // Only process if not in code
865        if !in_code {
866          let text_content = text_node.borrow().clone();
867          if text_content.contains("[]{#") {
868            text_nodes_to_process.push((node.clone(), text_content));
869          }
870        }
871      }
872    }
873
874    for (text_node, text_content) in text_nodes_to_process {
875      let mut last_end = 0;
876      let mut new_children = Vec::new();
877
878      // Simple pattern matching for []{#id}
879      let chars = text_content.chars().collect::<Vec<_>>();
880      let mut i = 0;
881      while i < chars.len() {
882        if i + 4 < chars.len()
883          && chars[i] == '['
884          && chars[i + 1] == ']'
885          && chars[i + 2] == '{'
886          && chars[i + 3] == '#'
887        {
888          // Found start of anchor pattern
889          let anchor_start = i;
890          i += 4; // skip "[]{#"
891
892          let mut id = String::new();
893          while i < chars.len() && chars[i] != '}' {
894            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
895            {
896              id.push(chars[i]);
897              i += 1;
898            } else {
899              break;
900            }
901          }
902
903          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
904            // Valid anchor found
905            let anchor_end = i + 1;
906
907            // Add text before anchor
908            if anchor_start > last_end {
909              let before_text: String =
910                chars[last_end..anchor_start].iter().collect();
911              if !before_text.is_empty() {
912                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
913              }
914            }
915
916            // Add span element
917            let span = kuchikikiki::NodeRef::new_element(
918              markup5ever::QualName::new(
919                None,
920                markup5ever::ns!(html),
921                local_name!("span"),
922              ),
923              vec![
924                (
925                  kuchikikiki::ExpandedName::new("", "id"),
926                  kuchikikiki::Attribute {
927                    prefix: None,
928                    value:  id,
929                  },
930                ),
931                (
932                  kuchikikiki::ExpandedName::new("", "class"),
933                  kuchikikiki::Attribute {
934                    prefix: None,
935                    value:  "nixos-anchor".into(),
936                  },
937                ),
938              ],
939            );
940            new_children.push(span);
941
942            last_end = anchor_end;
943            i = anchor_end;
944          } else {
945            i += 1;
946          }
947        } else {
948          i += 1;
949        }
950      }
951
952      // Add remaining text
953      if last_end < chars.len() {
954        let after_text: String = chars[last_end..].iter().collect();
955        if !after_text.is_empty() {
956          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
957        }
958      }
959
960      // Replace text node if we found anchors
961      if !new_children.is_empty() {
962        for child in new_children {
963          text_node.insert_before(child);
964        }
965        text_node.detach();
966      }
967    }
968  }
969
970  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
971  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
972    for link_node in safe_select(document, "a") {
973      let link_element = link_node;
974      if let Some(element) = link_element.as_element() {
975        let href = element
976          .attributes
977          .borrow()
978          .get(local_name!("href"))
979          .map(std::string::ToString::to_string);
980        let text_content = link_element.text_contents();
981
982        if let Some(href_value) = href
983          && href_value.starts_with('#')
984          && (text_content.trim().is_empty()
985            || text_content.trim() == "{{ANCHOR}}")
986        {
987          // Clear placeholder text if present
988          if text_content.trim() == "{{ANCHOR}}" {
989            for child in link_element.children() {
990              child.detach();
991            }
992          }
993          // Empty link with anchor - add humanized text
994          let display_text = Self::humanize_anchor_id(&href_value);
995          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
996        }
997      }
998    }
999  }
1000
1001  /// Process empty HTML links that have no content
1002  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1003    for link_node in safe_select(document, "a[href^='#']") {
1004      let link_element = link_node;
1005      let text_content = link_element.text_contents();
1006
1007      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1008        // Clear placeholder text if present
1009        if text_content.trim() == "{{ANCHOR}}" {
1010          for child in link_element.children() {
1011            child.detach();
1012          }
1013        }
1014        if let Some(element) = link_element.as_element()
1015          && let Some(href) =
1016            element.attributes.borrow().get(local_name!("href"))
1017        {
1018          let display_text = Self::humanize_anchor_id(href);
1019          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1020        }
1021      }
1022    }
1023  }
1024
1025  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1026  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1027    let mut to_modify = Vec::new();
1028
1029    // Collect all option anchor links first
1030    for link_node in safe_select(document, "a[href^='#opt-']") {
1031      let link_element = link_node;
1032      if let Some(element) = link_element.as_element() {
1033        let href = element
1034          .attributes
1035          .borrow()
1036          .get(local_name!("href"))
1037          .map(std::string::ToString::to_string);
1038        let text_content = link_element.text_contents();
1039
1040        if let Some(href_value) = href
1041          && href_value.starts_with("#opt-")
1042        {
1043          let option_anchor = href_value[1..].to_string(); // remove the leading #
1044          let needs_text_replacement = text_content.trim().is_empty()
1045            || text_content.trim() == "{{ANCHOR}}";
1046          to_modify.push((
1047            link_element.clone(),
1048            option_anchor,
1049            needs_text_replacement,
1050          ));
1051        }
1052      }
1053    }
1054
1055    // Apply modifications
1056    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1057      if let Some(element) = link_element.as_element() {
1058        let new_href = format!("options.html#{option_anchor}");
1059        element
1060          .attributes
1061          .borrow_mut()
1062          .insert(local_name!("href"), new_href);
1063
1064        if needs_text_replacement {
1065          // Clear existing content
1066          for child in link_element.children() {
1067            child.detach();
1068          }
1069
1070          // Extract option name from anchor
1071          // opt-services-nginx-enable -> services.nginx.enable
1072          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1073            let option_name = option_path.replace('-', ".");
1074            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1075          }
1076        }
1077      }
1078    }
1079  }
1080
1081  /// Convert an anchor ID to human-readable text
1082  fn humanize_anchor_id(anchor: &str) -> String {
1083    // Strip the leading #
1084    let cleaned = anchor.trim_start_matches('#');
1085
1086    // Remove common prefixes
1087    let without_prefix = cleaned
1088      .trim_start_matches("sec-")
1089      .trim_start_matches("ssec-")
1090      .trim_start_matches("opt-");
1091
1092    // Replace separators with spaces
1093    let spaced = without_prefix.replace(['-', '_'], " ");
1094
1095    // Capitalize each word
1096    spaced
1097      .split_whitespace()
1098      .map(|word| {
1099        let mut chars = word.chars();
1100        chars.next().map_or_else(String::new, |c| {
1101          c.to_uppercase().collect::<String>() + chars.as_str()
1102        })
1103      })
1104      .collect::<Vec<String>>()
1105      .join(" ")
1106  }
1107}
1108
1109/// Extract all inline text from a heading node.
1110pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1111  let mut text = String::new();
1112  for child in node.children() {
1113    match &child.data.borrow().value {
1114      NodeValue::Text(t) => text.push_str(t),
1115      NodeValue::Code(t) => text.push_str(&t.literal),
1116      NodeValue::Link(..)
1117      | NodeValue::Emph
1118      | NodeValue::Strong
1119      | NodeValue::Strikethrough
1120      | NodeValue::Superscript
1121      | NodeValue::Subscript
1122      | NodeValue::FootnoteReference(..) => {
1123        text.push_str(&extract_inline_text(child));
1124      },
1125      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1126      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1127      _ => {},
1128    }
1129  }
1130  text
1131}
1132
1133/// Collect all markdown files from the input directory
1134pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1135  let mut files = Vec::with_capacity(100);
1136
1137  for entry in WalkDir::new(input_dir)
1138    .follow_links(true)
1139    .into_iter()
1140    .filter_map(Result::ok)
1141  {
1142    let path = entry.path();
1143    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1144      files.push(path.to_owned());
1145    }
1146  }
1147
1148  trace!("Found {} markdown files to process", files.len());
1149  files
1150}
1151
/// Capabilities that a processor instance can be asked about.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// Support for GitHub Flavored Markdown.
  Gfm,
  /// Extensions used by Nixpkgs documentation.
  Nixpkgs,
  /// Highlighting of code blocks.
  SyntaxHighlighting,
  /// Support for mapping manpage references to URLs.
  ManpageUrls,
}
1164
1165/// Standalone HTML post-processing function to avoid borrowing issues.
1166fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1167where
1168  F: FnOnce(&kuchikikiki::NodeRef),
1169{
1170  process_safe(
1171    html,
1172    |html| {
1173      use tendril::TendrilSink;
1174
1175      let document = kuchikikiki::parse_html().one(html);
1176      transform_fn(&document);
1177
1178      let mut out = Vec::new();
1179      let _ = document.serialize(&mut out);
1180      String::from_utf8_lossy(&out).into_owned()
1181    },
1182    html,
1183  )
1184}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs