ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5use std::{
6  collections::HashMap,
7  path::{Path, PathBuf},
8  sync::LazyLock,
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use regex::Regex;
20use walkdir::WalkDir;
21
22use super::{
23  dom::safe_select,
24  process::process_safe,
25  types::{
26    AstTransformer,
27    MarkdownOptions,
28    MarkdownProcessor,
29    PromptTransformer,
30  },
31};
32use crate::{
33  syntax::create_default_manager,
34  types::{Header, MarkdownResult},
35  utils,
36};
37
38static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
39  Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
40    .unwrap_or_else(|e| {
41      log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
42      utils::never_matching_regex().unwrap_or_else(|_| {
43        #[allow(
44          clippy::expect_used,
45          reason = "This pattern is guaranteed to be valid"
46        )]
47        Regex::new(r"[^\s\S]")
48          .expect("regex pattern [^\\s\\S] should always compile")
49      })
50    })
51});
52
53static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
54  Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
55    log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
56    utils::never_matching_regex().unwrap_or_else(|_| {
57      #[allow(
58        clippy::expect_used,
59        reason = "This pattern is guaranteed to be valid"
60      )]
61      Regex::new(r"[^\s\S]")
62        .expect("regex pattern [^\\s\\S] should always compile")
63    })
64  })
65});
66
67static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
68  Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
69    log::error!("Failed to compile HTML_TAG_RE regex: {e}");
70    utils::never_matching_regex().unwrap_or_else(|_| {
71      #[allow(
72        clippy::expect_used,
73        reason = "This pattern is guaranteed to be valid"
74      )]
75      Regex::new(r"[^\s\S]")
76        .expect("regex pattern [^\\s\\S] should always compile")
77    })
78  })
79});
80
81impl MarkdownProcessor {
82  /// Create a new `MarkdownProcessor` with the given options.
83  #[must_use]
84  pub fn new(options: MarkdownOptions) -> Self {
85    let manpage_urls = options
86      .manpage_urls_path
87      .as_ref()
88      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
89
90    let syntax_manager = if options.highlight_code {
91      match create_default_manager(
92        options
93          .syntax_queries_path
94          .as_deref()
95          .map(std::path::Path::new),
96      ) {
97        Ok(manager) => {
98          log::info!("Syntax highlighting initialized successfully");
99          Some(manager)
100        },
101        Err(e) => {
102          log::error!("Failed to initialize syntax highlighting: {e}");
103          log::warn!(
104            "Continuing without syntax highlighting - code blocks will not be \
105             highlighted"
106          );
107          None
108        },
109      }
110    } else {
111      None
112    };
113
114    Self {
115      options,
116      manpage_urls,
117      syntax_manager,
118      base_dir: std::path::PathBuf::from("."),
119    }
120  }
121
122  /// Access processor options.
123  #[must_use]
124  pub const fn options(&self) -> &MarkdownOptions {
125    &self.options
126  }
127
128  /// Set the base directory for resolving relative file paths.
129  #[must_use]
130  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
131    self.base_dir = base_dir.to_path_buf();
132    self
133  }
134
135  /// Check if a specific feature is enabled.
136  #[must_use]
137  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
138    match feature {
139      ProcessorFeature::Gfm => self.options.gfm,
140      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
141      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
142      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
143    }
144  }
145
146  /// Get the manpage URLs mapping for use with standalone functions.
147  #[must_use]
148  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
149    self.manpage_urls.as_ref()
150  }
151
152  /// Highlight all code blocks in HTML using the configured syntax highlighter
153  #[must_use]
154  pub fn highlight_codeblocks(&self, html: &str) -> String {
155    use kuchikikiki::parse_html;
156    use tendril::TendrilSink;
157
158    if !self.options.highlight_code || self.syntax_manager.is_none() {
159      return html.to_string();
160    }
161
162    let document = parse_html().one(html);
163
164    // Collect all code blocks first to avoid DOM modification during iteration
165    let mut code_blocks = Vec::new();
166    for pre_node in safe_select(&document, "pre > code") {
167      let code_node = pre_node;
168      if let Some(element) = code_node.as_element() {
169        let language = element
170          .attributes
171          .borrow()
172          .get("class")
173          .and_then(|class| class.strip_prefix("language-"))
174          .unwrap_or("text")
175          .to_string();
176        let code_text = code_node.text_contents();
177
178        if let Some(pre_parent) = code_node.parent() {
179          code_blocks.push((
180            pre_parent.clone(),
181            code_node.clone(),
182            code_text,
183            language,
184          ));
185        }
186      }
187    }
188
189    // Process each code block
190    for (pre_element, _code_node, code_text, language) in code_blocks {
191      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
192      {
193        // Wrap highlighted HTML in <pre><code> with appropriate classes
194        let wrapped_html = format!(
195          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
196        );
197        let fragment = parse_html().one(wrapped_html.as_str());
198        pre_element.insert_after(fragment);
199        pre_element.detach();
200      }
201      // Do not add highlight/language-* classes if not highlighted
202    }
203
204    let mut buf = Vec::new();
205    if let Err(e) = document.serialize(&mut buf) {
206      log::warn!("DOM serialization failed: {e:?}");
207      return html.to_string(); // Return original HTML if serialization fails
208    }
209    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
210  }
211
212  /// Handle hard tabs in code blocks according to configuration
213  fn handle_hardtabs(&self, code: &str) -> String {
214    use super::types::TabStyle;
215
216    // Check if there are any hard tabs
217    if !code.contains('\t') {
218      return code.to_string();
219    }
220
221    match self.options.tab_style {
222      // Do nothing
223      TabStyle::None => code.to_string(),
224
225      // Warn, but do nothing.
226      TabStyle::Warn => {
227        log::warn!(
228          "Hard tabs detected in code block. Consider using spaces for \
229           consistency. Tools like editorconfig may help you normalize spaces \
230           in your documents."
231        );
232        code.to_string()
233      },
234
235      // Do not warn, only inform in debug mode. Then return
236      // the updated code.
237      TabStyle::Normalize => {
238        log::debug!("Replacing hard tabs with spaces");
239        code.replace('\t', "  ")
240      },
241    }
242  }
243
244  /// Process hard tabs in code blocks within markdown content
245  fn process_hardtabs(&self, markdown: &str) -> String {
246    use super::types::TabStyle;
247    use crate::utils::codeblock::FenceTracker;
248
249    // If no tab handling is needed, return as-is
250    if self.options.tab_style == TabStyle::None {
251      return markdown.to_string();
252    }
253
254    let mut result = String::with_capacity(markdown.len());
255    let mut lines = markdown.lines().peekable();
256    let mut tracker = FenceTracker::new();
257
258    while let Some(line) = lines.next() {
259      tracker = tracker.process_line(line);
260
261      // Only replace tabs inside fenced code blocks
262      let processed_line = if tracker.in_code_block() && line.contains('\t') {
263        self.handle_hardtabs(line)
264      } else {
265        line.to_string()
266      };
267
268      result.push_str(&processed_line);
269
270      // Add newline unless this is the last line
271      if lines.peek().is_some() {
272        result.push('\n');
273      }
274    }
275
276    result
277  }
278
279  /// Highlight code using the configured syntax highlighter, returns HTML
280  /// string
281  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
282    if !self.options.highlight_code {
283      return None;
284    }
285
286    let syntax_manager = self.syntax_manager.as_ref()?;
287
288    syntax_manager
289      .highlight_code(code, language, self.options.highlight_theme.as_deref())
290      .ok()
291  }
292
293  /// Render Markdown to HTML, extracting headers and title.
294  #[must_use]
295  pub fn render(&self, markdown: &str) -> MarkdownResult {
296    let (preprocessed, included_files) = self.preprocess(markdown);
297    let (headers, title) = self.extract_headers(&preprocessed);
298    let html = self.process_html_pipeline(&preprocessed);
299
300    MarkdownResult {
301      html,
302      headers,
303      title,
304      included_files,
305    }
306  }
307
308  /// Process the HTML generation and post-processing pipeline.
309  fn process_html_pipeline(&self, content: &str) -> String {
310    let mut html = self.convert_to_html(content);
311
312    // Apply feature-specific post-processing
313    if cfg!(feature = "ndg-flavored") {
314      #[cfg(feature = "ndg-flavored")]
315      {
316        html = super::extensions::process_option_references(
317          &html,
318          self.options.valid_options.as_ref(),
319        );
320      }
321    }
322
323    if self.options.nixpkgs {
324      html = self.process_manpage_references_html(&html);
325    }
326
327    if self.options.highlight_code {
328      html = self.highlight_codeblocks(&html);
329    }
330
331    self.kuchiki_postprocess(&html)
332  }
333
334  /// Preprocess the markdown content with all enabled transformations.
335  fn preprocess(
336    &self,
337    content: &str,
338  ) -> (String, Vec<crate::types::IncludedFile>) {
339    let mut processed = content.to_string();
340    let mut included_files = Vec::new();
341
342    // Process MyST-style autolinks first
343    processed = super::extensions::process_myst_autolinks(&processed);
344
345    // Handle hard tabs in code blocks
346    processed = self.process_hardtabs(&processed);
347
348    if self.options.nixpkgs {
349      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
350      processed = content;
351      included_files = files;
352    }
353
354    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
355      processed = super::extensions::process_role_markup(
356        &processed,
357        self.manpage_urls.as_ref(),
358        self.options.auto_link_options,
359        self.options.valid_options.as_ref(),
360      );
361    }
362
363    if cfg!(feature = "wiki") {
364      processed = super::extensions::process_wikilinks(&processed);
365    }
366
367    (processed, included_files)
368  }
369
370  /// Apply Nixpkgs-specific preprocessing steps.
371  #[cfg(feature = "nixpkgs")]
372  fn apply_nixpkgs_preprocessing(
373    &self,
374    content: &str,
375  ) -> (String, Vec<crate::types::IncludedFile>) {
376    let (with_includes, included_files) =
377      match super::extensions::process_file_includes(content, &self.base_dir, 0)
378      {
379        Ok(result) => result,
380        Err(e) => {
381          log::warn!(
382            "File include processing failed: {e}. Continuing without includes."
383          );
384          (content.to_string(), Vec::new())
385        },
386      };
387    let with_blocks = super::extensions::process_block_elements(&with_includes);
388    let processed = super::extensions::process_inline_anchors(&with_blocks);
389    (processed, included_files)
390  }
391
392  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
393  #[cfg(not(feature = "nixpkgs"))]
394  fn apply_nixpkgs_preprocessing(
395    &self,
396    content: &str,
397  ) -> (String, Vec<crate::types::IncludedFile>) {
398    (content.to_string(), Vec::new())
399  }
400
401  /// Extract headers and title from the markdown content.
402  #[must_use]
403  pub fn extract_headers(
404    &self,
405    content: &str,
406  ) -> (Vec<Header>, Option<String>) {
407    use std::fmt::Write;
408
409    let arena = Arena::new();
410    let options = self.comrak_options();
411
412    // Normalize custom anchors with no heading level to h2
413    let mut normalized = String::with_capacity(content.len());
414    for line in content.lines() {
415      let trimmed = line.trim();
416      if !trimmed.starts_with('#')
417        && let Some(anchor_start) = trimmed.rfind("{#")
418        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
419      {
420        let text = trimmed[..anchor_start].trim_end();
421        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
422        let _ = writeln!(normalized, "## {text} {{#{id}}}");
423        continue;
424      }
425      normalized.push_str(line);
426      normalized.push('\n');
427    }
428
429    let root = parse_document(&arena, &normalized, &options);
430
431    let mut headers = Vec::new();
432    let mut found_title = None;
433
434    for node in root.descendants() {
435      if let NodeValue::Heading(NodeHeading { level, .. }) =
436        &node.data.borrow().value
437      {
438        let mut text = String::new();
439        let mut explicit_id = None;
440
441        for child in node.children() {
442          match &child.data.borrow().value {
443            NodeValue::Text(t) => text.push_str(t),
444            NodeValue::Code(t) => text.push_str(&t.literal),
445            NodeValue::Link(..)
446            | NodeValue::Emph
447            | NodeValue::Strong
448            | NodeValue::Subscript
449            | NodeValue::Strikethrough
450            | NodeValue::Superscript
451            | NodeValue::FootnoteReference(..) => {
452              text.push_str(&extract_inline_text(child));
453            },
454            NodeValue::HtmlInline(html) => {
455              // Look for explicit anchor in HTML inline node: {#id}
456              let html_str = html.as_str();
457              if let Some(start) = html_str.find("{#")
458                && let Some(end) = html_str[start..].find('}')
459              {
460                let anchor = &html_str[start + 2..start + end];
461                explicit_id = Some(anchor.to_string());
462              }
463            },
464            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
465            NodeValue::Image(..) => {},
466            _ => {},
467          }
468        }
469
470        // Check for trailing {#id} in heading text
471        let trimmed = text.trim_end();
472        #[allow(clippy::option_if_let_else)]
473        // Nested options clearer with if-let
474        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
475          if let Some(end) = trimmed[start..].find('}') {
476            let anchor = &trimmed[start + 2..start + end];
477            (trimmed[..start].trim_end().to_string(), anchor.to_string())
478          } else {
479            (
480              text.clone(),
481              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
482            )
483          }
484        } else {
485          (
486            text.clone(),
487            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
488          )
489        };
490        if *level == 1 && found_title.is_none() {
491          found_title = Some(final_text.clone());
492        }
493        headers.push(Header {
494          text: final_text,
495          level: *level,
496          id,
497        });
498      }
499    }
500
501    (headers, found_title)
502  }
503
504  /// Convert markdown to HTML using comrak and configured options.
505  fn convert_to_html(&self, content: &str) -> String {
506    // Process directly without panic catching for better performance
507    let arena = Arena::new();
508    let options = self.comrak_options();
509    let root = parse_document(&arena, content, &options);
510
511    // Apply AST transformations
512    let prompt_transformer = PromptTransformer;
513    prompt_transformer.transform(root);
514
515    let mut html_output = String::new();
516    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
517      log::error!("Failed to format HTML: {e}");
518    }
519
520    // Post-process HTML to handle header anchors
521    Self::process_header_anchors_html(&html_output)
522  }
523
524  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
525  /// proper id attributes. Also adds auto-generated IDs to headers without
526  /// explicit anchors.
527  fn process_header_anchors_html(html: &str) -> String {
528    // First pass: explicit {#id} syntax
529    let result = HEADER_ANCHOR_RE
530      .replace_all(html, |caps: &regex::Captures| {
531        let level = &caps[1];
532        let prefix = &caps[2];
533        let id = &caps[3];
534        let suffix = &caps[4];
535        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
536      })
537      .to_string();
538
539    // Second pass: add auto-generated IDs to headers without id attribute
540    HEADER_NO_ID_RE
541      .replace_all(&result, |caps: &regex::Captures| {
542        let level = &caps[1];
543        let content = &caps[2];
544        // Strip HTML tags and slugify the text content
545        let text_only = HTML_TAG_RE.replace_all(content, "");
546        let id = utils::slugify(&text_only);
547        if id.is_empty() {
548          // If slugify produces empty string, keep header without id
549          format!("<h{level}>{content}</h{level}>")
550        } else {
551          format!("<h{level} id=\"{id}\">{content}</h{level}>")
552        }
553      })
554      .to_string()
555  }
556
557  /// Build comrak options from `MarkdownOptions` and feature flags.
558  fn comrak_options(&self) -> Options<'_> {
559    let mut options = Options::default();
560    // Markdown features present in GFM.
561    if self.options.gfm {
562      options.extension.table = true;
563      options.extension.footnotes = true;
564      options.extension.strikethrough = true;
565      options.extension.tasklist = true;
566      options.extension.superscript = true;
567      options.extension.autolink = true;
568    }
569
570    // Enable unsafe HTML references. This is not a security concern
571    // as all input is assumed to be trusted.
572    options.render.r#unsafe = true;
573
574    // Enable description lists but keep custom header processing
575    options.extension.header_id_prefix = None;
576    options.extension.description_lists = true;
577    options
578  }
579
580  /// Post-process HTML to enhance manpage references with URL links.
581  #[cfg(feature = "nixpkgs")]
582  fn process_manpage_references_html(&self, html: &str) -> String {
583    super::extensions::process_manpage_references(
584      html,
585      self.manpage_urls.as_ref(),
586    )
587  }
588
589  /// Post-process HTML to enhance manpage references (no-op when feature
590  /// disabled).
591  #[cfg(not(feature = "nixpkgs"))]
592  fn process_manpage_references_html(&self, html: &str) -> String {
593    html.to_string()
594  }
595
  /// HTML post-processing using kuchiki DOM manipulation.
  ///
  /// Delegates to the standalone `kuchiki_postprocess_html` helper, handing
  /// it a callback that runs `apply_dom_transformations` on the document.
  /// `self` is not read; the method exists so callers can use it like the
  /// other pipeline stages.
  #[allow(
    clippy::unused_self,
    reason = "Method signature matches processor pattern"
  )]
  fn kuchiki_postprocess(&self, html: &str) -> String {
    // Use a standalone function to avoid borrowing issues
    kuchiki_postprocess_html(html, |document| {
      Self::apply_dom_transformations(document);
    })
  }
607
  /// Apply all DOM transformations to the parsed HTML document.
  ///
  /// The passes run in the order listed and each one mutates `document` in
  /// place, so later passes see the output of earlier ones. In particular the
  /// list-item and paragraph anchor passes run before the generic
  /// text-node pass that picks up any `[]{#id}` markers still left over.
  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
    // Comment-based markers first.
    Self::process_list_item_id_markers(document);
    Self::process_header_anchor_comments(document);
    // Then `[]{#id}` inline anchors, from most specific to most general.
    Self::process_list_item_inline_anchors(document);
    Self::process_paragraph_inline_anchors(document);
    Self::process_remaining_inline_anchors(document);
    // Finally the link-related passes.
    Self::process_markdown_links(document);
    Self::process_option_anchor_links(document);
    Self::process_empty_auto_links(document);
    Self::process_empty_html_links(document);
  }
620
621  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
622  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
623    let mut to_modify = Vec::new();
624
625    for comment in document.inclusive_descendants() {
626      if let Some(comment_node) = comment.as_comment() {
627        let comment_text = comment_node.borrow();
628        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
629          let id = comment_text[id_start + 16..].trim();
630          if !id.is_empty()
631            && id
632              .chars()
633              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
634          {
635            // Check if this comment is inside an <li> element
636            if let Some(parent) = comment.parent()
637              && let Some(element) = parent.as_element()
638              && element.name.local.as_ref() == "li"
639            {
640              to_modify.push((comment.clone(), id.to_string()));
641            }
642          }
643        }
644      }
645    }
646
647    for (comment_node, id) in to_modify {
648      let span = kuchikikiki::NodeRef::new_element(
649        markup5ever::QualName::new(
650          None,
651          markup5ever::ns!(html),
652          local_name!("span"),
653        ),
654        vec![
655          (
656            kuchikikiki::ExpandedName::new("", "id"),
657            kuchikikiki::Attribute {
658              prefix: None,
659              value:  id,
660            },
661          ),
662          (
663            kuchikikiki::ExpandedName::new("", "class"),
664            kuchikikiki::Attribute {
665              prefix: None,
666              value:  "nixos-anchor".into(),
667            },
668          ),
669        ],
670      );
671      comment_node.insert_after(span);
672      comment_node.detach();
673    }
674  }
675
676  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
677  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
678    let mut to_modify = Vec::new();
679
680    for comment in document.inclusive_descendants() {
681      if let Some(comment_node) = comment.as_comment() {
682        let comment_text = comment_node.borrow();
683        if let Some(anchor_start) = comment_text.find("anchor:") {
684          let id = comment_text[anchor_start + 7..].trim();
685          if !id.is_empty()
686            && id
687              .chars()
688              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
689          {
690            // Check if this comment is inside a header element
691            if let Some(parent) = comment.parent()
692              && let Some(element) = parent.as_element()
693            {
694              let tag_name = element.name.local.as_ref();
695              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
696                to_modify.push((
697                  parent.clone(),
698                  comment.clone(),
699                  id.to_string(),
700                ));
701              }
702            }
703          }
704        }
705      }
706    }
707
708    for (header_element, comment_node, id) in to_modify {
709      if let Some(element) = header_element.as_element() {
710        element
711          .attributes
712          .borrow_mut()
713          .insert(local_name!("id"), id);
714        comment_node.detach();
715      }
716    }
717  }
718
719  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
720  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
721    for li_node in safe_select(document, "li") {
722      let li_element = li_node;
723
724      // Check if this list item contains code elements
725      let has_code = !safe_select(&li_element, "code, pre").is_empty();
726      if has_code {
727        continue; // Skip list items with code blocks
728      }
729
730      let text_content = li_element.text_contents();
731
732      if let Some(anchor_start) = text_content.find("[]{#")
733        && let Some(anchor_end) = text_content[anchor_start..].find('}')
734      {
735        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
736        if !id.is_empty()
737          && id
738            .chars()
739            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
740        {
741          let remaining_content =
742            &text_content[anchor_start + anchor_end + 1..];
743
744          // Clear current content and rebuild
745          for child in li_element.children() {
746            child.detach();
747          }
748
749          let span = kuchikikiki::NodeRef::new_element(
750            markup5ever::QualName::new(
751              None,
752              markup5ever::ns!(html),
753              local_name!("span"),
754            ),
755            vec![
756              (
757                kuchikikiki::ExpandedName::new("", "id"),
758                kuchikikiki::Attribute {
759                  prefix: None,
760                  value:  id.into(),
761                },
762              ),
763              (
764                kuchikikiki::ExpandedName::new("", "class"),
765                kuchikikiki::Attribute {
766                  prefix: None,
767                  value:  "nixos-anchor".into(),
768                },
769              ),
770            ],
771          );
772          li_element.append(span);
773          if !remaining_content.is_empty() {
774            li_element
775              .append(kuchikikiki::NodeRef::new_text(remaining_content));
776          }
777        }
778      }
779    }
780  }
781
782  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
783  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
784    for p_node in safe_select(document, "p") {
785      let p_element = p_node;
786
787      // Check if this paragraph contains code elements
788      let has_code = !safe_select(&p_element, "code, pre").is_empty();
789      if has_code {
790        continue; // Skip paragraphs with code blocks
791      }
792
793      let text_content = p_element.text_contents();
794
795      if let Some(anchor_start) = text_content.find("[]{#")
796        && let Some(anchor_end) = text_content[anchor_start..].find('}')
797      {
798        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
799        if !id.is_empty()
800          && id
801            .chars()
802            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
803        {
804          let remaining_content =
805            &text_content[anchor_start + anchor_end + 1..];
806
807          // Clear current content and rebuild
808          for child in p_element.children() {
809            child.detach();
810          }
811
812          let span = kuchikikiki::NodeRef::new_element(
813            markup5ever::QualName::new(
814              None,
815              markup5ever::ns!(html),
816              local_name!("span"),
817            ),
818            vec![
819              (
820                kuchikikiki::ExpandedName::new("", "id"),
821                kuchikikiki::Attribute {
822                  prefix: None,
823                  value:  id.into(),
824                },
825              ),
826              (
827                kuchikikiki::ExpandedName::new("", "class"),
828                kuchikikiki::Attribute {
829                  prefix: None,
830                  value:  "nixos-anchor".into(),
831                },
832              ),
833            ],
834          );
835          p_element.append(span);
836          if !remaining_content.is_empty() {
837            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
838          }
839        }
840      }
841    }
842  }
843
844  /// Process remaining standalone inline anchors throughout the document
845  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
846    let mut text_nodes_to_process = Vec::new();
847
848    for node in document.inclusive_descendants() {
849      if let Some(text_node) = node.as_text() {
850        // Check if this text node is inside a code block
851        let mut parent = node.parent();
852        let mut in_code = false;
853        while let Some(p) = parent {
854          if let Some(element) = p.as_element()
855            && (element.name.local == local_name!("code")
856              || element.name.local == local_name!("pre"))
857          {
858            in_code = true;
859            break;
860          }
861          parent = p.parent();
862        }
863
864        // Only process if not in code
865        if !in_code {
866          let text_content = text_node.borrow().clone();
867          if text_content.contains("[]{#") {
868            text_nodes_to_process.push((node.clone(), text_content));
869          }
870        }
871      }
872    }
873
874    for (text_node, text_content) in text_nodes_to_process {
875      let mut last_end = 0;
876      let mut new_children = Vec::new();
877
878      // Simple pattern matching for []{#id}
879      let chars = text_content.chars().collect::<Vec<_>>();
880      let mut i = 0;
881      while i < chars.len() {
882        if i + 4 < chars.len()
883          && chars[i] == '['
884          && chars[i + 1] == ']'
885          && chars[i + 2] == '{'
886          && chars[i + 3] == '#'
887        {
888          // Found start of anchor pattern
889          let anchor_start = i;
890          i += 4; // skip "[]{#"
891
892          let mut id = String::new();
893          while i < chars.len() && chars[i] != '}' {
894            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
895            {
896              id.push(chars[i]);
897              i += 1;
898            } else {
899              break;
900            }
901          }
902
903          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
904            // Valid anchor found
905            let anchor_end = i + 1;
906
907            // Add text before anchor
908            if anchor_start > last_end {
909              let before_text: String =
910                chars[last_end..anchor_start].iter().collect();
911              if !before_text.is_empty() {
912                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
913              }
914            }
915
916            // Add span element
917            let span = kuchikikiki::NodeRef::new_element(
918              markup5ever::QualName::new(
919                None,
920                markup5ever::ns!(html),
921                local_name!("span"),
922              ),
923              vec![
924                (
925                  kuchikikiki::ExpandedName::new("", "id"),
926                  kuchikikiki::Attribute {
927                    prefix: None,
928                    value:  id,
929                  },
930                ),
931                (
932                  kuchikikiki::ExpandedName::new("", "class"),
933                  kuchikikiki::Attribute {
934                    prefix: None,
935                    value:  "nixos-anchor".into(),
936                  },
937                ),
938              ],
939            );
940            new_children.push(span);
941
942            last_end = anchor_end;
943            i = anchor_end;
944          } else {
945            i += 1;
946          }
947        } else {
948          i += 1;
949        }
950      }
951
952      // Add remaining text
953      if last_end < chars.len() {
954        let after_text: String = chars[last_end..].iter().collect();
955        if !after_text.is_empty() {
956          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
957        }
958      }
959
960      // Replace text node if we found anchors
961      if !new_children.is_empty() {
962        for child in new_children {
963          text_node.insert_before(child);
964        }
965        text_node.detach();
966      }
967    }
968  }
969
970  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
971  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
972    for link_node in safe_select(document, "a") {
973      let link_element = link_node;
974      if let Some(element) = link_element.as_element() {
975        let href = element
976          .attributes
977          .borrow()
978          .get(local_name!("href"))
979          .map(std::string::ToString::to_string);
980        let text_content = link_element.text_contents();
981
982        if let Some(href_value) = href
983          && href_value.starts_with('#')
984          && (text_content.trim().is_empty()
985            || text_content.trim() == "{{ANCHOR}}")
986        {
987          // Clear placeholder text if present
988          if text_content.trim() == "{{ANCHOR}}" {
989            for child in link_element.children() {
990              child.detach();
991            }
992          }
993          // Empty link with anchor - add humanized text
994          let display_text = Self::humanize_anchor_id(&href_value);
995          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
996        }
997      }
998    }
999  }
1000
1001  /// Process empty HTML links that have no content
1002  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1003    for link_node in safe_select(document, "a[href^='#']") {
1004      let link_element = link_node;
1005      let text_content = link_element.text_contents();
1006
1007      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1008        // Clear placeholder text if present
1009        if text_content.trim() == "{{ANCHOR}}" {
1010          for child in link_element.children() {
1011            child.detach();
1012          }
1013        }
1014        if let Some(element) = link_element.as_element()
1015          && let Some(href) =
1016            element.attributes.borrow().get(local_name!("href"))
1017        {
1018          let display_text = Self::humanize_anchor_id(href);
1019          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1020        }
1021      }
1022    }
1023  }
1024
1025  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1026  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1027    let mut to_modify = Vec::new();
1028
1029    // Collect all option anchor links first
1030    for link_node in safe_select(document, "a[href^='#opt-']") {
1031      let link_element = link_node;
1032      if let Some(element) = link_element.as_element() {
1033        let href = element
1034          .attributes
1035          .borrow()
1036          .get(local_name!("href"))
1037          .map(std::string::ToString::to_string);
1038        let text_content = link_element.text_contents();
1039
1040        if let Some(href_value) = href
1041          && href_value.starts_with("#opt-")
1042        {
1043          let option_anchor = href_value[1..].to_string(); // remove the leading #
1044          let needs_text_replacement = text_content.trim().is_empty()
1045            || text_content.trim() == "{{ANCHOR}}";
1046          to_modify.push((
1047            link_element.clone(),
1048            option_anchor,
1049            needs_text_replacement,
1050          ));
1051        }
1052      }
1053    }
1054
1055    // Apply modifications
1056    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1057      if let Some(element) = link_element.as_element() {
1058        let new_href = format!("options.html#{option_anchor}");
1059        element
1060          .attributes
1061          .borrow_mut()
1062          .insert(local_name!("href"), new_href);
1063
1064        if needs_text_replacement {
1065          // Clear existing content
1066          for child in link_element.children() {
1067            child.detach();
1068          }
1069
1070          // Extract option name from anchor
1071          // opt-services-nginx-enable -> services.nginx.enable
1072          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1073            let option_name = option_path.replace('-', ".");
1074            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1075          }
1076        }
1077      }
1078    }
1079  }
1080
1081  /// Process markdown file links: convert .md hrefs to .html
1082  fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1083    for link_node in safe_select(document, "a") {
1084      let link_element = link_node;
1085      if let Some(element) = link_element.as_element() {
1086        let href = element
1087          .attributes
1088          .borrow()
1089          .get(local_name!("href"))
1090          .map(std::string::ToString::to_string);
1091
1092        if let Some(href_value) = href {
1093          // Only process relative links ending in .md (not absolute URLs, not
1094          // anchors)
1095          if !href_value.starts_with("http://")
1096            && !href_value.starts_with("https://")
1097            && !href_value.starts_with('#')
1098            && !href_value.starts_with("mailto:")
1099          {
1100            // Split off fragment (#) and query (?) to check the path extension
1101            let (path_part, suffix) = href_value
1102              .find(|c| c == '#' || c == '?')
1103              .map_or((href_value.as_str(), ""), |idx| {
1104                href_value.split_at(idx)
1105              });
1106
1107            if std::path::Path::new(path_part)
1108              .extension()
1109              .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1110            {
1111              let new_href =
1112                format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1113              element
1114                .attributes
1115                .borrow_mut()
1116                .insert(local_name!("href"), new_href);
1117            }
1118          }
1119        }
1120      }
1121    }
1122  }
1123
1124  /// Convert an anchor ID to human-readable text
1125  fn humanize_anchor_id(anchor: &str) -> String {
1126    // Strip the leading #
1127    let cleaned = anchor.trim_start_matches('#');
1128
1129    // Remove common prefixes
1130    let without_prefix = cleaned
1131      .trim_start_matches("sec-")
1132      .trim_start_matches("ssec-")
1133      .trim_start_matches("opt-");
1134
1135    // Replace separators with spaces
1136    let spaced = without_prefix.replace(['-', '_'], " ");
1137
1138    // Capitalize each word
1139    spaced
1140      .split_whitespace()
1141      .map(|word| {
1142        let mut chars = word.chars();
1143        chars.next().map_or_else(String::new, |c| {
1144          c.to_uppercase().collect::<String>() + chars.as_str()
1145        })
1146      })
1147      .collect::<Vec<String>>()
1148      .join(" ")
1149  }
1150}
1151
1152/// Extract all inline text from a heading node.
1153pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1154  fn inner<'a>(node: &'a AstNode<'a>) -> String {
1155    let mut text = String::new();
1156    for child in node.children() {
1157      match &child.data.borrow().value {
1158        NodeValue::Text(t) => text.push_str(t),
1159        NodeValue::Code(t) => text.push_str(&t.literal),
1160        NodeValue::Link(..)
1161        | NodeValue::Emph
1162        | NodeValue::Strong
1163        | NodeValue::Strikethrough
1164        | NodeValue::Superscript
1165        | NodeValue::Subscript
1166        | NodeValue::FootnoteReference(..) => {
1167          text.push_str(&inner(child));
1168        },
1169        #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1170        NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1171        _ => {},
1172      }
1173    }
1174    text
1175  }
1176  inner(node)
1177}
1178
1179/// Collect all markdown files from the input directory
1180pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1181  let mut files = Vec::with_capacity(100);
1182
1183  for entry in WalkDir::new(input_dir)
1184    .follow_links(true)
1185    .into_iter()
1186    .filter_map(Result::ok)
1187  {
1188    let path = entry.path();
1189    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1190      files.push(path.to_owned());
1191    }
1192  }
1193
1194  trace!("Found {} markdown files to process", files.len());
1195  files
1196}
1197
/// Capabilities that can be queried on a processor instance.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown (GFM) support.
  Gfm,
  /// Nixpkgs documentation extensions.
  Nixpkgs,
  /// Syntax highlighting for code blocks.
  SyntaxHighlighting,
  /// Manpage URL mapping support.
  ManpageUrls,
}
1210
1211/// Standalone HTML post-processing function to avoid borrowing issues.
1212fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1213where
1214  F: FnOnce(&kuchikikiki::NodeRef),
1215{
1216  process_safe(
1217    html,
1218    |html| {
1219      use tendril::TendrilSink;
1220
1221      let document = kuchikikiki::parse_html().one(html);
1222      transform_fn(&document);
1223
1224      let mut out = Vec::new();
1225      let _ = document.serialize(&mut out);
1226      String::from_utf8_lossy(&out).into_owned()
1227    },
1228    html,
1229  )
1230}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs