ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! This module contains the main implementation of `MarkdownProcessor` and its
4//! methods, focused on the core rendering pipeline and configuration
5//! management.
6use std::{
7  collections::HashMap,
8  path::{Path, PathBuf},
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use walkdir::WalkDir;
20
21/// Error type for DOM operations.
22#[derive(Debug, thiserror::Error)]
23pub enum DomError {
24  #[error("CSS selector failed: {0}")]
25  SelectorError(String),
26  #[error("DOM serialization failed: {0}")]
27  SerializationError(String),
28}
29
30/// Result type for DOM operations.
31pub type DomResult<T> = Result<T, DomError>;
32
33/// Safely select DOM elements with graceful error handling.
34fn safe_select(
35  document: &kuchikikiki::NodeRef,
36  selector: &str,
37) -> Vec<kuchikikiki::NodeRef> {
38  match document.select(selector) {
39    Ok(selections) => selections.map(|sel| sel.as_node().clone()).collect(),
40    Err(e) => {
41      log::warn!("DOM selector '{selector}' failed: {e:?}");
42      Vec::new()
43    },
44  }
45}
46
47use super::{
48  process::process_safe,
49  types::{
50    AstTransformer,
51    MarkdownOptions,
52    MarkdownProcessor,
53    PromptTransformer,
54  },
55};
56use crate::{
57  syntax::create_default_manager,
58  types::{Header, MarkdownResult},
59  utils,
60};
61
62impl MarkdownProcessor {
63  /// Create a new `MarkdownProcessor` with the given options.
64  #[must_use]
65  pub fn new(options: MarkdownOptions) -> Self {
66    let manpage_urls = options
67      .manpage_urls_path
68      .as_ref()
69      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
70
71    let syntax_manager = if options.highlight_code {
72      match create_default_manager() {
73        Ok(manager) => {
74          log::info!("Syntax highlighting initialized successfully");
75          Some(manager)
76        },
77        Err(e) => {
78          log::error!("Failed to initialize syntax highlighting: {e}");
79          log::warn!(
80            "Continuing without syntax highlighting - code blocks will not be \
81             highlighted"
82          );
83          None
84        },
85      }
86    } else {
87      None
88    };
89
90    Self {
91      options,
92      manpage_urls,
93      syntax_manager,
94      base_dir: std::path::PathBuf::from("."),
95    }
96  }
97
98  /// Access processor options.
99  #[must_use]
100  pub const fn options(&self) -> &MarkdownOptions {
101    &self.options
102  }
103
104  /// Set the base directory for resolving relative file paths.
105  #[must_use]
106  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
107    self.base_dir = base_dir.to_path_buf();
108    self
109  }
110
111  /// Check if a specific feature is enabled.
112  #[must_use]
113  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
114    match feature {
115      ProcessorFeature::Gfm => self.options.gfm,
116      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
117      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
118      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
119    }
120  }
121
122  /// Get the manpage URLs mapping for use with standalone functions.
123  #[must_use]
124  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
125    self.manpage_urls.as_ref()
126  }
127
128  /// Highlight all code blocks in HTML using the configured syntax highlighter
129  #[must_use]
130  pub fn highlight_codeblocks(&self, html: &str) -> String {
131    use kuchikikiki::parse_html;
132    use tendril::TendrilSink;
133
134    if !self.options.highlight_code || self.syntax_manager.is_none() {
135      return html.to_string();
136    }
137
138    let document = parse_html().one(html);
139
140    // Collect all code blocks first to avoid DOM modification during iteration
141    let mut code_blocks = Vec::new();
142    for pre_node in safe_select(&document, "pre > code") {
143      let code_node = pre_node;
144      if let Some(element) = code_node.as_element() {
145        let language = element
146          .attributes
147          .borrow()
148          .get("class")
149          .and_then(|class| class.strip_prefix("language-"))
150          .unwrap_or("text")
151          .to_string();
152        let code_text = code_node.text_contents();
153
154        if let Some(pre_parent) = code_node.parent() {
155          code_blocks.push((
156            pre_parent.clone(),
157            code_node.clone(),
158            code_text,
159            language,
160          ));
161        }
162      }
163    }
164
165    // Process each code block
166    for (pre_element, _code_node, code_text, language) in code_blocks {
167      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
168      {
169        // Wrap highlighted HTML in <pre><code> with appropriate classes
170        let wrapped_html = format!(
171          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
172        );
173        let fragment = parse_html().one(wrapped_html.as_str());
174        pre_element.insert_after(fragment);
175        pre_element.detach();
176      }
177      // Do not add highlight/language-* classes if not highlighted
178    }
179
180    let mut buf = Vec::new();
181    if let Err(e) = document.serialize(&mut buf) {
182      log::warn!("DOM serialization failed: {e:?}");
183      return html.to_string(); // Return original HTML if serialization fails
184    }
185    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
186  }
187
188  /// Handle hard tabs in code blocks according to configuration
189  fn handle_hardtabs(&self, code: &str) -> String {
190    use super::types::TabStyle;
191
192    // Check if there are any hard tabs
193    if !code.contains('\t') {
194      return code.to_string();
195    }
196
197    match self.options.tab_style {
198      // Do nothing
199      TabStyle::None => code.to_string(),
200
201      // Warn, but do nothing.
202      TabStyle::Warn => {
203        log::warn!(
204          "Hard tabs detected in code block. Consider using spaces for \
205           consistency. Tools like editorconfig may help you normalize spaces \
206           in your documents."
207        );
208        code.to_string()
209      },
210
211      // Do not warn, only inform in debug mode. Then return
212      // the updated code.
213      TabStyle::Normalize => {
214        log::debug!("Replacing hard tabs with spaces");
215        code.replace('\t', "  ")
216      },
217    }
218  }
219
220  /// Process hard tabs in code blocks within markdown content
221  fn process_hardtabs(&self, markdown: &str) -> String {
222    use super::types::TabStyle;
223
224    // If no tab handling is needed, return as-is
225    if self.options.tab_style == TabStyle::None {
226      return markdown.to_string();
227    }
228
229    let mut result = String::with_capacity(markdown.len());
230    let mut lines = markdown.lines().peekable();
231    let mut in_code_block = false;
232    let mut code_fence_char = None;
233    let mut code_fence_count = 0;
234
235    while let Some(line) = lines.next() {
236      let trimmed = line.trim_start();
237
238      // Check for code fences
239      if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
240        let Some(fence_char) = trimmed.chars().next() else {
241          // If the line is empty after trimming, it can't be a valid code fence
242          // Just continue processing the line normally
243          result.push_str(line);
244          result.push('\n');
245          continue;
246        };
247        let fence_count =
248          trimmed.chars().take_while(|&c| c == fence_char).count();
249
250        if fence_count >= 3 {
251          if !in_code_block {
252            // Starting a code block
253            in_code_block = true;
254            code_fence_char = Some(fence_char);
255            code_fence_count = fence_count;
256          } else if code_fence_char == Some(fence_char)
257            && fence_count >= code_fence_count
258          {
259            // Ending a code block
260            in_code_block = false;
261            code_fence_char = None;
262            code_fence_count = 0;
263          }
264        }
265      }
266
267      // Process line based on whether we're in a code block
268      let processed_line = if in_code_block && line.contains('\t') {
269        self.handle_hardtabs(line)
270      } else {
271        line.to_string()
272      };
273
274      result.push_str(&processed_line);
275
276      // Add newline unless this is the last line
277      if lines.peek().is_some() {
278        result.push('\n');
279      }
280    }
281
282    result
283  }
284
285  /// Highlight code using the configured syntax highlighter, returns HTML
286  /// string
287  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
288    if !self.options.highlight_code {
289      return None;
290    }
291
292    let syntax_manager = self.syntax_manager.as_ref()?;
293
294    syntax_manager
295      .highlight_code(code, language, self.options.highlight_theme.as_deref())
296      .ok()
297  }
298
299  /// Render Markdown to HTML, extracting headers and title.
300  #[must_use]
301  pub fn render(&self, markdown: &str) -> MarkdownResult {
302    let (preprocessed, included_files) = self.preprocess(markdown);
303    let (headers, title) = self.extract_headers(&preprocessed);
304    let html = self.process_html_pipeline(&preprocessed);
305
306    MarkdownResult {
307      html,
308      headers,
309      title,
310      included_files,
311    }
312  }
313
314  /// Process the HTML generation and post-processing pipeline.
315  fn process_html_pipeline(&self, content: &str) -> String {
316    let mut html = self.convert_to_html(content);
317
318    // Apply feature-specific post-processing
319    if cfg!(feature = "ndg-flavored") {
320      #[cfg(feature = "ndg-flavored")]
321      {
322        html = super::extensions::process_option_references(
323          &html,
324          self.options.valid_options.as_ref(),
325        );
326      }
327    }
328
329    if self.options.nixpkgs {
330      html = self.process_manpage_references_html(&html);
331    }
332
333    if self.options.highlight_code {
334      html = self.highlight_codeblocks(&html);
335    }
336
337    self.kuchiki_postprocess(&html)
338  }
339
340  /// Preprocess the markdown content with all enabled transformations.
341  fn preprocess(
342    &self,
343    content: &str,
344  ) -> (String, Vec<crate::types::IncludedFile>) {
345    let mut processed = content.to_string();
346    let mut included_files = Vec::new();
347
348    // Process MyST-style autolinks first
349    processed = super::extensions::process_myst_autolinks(&processed);
350
351    // Handle hard tabs in code blocks
352    processed = self.process_hardtabs(&processed);
353
354    if self.options.nixpkgs {
355      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
356      processed = content;
357      included_files = files;
358    }
359
360    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
361      processed = super::extensions::process_role_markup(
362        &processed,
363        self.manpage_urls.as_ref(),
364        self.options.auto_link_options,
365        self.options.valid_options.as_ref(),
366      );
367    }
368
369    (processed, included_files)
370  }
371
372  /// Apply Nixpkgs-specific preprocessing steps.
373  #[cfg(feature = "nixpkgs")]
374  fn apply_nixpkgs_preprocessing(
375    &self,
376    content: &str,
377  ) -> (String, Vec<crate::types::IncludedFile>) {
378    let (with_includes, included_files) =
379      match super::extensions::process_file_includes(content, &self.base_dir, 0)
380      {
381        Ok(result) => result,
382        Err(e) => {
383          log::warn!(
384            "File include processing failed: {e}. Continuing without includes."
385          );
386          (content.to_string(), Vec::new())
387        },
388      };
389    let with_blocks = super::extensions::process_block_elements(&with_includes);
390    let processed = super::extensions::process_inline_anchors(&with_blocks);
391    (processed, included_files)
392  }
393
394  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
395  #[cfg(not(feature = "nixpkgs"))]
396  fn apply_nixpkgs_preprocessing(
397    &self,
398    content: &str,
399  ) -> (String, Vec<crate::types::IncludedFile>) {
400    (content.to_string(), Vec::new())
401  }
402
403  /// Extract headers and title from the markdown content.
404  #[must_use]
405  pub fn extract_headers(
406    &self,
407    content: &str,
408  ) -> (Vec<Header>, Option<String>) {
409    use std::fmt::Write;
410
411    let arena = Arena::new();
412    let options = self.comrak_options();
413
414    // Normalize custom anchors with no heading level to h2
415    let mut normalized = String::with_capacity(content.len());
416    for line in content.lines() {
417      let trimmed = line.trim_end();
418      if !trimmed.starts_with('#') {
419        if let Some(anchor_start) = trimmed.rfind("{#") {
420          if let Some(anchor_end) = trimmed[anchor_start..].find('}') {
421            let text = trimmed[..anchor_start].trim_end();
422            let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
423            let _ = writeln!(normalized, "## {text} {{#{id}}}");
424            continue;
425          }
426        }
427      }
428      normalized.push_str(line);
429      normalized.push('\n');
430    }
431
432    let root = parse_document(&arena, &normalized, &options);
433
434    let mut headers = Vec::new();
435    let mut found_title = None;
436
437    for node in root.descendants() {
438      if let NodeValue::Heading(NodeHeading { level, .. }) =
439        &node.data.borrow().value
440      {
441        let mut text = String::new();
442        let mut explicit_id = None;
443
444        for child in node.children() {
445          match &child.data.borrow().value {
446            NodeValue::Text(t) => text.push_str(t),
447            NodeValue::Code(t) => text.push_str(&t.literal),
448            NodeValue::Link(..)
449            | NodeValue::Emph
450            | NodeValue::Strong
451            | NodeValue::Subscript
452            | NodeValue::Strikethrough
453            | NodeValue::Superscript
454            | NodeValue::FootnoteReference(..) => {
455              text.push_str(&extract_inline_text(child));
456            },
457            NodeValue::HtmlInline(html) => {
458              // Look for explicit anchor in HTML inline node: {#id}
459              let html_str = html.as_str();
460              if let Some(start) = html_str.find("{#") {
461                if let Some(end) = html_str[start..].find('}') {
462                  let anchor = &html_str[start + 2..start + end];
463                  explicit_id = Some(anchor.to_string());
464                }
465              }
466            },
467            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
468            NodeValue::Image(..) => {},
469            _ => {},
470          }
471        }
472
473        // Check for trailing {#id} in heading text
474        let trimmed = text.trim_end();
475        #[allow(clippy::option_if_let_else)]
476        // Nested options clearer with if-let
477        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
478          if let Some(end) = trimmed[start..].find('}') {
479            let anchor = &trimmed[start + 2..start + end];
480            (trimmed[..start].trim_end().to_string(), anchor.to_string())
481          } else {
482            (
483              text.clone(),
484              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
485            )
486          }
487        } else {
488          (
489            text.clone(),
490            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
491          )
492        };
493        if *level == 1 && found_title.is_none() {
494          found_title = Some(final_text.clone());
495        }
496        headers.push(Header {
497          text: final_text,
498          level: *level,
499          id,
500        });
501      }
502    }
503
504    (headers, found_title)
505  }
506
507  /// Convert markdown to HTML using comrak and configured options.
508  fn convert_to_html(&self, content: &str) -> String {
509    // Process directly without panic catching for better performance
510    let arena = Arena::new();
511    let options = self.comrak_options();
512    let root = parse_document(&arena, content, &options);
513
514    // Apply AST transformations
515    let prompt_transformer = PromptTransformer;
516    prompt_transformer.transform(root);
517
518    let mut html_output = String::new();
519    comrak::format_html(root, &options, &mut html_output).unwrap_or_default();
520
521    // Post-process HTML to handle header anchors
522    Self::process_header_anchors_html(&html_output)
523  }
524
525  /// Process header anchors in HTML by finding {#id} syntax and converting to
526  /// proper id attributes
527  fn process_header_anchors_html(html: &str) -> String {
528    use std::sync::LazyLock;
529
530    use regex::Regex;
531
532    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
533      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
534        .unwrap_or_else(|e| {
535          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
536          utils::never_matching_regex().unwrap_or_else(|_| {
537            // As a last resort, create a regex that matches nothing
538            #[allow(
539              clippy::expect_used,
540              reason = "This pattern is guaranteed to be valid"
541            )]
542            Regex::new(r"[^\s\S]")
543              .expect("regex pattern [^\\s\\S] should always compile")
544          })
545        })
546    });
547
548    HEADER_ANCHOR_RE
549      .replace_all(html, |caps: &regex::Captures| {
550        let level = &caps[1];
551        let prefix = &caps[2];
552        let id = &caps[3];
553        let suffix = &caps[4];
554        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
555      })
556      .to_string()
557  }
558
559  /// Build comrak options from `MarkdownOptions` and feature flags.
560  fn comrak_options(&self) -> Options<'_> {
561    let mut options = Options::default();
562    if self.options.gfm {
563      options.extension.table = true;
564      options.extension.footnotes = true;
565      options.extension.strikethrough = true;
566      options.extension.tasklist = true;
567      options.extension.superscript = true;
568      options.extension.autolink = true;
569    }
570    options.render.r#unsafe = true;
571    // Enable description lists but keep custom header processing
572    options.extension.header_ids = None;
573    options.extension.description_lists = true;
574    options
575  }
576
577  /// Post-process HTML to enhance manpage references with URL links.
578  #[cfg(feature = "nixpkgs")]
579  fn process_manpage_references_html(&self, html: &str) -> String {
580    super::extensions::process_manpage_references(
581      html,
582      self.manpage_urls.as_ref(),
583    )
584  }
585
586  /// Post-process HTML to enhance manpage references (no-op when feature
587  /// disabled).
588  #[cfg(not(feature = "nixpkgs"))]
589  fn process_manpage_references_html(&self, html: &str) -> String {
590    html.to_string()
591  }
592
593  /// HTML post-processing using kuchiki DOM manipulation.
594  #[allow(
595    clippy::unused_self,
596    reason = "Method signature matches processor pattern"
597  )]
598  fn kuchiki_postprocess(&self, html: &str) -> String {
599    // Use a standalone function to avoid borrowing issues
600    kuchiki_postprocess_html(html, |document| {
601      Self::apply_dom_transformations(document);
602    })
603  }
604
605  /// Apply all DOM transformations to the parsed HTML document.
606  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
607    Self::process_list_item_id_markers(document);
608    Self::process_header_anchor_comments(document);
609    Self::process_list_item_inline_anchors(document);
610    Self::process_paragraph_inline_anchors(document);
611    Self::process_remaining_inline_anchors(document);
612    Self::process_option_anchor_links(document);
613    Self::process_empty_auto_links(document);
614    Self::process_empty_html_links(document);
615  }
616
617  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
618  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
619    let mut to_modify = Vec::new();
620
621    for comment in document.inclusive_descendants() {
622      if let Some(comment_node) = comment.as_comment() {
623        let comment_text = comment_node.borrow();
624        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
625          let id = comment_text[id_start + 16..].trim();
626          if !id.is_empty()
627            && id
628              .chars()
629              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
630          {
631            // Check if this comment is inside an <li> element
632            if let Some(parent) = comment.parent() {
633              if let Some(element) = parent.as_element() {
634                if element.name.local.as_ref() == "li" {
635                  to_modify.push((comment.clone(), id.to_string()));
636                }
637              }
638            }
639          }
640        }
641      }
642    }
643
644    for (comment_node, id) in to_modify {
645      let span = kuchikikiki::NodeRef::new_element(
646        markup5ever::QualName::new(
647          None,
648          markup5ever::ns!(html),
649          local_name!("span"),
650        ),
651        vec![
652          (
653            kuchikikiki::ExpandedName::new("", "id"),
654            kuchikikiki::Attribute {
655              prefix: None,
656              value:  id,
657            },
658          ),
659          (
660            kuchikikiki::ExpandedName::new("", "class"),
661            kuchikikiki::Attribute {
662              prefix: None,
663              value:  "nixos-anchor".into(),
664            },
665          ),
666        ],
667      );
668      comment_node.insert_after(span);
669      comment_node.detach();
670    }
671  }
672
673  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
674  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
675    let mut to_modify = Vec::new();
676
677    for comment in document.inclusive_descendants() {
678      if let Some(comment_node) = comment.as_comment() {
679        let comment_text = comment_node.borrow();
680        if let Some(anchor_start) = comment_text.find("anchor:") {
681          let id = comment_text[anchor_start + 7..].trim();
682          if !id.is_empty()
683            && id
684              .chars()
685              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
686          {
687            // Check if this comment is inside a header element
688            if let Some(parent) = comment.parent() {
689              if let Some(element) = parent.as_element() {
690                let tag_name = element.name.local.as_ref();
691                if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
692                  to_modify.push((
693                    parent.clone(),
694                    comment.clone(),
695                    id.to_string(),
696                  ));
697                }
698              }
699            }
700          }
701        }
702      }
703    }
704
705    for (header_element, comment_node, id) in to_modify {
706      if let Some(element) = header_element.as_element() {
707        element
708          .attributes
709          .borrow_mut()
710          .insert(local_name!("id"), id);
711        comment_node.detach();
712      }
713    }
714  }
715
716  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
717  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
718    for li_node in safe_select(document, "li") {
719      let li_element = li_node;
720
721      // Check if this list item contains code elements
722      let has_code = !safe_select(&li_element, "code, pre").is_empty();
723      if has_code {
724        continue; // Skip list items with code blocks
725      }
726
727      let text_content = li_element.text_contents();
728
729      if let Some(anchor_start) = text_content.find("[]{#") {
730        if let Some(anchor_end) = text_content[anchor_start..].find('}') {
731          let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
732          if !id.is_empty()
733            && id
734              .chars()
735              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
736          {
737            let remaining_content =
738              &text_content[anchor_start + anchor_end + 1..];
739
740            // Clear current content and rebuild
741            for child in li_element.children() {
742              child.detach();
743            }
744
745            let span = kuchikikiki::NodeRef::new_element(
746              markup5ever::QualName::new(
747                None,
748                markup5ever::ns!(html),
749                local_name!("span"),
750              ),
751              vec![
752                (
753                  kuchikikiki::ExpandedName::new("", "id"),
754                  kuchikikiki::Attribute {
755                    prefix: None,
756                    value:  id.into(),
757                  },
758                ),
759                (
760                  kuchikikiki::ExpandedName::new("", "class"),
761                  kuchikikiki::Attribute {
762                    prefix: None,
763                    value:  "nixos-anchor".into(),
764                  },
765                ),
766              ],
767            );
768            li_element.append(span);
769            if !remaining_content.is_empty() {
770              li_element
771                .append(kuchikikiki::NodeRef::new_text(remaining_content));
772            }
773          }
774        }
775      }
776    }
777  }
778
779  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
780  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
781    for p_node in safe_select(document, "p") {
782      let p_element = p_node;
783
784      // Check if this paragraph contains code elements
785      let has_code = !safe_select(&p_element, "code, pre").is_empty();
786      if has_code {
787        continue; // Skip paragraphs with code blocks
788      }
789
790      let text_content = p_element.text_contents();
791
792      if let Some(anchor_start) = text_content.find("[]{#") {
793        if let Some(anchor_end) = text_content[anchor_start..].find('}') {
794          let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
795          if !id.is_empty()
796            && id
797              .chars()
798              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
799          {
800            let remaining_content =
801              &text_content[anchor_start + anchor_end + 1..];
802
803            // Clear current content and rebuild
804            for child in p_element.children() {
805              child.detach();
806            }
807
808            let span = kuchikikiki::NodeRef::new_element(
809              markup5ever::QualName::new(
810                None,
811                markup5ever::ns!(html),
812                local_name!("span"),
813              ),
814              vec![
815                (
816                  kuchikikiki::ExpandedName::new("", "id"),
817                  kuchikikiki::Attribute {
818                    prefix: None,
819                    value:  id.into(),
820                  },
821                ),
822                (
823                  kuchikikiki::ExpandedName::new("", "class"),
824                  kuchikikiki::Attribute {
825                    prefix: None,
826                    value:  "nixos-anchor".into(),
827                  },
828                ),
829              ],
830            );
831            p_element.append(span);
832            if !remaining_content.is_empty() {
833              p_element
834                .append(kuchikikiki::NodeRef::new_text(remaining_content));
835            }
836          }
837        }
838      }
839    }
840  }
841
842  /// Process remaining standalone inline anchors throughout the document
843  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
844    let mut text_nodes_to_process = Vec::new();
845
846    for node in document.inclusive_descendants() {
847      if let Some(text_node) = node.as_text() {
848        // Check if this text node is inside a code block
849        let mut parent = node.parent();
850        let mut in_code = false;
851        while let Some(p) = parent {
852          if let Some(element) = p.as_element() {
853            if element.name.local == local_name!("code")
854              || element.name.local == local_name!("pre")
855            {
856              in_code = true;
857              break;
858            }
859          }
860          parent = p.parent();
861        }
862
863        // Only process if not in code
864        if !in_code {
865          let text_content = text_node.borrow().clone();
866          if text_content.contains("[]{#") {
867            text_nodes_to_process.push((node.clone(), text_content));
868          }
869        }
870      }
871    }
872
873    for (text_node, text_content) in text_nodes_to_process {
874      let mut last_end = 0;
875      let mut new_children = Vec::new();
876
877      // Simple pattern matching for []{#id}
878      let chars = text_content.chars().collect::<Vec<_>>();
879      let mut i = 0;
880      while i < chars.len() {
881        if i + 4 < chars.len()
882          && chars[i] == '['
883          && chars[i + 1] == ']'
884          && chars[i + 2] == '{'
885          && chars[i + 3] == '#'
886        {
887          // Found start of anchor pattern
888          let anchor_start = i;
889          i += 4; // skip "[]{#"
890
891          let mut id = String::new();
892          while i < chars.len() && chars[i] != '}' {
893            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
894            {
895              id.push(chars[i]);
896              i += 1;
897            } else {
898              break;
899            }
900          }
901
902          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
903            // Valid anchor found
904            let anchor_end = i + 1;
905
906            // Add text before anchor
907            if anchor_start > last_end {
908              let before_text: String =
909                chars[last_end..anchor_start].iter().collect();
910              if !before_text.is_empty() {
911                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
912              }
913            }
914
915            // Add span element
916            let span = kuchikikiki::NodeRef::new_element(
917              markup5ever::QualName::new(
918                None,
919                markup5ever::ns!(html),
920                local_name!("span"),
921              ),
922              vec![
923                (
924                  kuchikikiki::ExpandedName::new("", "id"),
925                  kuchikikiki::Attribute {
926                    prefix: None,
927                    value:  id,
928                  },
929                ),
930                (
931                  kuchikikiki::ExpandedName::new("", "class"),
932                  kuchikikiki::Attribute {
933                    prefix: None,
934                    value:  "nixos-anchor".into(),
935                  },
936                ),
937              ],
938            );
939            new_children.push(span);
940
941            last_end = anchor_end;
942            i = anchor_end;
943          } else {
944            i += 1;
945          }
946        } else {
947          i += 1;
948        }
949      }
950
951      // Add remaining text
952      if last_end < chars.len() {
953        let after_text: String = chars[last_end..].iter().collect();
954        if !after_text.is_empty() {
955          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
956        }
957      }
958
959      // Replace text node if we found anchors
960      if !new_children.is_empty() {
961        for child in new_children {
962          text_node.insert_before(child);
963        }
964        text_node.detach();
965      }
966    }
967  }
968
969  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
970  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
971    for link_node in safe_select(document, "a") {
972      let link_element = link_node;
973      if let Some(element) = link_element.as_element() {
974        let href = element
975          .attributes
976          .borrow()
977          .get(local_name!("href"))
978          .map(std::string::ToString::to_string);
979        let text_content = link_element.text_contents();
980
981        if let Some(href_value) = href {
982          if href_value.starts_with('#')
983            && (text_content.trim().is_empty()
984              || text_content.trim() == "{{ANCHOR}}")
985          {
986            // Clear placeholder text if present
987            if text_content.trim() == "{{ANCHOR}}" {
988              for child in link_element.children() {
989                child.detach();
990              }
991            }
992            // Empty link with anchor - add humanized text
993            let display_text = Self::humanize_anchor_id(&href_value);
994            link_element.append(kuchikikiki::NodeRef::new_text(display_text));
995          }
996        }
997      }
998    }
999  }
1000
1001  /// Process empty HTML links that have no content
1002  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1003    for link_node in safe_select(document, "a[href^='#']") {
1004      let link_element = link_node;
1005      let text_content = link_element.text_contents();
1006
1007      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1008        // Clear placeholder text if present
1009        if text_content.trim() == "{{ANCHOR}}" {
1010          for child in link_element.children() {
1011            child.detach();
1012          }
1013        }
1014        if let Some(element) = link_element.as_element() {
1015          if let Some(href) =
1016            element.attributes.borrow().get(local_name!("href"))
1017          {
1018            let display_text = Self::humanize_anchor_id(href);
1019            link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1020          }
1021        }
1022      }
1023    }
1024  }
1025
1026  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1027  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1028    let mut to_modify = Vec::new();
1029
1030    // Collect all option anchor links first
1031    for link_node in safe_select(document, "a[href^='#opt-']") {
1032      let link_element = link_node;
1033      if let Some(element) = link_element.as_element() {
1034        let href = element
1035          .attributes
1036          .borrow()
1037          .get(local_name!("href"))
1038          .map(std::string::ToString::to_string);
1039        let text_content = link_element.text_contents();
1040
1041        if let Some(href_value) = href {
1042          if href_value.starts_with("#opt-") {
1043            let option_anchor = href_value[1..].to_string(); // remove the leading #
1044            let needs_text_replacement = text_content.trim().is_empty()
1045              || text_content.trim() == "{{ANCHOR}}";
1046            to_modify.push((
1047              link_element.clone(),
1048              option_anchor,
1049              needs_text_replacement,
1050            ));
1051          }
1052        }
1053      }
1054    }
1055
1056    // Apply modifications
1057    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1058      if let Some(element) = link_element.as_element() {
1059        let new_href = format!("options.html#{option_anchor}");
1060        element
1061          .attributes
1062          .borrow_mut()
1063          .insert(local_name!("href"), new_href);
1064
1065        if needs_text_replacement {
1066          // Clear existing content
1067          for child in link_element.children() {
1068            child.detach();
1069          }
1070
1071          // Extract option name from anchor
1072          // opt-services-nginx-enable -> services.nginx.enable
1073          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1074            let option_name = option_path.replace('-', ".");
1075            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1076          }
1077        }
1078      }
1079    }
1080  }
1081
1082  /// Convert an anchor ID to human-readable text
1083  fn humanize_anchor_id(anchor: &str) -> String {
1084    // Strip the leading #
1085    let cleaned = anchor.trim_start_matches('#');
1086
1087    // Remove common prefixes
1088    let without_prefix = cleaned
1089      .trim_start_matches("sec-")
1090      .trim_start_matches("ssec-")
1091      .trim_start_matches("opt-");
1092
1093    // Replace separators with spaces
1094    let spaced = without_prefix.replace(['-', '_'], " ");
1095
1096    // Capitalize each word
1097    spaced
1098      .split_whitespace()
1099      .map(|word| {
1100        let mut chars = word.chars();
1101        chars.next().map_or_else(String::new, |c| {
1102          c.to_uppercase().collect::<String>() + chars.as_str()
1103        })
1104      })
1105      .collect::<Vec<String>>()
1106      .join(" ")
1107  }
1108}
1109
1110/// Extract all inline text from a heading node.
1111pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1112  let mut text = String::new();
1113  for child in node.children() {
1114    match &child.data.borrow().value {
1115      NodeValue::Text(t) => text.push_str(t),
1116      NodeValue::Code(t) => text.push_str(&t.literal),
1117      NodeValue::Link(..)
1118      | NodeValue::Emph
1119      | NodeValue::Strong
1120      | NodeValue::Strikethrough
1121      | NodeValue::Superscript
1122      | NodeValue::Subscript
1123      | NodeValue::FootnoteReference(..) => {
1124        text.push_str(&extract_inline_text(child));
1125      },
1126      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1127      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1128      _ => {},
1129    }
1130  }
1131  text
1132}
1133
1134/// Collect all markdown files from the input directory
1135pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1136  let mut files = Vec::with_capacity(100);
1137
1138  for entry in WalkDir::new(input_dir)
1139    .follow_links(true)
1140    .into_iter()
1141    .filter_map(Result::ok)
1142  {
1143    let path = entry.path();
1144    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1145      files.push(path.to_owned());
1146    }
1147  }
1148
1149  trace!("Found {} markdown files to process", files.len());
1150  files
1151}
1152
/// Features that can be queried on a processor instance.
///
/// Values are small `Copy` tags that can be compared for equality and
/// hashed, so they may be stored in a `HashSet` or used as `HashMap` keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown support
  Gfm,
  /// Nixpkgs documentation extensions
  Nixpkgs,
  /// Syntax highlighting for code blocks
  SyntaxHighlighting,
  /// Manpage URL mapping support
  ManpageUrls,
}
1165
1166/// Standalone HTML post-processing function to avoid borrowing issues.
1167fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1168where
1169  F: FnOnce(&kuchikikiki::NodeRef),
1170{
1171  process_safe(
1172    html,
1173    |html| {
1174      use tendril::TendrilSink;
1175
1176      let document = kuchikikiki::parse_html().one(html);
1177      transform_fn(&document);
1178
1179      let mut out = Vec::new();
1180      document.serialize(&mut out).ok();
1181      String::from_utf8(out).unwrap_or_default()
1182    },
1183    html,
1184  )
1185}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs