ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5use std::{
6  collections::HashMap,
7  path::{Path, PathBuf},
8  sync::LazyLock,
9};
10
11use comrak::{
12  Arena,
13  nodes::{AstNode, NodeHeading, NodeValue},
14  options::Options,
15  parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use regex::Regex;
20use walkdir::WalkDir;
21
22use super::{
23  dom::safe_select,
24  process::process_safe,
25  types::{
26    AstTransformer,
27    MarkdownOptions,
28    MarkdownProcessor,
29    PromptTransformer,
30  },
31};
32use crate::{
33  syntax::create_default_manager,
34  types::{Header, MarkdownResult},
35  utils,
36};
37
38static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
39  Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
40    .unwrap_or_else(|e| {
41      log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
42      utils::never_matching_regex().unwrap_or_else(|_| {
43        #[allow(
44          clippy::expect_used,
45          reason = "This pattern is guaranteed to be valid"
46        )]
47        Regex::new(r"[^\s\S]")
48          .expect("regex pattern [^\\s\\S] should always compile")
49      })
50    })
51});
52
53static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
54  Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
55    log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
56    utils::never_matching_regex().unwrap_or_else(|_| {
57      #[allow(
58        clippy::expect_used,
59        reason = "This pattern is guaranteed to be valid"
60      )]
61      Regex::new(r"[^\s\S]")
62        .expect("regex pattern [^\\s\\S] should always compile")
63    })
64  })
65});
66
67static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
68  Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
69    log::error!("Failed to compile HTML_TAG_RE regex: {e}");
70    utils::never_matching_regex().unwrap_or_else(|_| {
71      #[allow(
72        clippy::expect_used,
73        reason = "This pattern is guaranteed to be valid"
74      )]
75      Regex::new(r"[^\s\S]")
76        .expect("regex pattern [^\\s\\S] should always compile")
77    })
78  })
79});
80
81impl MarkdownProcessor {
82  /// Create a new `MarkdownProcessor` with the given options.
83  #[must_use]
84  pub fn new(options: MarkdownOptions) -> Self {
85    let manpage_urls = options
86      .manpage_urls_path
87      .as_ref()
88      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
89
90    let syntax_manager = if options.highlight_code {
91      match create_default_manager(
92        options
93          .syntax_queries_path
94          .as_deref()
95          .map(std::path::Path::new),
96      ) {
97        Ok(manager) => {
98          log::info!("Syntax highlighting initialized successfully");
99          Some(manager)
100        },
101        Err(e) => {
102          log::error!("Failed to initialize syntax highlighting: {e}");
103          log::warn!(
104            "Continuing without syntax highlighting - code blocks will not be \
105             highlighted"
106          );
107          None
108        },
109      }
110    } else {
111      None
112    };
113
114    Self {
115      options,
116      manpage_urls,
117      syntax_manager,
118      base_dir: std::path::PathBuf::from("."),
119    }
120  }
121
122  /// Access processor options.
123  #[must_use]
124  pub const fn options(&self) -> &MarkdownOptions {
125    &self.options
126  }
127
128  /// Set the base directory for resolving relative file paths.
129  #[must_use]
130  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
131    self.base_dir = base_dir.to_path_buf();
132    self
133  }
134
135  /// Check if a specific feature is enabled.
136  #[must_use]
137  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
138    match feature {
139      ProcessorFeature::Gfm => self.options.gfm,
140      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
141      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
142      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
143    }
144  }
145
146  /// Get the manpage URLs mapping for use with standalone functions.
147  #[must_use]
148  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
149    self.manpage_urls.as_ref()
150  }
151
152  /// Highlight all code blocks in HTML using the configured syntax highlighter
153  #[must_use]
154  pub fn highlight_codeblocks(&self, html: &str) -> String {
155    use kuchikikiki::parse_html;
156    use tendril::TendrilSink;
157
158    if !self.options.highlight_code || self.syntax_manager.is_none() {
159      return html.to_string();
160    }
161
162    let document = parse_html().one(html);
163
164    // Collect all code blocks first to avoid DOM modification during iteration
165    let mut code_blocks = Vec::new();
166    for pre_node in safe_select(&document, "pre > code") {
167      let code_node = pre_node;
168      if let Some(element) = code_node.as_element() {
169        let language = element
170          .attributes
171          .borrow()
172          .get("class")
173          .and_then(|class| class.strip_prefix("language-"))
174          .unwrap_or("text")
175          .to_string();
176        let code_text = code_node.text_contents();
177
178        if let Some(pre_parent) = code_node.parent() {
179          code_blocks.push((
180            pre_parent.clone(),
181            code_node.clone(),
182            code_text,
183            language,
184          ));
185        }
186      }
187    }
188
189    // Process each code block
190    for (pre_element, _code_node, code_text, language) in code_blocks {
191      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
192      {
193        // Wrap highlighted HTML in <pre><code> with appropriate classes
194        let wrapped_html = format!(
195          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
196        );
197        let fragment = parse_html().one(wrapped_html.as_str());
198        pre_element.insert_after(fragment);
199        pre_element.detach();
200      }
201      // Do not add highlight/language-* classes if not highlighted
202    }
203
204    let mut buf = Vec::new();
205    if let Err(e) = document.serialize(&mut buf) {
206      log::warn!("DOM serialization failed: {e:?}");
207      return html.to_string(); // Return original HTML if serialization fails
208    }
209    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
210  }
211
212  /// Handle hard tabs in code blocks according to configuration
213  fn handle_hardtabs(&self, code: &str) -> String {
214    use super::types::TabStyle;
215
216    // Check if there are any hard tabs
217    if !code.contains('\t') {
218      return code.to_string();
219    }
220
221    match self.options.tab_style {
222      // Do nothing
223      TabStyle::None => code.to_string(),
224
225      // Warn, but do nothing.
226      TabStyle::Warn => {
227        log::warn!(
228          "Hard tabs detected in code block. Consider using spaces for \
229           consistency. Tools like editorconfig may help you normalize spaces \
230           in your documents."
231        );
232        code.to_string()
233      },
234
235      // Do not warn, only inform in debug mode. Then return
236      // the updated code.
237      TabStyle::Normalize => {
238        log::debug!("Replacing hard tabs with spaces");
239        code.replace('\t', "  ")
240      },
241    }
242  }
243
244  /// Process hard tabs in code blocks within markdown content
245  fn process_hardtabs(&self, markdown: &str) -> String {
246    use super::types::TabStyle;
247    use crate::utils::codeblock::FenceTracker;
248
249    // If no tab handling is needed, return as-is
250    if self.options.tab_style == TabStyle::None {
251      return markdown.to_string();
252    }
253
254    let mut result = String::with_capacity(markdown.len());
255    let mut lines = markdown.lines().peekable();
256    let mut tracker = FenceTracker::new();
257
258    while let Some(line) = lines.next() {
259      tracker = tracker.process_line(line);
260
261      // Only replace tabs inside fenced code blocks
262      let processed_line = if tracker.in_code_block() && line.contains('\t') {
263        self.handle_hardtabs(line)
264      } else {
265        line.to_string()
266      };
267
268      result.push_str(&processed_line);
269
270      // Add newline unless this is the last line
271      if lines.peek().is_some() {
272        result.push('\n');
273      }
274    }
275
276    result
277  }
278
279  /// Highlight code using the configured syntax highlighter, returns HTML
280  /// string
281  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
282    if !self.options.highlight_code {
283      return None;
284    }
285
286    let syntax_manager = self.syntax_manager.as_ref()?;
287
288    syntax_manager
289      .highlight_code(code, language, self.options.highlight_theme.as_deref())
290      .ok()
291  }
292
293  /// Render Markdown to HTML, extracting headers and title.
294  #[must_use]
295  pub fn render(&self, markdown: &str) -> MarkdownResult {
296    let (preprocessed, included_files) = self.preprocess(markdown);
297    let (headers, title) = self.extract_headers(&preprocessed);
298    let html = self.process_html_pipeline(&preprocessed);
299
300    MarkdownResult {
301      html,
302      headers,
303      title,
304      included_files,
305    }
306  }
307
308  /// Process the HTML generation and post-processing pipeline.
309  fn process_html_pipeline(&self, content: &str) -> String {
310    let mut html = self.convert_to_html(content);
311
312    // Apply feature-specific post-processing
313    if cfg!(feature = "ndg-flavored") {
314      #[cfg(feature = "ndg-flavored")]
315      {
316        html = super::extensions::process_option_references(
317          &html,
318          self.options.valid_options.as_ref(),
319        );
320      }
321    }
322
323    if self.options.nixpkgs {
324      html = self.process_manpage_references_html(&html);
325    }
326
327    if self.options.highlight_code {
328      html = self.highlight_codeblocks(&html);
329    }
330
331    self.kuchiki_postprocess(&html)
332  }
333
334  /// Preprocess the markdown content with all enabled transformations.
335  fn preprocess(
336    &self,
337    content: &str,
338  ) -> (String, Vec<crate::types::IncludedFile>) {
339    let mut processed = content.to_string();
340    let mut included_files = Vec::new();
341
342    // Process MyST-style autolinks first
343    processed = super::extensions::process_myst_autolinks(&processed);
344
345    // Handle hard tabs in code blocks
346    processed = self.process_hardtabs(&processed);
347
348    if self.options.nixpkgs {
349      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
350      processed = content;
351      included_files = files;
352    }
353
354    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
355      processed = super::extensions::process_role_markup(
356        &processed,
357        self.manpage_urls.as_ref(),
358        self.options.auto_link_options,
359        self.options.valid_options.as_ref(),
360      );
361    }
362
363    #[cfg(feature = "wiki")]
364    {
365      processed = super::extensions::process_wikilinks(&processed);
366    }
367
368    (processed, included_files)
369  }
370
371  /// Apply Nixpkgs-specific preprocessing steps.
372  #[cfg(feature = "nixpkgs")]
373  fn apply_nixpkgs_preprocessing(
374    &self,
375    content: &str,
376  ) -> (String, Vec<crate::types::IncludedFile>) {
377    let (with_includes, included_files) =
378      match super::extensions::process_file_includes(content, &self.base_dir, 0)
379      {
380        Ok(result) => result,
381        Err(e) => {
382          log::warn!(
383            "File include processing failed: {e}. Continuing without includes."
384          );
385          (content.to_string(), Vec::new())
386        },
387      };
388    let with_blocks = super::extensions::process_block_elements(&with_includes);
389    let with_spans = super::extensions::process_bracketed_spans(&with_blocks);
390    let processed = super::extensions::process_inline_anchors(&with_spans);
391    (processed, included_files)
392  }
393
394  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
395  #[cfg(not(feature = "nixpkgs"))]
396  fn apply_nixpkgs_preprocessing(
397    &self,
398    content: &str,
399  ) -> (String, Vec<crate::types::IncludedFile>) {
400    (content.to_string(), Vec::new())
401  }
402
403  /// Extract headers and title from the markdown content.
404  #[must_use]
405  pub fn extract_headers(
406    &self,
407    content: &str,
408  ) -> (Vec<Header>, Option<String>) {
409    use std::fmt::Write;
410
411    let arena = Arena::new();
412    let options = self.comrak_options();
413
414    let content = remove_admonition_blocks_for_headers(content);
415
416    // Normalize custom anchors with no heading level to h2
417    let mut normalized = String::with_capacity(content.len());
418    let mut lines = content.lines().peekable();
419    while let Some(line) = lines.next() {
420      let trimmed = line.trim();
421      if !trimmed.starts_with('#')
422        && !lines
423          .peek()
424          .is_some_and(|next| is_setext_heading_underline(next.trim()))
425        && let Some(anchor_start) = trimmed.rfind("{#")
426        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
427      {
428        let text = trimmed[..anchor_start].trim_end();
429        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
430        let _ = writeln!(normalized, "## {text} {{#{id}}}");
431        continue;
432      }
433      normalized.push_str(line);
434      normalized.push('\n');
435    }
436
437    let root = parse_document(&arena, &normalized, &options);
438
439    let mut headers = Vec::new();
440    let mut found_title = None;
441
442    for node in root.descendants() {
443      if let NodeValue::Heading(NodeHeading { level, .. }) =
444        &node.data.borrow().value
445      {
446        let mut text = String::new();
447        let mut explicit_id = None;
448
449        for child in node.children() {
450          match &child.data.borrow().value {
451            NodeValue::Text(t) => text.push_str(t),
452            NodeValue::Code(t) => text.push_str(&t.literal),
453            NodeValue::Link(..)
454            | NodeValue::Emph
455            | NodeValue::Strong
456            | NodeValue::Subscript
457            | NodeValue::Strikethrough
458            | NodeValue::Superscript
459            | NodeValue::FootnoteReference(..) => {
460              text.push_str(&extract_inline_text(child));
461            },
462            NodeValue::HtmlInline(html) => {
463              // Look for explicit anchor in HTML inline node: {#id}
464              let html_str = html.as_str();
465              if let Some(start) = html_str.find("{#")
466                && let Some(end) = html_str[start..].find('}')
467              {
468                let anchor = &html_str[start + 2..start + end];
469                explicit_id = Some(anchor.to_string());
470              }
471            },
472            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
473            NodeValue::Image(..) => {},
474            _ => {},
475          }
476        }
477
478        // Check for trailing {#id} in heading text
479        let trimmed = text.trim_end();
480        #[allow(clippy::option_if_let_else)]
481        // Nested options clearer with if-let
482        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
483          if let Some(end) = trimmed[start..].find('}') {
484            let anchor = &trimmed[start + 2..start + end];
485            (trimmed[..start].trim_end().to_string(), anchor.to_string())
486          } else {
487            (
488              text.clone(),
489              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
490            )
491          }
492        } else {
493          (
494            text.clone(),
495            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
496          )
497        };
498        if *level == 1 && found_title.is_none() {
499          found_title = Some(final_text.clone());
500        }
501        headers.push(Header {
502          text: final_text,
503          level: *level,
504          id,
505        });
506      }
507    }
508
509    (headers, found_title)
510  }
511
512  /// Convert markdown to HTML using comrak and configured options.
513  fn convert_to_html(&self, content: &str) -> String {
514    // Process directly without panic catching for better performance
515    let arena = Arena::new();
516    let options = self.comrak_options();
517    let root = parse_document(&arena, content, &options);
518
519    // Apply AST transformations
520    let prompt_transformer = PromptTransformer;
521    prompt_transformer.transform(root);
522
523    let mut html_output = String::new();
524    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
525      log::error!("Failed to format HTML: {e}");
526    }
527
528    // Post-process HTML to handle header anchors
529    Self::process_header_anchors_html(&html_output)
530  }
531
532  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
533  /// proper id attributes. Also adds auto-generated IDs to headers without
534  /// explicit anchors.
535  fn process_header_anchors_html(html: &str) -> String {
536    // First pass: explicit {#id} syntax
537    let result = HEADER_ANCHOR_RE
538      .replace_all(html, |caps: &regex::Captures| {
539        let level = &caps[1];
540        let prefix = &caps[2];
541        let id = &caps[3];
542        let suffix = &caps[4];
543        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
544      })
545      .to_string();
546
547    // Second pass: add auto-generated IDs to headers without id attribute
548    HEADER_NO_ID_RE
549      .replace_all(&result, |caps: &regex::Captures| {
550        let level = &caps[1];
551        let content = &caps[2];
552        // Strip HTML tags and slugify the text content
553        let text_only = HTML_TAG_RE.replace_all(content, "");
554        let id = utils::slugify(&text_only);
555        if id.is_empty() {
556          // If slugify produces empty string, keep header without id
557          format!("<h{level}>{content}</h{level}>")
558        } else {
559          format!("<h{level} id=\"{id}\">{content}</h{level}>")
560        }
561      })
562      .to_string()
563  }
564
565  /// Build comrak options from `MarkdownOptions` and feature flags.
566  fn comrak_options(&self) -> Options<'_> {
567    let mut options = Options::default();
568    // Markdown features present in GFM.
569    if self.options.gfm {
570      options.extension.table = true;
571      options.extension.footnotes = true;
572      options.extension.strikethrough = true;
573      options.extension.tasklist = true;
574      options.extension.superscript = true;
575      options.extension.autolink = true;
576    }
577
578    // Enable unsafe HTML references. This is not a security concern
579    // as all input is assumed to be trusted.
580    options.render.r#unsafe = true;
581
582    // Enable description lists but keep custom header processing
583    options.extension.header_id_prefix = None;
584    options.extension.description_lists = true;
585    options
586  }
587
588  /// Post-process HTML to enhance manpage references with URL links.
589  #[cfg(feature = "nixpkgs")]
590  fn process_manpage_references_html(&self, html: &str) -> String {
591    super::extensions::process_manpage_references(
592      html,
593      self.manpage_urls.as_ref(),
594    )
595  }
596
597  /// Post-process HTML to enhance manpage references (no-op when feature
598  /// disabled).
599  #[cfg(not(feature = "nixpkgs"))]
600  fn process_manpage_references_html(&self, html: &str) -> String {
601    html.to_string()
602  }
603
604  /// HTML post-processing using kuchiki DOM manipulation.
605  #[allow(
606    clippy::unused_self,
607    reason = "Method signature matches processor pattern"
608  )]
609  fn kuchiki_postprocess(&self, html: &str) -> String {
610    // Use a standalone function to avoid borrowing issues
611    kuchiki_postprocess_html(html, |document| {
612      Self::apply_dom_transformations(document);
613    })
614  }
615
616  /// Apply all DOM transformations to the parsed HTML document.
617  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
618    Self::process_list_item_id_markers(document);
619    Self::process_header_anchor_comments(document);
620    Self::process_list_item_inline_anchors(document);
621    Self::process_paragraph_inline_anchors(document);
622    Self::process_remaining_inline_anchors(document);
623    Self::process_markdown_links(document);
624    Self::process_option_anchor_links(document);
625    Self::process_empty_auto_links(document);
626    Self::process_empty_html_links(document);
627  }
628
629  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
630  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
631    let mut to_modify = Vec::new();
632
633    for comment in document.inclusive_descendants() {
634      if let Some(comment_node) = comment.as_comment() {
635        let comment_text = comment_node.borrow();
636        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
637          let id = comment_text[id_start + 16..].trim();
638          if !id.is_empty()
639            && id
640              .chars()
641              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
642          {
643            // Check if this comment is inside an <li> element
644            if let Some(parent) = comment.parent()
645              && let Some(element) = parent.as_element()
646              && element.name.local.as_ref() == "li"
647            {
648              to_modify.push((comment.clone(), id.to_string()));
649            }
650          }
651        }
652      }
653    }
654
655    for (comment_node, id) in to_modify {
656      let span = kuchikikiki::NodeRef::new_element(
657        markup5ever::QualName::new(
658          None,
659          markup5ever::ns!(html),
660          local_name!("span"),
661        ),
662        vec![
663          (
664            kuchikikiki::ExpandedName::new("", "id"),
665            kuchikikiki::Attribute {
666              prefix: None,
667              value:  id,
668            },
669          ),
670          (
671            kuchikikiki::ExpandedName::new("", "class"),
672            kuchikikiki::Attribute {
673              prefix: None,
674              value:  "nixos-anchor".into(),
675            },
676          ),
677        ],
678      );
679      comment_node.insert_after(span);
680      comment_node.detach();
681    }
682  }
683
684  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
685  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
686    let mut to_modify = Vec::new();
687
688    for comment in document.inclusive_descendants() {
689      if let Some(comment_node) = comment.as_comment() {
690        let comment_text = comment_node.borrow();
691        if let Some(anchor_start) = comment_text.find("anchor:") {
692          let id = comment_text[anchor_start + 7..].trim();
693          if !id.is_empty()
694            && id
695              .chars()
696              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
697          {
698            // Check if this comment is inside a header element
699            if let Some(parent) = comment.parent()
700              && let Some(element) = parent.as_element()
701            {
702              let tag_name = element.name.local.as_ref();
703              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
704                to_modify.push((
705                  parent.clone(),
706                  comment.clone(),
707                  id.to_string(),
708                ));
709              }
710            }
711          }
712        }
713      }
714    }
715
716    for (header_element, comment_node, id) in to_modify {
717      if let Some(element) = header_element.as_element() {
718        element
719          .attributes
720          .borrow_mut()
721          .insert(local_name!("id"), id);
722        comment_node.detach();
723      }
724    }
725  }
726
727  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
728  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
729    for li_node in safe_select(document, "li") {
730      let li_element = li_node;
731
732      // Check if this list item contains code elements
733      let has_code = !safe_select(&li_element, "code, pre").is_empty();
734      if has_code {
735        continue; // Skip list items with code blocks
736      }
737
738      let text_content = li_element.text_contents();
739
740      if let Some(anchor_start) = text_content.find("[]{#")
741        && let Some(anchor_end) = text_content[anchor_start..].find('}')
742      {
743        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
744        if !id.is_empty()
745          && id
746            .chars()
747            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
748        {
749          let remaining_content =
750            &text_content[anchor_start + anchor_end + 1..];
751
752          // Clear current content and rebuild
753          for child in li_element.children() {
754            child.detach();
755          }
756
757          let span = kuchikikiki::NodeRef::new_element(
758            markup5ever::QualName::new(
759              None,
760              markup5ever::ns!(html),
761              local_name!("span"),
762            ),
763            vec![
764              (
765                kuchikikiki::ExpandedName::new("", "id"),
766                kuchikikiki::Attribute {
767                  prefix: None,
768                  value:  id.into(),
769                },
770              ),
771              (
772                kuchikikiki::ExpandedName::new("", "class"),
773                kuchikikiki::Attribute {
774                  prefix: None,
775                  value:  "nixos-anchor".into(),
776                },
777              ),
778            ],
779          );
780          li_element.append(span);
781          if !remaining_content.is_empty() {
782            li_element
783              .append(kuchikikiki::NodeRef::new_text(remaining_content));
784          }
785        }
786      }
787    }
788  }
789
790  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
791  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
792    for p_node in safe_select(document, "p") {
793      let p_element = p_node;
794
795      // Check if this paragraph contains code elements
796      let has_code = !safe_select(&p_element, "code, pre").is_empty();
797      if has_code {
798        continue; // Skip paragraphs with code blocks
799      }
800
801      let text_content = p_element.text_contents();
802
803      if let Some(anchor_start) = text_content.find("[]{#")
804        && let Some(anchor_end) = text_content[anchor_start..].find('}')
805      {
806        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
807        if !id.is_empty()
808          && id
809            .chars()
810            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
811        {
812          let remaining_content =
813            &text_content[anchor_start + anchor_end + 1..];
814
815          // Clear current content and rebuild
816          for child in p_element.children() {
817            child.detach();
818          }
819
820          let span = kuchikikiki::NodeRef::new_element(
821            markup5ever::QualName::new(
822              None,
823              markup5ever::ns!(html),
824              local_name!("span"),
825            ),
826            vec![
827              (
828                kuchikikiki::ExpandedName::new("", "id"),
829                kuchikikiki::Attribute {
830                  prefix: None,
831                  value:  id.into(),
832                },
833              ),
834              (
835                kuchikikiki::ExpandedName::new("", "class"),
836                kuchikikiki::Attribute {
837                  prefix: None,
838                  value:  "nixos-anchor".into(),
839                },
840              ),
841            ],
842          );
843          p_element.append(span);
844          if !remaining_content.is_empty() {
845            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
846          }
847        }
848      }
849    }
850  }
851
852  /// Process remaining standalone inline anchors throughout the document
853  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
854    let mut text_nodes_to_process = Vec::new();
855
856    for node in document.inclusive_descendants() {
857      if let Some(text_node) = node.as_text() {
858        // Check if this text node is inside a code block
859        let mut parent = node.parent();
860        let mut in_code = false;
861        while let Some(p) = parent {
862          if let Some(element) = p.as_element()
863            && (element.name.local == local_name!("code")
864              || element.name.local == local_name!("pre"))
865          {
866            in_code = true;
867            break;
868          }
869          parent = p.parent();
870        }
871
872        // Only process if not in code
873        if !in_code {
874          let text_content = text_node.borrow().clone();
875          if text_content.contains("[]{#") {
876            text_nodes_to_process.push((node.clone(), text_content));
877          }
878        }
879      }
880    }
881
882    for (text_node, text_content) in text_nodes_to_process {
883      let mut last_end = 0;
884      let mut new_children = Vec::new();
885
886      // Simple pattern matching for []{#id}
887      let chars = text_content.chars().collect::<Vec<_>>();
888      let mut i = 0;
889      while i < chars.len() {
890        if i + 4 < chars.len()
891          && chars[i] == '['
892          && chars[i + 1] == ']'
893          && chars[i + 2] == '{'
894          && chars[i + 3] == '#'
895        {
896          // Found start of anchor pattern
897          let anchor_start = i;
898          i += 4; // skip "[]{#"
899
900          let mut id = String::new();
901          while i < chars.len() && chars[i] != '}' {
902            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
903            {
904              id.push(chars[i]);
905              i += 1;
906            } else {
907              break;
908            }
909          }
910
911          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
912            // Valid anchor found
913            let anchor_end = i + 1;
914
915            // Add text before anchor
916            if anchor_start > last_end {
917              let before_text: String =
918                chars[last_end..anchor_start].iter().collect();
919              if !before_text.is_empty() {
920                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
921              }
922            }
923
924            // Add span element
925            let span = kuchikikiki::NodeRef::new_element(
926              markup5ever::QualName::new(
927                None,
928                markup5ever::ns!(html),
929                local_name!("span"),
930              ),
931              vec![
932                (
933                  kuchikikiki::ExpandedName::new("", "id"),
934                  kuchikikiki::Attribute {
935                    prefix: None,
936                    value:  id,
937                  },
938                ),
939                (
940                  kuchikikiki::ExpandedName::new("", "class"),
941                  kuchikikiki::Attribute {
942                    prefix: None,
943                    value:  "nixos-anchor".into(),
944                  },
945                ),
946              ],
947            );
948            new_children.push(span);
949
950            last_end = anchor_end;
951            i = anchor_end;
952          } else {
953            i += 1;
954          }
955        } else {
956          i += 1;
957        }
958      }
959
960      // Add remaining text
961      if last_end < chars.len() {
962        let after_text: String = chars[last_end..].iter().collect();
963        if !after_text.is_empty() {
964          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
965        }
966      }
967
968      // Replace text node if we found anchors
969      if !new_children.is_empty() {
970        for child in new_children {
971          text_node.insert_before(child);
972        }
973        text_node.detach();
974      }
975    }
976  }
977
978  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
979  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
980    for link_node in safe_select(document, "a") {
981      let link_element = link_node;
982      if let Some(element) = link_element.as_element() {
983        let href = element
984          .attributes
985          .borrow()
986          .get(local_name!("href"))
987          .map(std::string::ToString::to_string);
988        let text_content = link_element.text_contents();
989
990        if let Some(href_value) = href
991          && href_value.starts_with('#')
992          && (text_content.trim().is_empty()
993            || text_content.trim() == "{{ANCHOR}}")
994        {
995          // Clear placeholder text if present
996          if text_content.trim() == "{{ANCHOR}}" {
997            for child in link_element.children() {
998              child.detach();
999            }
1000          }
1001          // Empty link with anchor - add humanized text
1002          let display_text = Self::humanize_anchor_id(&href_value);
1003          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1004        }
1005      }
1006    }
1007  }
1008
1009  /// Process empty HTML links that have no content
1010  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1011    for link_node in safe_select(document, "a[href^='#']") {
1012      let link_element = link_node;
1013      let text_content = link_element.text_contents();
1014
1015      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1016        // Clear placeholder text if present
1017        if text_content.trim() == "{{ANCHOR}}" {
1018          for child in link_element.children() {
1019            child.detach();
1020          }
1021        }
1022        if let Some(element) = link_element.as_element()
1023          && let Some(href) =
1024            element.attributes.borrow().get(local_name!("href"))
1025        {
1026          let display_text = Self::humanize_anchor_id(href);
1027          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1028        }
1029      }
1030    }
1031  }
1032
1033  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1034  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1035    let mut to_modify = Vec::new();
1036
1037    // Collect all option anchor links first
1038    for link_node in safe_select(document, "a[href^='#opt-']") {
1039      let link_element = link_node;
1040      if let Some(element) = link_element.as_element() {
1041        let href = element
1042          .attributes
1043          .borrow()
1044          .get(local_name!("href"))
1045          .map(std::string::ToString::to_string);
1046        let text_content = link_element.text_contents();
1047
1048        if let Some(href_value) = href
1049          && href_value.starts_with("#opt-")
1050        {
1051          let option_anchor = href_value[1..].to_string(); // remove the leading #
1052          let needs_text_replacement = text_content.trim().is_empty()
1053            || text_content.trim() == "{{ANCHOR}}";
1054          to_modify.push((
1055            link_element.clone(),
1056            option_anchor,
1057            needs_text_replacement,
1058          ));
1059        }
1060      }
1061    }
1062
1063    // Apply modifications
1064    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1065      if let Some(element) = link_element.as_element() {
1066        let new_href = format!("options.html#{option_anchor}");
1067        element
1068          .attributes
1069          .borrow_mut()
1070          .insert(local_name!("href"), new_href);
1071
1072        if needs_text_replacement {
1073          // Clear existing content
1074          for child in link_element.children() {
1075            child.detach();
1076          }
1077
1078          // Extract option name from anchor
1079          // opt-services-nginx-enable -> services.nginx.enable
1080          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1081            let option_name = option_path.replace('-', ".");
1082            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1083          }
1084        }
1085      }
1086    }
1087  }
1088
1089  /// Process markdown file links: convert .md hrefs to .html
1090  fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1091    for link_node in safe_select(document, "a") {
1092      let link_element = link_node;
1093      if let Some(element) = link_element.as_element() {
1094        let href = element
1095          .attributes
1096          .borrow()
1097          .get(local_name!("href"))
1098          .map(std::string::ToString::to_string);
1099
1100        if let Some(href_value) = href {
1101          // Only process relative links ending in .md (not absolute URLs, not
1102          // anchors)
1103          if !href_value.starts_with("http://")
1104            && !href_value.starts_with("https://")
1105            && !href_value.starts_with('#')
1106            && !href_value.starts_with("mailto:")
1107          {
1108            // Split off fragment (#) and query (?) to check the path extension
1109            let (path_part, suffix) = href_value
1110              .find(['#', '?'])
1111              .map_or((href_value.as_str(), ""), |idx| {
1112                href_value.split_at(idx)
1113              });
1114
1115            if std::path::Path::new(path_part)
1116              .extension()
1117              .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1118            {
1119              let new_href =
1120                format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1121              element
1122                .attributes
1123                .borrow_mut()
1124                .insert(local_name!("href"), new_href);
1125            }
1126          }
1127        }
1128      }
1129    }
1130  }
1131
1132  /// Convert an anchor ID to human-readable text
1133  fn humanize_anchor_id(anchor: &str) -> String {
1134    // Strip the leading #
1135    let cleaned = anchor.trim_start_matches('#');
1136
1137    // Remove common prefixes
1138    let without_prefix = cleaned
1139      .trim_start_matches("sec-")
1140      .trim_start_matches("ssec-")
1141      .trim_start_matches("opt-");
1142
1143    // Replace separators with spaces
1144    let spaced = without_prefix.replace(['-', '_'], " ");
1145
1146    // Capitalize each word
1147    spaced
1148      .split_whitespace()
1149      .map(|word| {
1150        let mut chars = word.chars();
1151        chars.next().map_or_else(String::new, |c| {
1152          c.to_uppercase().collect::<String>() + chars.as_str()
1153        })
1154      })
1155      .collect::<Vec<String>>()
1156      .join(" ")
1157  }
1158}
1159
1160/// Extract all inline text from a heading node.
1161pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1162  fn inner<'a>(node: &'a AstNode<'a>) -> String {
1163    let mut text = String::new();
1164    for child in node.children() {
1165      match &child.data.borrow().value {
1166        NodeValue::Text(t) => text.push_str(t),
1167        NodeValue::Code(t) => text.push_str(&t.literal),
1168        NodeValue::Link(..)
1169        | NodeValue::Emph
1170        | NodeValue::Strong
1171        | NodeValue::Strikethrough
1172        | NodeValue::Superscript
1173        | NodeValue::Subscript
1174        | NodeValue::FootnoteReference(..) => {
1175          text.push_str(&inner(child));
1176        },
1177        #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1178        NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1179        _ => {},
1180      }
1181    }
1182    text
1183  }
1184  inner(node)
1185}
1186
1187/// Collect all markdown files from the input directory
1188pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1189  let mut files = Vec::with_capacity(100);
1190
1191  for entry in WalkDir::new(input_dir)
1192    .follow_links(true)
1193    .into_iter()
1194    .filter_map(Result::ok)
1195  {
1196    let path = entry.path();
1197    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1198      files.push(path.to_owned());
1199    }
1200  }
1201
1202  trace!("Found {} markdown files to process", files.len());
1203  files
1204}
1205
/// Features that can be queried on a processor instance.
///
/// A field-less enum: it derives `Copy`, so values are cheap to pass by
/// value, and `PartialEq`/`Eq`, so two features can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessorFeature {
  /// GitHub Flavored Markdown support
  Gfm,
  /// Nixpkgs documentation extensions
  Nixpkgs,
  /// Syntax highlighting for code blocks
  SyntaxHighlighting,
  /// Manpage URL mapping support
  ManpageUrls,
}
1218
/// Blank out admonition `<div>` blocks while keeping the line count intact.
///
/// A line whose first non-whitespace text is `<div class="admonition ` opens
/// an admonition (nested admonitions are counted). While inside one, every
/// line is replaced by an empty line; a line consisting only of `</div>`
/// (surrounding whitespace ignored) closes one nesting level. Lines outside
/// admonitions are copied unchanged.
///
/// Each input line produces exactly one output line terminated by `\n`, so
/// line numbers in the result match those of `content`.
fn remove_admonition_blocks_for_headers(content: &str) -> String {
  let mut output = String::with_capacity(content.len() + 1);
  let mut admonition_depth = 0usize;

  for line in content.lines() {
    if line.trim_start().starts_with("<div class=\"admonition ") {
      admonition_depth += 1;
    } else if admonition_depth == 0 {
      output.push_str(line);
    } else if line.trim() == "</div>" {
      // Trim both sides: a closing tag followed by stray whitespace must
      // still end the block, otherwise the rest of the document is blanked.
      admonition_depth -= 1;
    }

    // Opening, closing and inner lines all become empty lines.
    output.push('\n');
  }

  output
}
1245
/// Report whether `line` is a setext heading underline.
///
/// After trimming surrounding whitespace the line must be a non-empty run
/// of only `=` characters or only `-` characters. Blank or whitespace-only
/// lines, mixed markers (`=-=`) and markers separated by whitespace
/// (`- - -`, which CommonMark treats as a thematic break) are rejected.
fn is_setext_heading_underline(line: &str) -> bool {
  let mut markers = line.trim().chars();
  match markers.next() {
    // The first character fixes the marker; the rest must repeat it.
    Some(first @ ('=' | '-')) => markers.all(|ch| ch == first),
    _ => false,
  }
}
1251
1252/// Standalone HTML post-processing function to avoid borrowing issues.
1253fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1254where
1255  F: FnOnce(&kuchikikiki::NodeRef),
1256{
1257  process_safe(
1258    html,
1259    |html| {
1260      use tendril::TendrilSink;
1261
1262      let document = kuchikikiki::parse_html().one(html);
1263      transform_fn(&document);
1264
1265      let mut out = Vec::new();
1266      let _ = document.serialize(&mut out);
1267      String::from_utf8_lossy(&out).into_owned()
1268    },
1269    html,
1270  )
1271}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs