ndg_commonmark/processor/
core.rs

1//! Core implementation of the Markdown processor.
2//!
3//! Main implementation of `MarkdownProcessor` and its methods focused on the
4//! core rendering pipeline and configuration management.
5use std::{
6  collections::HashMap,
7  path::{Path, PathBuf},
8};
9
10use comrak::{
11  Arena,
12  nodes::{AstNode, NodeHeading, NodeValue},
13  options::Options,
14  parse_document,
15};
16use log::trace;
17use markup5ever::local_name;
18use walkdir::WalkDir;
19
20use super::{
21  dom::safe_select,
22  process::process_safe,
23  types::{
24    AstTransformer,
25    MarkdownOptions,
26    MarkdownProcessor,
27    PromptTransformer,
28  },
29};
30use crate::{
31  syntax::create_default_manager,
32  types::{Header, MarkdownResult},
33  utils,
34};
35
36impl MarkdownProcessor {
37  /// Create a new `MarkdownProcessor` with the given options.
38  #[must_use]
39  pub fn new(options: MarkdownOptions) -> Self {
40    let manpage_urls = options
41      .manpage_urls_path
42      .as_ref()
43      .and_then(|path| crate::utils::load_manpage_urls(path).ok());
44
45    let syntax_manager = if options.highlight_code {
46      match create_default_manager() {
47        Ok(manager) => {
48          log::info!("Syntax highlighting initialized successfully");
49          Some(manager)
50        },
51        Err(e) => {
52          log::error!("Failed to initialize syntax highlighting: {e}");
53          log::warn!(
54            "Continuing without syntax highlighting - code blocks will not be \
55             highlighted"
56          );
57          None
58        },
59      }
60    } else {
61      None
62    };
63
64    Self {
65      options,
66      manpage_urls,
67      syntax_manager,
68      base_dir: std::path::PathBuf::from("."),
69    }
70  }
71
72  /// Access processor options.
73  #[must_use]
74  pub const fn options(&self) -> &MarkdownOptions {
75    &self.options
76  }
77
78  /// Set the base directory for resolving relative file paths.
79  #[must_use]
80  pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
81    self.base_dir = base_dir.to_path_buf();
82    self
83  }
84
85  /// Check if a specific feature is enabled.
86  #[must_use]
87  pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
88    match feature {
89      ProcessorFeature::Gfm => self.options.gfm,
90      ProcessorFeature::Nixpkgs => self.options.nixpkgs,
91      ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
92      ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
93    }
94  }
95
96  /// Get the manpage URLs mapping for use with standalone functions.
97  #[must_use]
98  pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
99    self.manpage_urls.as_ref()
100  }
101
102  /// Highlight all code blocks in HTML using the configured syntax highlighter
103  #[must_use]
104  pub fn highlight_codeblocks(&self, html: &str) -> String {
105    use kuchikikiki::parse_html;
106    use tendril::TendrilSink;
107
108    if !self.options.highlight_code || self.syntax_manager.is_none() {
109      return html.to_string();
110    }
111
112    let document = parse_html().one(html);
113
114    // Collect all code blocks first to avoid DOM modification during iteration
115    let mut code_blocks = Vec::new();
116    for pre_node in safe_select(&document, "pre > code") {
117      let code_node = pre_node;
118      if let Some(element) = code_node.as_element() {
119        let language = element
120          .attributes
121          .borrow()
122          .get("class")
123          .and_then(|class| class.strip_prefix("language-"))
124          .unwrap_or("text")
125          .to_string();
126        let code_text = code_node.text_contents();
127
128        if let Some(pre_parent) = code_node.parent() {
129          code_blocks.push((
130            pre_parent.clone(),
131            code_node.clone(),
132            code_text,
133            language,
134          ));
135        }
136      }
137    }
138
139    // Process each code block
140    for (pre_element, _code_node, code_text, language) in code_blocks {
141      if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
142      {
143        // Wrap highlighted HTML in <pre><code> with appropriate classes
144        let wrapped_html = format!(
145          r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
146        );
147        let fragment = parse_html().one(wrapped_html.as_str());
148        pre_element.insert_after(fragment);
149        pre_element.detach();
150      }
151      // Do not add highlight/language-* classes if not highlighted
152    }
153
154    let mut buf = Vec::new();
155    if let Err(e) = document.serialize(&mut buf) {
156      log::warn!("DOM serialization failed: {e:?}");
157      return html.to_string(); // Return original HTML if serialization fails
158    }
159    String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
160  }
161
162  /// Handle hard tabs in code blocks according to configuration
163  fn handle_hardtabs(&self, code: &str) -> String {
164    use super::types::TabStyle;
165
166    // Check if there are any hard tabs
167    if !code.contains('\t') {
168      return code.to_string();
169    }
170
171    match self.options.tab_style {
172      // Do nothing
173      TabStyle::None => code.to_string(),
174
175      // Warn, but do nothing.
176      TabStyle::Warn => {
177        log::warn!(
178          "Hard tabs detected in code block. Consider using spaces for \
179           consistency. Tools like editorconfig may help you normalize spaces \
180           in your documents."
181        );
182        code.to_string()
183      },
184
185      // Do not warn, only inform in debug mode. Then return
186      // the updated code.
187      TabStyle::Normalize => {
188        log::debug!("Replacing hard tabs with spaces");
189        code.replace('\t', "  ")
190      },
191    }
192  }
193
194  /// Process hard tabs in code blocks within markdown content
195  fn process_hardtabs(&self, markdown: &str) -> String {
196    use super::types::TabStyle;
197    use crate::utils::codeblock::FenceTracker;
198
199    // If no tab handling is needed, return as-is
200    if self.options.tab_style == TabStyle::None {
201      return markdown.to_string();
202    }
203
204    let mut result = String::with_capacity(markdown.len());
205    let mut lines = markdown.lines().peekable();
206    let mut tracker = FenceTracker::new();
207
208    while let Some(line) = lines.next() {
209      tracker = tracker.process_line(line);
210
211      // Only replace tabs inside fenced code blocks
212      let processed_line = if tracker.in_code_block() && line.contains('\t') {
213        self.handle_hardtabs(line)
214      } else {
215        line.to_string()
216      };
217
218      result.push_str(&processed_line);
219
220      // Add newline unless this is the last line
221      if lines.peek().is_some() {
222        result.push('\n');
223      }
224    }
225
226    result
227  }
228
229  /// Highlight code using the configured syntax highlighter, returns HTML
230  /// string
231  fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
232    if !self.options.highlight_code {
233      return None;
234    }
235
236    let syntax_manager = self.syntax_manager.as_ref()?;
237
238    syntax_manager
239      .highlight_code(code, language, self.options.highlight_theme.as_deref())
240      .ok()
241  }
242
243  /// Render Markdown to HTML, extracting headers and title.
244  #[must_use]
245  pub fn render(&self, markdown: &str) -> MarkdownResult {
246    let (preprocessed, included_files) = self.preprocess(markdown);
247    let (headers, title) = self.extract_headers(&preprocessed);
248    let html = self.process_html_pipeline(&preprocessed);
249
250    MarkdownResult {
251      html,
252      headers,
253      title,
254      included_files,
255    }
256  }
257
258  /// Process the HTML generation and post-processing pipeline.
259  fn process_html_pipeline(&self, content: &str) -> String {
260    let mut html = self.convert_to_html(content);
261
262    // Apply feature-specific post-processing
263    if cfg!(feature = "ndg-flavored") {
264      #[cfg(feature = "ndg-flavored")]
265      {
266        html = super::extensions::process_option_references(
267          &html,
268          self.options.valid_options.as_ref(),
269        );
270      }
271    }
272
273    if self.options.nixpkgs {
274      html = self.process_manpage_references_html(&html);
275    }
276
277    if self.options.highlight_code {
278      html = self.highlight_codeblocks(&html);
279    }
280
281    self.kuchiki_postprocess(&html)
282  }
283
284  /// Preprocess the markdown content with all enabled transformations.
285  fn preprocess(
286    &self,
287    content: &str,
288  ) -> (String, Vec<crate::types::IncludedFile>) {
289    let mut processed = content.to_string();
290    let mut included_files = Vec::new();
291
292    // Process MyST-style autolinks first
293    processed = super::extensions::process_myst_autolinks(&processed);
294
295    // Handle hard tabs in code blocks
296    processed = self.process_hardtabs(&processed);
297
298    if self.options.nixpkgs {
299      let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
300      processed = content;
301      included_files = files;
302    }
303
304    if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
305      processed = super::extensions::process_role_markup(
306        &processed,
307        self.manpage_urls.as_ref(),
308        self.options.auto_link_options,
309        self.options.valid_options.as_ref(),
310      );
311    }
312
313    if cfg!(feature = "wiki") {
314      processed = super::extensions::process_wikilinks(&processed);
315    }
316
317    (processed, included_files)
318  }
319
320  /// Apply Nixpkgs-specific preprocessing steps.
321  #[cfg(feature = "nixpkgs")]
322  fn apply_nixpkgs_preprocessing(
323    &self,
324    content: &str,
325  ) -> (String, Vec<crate::types::IncludedFile>) {
326    let (with_includes, included_files) =
327      match super::extensions::process_file_includes(content, &self.base_dir, 0)
328      {
329        Ok(result) => result,
330        Err(e) => {
331          log::warn!(
332            "File include processing failed: {e}. Continuing without includes."
333          );
334          (content.to_string(), Vec::new())
335        },
336      };
337    let with_blocks = super::extensions::process_block_elements(&with_includes);
338    let processed = super::extensions::process_inline_anchors(&with_blocks);
339    (processed, included_files)
340  }
341
342  /// Apply Nixpkgs-specific preprocessing steps (no-op when feature disabled).
343  #[cfg(not(feature = "nixpkgs"))]
344  fn apply_nixpkgs_preprocessing(
345    &self,
346    content: &str,
347  ) -> (String, Vec<crate::types::IncludedFile>) {
348    (content.to_string(), Vec::new())
349  }
350
351  /// Extract headers and title from the markdown content.
352  #[must_use]
353  pub fn extract_headers(
354    &self,
355    content: &str,
356  ) -> (Vec<Header>, Option<String>) {
357    use std::fmt::Write;
358
359    let arena = Arena::new();
360    let options = self.comrak_options();
361
362    // Normalize custom anchors with no heading level to h2
363    let mut normalized = String::with_capacity(content.len());
364    for line in content.lines() {
365      let trimmed = line.trim_end();
366      if !trimmed.starts_with('#')
367        && let Some(anchor_start) = trimmed.rfind("{#")
368        && let Some(anchor_end) = trimmed[anchor_start..].find('}')
369      {
370        let text = trimmed[..anchor_start].trim_end();
371        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
372        let _ = writeln!(normalized, "## {text} {{#{id}}}");
373        continue;
374      }
375      normalized.push_str(line);
376      normalized.push('\n');
377    }
378
379    let root = parse_document(&arena, &normalized, &options);
380
381    let mut headers = Vec::new();
382    let mut found_title = None;
383
384    for node in root.descendants() {
385      if let NodeValue::Heading(NodeHeading { level, .. }) =
386        &node.data.borrow().value
387      {
388        let mut text = String::new();
389        let mut explicit_id = None;
390
391        for child in node.children() {
392          match &child.data.borrow().value {
393            NodeValue::Text(t) => text.push_str(t),
394            NodeValue::Code(t) => text.push_str(&t.literal),
395            NodeValue::Link(..)
396            | NodeValue::Emph
397            | NodeValue::Strong
398            | NodeValue::Subscript
399            | NodeValue::Strikethrough
400            | NodeValue::Superscript
401            | NodeValue::FootnoteReference(..) => {
402              text.push_str(&extract_inline_text(child));
403            },
404            NodeValue::HtmlInline(html) => {
405              // Look for explicit anchor in HTML inline node: {#id}
406              let html_str = html.as_str();
407              if let Some(start) = html_str.find("{#")
408                && let Some(end) = html_str[start..].find('}')
409              {
410                let anchor = &html_str[start + 2..start + end];
411                explicit_id = Some(anchor.to_string());
412              }
413            },
414            #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
415            NodeValue::Image(..) => {},
416            _ => {},
417          }
418        }
419
420        // Check for trailing {#id} in heading text
421        let trimmed = text.trim_end();
422        #[allow(clippy::option_if_let_else)]
423        // Nested options clearer with if-let
424        let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
425          if let Some(end) = trimmed[start..].find('}') {
426            let anchor = &trimmed[start + 2..start + end];
427            (trimmed[..start].trim_end().to_string(), anchor.to_string())
428          } else {
429            (
430              text.clone(),
431              explicit_id.unwrap_or_else(|| utils::slugify(&text)),
432            )
433          }
434        } else {
435          (
436            text.clone(),
437            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
438          )
439        };
440        if *level == 1 && found_title.is_none() {
441          found_title = Some(final_text.clone());
442        }
443        headers.push(Header {
444          text: final_text,
445          level: *level,
446          id,
447        });
448      }
449    }
450
451    (headers, found_title)
452  }
453
454  /// Convert markdown to HTML using comrak and configured options.
455  fn convert_to_html(&self, content: &str) -> String {
456    // Process directly without panic catching for better performance
457    let arena = Arena::new();
458    let options = self.comrak_options();
459    let root = parse_document(&arena, content, &options);
460
461    // Apply AST transformations
462    let prompt_transformer = PromptTransformer;
463    prompt_transformer.transform(root);
464
465    let mut html_output = String::new();
466    if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
467      log::error!("Failed to format HTML: {e}");
468    }
469
470    // Post-process HTML to handle header anchors
471    Self::process_header_anchors_html(&html_output)
472  }
473
474  /// Process header anchors in HTML by finding `{#id}` syntax and converting to
475  /// proper id attributes. Also adds auto-generated IDs to headers without
476  /// explicit anchors.
477  fn process_header_anchors_html(html: &str) -> String {
478    use std::sync::LazyLock;
479
480    use regex::Regex;
481
482    // First pass: handle explicit {#id} syntax
483    static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
484      Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
485        .unwrap_or_else(|e| {
486          log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
487          utils::never_matching_regex().unwrap_or_else(|_| {
488            #[allow(
489              clippy::expect_used,
490              reason = "This pattern is guaranteed to be valid"
491            )]
492            Regex::new(r"[^\s\S]")
493              .expect("regex pattern [^\\s\\S] should always compile")
494          })
495        })
496    });
497
498    // Second pass: add IDs to headers without attributes (no id yet)
499    // Matches <h1>content</h1> but not <h1 id="...">content</h1>
500    static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
501      Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
502        log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
503        utils::never_matching_regex().unwrap_or_else(|_| {
504          #[allow(
505            clippy::expect_used,
506            reason = "This pattern is guaranteed to be valid"
507          )]
508          Regex::new(r"[^\s\S]")
509            .expect("regex pattern [^\\s\\S] should always compile")
510        })
511      })
512    });
513
514    // Regex to strip HTML tags for slugification
515    static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
516      Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
517        log::error!("Failed to compile HTML_TAG_RE regex: {e}");
518        utils::never_matching_regex().unwrap_or_else(|_| {
519          #[allow(
520            clippy::expect_used,
521            reason = "This pattern is guaranteed to be valid"
522          )]
523          Regex::new(r"[^\s\S]")
524            .expect("regex pattern [^\\s\\S] should always compile")
525        })
526      })
527    });
528
529    // First pass: explicit {#id} syntax
530    let result = HEADER_ANCHOR_RE
531      .replace_all(html, |caps: &regex::Captures| {
532        let level = &caps[1];
533        let prefix = &caps[2];
534        let id = &caps[3];
535        let suffix = &caps[4];
536        format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
537      })
538      .to_string();
539
540    // Second pass: add auto-generated IDs to headers without id attribute
541    HEADER_NO_ID_RE
542      .replace_all(&result, |caps: &regex::Captures| {
543        let level = &caps[1];
544        let content = &caps[2];
545        // Strip HTML tags and slugify the text content
546        let text_only = HTML_TAG_RE.replace_all(content, "");
547        let id = utils::slugify(&text_only);
548        if id.is_empty() {
549          // If slugify produces empty string, keep header without id
550          format!("<h{level}>{content}</h{level}>")
551        } else {
552          format!("<h{level} id=\"{id}\">{content}</h{level}>")
553        }
554      })
555      .to_string()
556  }
557
558  /// Build comrak options from `MarkdownOptions` and feature flags.
559  fn comrak_options(&self) -> Options<'_> {
560    let mut options = Options::default();
561    // Markdown features present in GFM.
562    if self.options.gfm {
563      options.extension.table = true;
564      options.extension.footnotes = true;
565      options.extension.strikethrough = true;
566      options.extension.tasklist = true;
567      options.extension.superscript = true;
568      options.extension.autolink = true;
569    }
570
571    // Enable unsafe HTML references. This is not a security concern
572    // as all input is assumed to be trusted.
573    options.render.r#unsafe = true;
574
575    // Enable description lists but keep custom header processing
576    options.extension.header_id_prefix = None;
577    options.extension.description_lists = true;
578    options
579  }
580
581  /// Post-process HTML to enhance manpage references with URL links.
582  #[cfg(feature = "nixpkgs")]
583  fn process_manpage_references_html(&self, html: &str) -> String {
584    super::extensions::process_manpage_references(
585      html,
586      self.manpage_urls.as_ref(),
587    )
588  }
589
590  /// Post-process HTML to enhance manpage references (no-op when feature
591  /// disabled).
592  #[cfg(not(feature = "nixpkgs"))]
593  fn process_manpage_references_html(&self, html: &str) -> String {
594    html.to_string()
595  }
596
597  /// HTML post-processing using kuchiki DOM manipulation.
598  #[allow(
599    clippy::unused_self,
600    reason = "Method signature matches processor pattern"
601  )]
602  fn kuchiki_postprocess(&self, html: &str) -> String {
603    // Use a standalone function to avoid borrowing issues
604    kuchiki_postprocess_html(html, |document| {
605      Self::apply_dom_transformations(document);
606    })
607  }
608
609  /// Apply all DOM transformations to the parsed HTML document.
610  fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
611    Self::process_list_item_id_markers(document);
612    Self::process_header_anchor_comments(document);
613    Self::process_list_item_inline_anchors(document);
614    Self::process_paragraph_inline_anchors(document);
615    Self::process_remaining_inline_anchors(document);
616    Self::process_markdown_links(document);
617    Self::process_option_anchor_links(document);
618    Self::process_empty_auto_links(document);
619    Self::process_empty_html_links(document);
620  }
621
622  /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
623  fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
624    let mut to_modify = Vec::new();
625
626    for comment in document.inclusive_descendants() {
627      if let Some(comment_node) = comment.as_comment() {
628        let comment_text = comment_node.borrow();
629        if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
630          let id = comment_text[id_start + 16..].trim();
631          if !id.is_empty()
632            && id
633              .chars()
634              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
635          {
636            // Check if this comment is inside an <li> element
637            if let Some(parent) = comment.parent()
638              && let Some(element) = parent.as_element()
639              && element.name.local.as_ref() == "li"
640            {
641              to_modify.push((comment.clone(), id.to_string()));
642            }
643          }
644        }
645      }
646    }
647
648    for (comment_node, id) in to_modify {
649      let span = kuchikikiki::NodeRef::new_element(
650        markup5ever::QualName::new(
651          None,
652          markup5ever::ns!(html),
653          local_name!("span"),
654        ),
655        vec![
656          (
657            kuchikikiki::ExpandedName::new("", "id"),
658            kuchikikiki::Attribute {
659              prefix: None,
660              value:  id,
661            },
662          ),
663          (
664            kuchikikiki::ExpandedName::new("", "class"),
665            kuchikikiki::Attribute {
666              prefix: None,
667              value:  "nixos-anchor".into(),
668            },
669          ),
670        ],
671      );
672      comment_node.insert_after(span);
673      comment_node.detach();
674    }
675  }
676
677  /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
678  fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
679    let mut to_modify = Vec::new();
680
681    for comment in document.inclusive_descendants() {
682      if let Some(comment_node) = comment.as_comment() {
683        let comment_text = comment_node.borrow();
684        if let Some(anchor_start) = comment_text.find("anchor:") {
685          let id = comment_text[anchor_start + 7..].trim();
686          if !id.is_empty()
687            && id
688              .chars()
689              .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
690          {
691            // Check if this comment is inside a header element
692            if let Some(parent) = comment.parent()
693              && let Some(element) = parent.as_element()
694            {
695              let tag_name = element.name.local.as_ref();
696              if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
697                to_modify.push((
698                  parent.clone(),
699                  comment.clone(),
700                  id.to_string(),
701                ));
702              }
703            }
704          }
705        }
706      }
707    }
708
709    for (header_element, comment_node, id) in to_modify {
710      if let Some(element) = header_element.as_element() {
711        element
712          .attributes
713          .borrow_mut()
714          .insert(local_name!("id"), id);
715        comment_node.detach();
716      }
717    }
718  }
719
720  /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
721  fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
722    for li_node in safe_select(document, "li") {
723      let li_element = li_node;
724
725      // Check if this list item contains code elements
726      let has_code = !safe_select(&li_element, "code, pre").is_empty();
727      if has_code {
728        continue; // Skip list items with code blocks
729      }
730
731      let text_content = li_element.text_contents();
732
733      if let Some(anchor_start) = text_content.find("[]{#")
734        && let Some(anchor_end) = text_content[anchor_start..].find('}')
735      {
736        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
737        if !id.is_empty()
738          && id
739            .chars()
740            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
741        {
742          let remaining_content =
743            &text_content[anchor_start + anchor_end + 1..];
744
745          // Clear current content and rebuild
746          for child in li_element.children() {
747            child.detach();
748          }
749
750          let span = kuchikikiki::NodeRef::new_element(
751            markup5ever::QualName::new(
752              None,
753              markup5ever::ns!(html),
754              local_name!("span"),
755            ),
756            vec![
757              (
758                kuchikikiki::ExpandedName::new("", "id"),
759                kuchikikiki::Attribute {
760                  prefix: None,
761                  value:  id.into(),
762                },
763              ),
764              (
765                kuchikikiki::ExpandedName::new("", "class"),
766                kuchikikiki::Attribute {
767                  prefix: None,
768                  value:  "nixos-anchor".into(),
769                },
770              ),
771            ],
772          );
773          li_element.append(span);
774          if !remaining_content.is_empty() {
775            li_element
776              .append(kuchikikiki::NodeRef::new_text(remaining_content));
777          }
778        }
779      }
780    }
781  }
782
783  /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
784  fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
785    for p_node in safe_select(document, "p") {
786      let p_element = p_node;
787
788      // Check if this paragraph contains code elements
789      let has_code = !safe_select(&p_element, "code, pre").is_empty();
790      if has_code {
791        continue; // Skip paragraphs with code blocks
792      }
793
794      let text_content = p_element.text_contents();
795
796      if let Some(anchor_start) = text_content.find("[]{#")
797        && let Some(anchor_end) = text_content[anchor_start..].find('}')
798      {
799        let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
800        if !id.is_empty()
801          && id
802            .chars()
803            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
804        {
805          let remaining_content =
806            &text_content[anchor_start + anchor_end + 1..];
807
808          // Clear current content and rebuild
809          for child in p_element.children() {
810            child.detach();
811          }
812
813          let span = kuchikikiki::NodeRef::new_element(
814            markup5ever::QualName::new(
815              None,
816              markup5ever::ns!(html),
817              local_name!("span"),
818            ),
819            vec![
820              (
821                kuchikikiki::ExpandedName::new("", "id"),
822                kuchikikiki::Attribute {
823                  prefix: None,
824                  value:  id.into(),
825                },
826              ),
827              (
828                kuchikikiki::ExpandedName::new("", "class"),
829                kuchikikiki::Attribute {
830                  prefix: None,
831                  value:  "nixos-anchor".into(),
832                },
833              ),
834            ],
835          );
836          p_element.append(span);
837          if !remaining_content.is_empty() {
838            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
839          }
840        }
841      }
842    }
843  }
844
845  /// Process remaining standalone inline anchors throughout the document
846  fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
847    let mut text_nodes_to_process = Vec::new();
848
849    for node in document.inclusive_descendants() {
850      if let Some(text_node) = node.as_text() {
851        // Check if this text node is inside a code block
852        let mut parent = node.parent();
853        let mut in_code = false;
854        while let Some(p) = parent {
855          if let Some(element) = p.as_element()
856            && (element.name.local == local_name!("code")
857              || element.name.local == local_name!("pre"))
858          {
859            in_code = true;
860            break;
861          }
862          parent = p.parent();
863        }
864
865        // Only process if not in code
866        if !in_code {
867          let text_content = text_node.borrow().clone();
868          if text_content.contains("[]{#") {
869            text_nodes_to_process.push((node.clone(), text_content));
870          }
871        }
872      }
873    }
874
875    for (text_node, text_content) in text_nodes_to_process {
876      let mut last_end = 0;
877      let mut new_children = Vec::new();
878
879      // Simple pattern matching for []{#id}
880      let chars = text_content.chars().collect::<Vec<_>>();
881      let mut i = 0;
882      while i < chars.len() {
883        if i + 4 < chars.len()
884          && chars[i] == '['
885          && chars[i + 1] == ']'
886          && chars[i + 2] == '{'
887          && chars[i + 3] == '#'
888        {
889          // Found start of anchor pattern
890          let anchor_start = i;
891          i += 4; // skip "[]{#"
892
893          let mut id = String::new();
894          while i < chars.len() && chars[i] != '}' {
895            if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
896            {
897              id.push(chars[i]);
898              i += 1;
899            } else {
900              break;
901            }
902          }
903
904          if i < chars.len() && chars[i] == '}' && !id.is_empty() {
905            // Valid anchor found
906            let anchor_end = i + 1;
907
908            // Add text before anchor
909            if anchor_start > last_end {
910              let before_text: String =
911                chars[last_end..anchor_start].iter().collect();
912              if !before_text.is_empty() {
913                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
914              }
915            }
916
917            // Add span element
918            let span = kuchikikiki::NodeRef::new_element(
919              markup5ever::QualName::new(
920                None,
921                markup5ever::ns!(html),
922                local_name!("span"),
923              ),
924              vec![
925                (
926                  kuchikikiki::ExpandedName::new("", "id"),
927                  kuchikikiki::Attribute {
928                    prefix: None,
929                    value:  id,
930                  },
931                ),
932                (
933                  kuchikikiki::ExpandedName::new("", "class"),
934                  kuchikikiki::Attribute {
935                    prefix: None,
936                    value:  "nixos-anchor".into(),
937                  },
938                ),
939              ],
940            );
941            new_children.push(span);
942
943            last_end = anchor_end;
944            i = anchor_end;
945          } else {
946            i += 1;
947          }
948        } else {
949          i += 1;
950        }
951      }
952
953      // Add remaining text
954      if last_end < chars.len() {
955        let after_text: String = chars[last_end..].iter().collect();
956        if !after_text.is_empty() {
957          new_children.push(kuchikikiki::NodeRef::new_text(after_text));
958        }
959      }
960
961      // Replace text node if we found anchors
962      if !new_children.is_empty() {
963        for child in new_children {
964          text_node.insert_before(child);
965        }
966        text_node.detach();
967      }
968    }
969  }
970
971  /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
972  fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
973    for link_node in safe_select(document, "a") {
974      let link_element = link_node;
975      if let Some(element) = link_element.as_element() {
976        let href = element
977          .attributes
978          .borrow()
979          .get(local_name!("href"))
980          .map(std::string::ToString::to_string);
981        let text_content = link_element.text_contents();
982
983        if let Some(href_value) = href
984          && href_value.starts_with('#')
985          && (text_content.trim().is_empty()
986            || text_content.trim() == "{{ANCHOR}}")
987        {
988          // Clear placeholder text if present
989          if text_content.trim() == "{{ANCHOR}}" {
990            for child in link_element.children() {
991              child.detach();
992            }
993          }
994          // Empty link with anchor - add humanized text
995          let display_text = Self::humanize_anchor_id(&href_value);
996          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
997        }
998      }
999    }
1000  }
1001
1002  /// Process empty HTML links that have no content
1003  fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1004    for link_node in safe_select(document, "a[href^='#']") {
1005      let link_element = link_node;
1006      let text_content = link_element.text_contents();
1007
1008      if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1009        // Clear placeholder text if present
1010        if text_content.trim() == "{{ANCHOR}}" {
1011          for child in link_element.children() {
1012            child.detach();
1013          }
1014        }
1015        if let Some(element) = link_element.as_element()
1016          && let Some(href) =
1017            element.attributes.borrow().get(local_name!("href"))
1018        {
1019          let display_text = Self::humanize_anchor_id(href);
1020          link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1021        }
1022      }
1023    }
1024  }
1025
1026  /// Process option anchor links: [](#opt-option.path) -> link to options.html
1027  fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1028    let mut to_modify = Vec::new();
1029
1030    // Collect all option anchor links first
1031    for link_node in safe_select(document, "a[href^='#opt-']") {
1032      let link_element = link_node;
1033      if let Some(element) = link_element.as_element() {
1034        let href = element
1035          .attributes
1036          .borrow()
1037          .get(local_name!("href"))
1038          .map(std::string::ToString::to_string);
1039        let text_content = link_element.text_contents();
1040
1041        if let Some(href_value) = href
1042          && href_value.starts_with("#opt-")
1043        {
1044          let option_anchor = href_value[1..].to_string(); // remove the leading #
1045          let needs_text_replacement = text_content.trim().is_empty()
1046            || text_content.trim() == "{{ANCHOR}}";
1047          to_modify.push((
1048            link_element.clone(),
1049            option_anchor,
1050            needs_text_replacement,
1051          ));
1052        }
1053      }
1054    }
1055
1056    // Apply modifications
1057    for (link_element, option_anchor, needs_text_replacement) in to_modify {
1058      if let Some(element) = link_element.as_element() {
1059        let new_href = format!("options.html#{option_anchor}");
1060        element
1061          .attributes
1062          .borrow_mut()
1063          .insert(local_name!("href"), new_href);
1064
1065        if needs_text_replacement {
1066          // Clear existing content
1067          for child in link_element.children() {
1068            child.detach();
1069          }
1070
1071          // Extract option name from anchor
1072          // opt-services-nginx-enable -> services.nginx.enable
1073          if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1074            let option_name = option_path.replace('-', ".");
1075            link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1076          }
1077        }
1078      }
1079    }
1080  }
1081
1082  /// Process markdown file links: convert .md hrefs to .html
1083  fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1084    for link_node in safe_select(document, "a") {
1085      let link_element = link_node;
1086      if let Some(element) = link_element.as_element() {
1087        let href = element
1088          .attributes
1089          .borrow()
1090          .get(local_name!("href"))
1091          .map(std::string::ToString::to_string);
1092
1093        if let Some(href_value) = href {
1094          // Only process relative links ending in .md (not absolute URLs, not anchors)
1095          if !href_value.starts_with("http://")
1096            && !href_value.starts_with("https://")
1097            && !href_value.starts_with('#')
1098            && !href_value.starts_with("mailto:")
1099          {
1100            // Split off fragment (#) and query (?) to check the path extension
1101            let (path_part, suffix) = href_value
1102              .find(|c| c == '#' || c == '?')
1103              .map_or((href_value.as_str(), ""), |idx| href_value.split_at(idx));
1104
1105            if std::path::Path::new(path_part)
1106              .extension()
1107              .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1108            {
1109              let new_href = format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1110              element
1111                .attributes
1112                .borrow_mut()
1113                .insert(local_name!("href"), new_href);
1114            }
1115          }
1116        }
1117      }
1118    }
1119  }
1120
1121  /// Convert an anchor ID to human-readable text
1122  fn humanize_anchor_id(anchor: &str) -> String {
1123    // Strip the leading #
1124    let cleaned = anchor.trim_start_matches('#');
1125
1126    // Remove common prefixes
1127    let without_prefix = cleaned
1128      .trim_start_matches("sec-")
1129      .trim_start_matches("ssec-")
1130      .trim_start_matches("opt-");
1131
1132    // Replace separators with spaces
1133    let spaced = without_prefix.replace(['-', '_'], " ");
1134
1135    // Capitalize each word
1136    spaced
1137      .split_whitespace()
1138      .map(|word| {
1139        let mut chars = word.chars();
1140        chars.next().map_or_else(String::new, |c| {
1141          c.to_uppercase().collect::<String>() + chars.as_str()
1142        })
1143      })
1144      .collect::<Vec<String>>()
1145      .join(" ")
1146  }
1147}
1148
1149/// Extract all inline text from a heading node.
1150pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1151  let mut text = String::new();
1152  for child in node.children() {
1153    match &child.data.borrow().value {
1154      NodeValue::Text(t) => text.push_str(t),
1155      NodeValue::Code(t) => text.push_str(&t.literal),
1156      NodeValue::Link(..)
1157      | NodeValue::Emph
1158      | NodeValue::Strong
1159      | NodeValue::Strikethrough
1160      | NodeValue::Superscript
1161      | NodeValue::Subscript
1162      | NodeValue::FootnoteReference(..) => {
1163        text.push_str(&extract_inline_text(child));
1164      },
1165      #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1166      NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1167      _ => {},
1168    }
1169  }
1170  text
1171}
1172
1173/// Collect all markdown files from the input directory
1174pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1175  let mut files = Vec::with_capacity(100);
1176
1177  for entry in WalkDir::new(input_dir)
1178    .follow_links(true)
1179    .into_iter()
1180    .filter_map(Result::ok)
1181  {
1182    let path = entry.path();
1183    if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1184      files.push(path.to_owned());
1185    }
1186  }
1187
1188  trace!("Found {} markdown files to process", files.len());
1189  files
1190}
1191
/// Capabilities a processor instance can be asked about.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// Support for GitHub Flavored Markdown.
  Gfm,
  /// Support for the Nixpkgs documentation extensions.
  Nixpkgs,
  /// Syntax highlighting of code blocks.
  SyntaxHighlighting,
  /// Support for mapping manpage references to URLs.
  ManpageUrls,
}
1204
1205/// Standalone HTML post-processing function to avoid borrowing issues.
1206fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1207where
1208  F: FnOnce(&kuchikikiki::NodeRef),
1209{
1210  process_safe(
1211    html,
1212    |html| {
1213      use tendril::TendrilSink;
1214
1215      let document = kuchikikiki::parse_html().one(html);
1216      transform_fn(&document);
1217
1218      let mut out = Vec::new();
1219      let _ = document.serialize(&mut out);
1220      String::from_utf8_lossy(&out).into_owned()
1221    },
1222    html,
1223  )
1224}
ndg_commonmark/processor/core.rs

ndg_commonmark/processor/
core.rs