ndg_commonmark/processor/
extensions.rs

1//! Feature-specific Markdown processing extensions.
2use std::{fmt::Write, fs, path::Path};
3
4use html_escape::encode_text;
5
6use super::{dom::safe_select, process::process_safe};
7
/// Build the HTML anchor ID for an option name, following the XML ID scheme
/// used by nixos-render-docs.
///
/// Each of `*`, `<`, `>`, `[`, `]`, `:`, `"` and the space character is
/// replaced by `_`. Every other character, dots included, is kept verbatim.
/// The returned ID always carries the `option-` prefix.
fn sanitize_option_id(name: &str) -> String {
  const REPLACED: [char; 8] = ['*', '<', '>', '[', ']', ':', '"', ' '];

  let mut id = String::with_capacity("option-".len() + name.len());
  id.push_str("option-");
  id.extend(
    name
      .chars()
      .map(|ch| if REPLACED.contains(&ch) { '_' } else { ch }),
  );
  id
}
25
/// Hook for GitHub Flavored Markdown (GFM) specific preprocessing.
///
/// GFM support itself comes from comrak options, so this function currently
/// hands the input back unchanged. It exists as the place to add custom
/// handling (tables, task lists, ...) should comrak's behaviour ever need to
/// be adjusted.
///
/// # Arguments
/// * `markdown` - The input markdown text
///
/// # Returns
/// An owned copy of `markdown`
#[cfg(feature = "gfm")]
#[must_use]
pub fn apply_gfm_extensions(markdown: &str) -> String {
  // XXX: nothing to rewrite yet; comrak covers the GFM features in use.
  String::from(markdown)
}
45
/// Maximum nesting depth for file includes.
///
/// `process_file_includes` returns an error once its `depth` argument reaches
/// this value, which bounds recursion when includes form a cycle.
const MAX_INCLUDE_DEPTH: usize = 8;
48
/// Check whether an include path may safely be joined onto the base
/// directory.
///
/// A path is accepted only when it is relative, contains no backslash, and is
/// made up exclusively of ordinary (`foo`) or current-directory (`.`)
/// components.
///
/// Whitelisting components, rather than only rejecting `..`, also rejects
/// root and drive-prefix components. On Windows a rooted path such as `/foo`
/// or a drive-relative path such as `C:foo` is not "absolute", yet
/// `Path::join` would still replace (part of) the base directory with it.
///
/// # Arguments
///
/// * `path` - The include path as written in the listing
/// * `_base_dir` - Currently unused; kept for the existing call signature
///
/// # Returns
///
/// `true` if the path cannot leave the base directory by construction
#[cfg(feature = "nixpkgs")]
fn is_safe_path(path: &str, _base_dir: &Path) -> bool {
  let candidate = Path::new(path);
  if candidate.is_absolute() || path.contains('\\') {
    return false;
  }

  // Anything other than a plain name or `.` (parent dir, root, prefix) could
  // move the joined path outside the base directory.
  candidate.components().all(|part| {
    matches!(
      part,
      std::path::Component::Normal(_) | std::path::Component::CurDir
    )
  })
}
67
/// Extract the `html:into-file=` value from the opening line of an include
/// block.
///
/// The value runs from just after the first `html:into-file=` up to the next
/// space. When no space follows, the rest of the line (trimmed) is used.
/// Returns `None` when the directive is absent.
#[cfg(feature = "nixpkgs")]
#[allow(
  clippy::option_if_let_else,
  reason = "Nested options are clearer with if-let"
)]
fn parse_include_directive(line: &str) -> Option<String> {
  let (_, after_key) = line.split_once("html:into-file=")?;

  let value = match after_key.split_once(' ') {
    Some((before_space, _)) => before_space,
    None => after_key.trim(),
  };
  Some(value.to_string())
}
86
/// Read and process the files listed in an include block.
///
/// Every non-empty line of `listing` is treated as a path relative to
/// `base_dir`. Each file is read, its own include blocks are expanded
/// recursively, and the result is appended to the returned string (with a
/// trailing newline added if missing).
///
/// Paths rejected by `is_safe_path` are skipped with a warning. Files that
/// cannot be read are logged and replaced by an HTML comment in the output;
/// neither case aborts processing.
///
/// # Arguments
///
/// * `listing` - The body of the include block, one path per line
/// * `base_dir` - Directory that the listed paths are resolved against
/// * `custom_output` - Output directive recorded for each directly listed file
/// * `included_files` - Receives one entry per included file, including nested
///   includes that can be expressed relative to `base_dir`
/// * `depth` - Current include depth; nested files are processed at
///   `depth + 1`
///
/// # Errors
///
/// Propagates the error from `process_file_includes` when a nested include
/// exceeds the maximum depth.
#[cfg(feature = "nixpkgs")]
#[allow(
  clippy::needless_pass_by_value,
  reason = "Owned value needed for cloning in loop"
)]
fn read_includes(
  listing: &str,
  base_dir: &Path,
  custom_output: Option<String>,
  included_files: &mut Vec<crate::types::IncludedFile>,
  depth: usize,
) -> Result<String, String> {
  let mut result = String::new();

  for entry in listing.lines().map(str::trim) {
    if entry.is_empty() {
      continue;
    }
    if !is_safe_path(entry, base_dir) {
      // Previously dropped without a trace, which made typos and rejected
      // paths hard to diagnose.
      log::warn!("Skipping unsafe include path: {entry}");
      continue;
    }

    let full_path = base_dir.join(entry);
    log::info!("Including file: {}", full_path.display());

    let content = match fs::read_to_string(&full_path) {
      Ok(content) => content,
      Err(err) => {
        // Best effort: keep going, but record why the file was not included.
        log::warn!("Could not include file {}: {err}", full_path.display());
        let _ = writeln!(
          result,
          "<!-- ndg: could not include file: {} -->",
          full_path.display()
        );
        continue;
      },
    };

    // Nested includes are resolved relative to the including file.
    let file_dir = full_path.parent().unwrap_or(base_dir);
    let (processed_content, nested_includes) =
      process_file_includes(&content, file_dir, depth + 1)?;

    result.push_str(&processed_content);
    if !processed_content.ends_with('\n') {
      result.push('\n');
    }

    included_files.push(crate::types::IncludedFile {
      path:          entry.to_string(),
      custom_output: custom_output.clone(),
    });

    // Normalize nested include paths relative to the original base_dir;
    // entries that do not live under base_dir are not recorded.
    for nested in nested_includes {
      let nested_full_path = file_dir.join(&nested.path);
      if let Ok(normalized_path) = nested_full_path.strip_prefix(base_dir) {
        included_files.push(crate::types::IncludedFile {
          path:          normalized_path.to_string_lossy().to_string(),
          custom_output: nested.custom_output,
        });
      }
    }
  }

  Ok(result)
}
148
/// Expand file include blocks in Nixpkgs/NixOS documentation.
///
/// An include block looks like this:
///
/// ````markdown
/// ```{=include=}
/// path/to/file1.md
/// path/to/file2.md
/// ```
/// ````
///
/// The block is replaced by the processed contents of the listed files.
/// Include fences that appear inside another code block are left untouched.
/// Every emitted line is terminated with `\n`.
///
/// # Arguments
///
/// * `markdown` - The input markdown text
/// * `base_dir` - The base directory for resolving relative file paths
/// * `depth` - Current recursion depth (use 0 for initial call)
///
/// # Returns
///
/// Returns `Ok((processed_markdown, included_files))` where `included_files`
/// lists the files that were included.
///
/// # Errors
///
/// Returns `Err(message)` if recursion depth reaches [`MAX_INCLUDE_DEPTH`],
/// which likely indicates a circular include cycle.
///
/// # Safety
///
/// Only relative paths without ".." are allowed for security.
#[cfg(feature = "nixpkgs")]
pub fn process_file_includes(
  markdown: &str,
  base_dir: &std::path::Path,
  depth: usize,
) -> Result<(String, Vec<crate::types::IncludedFile>), String> {
  if depth >= MAX_INCLUDE_DEPTH {
    return Err(format!(
      "Maximum include recursion depth ({MAX_INCLUDE_DEPTH}) exceeded. This \
       likely indicates a cycle in file includes."
    ));
  }

  let mut rendered = String::new();
  let mut collected: Vec<crate::types::IncludedFile> = Vec::new();
  let mut fences = crate::utils::codeblock::FenceTracker::new();
  let mut remaining = markdown.lines();

  while let Some(current) = remaining.next() {
    let directive = current.trim_start();
    let opens_include =
      !fences.in_code_block() && directive.starts_with("```{=include=}");

    if !opens_include {
      // Ordinary line: keep the fence state current and copy it through.
      fences = fences.process_line(current);
      rendered.push_str(current);
      rendered.push('\n');
      continue;
    }

    let custom_output = parse_include_directive(directive);

    // Gather the listing up to the closing fence; `take_while` also consumes
    // the closing fence line itself.
    let listing: String = remaining
      .by_ref()
      .take_while(|entry| !entry.trim_start().starts_with("```"))
      .flat_map(|entry| [entry, "\n"])
      .collect();

    let expanded =
      read_includes(&listing, base_dir, custom_output, &mut collected, depth)?;
    rendered.push_str(&expanded);
  }

  Ok((rendered, collected))
}
233
/// Process role markup in markdown content.
///
/// Role syntax has the form ``{role}`content` ``, for example
/// ``{command}`ls -la` ``. Roles found outside of code spans and code fences
/// are replaced by the HTML produced by [`format_role_markup`]; everything
/// else is copied through unchanged.
///
/// # Arguments
///
/// * `content` - The markdown content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
/// * `auto_link_options` - Whether to convert {option} roles to links
/// * `valid_options` - Optional set of valid option names for validation
///
/// # Returns
///
/// The processed markdown with role markup converted to HTML
#[cfg(any(feature = "nixpkgs", feature = "ndg-flavored"))]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashMap/HashSet sufficient for this use case"
)]
pub fn process_role_markup(
  content: &str,
  manpage_urls: Option<&std::collections::HashMap<String, String>>,
  auto_link_options: bool,
  valid_options: Option<&std::collections::HashSet<String>>,
) -> String {
  let mut result = String::with_capacity(content.len());
  let mut chars = content.chars().peekable();
  let mut tracker = crate::utils::codeblock::InlineTracker::new();

  while let Some(ch) = chars.next() {
    match ch {
      // Backticks: code fences and inline code
      '`' => {
        let (new_tracker, tick_count) = tracker.process_backticks(&mut chars);
        tracker = new_tracker;
        result.push_str(&"`".repeat(tick_count));
      },

      // Tilde code fences (~~~)
      '~' if chars.peek() == Some(&'~') => {
        let (new_tracker, tilde_count) = tracker.process_tildes(&mut chars);
        tracker = new_tracker;
        result.push_str(&"~".repeat(tilde_count));
      },

      '\n' => {
        tracker = tracker.process_newline();
        result.push(ch);
      },

      // Role markup is only recognised outside of any kind of code
      '{' if !tracker.in_any_code() => {
        // Parse speculatively on a cheap clone of the iterator and adopt it
        // only on success. This keeps the position in characters: deriving
        // it from a byte-length difference skipped extra input whenever the
        // role contained multibyte text.
        let mut probe = chars.clone();
        match parse_role_markup(
          &mut probe,
          manpage_urls,
          auto_link_options,
          valid_options,
        ) {
          Some(role_markup) => {
            chars = probe;
            result.push_str(&role_markup);
          },
          // Not a valid role markup, keep the original character
          None => result.push(ch),
        }
      },

      _ => result.push(ch),
    }
  }

  result
}
322
323/// Parse a role markup from the character iterator.
324///
325/// # Returns
326///
327/// `Some(html)` if a valid role markup is found, `None` otherwise.
328fn parse_role_markup(
329  chars: &mut std::iter::Peekable<std::str::Chars>,
330  manpage_urls: Option<&std::collections::HashMap<String, String>>,
331  auto_link_options: bool,
332  valid_options: Option<&std::collections::HashSet<String>>,
333) -> Option<String> {
334  let mut role_name = String::new();
335
336  // Parse role name (lowercase letters only)
337  while let Some(&ch) = chars.peek() {
338    if ch.is_ascii_lowercase() {
339      role_name.push(ch);
340      chars.next();
341    } else {
342      break;
343    }
344  }
345
346  // Must have a non-empty role name
347  if role_name.is_empty() {
348    return None;
349  }
350
351  // Expect closing brace
352  if chars.peek() != Some(&'}') {
353    return None;
354  }
355  chars.next(); // consume '}'
356
357  // Expect opening backtick
358  if chars.peek() != Some(&'`') {
359    return None;
360  }
361  chars.next(); // consume '`'
362
363  // Parse content until closing backtick
364  let mut content = String::new();
365  for ch in chars.by_ref() {
366    if ch == '`' {
367      // Found closing backtick, validate content
368      // Most role types should not have empty content
369      if content.is_empty() && !matches!(role_name.as_str(), "manpage") {
370        return None; // reject empty content for most roles
371      }
372      return Some(format_role_markup(
373        &role_name,
374        &content,
375        manpage_urls,
376        auto_link_options,
377        valid_options,
378      ));
379    }
380    content.push(ch);
381  }
382
383  // No closing backtick found
384  None
385}
386
387/// Format the role markup as HTML based on the role type and content.
388#[must_use]
389#[allow(
390  clippy::option_if_let_else,
391  reason = "Nested options clearer with if-let"
392)]
393#[allow(
394  clippy::implicit_hasher,
395  reason = "Standard HashMap/HashSet sufficient for this use case"
396)]
397pub fn format_role_markup(
398  role_type: &str,
399  content: &str,
400  manpage_urls: Option<&std::collections::HashMap<String, String>>,
401  auto_link_options: bool,
402  valid_options: Option<&std::collections::HashSet<String>>,
403) -> String {
404  let escaped_content = encode_text(content);
405  match role_type {
406    "manpage" => {
407      if let Some(urls) = manpage_urls {
408        if let Some(url) = urls.get(content) {
409          format!(
410            "<a href=\"{url}\" \
411             class=\"manpage-reference\">{escaped_content}</a>"
412          )
413        } else {
414          format!("<span class=\"manpage-reference\">{escaped_content}</span>")
415        }
416      } else {
417        format!("<span class=\"manpage-reference\">{escaped_content}</span>")
418      }
419    },
420    "command" => format!("<code class=\"command\">{escaped_content}</code>"),
421    "env" => format!("<code class=\"env-var\">{escaped_content}</code>"),
422    "file" => format!("<code class=\"file-path\">{escaped_content}</code>"),
423    "option" => {
424      if cfg!(feature = "ndg-flavored") && auto_link_options {
425        // Check if validation is enabled and option is valid
426        let should_link =
427          valid_options.is_none_or(|opts| opts.contains(content)); // If no validation set, link all options
428
429        if should_link {
430          let option_id = sanitize_option_id(content);
431          format!(
432            "<a class=\"option-reference\" \
433             href=\"options.html#{option_id}\"><code \
434             class=\"nixos-option\">{escaped_content}</code></a>"
435          )
436        } else {
437          format!("<code class=\"nixos-option\">{escaped_content}</code>")
438        }
439      } else {
440        format!("<code class=\"nixos-option\">{escaped_content}</code>")
441      }
442    },
443    "var" => format!("<code class=\"nix-var\">{escaped_content}</code>"),
444    _ => format!("<span class=\"{role_type}-markup\">{escaped_content}</span>"),
445  }
446}
447
448/// Process MyST-style autolinks in markdown content.
449///
450/// Converts MyST-like autolinks supported by Nixpkgs-flavored commonmark:
451/// - `[](#anchor)` -> `[](#anchor) -> {{ANCHOR}}` (placeholder for comrak)
452/// - `[](https://url)` -> `<https://url>` (converted to standard autolink)
453///
454/// # Arguments
455///
456/// * `content` - The markdown content to process
457///
458/// # Returns
459///
460/// The processed markdown with `MyST` autolinks converted as a [`String`]
461#[must_use]
462pub fn process_myst_autolinks(content: &str) -> String {
463  let mut result = String::with_capacity(content.len());
464  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();
465
466  for line in content.lines() {
467    // Update fence tracking state
468    fence_tracker = fence_tracker.process_line(line);
469
470    // Only process MyST autolinks if we're not in a code block
471    if fence_tracker.in_code_block() {
472      result.push_str(line);
473    } else {
474      result.push_str(&process_line_myst_autolinks(line));
475    }
476    result.push('\n');
477  }
478
479  result
480}
481
/// Process `MyST` autolinks in a single line.
///
/// Looks for empty-text links of the form `[](target)` and rewrites them
/// depending on the target:
///
/// - `#anchor` targets get the placeholder text `{{ANCHOR}}`
/// - `http://` and `https://` targets become `<url>` autolinks
/// - any other target is kept as written
///
/// Anything that is not a complete `[](target)` link with a non-empty target
/// (a bare `[]`, the inline anchor syntax `[]{#id}`, an unclosed `[](...`, or
/// `[]()`) is copied through verbatim.
fn process_line_myst_autolinks(line: &str) -> String {
  let mut result = String::with_capacity(line.len());
  let mut rest = line;

  while let Some(pos) = rest.find("[]") {
    result.push_str(&rest[..pos]);
    let after = &rest[pos + 2..];

    // A link target needs `(` right after `[]`, a closing `)`, and at least
    // one character in between.
    let target = after
      .strip_prefix('(')
      .and_then(|tail| tail.split_once(')'))
      .filter(|(url, _)| !url.is_empty());

    match target {
      Some((url, tail)) => {
        if url.starts_with('#') {
          // Add placeholder text for comrak to parse it as a link
          let _ = write!(result, "[{{{{ANCHOR}}}}]({url})");
        } else if url.starts_with("http://") || url.starts_with("https://") {
          // Convert URL autolinks to standard <url> format
          let _ = write!(result, "<{url}>");
        } else {
          // Keep other patterns as-is
          let _ = write!(result, "[]({url})");
        }
        rest = tail;
      },
      None => {
        // Not an autolink: restore the full `[]` (including the opening
        // bracket) and keep scanning right after it.
        result.push_str("[]");
        rest = after;
      },
    }
  }

  result.push_str(rest);
  result
}
543
/// Process inline anchors in markdown content.
///
/// Converts inline anchor syntax such as `[]{#my-anchor}` into HTML spans,
/// line by line. Lines inside fenced code blocks are copied through
/// unchanged. Every output line is terminated with `\n`.
///
/// List items that start with an anchor (`- []{#id} content`,
/// `1. []{#id} content`) are handled first, whether or not they are indented.
/// All other lines go through the general inline anchor scan.
///
/// # Arguments
///
/// * `content` - The markdown content to process
///
/// # Returns
///
/// The processed markdown with inline anchors converted to HTML spans
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_inline_anchors(content: &str) -> String {
  let mut result = String::with_capacity(content.len() + 100);
  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();

  for line in content.lines() {
    // Update fence tracking state
    fence_tracker = fence_tracker.process_line(line);

    if fence_tracker.in_code_block() {
      // In code block, keep line as-is
      result.push_str(line);
    } else {
      let trimmed = line.trim_start();
      // `find_list_item_anchor` reports an offset into `trimmed`. Shift it
      // by the stripped indentation so it indexes the full line; using it
      // unshifted mis-sliced indented list items and could land inside a
      // multibyte whitespace character.
      let indent = line.len() - trimmed.len();

      let list_item = find_list_item_anchor(trimmed)
        .and_then(|offset| process_list_item_anchor(line, indent + offset));

      match list_item {
        Some(processed_line) => result.push_str(&processed_line),
        // Process regular inline anchors in the line
        None => result.push_str(&process_line_anchors(line)),
      }
    }
    result.push('\n');
  }

  result
}
592
/// Detect a list marker that is immediately followed by an inline anchor.
///
/// `trimmed` is expected to have its leading whitespace removed already.
/// Recognised shapes are a bullet (`-`, `*` or `+`) or an ordered marker (one
/// or more ASCII digits plus `.`), then a single space, then `[]{#`.
///
/// # Returns
///
/// The byte offset within `trimmed` at which the anchor starts, or `None` if
/// the line does not have this shape.
fn find_list_item_anchor(trimmed: &str) -> Option<usize> {
  const ANCHOR_OPEN: &str = "[]{#";

  // Unordered list: "- []{#id}", "* []{#id}" or "+ []{#id}"
  let after_bullet = ["- ", "* ", "+ "]
    .iter()
    .find_map(|marker| trimmed.strip_prefix(marker));
  if after_bullet.is_some_and(|item| item.starts_with(ANCHOR_OPEN)) {
    return Some(2);
  }

  // Ordered list: "1. []{#id}" or "123. []{#id}"
  let digits = trimmed.bytes().take_while(u8::is_ascii_digit).count();
  if digits == 0 {
    return None;
  }
  trimmed[digits..]
    .strip_prefix(". ")
    .filter(|item| item.starts_with(ANCHOR_OPEN))
    .map(|_| digits + 2)
}
624
/// Process a list item line that contains an anchor.
///
/// `anchor_start` is the byte offset in `line` at which `[]{#id}` is expected
/// to begin. The anchor is replaced by an empty `span` carrying the ID; the
/// text before and after it is kept as-is.
///
/// # Returns
///
/// The rewritten line, or `None` when
/// - `anchor_start` is out of range or not on a character boundary,
/// - the text at `anchor_start` does not start with `[]{#` or has no closing
///   `}`, or
/// - the ID is empty or contains characters other than ASCII alphanumerics,
///   `-` and `_`.
fn process_list_item_anchor(line: &str, anchor_start: usize) -> Option<String> {
  // Checked slicing: a bad offset yields `None` instead of a panic.
  let before_anchor = line.get(..anchor_start)?;
  let after_marker = line.get(anchor_start..)?;

  // Split "[]{#id}rest" into the ID and whatever follows the closing brace.
  let (id, remaining_content) =
    after_marker.strip_prefix("[]{#")?.split_once('}')?;

  let id_is_valid = !id.is_empty()
    && id
      .chars()
      .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');

  id_is_valid.then(|| {
    format!(
      "{before_anchor}<span id=\"{id}\" \
       class=\"nixos-anchor\"></span>{remaining_content}"
    )
  })
}
654
/// Process inline anchors in a single line.
///
/// Every `[]{#id}` whose ID is non-empty and consists only of ASCII
/// alphanumerics, `-` and `_` is replaced by
/// `<span id="id" class="nixos-anchor"></span>`.
///
/// Anything else is copied through verbatim. That covers a bare `[]`, `[]{`
/// followed by something other than `#`, an empty or invalid ID, and an
/// anchor with no closing `}`.
fn process_line_anchors(line: &str) -> String {
  let mut result = String::with_capacity(line.len());
  let mut rest = line;

  while let Some(pos) = rest.find("[]") {
    result.push_str(&rest[..pos]);
    let after = &rest[pos + 2..];

    // An anchor needs `{#` right after `[]`, a closing `}`, and a valid ID
    // in between.
    let anchor = after
      .strip_prefix("{#")
      .and_then(|tail| tail.split_once('}'))
      .filter(|(id, _)| {
        !id.is_empty()
          && id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
      });

    match anchor {
      Some((id, tail)) => {
        let _ = write!(result, "<span id=\"{id}\" class=\"nixos-anchor\"></span>");
        rest = tail;
      },
      None => {
        // Not an anchor: restore the `[]` exactly as written and keep
        // scanning after it, so the remaining text is emitted unchanged.
        result.push_str("[]");
        rest = after;
      },
    }
  }

  result.push_str(rest);
  result
}
718
719/// Process block elements in markdown content.
720///
721/// This function processes block elements including admonitions, figures, and
722/// definition lists while being code-block aware to avoid processing inside
723/// code fences.
724///
725/// # Arguments
726///
727/// * `content` - The markdown content to process
728///
729/// # Returns
730///
731/// The processed markdown with block elements converted to HTML
732#[cfg(feature = "nixpkgs")]
733#[must_use]
734pub fn process_block_elements(content: &str) -> String {
735  let mut result = Vec::new();
736  let mut lines = content.lines().peekable();
737  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();
738
739  while let Some(line) = lines.next() {
740    // Update fence tracking state
741    fence_tracker = fence_tracker.process_line(line);
742
743    // Only process block elements if we're not in a code block
744    if !fence_tracker.in_code_block() {
745      // Check for GitHub-style callouts: > [!TYPE]
746      if let Some((callout_type, initial_content)) = parse_github_callout(line)
747      {
748        let content =
749          collect_github_callout_content(&mut lines, &initial_content);
750        let admonition = render_admonition(&callout_type, None, &content);
751        result.push(admonition);
752        continue;
753      }
754
755      // Check for fenced admonitions: ::: {.type}
756      if let Some((adm_type, id)) = parse_fenced_admonition_start(line) {
757        let (content, trailing) = collect_fenced_content(&mut lines);
758        let admonition = render_admonition(&adm_type, id.as_deref(), &content);
759        result.push(admonition);
760        // If there's trailing content after the closing :::, add it as a new
761        // line
762        if let Some(trailing_content) = trailing {
763          result.push(trailing_content);
764        }
765        continue;
766      }
767
768      // Check for figures: ::: {.figure #id}
769      if let Some((id, title, content)) = parse_figure_block(line, &mut lines) {
770        let figure = render_figure(id.as_deref(), &title, &content);
771        result.push(figure);
772        continue;
773      }
774    }
775
776    // Regular line, keep as-is
777    result.push(line.to_string());
778  }
779
780  result.join("\n")
781}
782
/// Recognise the opening line of a GitHub-style callout: `> [!TYPE] content`.
///
/// Leading whitespace is ignored. `TYPE` must be exactly one of `NOTE`,
/// `TIP`, `IMPORTANT`, `WARNING`, `CAUTION` or `DANGER` (upper case).
///
/// # Returns
///
/// `Some((type, content))` with the type lower-cased and the text after the
/// closing bracket trimmed, or `None` if the line is not a callout start.
fn parse_github_callout(line: &str) -> Option<(String, String)> {
  const CALLOUT_TYPES: [&str; 6] =
    ["NOTE", "TIP", "IMPORTANT", "WARNING", "CAUTION", "DANGER"];

  let (callout_type, rest) = line
    .trim_start()
    .strip_prefix("> [!")?
    .split_once(']')?;

  CALLOUT_TYPES
    .contains(&callout_type)
    .then(|| (callout_type.to_lowercase(), rest.trim().to_string()))
}
808
/// Check if a line starts with a valid ATX header marker.
///
/// Per the `CommonMark` spec, an ATX header opens with 1-6 `#` characters
/// that are followed by either:
///
/// - A whitespace character (space, tab, etc.)
/// - End of line (the string ends)
///
/// So `# Title` and `###` qualify, while `#tag`, `#1` and a run of seven or
/// more `#` do not.
///
/// # Arguments
///
/// * `line` - The line to check; leading indentation is not skipped here
///
/// # Returns
///
/// `true` if the line starts with a valid ATX header marker
fn is_atx_header(line: &str) -> bool {
  // `#` is ASCII, so the count of leading `#` bytes is also a valid slice
  // index.
  let hash_count = line.bytes().take_while(|&b| b == b'#').count();
  if !(1..=6).contains(&hash_count) {
    return false;
  }

  // Only the character right after the hashes matters. Whether that
  // character happens to be the last one on the line is irrelevant.
  match line[hash_count..].chars().next() {
    None => true,
    Some(next) => next.is_whitespace(),
  }
}
846
847/// Collect content for GitHub-style callouts
848fn collect_github_callout_content(
849  lines: &mut std::iter::Peekable<std::str::Lines>,
850  initial_content: &str,
851) -> String {
852  let mut content = String::new();
853
854  if !initial_content.is_empty() {
855    content.push_str(initial_content);
856    content.push('\n');
857  }
858
859  while let Some(line) = lines.peek() {
860    let trimmed = line.trim_start();
861
862    // Empty line ends the blockquote
863    if trimmed.is_empty() {
864      break;
865    }
866
867    // Check if this is a continuation line with `>`
868    let content_part = if trimmed.starts_with('>') {
869      trimmed.strip_prefix('>').unwrap_or("").trim_start()
870    } else {
871      // Check if this line starts a new block element that cannot be
872      // lazy-continued ATX headers, setext header underlines, code
873      // fences, and thematic breaks
874      let starts_new_block = is_atx_header(trimmed)
875        || trimmed.starts_with("```")
876        || trimmed.starts_with("~~~")
877        || (trimmed.starts_with("---")
878          && trimmed.chars().all(|c| c == '-' || c.is_whitespace()))
879        || (trimmed.starts_with("===")
880          && trimmed.chars().all(|c| c == '=' || c.is_whitespace()))
881        || (trimmed.starts_with("***")
882          && trimmed.chars().all(|c| c == '*' || c.is_whitespace()));
883
884      if starts_new_block {
885        break;
886      }
887
888      // Lazy continuation
889      // Mind you, "lazy" doesn't refer to me being lazy but the GFM feature for
890      // a line without `>` that continues the blockquote
891      // paragraph
892      trimmed
893    };
894
895    content.push_str(content_part);
896    content.push('\n');
897    lines.next(); // consume the line
898  }
899
900  content.trim().to_string()
901}
902
/// Recognise the opening line of a fenced admonition: `::: {.type #id}`.
///
/// After trimming, the line must start with `:::`, followed (optionally
/// after whitespace) by `{.` and a closing `}`. Inside the braces, the first
/// whitespace-separated word is the admonition type, and the first word that
/// starts with `#` supplies the optional ID.
///
/// # Returns
///
/// `Some((type, id))`, or `None` if the line does not have this shape or the
/// braces contain no type.
fn parse_fenced_admonition_start(
  line: &str,
) -> Option<(String, Option<String>)> {
  let (attributes, _) = line
    .trim()
    .strip_prefix(":::")?
    .trim_start()
    .strip_prefix("{.")?
    .split_once('}')?;

  let adm_type = attributes.split_whitespace().next()?;
  let id = attributes
    .split_whitespace()
    .find_map(|word| word.strip_prefix('#'))
    .map(str::to_string);

  Some((adm_type.to_string(), id))
}
935
/// Consume lines up to and including the closing `:::` fence.
///
/// A line closes the block when, after trimming, it starts with `:::`. If
/// the input runs out first, everything read so far is returned.
///
/// # Returns
///
/// Tuple of (`admonition_content`, `trailing_content`). The content is the
/// collected lines joined with `\n` and trimmed. `trailing_content` holds
/// whatever follows `:::` on the closing line, and is `None` when nothing
/// follows or no closing line was found.
fn collect_fenced_content(
  lines: &mut std::iter::Peekable<std::str::Lines>,
) -> (String, Option<String>) {
  let mut body = String::new();
  let mut trailing = None;

  for raw in lines.by_ref() {
    match raw.trim().strip_prefix(":::") {
      Some(after_fence) => {
        // Closing delimiter; keep anything written after it on the same line
        trailing = (!after_fence.is_empty()).then(|| after_fence.to_string());
        break;
      },
      None => {
        body.push_str(raw);
        body.push('\n');
      },
    }
  }

  (body.trim().to_string(), trailing)
}
965
/// Parse a figure block.
///
/// The expected shape is
///
/// ```text
/// ::: {.figure #id}
/// # Title
/// content ...
/// :::
/// ```
///
/// `line` is the opening fence; the title, content and closing fence are
/// read from `lines`. The ID is the trimmed text between the first `#` and
/// the first `}` on the opening line, if any. The title is the next line
/// with its leading `#` and surrounding whitespace removed.
///
/// # Returns
///
/// `Some((id, title, content))` for a figure. `None` if `line` is not a
/// figure fence or is not followed by a title line; in that case nothing is
/// consumed from `lines`, so the caller can process those lines in another
/// way.
#[allow(
  clippy::option_if_let_else,
  reason = "Nested options clearer with if-let"
)]
fn parse_figure_block(
  line: &str,
  lines: &mut std::iter::Peekable<std::str::Lines>,
) -> Option<(Option<String>, String, String)> {
  let after_colons = line.trim().strip_prefix(":::")?.trim_start();
  if !after_colons.starts_with("{.figure") {
    return None;
  }

  // Extract ID if present: it has to sit inside the attribute braces
  let id = match (after_colons.find('#'), after_colons.find('}')) {
    (Some(hash_pos), Some(close_brace)) if hash_pos < close_brace => {
      Some(after_colons[hash_pos + 1..close_brace].trim().to_string())
    },
    _ => None,
  };

  // Get title from next line (should start with #). Peek first, so that a
  // line which is not a title stays in the iterator rather than being lost.
  let title = lines.peek()?.trim().strip_prefix('#')?.trim().to_string();
  lines.next(); // the title line is now consumed

  // Collect figure content up to (and including) the closing fence
  let mut content = String::new();
  for line in lines.by_ref() {
    if line.trim().starts_with(":::") {
      break;
    }
    content.push_str(line);
    content.push('\n');
  }

  Some((id, title, content.trim().to_string()))
}
1025
1026/// Render an admonition as HTML
1027fn render_admonition(
1028  adm_type: &str,
1029  id: Option<&str>,
1030  content: &str,
1031) -> String {
1032  let capitalized_type = crate::utils::capitalize_first(adm_type);
1033  let id_attr = id.map_or(String::new(), |id| format!(" id=\"{id}\""));
1034
1035  let opening = format!(
1036    "<div class=\"admonition {adm_type}\"{id_attr}>\n<p \
1037     class=\"admonition-title\">{capitalized_type}</p>"
1038  );
1039  format!("{opening}\n\n{content}\n\n</div>\n")
1040}
1041
/// Render a figure as an HTML `<figure>` element.
///
/// `title` becomes the `<figcaption>` and `content` is placed after it. The
/// opening tag carries an `id` attribute only when `id` is `Some`. All values
/// are inserted as given.
fn render_figure(id: Option<&str>, title: &str, content: &str) -> String {
  let open_tag = match id {
    Some(anchor) => format!("<figure id=\"{anchor}\">"),
    None => String::from("<figure>"),
  };

  [
    open_tag.as_str(),
    "\n<figcaption>",
    title,
    "</figcaption>\n",
    content,
    "\n</figure>",
  ]
  .concat()
}
1050
/// Process manpage references in HTML content.
///
/// Every `<span class="manpage-reference">` whose text is a key of
/// `manpage_urls` is replaced by an `<a class="manpage-reference">` element
/// with the same text. The link target is the mapped value passed through
/// `extract_url_from_html`. Spans without a matching entry are left in place,
/// and no span is replaced when `manpage_urls` is `None`.
///
/// # Arguments
///
/// * `html` - The HTML content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
///
/// # Returns
///
/// The processed HTML with manpage references converted to links
#[cfg(feature = "nixpkgs")]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashMap sufficient for this use case"
)]
pub fn process_manpage_references(
  html: &str,
  manpage_urls: Option<&std::collections::HashMap<String, String>>,
) -> String {
  process_safe(
    html,
    |html| {
      use kuchikikiki::NodeRef;
      use tendril::TendrilSink;

      // Builds an attribute value without a namespace prefix.
      let attribute = |value: &str| {
        kuchikikiki::Attribute {
          prefix: None,
          value:  value.into(),
        }
      };

      let document = kuchikikiki::parse_html().one(html);

      // Queue (span, replacement link) pairs; they are swapped in after the
      // selection loop has finished.
      let mut pending = Vec::new();

      for span in safe_select(&document, "span.manpage-reference") {
        let label = span.text_contents();

        // Only spans whose exact text has a URL entry are converted.
        let Some(target) = manpage_urls.and_then(|urls| urls.get(&label))
        else {
          continue;
        };

        let anchor = NodeRef::new_element(
          markup5ever::QualName::new(
            None,
            markup5ever::ns!(html),
            markup5ever::local_name!("a"),
          ),
          vec![
            (
              kuchikikiki::ExpandedName::new("", "href"),
              attribute(extract_url_from_html(target)),
            ),
            (
              kuchikikiki::ExpandedName::new("", "class"),
              attribute("manpage-reference"),
            ),
          ],
        );
        anchor.append(NodeRef::new_text(label));
        pending.push((span.clone(), anchor));
      }

      // Put each link where its span was, then drop the span from the tree.
      for (span, anchor) in pending {
        span.insert_before(anchor);
        span.detach();
      }

      // Serialization errors are ignored; invalid UTF-8 falls back to the
      // input HTML.
      let mut bytes = Vec::new();
      let _ = document.serialize(&mut bytes);
      match String::from_utf8(bytes) {
        Ok(serialized) => serialized,
        Err(_) => html.to_string(),
      }
    },
    // Fallback argument for `process_safe`. The original note here says the
    // original HTML is returned on error; that behavior is defined by
    // `process_safe`, not in this function.
    "",
  )
}
1134
/// Process option references.
///
/// Wraps `<code class="nixos-option">` elements (the output of `{option}`
/// role markup) in links to the options page. Each link points at
/// `options.html#<id>`, where `<id>` is the element's text passed through
/// `sanitize_option_id`, and carries the class `option-reference`. The
/// replacement contains a new attribute-less `<code>` with the same text.
///
/// An element is skipped when one of its ancestors is already an `<a>` whose
/// class contains `option-reference`, or when `valid_options` is provided and
/// does not contain the element's text.
///
/// # Arguments
///
/// * `html` - The HTML string to process.
/// * `valid_options` - Optional set of valid option names for validation.
///
/// # Returns
///
/// The HTML string with option references rewritten as links.
#[cfg(feature = "ndg-flavored")]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashSet sufficient for this use case"
)]
pub fn process_option_references(
  html: &str,
  valid_options: Option<&std::collections::HashSet<String>>,
) -> String {
  use kuchikikiki::{Attribute, ExpandedName, NodeRef};
  use markup5ever::{QualName, local_name, ns};
  use tendril::TendrilSink;

  process_safe(
    html,
    |html| {
      // True for an `<a>` element whose class attribute mentions
      // `option-reference`.
      let is_option_link = |node: &NodeRef| {
        node.as_element().is_some_and(|element| {
          element.name.local == local_name!("a")
            && element
              .attributes
              .borrow()
              .get(local_name!("class"))
              .is_some_and(|classes| classes.contains("option-reference"))
        })
      };

      let document = kuchikikiki::parse_html().one(html);
      let mut pending = vec![];

      // Only code elements that already have the nixos-option class (from
      // {option} role syntax) are considered.
      for code_el in safe_select(&document, "code.nixos-option") {
        let option_name = code_el.text_contents();

        // Walk parent -> grandparent -> ... looking for an existing
        // option-reference link.
        let already_linked =
          std::iter::successors(code_el.parent(), |node| node.parent())
            .any(|ancestor| is_option_link(&ancestor));
        if already_linked {
          continue;
        }

        // Without a validation set every option is linked; with one, only
        // known options are. Anything else is left as a plain code element.
        let linkable = match valid_options {
          Some(known) => known.contains(option_name.as_str()),
          None => true,
        };
        if !linkable {
          continue;
        }

        let href =
          format!("options.html#{}", sanitize_option_id(option_name.as_str()));
        let anchor = NodeRef::new_element(
          QualName::new(None, ns!(html), local_name!("a")),
          vec![
            (ExpandedName::new("", "href"), Attribute {
              prefix: None,
              value:  href,
            }),
            (ExpandedName::new("", "class"), Attribute {
              prefix: None,
              value:  "option-reference".into(),
            }),
          ],
        );
        let inner_code = NodeRef::new_element(
          QualName::new(None, ns!(html), local_name!("code")),
          vec![],
        );
        inner_code.append(NodeRef::new_text(option_name));
        anchor.append(inner_code);
        pending.push((code_el.clone(), anchor));
      }

      // Put each link where its code element was, then remove the original.
      for (code_el, anchor) in pending {
        code_el.insert_before(anchor);
        code_el.detach();
      }

      // Serialization errors are ignored; invalid UTF-8 falls back to the
      // input HTML.
      let mut bytes = Vec::new();
      let _ = document.serialize(&mut bytes);
      match String::from_utf8(bytes) {
        Ok(serialized) => serialized,
        Err(_) => html.to_string(),
      }
    },
    // Fallback argument for `process_safe`. The original note here says the
    // original HTML is returned on error; that behavior is defined by
    // `process_safe`, not in this function.
    "",
  )
}
1239
/// Pull the link target out of an HTML anchor, or pass a plain URL through.
///
/// When the input starts with exactly `<a href="`, the text up to the next
/// `"` is returned. In every other case, including an anchor whose `href`
/// value is never closed, the input is returned unchanged.
fn extract_url_from_html(url_or_html: &str) -> &str {
  url_or_html
    .strip_prefix("<a href=\"")
    .and_then(|after_quote| after_quote.split_once('"'))
    .map_or(url_or_html, |(href, _)| href)
}
1257
/// Process wikilinks and Obsidian-style links in markdown content.
///
/// Each line outside a fenced code block is passed through
/// `process_line_wikilinks`, which rewrites:
///
/// - `[[page]]` (Obsidian link) -> `<a href="page.html"
///   class="obsidian-link">page</a>`
/// - `[[name|url]]` (Wiki link) -> `<a href="url" class="wikilink">name</a>`
///
/// Lines for which the `FenceTracker` reports a code block are copied
/// verbatim. Output lines are joined with `\n` and trailing whitespace at the
/// end of the result is removed.
///
/// # Arguments
///
/// * `content` - The markdown content to process
///
/// # Returns
///
/// The processed markdown with wiki/Obsidian links converted to HTML
#[cfg(feature = "wiki")]
#[must_use]
pub fn process_wikilinks(content: &str) -> String {
  use crate::utils::codeblock::FenceTracker;

  let mut fence = FenceTracker::new();
  let mut output = String::with_capacity(content.len());

  for line in content.lines() {
    // Update the fence state first so the current line is classified by it.
    fence = fence.process_line(line);

    if fence.in_code_block() {
      output.push_str(line);
    } else {
      output.push_str(&process_line_wikilinks(line));
    }
    output.push('\n');
  }

  // Drop trailing whitespace, including the newline added after the last
  // line.
  let kept = output.trim_end().len();
  output.truncate(kept);
  output
}
1296
/// Process wikilinks in a single line.
///
/// Scans `line` for `[[...]]` spans and replaces each with an HTML anchor:
///
/// - `[[name|url]]` becomes `<a href="url" class="wikilink">name</a>`
/// - `[[page]]` becomes `<a href="page.html" class="obsidian-link">page</a>`
///
/// Names, URLs and page names are trimmed. Link text is HTML-escaped, and
/// `href` values are escaped for a double-quoted attribute so that a `"` in
/// the target cannot end the attribute early. Spans that are empty or hold
/// only whitespace, and `[[` openers without a matching `]]` on the line, are
/// emitted unchanged.
#[cfg(feature = "wiki")]
fn process_line_wikilinks(line: &str) -> String {
  let mut result = String::with_capacity(line.len());
  let mut chars = line.chars().peekable();

  while let Some(ch) = chars.next() {
    // Anything that is not a `[[` opener is copied through as-is.
    if ch != '[' || chars.peek() != Some(&'[') {
      result.push(ch);
      continue;
    }
    chars.next(); // consume the second '['

    // Read up to the first `]]`, or to the end of the line if none exists.
    let mut inner = String::new();
    let mut found_double_close = false;
    while let Some(next_ch) = chars.next() {
      if next_ch == ']' && chars.peek() == Some(&']') {
        chars.next();
        found_double_close = true;
        break;
      }
      inner.push(next_ch);
    }

    if !found_double_close {
      // Unterminated opener: restore exactly what was read.
      result.push_str("[[");
      result.push_str(&inner);
      continue;
    }

    if inner.trim().is_empty() {
      // Nothing to link to; keep the literal text rather than emitting an
      // empty anchor pointing at ".html".
      result.push_str("[[");
      result.push_str(&inner);
      result.push_str("]]");
      continue;
    }

    // Writing into a `String` cannot fail, so the `fmt::Result` is ignored.
    match inner.split_once('|') {
      Some((name, url)) => {
        let _ = write!(
          result,
          "<a href=\"{}\" class=\"wikilink\">{}</a>",
          html_escape::encode_double_quoted_attribute(url.trim()),
          encode_text(name.trim())
        );
      },
      None => {
        let page = inner.trim();
        let _ = write!(
          result,
          "<a href=\"{}.html\" class=\"obsidian-link\">{}</a>",
          html_escape::encode_double_quoted_attribute(page),
          encode_text(page)
        );
      },
    }
  }

  result
}
1354
#[cfg(test)]
mod tests {
  use super::*;

  /// Assert that every string in `cases` is recognized as an ATX header.
  fn assert_headers(cases: &[&str]) {
    for case in cases {
      assert!(is_atx_header(case), "expected an ATX header: {case:?}");
    }
  }

  /// Assert that no string in `cases` is recognized as an ATX header.
  fn assert_not_headers(cases: &[&str]) {
    for case in cases {
      assert!(!is_atx_header(case), "expected no ATX header: {case:?}");
    }
  }

  #[test]
  fn test_is_atx_header_valid_headers() {
    // 1-6 hashes followed by a space
    assert_headers(&[
      "# Header",
      "## Header",
      "### Header",
      "#### Header",
      "##### Header",
      "###### Header",
    ]);

    // tab after the hashes
    assert_headers(&["#\tHeader", "##\tHeader"]);

    // hashes only, no content after them
    assert_headers(&["#", "##", "###", "####", "#####", "######"]);

    // multiple spaces after the hashes
    assert_headers(&["#  Header with multiple spaces", "##   Header"]);
  }

  #[test]
  fn test_is_atx_header_invalid_headers() {
    // more than 6 hashes
    assert_not_headers(&["####### Too many hashes", "######## Even more"]);

    // no space after the hash
    assert_not_headers(&["#NoSpace", "##NoSpace"]);

    // hash in the middle, empty string, no hash at all
    assert_not_headers(&["Not # a header", "", "Regular text"]);

    // non-whitespace immediately after the hash
    assert_not_headers(&["#hashtag", "##hashtag", "#123", "##abc"]);

    // special characters immediately after the hash
    assert_not_headers(&["#!important", "#@mention", "#$variable"]);
  }

  #[test]
  fn test_is_atx_header_edge_cases() {
    // leading whitespace is normally trimmed by the caller; checked here for
    // robustness
    assert_not_headers(&[" # Header", "  ## Header"]);

    // only spaces after the hash (still valid)
    assert_headers(&["#     ", "##    "]);

    // string ends with a newline after a valid header
    assert_headers(&["# Header\n", "## Header\n"]);

    // mixed whitespace after the hash
    assert_headers(&["# \t  Header", "##  \tHeader"]);
  }

  #[test]
  fn test_is_atx_header_blockquote_context() {
    // the kind of strings collect_github_callout_content passes after
    // trim_start()
    assert_headers(&["# New Section", "## Subsection"]);

    // non-headers that must not break a blockquote
    assert_not_headers(&["#tag", "##issue-123", "###no-space"]);

    // exactly 6 hashes is valid, 7 is not
    assert_headers(&["###### Level 6"]);
    assert_not_headers(&["####### Not valid"]);
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_obsidian_basic() {
    let html = process_wikilinks("Check out [[Some Page]] for details.");
    for needle in [
      "href=\"Some Page.html\"",
      "class=\"obsidian-link\"",
      ">Some Page<",
    ] {
      assert!(html.contains(needle), "missing {needle:?} in {html:?}");
    }
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_with_url() {
    let html = process_wikilinks("See [[Custom Name|https://example.com]]");
    for needle in [
      "href=\"https://example.com\"",
      "class=\"wikilink\"",
      ">Custom Name<",
    ] {
      assert!(html.contains(needle), "missing {needle:?} in {html:?}");
    }
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_with_spaces() {
    let html = process_wikilinks("[[My Page Name]]");
    assert!(html.contains("href=\"My Page Name.html\""));
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_in_code_block() {
    let html = process_wikilinks("```\n[[Wiki Link]]\n```\nThen [[Another]]");
    // inside the fence the link stays literal; outside it is converted
    assert!(html.contains("[[Wiki Link]]"));
    assert!(html.contains("href=\"Another.html\""));
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_empty() {
    let html = process_wikilinks("[[]]");
    assert!(html.contains("[[]]"));
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_malformed() {
    let html = process_wikilinks("[[ incomplete");
    assert!(html.contains("[[ incomplete"));
  }

  #[cfg(feature = "wiki")]
  #[test]
  fn test_wikilink_html_escaping() {
    let html = process_wikilinks("See [[Page With <script>]] for info");
    assert!(html.contains("&lt;script&gt;"));
    assert!(!html.contains(">Page With <script><"));
  }
}
ndg_commonmark/processor/extensions.rs

ndg_commonmark/processor/
extensions.rs