ndg_commonmark/processor/
extensions.rs

1//! Feature-specific Markdown processing extensions.
2use std::{fmt::Write, fs, path::Path};
3
4use html_escape;
5
6use super::process::process_safe;
7
8/// Safely select DOM elements with graceful error handling.
9fn safe_select(
10  document: &kuchikikiki::NodeRef,
11  selector: &str,
12) -> Vec<kuchikikiki::NodeRef> {
13  match document.select(selector) {
14    Ok(selections) => selections.map(|sel| sel.as_node().clone()).collect(),
15    Err(e) => {
16      log::warn!("DOM selector '{selector}' failed: {e:?}");
17      Vec::new()
18    },
19  }
20}
21
/// Apply GitHub Flavored Markdown (GFM) extensions to the input markdown.
///
/// Currently a pass-through: comrak's own options already enable the GFM
/// features we need. This hook exists so any future GFM-specific
/// preprocessing or AST transformation (custom tables, task lists, etc.)
/// has a home.
///
/// # Arguments
/// * `markdown` - The input markdown text
///
/// # Returns
/// The processed markdown text with GFM extensions applied
#[cfg(feature = "gfm")]
#[must_use]
pub fn apply_gfm_extensions(markdown: &str) -> String {
  String::from(markdown)
}
41
/// Maximum recursion depth for file includes, guarding against infinite
/// recursion from circular include chains (a file that transitively
/// includes itself). Enforced by `process_file_includes`.
const MAX_INCLUDE_DEPTH: usize = 8;
44
/// Check whether `path` is acceptable for file inclusion.
///
/// Rejects absolute paths, paths containing backslashes, and paths with
/// any `..` component, so includes cannot escape the documentation tree.
#[cfg(feature = "nixpkgs")]
fn is_safe_path(path: &str, _base_dir: &Path) -> bool {
  if path.contains('\\') {
    return false;
  }
  let candidate = Path::new(path);
  !candidate.is_absolute()
    && candidate
      .components()
      .all(|c| !matches!(c, std::path::Component::ParentDir))
}
63
/// Parse the custom `html:into-file=` output directive from an include
/// block's opening line, returning the target path if present.
#[cfg(feature = "nixpkgs")]
fn parse_include_directive(line: &str) -> Option<String> {
  const DIRECTIVE: &str = "html:into-file=";
  let value_start = line.find(DIRECTIVE)? + DIRECTIVE.len();
  let value = &line[value_start..];
  // Value runs up to the first space, or (trimmed) to the end of the line.
  let parsed = match value.find(' ') {
    Some(end) => value[..end].to_string(),
    None => value.trim().to_string(),
  };
  Some(parsed)
}
82
/// Read and process files listed in an include block.
///
/// Each non-empty line of `listing` is treated as a path relative to
/// `base_dir`. Files that pass `is_safe_path` are read, recursively
/// processed for nested includes, and appended to the returned string.
/// Unreadable files are replaced with an HTML comment marker rather than
/// failing the whole run.
///
/// # Arguments
/// * `listing` - Body of a `{=include=}` block, one path per line
/// * `base_dir` - Directory against which listed paths are resolved
/// * `custom_output` - Optional `html:into-file=` target recorded per file
/// * `included_files` - Accumulator for all files pulled in (incl. nested)
/// * `depth` - Current recursion depth, forwarded to `process_file_includes`
///
/// # Errors
/// Propagates the recursion-depth error from `process_file_includes`.
#[cfg(feature = "nixpkgs")]
#[allow(
  clippy::needless_pass_by_value,
  reason = "Owned value needed for cloning in loop"
)]
fn read_includes(
  listing: &str,
  base_dir: &Path,
  custom_output: Option<String>,
  included_files: &mut Vec<crate::types::IncludedFile>,
  depth: usize,
) -> Result<String, String> {
  let mut result = String::new();

  for line in listing.lines() {
    let trimmed = line.trim();
    // Skip blank lines and anything failing the path-safety check.
    if trimmed.is_empty() || !is_safe_path(trimmed, base_dir) {
      continue;
    }
    let full_path = base_dir.join(trimmed);
    log::info!("Including file: {}", full_path.display());

    match fs::read_to_string(&full_path) {
      Ok(content) => {
        // Nested includes resolve relative to the included file's own
        // directory, not the original base_dir.
        let file_dir = full_path.parent().unwrap_or(base_dir);
        let (processed_content, nested_includes) =
          process_file_includes(&content, file_dir, depth + 1)?;

        result.push_str(&processed_content);
        if !processed_content.ends_with('\n') {
          result.push('\n');
        }

        included_files.push(crate::types::IncludedFile {
          path:          trimmed.to_string(),
          custom_output: custom_output.clone(),
        });

        // Normalize nested include paths relative to original base_dir
        // so callers see one consistent path root. Nested paths that do
        // not live under base_dir (strip_prefix fails) are dropped here.
        for nested in nested_includes {
          let nested_full_path = file_dir.join(&nested.path);
          if let Ok(normalized_path) = nested_full_path.strip_prefix(base_dir) {
            included_files.push(crate::types::IncludedFile {
              path:          normalized_path.to_string_lossy().to_string(),
              custom_output: nested.custom_output,
            });
          }
        }
      },
      Err(_) => {
        // Best-effort: emit a marker comment instead of aborting the run.
        let _ = writeln!(
          result,
          "<!-- ndg: could not include file: {} -->",
          full_path.display()
        );
      },
    }
  }
  Ok(result)
}
144
/// Process file includes in Nixpkgs/NixOS documentation.
///
/// This function processes file include syntax:
///
/// ````markdown
/// ```{=include=}
/// path/to/file1.md
/// path/to/file2.md
/// ```
/// ````
///
/// # Arguments
///
/// * `markdown` - The input markdown text
/// * `base_dir` - The base directory for resolving relative file paths
/// * `depth` - Current recursion depth (use 0 for initial call)
///
/// # Returns
///
/// Returns `Ok((processed_markdown, included_files))` where `included_files` is
/// a list of all successfully included files.
///
/// # Errors
///
/// Returns `Err(message)` if recursion depth exceeds [`MAX_INCLUDE_DEPTH`],
/// which likely indicates a circular include cycle.
///
/// # Safety
///
/// Only relative paths without ".." are allowed for security.
#[cfg(feature = "nixpkgs")]
pub fn process_file_includes(
  markdown: &str,
  base_dir: &std::path::Path,
  depth: usize,
) -> Result<(String, Vec<crate::types::IncludedFile>), String> {
  // Check recursion depth limit
  if depth >= MAX_INCLUDE_DEPTH {
    return Err(format!(
      "Maximum include recursion depth ({MAX_INCLUDE_DEPTH}) exceeded. This \
       likely indicates a cycle in file includes."
    ));
  }

  let mut output = String::new();
  let mut lines = markdown.lines();
  // Tracks regular fenced code blocks so that "```{=include=}" appearing
  // inside example code is not expanded.
  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();
  let mut all_included_files: Vec<crate::types::IncludedFile> = Vec::new();

  while let Some(line) = lines.next() {
    let trimmed = line.trim_start();

    if !fence_tracker.in_code_block() && trimmed.starts_with("```{=include=}") {
      // Optional "html:into-file=..." directive on the opening fence line.
      let custom_output = parse_include_directive(trimmed);

      // Consume the block body (one path per line) up to the closing
      // fence. These lines are removed from the output entirely, which is
      // why the fence tracker is deliberately NOT updated for them.
      let mut include_listing = String::new();
      for next_line in lines.by_ref() {
        if next_line.trim_start().starts_with("```") {
          break;
        }
        include_listing.push_str(next_line);
        include_listing.push('\n');
      }

      // Splice in the (recursively processed) contents of listed files.
      let included = read_includes(
        &include_listing,
        base_dir,
        custom_output,
        &mut all_included_files,
        depth,
      )?;
      output.push_str(&included);
      continue;
    }

    // Update fence tracking state
    fence_tracker = fence_tracker.process_line(line);

    output.push_str(line);
    output.push('\n');
  }

  Ok((output, all_included_files))
}
229
/// Process role markup in markdown content.
///
/// Converts role syntax such as ``{command}`ls -la` `` to HTML, skipping
/// anything inside inline code or code fences.
///
/// # Arguments
///
/// * `content` - The markdown content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
/// * `auto_link_options` - Whether to convert {option} roles to links
/// * `valid_options` - Optional set of valid option names for validation
///
/// # Returns
///
/// The processed markdown with role markup converted to HTML
#[cfg(any(feature = "nixpkgs", feature = "ndg-flavored"))]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashMap/HashSet sufficient for this use case"
)]
pub fn process_role_markup(
  content: &str,
  manpage_urls: Option<&std::collections::HashMap<String, String>>,
  auto_link_options: bool,
  valid_options: Option<&std::collections::HashSet<String>>,
) -> String {
  let mut result = String::new();
  let mut chars = content.chars().peekable();
  let mut tracker = crate::utils::codeblock::InlineTracker::new();

  while let Some(ch) = chars.next() {
    // Handle backticks (code fences and inline code)
    if ch == '`' {
      let (new_tracker, tick_count) = tracker.process_backticks(&mut chars);
      tracker = new_tracker;

      // Add all the backticks
      result.push_str(&"`".repeat(tick_count));
      continue;
    }

    // Handle tilde code fences (~~~)
    if ch == '~' && chars.peek() == Some(&'~') {
      let (new_tracker, tilde_count) = tracker.process_tildes(&mut chars);
      tracker = new_tracker;

      result.push_str(&"~".repeat(tilde_count));
      continue;
    }

    // Handle newlines
    if ch == '\n' {
      tracker = tracker.process_newline();
      result.push(ch);
      continue;
    }

    // Process role markup only if we're not in any kind of code
    if ch == '{' && !tracker.in_any_code() {
      // Parse on a clone of the iterator and swap it in on success. The
      // previous implementation collected the remainder into a String and
      // re-advanced the main iterator by the *byte*-length difference,
      // which over-advanced (dropping characters) whenever the role
      // content contained multi-byte UTF-8; cloning also avoids an O(n)
      // copy for every '{' encountered.
      let mut lookahead = chars.clone();
      if let Some(role_markup) = parse_role_markup(
        &mut lookahead,
        manpage_urls,
        auto_link_options,
        valid_options,
      ) {
        // Valid role markup: commit the advanced iterator state.
        chars = lookahead;
        result.push_str(&role_markup);
      } else {
        // Not a valid role markup, keep the original character
        result.push(ch);
      }
    } else {
      result.push(ch);
    }
  }

  result
}
318
319/// Parse a role markup from the character iterator.
320///
321/// # Returns
322///
323/// `Some(html)` if a valid role markup is found, `None` otherwise.
324fn parse_role_markup(
325  chars: &mut std::iter::Peekable<std::str::Chars>,
326  manpage_urls: Option<&std::collections::HashMap<String, String>>,
327  auto_link_options: bool,
328  valid_options: Option<&std::collections::HashSet<String>>,
329) -> Option<String> {
330  let mut role_name = String::new();
331
332  // Parse role name (lowercase letters only)
333  while let Some(&ch) = chars.peek() {
334    if ch.is_ascii_lowercase() {
335      role_name.push(ch);
336      chars.next();
337    } else {
338      break;
339    }
340  }
341
342  // Must have a non-empty role name
343  if role_name.is_empty() {
344    return None;
345  }
346
347  // Expect closing brace
348  if chars.peek() != Some(&'}') {
349    return None;
350  }
351  chars.next(); // consume '}'
352
353  // Expect opening backtick
354  if chars.peek() != Some(&'`') {
355    return None;
356  }
357  chars.next(); // consume '`'
358
359  // Parse content until closing backtick
360  let mut content = String::new();
361  for ch in chars.by_ref() {
362    if ch == '`' {
363      // Found closing backtick, validate content
364      // Most role types should not have empty content
365      if content.is_empty() && !matches!(role_name.as_str(), "manpage") {
366        return None; // reject empty content for most roles
367      }
368      return Some(format_role_markup(
369        &role_name,
370        &content,
371        manpage_urls,
372        auto_link_options,
373        valid_options,
374      ));
375    }
376    content.push(ch);
377  }
378
379  // No closing backtick found
380  None
381}
382
383/// Format the role markup as HTML based on the role type and content.
384#[must_use]
385#[allow(
386  clippy::option_if_let_else,
387  reason = "Nested options clearer with if-let"
388)]
389#[allow(
390  clippy::implicit_hasher,
391  reason = "Standard HashMap/HashSet sufficient for this use case"
392)]
393pub fn format_role_markup(
394  role_type: &str,
395  content: &str,
396  manpage_urls: Option<&std::collections::HashMap<String, String>>,
397  auto_link_options: bool,
398  valid_options: Option<&std::collections::HashSet<String>>,
399) -> String {
400  let escaped_content = html_escape::encode_text(content);
401  match role_type {
402    "manpage" => {
403      if let Some(urls) = manpage_urls {
404        if let Some(url) = urls.get(content) {
405          format!(
406            "<a href=\"{url}\" \
407             class=\"manpage-reference\">{escaped_content}</a>"
408          )
409        } else {
410          format!("<span class=\"manpage-reference\">{escaped_content}</span>")
411        }
412      } else {
413        format!("<span class=\"manpage-reference\">{escaped_content}</span>")
414      }
415    },
416    "command" => format!("<code class=\"command\">{escaped_content}</code>"),
417    "env" => format!("<code class=\"env-var\">{escaped_content}</code>"),
418    "file" => format!("<code class=\"file-path\">{escaped_content}</code>"),
419    "option" => {
420      if cfg!(feature = "ndg-flavored") && auto_link_options {
421        // Check if validation is enabled and option is valid
422        let should_link =
423          valid_options.is_none_or(|opts| opts.contains(content)); // If no validation set, link all options
424
425        if should_link {
426          let option_id = format!("option-{}", content.replace('.', "-"));
427          format!(
428            "<a class=\"option-reference\" \
429             href=\"options.html#{option_id}\"><code \
430             class=\"nixos-option\">{escaped_content}</code></a>"
431          )
432        } else {
433          format!("<code class=\"nixos-option\">{escaped_content}</code>")
434        }
435      } else {
436        format!("<code class=\"nixos-option\">{escaped_content}</code>")
437      }
438    },
439    "var" => format!("<code class=\"nix-var\">{escaped_content}</code>"),
440    _ => format!("<span class=\"{role_type}-markup\">{escaped_content}</span>"),
441  }
442}
443
444/// Process MyST-style autolinks in markdown content.
445///
446/// Converts MyST-like autolinks supported by Nixpkgs-flavored commonmark:
447/// - `[](#anchor)` -> `[](#anchor) -> {{ANCHOR}}` (placeholder for comrak)
448/// - `[](https://url)` -> `<https://url>` (converted to standard autolink)
449///
450/// # Arguments
451///
452/// * `content` - The markdown content to process
453///
454/// # Returns
455///
456/// The processed markdown with `MyST` autolinks converted as a [`String`]
457#[must_use]
458pub fn process_myst_autolinks(content: &str) -> String {
459  let mut result = String::with_capacity(content.len());
460  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();
461
462  for line in content.lines() {
463    // Update fence tracking state
464    fence_tracker = fence_tracker.process_line(line);
465
466    // Only process MyST autolinks if we're not in a code block
467    if fence_tracker.in_code_block() {
468      result.push_str(line);
469    } else {
470      result.push_str(&process_line_myst_autolinks(line));
471    }
472    result.push('\n');
473  }
474
475  result
476}
477
/// Process `MyST` autolinks in a single line.
///
/// Recognizes the empty-text link form `[](target)`:
/// - `[](#anchor)` becomes `[{{ANCHOR}}](#anchor)` (placeholder that comrak
///   parses as a link)
/// - `[](http://…)` / `[](https://…)` becomes a standard `<url>` autolink
/// - any other target is left unchanged
///
/// `[]{…}` (inline anchor syntax) is deliberately left untouched.
fn process_line_myst_autolinks(line: &str) -> String {
  let mut result = String::with_capacity(line.len());
  let mut chars = line.chars().peekable();

  while let Some(ch) = chars.next() {
    if ch == '[' && chars.peek() == Some(&']') {
      chars.next(); // consume ']'

      // []{#...} is inline anchor syntax, not an autolink: keep as-is.
      if chars.peek() == Some(&'{') {
        result.push_str("[]");
        continue;
      }

      if chars.peek() == Some(&'(') {
        chars.next(); // consume '('

        // Collect URL until ')'
        let mut url = String::new();
        let mut found_closing = false;
        while let Some(&next_ch) = chars.peek() {
          if next_ch == ')' {
            chars.next(); // consume ')'
            found_closing = true;
            break;
          }
          url.push(next_ch);
          chars.next();
        }

        if found_closing && !url.is_empty() {
          // Check if it's an anchor link (starts with #) or a URL
          if url.starts_with('#') {
            // Add placeholder text for comrak to parse it as a link
            let _ = write!(result, "[{{{{ANCHOR}}}}]({url})");
          } else if url.starts_with("http://") || url.starts_with("https://") {
            // Convert URL autolinks to standard <url> format
            let _ = write!(result, "<{url}>");
          } else {
            // Keep other patterns as-is
            let _ = write!(result, "[]({url})");
          }
        } else {
          // Empty or unclosed target: reproduce the consumed characters
          // verbatim. (Previously the leading '[' and a consumed ')' were
          // dropped here, corrupting the line.)
          result.push_str("[](");
          result.push_str(&url);
          if found_closing {
            result.push(')');
          }
        }
      } else {
        // "[]" not followed by '(': emit both consumed characters.
        // (Previously only ']' was emitted, dropping the '['.)
        result.push_str("[]");
      }
    } else {
      result.push(ch);
    }
  }

  result
}
539
/// Process inline anchors in markdown content.
///
/// Converts inline anchor syntax like `[]{#my-anchor}` into empty HTML
/// spans, skipping lines inside code fences. List items whose anchor
/// directly follows the marker (`- []{#id} …`, `1. []{#id} …`) receive
/// special handling so the span lands right after the list marker.
///
/// # Arguments
///
/// * `content` - The markdown content to process
///
/// # Returns
///
/// The processed markdown with inline anchors converted to HTML spans
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_inline_anchors(content: &str) -> String {
  let mut output = String::with_capacity(content.len() + 100);
  let mut fences = crate::utils::codeblock::FenceTracker::new();

  for line in content.lines() {
    // Keep fence state current before deciding how to treat the line.
    fences = fences.process_line(line);

    if fences.in_code_block() {
      // Inside a fence: copy verbatim.
      output.push_str(line);
      output.push('\n');
      continue;
    }

    // "- []{#id} content" / "1. []{#id} content" get list-item handling;
    // everything else goes through the generic per-line anchor pass.
    let as_list_item = find_list_item_anchor(line.trim_start())
      .and_then(|start| process_list_item_anchor(line, start));
    match as_list_item {
      Some(processed) => output.push_str(&processed),
      None => output.push_str(&process_line_anchors(line)),
    }
    output.push('\n');
  }

  output
}
593
/// Find if a line (already left-trimmed) starts with a list marker followed
/// by an inline anchor, returning the byte offset where `[]{#` begins.
///
/// Recognized markers: `- `, `* `, `+ `, and ordered markers such as
/// `1. ` / `123. `.
fn find_list_item_anchor(trimmed: &str) -> Option<usize> {
  // Unordered list: "- []{#id}", "* []{#id}", "+ []{#id}"
  if let Some(rest) = trimmed
    .strip_prefix("- ")
    .or_else(|| trimmed.strip_prefix("* "))
    .or_else(|| trimmed.strip_prefix("+ "))
  {
    if rest.starts_with("[]{#") {
      return Some(2);
    }
  }

  // Ordered list: "1. []{#id}", "123. []{#id}". Digits are ASCII, so byte
  // offsets and char offsets agree here. (The previous `chars().nth(i)`
  // scan was O(n²) and mixed byte with char indexing.)
  let digits = trimmed.bytes().take_while(u8::is_ascii_digit).count();
  if digits > 0
    && trimmed[digits..].starts_with('.')
    && trimmed[digits + 1..].starts_with(" []{#")
  {
    // Offset of '[': digits + '.' + ' '.
    return Some(digits + 2);
  }

  None
}
624
/// Process a list item line containing an inline anchor right after the
/// list marker, turning `[]{#id}` into an empty anchor span.
///
/// Returns `None` when the text at `anchor_start` is not a well-formed
/// anchor with a valid ID (ASCII alphanumerics, `-`, `_`).
fn process_list_item_anchor(line: &str, anchor_start: usize) -> Option<String> {
  let (marker, rest) = line.split_at(anchor_start);
  let rest = rest.strip_prefix("[]{#")?;
  let (id, remaining) = rest.split_once('}')?;

  // IDs are restricted to a safe character set and must be non-empty.
  let valid = !id.is_empty()
    && id
      .chars()
      .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');
  if !valid {
    return None;
  }

  Some(format!(
    "{marker}<span id=\"{id}\" class=\"nixos-anchor\"></span>{remaining}"
  ))
}
654
/// Process inline anchors (`[]{#id}`) in a single line, emitting
/// `<span id="id" class="nixos-anchor"></span>` for each valid anchor.
///
/// IDs may contain ASCII alphanumerics, `-`, and `_`. Malformed or invalid
/// anchor syntax is reproduced unchanged.
fn process_line_anchors(line: &str) -> String {
  let mut result = String::with_capacity(line.len());
  let mut chars = line.chars().peekable();

  while let Some(ch) = chars.next() {
    if ch != '[' || chars.peek() != Some(&']') {
      result.push(ch);
      continue;
    }
    chars.next(); // consume ']'

    if chars.peek() != Some(&'{') {
      // "[]" not followed by '{': emit both consumed characters.
      // (Previously only ']' was emitted, dropping the '['.)
      result.push_str("[]");
      continue;
    }
    chars.next(); // consume '{'

    if chars.peek() != Some(&'#') {
      // "[]{" without '#': not an anchor; emit everything consumed.
      // (Previously only "]{" was emitted, dropping the '['.)
      result.push_str("[]{");
      continue;
    }
    chars.next(); // consume '#'

    // Collect the ID up to the closing '}'.
    let mut id = String::new();
    let mut handled = false;
    while let Some(&next_ch) = chars.peek() {
      if next_ch == '}' {
        chars.next(); // consume '}'
        if !id.is_empty()
          && id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
        {
          let _ = write!(
            result,
            "<span id=\"{id}\" class=\"nixos-anchor\"></span>"
          );
        } else {
          // Empty ID: restore the original text verbatim.
          // (Previously doubled braces were written here.)
          let _ = write!(result, "[]{{#{id}}}");
        }
        handled = true;
        break;
      }
      if next_ch.is_ascii_alphanumeric() || next_ch == '-' || next_ch == '_' {
        id.push(next_ch);
        chars.next();
      } else {
        // Invalid ID character: restore the consumed prefix verbatim and
        // let the outer loop re-emit the remainder.
        // (Previously a doubled '{' was written here.)
        let _ = write!(result, "[]{{#{id}");
        handled = true;
        break;
      }
    }
    if !handled {
      // Line ended mid-anchor: restore the consumed prefix.
      // (Previously it was silently dropped.)
      let _ = write!(result, "[]{{#{id}");
    }
  }

  result
}
718
/// Process block elements in markdown content.
///
/// Handles GitHub-style callouts (`> [!NOTE]`), figures
/// (`::: {.figure #id}`), and fenced admonitions (`::: {.type}`) outside
/// of code fences, converting each to HTML.
///
/// # Arguments
/// * `content` - The markdown content to process
///
/// # Returns
/// The processed markdown with block elements converted to HTML
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_block_elements(content: &str) -> String {
  let mut result = Vec::new();
  let mut lines = content.lines().peekable();
  let mut fence_tracker = crate::utils::codeblock::FenceTracker::new();

  while let Some(line) = lines.next() {
    // Update fence tracking state
    fence_tracker = fence_tracker.process_line(line);

    // Only process block elements if we're not in a code block
    if !fence_tracker.in_code_block() {
      // Check for GitHub-style callouts: > [!TYPE]
      if let Some((callout_type, initial_content)) = parse_github_callout(line)
      {
        let content =
          collect_github_callout_content(&mut lines, &initial_content);
        result.push(render_admonition(&callout_type, None, &content));
        continue;
      }

      // Check for figures: ::: {.figure #id}. This must run BEFORE the
      // generic fenced-admonition check: "::: {.figure}" also matches the
      // admonition pattern "::: {.type}", which previously shadowed this
      // branch and rendered figures as admonitions of type "figure".
      if let Some((id, title, content)) = parse_figure_block(line, &mut lines) {
        result.push(render_figure(id.as_deref(), &title, &content));
        continue;
      }

      // Check for fenced admonitions: ::: {.type}
      if let Some((adm_type, id)) = parse_fenced_admonition_start(line) {
        let content = collect_fenced_content(&mut lines);
        result.push(render_admonition(&adm_type, id.as_deref(), &content));
        continue;
      }
    }

    // Regular line, keep as-is
    result.push(line.to_string());
  }

  result.join("\n")
}
780
/// Parse a GitHub-style callout opener: `> [!TYPE] optional content`.
///
/// Returns the lowercased callout type and any content following the
/// `[!TYPE]` tag, or `None` for unrecognized types / non-callout lines.
fn parse_github_callout(line: &str) -> Option<(String, String)> {
  let after_tag = line.trim_start().strip_prefix("> [!")?;
  let (kind, rest) = after_tag.split_once(']')?;

  // Only the fixed set of GitHub callout keywords is recognized.
  let recognized = matches!(
    kind,
    "NOTE" | "TIP" | "IMPORTANT" | "WARNING" | "CAUTION" | "DANGER"
  );
  if !recognized {
    return None;
  }

  Some((kind.to_lowercase(), rest.trim().to_string()))
}
806
/// Check if a line starts with a valid ATX header (1-6 '#' followed by
/// whitespace or EOL).
///
/// Per the `CommonMark` spec, an ATX header requires 1-6 '#' characters
/// followed by either a whitespace character or the end of the line.
/// `#foo` is NOT a heading.
///
/// # Arguments
/// * `line` - The line to check
///
/// # Returns
/// `true` if the line starts with a valid ATX header marker
fn is_atx_header(line: &str) -> bool {
  let mut hash_count = 0;

  // count leading '#' characters (max 6)
  for c in line.chars() {
    if c == '#' {
      hash_count += 1;
      if hash_count > 6 {
        return false;
      }
    } else {
      // The character right after the hashes must be whitespace.
      // (Previously a non-whitespace char was also accepted when it was
      // the LAST character of the line, so "#a" counted as a header while
      // "#ab" did not.)
      return (1..=6).contains(&hash_count) && c.is_whitespace();
    }
  }

  // Line consists solely of 1-6 '#' characters.
  (1..=6).contains(&hash_count)
}
841
842/// Collect content for GitHub-style callouts
843fn collect_github_callout_content(
844  lines: &mut std::iter::Peekable<std::str::Lines>,
845  initial_content: &str,
846) -> String {
847  let mut content = String::new();
848
849  if !initial_content.is_empty() {
850    content.push_str(initial_content);
851    content.push('\n');
852  }
853
854  while let Some(line) = lines.peek() {
855    let trimmed = line.trim_start();
856
857    // Empty line ends the blockquote
858    if trimmed.is_empty() {
859      break;
860    }
861
862    // Check if this is a continuation line with `>`
863    let content_part = if trimmed.starts_with('>') {
864      trimmed.strip_prefix('>').unwrap_or("").trim_start()
865    } else {
866      // Check if this line starts a new block element that cannot be
867      // lazy-continued ATX headers, setext header underlines, code
868      // fences, and thematic breaks
869      let starts_new_block = is_atx_header(trimmed)
870        || trimmed.starts_with("```")
871        || trimmed.starts_with("~~~")
872        || (trimmed.starts_with("---")
873          && trimmed.chars().all(|c| c == '-' || c.is_whitespace()))
874        || (trimmed.starts_with("===")
875          && trimmed.chars().all(|c| c == '=' || c.is_whitespace()))
876        || (trimmed.starts_with("***")
877          && trimmed.chars().all(|c| c == '*' || c.is_whitespace()));
878
879      if starts_new_block {
880        break;
881      }
882
883      // Lazy continuation
884      // Mind yu, "lazy" doesn't refer to me being lazy but the GFM feature for
885      // a line without `>` that continues the blockquote
886      // paragraph
887      trimmed
888    };
889
890    content.push_str(content_part);
891    content.push('\n');
892    lines.next(); // consume the line
893  }
894
895  content.trim().to_string()
896}
897
/// Parse a fenced admonition opener: `::: {.type}` or `::: {.type #id}`.
///
/// Returns the admonition type and the optional `#id`, or `None` when the
/// line is not a fenced admonition start.
fn parse_fenced_admonition_start(
  line: &str,
) -> Option<(String, Option<String>)> {
  let after_colons = line.trim().strip_prefix(":::")?;
  let braced = after_colons.trim_start().strip_prefix("{.")?;
  let (spec, _) = braced.split_once('}')?;

  // First whitespace-separated token is the type; it must exist.
  let adm_type = spec.split_whitespace().next()?.to_string();
  // The ID is the first whitespace-separated token starting with '#'.
  let id = spec
    .split_whitespace()
    .find_map(|word| word.strip_prefix('#'))
    .map(str::to_string);

  Some((adm_type, id))
}
930
/// Collect lines up to the closing `:::` fence, returning the trimmed
/// body. The closing fence itself is consumed but not returned.
fn collect_fenced_content(
  lines: &mut std::iter::Peekable<std::str::Lines>,
) -> String {
  let body: Vec<&str> = lines
    .by_ref()
    .take_while(|l| !l.trim().starts_with(":::"))
    .collect();
  body.join("\n").trim().to_string()
}
947
/// Parse a figure block: `::: {.figure #id}` followed by a `# Title` line
/// and body lines up to the closing `:::`.
///
/// Returns `(id, title, content)` on success. When the line after the
/// opener is not a `# Title` line, `None` is returned and that line is
/// left in the iterator. (Previously it was consumed and lost despite the
/// "put the line back" comment.)
fn parse_figure_block(
  line: &str,
  lines: &mut std::iter::Peekable<std::str::Lines>,
) -> Option<(Option<String>, String, String)> {
  let after_colons = line.trim().strip_prefix(":::")?.trim_start();
  if !after_colons.starts_with("{.figure") {
    return None;
  }

  // Extract the optional "#id" appearing before the closing brace.
  let id = match (after_colons.find('#'), after_colons.find('}')) {
    (Some(hash_pos), Some(close_brace)) if hash_pos < close_brace => {
      Some(after_colons[hash_pos + 1..close_brace].trim().to_string())
    },
    _ => None,
  };

  // The next line must be the figure title ("# Title"). Peek first so a
  // non-title line is NOT consumed on failure.
  let title_line = *lines.peek()?;
  let title = title_line
    .trim()
    .strip_prefix('#')?
    .trim_matches(char::is_whitespace)
    .to_string();
  lines.next(); // consume the title line only after validating it

  // Collect figure content until the closing ":::".
  let mut content = String::new();
  for body_line in lines.by_ref() {
    if body_line.trim().starts_with(":::") {
      break;
    }
    content.push_str(body_line);
    content.push('\n');
  }

  Some((id, title, content.trim().to_string()))
}
1007
1008/// Render an admonition as HTML
1009fn render_admonition(
1010  adm_type: &str,
1011  id: Option<&str>,
1012  content: &str,
1013) -> String {
1014  let capitalized_type = crate::utils::capitalize_first(adm_type);
1015  let id_attr = id.map_or(String::new(), |id| format!(" id=\"{id}\""));
1016
1017  let opening = format!(
1018    "<div class=\"admonition {adm_type}\"{id_attr}>\n<p \
1019     class=\"admonition-title\">{capitalized_type}</p>"
1020  );
1021  format!("{opening}\n\n{content}\n\n</div>\n")
1022}
1023
/// Render a figure as HTML
///
/// Produces `<figure>` (with an optional `id` attribute) containing a
/// `<figcaption>` for the title followed by the raw content.
fn render_figure(id: Option<&str>, title: &str, content: &str) -> String {
  let mut out = String::from("<figure");
  if let Some(anchor) = id {
    out.push_str(&format!(" id=\"{anchor}\""));
  }
  out.push_str(">\n<figcaption>");
  out.push_str(title);
  out.push_str("</figcaption>\n");
  out.push_str(content);
  out.push_str("\n</figure>");
  out
}
1032
/// Process manpage references in HTML content.
///
/// This function processes manpage references by finding span elements with
/// manpage-reference class and converting them to links when URLs are
/// available.
///
/// # Arguments
/// * `html` - The HTML content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
///
/// # Returns
/// The processed HTML with manpage references converted to links
#[cfg(feature = "nixpkgs")]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashMap sufficient for this use case"
)]
pub fn process_manpage_references(
  html: &str,
  manpage_urls: Option<&std::collections::HashMap<String, String>>,
) -> String {
  process_safe(
    html,
    |html| {
      use kuchikikiki::NodeRef;
      use tendril::TendrilSink;

      let document = kuchikikiki::parse_html().one(html);
      // Replacements are collected first and applied after the selection
      // loop, so the DOM is not mutated while it is being iterated.
      let mut to_replace = Vec::new();

      // Find all spans with class "manpage-reference"
      for span_node in safe_select(&document, "span.manpage-reference") {
        let span_el = span_node;
        // Key used to look up the manpage URL is the span's full text.
        let span_text = span_el.text_contents();

        if let Some(urls) = manpage_urls {
          // Check for direct URL match
          if let Some(url) = urls.get(&span_text) {
            // Mapping values may be full `<a href="...">` tags; reduce
            // them to the bare URL before using as an href.
            let clean_url = extract_url_from_html(url);
            // Build an <a> element that keeps the "manpage-reference"
            // class so existing styling continues to apply.
            let link = NodeRef::new_element(
              markup5ever::QualName::new(
                None,
                markup5ever::ns!(html),
                markup5ever::local_name!("a"),
              ),
              vec![
                (
                  kuchikikiki::ExpandedName::new("", "href"),
                  kuchikikiki::Attribute {
                    prefix: None,
                    value:  clean_url.into(),
                  },
                ),
                (
                  kuchikikiki::ExpandedName::new("", "class"),
                  kuchikikiki::Attribute {
                    prefix: None,
                    value:  "manpage-reference".into(),
                  },
                ),
              ],
            );
            // The link's visible text is the original span text.
            link.append(NodeRef::new_text(span_text.clone()));
            to_replace.push((span_el.clone(), link));
          }
        }
      }

      // Apply replacements
      for (old, new) in to_replace {
        old.insert_before(new);
        old.detach();
      }

      let mut out = Vec::new();
      let _ = document.serialize(&mut out);
      // Serializer output should always be valid UTF-8; fall back to an
      // empty string rather than panic if it is not.
      String::from_utf8(out).unwrap_or_default()
    },
    // Return original HTML on error
    // NOTE(review): this "" is the final argument to process_safe — the
    // comment above suggests the original HTML is returned on error;
    // confirm against process_safe's signature what this argument means.
    "",
  )
}
1116
/// Process option references
/// Converts {option} role markup into links to the options page.
///
/// This processes `<code>` elements that have the `nixos-option` class, i.e.,
/// {option} role markup and convert them into links to the options page.
///
/// # Arguments
///
/// * `html` - The HTML string to process.
/// * `valid_options` - Optional set of valid option names for validation.
///
/// # Returns
///
/// The HTML string with option references rewritten as links.
#[cfg(feature = "ndg-flavored")]
#[must_use]
#[allow(
  clippy::implicit_hasher,
  reason = "Standard HashSet sufficient for this use case"
)]
pub fn process_option_references(
  html: &str,
  valid_options: Option<&std::collections::HashSet<String>>,
) -> String {
  use kuchikikiki::{Attribute, ExpandedName, NodeRef};
  use markup5ever::{QualName, local_name, ns};
  use tendril::TendrilSink;

  process_safe(
    html,
    |html| {
      let document = kuchikikiki::parse_html().one(html);

      // Replacements are deferred until after the selection loop so the
      // DOM is not mutated while being iterated.
      let mut to_replace = vec![];

      // Only process code elements that already have the nixos-option class
      // from {option} role syntax
      for code_node in safe_select(&document, "code.nixos-option") {
        let code_el = code_node;
        // The option name is the full text content of the code element.
        let code_text = code_el.text_contents();

        // Skip if already wrapped in an option-reference link
        // (walk all ancestors looking for <a class="...option-reference...">
        // so re-running this pass on already-processed HTML is a no-op)
        let mut is_already_option_ref = false;
        let mut current = code_el.parent();
        while let Some(parent) = current {
          if let Some(element) = parent.as_element()
            && element.name.local == local_name!("a")
            && let Some(class_attr) =
              element.attributes.borrow().get(local_name!("class"))
            && class_attr.contains("option-reference")
          {
            is_already_option_ref = true;
            break;
          }
          current = parent.parent();
        }

        if !is_already_option_ref {
          // Check if validation is enabled and option is valid
          let should_link =
            valid_options.is_none_or(|opts| opts.contains(code_text.as_str())); // If no validation set, link all options

          if should_link {
            // Anchor ids on the options page use '-' in place of '.'
            // (e.g. "services.foo.enable" -> "option-services-foo-enable").
            let option_id = format!("option-{}", code_text.replace('.', "-"));
            let attrs = vec![
              (ExpandedName::new("", "href"), Attribute {
                prefix: None,
                value:  format!("options.html#{option_id}"),
              }),
              (ExpandedName::new("", "class"), Attribute {
                prefix: None,
                value:  "option-reference".into(),
              }),
            ];
            // Build a fresh <a><code>name</code></a> subtree to swap in
            // for the bare <code> element.
            let a = NodeRef::new_element(
              QualName::new(None, ns!(html), local_name!("a")),
              attrs,
            );
            let code = NodeRef::new_element(
              QualName::new(None, ns!(html), local_name!("code")),
              vec![],
            );
            code.append(NodeRef::new_text(code_text.clone()));
            a.append(code);
            to_replace.push((code_el.clone(), a));
          }
          // If should_link is false, leave the code element as-is (no wrapping)
        }
      }

      for (old, new) in to_replace {
        old.insert_before(new);
        old.detach();
      }

      let mut out = Vec::new();
      let _ = document.serialize(&mut out);
      // Serializer output should always be valid UTF-8; fall back to an
      // empty string rather than panic if it is not.
      String::from_utf8(out).unwrap_or_default()
    },
    // Return original HTML on error
    // NOTE(review): this "" is the final argument to process_safe — the
    // comment above suggests the original HTML is returned on error;
    // confirm against process_safe's signature what this argument means.
    "",
  )
}
1220
/// Extract URL from HTML anchor tag or return the string as-is if it's a plain
/// URL
///
/// Values that look like `<a href="URL">...` have the quoted URL pulled out;
/// anything else (including a malformed/unterminated href) is returned
/// unchanged as a borrowed slice of the input.
fn extract_url_from_html(url_or_html: &str) -> &str {
  // Only attempt extraction when the value starts like an anchor tag.
  if let Some(rest) = url_or_html.strip_prefix("<a href=\"") {
    // The URL runs up to the closing quote of the href attribute.
    if let Some(end) = rest.find('"') {
      return &rest[..end];
    }
  }

  // Return as-is if not HTML or if extraction fails
  url_or_html
}
1238
/// Unit tests for `is_atx_header`, the ATX-heading detector used when
/// deciding whether a line inside a blockquote/callout starts a new section.
#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn test_is_atx_header_valid_headers() {
    // valid ATX headers with 1-6 hashes followed by space
    assert!(is_atx_header("# Header"));
    assert!(is_atx_header("## Header"));
    assert!(is_atx_header("### Header"));
    assert!(is_atx_header("#### Header"));
    assert!(is_atx_header("##### Header"));
    assert!(is_atx_header("###### Header"));

    // valid ATX headers with tab after hashes
    assert!(is_atx_header("#\tHeader"));
    assert!(is_atx_header("##\tHeader"));

    // valid ATX headers with just hashes (no content after)
    assert!(is_atx_header("#"));
    assert!(is_atx_header("##"));
    assert!(is_atx_header("###"));
    assert!(is_atx_header("####"));
    assert!(is_atx_header("#####"));
    assert!(is_atx_header("######"));

    // valid ATX headers with multiple spaces
    assert!(is_atx_header("#  Header with multiple spaces"));
    assert!(is_atx_header("##   Header"));
  }

  #[test]
  fn test_is_atx_header_invalid_headers() {
    // more than 6 hashes
    assert!(!is_atx_header("####### Too many hashes"));
    assert!(!is_atx_header("######## Even more"));

    // no space after hash
    assert!(!is_atx_header("#NoSpace"));
    assert!(!is_atx_header("##NoSpace"));

    // hash in the middle
    assert!(!is_atx_header("Not # a header"));

    // empty string
    assert!(!is_atx_header(""));

    // no hash at all
    assert!(!is_atx_header("Regular text"));

    // hash with non-whitespace immediately after
    assert!(!is_atx_header("#hashtag"));
    assert!(!is_atx_header("##hashtag"));
    assert!(!is_atx_header("#123"));
    assert!(!is_atx_header("##abc"));

    // special characters immediately after hash
    assert!(!is_atx_header("#!important"));
    assert!(!is_atx_header("#@mention"));
    assert!(!is_atx_header("#$variable"));
  }

  #[test]
  fn test_is_atx_header_edge_cases() {
    // whitespace before hash is handled by caller (trimmed)
    // but testing it here to ensure robustness
    assert!(!is_atx_header(" # Header"));
    assert!(!is_atx_header("  ## Header"));

    // only spaces after hash (should be valid)
    assert!(is_atx_header("#     "));
    assert!(is_atx_header("##    "));

    // newline handling (string ends after valid header marker)
    assert!(is_atx_header("# Header\n"));
    assert!(is_atx_header("## Header\n"));

    // mixed whitespace after hash
    assert!(is_atx_header("# \t  Header"));
    assert!(is_atx_header("##  \tHeader"));
  }

  #[test]
  fn test_is_atx_header_blockquote_context() {
    // these are the types of strings that would be passed from
    // collect_github_callout_content after trim_start()
    assert!(is_atx_header("# New Section"));
    assert!(is_atx_header("## Subsection"));

    // non-headers that should not break blockquote
    assert!(!is_atx_header("#tag"));
    assert!(!is_atx_header("##issue-123"));
    assert!(!is_atx_header("###no-space"));

    // edge case: exactly 6 hashes (valid)
    assert!(is_atx_header("###### Level 6"));

    // edge case: 7 hashes (invalid)
    assert!(!is_atx_header("####### Not valid"));
  }
}