ndg_commonmark/
processor.rs

1use std::{
2    collections::HashMap,
3    path::{Path, PathBuf},
4};
5
6use comrak::{
7    Arena, ComrakOptions,
8    nodes::{AstNode, NodeHeading, NodeValue},
9    parse_document,
10};
11use log::trace;
12use markup5ever::{local_name, ns};
13use walkdir::WalkDir;
14
15use crate::{
16    syntax::{SyntaxManager, create_default_manager},
17    types::{Header, MarkdownResult},
18    utils::{self, safely_process_markup},
19};
20
/// Options for configuring the Markdown processor.
///
/// All fields are public, so a value can be written out in full or built on
/// top of the [`Default`] implementation with struct-update syntax. The type
/// supports equality comparison so configurations can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownOptions {
    /// Enable GitHub Flavored Markdown (GFM) extensions.
    pub gfm: bool,

    /// Enable Nixpkgs/NixOS documentation extensions.
    pub nixpkgs: bool,

    /// Enable syntax highlighting for code blocks.
    pub highlight_code: bool,

    /// Optional: custom syntax highlighting theme name.
    pub highlight_theme: Option<String>,

    /// Optional: path to manpage URL mappings (for `{manpage}` roles).
    pub manpage_urls_path: Option<String>,
}
39
40impl Default for MarkdownOptions {
41    fn default() -> Self {
42        Self {
43            gfm: cfg!(feature = "gfm"),
44            nixpkgs: cfg!(feature = "nixpkgs"),
45            highlight_code: cfg!(feature = "syntastica"),
46            manpage_urls_path: None,
47            highlight_theme: None,
48        }
49    }
50}
51
52/// Main Markdown processor struct.
53pub struct MarkdownProcessor {
54    options: MarkdownOptions,
55    manpage_urls: Option<HashMap<String, String>>,
56    syntax_manager: Option<SyntaxManager>,
57}
58
59impl MarkdownProcessor {
60    /// Create a new `MarkdownProcessor` with the given options.
61    #[must_use]
62    pub fn new(options: MarkdownOptions) -> Self {
63        let manpage_urls = options
64            .manpage_urls_path
65            .as_ref()
66            .and_then(|path| utils::load_manpage_urls(path).ok());
67
68        let syntax_manager = if options.highlight_code {
69            create_default_manager().ok()
70        } else {
71            None
72        };
73
74        Self {
75            options,
76            manpage_urls,
77            syntax_manager,
78        }
79    }
80
81    /// Highlight all code blocks in HTML using the configured syntax highlighter
82    #[must_use]
83    pub fn highlight_codeblocks(&self, html: &str) -> String {
84        if !self.options.highlight_code || self.syntax_manager.is_none() {
85            return html.to_string();
86        }
87
88        use kuchikikiki::parse_html;
89        use tendril::TendrilSink;
90
91        let document = parse_html().one(html);
92
93        // Collect all code blocks first to avoid DOM modification during iteration
94        let mut code_blocks = Vec::new();
95        for pre_node in document.select("pre > code").unwrap() {
96            let code_node = pre_node.as_node();
97            if let Some(element) = code_node.as_element() {
98                let class_attr = element
99                    .attributes
100                    .borrow()
101                    .get("class")
102                    .map(std::string::ToString::to_string);
103                let language = class_attr
104                    .as_deref()
105                    .and_then(|s| s.strip_prefix("language-"))
106                    .unwrap_or("text");
107                let code_text = code_node.text_contents();
108
109                if let Some(pre_parent) = code_node.parent() {
110                    code_blocks.push((pre_parent.clone(), code_text, language.to_string()));
111                }
112            }
113        }
114
115        // Process each code block
116        for (pre_element, code_text, language) in code_blocks {
117            if let Some(highlighted) = self.highlight_code_html(&code_text, &language) {
118                // Replace the entire <pre><code>...</code></pre> with highlighted HTML
119                let fragment = parse_html().one(highlighted.as_str());
120                pre_element.insert_after(fragment);
121                pre_element.detach();
122            }
123        }
124
125        let mut buf = Vec::new();
126        document.serialize(&mut buf).unwrap();
127        String::from_utf8(buf).unwrap_or_default()
128    }
129
130    /// Highlight code using the configured syntax highlighter, returns HTML string
131    fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
132        if !self.options.highlight_code {
133            return None;
134        }
135
136        let syntax_manager = self.syntax_manager.as_ref()?;
137
138        syntax_manager
139            .highlight_code(code, language, self.options.highlight_theme.as_deref())
140            .ok()
141    }
142
143    /// Render Markdown to HTML, extracting headers and title.
144    #[must_use]
145    pub fn render(&self, markdown: &str) -> MarkdownResult {
146        // 1. Preprocess (includes, block elements, headers, inline anchors, roles)
147        let preprocessed = self.preprocess(markdown);
148
149        // 2. Extract headers and title
150        let (headers, title) = self.extract_headers(&preprocessed);
151
152        // 3. Convert to HTML
153        let html = self.convert_to_html(&preprocessed);
154
155        // 4. Process option references
156        let html = if cfg!(feature = "ndg-flavored") {
157            #[cfg(feature = "ndg-flavored")]
158            {
159                process_option_references(&html)
160            }
161            #[cfg(not(feature = "ndg-flavored"))]
162            {
163                html
164            }
165        } else {
166            html
167        };
168
169        // 5. Process autolinks
170        let html = if self.options.gfm {
171            self.process_autolinks(&html)
172        } else {
173            html
174        };
175
176        // 6. Post-process manpage references
177        let html = if self.options.nixpkgs {
178            self.process_manpage_references_html(&html)
179        } else {
180            html
181        };
182
183        // 7. Apply syntax highlighting to code blocks
184        let html = if self.options.highlight_code {
185            self.highlight_codeblocks(&html)
186        } else {
187            html
188        };
189
190        // 8. Complete HTML post-processing
191        let html = self.kuchiki_postprocess(&html);
192
193        MarkdownResult {
194            html,
195            headers,
196            title,
197        }
198    }
199
200    /// Preprocess the markdown content (includes, block elements, headers, roles, etc).
201    fn preprocess(&self, content: &str) -> String {
202        // 1. Process file includes if nixpkgs feature is enabled
203        let with_includes = if self.options.nixpkgs {
204            #[cfg(feature = "nixpkgs")]
205            {
206                process_file_includes(content, std::path::Path::new("."))
207            }
208            #[cfg(not(feature = "nixpkgs"))]
209            {
210                content.to_string()
211            }
212        } else {
213            content.to_string()
214        };
215
216        // 2. Process block elements (admonitions, figures, definition lists)
217        let preprocessed = if self.options.nixpkgs {
218            self.process_block_elements(&with_includes)
219        } else {
220            with_includes
221        };
222
223        // 3. Process inline anchors
224        let with_inline_anchors = if self.options.nixpkgs {
225            self.process_inline_anchors(&preprocessed)
226        } else {
227            preprocessed
228        };
229
230        // 4. Process role markup
231        if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
232            self.process_role_markup(&with_inline_anchors)
233        } else {
234            with_inline_anchors
235        }
236    }
237
238    /// Process inline anchors by converting []{#id} syntax to HTML spans.
239    /// Also handles list items with anchors at the beginning.
240    /// This is nixpkgs-specific functionality.
241    fn process_inline_anchors(&self, content: &str) -> String {
242        if !self.options.nixpkgs {
243            return content.to_string();
244        }
245        let mut result = String::with_capacity(content.len() + 100);
246        let mut in_code_block = false;
247        let mut code_fence_char = None;
248        let mut code_fence_count = 0;
249
250        for line in content.lines() {
251            let trimmed = line.trim_start();
252
253            // Check for code fences
254            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
255                let fence_char = trimmed.chars().next().unwrap();
256                let fence_count = trimmed.chars().take_while(|&c| c == fence_char).count();
257
258                if fence_count >= 3 {
259                    if !in_code_block {
260                        // Starting a code block
261                        in_code_block = true;
262                        code_fence_char = Some(fence_char);
263                        code_fence_count = fence_count;
264                    } else if code_fence_char == Some(fence_char) && fence_count >= code_fence_count
265                    {
266                        // Ending a code block
267                        in_code_block = false;
268                        code_fence_char = None;
269                        code_fence_count = 0;
270                    }
271                }
272            }
273
274            // Only process inline anchors if we're not in a code block
275            if in_code_block {
276                // In code block, keep line as-is
277                result.push_str(line);
278                result.push('\n');
279            } else {
280                // Check for list items with anchors:
281                // "- []{#id} content" or "1. []{#id} content"
282                if let Some(anchor_start) = Self::find_list_item_anchor(trimmed) {
283                    if let Some(processed_line) = Self::process_list_item_anchor(line, anchor_start)
284                    {
285                        result.push_str(&processed_line);
286                        result.push('\n');
287                        continue;
288                    }
289                }
290
291                // Process regular inline anchors in the line
292                result.push_str(&Self::process_line_anchors(line));
293                result.push('\n');
294            }
295        }
296
297        result
298    }
299
/// Find whether a line starts with a list marker followed by an anchor.
///
/// `trimmed` is expected to have its leading whitespace removed already.
/// Recognized forms are `- []{#`, `* []{#`, `+ []{#` and `<digits>. []{#`.
///
/// Returns the byte offset (within `trimmed`) at which the `[]{#` anchor
/// starts, or `None` when the line is not such a list item.
fn find_list_item_anchor(trimmed: &str) -> Option<usize> {
    const ANCHOR_OPEN: &str = "[]{#";

    // Unordered list: a bullet character followed by one space. Both bytes
    // are ASCII, so slicing at offset 2 is always on a char boundary.
    if matches!(trimmed.as_bytes(), [b'-' | b'*' | b'+', b' ', ..])
        && trimmed[2..].starts_with(ANCHOR_OPEN)
    {
        return Some(2);
    }

    // Ordered list: one or more ASCII digits, a dot and one space. A single
    // pass over the bytes replaces repeated `chars().nth(i)` lookups.
    let digits = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    if digits > 0 {
        let follows_marker = trimmed[digits..]
            .strip_prefix(". ")
            .is_some_and(|rest| rest.starts_with(ANCHOR_OPEN));
        if follows_marker {
            return Some(digits + 2);
        }
    }

    None
}
326
/// Process a list item line that contains an anchor.
///
/// `anchor_start` is the byte offset in `line` where the `[]{#id}` anchor is
/// expected to begin. On success the anchor is replaced by an empty
/// `<span id="..." class="nixos-anchor">` and the text before and after it
/// is kept as-is.
///
/// Returns `None` when `anchor_start` is out of range or not on a character
/// boundary, when no well-formed anchor starts there, or when the ID is
/// empty or contains characters other than ASCII alphanumerics, `-` and `_`.
fn process_list_item_anchor(line: &str, anchor_start: usize) -> Option<String> {
    // Checked slicing: the offset comes from the caller and must never be
    // able to panic on multi-byte text.
    let before_anchor = line.get(..anchor_start)?;
    let after_marker = line.get(anchor_start..)?;

    // Split "[]{#id}rest" into the ID and whatever follows the closing brace.
    let (id, remaining_content) = after_marker.strip_prefix("[]{#")?.split_once('}')?;

    let id_is_valid = !id.is_empty()
        && id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');

    id_is_valid.then(|| {
        format!(
            "{before_anchor}<span id=\"{id}\" class=\"nixos-anchor\"></span>{remaining_content}"
        )
    })
}
355
/// Process inline anchors in a single line.
///
/// Every `[]{#id}` whose ID is non-empty and consists only of ASCII
/// alphanumerics, `-` and `_` is replaced by
/// `<span id="id" class="nixos-anchor"></span>`. All other text is copied
/// through unchanged, including plain `[]`, `[]{...}` without a `#`,
/// anchors with an empty or invalid ID, and anchors missing their closing
/// brace.
fn process_line_anchors(line: &str) -> String {
    const ANCHOR_OPEN: &str = "[]{#";

    let mut result = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(pos) = rest.find(ANCHOR_OPEN) {
        // Text before the candidate anchor is always kept verbatim.
        result.push_str(&rest[..pos]);
        let after_open = &rest[pos + ANCHOR_OPEN.len()..];

        // The ID is the longest run of allowed ASCII bytes, so `id_len` is
        // always on a character boundary.
        let id_len = after_open
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'-' || *b == b'_')
            .count();

        if id_len > 0 && after_open[id_len..].starts_with('}') {
            result.push_str("<span id=\"");
            result.push_str(&after_open[..id_len]);
            result.push_str("\" class=\"nixos-anchor\"></span>");
            rest = &after_open[id_len + 1..]; // skip the ID and '}'
        } else {
            // Not a valid anchor: keep the opener literally and keep scanning
            // right after it so the remaining text is preserved.
            result.push_str(ANCHOR_OPEN);
            rest = after_open;
        }
    }

    result.push_str(rest);
    result
}
418
419    /// Process block elements including admonitions, figures, and definition lists.
420    /// This is nixpkgs-specific functionality.
421    fn process_block_elements(&self, content: &str) -> String {
422        if !self.options.nixpkgs {
423            return content.to_string();
424        }
425        let mut result = Vec::new();
426        let mut lines = content.lines().peekable();
427        let mut in_code_block = false;
428        let mut code_fence_char = None;
429        let mut code_fence_count = 0;
430
431        while let Some(line) = lines.next() {
432            // Check for code fences
433            let trimmed = line.trim_start();
434            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
435                let fence_char = trimmed.chars().next().unwrap();
436                let fence_count = trimmed.chars().take_while(|&c| c == fence_char).count();
437
438                if fence_count >= 3 {
439                    if !in_code_block {
440                        // Starting a code block
441                        in_code_block = true;
442                        code_fence_char = Some(fence_char);
443                        code_fence_count = fence_count;
444                    } else if code_fence_char == Some(fence_char) && fence_count >= code_fence_count
445                    {
446                        // Ending a code block
447                        in_code_block = false;
448                        code_fence_char = None;
449                        code_fence_count = 0;
450                    }
451                }
452            }
453
454            // Only process block elements if we're not in a code block
455            if !in_code_block {
456                // Check for GitHub-style callouts: > [!TYPE]
457                if let Some((callout_type, initial_content)) = Self::parse_github_callout(line) {
458                    let content = self.collect_github_callout_content(&mut lines, initial_content);
459                    let admonition = Self::render_admonition(&callout_type, None, &content);
460                    result.push(admonition);
461                    continue;
462                }
463
464                // Check for fenced admonitions: ::: {.type}
465                if let Some((adm_type, id)) = Self::parse_fenced_admonition_start(line) {
466                    let content = self.collect_fenced_content(&mut lines);
467                    let admonition = Self::render_admonition(&adm_type, id.as_deref(), &content);
468                    result.push(admonition);
469                    continue;
470                }
471
472                // Check for figures: ::: {.figure #id}
473                if let Some((id, title, content)) = Self::parse_figure_block(line, &mut lines) {
474                    let figure = Self::render_figure(id.as_deref(), &title, &content);
475                    result.push(figure);
476                    continue;
477                }
478
479                // Check for definition lists: Term\n:   Definition
480                if !line.is_empty() && !line.starts_with(':') {
481                    if let Some(next_line) = lines.peek() {
482                        if next_line.starts_with(":   ") {
483                            let term = line;
484                            let def_line = lines.next().unwrap();
485                            let definition = &def_line[4..]; // Skip ":   "
486                            let dl = format!("<dl>\n<dt>{term}</dt>\n<dd>{definition}</dd>\n</dl>");
487                            result.push(dl);
488                            continue;
489                        }
490                    }
491                }
492            }
493
494            // Regular line, keep as-is
495            result.push(line.to_string());
496        }
497
498        result.join("\n")
499    }
500
/// Parse GitHub-style callout syntax: `> [!TYPE] content`.
///
/// Leading whitespace is ignored. The type must be one of `NOTE`, `TIP`,
/// `IMPORTANT`, `WARNING`, `CAUTION` or `DANGER`, spelled in upper case.
///
/// Returns the lower-cased type and the trimmed text that follows the
/// closing bracket on the same line, or `None` if the line is not a callout.
fn parse_github_callout(line: &str) -> Option<(String, String)> {
    let after_opener = line.trim_start().strip_prefix("> [!")?;
    let (kind, tail) = after_opener.split_once(']')?;

    let is_known = matches!(
        kind,
        "NOTE" | "TIP" | "IMPORTANT" | "WARNING" | "CAUTION" | "DANGER"
    );
    if !is_known {
        return None;
    }

    Some((kind.to_lowercase(), tail.trim().to_string()))
}
526
527    /// Collect content for GitHub-style callouts
528    fn collect_github_callout_content(
529        &self,
530        lines: &mut std::iter::Peekable<std::str::Lines>,
531        initial_content: String,
532    ) -> String {
533        let mut content = String::new();
534
535        if !initial_content.is_empty() {
536            content.push_str(&initial_content);
537            content.push('\n');
538        }
539
540        while let Some(line) = lines.peek() {
541            let trimmed = line.trim_start();
542            if trimmed.starts_with('>') {
543                let content_part = trimmed.strip_prefix('>').unwrap_or("").trim_start();
544                content.push_str(content_part);
545                content.push('\n');
546                lines.next(); // consume the line
547            } else {
548                break;
549            }
550        }
551
552        content.trim().to_string()
553    }
554
/// Parse the opening line of a fenced admonition: `::: {.type #id}`.
///
/// The line (ignoring surrounding whitespace) must start with `:::`,
/// followed by an attribute block that opens with `{.` and has a closing
/// `}`. Inside the block the first whitespace-separated token is the type
/// and the first token starting with `#` supplies the optional ID.
///
/// Returns `(type, id)`, or `None` when the line does not match or the
/// attribute block is empty.
fn parse_fenced_admonition_start(line: &str) -> Option<(String, Option<String>)> {
    let attrs_and_rest = line
        .trim()
        .strip_prefix(":::")?
        .trim_start()
        .strip_prefix("{.")?;

    // Everything up to the first closing brace is the attribute list.
    let (attrs, _) = attrs_and_rest.split_once('}')?;

    let adm_type = attrs.split_whitespace().next()?;
    let id = attrs
        .split_whitespace()
        .find_map(|token| token.strip_prefix('#'))
        .map(str::to_string);

    Some((adm_type.to_string(), id))
}
585
586    /// Collect content until closing :::
587    fn collect_fenced_content(&self, lines: &mut std::iter::Peekable<std::str::Lines>) -> String {
588        let mut content = String::new();
589
590        for line in lines.by_ref() {
591            if line.trim().starts_with(":::") {
592                break;
593            }
594            content.push_str(line);
595            content.push('\n');
596        }
597
598        content.trim().to_string()
599    }
600
/// Parse a figure block: `::: {.figure #id}` followed by a `#` title line
/// and content up to the closing `:::`.
///
/// `line` is the opening line and `lines` yields the lines after it. The
/// optional ID is the text between `#` and the closing `}` of the opener.
/// The title is the next line with its leading `#` and surrounding
/// whitespace removed.
///
/// Returns `(id, title, content)` on success. Returns `None` without
/// consuming anything from `lines` when `line` is not a figure opener, when
/// there is no following line, or when the following line is not a title.
fn parse_figure_block(
    line: &str,
    lines: &mut std::iter::Peekable<std::str::Lines>,
) -> Option<(Option<String>, String, String)> {
    let after_colons = line.trim().strip_prefix(":::")?.trim_start();
    if !after_colons.starts_with("{.figure") {
        return None;
    }

    // Extract ID if present: it must appear before the closing brace.
    let id = match (after_colons.find('#'), after_colons.find('}')) {
        (Some(hash_pos), Some(close_brace)) if hash_pos < close_brace => {
            Some(after_colons[hash_pos + 1..close_brace].trim().to_string())
        }
        _ => None,
    };

    // Peek at the title line so it stays in the iterator when this is not a
    // figure after all; only consume it once it is known to be a title.
    let title = lines.peek()?.trim().strip_prefix('#')?.trim().to_string();
    lines.next();

    // Collect figure content up to (and consuming) the closing fence.
    let mut content = String::new();
    for body_line in lines.by_ref() {
        if body_line.trim().starts_with(":::") {
            break;
        }
        content.push_str(body_line);
        content.push('\n');
    }

    Some((id, title, content.trim().to_string()))
}
656
657    /// Render an admonition as HTML
658    fn render_admonition(adm_type: &str, id: Option<&str>, content: &str) -> String {
659        let capitalized_type = crate::utils::capitalize_first(adm_type);
660        let id_attr = id.map_or(String::new(), |id| format!(" id=\"{id}\""));
661
662        format!(
663            "<div class=\"admonition {adm_type}\"{id_attr}>\n<p class=\"admonition-title\">{capitalized_type}</p>\n\n{content}\n\n</div>"
664        )
665    }
666
/// Render a figure as HTML.
///
/// Produces a `<figure>` element (with an `id` attribute when `id` is
/// given) whose `<figcaption>` holds `title`, followed by `content` on its
/// own line. Neither argument is escaped.
fn render_figure(id: Option<&str>, title: &str, content: &str) -> String {
    let id_attr = match id {
        Some(value) => format!(" id=\"{value}\""),
        None => String::new(),
    };

    format!("<figure{id_attr}>\n<figcaption>{title}</figcaption>\n{content}\n</figure>")
}
673
674    /// Extract headers and title from the markdown content.
675    #[must_use]
676    pub fn extract_headers(&self, content: &str) -> (Vec<Header>, Option<String>) {
677        let arena = Arena::new();
678        let options = self.comrak_options();
679
680        // Normalize custom anchors with no heading level to h2
681        let mut normalized = String::with_capacity(content.len());
682        for line in content.lines() {
683            let trimmed = line.trim_end();
684            if !trimmed.starts_with('#') {
685                if let Some(anchor_start) = trimmed.rfind("{#") {
686                    if let Some(anchor_end) = trimmed[anchor_start..].find('}') {
687                        let text = trimmed[..anchor_start].trim_end();
688                        let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
689                        normalized.push_str(&format!("## {text} {{#{id}}}\n"));
690                        continue;
691                    }
692                }
693            }
694            normalized.push_str(line);
695            normalized.push('\n');
696        }
697
698        let root = parse_document(&arena, &normalized, &options);
699
700        let mut headers = Vec::new();
701        let mut found_title = None;
702
703        for node in root.descendants() {
704            if let NodeValue::Heading(NodeHeading { level, .. }) = &node.data.borrow().value {
705                let mut text = String::new();
706                let mut explicit_id = None;
707
708                for child in node.children() {
709                    match &child.data.borrow().value {
710                        NodeValue::Text(t) => text.push_str(t),
711                        NodeValue::Code(t) => text.push_str(&t.literal),
712                        NodeValue::Link(..) => text.push_str(&extract_inline_text(child)),
713                        NodeValue::Emph => text.push_str(&extract_inline_text(child)),
714                        NodeValue::Strong => text.push_str(&extract_inline_text(child)),
715                        NodeValue::Strikethrough => text.push_str(&extract_inline_text(child)),
716                        NodeValue::Superscript => text.push_str(&extract_inline_text(child)),
717                        NodeValue::Subscript => text.push_str(&extract_inline_text(child)),
718                        NodeValue::FootnoteReference(..) => {
719                            text.push_str(&extract_inline_text(child));
720                        }
721                        NodeValue::HtmlInline(html) => {
722                            // Look for explicit anchor in HTML inline node: {#id}
723                            let html_str = html.as_str();
724                            if let Some(start) = html_str.find("{#") {
725                                if let Some(end) = html_str[start..].find('}') {
726                                    let anchor = &html_str[start + 2..start + end];
727                                    explicit_id = Some(anchor.to_string());
728                                }
729                            }
730                        }
731                        NodeValue::Image(..) => {}
732                        _ => {}
733                    }
734                }
735
736                // Check for trailing {#id} in heading text
737                let trimmed = text.trim_end();
738                let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
739                    if let Some(end) = trimmed[start..].find('}') {
740                        let anchor = &trimmed[start + 2..start + end];
741                        (trimmed[..start].trim_end().to_string(), anchor.to_string())
742                    } else {
743                        (
744                            text.clone(),
745                            explicit_id.unwrap_or_else(|| utils::slugify(&text)),
746                        )
747                    }
748                } else {
749                    (
750                        text.clone(),
751                        explicit_id.unwrap_or_else(|| utils::slugify(&text)),
752                    )
753                };
754                if *level == 1 && found_title.is_none() {
755                    found_title = Some(final_text.clone());
756                }
757                headers.push(Header {
758                    text: final_text,
759                    level: *level,
760                    id,
761                });
762            }
763        }
764
765        (headers, found_title)
766    }
767
768    /// Convert markdown to HTML using comrak and configured options.
769    fn convert_to_html(&self, content: &str) -> String {
770        safely_process_markup(
771            content,
772            |content| {
773                let arena = Arena::new();
774                let options = self.comrak_options();
775                let root = parse_document(&arena, content, &options);
776
777                // Apply AST transformations
778                let prompt_transformer = PromptTransformer;
779                prompt_transformer.transform(root);
780
781                let mut html_output = Vec::new();
782                comrak::format_html(root, &options, &mut html_output).unwrap_or_default();
783                let html = String::from_utf8(html_output).unwrap_or_default();
784
785                // Post-process HTML to handle header anchors
786                self.process_header_anchors_html(&html)
787            },
788            "<div class=\"error\">Error processing markdown content</div>",
789        )
790    }
791
792    /// Process header anchors in HTML by finding {#id} syntax and converting to proper id attributes
793    fn process_header_anchors_html(&self, html: &str) -> String {
794        use std::sync::LazyLock;
795
796        use regex::Regex;
797
798        static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
799            Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>").unwrap_or_else(
800                |e| {
801                    log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
802                    utils::never_matching_regex()
803                },
804            )
805        });
806
807        HEADER_ANCHOR_RE
808            .replace_all(html, |caps: &regex::Captures| {
809                let level = &caps[1];
810                let prefix = &caps[2];
811                let id = &caps[3];
812                let suffix = &caps[4];
813                format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
814            })
815            .to_string()
816    }
817
818    /// Build comrak options from `MarkdownOptions` and feature flags.
819    fn comrak_options(&self) -> ComrakOptions<'_> {
820        let mut options = ComrakOptions::default();
821        if self.options.gfm {
822            options.extension.table = true;
823            options.extension.footnotes = true;
824            options.extension.strikethrough = true;
825            options.extension.tasklist = true;
826            options.extension.superscript = true;
827        }
828        options.render.unsafe_ = true;
829        // Disable automatic header ID generation - we handle anchors manually
830        options.extension.header_ids = None;
831        options
832    }
833
    // Role markup processing lives in the standalone function `process_role_markup`.
835
    /// Get the manpage URLs mapping for use with standalone functions.
    ///
    /// Returns a borrow of the processor's `manpage_urls` field, or `None` when
    /// that field holds no mapping.
    #[must_use]
    pub fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
        self.manpage_urls.as_ref()
    }
841
    /// Process role markup while being aware of code blocks and inline code.
    /// This avoids processing role markup inside code fences and inline code.
    ///
    /// Thin wrapper: forwards `content` together with this processor's manpage
    /// URL mapping (if any) to the free function `process_role_markup`.
    #[cfg(any(feature = "nixpkgs", feature = "ndg-flavored"))]
    #[must_use]
    pub fn process_role_markup(&self, content: &str) -> String {
        // The bare name resolves to the free function, not to this method, so
        // this is delegation rather than recursion.
        process_role_markup(content, self.manpage_urls.as_ref())
    }
849
    /// Process autolinks by converting plain URLs to HTML anchor tags.
    /// This searches for URLs in text nodes and converts them to clickable links.
    ///
    /// Only compiled with the `gfm` feature. Thin wrapper around the free
    /// function `process_autolinks`; no processor state is consulted.
    #[cfg(feature = "gfm")]
    fn process_autolinks(&self, html: &str) -> String {
        // The bare name resolves to the free function, not to this method.
        process_autolinks(html)
    }
856
    /// Post-process HTML to enhance manpage references with URL links.
    /// This finds <span class="manpage-reference"> elements and converts them to links when URLs are available.
    ///
    /// Only compiled with the `nixpkgs` feature. Thin wrapper: passes `html` and
    /// this processor's manpage URL mapping (if any) to `process_manpage_references`.
    #[cfg(feature = "nixpkgs")]
    fn process_manpage_references_html(&self, html: &str) -> String {
        process_manpage_references(html, self.manpage_urls.as_ref())
    }
863
864    /// HTML post-processing using kuchiki DOM manipulation.
865    fn kuchiki_postprocess(&self, html: &str) -> String {
866        safely_process_markup(
867            html,
868            |html| {
869                use tendril::TendrilSink;
870
871                let document = kuchikikiki::parse_html().one(html);
872
873                // Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
874                self.process_list_item_id_markers(&document);
875
876                // Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
877                self.process_header_anchor_comments(&document);
878
879                // Process remaining inline anchors in list items: <li>[]{#id}content</li>
880                self.process_list_item_inline_anchors(&document);
881
882                // Process inline anchors in paragraphs: <p>[]{#id}content</p>
883                self.process_paragraph_inline_anchors(&document);
884
885                // Process remaining standalone inline anchors
886                self.process_remaining_inline_anchors(&document);
887
888                // Process empty auto-links: [](#anchor)
889                self.process_empty_auto_links(&document);
890
891                // Process empty HTML links: <a href="#anchor"></a>
892                self.process_empty_html_links(&document);
893
894                let mut out = Vec::new();
895                document.serialize(&mut out).ok();
896                String::from_utf8(out).unwrap_or_default()
897            },
898            // Return original HTML on error
899            "",
900        )
901    }
902
903    /// Process list item ID markers: <li><!-- nixos-anchor-id:ID -->
904    fn process_list_item_id_markers(&self, document: &kuchikikiki::NodeRef) {
905        let mut to_modify = Vec::new();
906
907        for comment in document.inclusive_descendants() {
908            if let Some(comment_node) = comment.as_comment() {
909                let comment_text = comment_node.borrow();
910                if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
911                    let id = comment_text[id_start + 16..].trim();
912                    if !id.is_empty()
913                        && id
914                            .chars()
915                            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
916                    {
917                        // Check if this comment is inside an <li> element
918                        if let Some(parent) = comment.parent() {
919                            if let Some(element) = parent.as_element() {
920                                if element.name.local.as_ref() == "li" {
921                                    to_modify.push((comment.clone(), id.to_string()));
922                                }
923                            }
924                        }
925                    }
926                }
927            }
928        }
929
930        for (comment_node, id) in to_modify {
931            let span = kuchikikiki::NodeRef::new_element(
932                markup5ever::QualName::new(None, ns!(html), local_name!("span")),
933                vec![
934                    (
935                        kuchikikiki::ExpandedName::new("", "id"),
936                        kuchikikiki::Attribute {
937                            prefix: None,
938                            value: id,
939                        },
940                    ),
941                    (
942                        kuchikikiki::ExpandedName::new("", "class"),
943                        kuchikikiki::Attribute {
944                            prefix: None,
945                            value: "nixos-anchor".into(),
946                        },
947                    ),
948                ],
949            );
950            comment_node.insert_after(span);
951            comment_node.detach();
952        }
953    }
954
955    /// Process header anchors with comments: <h1>text<!-- anchor: id --></h1>
956    fn process_header_anchor_comments(&self, document: &kuchikikiki::NodeRef) {
957        let mut to_modify = Vec::new();
958
959        for comment in document.inclusive_descendants() {
960            if let Some(comment_node) = comment.as_comment() {
961                let comment_text = comment_node.borrow();
962                if let Some(anchor_start) = comment_text.find("anchor:") {
963                    let id = comment_text[anchor_start + 7..].trim();
964                    if !id.is_empty()
965                        && id
966                            .chars()
967                            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
968                    {
969                        // Check if this comment is inside a header element
970                        if let Some(parent) = comment.parent() {
971                            if let Some(element) = parent.as_element() {
972                                let tag_name = element.name.local.as_ref();
973                                if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
974                                    to_modify.push((
975                                        parent.clone(),
976                                        comment.clone(),
977                                        id.to_string(),
978                                    ));
979                                }
980                            }
981                        }
982                    }
983                }
984            }
985        }
986
987        for (header_element, comment_node, id) in to_modify {
988            if let Some(element) = header_element.as_element() {
989                element
990                    .attributes
991                    .borrow_mut()
992                    .insert(local_name!("id"), id);
993                comment_node.detach();
994            }
995        }
996    }
997
998    /// Process remaining inline anchors in list items: <li>[]{#id}content</li>
999    fn process_list_item_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
1000        for li_node in document.select("li").unwrap() {
1001            let li_element = li_node.as_node();
1002
1003            // Check if this list item contains code elements
1004            let has_code = li_element.select("code, pre").is_ok()
1005                && li_element.select("code, pre").unwrap().next().is_some();
1006            if has_code {
1007                continue; // Skip list items with code blocks
1008            }
1009
1010            let text_content = li_element.text_contents();
1011
1012            if let Some(anchor_start) = text_content.find("[]{#") {
1013                if let Some(anchor_end) = text_content[anchor_start..].find('}') {
1014                    let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
1015                    if !id.is_empty()
1016                        && id
1017                            .chars()
1018                            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
1019                    {
1020                        let remaining_content = &text_content[anchor_start + anchor_end + 1..];
1021
1022                        // Clear current content and rebuild
1023                        for child in li_element.children() {
1024                            child.detach();
1025                        }
1026
1027                        let span = kuchikikiki::NodeRef::new_element(
1028                            markup5ever::QualName::new(None, ns!(html), local_name!("span")),
1029                            vec![
1030                                (
1031                                    kuchikikiki::ExpandedName::new("", "id"),
1032                                    kuchikikiki::Attribute {
1033                                        prefix: None,
1034                                        value: id.into(),
1035                                    },
1036                                ),
1037                                (
1038                                    kuchikikiki::ExpandedName::new("", "class"),
1039                                    kuchikikiki::Attribute {
1040                                        prefix: None,
1041                                        value: "nixos-anchor".into(),
1042                                    },
1043                                ),
1044                            ],
1045                        );
1046                        li_element.append(span);
1047                        if !remaining_content.is_empty() {
1048                            li_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
1049                        }
1050                    }
1051                }
1052            }
1053        }
1054    }
1055
1056    /// Process inline anchors in paragraphs: <p>[]{#id}content</p>
1057    fn process_paragraph_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
1058        for p_node in document.select("p").unwrap() {
1059            let p_element = p_node.as_node();
1060
1061            // Check if this paragraph contains code elements
1062            let has_code = p_element.select("code, pre").is_ok()
1063                && p_element.select("code, pre").unwrap().next().is_some();
1064            if has_code {
1065                continue; // Skip paragraphs with code blocks
1066            }
1067
1068            let text_content = p_element.text_contents();
1069
1070            if let Some(anchor_start) = text_content.find("[]{#") {
1071                if let Some(anchor_end) = text_content[anchor_start..].find('}') {
1072                    let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
1073                    if !id.is_empty()
1074                        && id
1075                            .chars()
1076                            .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
1077                    {
1078                        let remaining_content = &text_content[anchor_start + anchor_end + 1..];
1079
1080                        // Clear current content and rebuild
1081                        for child in p_element.children() {
1082                            child.detach();
1083                        }
1084
1085                        let span = kuchikikiki::NodeRef::new_element(
1086                            markup5ever::QualName::new(None, ns!(html), local_name!("span")),
1087                            vec![
1088                                (
1089                                    kuchikikiki::ExpandedName::new("", "id"),
1090                                    kuchikikiki::Attribute {
1091                                        prefix: None,
1092                                        value: id.into(),
1093                                    },
1094                                ),
1095                                (
1096                                    kuchikikiki::ExpandedName::new("", "class"),
1097                                    kuchikikiki::Attribute {
1098                                        prefix: None,
1099                                        value: "nixos-anchor".into(),
1100                                    },
1101                                ),
1102                            ],
1103                        );
1104                        p_element.append(span);
1105                        if !remaining_content.is_empty() {
1106                            p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
1107                        }
1108                    }
1109                }
1110            }
1111        }
1112    }
1113
1114    /// Process remaining standalone inline anchors throughout the document
1115    fn process_remaining_inline_anchors(&self, document: &kuchikikiki::NodeRef) {
1116        let mut text_nodes_to_process = Vec::new();
1117
1118        for node in document.inclusive_descendants() {
1119            if let Some(text_node) = node.as_text() {
1120                // Check if this text node is inside a code block
1121                let mut parent = node.parent();
1122                let mut in_code = false;
1123                while let Some(p) = parent {
1124                    if let Some(element) = p.as_element() {
1125                        if element.name.local == local_name!("code")
1126                            || element.name.local == local_name!("pre")
1127                        {
1128                            in_code = true;
1129                            break;
1130                        }
1131                    }
1132                    parent = p.parent();
1133                }
1134
1135                // Only process if not in code
1136                if !in_code {
1137                    let text_content = text_node.borrow().clone();
1138                    if text_content.contains("[]{#") {
1139                        text_nodes_to_process.push((node.clone(), text_content));
1140                    }
1141                }
1142            }
1143        }
1144
1145        for (text_node, text_content) in text_nodes_to_process {
1146            let mut last_end = 0;
1147            let mut new_children = Vec::new();
1148
1149            // Simple pattern matching for []{#id}
1150            let chars = text_content.chars().collect::<Vec<_>>();
1151            let mut i = 0;
1152            while i < chars.len() {
1153                if i + 4 < chars.len()
1154                    && chars[i] == '['
1155                    && chars[i + 1] == ']'
1156                    && chars[i + 2] == '{'
1157                    && chars[i + 3] == '#'
1158                {
1159                    // Found start of anchor pattern
1160                    let anchor_start = i;
1161                    i += 4; // skip "[]{#"
1162
1163                    let mut id = String::new();
1164                    while i < chars.len() && chars[i] != '}' {
1165                        if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_' {
1166                            id.push(chars[i]);
1167                            i += 1;
1168                        } else {
1169                            break;
1170                        }
1171                    }
1172
1173                    if i < chars.len() && chars[i] == '}' && !id.is_empty() {
1174                        // Valid anchor found
1175                        let anchor_end = i + 1;
1176
1177                        // Add text before anchor
1178                        if anchor_start > last_end {
1179                            let before_text: String =
1180                                chars[last_end..anchor_start].iter().collect();
1181                            if !before_text.is_empty() {
1182                                new_children.push(kuchikikiki::NodeRef::new_text(before_text));
1183                            }
1184                        }
1185
1186                        // Add span element
1187                        let span = kuchikikiki::NodeRef::new_element(
1188                            markup5ever::QualName::new(None, ns!(html), local_name!("span")),
1189                            vec![
1190                                (
1191                                    kuchikikiki::ExpandedName::new("", "id"),
1192                                    kuchikikiki::Attribute {
1193                                        prefix: None,
1194                                        value: id,
1195                                    },
1196                                ),
1197                                (
1198                                    kuchikikiki::ExpandedName::new("", "class"),
1199                                    kuchikikiki::Attribute {
1200                                        prefix: None,
1201                                        value: "nixos-anchor".into(),
1202                                    },
1203                                ),
1204                            ],
1205                        );
1206                        new_children.push(span);
1207
1208                        last_end = anchor_end;
1209                        i = anchor_end;
1210                    } else {
1211                        i += 1;
1212                    }
1213                } else {
1214                    i += 1;
1215                }
1216            }
1217
1218            // Add remaining text
1219            if last_end < chars.len() {
1220                let after_text: String = chars[last_end..].iter().collect();
1221                if !after_text.is_empty() {
1222                    new_children.push(kuchikikiki::NodeRef::new_text(after_text));
1223                }
1224            }
1225
1226            // Replace text node if we found anchors
1227            if !new_children.is_empty() {
1228                for child in new_children {
1229                    text_node.insert_before(child);
1230                }
1231                text_node.detach();
1232            }
1233        }
1234    }
1235
1236    /// Process empty auto-links: [](#anchor) -> <a href="#anchor">Anchor</a>
1237    fn process_empty_auto_links(&self, document: &kuchikikiki::NodeRef) {
1238        for link_node in document.select("a").unwrap() {
1239            let link_element = link_node.as_node();
1240            if let Some(element) = link_element.as_element() {
1241                let href = element
1242                    .attributes
1243                    .borrow()
1244                    .get(local_name!("href"))
1245                    .map(std::string::ToString::to_string);
1246                let text_content = link_element.text_contents();
1247
1248                if let Some(href_value) = href {
1249                    if href_value.starts_with('#') && text_content.trim().is_empty() {
1250                        // Empty link with anchor - add humanized text
1251                        let display_text = self.humanize_anchor_id(&href_value);
1252                        link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1253                    }
1254                }
1255            }
1256        }
1257    }
1258
1259    /// Process empty HTML links that have no content
1260    fn process_empty_html_links(&self, document: &kuchikikiki::NodeRef) {
1261        for link_node in document.select("a[href^='#']").unwrap() {
1262            let link_element = link_node.as_node();
1263            let text_content = link_element.text_contents();
1264
1265            if text_content.trim().is_empty() {
1266                if let Some(element) = link_element.as_element() {
1267                    if let Some(href) = element.attributes.borrow().get(local_name!("href")) {
1268                        let display_text = self.humanize_anchor_id(href);
1269                        link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1270                    }
1271                }
1272            }
1273        }
1274    }
1275
1276    /// Convert an anchor ID to human-readable text
1277    fn humanize_anchor_id(&self, anchor: &str) -> String {
1278        // Strip the leading #
1279        let cleaned = anchor.trim_start_matches('#');
1280
1281        // Remove common prefixes
1282        let without_prefix = cleaned
1283            .trim_start_matches("sec-")
1284            .trim_start_matches("ssec-")
1285            .trim_start_matches("opt-");
1286
1287        // Replace separators with spaces
1288        let spaced = without_prefix.replace(['-', '_'], " ");
1289
1290        // Capitalize each word
1291        spaced
1292            .split_whitespace()
1293            .map(|word| {
1294                let mut chars = word.chars();
1295                chars.next().map_or_else(String::new, |c| {
1296                    c.to_uppercase().collect::<String>() + chars.as_str()
1297                })
1298            })
1299            .collect::<Vec<String>>()
1300            .join(" ")
1301    }
1302}
1303
/// Trait for AST transformations (e.g., prompt highlighting).
pub trait AstTransformer {
    /// Apply the transformation to the tree rooted at `node`.
    ///
    /// The node is passed by shared reference, so an implementation that
    /// rewrites nodes does so through the node's interior mutability (the way
    /// `PromptTransformer` uses `data.borrow_mut()`).
    fn transform<'a>(&self, node: &'a AstNode<'a>);
}
1308
1309/// Extract all inline text from a heading node.
1310/// AST transformer for processing command and REPL prompts in inline code blocks.
1311pub struct PromptTransformer;
1312
1313impl AstTransformer for PromptTransformer {
1314    fn transform<'a>(&self, node: &'a AstNode<'a>) {
1315        use comrak::nodes::NodeValue;
1316        use regex::Regex;
1317
1318        let command_prompt_re = Regex::new(r"^\s*\$\s+(.+)$").unwrap();
1319        let repl_prompt_re = Regex::new(r"^nix-repl>\s*(.*)$").unwrap();
1320
1321        for child in node.children() {
1322            {
1323                let mut data = child.data.borrow_mut();
1324                if let NodeValue::Code(ref code) = data.value {
1325                    let literal = code.literal.trim();
1326
1327                    // Match command prompts with flexible whitespace
1328                    if let Some(caps) = command_prompt_re.captures(literal) {
1329                        // Skip escaped prompts
1330                        if !literal.starts_with("\\$") && !literal.starts_with("$$") {
1331                            let command = caps[1].trim();
1332                            let html = format!(
1333                                "<code class=\"terminal\"><span class=\"prompt\">$</span> {command}</code>"
1334                            );
1335                            data.value = NodeValue::HtmlInline(html);
1336                        }
1337                    } else if let Some(caps) = repl_prompt_re.captures(literal) {
1338                        // Skip double prompts
1339                        if !literal.starts_with("nix-repl>>") {
1340                            let expression = caps[1].trim();
1341                            let html = format!(
1342                                "<code class=\"nix-repl\"><span class=\"prompt\">nix-repl&gt;</span> {expression}</code>"
1343                            );
1344                            data.value = NodeValue::HtmlInline(html);
1345                        }
1346                    }
1347                }
1348            }
1349            self.transform(child);
1350        }
1351    }
1352}
1353
1354fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1355    let mut text = String::new();
1356    for child in node.children() {
1357        match &child.data.borrow().value {
1358            NodeValue::Text(t) => text.push_str(t),
1359            NodeValue::Code(t) => text.push_str(&t.literal),
1360            NodeValue::Link(..) => text.push_str(&extract_inline_text(child)),
1361            NodeValue::Emph => text.push_str(&extract_inline_text(child)),
1362            NodeValue::Strong => text.push_str(&extract_inline_text(child)),
1363            NodeValue::Strikethrough => text.push_str(&extract_inline_text(child)),
1364            NodeValue::Superscript => text.push_str(&extract_inline_text(child)),
1365            NodeValue::Subscript => text.push_str(&extract_inline_text(child)),
1366            NodeValue::FootnoteReference(..) => text.push_str(&extract_inline_text(child)),
1367            NodeValue::HtmlInline(_) => {}
1368            NodeValue::Image(..) => {}
1369            _ => {}
1370        }
1371    }
1372    text
1373}
1374
/// Apply GitHub Flavored Markdown (GFM) extensions to the input markdown.
///
/// Currently a pass-through: the input is returned as an owned copy. GFM
/// features themselves are enabled through comrak options; this function is
/// the hook for any additional GFM-specific preprocessing.
///
/// # Arguments
/// * `markdown` - The input markdown text
///
/// # Returns
/// An owned copy of `markdown`, unchanged
#[cfg(feature = "gfm")]
#[must_use]
pub fn apply_gfm_extensions(markdown: &str) -> String {
    // XXX: Comrak already supports GFM; anything in the spec that needs custom
    // handling beyond what comrak does can be added here.
    String::from(markdown)
}
1393
/// Process file includes in Nixpkgs/NixOS documentation.
///
/// Expands include blocks of the following form by replacing the whole block
/// with the concatenated contents of the listed files:
///
/// ````markdown
/// ```{=include=}
/// path/to/file1.md
/// path/to/file2.md
/// ```
/// ````
///
/// Include blocks that appear inside another fenced code block are left
/// untouched. A listed file that cannot be read is replaced by an HTML comment
/// naming the path. Every emitted line, including the last, ends with `\n`.
///
/// # Arguments
/// * `markdown` - The input markdown text
/// * `base_dir` - The base directory for resolving relative file paths
///
/// # Returns
/// The processed markdown text with included files expanded
///
/// # Safety
/// Only relative paths without ".." (and without backslashes) are read;
/// other entries are skipped silently.
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_file_includes(markdown: &str, base_dir: &std::path::Path) -> String {
    use std::{fs, path::Path};

    // An entry is accepted only if it is relative and free of `..` and `\`.
    fn is_safe_path(path: &str) -> bool {
        if Path::new(path).is_absolute() {
            return false;
        }
        !(path.contains("..") || path.contains('\\'))
    }

    // Read every acceptable entry of `listing` and concatenate the contents.
    fn read_includes(listing: &str, base_dir: &Path) -> String {
        let mut combined = String::new();
        let entries = listing
            .lines()
            .map(str::trim)
            .filter(|entry| !entry.is_empty() && is_safe_path(entry));

        for entry in entries {
            let full_path = base_dir.join(entry);
            log::info!("Including file: {}", full_path.display());
            if let Ok(content) = fs::read_to_string(&full_path) {
                combined.push_str(&content);
                // Keep the next file from starting on the same line.
                if !content.ends_with('\n') {
                    combined.push('\n');
                }
            } else {
                // Insert a warning comment for missing files
                combined.push_str(&format!(
                    "<!-- ndg: could not include file: {} -->\n",
                    full_path.display()
                ));
            }
        }
        combined
    }

    let mut output = String::new();
    let mut remaining = markdown.lines();
    // `Some((fence char, fence length))` while inside a regular fenced block.
    let mut open_fence: Option<(char, usize)> = None;

    while let Some(line) = remaining.next() {
        let trimmed = line.trim_start();

        // Includes are checked BEFORE ordinary fences, and only outside of one.
        if open_fence.is_none() && trimmed.starts_with("```{=include=}") {
            // Consume up to and including the closing fence line.
            let listing: String = remaining
                .by_ref()
                .take_while(|entry| !entry.trim_start().starts_with("```"))
                .flat_map(|entry| [entry, "\n"])
                .collect();
            output.push_str(&read_includes(&listing, base_dir));
            continue;
        }

        // Track ordinary code fences so includes inside them stay literal.
        let fence_char = if trimmed.starts_with("```") {
            Some('`')
        } else if trimmed.starts_with("~~~") {
            Some('~')
        } else {
            None
        };
        if let Some(fence_char) = fence_char {
            let fence_len = trimmed.chars().take_while(|&c| c == fence_char).count();
            open_fence = match open_fence {
                // Starting a code block
                None => Some((fence_char, fence_len)),
                // Ending a code block: same character, at least as long
                Some((open_char, open_len))
                    if open_char == fence_char && fence_len >= open_len =>
                {
                    None
                }
                // A different or shorter fence inside a block is plain content.
                still_open => still_open,
            };
        }

        // Regular line, keep as-is
        output.push_str(line);
        output.push('\n');
    }

    output
}
1508
/// Process role markup in markdown content.
///
/// This function processes role syntax like `{command}`ls -la``, replacing each
/// well-formed role with the HTML returned by `parse_role_markup`. Text inside
/// fenced code blocks (runs of three or more backticks or tildes) and inside
/// single-backtick inline code is copied through untouched.
///
/// # Arguments
/// * `content` - The markdown content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
///
/// # Returns
/// The processed markdown with role markup converted to HTML
#[cfg(any(feature = "nixpkgs", feature = "ndg-flavored"))]
#[must_use]
pub fn process_role_markup(
    content: &str,
    manpage_urls: Option<&std::collections::HashMap<String, String>>,
) -> String {
    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();
    let mut in_code_block = false;
    let mut in_inline_code = false;
    let mut code_fence_char = None;
    let mut code_fence_count = 0;

    while let Some(ch) = chars.next() {
        // Handle backtick runs: code fences (```) and inline code (`)
        if ch == '`' {
            let mut tick_count = 1;
            while chars.peek() == Some(&'`') {
                chars.next();
                tick_count += 1;
            }

            if tick_count >= 3 {
                if !in_code_block {
                    // Starting a code block
                    in_code_block = true;
                    code_fence_char = Some('`');
                    code_fence_count = tick_count;
                } else if code_fence_char == Some('`') && tick_count >= code_fence_count {
                    // Ending a code block: same fence character, at least as long
                    in_code_block = false;
                    code_fence_char = None;
                    code_fence_count = 0;
                }
            } else if tick_count == 1 && !in_code_block {
                // Single backtick toggles inline code
                in_inline_code = !in_inline_code;
            }

            // Emit the whole run unchanged
            result.push_str(&"`".repeat(tick_count));
            continue;
        }

        // Handle tilde code fences (~~~)
        if ch == '~' && chars.peek() == Some(&'~') {
            let mut tilde_count = 1;
            while chars.peek() == Some(&'~') {
                chars.next();
                tilde_count += 1;
            }

            if tilde_count >= 3 {
                if !in_code_block {
                    in_code_block = true;
                    code_fence_char = Some('~');
                    code_fence_count = tilde_count;
                } else if code_fence_char == Some('~') && tilde_count >= code_fence_count {
                    in_code_block = false;
                    code_fence_char = None;
                    code_fence_count = 0;
                }
            }

            result.push_str(&"~".repeat(tilde_count));
            continue;
        }

        // A newline ends inline code that was never closed on its line
        if ch == '\n' {
            in_inline_code = false;
            result.push(ch);
            continue;
        }

        // Process role markup only if we're not in any kind of code
        if ch == '{' && !in_code_block && !in_inline_code {
            // Parse on a clone of the iterator: a failed attempt leaves the
            // main iterator untouched, and a successful one is adopted as-is.
            // This keeps the position exact for multi-byte characters (the
            // consumed amount is never converted between bytes and chars)
            // and avoids copying the remaining input for every `{`.
            let mut lookahead = chars.clone();
            if let Some(role_markup) = parse_role_markup(&mut lookahead, manpage_urls) {
                chars = lookahead;
                result.push_str(&role_markup);
                continue;
            }
        }

        // Not role markup (or inside code): keep the character as-is
        result.push(ch);
    }

    result
}
1621
1622/// Parse a role markup from the character iterator.
1623/// Returns Some(html) if a valid role markup is found, None otherwise.
1624fn parse_role_markup(
1625    chars: &mut std::iter::Peekable<std::str::Chars>,
1626    manpage_urls: Option<&std::collections::HashMap<String, String>>,
1627) -> Option<String> {
1628    let mut role_name = String::new();
1629
1630    // Parse role name (lowercase letters only)
1631    while let Some(&ch) = chars.peek() {
1632        if ch.is_ascii_lowercase() {
1633            role_name.push(ch);
1634            chars.next();
1635        } else {
1636            break;
1637        }
1638    }
1639
1640    // Must have a non-empty role name
1641    if role_name.is_empty() {
1642        return None;
1643    }
1644
1645    // Expect closing brace
1646    if chars.peek() != Some(&'}') {
1647        return None;
1648    }
1649    chars.next(); // consume '}'
1650
1651    // Expect opening backtick
1652    if chars.peek() != Some(&'`') {
1653        return None;
1654    }
1655    chars.next(); // consume '`'
1656
1657    // Parse content until closing backtick
1658    let mut content = String::new();
1659    for ch in chars.by_ref() {
1660        if ch == '`' {
1661            // Found closing backtick, validate content
1662            // Most role types should not have empty content
1663            if content.is_empty() && !matches!(role_name.as_str(), "manpage") {
1664                return None; // reject empty content for most roles
1665            }
1666            return Some(format_role_markup(&role_name, &content, manpage_urls));
1667        }
1668        content.push(ch);
1669    }
1670
1671    // No closing backtick found
1672    None
1673}
1674
1675/// Format the role markup as HTML based on the role type and content.
1676fn format_role_markup(
1677    role_type: &str,
1678    content: &str,
1679    manpage_urls: Option<&std::collections::HashMap<String, String>>,
1680) -> String {
1681    match role_type {
1682        "manpage" => {
1683            if let Some(urls) = manpage_urls {
1684                if let Some(url) = urls.get(content) {
1685                    let clean_url = extract_url_from_html(url);
1686                    format!("<a href=\"{clean_url}\" class=\"manpage-reference\">{content}</a>")
1687                } else {
1688                    format!("<span class=\"manpage-reference\">{content}</span>")
1689                }
1690            } else {
1691                format!("<span class=\"manpage-reference\">{content}</span>")
1692            }
1693        }
1694        "command" => format!("<code class=\"command\">{content}</code>"),
1695        "env" => format!("<code class=\"env-var\">{content}</code>"),
1696        "file" => format!("<code class=\"file-path\">{content}</code>"),
1697        "option" => {
1698            if cfg!(feature = "ndg-flavored") {
1699                let option_id = format!("option-{}", content.replace('.', "-"));
1700                format!(
1701                    "<a class=\"option-reference\" href=\"options.html#{option_id}\"><code>{content}</code></a>"
1702                )
1703            } else {
1704                format!("<code>{content}</code>")
1705            }
1706        }
1707        "var" => format!("<code class=\"nix-var\">{content}</code>"),
1708        _ => format!("<span class=\"{role_type}-markup\">{content}</span>"),
1709    }
1710}
1711
/// Process inline anchors in markdown content.
///
/// This function processes inline anchor syntax like `[]{#my-anchor}` while being
/// code-block aware to avoid processing inside code fences. Every input line is
/// written to the output followed by a newline.
///
/// # Arguments
/// * `content` - The markdown content to process
///
/// # Returns
/// The processed markdown with inline anchors converted to HTML spans
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_inline_anchors(content: &str) -> String {
    let mut result = String::with_capacity(content.len() + 100);
    let mut in_code_block = false;
    let mut code_fence_char = None;
    let mut code_fence_count = 0;

    for line in content.lines() {
        let trimmed = line.trim_start();

        // Track fenced code blocks (``` or ~~~, three or more characters)
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            let fence_char = trimmed.chars().next().unwrap();
            let fence_count = trimmed.chars().take_while(|&c| c == fence_char).count();

            if fence_count >= 3 {
                if !in_code_block {
                    // Starting a code block
                    in_code_block = true;
                    code_fence_char = Some(fence_char);
                    code_fence_count = fence_count;
                } else if code_fence_char == Some(fence_char) && fence_count >= code_fence_count {
                    // Ending a code block
                    in_code_block = false;
                    code_fence_char = None;
                    code_fence_count = 0;
                }
            }
        }

        if in_code_block {
            // In code block, keep line as-is
            result.push_str(line);
            result.push('\n');
            continue;
        }

        // List items with anchors: "- []{#id} content" or "1. []{#id} content".
        // `find_list_item_anchor` reports an offset relative to `trimmed`, so
        // the width of the stripped indentation is added back before indexing
        // into the full `line`.
        let indent = line.len() - trimmed.len();
        let list_item = find_list_item_anchor(trimmed)
            .and_then(|anchor_start| process_list_item_anchor(line, indent + anchor_start));

        // The list-item helper only rewrites the leading anchor; run the
        // generic pass over its output so later anchors on the same line are
        // converted exactly as they are on non-list lines.
        let processed = match list_item {
            Some(item_line) => process_line_anchors(&item_line),
            None => process_line_anchors(line),
        };
        result.push_str(&processed);
        result.push('\n');
    }

    result
}
1777
/// Find if a line starts with a list marker followed by an anchor.
///
/// `trimmed` is expected to have no leading whitespace. On a match the
/// returned value is the byte offset within `trimmed` at which the `[]{#`
/// anchor opener begins: `2` for bullet items (`- `, `* `, `+ `) and
/// `digits + 2` for ordered items (`1. `, `123. `).
fn find_list_item_anchor(trimmed: &str) -> Option<usize> {
    const ANCHOR_OPEN: &str = "[]{#";
    const BULLETS: [&str; 3] = ["- ", "* ", "+ "];

    // Unordered list: marker, one space, then the anchor opener
    let bullet_with_anchor = BULLETS
        .iter()
        .filter_map(|marker| trimmed.strip_prefix(*marker))
        .any(|rest| rest.starts_with(ANCHOR_OPEN));
    if bullet_with_anchor {
        return Some(2);
    }

    // Ordered list: one or more ASCII digits, a dot, one space, then the
    // anchor opener. Digits are single bytes, so the count is a byte offset.
    let digits = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    if digits == 0 {
        return None;
    }

    trimmed[digits..]
        .starts_with(". []{#")
        .then_some(digits + 2)
}
1804
/// Process a list item line that contains an anchor.
///
/// `anchor_start` is the byte offset in `line` at which the `[]{#id}` anchor
/// is expected to begin. When a well-formed anchor with a valid id (non-empty,
/// ASCII alphanumerics, `-` or `_`) is found there, the line is returned with
/// the anchor replaced by an empty `<span>` carrying that id; text before and
/// after the anchor is kept verbatim.
///
/// Returns `None` when there is no valid anchor at `anchor_start`, including
/// when the offset lies past the end of `line` or inside a multi-byte
/// character (checked slicing is used so such offsets cannot panic).
fn process_list_item_anchor(line: &str, anchor_start: usize) -> Option<String> {
    let before_anchor = line.get(..anchor_start)?;
    let after_marker = line.get(anchor_start..)?;

    // The anchor is `[]{#` + id + `}`; the id ends at the first `}`
    let after_open = after_marker.strip_prefix("[]{#")?;
    let (id, remaining_content) = after_open.split_once('}')?;

    // Validate ID contains only allowed characters
    let id_is_valid = !id.is_empty()
        && id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');

    id_is_valid.then(|| {
        format!("{before_anchor}<span id=\"{id}\" class=\"nixos-anchor\"></span>{remaining_content}")
    })
}
1833
/// Process inline anchors in a single line.
///
/// Every `[]{#id}` whose id is non-empty and made only of ASCII alphanumerics,
/// `-` or `_` is replaced by `<span id="id" class="nixos-anchor"></span>`.
/// All other text is reproduced exactly, including things that merely look
/// like the start of an anchor (`[]`, `[]{`, `[]{#}`, `[]{#bad id}`, or an
/// unterminated `[]{#id` at the end of the line).
fn process_line_anchors(line: &str) -> String {
    const OPENER: &str = "[]{#";

    let mut result = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(pos) = rest.find(OPENER) {
        // Text before the candidate anchor is copied verbatim
        result.push_str(&rest[..pos]);
        let after_opener = &rest[pos + OPENER.len()..];

        // The id is the leading run of allowed characters. These are all
        // single-byte ASCII, so the count is also a valid byte offset.
        let id_len = after_opener
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'-' || *b == b'_')
            .count();
        let id = &after_opener[..id_len];
        let after_id = &after_opener[id_len..];

        if !id.is_empty() && after_id.starts_with('}') {
            result.push_str("<span id=\"");
            result.push_str(id);
            result.push_str("\" class=\"nixos-anchor\"></span>");
            rest = &after_id[1..]; // skip the closing '}'
        } else {
            // Not a well-formed anchor: emit what was scanned unchanged and
            // resume right after it, so whatever stopped the id (including
            // the start of another anchor) is examined normally.
            result.push_str(OPENER);
            result.push_str(id);
            rest = after_id;
        }
    }

    result.push_str(rest);
    result
}
1896
/// Process block elements in markdown content.
///
/// This function processes block elements including admonitions, figures, and definition lists
/// while being code-block aware to avoid processing inside code fences. Lines that
/// are not part of a recognised block element are kept as-is, and the output lines
/// are joined with `\n`.
///
/// # Arguments
/// * `content` - The markdown content to process
///
/// # Returns
/// The processed markdown with block elements converted to HTML
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_block_elements(content: &str) -> String {
    let mut result = Vec::new();
    let mut lines = content.lines().peekable();
    let mut in_code_block = false;
    let mut code_fence_char = None;
    let mut code_fence_count = 0;

    while let Some(line) = lines.next() {
        // Check for code fences
        let trimmed = line.trim_start();
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            let fence_char = trimmed.chars().next().unwrap();
            let fence_count = trimmed.chars().take_while(|&c| c == fence_char).count();

            if fence_count >= 3 {
                if !in_code_block {
                    // Starting a code block
                    in_code_block = true;
                    code_fence_char = Some(fence_char);
                    code_fence_count = fence_count;
                } else if code_fence_char == Some(fence_char) && fence_count >= code_fence_count {
                    // Ending a code block
                    in_code_block = false;
                    code_fence_char = None;
                    code_fence_count = 0;
                }
            }
        }

        // Only process block elements if we're not in a code block
        if !in_code_block {
            // Check for GitHub-style callouts: > [!TYPE]
            if let Some((callout_type, initial_content)) = parse_github_callout(line) {
                let content = collect_github_callout_content(&mut lines, initial_content);
                let admonition = render_admonition(&callout_type, None, &content);
                result.push(admonition);
                continue;
            }

            // Check for figures: ::: {.figure #id}
            //
            // This must run before the generic fenced-admonition check:
            // `::: {.figure ...}` also matches the admonition opener, so a
            // later figure check would never see well-formed figure blocks.
            // The attempt is made on a clone of the line iterator, which is
            // adopted only on success, so a block that turns out not to be a
            // figure (e.g. no `#` title line) loses no lines and falls
            // through to the admonition handling below.
            if trimmed.starts_with(":::") {
                let mut lookahead = lines.clone();
                if let Some((id, title, content)) = parse_figure_block(line, &mut lookahead) {
                    lines = lookahead;
                    let figure = render_figure(id.as_deref(), &title, &content);
                    result.push(figure);
                    continue;
                }
            }

            // Check for fenced admonitions: ::: {.type}
            if let Some((adm_type, id)) = parse_fenced_admonition_start(line) {
                let content = collect_fenced_content(&mut lines);
                let admonition = render_admonition(&adm_type, id.as_deref(), &content);
                result.push(admonition);
                continue;
            }

            // Check for definition lists: Term\n:   Definition
            if !line.is_empty() && !line.starts_with(':') {
                if let Some(next_line) = lines.peek() {
                    if next_line.starts_with(":   ") {
                        let term = line;
                        let def_line = lines.next().unwrap();
                        let definition = &def_line[4..]; // Skip ":   "
                        let dl = format!("<dl>\n<dt>{term}</dt>\n<dd>{definition}</dd>\n</dl>");
                        result.push(dl);
                        continue;
                    }
                }
            }
        }

        // Regular line, keep as-is
        result.push(line.to_string());
    }

    result.join("\n")
}
1984
/// Parse GitHub-style callout syntax: > [!TYPE] content
///
/// Leading whitespace is ignored. `TYPE` must be one of `NOTE`, `TIP`,
/// `IMPORTANT`, `WARNING`, `CAUTION` or `DANGER` (upper case). On success the
/// lower-cased type and the trimmed text following the closing `]` are
/// returned; any other line yields `None`.
fn parse_github_callout(line: &str) -> Option<(String, String)> {
    let after_opener = line.trim_start().strip_prefix("> [!")?;
    let (kind, trailing) = after_opener.split_once(']')?;

    let is_known_kind = matches!(
        kind,
        "NOTE" | "TIP" | "IMPORTANT" | "WARNING" | "CAUTION" | "DANGER"
    );

    is_known_kind.then(|| (kind.to_lowercase(), trailing.trim().to_string()))
}
2010
/// Collect content for GitHub-style callouts
///
/// Starts from `initial_content` (the text on the callout's own line, if any)
/// and appends every directly following line whose first non-whitespace
/// character is `>`, with that marker and the whitespace after it removed.
/// Those lines are consumed from `lines`; the first line that is not quoted is
/// left in the iterator. The collected text is returned trimmed.
fn collect_github_callout_content(
    lines: &mut std::iter::Peekable<std::str::Lines>,
    initial_content: String,
) -> String {
    let mut collected = initial_content;
    if !collected.is_empty() {
        collected.push('\n');
    }

    // Peek first so that the line ending the callout is not consumed
    while let Some(quoted) = lines
        .peek()
        .copied()
        .and_then(|candidate| candidate.trim_start().strip_prefix('>'))
    {
        collected.push_str(quoted.trim_start());
        collected.push('\n');
        lines.next();
    }

    collected.trim().to_string()
}
2037
/// Parse fenced admonition start: ::: {.type #id}
///
/// The line (ignoring surrounding whitespace) must begin with `:::`, followed
/// by optional whitespace and an attribute block opening with `{.` and closed
/// by `}`. The first whitespace-separated word inside the block is returned as
/// the type; the first word starting with `#` (if any) is returned, without
/// the `#`, as the id. Returns `None` for anything else.
fn parse_fenced_admonition_start(line: &str) -> Option<(String, Option<String>)> {
    let attr_block = line
        .trim()
        .strip_prefix(":::")?
        .trim_start()
        .strip_prefix("{.")?;

    // Only the text up to the first closing brace is considered
    let (attrs, _) = attr_block.split_once('}')?;

    let adm_type = attrs.split_whitespace().next()?;
    let id = attrs
        .split_whitespace()
        .find_map(|word| word.strip_prefix('#'))
        .map(String::from);

    Some((adm_type.to_string(), id))
}
2068
/// Collect content until closing :::
///
/// Consumes lines from `lines` up to and including the first line that,
/// ignoring surrounding whitespace, starts with `:::` (or to the end of input
/// if there is none). The lines before that marker are joined with newlines
/// and returned with leading and trailing whitespace trimmed.
fn collect_fenced_content(lines: &mut std::iter::Peekable<std::str::Lines>) -> String {
    // `take_while` also swallows the line that fails the predicate, which is
    // exactly the closing `:::` marker.
    let body: Vec<&str> = lines
        .by_ref()
        .take_while(|candidate| !candidate.trim().starts_with(":::"))
        .collect();

    body.join("\n").trim().to_string()
}
2083
/// Parse figure block: ::: {.figure #id}
///
/// `line` is the candidate opening line and `lines` yields the lines that
/// follow it. A figure is recognised when `line` starts with `:::` followed by
/// `{.figure` and the next line is a title starting with `#`.
///
/// On success the title line, the content lines and the closing `:::` line are
/// consumed from `lines`, and `(id, title, content)` is returned: `id` is the
/// first word after `#` inside the attribute block (if any), `title` is the
/// title line without its leading `#`, and `content` is the trimmed text up to
/// the closing `:::`.
///
/// Returns `None` without consuming anything from `lines` when the block is
/// not a figure, so the caller can still try other block types on the same
/// input.
fn parse_figure_block(
    line: &str,
    lines: &mut std::iter::Peekable<std::str::Lines>,
) -> Option<(Option<String>, String, String)> {
    let after_colons = line.trim().strip_prefix(":::")?.trim_start();
    if !after_colons.starts_with("{.figure") {
        return None;
    }

    // Extract ID if present: the word following the first `#` inside the
    // attribute block. Taking a single word keeps further attributes
    // (e.g. `#id .wide`) out of the id.
    let id = after_colons
        .split_once('}')
        .and_then(|(attrs, _)| attrs.split_once('#'))
        .and_then(|(_, after_hash)| after_hash.split_whitespace().next())
        .map(String::from);

    // The title must be on the next line and start with '#'. Peek rather than
    // advance so that a non-title line stays available to the caller.
    let title = lines.peek()?.trim().strip_prefix('#')?.trim().to_string();
    lines.next(); // consume the title line

    // Collect figure content up to (and consuming) the closing ":::"
    let mut content = String::new();
    for body_line in lines.by_ref() {
        if body_line.trim().starts_with(":::") {
            break;
        }
        content.push_str(body_line);
        content.push('\n');
    }

    Some((id, title, content.trim().to_string()))
}
2139
2140/// Render an admonition as HTML
2141fn render_admonition(adm_type: &str, id: Option<&str>, content: &str) -> String {
2142    let capitalized_type = crate::utils::capitalize_first(adm_type);
2143    let id_attr = id.map_or(String::new(), |id| format!(" id=\"{id}\""));
2144
2145    format!(
2146        "<div class=\"admonition {adm_type}\"{id_attr}>\n<p class=\"admonition-title\">{capitalized_type}</p>\n\n{content}\n\n</div>"
2147    )
2148}
2149
/// Render a figure as HTML
///
/// Produces a `<figure>` element (with an `id` attribute when `id` is given)
/// holding a `<figcaption>` with `title`, followed by `content`. All three
/// values are inserted verbatim.
fn render_figure(id: Option<&str>, title: &str, content: &str) -> String {
    let mut html = String::from("<figure");
    if let Some(id) = id {
        html.push_str(" id=\"");
        html.push_str(id);
        html.push('"');
    }
    html.push_str(">\n<figcaption>");
    html.push_str(title);
    html.push_str("</figcaption>\n");
    html.push_str(content);
    html.push_str("\n</figure>");
    html
}
2156
/// Process autolinks by converting plain URLs to HTML anchor tags.
///
/// This function processes autolinks in HTML content by finding plain
/// `http://` / `https://` URLs in text nodes and converting them to clickable
/// links. Text that has an `<a>`, `<code>` or `<pre>` element anywhere among
/// its ancestors is left untouched. Trailing punctuation (`. ! ? ; , ) ] }`)
/// is kept as text after the link rather than being part of the URL.
///
/// # Arguments
/// * `html` - The HTML content to process
///
/// # Returns
/// The processed HTML with autolinks converted to anchor tags
#[cfg(feature = "gfm")]
#[must_use]
pub fn process_autolinks(html: &str) -> String {
    safely_process_markup(
        html,
        |html| {
            use std::sync::LazyLock;

            use kuchikikiki::NodeRef;
            use regex::Regex;
            use tendril::TendrilSink;

            static AUTOLINK_RE: LazyLock<Regex> = LazyLock::new(|| {
                Regex::new(r#"(https?://[^\s<>"')\}]+)"#).unwrap_or_else(|e| {
                    log::error!("Failed to compile AUTOLINK_RE regex: {e}");
                    utils::never_matching_regex()
                })
            });

            let document = kuchikikiki::parse_html().one(html);

            // Collect first, mutate afterwards: the tree is not modified
            // while it is being traversed.
            let mut text_nodes_to_process = Vec::new();

            for node in document.inclusive_descendants() {
                if let Some(text_node) = node.as_text() {
                    let text_content = text_node.borrow().clone();

                    // Skip text that sits inside a link or code. Every
                    // ancestor is inspected in turn (parent, grandparent, ...)
                    // so that wrappers such as `<a><em>…</em></a>` or
                    // `<code><span>…</span></code>` are detected as well.
                    let mut is_excluded = false;
                    let mut current = node.parent();
                    while let Some(ancestor) = current {
                        if let Some(element) = ancestor.as_element() {
                            if element.name.local.as_ref() == "a"
                                || element.name.local.as_ref() == "code"
                                || element.name.local.as_ref() == "pre"
                            {
                                is_excluded = true;
                                break;
                            }
                        }
                        current = ancestor.parent();
                    }

                    if !is_excluded && AUTOLINK_RE.is_match(&text_content) {
                        text_nodes_to_process.push((node.clone(), text_content));
                    }
                }
            }

            // Process each text node that contains URLs
            for (text_node, text_content) in text_nodes_to_process {
                let mut last_end = 0;
                let mut new_children = Vec::new();

                for url_match in AUTOLINK_RE.find_iter(&text_content) {
                    // Add text before the URL
                    if url_match.start() > last_end {
                        let before_text = &text_content[last_end..url_match.start()];
                        if !before_text.is_empty() {
                            new_children.push(NodeRef::new_text(before_text));
                        }
                    }

                    // Create link for the URL, trimming common trailing
                    // punctuation that is unlikely to belong to it
                    let original_url = url_match.as_str();
                    let url = original_url.trim_end_matches(|c: char| {
                        matches!(c, '.' | '!' | '?' | ';' | ',' | ')' | ']' | '}')
                    });

                    let link = NodeRef::new_element(
                        markup5ever::QualName::new(None, ns!(html), local_name!("a")),
                        vec![(
                            kuchikikiki::ExpandedName::new("", "href"),
                            kuchikikiki::Attribute {
                                prefix: None,
                                value: url.into(),
                            },
                        )],
                    );
                    link.append(NodeRef::new_text(url));
                    new_children.push(link);

                    // Add any trimmed punctuation as separate text
                    if url.len() < original_url.len() {
                        let punctuation = &original_url[url.len()..];
                        new_children.push(NodeRef::new_text(punctuation));
                    }

                    last_end = url_match.end();
                }

                // Add remaining text after the last URL
                if last_end < text_content.len() {
                    let after_text = &text_content[last_end..];
                    if !after_text.is_empty() {
                        new_children.push(NodeRef::new_text(after_text));
                    }
                }

                // Replace the text node with new children
                if !new_children.is_empty() {
                    for child in new_children {
                        text_node.insert_before(child);
                    }
                    text_node.detach();
                }
            }

            let mut out = Vec::new();
            document.serialize(&mut out).ok();
            String::from_utf8(out).unwrap_or_default()
        },
        // Fallback handed to `safely_process_markup` for the error case.
        // NOTE(review): the intent is to return the original HTML on error;
        // confirm that `safely_process_markup` treats "" that way.
        "",
    )
}
2295
/// Process manpage references in HTML content.
///
/// This function processes manpage references by finding span elements with
/// the `manpage-reference` class and, when `manpage_urls` has an entry keyed
/// by the span's text, replacing the span with an `<a class="manpage-reference">`
/// link to that URL. Spans without a matching entry are left in place.
///
/// # Arguments
/// * `html` - The HTML content to process
/// * `manpage_urls` - Optional mapping of manpage names to URLs
///
/// # Returns
/// The processed HTML with manpage references converted to links
#[cfg(feature = "nixpkgs")]
#[must_use]
pub fn process_manpage_references(
    html: &str,
    manpage_urls: Option<&std::collections::HashMap<String, String>>,
) -> String {
    safely_process_markup(
        html,
        |html| {
            use kuchikikiki::NodeRef;
            use tendril::TendrilSink;

            let document = kuchikikiki::parse_html().one(html);

            // Replacements are gathered first and applied once the selection
            // pass is over, so the tree is not mutated while being iterated.
            let mut replacements = Vec::new();

            for span_node in document.select("span.manpage-reference").unwrap() {
                let span_el = span_node.as_node();
                let span_text = span_el.text_contents();

                // Only spans whose text has a URL mapping are rewritten
                let Some(url) = manpage_urls.and_then(|urls| urls.get(&span_text)) else {
                    continue;
                };
                let clean_url = extract_url_from_html(url);

                let href = (
                    kuchikikiki::ExpandedName::new("", "href"),
                    kuchikikiki::Attribute {
                        prefix: None,
                        value: clean_url.into(),
                    },
                );
                let class = (
                    kuchikikiki::ExpandedName::new("", "class"),
                    kuchikikiki::Attribute {
                        prefix: None,
                        value: "manpage-reference".into(),
                    },
                );

                let link = NodeRef::new_element(
                    markup5ever::QualName::new(None, ns!(html), local_name!("a")),
                    vec![href, class],
                );
                link.append(NodeRef::new_text(span_text));
                replacements.push((span_el.clone(), link));
            }

            // Swap each matched span for its link
            for (span, link) in replacements {
                span.insert_before(link);
                span.detach();
            }

            let mut out = Vec::new();
            document.serialize(&mut out).ok();
            String::from_utf8(out).unwrap_or_default()
        },
        // Return original HTML on error
        "",
    )
}
2370
/// Rewrites NixOS/Nix option references in HTML output as links.
///
/// The HTML is parsed with `kuchikikiki::parse_html`, and every `<code>`
/// element whose text passes `is_nixos_option_reference` is replaced by
/// `<a class="option-reference" href="options.html#option-..."><code>...</code></a>`,
/// where the anchor id is the option path with dots turned into dashes. The
/// replacement `<code>` element is created fresh and carries only the text of
/// the original.
///
/// A `<code>` element is left untouched when:
///
/// * its `class` attribute contains one of the role markers (`command`,
///   `env-var`, `file-path`, `nixos-option`, `nix-var`); this is a substring
///   test on the attribute value, or
/// * it already sits inside an `<a>` element. This covers links produced by
///   an earlier run of this function as well as links written by the author;
///   wrapping such code would nest one anchor inside another, which HTML does
///   not permit.
///
/// # Arguments
///
/// * `html` - The HTML string to process.
///
/// # Returns
///
/// The serialized document with option references rewritten as links.
#[cfg(feature = "ndg-flavored")]
#[must_use]
pub fn process_option_references(html: &str) -> String {
    use kuchikikiki::{Attribute, ExpandedName, NodeRef};
    use markup5ever::{QualName, local_name, ns};
    use tendril::TendrilSink;

    // Class-name fragments marking a `<code>` element as already owned by a role.
    const ROLE_CLASSES: [&str; 5] = ["command", "env-var", "file-path", "nixos-option", "nix-var"];

    safely_process_markup(
        html,
        |html| {
            let document = kuchikikiki::parse_html().one(html);

            // Replacements are queued here and applied after the traversal.
            let mut to_replace = vec![];

            for code_node in document
                .select("code")
                .expect("`code` is a valid CSS selector")
            {
                let code_el = code_node.as_node();

                // Skip code elements that already carry a role-specific class.
                let has_role_class = code_el.as_element().is_some_and(|element| {
                    element
                        .attributes
                        .borrow()
                        .get(local_name!("class"))
                        .is_some_and(|class_attr| {
                            ROLE_CLASSES.iter().any(|role| class_attr.contains(*role))
                        })
                });
                if has_role_class {
                    continue;
                }

                // Skip code that is already part of a link, whatever its class:
                // adding another `<a>` around it would produce nested anchors.
                let mut inside_link = false;
                let mut current = code_el.parent();
                while let Some(ancestor) = current {
                    if ancestor
                        .as_element()
                        .is_some_and(|element| element.name.local == local_name!("a"))
                    {
                        inside_link = true;
                        break;
                    }
                    current = ancestor.parent();
                }
                if inside_link {
                    continue;
                }

                let code_text = code_el.text_contents();
                if !is_nixos_option_reference(&code_text) {
                    continue;
                }

                // `services.foo.enable` -> `options.html#option-services-foo-enable`
                let option_id = format!("option-{}", code_text.replace('.', "-"));
                let attrs = vec![
                    (
                        ExpandedName::new("", "href"),
                        Attribute {
                            prefix: None,
                            value: format!("options.html#{option_id}"),
                        },
                    ),
                    (
                        ExpandedName::new("", "class"),
                        Attribute {
                            prefix: None,
                            value: "option-reference".into(),
                        },
                    ),
                ];
                let link = NodeRef::new_element(
                    QualName::new(None, ns!(html), local_name!("a")),
                    attrs,
                );
                let code = NodeRef::new_element(
                    QualName::new(None, ns!(html), local_name!("code")),
                    vec![],
                );
                code.append(NodeRef::new_text(code_text));
                link.append(code);
                to_replace.push((code_el.clone(), link));
            }

            // Put each link where the original `<code>` was, then drop the original.
            for (old, new) in to_replace {
                old.insert_before(new);
                old.detach();
            }

            // Serialization errors are ignored; bytes that are not valid
            // UTF-8 yield an empty string.
            let mut out = Vec::new();
            document.serialize(&mut out).ok();
            String::from_utf8(out).unwrap_or_default()
        },
        // Fallback argument for `safely_process_markup`.
        // NOTE(review): the empty string is presumably interpreted by the
        // helper as "return the original HTML on error" - confirm there.
        "",
    )
}
2482
/// Check if a string looks like a `NixOS` option reference.
///
/// A reference is a dotted path such as `services.nginx.enable` that:
///
/// * starts with an alphabetic character,
/// * has at least three segments (i.e. at least two dots), and
/// * consists only of non-empty segments made of alphanumeric characters,
///   `-` and `_`.
///
/// Whitespace and characters such as `<`, `>`, `$` or `/` are therefore
/// rejected, and so are empty segments (`foo..bar`, `foo.bar.`, `foo...`),
/// which can never name an option and would otherwise produce anchors like
/// `option-foo--bar`.
fn is_nixos_option_reference(text: &str) -> bool {
    // Options don't start with numbers or special characters.
    if !text.chars().next().is_some_and(char::is_alphabetic) {
        return false;
    }

    let is_valid_segment = |segment: &str| {
        !segment.is_empty()
            && segment
                .chars()
                .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
    };

    // Two dots means three segments; every one of them must be well-formed.
    text.split('.').count() >= 3 && text.split('.').all(is_valid_segment)
}
2505
2506/// Collect all markdown files from the input directory
2507pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
2508    let mut files = Vec::with_capacity(100);
2509
2510    for entry in WalkDir::new(input_dir)
2511        .follow_links(true)
2512        .into_iter()
2513        .filter_map(Result::ok)
2514    {
2515        let path = entry.path();
2516        if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
2517            files.push(path.to_owned());
2518        }
2519    }
2520
2521    trace!("Found {} markdown files to process", files.len());
2522    files
2523}
2524
/// Extract the URL from an HTML anchor tag, or hand back a plain URL unchanged.
///
/// When the input begins with `<a href="`, the returned slice is everything
/// between that opening quote and the next `"`. Any other input, including an
/// anchor whose `href` value is never closed, is returned as-is. The result
/// always borrows from the input, and the extracted value is not decoded or
/// otherwise altered.
fn extract_url_from_html(url_or_html: &str) -> &str {
    url_or_html
        .strip_prefix("<a href=\"")
        // The href value runs up to the next double quote.
        .and_then(|after_quote| after_quote.split_once('"'))
        .map_or(url_or_html, |(href, _)| href)
}
ndg_commonmark/processor.rs

ndg_commonmark/
processor.rs