1use std::{
6 collections::HashMap,
7 path::{Path, PathBuf},
8 sync::LazyLock,
9};
10
11use comrak::{
12 Arena,
13 nodes::{AstNode, NodeHeading, NodeValue},
14 options::Options,
15 parse_document,
16};
17use log::trace;
18use markup5ever::local_name;
19use regex::Regex;
20use walkdir::WalkDir;
21
22use super::{
23 dom::safe_select,
24 process::process_safe,
25 types::{
26 AstTransformer,
27 MarkdownOptions,
28 MarkdownProcessor,
29 PromptTransformer,
30 },
31};
32use crate::{
33 syntax::create_default_manager,
34 types::{Header, MarkdownResult},
35 utils,
36};
37
38static HEADER_ANCHOR_RE: LazyLock<Regex> = LazyLock::new(|| {
39 Regex::new(r"<h([1-6])>(.*?)\s*\{#([a-zA-Z0-9_-]+)\}(.*?)</h[1-6]>")
40 .unwrap_or_else(|e| {
41 log::error!("Failed to compile HEADER_ANCHOR_RE regex: {e}");
42 utils::never_matching_regex().unwrap_or_else(|_| {
43 #[allow(
44 clippy::expect_used,
45 reason = "This pattern is guaranteed to be valid"
46 )]
47 Regex::new(r"[^\s\S]")
48 .expect("regex pattern [^\\s\\S] should always compile")
49 })
50 })
51});
52
53static HEADER_NO_ID_RE: LazyLock<Regex> = LazyLock::new(|| {
54 Regex::new(r"<h([1-6])>(.*?)</h[1-6]>").unwrap_or_else(|e| {
55 log::error!("Failed to compile HEADER_NO_ID_RE regex: {e}");
56 utils::never_matching_regex().unwrap_or_else(|_| {
57 #[allow(
58 clippy::expect_used,
59 reason = "This pattern is guaranteed to be valid"
60 )]
61 Regex::new(r"[^\s\S]")
62 .expect("regex pattern [^\\s\\S] should always compile")
63 })
64 })
65});
66
67static HTML_TAG_RE: LazyLock<Regex> = LazyLock::new(|| {
68 Regex::new(r"<[^>]+>").unwrap_or_else(|e| {
69 log::error!("Failed to compile HTML_TAG_RE regex: {e}");
70 utils::never_matching_regex().unwrap_or_else(|_| {
71 #[allow(
72 clippy::expect_used,
73 reason = "This pattern is guaranteed to be valid"
74 )]
75 Regex::new(r"[^\s\S]")
76 .expect("regex pattern [^\\s\\S] should always compile")
77 })
78 })
79});
80
81impl MarkdownProcessor {
82 #[must_use]
84 pub fn new(options: MarkdownOptions) -> Self {
85 let manpage_urls = options
86 .manpage_urls_path
87 .as_ref()
88 .and_then(|path| crate::utils::load_manpage_urls(path).ok());
89
90 let syntax_manager = if options.highlight_code {
91 match create_default_manager(
92 options
93 .syntax_queries_path
94 .as_deref()
95 .map(std::path::Path::new),
96 ) {
97 Ok(manager) => {
98 log::info!("Syntax highlighting initialized successfully");
99 Some(manager)
100 },
101 Err(e) => {
102 log::error!("Failed to initialize syntax highlighting: {e}");
103 log::warn!(
104 "Continuing without syntax highlighting - code blocks will not be \
105 highlighted"
106 );
107 None
108 },
109 }
110 } else {
111 None
112 };
113
114 Self {
115 options,
116 manpage_urls,
117 syntax_manager,
118 base_dir: std::path::PathBuf::from("."),
119 }
120 }
121
122 #[must_use]
124 pub const fn options(&self) -> &MarkdownOptions {
125 &self.options
126 }
127
128 #[must_use]
130 pub fn with_base_dir(mut self, base_dir: &std::path::Path) -> Self {
131 self.base_dir = base_dir.to_path_buf();
132 self
133 }
134
135 #[must_use]
137 pub const fn has_feature(&self, feature: ProcessorFeature) -> bool {
138 match feature {
139 ProcessorFeature::Gfm => self.options.gfm,
140 ProcessorFeature::Nixpkgs => self.options.nixpkgs,
141 ProcessorFeature::SyntaxHighlighting => self.options.highlight_code,
142 ProcessorFeature::ManpageUrls => self.manpage_urls.is_some(),
143 }
144 }
145
146 #[must_use]
148 pub const fn manpage_urls(&self) -> Option<&HashMap<String, String>> {
149 self.manpage_urls.as_ref()
150 }
151
152 #[must_use]
154 pub fn highlight_codeblocks(&self, html: &str) -> String {
155 use kuchikikiki::parse_html;
156 use tendril::TendrilSink;
157
158 if !self.options.highlight_code || self.syntax_manager.is_none() {
159 return html.to_string();
160 }
161
162 let document = parse_html().one(html);
163
164 let mut code_blocks = Vec::new();
166 for pre_node in safe_select(&document, "pre > code") {
167 let code_node = pre_node;
168 if let Some(element) = code_node.as_element() {
169 let language = element
170 .attributes
171 .borrow()
172 .get("class")
173 .and_then(|class| class.strip_prefix("language-"))
174 .unwrap_or("text")
175 .to_string();
176 let code_text = code_node.text_contents();
177
178 if let Some(pre_parent) = code_node.parent() {
179 code_blocks.push((
180 pre_parent.clone(),
181 code_node.clone(),
182 code_text,
183 language,
184 ));
185 }
186 }
187 }
188
189 for (pre_element, _code_node, code_text, language) in code_blocks {
191 if let Some(highlighted) = self.highlight_code_html(&code_text, &language)
192 {
193 let wrapped_html = format!(
195 r#"<pre class="highlight"><code class="language-{language}">{highlighted}</code></pre>"#
196 );
197 let fragment = parse_html().one(wrapped_html.as_str());
198 pre_element.insert_after(fragment);
199 pre_element.detach();
200 }
201 }
203
204 let mut buf = Vec::new();
205 if let Err(e) = document.serialize(&mut buf) {
206 log::warn!("DOM serialization failed: {e:?}");
207 return html.to_string(); }
209 String::from_utf8(buf).unwrap_or_else(|_| html.to_string())
210 }
211
212 fn handle_hardtabs(&self, code: &str) -> String {
214 use super::types::TabStyle;
215
216 if !code.contains('\t') {
218 return code.to_string();
219 }
220
221 match self.options.tab_style {
222 TabStyle::None => code.to_string(),
224
225 TabStyle::Warn => {
227 log::warn!(
228 "Hard tabs detected in code block. Consider using spaces for \
229 consistency. Tools like editorconfig may help you normalize spaces \
230 in your documents."
231 );
232 code.to_string()
233 },
234
235 TabStyle::Normalize => {
238 log::debug!("Replacing hard tabs with spaces");
239 code.replace('\t', " ")
240 },
241 }
242 }
243
244 fn process_hardtabs(&self, markdown: &str) -> String {
246 use super::types::TabStyle;
247 use crate::utils::codeblock::FenceTracker;
248
249 if self.options.tab_style == TabStyle::None {
251 return markdown.to_string();
252 }
253
254 let mut result = String::with_capacity(markdown.len());
255 let mut lines = markdown.lines().peekable();
256 let mut tracker = FenceTracker::new();
257
258 while let Some(line) = lines.next() {
259 tracker = tracker.process_line(line);
260
261 let processed_line = if tracker.in_code_block() && line.contains('\t') {
263 self.handle_hardtabs(line)
264 } else {
265 line.to_string()
266 };
267
268 result.push_str(&processed_line);
269
270 if lines.peek().is_some() {
272 result.push('\n');
273 }
274 }
275
276 result
277 }
278
279 fn highlight_code_html(&self, code: &str, language: &str) -> Option<String> {
282 if !self.options.highlight_code {
283 return None;
284 }
285
286 let syntax_manager = self.syntax_manager.as_ref()?;
287
288 syntax_manager
289 .highlight_code(code, language, self.options.highlight_theme.as_deref())
290 .ok()
291 }
292
293 #[must_use]
295 pub fn render(&self, markdown: &str) -> MarkdownResult {
296 let (preprocessed, included_files) = self.preprocess(markdown);
297 let (headers, title) = self.extract_headers(&preprocessed);
298 let html = self.process_html_pipeline(&preprocessed);
299
300 MarkdownResult {
301 html,
302 headers,
303 title,
304 included_files,
305 }
306 }
307
308 fn process_html_pipeline(&self, content: &str) -> String {
310 let mut html = self.convert_to_html(content);
311
312 if cfg!(feature = "ndg-flavored") {
314 #[cfg(feature = "ndg-flavored")]
315 {
316 html = super::extensions::process_option_references(
317 &html,
318 self.options.valid_options.as_ref(),
319 );
320 }
321 }
322
323 if self.options.nixpkgs {
324 html = self.process_manpage_references_html(&html);
325 }
326
327 if self.options.highlight_code {
328 html = self.highlight_codeblocks(&html);
329 }
330
331 self.kuchiki_postprocess(&html)
332 }
333
334 fn preprocess(
336 &self,
337 content: &str,
338 ) -> (String, Vec<crate::types::IncludedFile>) {
339 let mut processed = content.to_string();
340 let mut included_files = Vec::new();
341
342 processed = super::extensions::process_myst_autolinks(&processed);
344
345 processed = self.process_hardtabs(&processed);
347
348 if self.options.nixpkgs {
349 let (content, files) = self.apply_nixpkgs_preprocessing(&processed);
350 processed = content;
351 included_files = files;
352 }
353
354 if self.options.nixpkgs || cfg!(feature = "ndg-flavored") {
355 processed = super::extensions::process_role_markup(
356 &processed,
357 self.manpage_urls.as_ref(),
358 self.options.auto_link_options,
359 self.options.valid_options.as_ref(),
360 );
361 }
362
363 #[cfg(feature = "wiki")]
364 {
365 processed = super::extensions::process_wikilinks(&processed);
366 }
367
368 (processed, included_files)
369 }
370
/// Nixpkgs-specific preprocessing (built with the `nixpkgs` feature).
///
/// Resolves file includes relative to the processor's base directory, then
/// processes block elements, bracketed spans and inline anchors, in that
/// order. If include processing fails, a warning is logged and the remaining
/// passes run on the original content with no included files reported.
#[cfg(feature = "nixpkgs")]
fn apply_nixpkgs_preprocessing(
  &self,
  content: &str,
) -> (String, Vec<crate::types::IncludedFile>) {
  let (expanded, included_files) =
    super::extensions::process_file_includes(content, &self.base_dir, 0)
      .unwrap_or_else(|e| {
        log::warn!(
          "File include processing failed: {e}. Continuing without includes."
        );
        (content.to_string(), Vec::new())
      });

  let blocks_done = super::extensions::process_block_elements(&expanded);
  let spans_done = super::extensions::process_bracketed_spans(&blocks_done);

  (
    super::extensions::process_inline_anchors(&spans_done),
    included_files,
  )
}
393
394 #[cfg(not(feature = "nixpkgs"))]
396 fn apply_nixpkgs_preprocessing(
397 &self,
398 content: &str,
399 ) -> (String, Vec<crate::types::IncludedFile>) {
400 (content.to_string(), Vec::new())
401 }
402
403 #[must_use]
405 pub fn extract_headers(
406 &self,
407 content: &str,
408 ) -> (Vec<Header>, Option<String>) {
409 use std::fmt::Write;
410
411 let arena = Arena::new();
412 let options = self.comrak_options();
413
414 let content = remove_admonition_blocks_for_headers(content);
415
416 let mut normalized = String::with_capacity(content.len());
418 let mut lines = content.lines().peekable();
419 while let Some(line) = lines.next() {
420 let trimmed = line.trim();
421 if !trimmed.starts_with('#')
422 && !lines
423 .peek()
424 .is_some_and(|next| is_setext_heading_underline(next.trim()))
425 && let Some(anchor_start) = trimmed.rfind("{#")
426 && let Some(anchor_end) = trimmed[anchor_start..].find('}')
427 {
428 let text = trimmed[..anchor_start].trim_end();
429 let id = &trimmed[anchor_start + 2..anchor_start + anchor_end];
430 let _ = writeln!(normalized, "## {text} {{#{id}}}");
431 continue;
432 }
433 normalized.push_str(line);
434 normalized.push('\n');
435 }
436
437 let root = parse_document(&arena, &normalized, &options);
438
439 let mut headers = Vec::new();
440 let mut found_title = None;
441
442 for node in root.descendants() {
443 if let NodeValue::Heading(NodeHeading { level, .. }) =
444 &node.data.borrow().value
445 {
446 let mut text = String::new();
447 let mut explicit_id = None;
448
449 for child in node.children() {
450 match &child.data.borrow().value {
451 NodeValue::Text(t) => text.push_str(t),
452 NodeValue::Code(t) => text.push_str(&t.literal),
453 NodeValue::Link(..)
454 | NodeValue::Emph
455 | NodeValue::Strong
456 | NodeValue::Subscript
457 | NodeValue::Strikethrough
458 | NodeValue::Superscript
459 | NodeValue::FootnoteReference(..) => {
460 text.push_str(&extract_inline_text(child));
461 },
462 NodeValue::HtmlInline(html) => {
463 let html_str = html.as_str();
465 if let Some(start) = html_str.find("{#")
466 && let Some(end) = html_str[start..].find('}')
467 {
468 let anchor = &html_str[start + 2..start + end];
469 explicit_id = Some(anchor.to_string());
470 }
471 },
472 #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
473 NodeValue::Image(..) => {},
474 _ => {},
475 }
476 }
477
478 let trimmed = text.trim_end();
480 #[allow(clippy::option_if_let_else)]
481 let (final_text, id) = if let Some(start) = trimmed.rfind("{#") {
483 if let Some(end) = trimmed[start..].find('}') {
484 let anchor = &trimmed[start + 2..start + end];
485 (trimmed[..start].trim_end().to_string(), anchor.to_string())
486 } else {
487 (
488 text.clone(),
489 explicit_id.unwrap_or_else(|| utils::slugify(&text)),
490 )
491 }
492 } else {
493 (
494 text.clone(),
495 explicit_id.unwrap_or_else(|| utils::slugify(&text)),
496 )
497 };
498 if *level == 1 && found_title.is_none() {
499 found_title = Some(final_text.clone());
500 }
501 headers.push(Header {
502 text: final_text,
503 level: *level,
504 id,
505 });
506 }
507 }
508
509 (headers, found_title)
510 }
511
512 fn convert_to_html(&self, content: &str) -> String {
514 let arena = Arena::new();
516 let options = self.comrak_options();
517 let root = parse_document(&arena, content, &options);
518
519 let prompt_transformer = PromptTransformer;
521 prompt_transformer.transform(root);
522
523 let mut html_output = String::new();
524 if let Err(e) = comrak::format_html(root, &options, &mut html_output) {
525 log::error!("Failed to format HTML: {e}");
526 }
527
528 Self::process_header_anchors_html(&html_output)
530 }
531
532 fn process_header_anchors_html(html: &str) -> String {
536 let result = HEADER_ANCHOR_RE
538 .replace_all(html, |caps: ®ex::Captures| {
539 let level = &caps[1];
540 let prefix = &caps[2];
541 let id = &caps[3];
542 let suffix = &caps[4];
543 format!("<h{level} id=\"{id}\">{prefix}{suffix}</h{level}>")
544 })
545 .to_string();
546
547 HEADER_NO_ID_RE
549 .replace_all(&result, |caps: ®ex::Captures| {
550 let level = &caps[1];
551 let content = &caps[2];
552 let text_only = HTML_TAG_RE.replace_all(content, "");
554 let id = utils::slugify(&text_only);
555 if id.is_empty() {
556 format!("<h{level}>{content}</h{level}>")
558 } else {
559 format!("<h{level} id=\"{id}\">{content}</h{level}>")
560 }
561 })
562 .to_string()
563 }
564
565 fn comrak_options(&self) -> Options<'_> {
567 let mut options = Options::default();
568 if self.options.gfm {
570 options.extension.table = true;
571 options.extension.footnotes = true;
572 options.extension.strikethrough = true;
573 options.extension.tasklist = true;
574 options.extension.superscript = true;
575 options.extension.autolink = true;
576 }
577
578 options.render.r#unsafe = true;
581
582 options.extension.header_id_prefix = None;
584 options.extension.description_lists = true;
585 options
586 }
587
/// Processes manpage references in `html` using the loaded URL map, if any
/// (built with the `nixpkgs` feature).
#[cfg(feature = "nixpkgs")]
fn process_manpage_references_html(&self, html: &str) -> String {
  let urls = self.manpage_urls.as_ref();
  super::extensions::process_manpage_references(html, urls)
}
596
597 #[cfg(not(feature = "nixpkgs"))]
600 fn process_manpage_references_html(&self, html: &str) -> String {
601 html.to_string()
602 }
603
604 #[allow(
606 clippy::unused_self,
607 reason = "Method signature matches processor pattern"
608 )]
609 fn kuchiki_postprocess(&self, html: &str) -> String {
610 kuchiki_postprocess_html(html, |document| {
612 Self::apply_dom_transformations(document);
613 })
614 }
615
616 fn apply_dom_transformations(document: &kuchikikiki::NodeRef) {
618 Self::process_list_item_id_markers(document);
619 Self::process_header_anchor_comments(document);
620 Self::process_list_item_inline_anchors(document);
621 Self::process_paragraph_inline_anchors(document);
622 Self::process_remaining_inline_anchors(document);
623 Self::process_markdown_links(document);
624 Self::process_option_anchor_links(document);
625 Self::process_empty_auto_links(document);
626 Self::process_empty_html_links(document);
627 }
628
629 fn process_list_item_id_markers(document: &kuchikikiki::NodeRef) {
631 let mut to_modify = Vec::new();
632
633 for comment in document.inclusive_descendants() {
634 if let Some(comment_node) = comment.as_comment() {
635 let comment_text = comment_node.borrow();
636 if let Some(id_start) = comment_text.find("nixos-anchor-id:") {
637 let id = comment_text[id_start + 16..].trim();
638 if !id.is_empty()
639 && id
640 .chars()
641 .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
642 {
643 if let Some(parent) = comment.parent()
645 && let Some(element) = parent.as_element()
646 && element.name.local.as_ref() == "li"
647 {
648 to_modify.push((comment.clone(), id.to_string()));
649 }
650 }
651 }
652 }
653 }
654
655 for (comment_node, id) in to_modify {
656 let span = kuchikikiki::NodeRef::new_element(
657 markup5ever::QualName::new(
658 None,
659 markup5ever::ns!(html),
660 local_name!("span"),
661 ),
662 vec![
663 (
664 kuchikikiki::ExpandedName::new("", "id"),
665 kuchikikiki::Attribute {
666 prefix: None,
667 value: id,
668 },
669 ),
670 (
671 kuchikikiki::ExpandedName::new("", "class"),
672 kuchikikiki::Attribute {
673 prefix: None,
674 value: "nixos-anchor".into(),
675 },
676 ),
677 ],
678 );
679 comment_node.insert_after(span);
680 comment_node.detach();
681 }
682 }
683
684 fn process_header_anchor_comments(document: &kuchikikiki::NodeRef) {
686 let mut to_modify = Vec::new();
687
688 for comment in document.inclusive_descendants() {
689 if let Some(comment_node) = comment.as_comment() {
690 let comment_text = comment_node.borrow();
691 if let Some(anchor_start) = comment_text.find("anchor:") {
692 let id = comment_text[anchor_start + 7..].trim();
693 if !id.is_empty()
694 && id
695 .chars()
696 .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
697 {
698 if let Some(parent) = comment.parent()
700 && let Some(element) = parent.as_element()
701 {
702 let tag_name = element.name.local.as_ref();
703 if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
704 to_modify.push((
705 parent.clone(),
706 comment.clone(),
707 id.to_string(),
708 ));
709 }
710 }
711 }
712 }
713 }
714 }
715
716 for (header_element, comment_node, id) in to_modify {
717 if let Some(element) = header_element.as_element() {
718 element
719 .attributes
720 .borrow_mut()
721 .insert(local_name!("id"), id);
722 comment_node.detach();
723 }
724 }
725 }
726
727 fn process_list_item_inline_anchors(document: &kuchikikiki::NodeRef) {
729 for li_node in safe_select(document, "li") {
730 let li_element = li_node;
731
732 let has_code = !safe_select(&li_element, "code, pre").is_empty();
734 if has_code {
735 continue; }
737
738 let text_content = li_element.text_contents();
739
740 if let Some(anchor_start) = text_content.find("[]{#")
741 && let Some(anchor_end) = text_content[anchor_start..].find('}')
742 {
743 let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
744 if !id.is_empty()
745 && id
746 .chars()
747 .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
748 {
749 let remaining_content =
750 &text_content[anchor_start + anchor_end + 1..];
751
752 for child in li_element.children() {
754 child.detach();
755 }
756
757 let span = kuchikikiki::NodeRef::new_element(
758 markup5ever::QualName::new(
759 None,
760 markup5ever::ns!(html),
761 local_name!("span"),
762 ),
763 vec![
764 (
765 kuchikikiki::ExpandedName::new("", "id"),
766 kuchikikiki::Attribute {
767 prefix: None,
768 value: id.into(),
769 },
770 ),
771 (
772 kuchikikiki::ExpandedName::new("", "class"),
773 kuchikikiki::Attribute {
774 prefix: None,
775 value: "nixos-anchor".into(),
776 },
777 ),
778 ],
779 );
780 li_element.append(span);
781 if !remaining_content.is_empty() {
782 li_element
783 .append(kuchikikiki::NodeRef::new_text(remaining_content));
784 }
785 }
786 }
787 }
788 }
789
790 fn process_paragraph_inline_anchors(document: &kuchikikiki::NodeRef) {
792 for p_node in safe_select(document, "p") {
793 let p_element = p_node;
794
795 let has_code = !safe_select(&p_element, "code, pre").is_empty();
797 if has_code {
798 continue; }
800
801 let text_content = p_element.text_contents();
802
803 if let Some(anchor_start) = text_content.find("[]{#")
804 && let Some(anchor_end) = text_content[anchor_start..].find('}')
805 {
806 let id = &text_content[anchor_start + 4..anchor_start + anchor_end];
807 if !id.is_empty()
808 && id
809 .chars()
810 .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
811 {
812 let remaining_content =
813 &text_content[anchor_start + anchor_end + 1..];
814
815 for child in p_element.children() {
817 child.detach();
818 }
819
820 let span = kuchikikiki::NodeRef::new_element(
821 markup5ever::QualName::new(
822 None,
823 markup5ever::ns!(html),
824 local_name!("span"),
825 ),
826 vec![
827 (
828 kuchikikiki::ExpandedName::new("", "id"),
829 kuchikikiki::Attribute {
830 prefix: None,
831 value: id.into(),
832 },
833 ),
834 (
835 kuchikikiki::ExpandedName::new("", "class"),
836 kuchikikiki::Attribute {
837 prefix: None,
838 value: "nixos-anchor".into(),
839 },
840 ),
841 ],
842 );
843 p_element.append(span);
844 if !remaining_content.is_empty() {
845 p_element.append(kuchikikiki::NodeRef::new_text(remaining_content));
846 }
847 }
848 }
849 }
850 }
851
852 fn process_remaining_inline_anchors(document: &kuchikikiki::NodeRef) {
854 let mut text_nodes_to_process = Vec::new();
855
856 for node in document.inclusive_descendants() {
857 if let Some(text_node) = node.as_text() {
858 let mut parent = node.parent();
860 let mut in_code = false;
861 while let Some(p) = parent {
862 if let Some(element) = p.as_element()
863 && (element.name.local == local_name!("code")
864 || element.name.local == local_name!("pre"))
865 {
866 in_code = true;
867 break;
868 }
869 parent = p.parent();
870 }
871
872 if !in_code {
874 let text_content = text_node.borrow().clone();
875 if text_content.contains("[]{#") {
876 text_nodes_to_process.push((node.clone(), text_content));
877 }
878 }
879 }
880 }
881
882 for (text_node, text_content) in text_nodes_to_process {
883 let mut last_end = 0;
884 let mut new_children = Vec::new();
885
886 let chars = text_content.chars().collect::<Vec<_>>();
888 let mut i = 0;
889 while i < chars.len() {
890 if i + 4 < chars.len()
891 && chars[i] == '['
892 && chars[i + 1] == ']'
893 && chars[i + 2] == '{'
894 && chars[i + 3] == '#'
895 {
896 let anchor_start = i;
898 i += 4; let mut id = String::new();
901 while i < chars.len() && chars[i] != '}' {
902 if chars[i].is_alphanumeric() || chars[i] == '-' || chars[i] == '_'
903 {
904 id.push(chars[i]);
905 i += 1;
906 } else {
907 break;
908 }
909 }
910
911 if i < chars.len() && chars[i] == '}' && !id.is_empty() {
912 let anchor_end = i + 1;
914
915 if anchor_start > last_end {
917 let before_text: String =
918 chars[last_end..anchor_start].iter().collect();
919 if !before_text.is_empty() {
920 new_children.push(kuchikikiki::NodeRef::new_text(before_text));
921 }
922 }
923
924 let span = kuchikikiki::NodeRef::new_element(
926 markup5ever::QualName::new(
927 None,
928 markup5ever::ns!(html),
929 local_name!("span"),
930 ),
931 vec![
932 (
933 kuchikikiki::ExpandedName::new("", "id"),
934 kuchikikiki::Attribute {
935 prefix: None,
936 value: id,
937 },
938 ),
939 (
940 kuchikikiki::ExpandedName::new("", "class"),
941 kuchikikiki::Attribute {
942 prefix: None,
943 value: "nixos-anchor".into(),
944 },
945 ),
946 ],
947 );
948 new_children.push(span);
949
950 last_end = anchor_end;
951 i = anchor_end;
952 } else {
953 i += 1;
954 }
955 } else {
956 i += 1;
957 }
958 }
959
960 if last_end < chars.len() {
962 let after_text: String = chars[last_end..].iter().collect();
963 if !after_text.is_empty() {
964 new_children.push(kuchikikiki::NodeRef::new_text(after_text));
965 }
966 }
967
968 if !new_children.is_empty() {
970 for child in new_children {
971 text_node.insert_before(child);
972 }
973 text_node.detach();
974 }
975 }
976 }
977
978 fn process_empty_auto_links(document: &kuchikikiki::NodeRef) {
980 for link_node in safe_select(document, "a") {
981 let link_element = link_node;
982 if let Some(element) = link_element.as_element() {
983 let href = element
984 .attributes
985 .borrow()
986 .get(local_name!("href"))
987 .map(std::string::ToString::to_string);
988 let text_content = link_element.text_contents();
989
990 if let Some(href_value) = href
991 && href_value.starts_with('#')
992 && (text_content.trim().is_empty()
993 || text_content.trim() == "{{ANCHOR}}")
994 {
995 if text_content.trim() == "{{ANCHOR}}" {
997 for child in link_element.children() {
998 child.detach();
999 }
1000 }
1001 let display_text = Self::humanize_anchor_id(&href_value);
1003 link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1004 }
1005 }
1006 }
1007 }
1008
1009 fn process_empty_html_links(document: &kuchikikiki::NodeRef) {
1011 for link_node in safe_select(document, "a[href^='#']") {
1012 let link_element = link_node;
1013 let text_content = link_element.text_contents();
1014
1015 if text_content.trim().is_empty() || text_content.trim() == "{{ANCHOR}}" {
1016 if text_content.trim() == "{{ANCHOR}}" {
1018 for child in link_element.children() {
1019 child.detach();
1020 }
1021 }
1022 if let Some(element) = link_element.as_element()
1023 && let Some(href) =
1024 element.attributes.borrow().get(local_name!("href"))
1025 {
1026 let display_text = Self::humanize_anchor_id(href);
1027 link_element.append(kuchikikiki::NodeRef::new_text(display_text));
1028 }
1029 }
1030 }
1031 }
1032
1033 fn process_option_anchor_links(document: &kuchikikiki::NodeRef) {
1035 let mut to_modify = Vec::new();
1036
1037 for link_node in safe_select(document, "a[href^='#opt-']") {
1039 let link_element = link_node;
1040 if let Some(element) = link_element.as_element() {
1041 let href = element
1042 .attributes
1043 .borrow()
1044 .get(local_name!("href"))
1045 .map(std::string::ToString::to_string);
1046 let text_content = link_element.text_contents();
1047
1048 if let Some(href_value) = href
1049 && href_value.starts_with("#opt-")
1050 {
1051 let option_anchor = href_value[1..].to_string(); let needs_text_replacement = text_content.trim().is_empty()
1053 || text_content.trim() == "{{ANCHOR}}";
1054 to_modify.push((
1055 link_element.clone(),
1056 option_anchor,
1057 needs_text_replacement,
1058 ));
1059 }
1060 }
1061 }
1062
1063 for (link_element, option_anchor, needs_text_replacement) in to_modify {
1065 if let Some(element) = link_element.as_element() {
1066 let new_href = format!("options.html#{option_anchor}");
1067 element
1068 .attributes
1069 .borrow_mut()
1070 .insert(local_name!("href"), new_href);
1071
1072 if needs_text_replacement {
1073 for child in link_element.children() {
1075 child.detach();
1076 }
1077
1078 if let Some(option_path) = option_anchor.strip_prefix("opt-") {
1081 let option_name = option_path.replace('-', ".");
1082 link_element.append(kuchikikiki::NodeRef::new_text(option_name));
1083 }
1084 }
1085 }
1086 }
1087 }
1088
1089 fn process_markdown_links(document: &kuchikikiki::NodeRef) {
1091 for link_node in safe_select(document, "a") {
1092 let link_element = link_node;
1093 if let Some(element) = link_element.as_element() {
1094 let href = element
1095 .attributes
1096 .borrow()
1097 .get(local_name!("href"))
1098 .map(std::string::ToString::to_string);
1099
1100 if let Some(href_value) = href {
1101 if !href_value.starts_with("http://")
1104 && !href_value.starts_with("https://")
1105 && !href_value.starts_with('#')
1106 && !href_value.starts_with("mailto:")
1107 {
1108 let (path_part, suffix) = href_value
1110 .find(['#', '?'])
1111 .map_or((href_value.as_str(), ""), |idx| {
1112 href_value.split_at(idx)
1113 });
1114
1115 if std::path::Path::new(path_part)
1116 .extension()
1117 .is_some_and(|ext| ext.eq_ignore_ascii_case("md"))
1118 {
1119 let new_href =
1120 format!("{}.html{}", &path_part[..path_part.len() - 3], suffix);
1121 element
1122 .attributes
1123 .borrow_mut()
1124 .insert(local_name!("href"), new_href);
1125 }
1126 }
1127 }
1128 }
1129 }
1130 }
1131
/// Turns an anchor such as `#sec-getting-started` into a display label such
/// as `Getting Started`.
///
/// Leading `#` characters are removed, then at most one of the known
/// prefixes `sec-`, `ssec-` or `opt-` is stripped (once). The remainder is
/// split on `-`, `_` and whitespace, each word gets its first character
/// upper-cased, and the words are joined with single spaces.
///
/// Only a single prefix is stripped so that ids whose real content starts
/// with the same letters (e.g. `sec-sec-policy`, `sec-opt-level`) keep those
/// words.
fn humanize_anchor_id(anchor: &str) -> String {
  const KNOWN_PREFIXES: [&str; 3] = ["sec-", "ssec-", "opt-"];

  let cleaned = anchor.trim_start_matches('#');

  let without_prefix = KNOWN_PREFIXES
    .iter()
    .find_map(|prefix| cleaned.strip_prefix(*prefix))
    .unwrap_or(cleaned);

  let spaced = without_prefix.replace(['-', '_'], " ");

  spaced
    .split_whitespace()
    .map(|word| {
      let mut chars = word.chars();
      chars.next().map_or_else(String::new, |first| {
        first.to_uppercase().collect::<String>() + chars.as_str()
      })
    })
    .collect::<Vec<String>>()
    .join(" ")
}
1158}
1159
1160pub fn extract_inline_text<'a>(node: &'a AstNode<'a>) -> String {
1162 fn inner<'a>(node: &'a AstNode<'a>) -> String {
1163 let mut text = String::new();
1164 for child in node.children() {
1165 match &child.data.borrow().value {
1166 NodeValue::Text(t) => text.push_str(t),
1167 NodeValue::Code(t) => text.push_str(&t.literal),
1168 NodeValue::Link(..)
1169 | NodeValue::Emph
1170 | NodeValue::Strong
1171 | NodeValue::Strikethrough
1172 | NodeValue::Superscript
1173 | NodeValue::Subscript
1174 | NodeValue::FootnoteReference(..) => {
1175 text.push_str(&inner(child));
1176 },
1177 #[allow(clippy::match_same_arms, reason = "Explicit for clarity")]
1178 NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
1179 _ => {},
1180 }
1181 }
1182 text
1183 }
1184 inner(node)
1185}
1186
1187pub fn collect_markdown_files(input_dir: &Path) -> Vec<PathBuf> {
1189 let mut files = Vec::with_capacity(100);
1190
1191 for entry in WalkDir::new(input_dir)
1192 .follow_links(true)
1193 .into_iter()
1194 .filter_map(Result::ok)
1195 {
1196 let path = entry.path();
1197 if path.is_file() && path.extension().is_some_and(|ext| ext == "md") {
1198 files.push(path.to_owned());
1199 }
1200 }
1201
1202 trace!("Found {} markdown files to process", files.len());
1203 files
1204}
1205
/// Capabilities that can be queried with `MarkdownProcessor::has_feature`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProcessorFeature {
  /// Reported from the `gfm` option.
  Gfm,
  /// Reported from the `nixpkgs` option.
  Nixpkgs,
  /// Reported from the `highlight_code` option.
  SyntaxHighlighting,
  /// Reported when a manpage URL map has been loaded.
  ManpageUrls,
}
1218
/// Blanks out admonition blocks so their contents are ignored during header
/// extraction, while keeping the line count unchanged.
///
/// A line whose left-trimmed text starts with `<div class="admonition `
/// opens a block (blocks may nest); a line whose left-trimmed text is
/// exactly `</div>` closes one. Every line belonging to a block, including
/// the opening and closing lines, is replaced by an empty line. All other
/// lines are copied through, each terminated with `\n`.
fn remove_admonition_blocks_for_headers(content: &str) -> String {
  let mut output = String::with_capacity(content.len());
  let mut depth = 0usize;

  for line in content.lines() {
    let stripped = line.trim_start();

    if stripped.starts_with("<div class=\"admonition ") {
      depth += 1;
    } else if depth > 0 {
      if stripped == "</div>" {
        depth -= 1;
      }
    } else {
      output.push_str(line);
    }

    // Emitted for every input line, blanked or not.
    output.push('\n');
  }

  output
}
1245
/// Returns `true` when `line` is a setext heading underline: after trimming
/// surrounding whitespace, a non-empty run consisting solely of `=` or
/// solely of `-`.
///
/// Whitespace-only lines and runs with interior whitespace (such as the
/// thematic break `- - -`) are rejected, as are runs that mix `=` and `-`.
fn is_setext_heading_underline(line: &str) -> bool {
  let mut markers = line.trim().chars();
  match markers.next() {
    // Every remaining character must repeat the first marker.
    Some(first @ ('=' | '-')) => markers.all(|ch| ch == first),
    _ => false,
  }
}
1251
1252fn kuchiki_postprocess_html<F>(html: &str, transform_fn: F) -> String
1254where
1255 F: FnOnce(&kuchikikiki::NodeRef),
1256{
1257 process_safe(
1258 html,
1259 |html| {
1260 use tendril::TendrilSink;
1261
1262 let document = kuchikikiki::parse_html().one(html);
1263 transform_fn(&document);
1264
1265 let mut out = Vec::new();
1266 let _ = document.serialize(&mut out);
1267 String::from_utf8_lossy(&out).into_owned()
1268 },
1269 html,
1270 )
1271}