ndg_commonmark/utils/
mod.rs1use std::sync::{LazyLock, OnceLock, RwLock};
2
3use rustc_hash::FxHashMap;
4pub mod codeblock;
5
6use comrak::{
7 Arena,
8 nodes::{AstNode, NodeHeading, NodeValue},
9 options::Options,
10 parse_document,
11};
12use regex::Regex;
13
14#[derive(Debug, thiserror::Error)]
16pub enum UtilError {
17 #[error("Regex compilation failed: {0}")]
18 RegexError(#[from] regex::Error),
19}
20
21pub type UtilResult<T> = Result<T, UtilError>;
23
/// Converts arbitrary text into a lowercase, dash-separated slug.
///
/// The input is lowercased, every character that is neither alphanumeric nor
/// one of `-` / `_` is replaced by a single `-`, and leading/trailing dashes
/// are trimmed. Runs of replaced characters are *not* collapsed, so `"a, b"`
/// becomes `"a--b"`.
///
/// Results are memoized in a process-wide cache. The cache stops accepting
/// new entries once it holds 2048 slugs; later inputs are still slugified,
/// just not stored.
#[must_use]
pub fn slugify(text: &str) -> String {
  /// Maximum number of slugs kept in the memoization cache.
  const MAX_CACHED_SLUGS: usize = 2048;

  static CACHE: LazyLock<RwLock<FxHashMap<String, String>>> =
    LazyLock::new(|| RwLock::new(FxHashMap::default()));

  // Fast path. The read guard is a temporary of this statement, so the lock
  // is released before the write lock below is requested. A poisoned lock is
  // recovered rather than propagated: the cache only holds derived data.
  let memoized = CACHE
    .read()
    .unwrap_or_else(std::sync::PoisonError::into_inner)
    .get(text)
    .cloned();
  if let Some(slug) = memoized {
    return slug;
  }

  let dashed: String = text
    .to_lowercase()
    .chars()
    .map(|ch| {
      if ch.is_alphanumeric() || ch == '-' || ch == '_' {
        ch
      } else {
        '-'
      }
    })
    .collect();
  let slug = dashed.trim_matches('-').to_owned();

  let mut store = CACHE
    .write()
    .unwrap_or_else(std::sync::PoisonError::into_inner);
  if store.len() < MAX_CACHED_SLUGS {
    store.insert(text.to_owned(), slug.clone());
  }

  slug
}
55
56#[must_use]
59pub fn extract_markdown_title(content: &str) -> Option<String> {
60 let arena = Arena::new();
61 let mut options = Options::default();
62 options.extension.table = true;
63 options.extension.footnotes = true;
64 options.extension.strikethrough = true;
65 options.extension.tasklist = true;
66 options.extension.superscript = true;
67 options.render.r#unsafe = true;
68
69 let root = parse_document(&arena, content, &options);
70
71 for node in root.descendants() {
72 if let NodeValue::Heading(NodeHeading { level, .. }) =
73 &node.data.borrow().value
74 && *level == 1
75 {
76 let text = extract_inline_text_from_node(node);
77 if !text.trim().is_empty() {
78 return Some(text.trim().to_string());
79 }
80 }
81 }
82 None
83}
84
85fn extract_inline_text_from_node<'a>(node: &'a AstNode<'a>) -> String {
88 let mut text = String::new();
89 for child in node.children() {
90 match &child.data.borrow().value {
91 NodeValue::Text(t) => text.push_str(t),
92 NodeValue::Code(t) => text.push_str(&t.literal),
93 NodeValue::Link(..)
94 | NodeValue::Emph
95 | NodeValue::Strong
96 | NodeValue::Strikethrough
97 | NodeValue::Superscript
98 | NodeValue::FootnoteReference(..) => {
99 text.push_str(&extract_inline_text_from_node(child));
100 },
101 #[expect(clippy::match_same_arms, reason = "Explicit for clarity")]
102 NodeValue::HtmlInline(_) | NodeValue::Image(..) => {},
103 _ => {},
104 }
105 }
106 text
107}
108
109#[must_use]
124pub fn extract_markdown_title_and_id(
125 content: &str,
126) -> Option<(String, Option<String>)> {
127 let arena = Arena::new();
128 let mut options = Options::default();
129 options.extension.table = true;
130 options.extension.footnotes = true;
131 options.extension.strikethrough = true;
132 options.extension.tasklist = true;
133 options.render.r#unsafe = true;
134
135 let root = parse_document(&arena, content, &options);
136
137 #[expect(
139 clippy::items_after_statements,
140 reason = "Static is Scoped to function for clarity"
141 )]
142 static ANCHOR_RE: OnceLock<Regex> = OnceLock::new();
143 let anchor_re = ANCHOR_RE.get_or_init(|| {
144 Regex::new(r"(\[\])?\{#(.*?)\}").unwrap_or_else(|e| {
145 log::error!(
146 "Failed to compile ANCHOR_RE regex in extract_h1_title: {e}\n Falling \
147 back to never matching regex."
148 );
149 never_matching_regex().unwrap_or_else(|_| {
150 #[expect(
152 clippy::expect_used,
153 reason = "This pattern is guaranteed to be valid"
154 )]
155 Regex::new(r"[^\s\S]")
156 .expect("regex pattern [^\\s\\S] should always compile")
157 })
158 })
159 });
160
161 for node in root.descendants() {
162 if let NodeValue::Heading(NodeHeading { level, .. }) =
163 &node.data.borrow().value
164 && *level == 1
165 {
166 let text = extract_inline_text_from_node(node);
167 let anchor_id = anchor_re
169 .captures(&text)
170 .and_then(|caps| caps.get(2).map(|m| m.as_str().to_string()));
171 let clean_title = anchor_re.replace_all(&text, "").trim().to_string();
172 if !clean_title.is_empty() {
173 return Some((clean_title, anchor_id));
174 }
175 }
176 }
177 None
178}
179
180#[must_use]
188pub fn clean_anchor_patterns(text: &str) -> String {
189 static ANCHOR_PATTERN: OnceLock<Regex> = OnceLock::new();
190 let anchor_pattern = ANCHOR_PATTERN.get_or_init(|| {
191 Regex::new(r"\s*\{#[a-zA-Z0-9_-]+\}\s*$").unwrap_or_else(|e| {
192 log::error!(
193 "Failed to compile ANCHOR_PATTERN regex in clean_anchor_patterns: \
194 {e}\n Falling back to never matching regex."
195 );
196 never_matching_regex().unwrap_or_else(|_| {
197 #[expect(
199 clippy::expect_used,
200 reason = "This pattern is guaranteed to be valid"
201 )]
202 Regex::new(r"[^\s\S]")
203 .expect("regex pattern [^\\s\\S] should always compile")
204 })
205 })
206 });
207 anchor_pattern.replace_all(text.trim(), "").to_string()
208}
209
210#[must_use]
215pub fn strip_markdown(content: &str) -> String {
216 let arena = Arena::new();
217 let mut options = Options::default();
218 options.extension.table = true;
219 options.extension.footnotes = true;
220 options.extension.strikethrough = true;
221 options.extension.tasklist = true;
222 options.render.r#unsafe = true;
223
224 let root = parse_document(&arena, content, &options);
225
226 let mut plain_text = String::new();
227
228 #[expect(
229 clippy::items_after_statements,
230 reason = "Helper scoped for clarity"
231 )]
232 fn extract_text<'a>(
233 node: &'a AstNode<'a>,
234 plain_text: &mut String,
235 in_code_block: &mut bool,
236 ) {
237 match &node.data.borrow().value {
238 NodeValue::Paragraph => {
239 for child in node.children() {
240 extract_text(child, plain_text, in_code_block);
241 }
242 plain_text.push('\n');
244 },
245 NodeValue::Heading(_) => {
246 for child in node.children() {
247 extract_text(child, plain_text, in_code_block);
248 }
249 plain_text.push('\n');
251 },
252 NodeValue::Text(t) => {
253 if !*in_code_block {
254 plain_text.push_str(t);
255 }
256 },
257 NodeValue::CodeBlock(_) => {
258 *in_code_block = true;
259 for child in node.children() {
260 extract_text(child, plain_text, in_code_block);
261 }
262 *in_code_block = false;
263 },
264 NodeValue::SoftBreak => {
265 plain_text.push(' ');
266 },
267 NodeValue::LineBreak => {
268 plain_text.push('\n');
269 },
270 NodeValue::List(_) => {
271 for child in node.children() {
272 extract_text(child, plain_text, in_code_block);
273 }
274 plain_text.push('\n');
275 },
276 NodeValue::Code(c) => {
277 if !*in_code_block {
278 plain_text.push_str(&c.literal);
279 }
280 },
281 _ => {
282 for child in node.children() {
283 extract_text(child, plain_text, in_code_block);
284 }
285 },
286 }
287 }
288
289 let mut in_code_block = false;
290 extract_text(root, &mut plain_text, &mut in_code_block);
291 plain_text
292}
293
/// Returns `s` with its first character converted to uppercase.
///
/// Only the first `char` is changed; the remainder of the string is copied
/// verbatim. Uppercasing follows Unicode rules, so one character may expand
/// to several (for example `ß` becomes `SS`). An empty input yields an empty
/// string.
#[must_use]
pub fn capitalize_first(s: &str) -> String {
  let mut chars = s.chars();
  chars.next().map_or_else(String::new, |first| {
    // `s.len()` is the exact size unless uppercasing expands the first
    // character, in which case the string simply grows once.
    let mut capitalized = String::with_capacity(s.len());
    capitalized.extend(first.to_uppercase());
    // After `next()`, `as_str()` is everything behind the first character.
    capitalized.push_str(chars.as_str());
    capitalized
  })
}
301
/// Reports whether `line` looks like a Markdown header line.
///
/// This is a deliberately loose check: it is `true` whenever the first
/// non-whitespace character of `line` is `#`. The number of `#` characters
/// and whether a space follows them are not checked.
#[must_use]
pub fn is_markdown_header(line: &str) -> bool {
  let first_visible = line.trim_start().chars().next();
  first_visible == Some('#')
}
307
308pub fn load_manpage_urls(
314 path: &str,
315) -> Result<FxHashMap<String, String>, Box<dyn std::error::Error>> {
316 let content = std::fs::read_to_string(path)?;
317 let mappings: FxHashMap<String, String> = serde_json::from_str(&content)?;
318 Ok(mappings)
319}
320
321pub fn never_matching_regex() -> Result<regex::Regex, regex::Error> {
332 regex::Regex::new(r"[^\s\S]").or_else(|_| {
335 regex::Regex::new(r"^\b$")
338 })
339}