1use regex::Regex;
28use std::sync::LazyLock;
29
30static HTML_ANCHOR_ELEMENT: LazyLock<Regex> =
34 LazyLock::new(|| Regex::new(r#"<a\s+(?:name|id)="[^"]*"(?:\s+(?:name|id)="[^"]*")?>\s*</a>\s*"#).unwrap());
35
36static HEADER_ID_PATTERN: LazyLock<Regex> =
41 LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());
42
43static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());
45
46static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
49 LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());
50
51pub fn extract_header_id(line: &str) -> (String, Option<String>) {
72 let line = HTML_ANCHOR_ELEMENT.replace_all(line, "");
75 let line = line.as_ref();
76
77 if let Some(captures) = HEADER_ID_PATTERN.captures(line)
78 && let Some(full_match) = captures.get(0)
79 && let Some(attr_content) = captures.get(1)
80 {
81 let attr_str = attr_content.as_str().trim();
82
83 if let Some(hash_pos) = attr_str.find('#') {
85 let after_hash = &attr_str[hash_pos + 1..];
87
88 let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
93
94 if is_simple_format {
95 let potential_id = after_hash;
97 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
98 let clean_text = line[..full_match.start()].trim_end().to_string();
99 return (clean_text, Some(potential_id.to_string()));
100 }
101 } else {
103 if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
105 let potential_id = &after_hash[..delimiter_pos];
106 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
107 let clean_text = line[..full_match.start()].trim_end().to_string();
108 return (clean_text, Some(potential_id.to_string()));
109 }
110 } else {
111 let potential_id = after_hash;
113 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
114 let clean_text = line[..full_match.start()].trim_end().to_string();
115 return (clean_text, Some(potential_id.to_string()));
116 }
117 }
118 }
119 }
120 }
121 (line.to_string(), None)
122}
123
124pub fn is_standalone_attr_list(line: &str) -> bool {
139 STANDALONE_ATTR_LIST_PATTERN.is_match(line)
140}
141
142pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
155 if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
156 && let Some(attr_content) = captures.get(1)
157 {
158 let attr_str = attr_content.as_str().trim();
159
160 if let Some(hash_pos) = attr_str.find('#') {
162 let after_hash = &attr_str[hash_pos + 1..];
163
164 let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
166
167 if is_simple_format {
168 let potential_id = after_hash;
170 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
171 return Some(potential_id.to_string());
172 }
173 } else {
174 if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
176 let potential_id = &after_hash[..delimiter_pos];
177 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
178 return Some(potential_id.to_string());
179 }
180 } else {
181 let potential_id = after_hash;
183 if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
184 return Some(potential_id.to_string());
185 }
186 }
187 }
188 }
189 }
190 None
191}
192
193pub fn parse_blockquote_atx_heading(bq_content: &str) -> Option<(String, Option<String>)> {
201 static BQ_ATX_HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());
202
203 let trimmed = bq_content.trim();
204 let caps = BQ_ATX_HEADING_RE.captures(trimmed)?;
205 let mut rest = caps.get(2).map_or("", |m| m.as_str()).to_string();
206
207 let rest_trimmed = rest.trim_end();
209 if let Some(last_hash_pos) = rest_trimmed.rfind('#') {
210 let after_hashes = &rest_trimmed[last_hash_pos..];
211 if after_hashes.chars().all(|c| c == '#') {
212 let mut hash_start = last_hash_pos;
214 while hash_start > 0 && rest_trimmed.as_bytes()[hash_start - 1] == b'#' {
215 hash_start -= 1;
216 }
217 if hash_start == 0
219 || rest_trimmed
220 .as_bytes()
221 .get(hash_start - 1)
222 .is_some_and(u8::is_ascii_whitespace)
223 {
224 rest = rest_trimmed[..hash_start].trim_end().to_string();
225 }
226 }
227 }
228
229 let (clean_text, custom_id) = extract_header_id(&rest);
230 Some((clean_text, custom_id))
231}
232
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `extract_header_id(input)` yields exactly `expected_text` and `expected_id`.
    #[track_caller]
    fn check_extraction(input: &str, expected_text: &str, expected_id: Option<&str>) {
        let (text, id) = extract_header_id(input);
        assert_eq!(text, expected_text);
        assert_eq!(id, expected_id.map(str::to_string));
    }

    /// Checks that `extract_header_id(input)` finds no ID and returns the input unchanged.
    #[track_caller]
    fn check_left_untouched(input: &str) {
        check_extraction(input, input, None);
    }

    // Plain `{#id}` form.
    #[test]
    fn test_kramdown_format_extraction() {
        check_extraction("# Header {#simple}", "# Header", Some("simple"));
        check_extraction("## Section {#section-id}", "## Section", Some("section-id"));
    }

    // `{:#id}` / `{: #id }` form with a leading colon.
    #[test]
    fn test_python_markdown_attr_list_extraction() {
        check_extraction("# Header {:#colon-id}", "# Header", Some("colon-id"));
        check_extraction("# Header {: #spaced-id }", "# Header", Some("spaced-id"));
    }

    // Attribute lists that carry classes and key/value pairs next to the ID.
    #[test]
    fn test_extended_attr_list_extraction() {
        let cases = [
            ("# Header {: #with-class .highlight }", "# Header", "with-class"),
            ("## Section {: #multi .class1 .class2 }", "## Section", "multi"),
            (
                "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
                "### Subsection",
                "with-attrs",
            ),
            (
                "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
                "#### Complex",
                "complex",
            ),
            (
                "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
                "##### Quotes",
                "quotes",
            ),
        ];

        for (input, expected_text, expected_id) in cases {
            check_extraction(input, expected_text, Some(expected_id));
        }
    }

    // Lists without a usable ID, or not at the end of the line, must leave the line alone.
    #[test]
    fn test_attr_list_detection_edge_cases() {
        let inputs = [
            "# Header {: .class-only }",
            "# Header { no-hash }",
            "# Header {: # }",
            "# Header {: #middle } with more text",
        ];

        for input in inputs {
            check_left_untouched(input);
        }
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        let standalone = [
            "{#custom-id}",
            "{ #spaced-id }",
            "{:#colon-id}",
            "{: #full-format }",
            "{: #with-class .highlight }",
            "{: #multi .class1 .class2 }",
            "{: #complex .highlight data-test=\"value\" }",
        ];
        for line in standalone {
            assert!(is_standalone_attr_list(line));
        }

        let not_standalone = [
            "Some text {#not-standalone}",
            "Text before {#id}",
            "{#id} text after",
            "",
            " ",
            "{: .class-only }",
        ];
        for line in not_standalone {
            assert!(!is_standalone_attr_list(line));
        }
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        let cases: [(&str, Option<&str>); 10] = [
            ("{#simple}", Some("simple")),
            ("{ #spaced }", Some("spaced")),
            ("{:#colon}", Some("colon")),
            ("{: #full }", Some("full")),
            ("{: #with-class .highlight }", Some("with-class")),
            ("{: #complex .class1 .class2 data=\"value\" }", Some("complex")),
            ("Not an attr-list", None),
            ("Text {#not-standalone}", None),
            ("{: .class-only }", None),
            ("", None),
        ];

        for (line, expected) in cases {
            assert_eq!(extract_standalone_attr_list_id(line), expected.map(str::to_string));
        }
    }

    #[test]
    fn test_backward_compatibility() {
        let test_cases = [
            ("# Header {#a}", "# Header", "a"),
            ("# Header {#simple-id}", "# Header", "simple-id"),
            ("## Heading {#heading-2}", "## Heading", "heading-2"),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", "with-hyphens"),
        ];

        for (input, expected_text, expected_id) in test_cases {
            let expected_id = Some(expected_id.to_string());
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id, expected_id, "ID mismatch for input: {input}");
        }
    }

    // A lone `#id` token containing dots is rejected instead of being cut at the first dot.
    #[test]
    fn test_invalid_id_with_dots() {
        check_left_untouched("## Another. {#id.with.dots}");
        check_left_untouched("## Another. {#id.more.dots}");
    }

    #[test]
    fn test_html_anchor_stripping() {
        check_extraction("<a name=\"cheatsheets\"></a>Cheat Sheets", "Cheat Sheets", None);
        check_extraction(
            "<a id=\"tools\"></a>Tools and session management",
            "Tools and session management",
            None,
        );
        check_extraction("<a name=\"foo\"></a> Heading with space", "Heading with space", None);
        check_extraction(
            "<a name=\"old\"></a>My Section {#my-custom-id}",
            "My Section",
            Some("my-custom-id"),
        );
    }
}