use regex::Regex;
use std::sync::LazyLock;
/// Matches an empty HTML anchor element such as `<a name="x"></a>` or
/// `<a id="x"></a>`, together with any whitespace that follows it.
///
/// The opening tag must carry a double-quoted `name` or `id` attribute and may
/// carry one more `name`/`id` attribute; only whitespace is allowed between
/// `>` and `</a>`. `extract_header_id` removes every match before looking for
/// an attribute block.
///
/// The pattern is compiled on first use; `unwrap` would panic only if the
/// literal itself were not a valid regex.
static HTML_ANCHOR_ELEMENT: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#"<a\s+(?:name|id)="[^"]*"(?:\s+(?:name|id)="[^"]*")?>\s*</a>\s*"#).unwrap());
/// Matches a `{ ... }` attribute block at the very end of a line, provided the
/// block contains a `#`.
///
/// The match also covers the whitespace in front of `{` and after `}`. An
/// optional `:` (with optional whitespace around it) may follow the opening
/// brace. Capture group 1 is the content between the braces, which cannot
/// itself contain `}`.
///
/// The pattern is compiled on first use; `unwrap` would panic only if the
/// literal itself were not a valid regex.
static HEADER_ID_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());
/// Accepts a candidate ID only if the whole string consists of one or more
/// ASCII letters, ASCII digits, `_`, `-` or `:`. The empty string never matches.
///
/// The pattern is compiled on first use; `unwrap` would panic only if the
/// literal itself were not a valid regex.
static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());
/// Matches a line that consists of nothing but a `{ ... }` attribute block
/// (surrounding whitespace allowed) whose content includes a `#` immediately
/// followed by at least one ID character (ASCII letter, digit, `_`, `-`, `:`).
///
/// An optional `:` may follow the opening brace. Capture group 1 is the
/// content between the braces, which cannot itself contain `}`.
///
/// The pattern is compiled on first use; `unwrap` would panic only if the
/// literal itself were not a valid regex.
static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());
pub fn extract_header_id(line: &str) -> (String, Option<String>) {
let line = HTML_ANCHOR_ELEMENT.replace_all(line, "");
let line = line.as_ref();
if let Some(captures) = HEADER_ID_PATTERN.captures(line)
&& let Some(full_match) = captures.get(0)
&& let Some(attr_content) = captures.get(1)
{
let attr_str = attr_content.as_str().trim();
if let Some(hash_pos) = attr_str.find('#') {
let after_hash = &attr_str[hash_pos + 1..];
let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
if is_simple_format {
let potential_id = after_hash;
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
let clean_text = line[..full_match.start()].trim_end().to_string();
return (clean_text, Some(potential_id.to_string()));
}
} else {
if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
let potential_id = &after_hash[..delimiter_pos];
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
let clean_text = line[..full_match.start()].trim_end().to_string();
return (clean_text, Some(potential_id.to_string()));
}
} else {
let potential_id = after_hash;
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
let clean_text = line[..full_match.start()].trim_end().to_string();
return (clean_text, Some(potential_id.to_string()));
}
}
}
}
}
(line.to_string(), None)
}
/// Returns `true` when `line` matches `STANDALONE_ATTR_LIST_PATTERN`, i.e. the
/// whole line (ignoring surrounding whitespace) is a `{ ... }` attribute block
/// containing `#` followed by at least one ID character.
///
/// Lines with text before or after the braces, empty lines and blocks without
/// an ID (such as `{: .class-only }`) yield `false`.
pub fn is_standalone_attr_list(line: &str) -> bool {
STANDALONE_ATTR_LIST_PATTERN.is_match(line)
}
pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
&& let Some(attr_content) = captures.get(1)
{
let attr_str = attr_content.as_str().trim();
if let Some(hash_pos) = attr_str.find('#') {
let after_hash = &attr_str[hash_pos + 1..];
let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
if is_simple_format {
let potential_id = after_hash;
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
return Some(potential_id.to_string());
}
} else {
if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
let potential_id = &after_hash[..delimiter_pos];
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
return Some(potential_id.to_string());
}
} else {
let potential_id = after_hash;
if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
return Some(potential_id.to_string());
}
}
}
}
}
None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `extract_header_id` on `input` and checks both halves of the result.
    #[track_caller]
    fn assert_header(input: &str, expected_text: &str, expected_id: Option<&str>) {
        let (text, id) = extract_header_id(input);
        assert_eq!(text, expected_text);
        assert_eq!(id.as_deref(), expected_id);
    }

    /// Checks that `input` comes back unchanged and without an ID.
    #[track_caller]
    fn assert_untouched(input: &str) {
        assert_header(input, input, None);
    }

    #[test]
    fn test_kramdown_format_extraction() {
        assert_header("# Header {#simple}", "# Header", Some("simple"));
        assert_header("## Section {#section-id}", "## Section", Some("section-id"));
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        assert_header("# Header {:#colon-id}", "# Header", Some("colon-id"));
        assert_header("# Header {: #spaced-id }", "# Header", Some("spaced-id"));
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        // ID followed by one or more classes.
        assert_header("# Header {: #with-class .highlight }", "# Header", Some("with-class"));
        assert_header("## Section {: #multi .class1 .class2 }", "## Section", Some("multi"));

        // ID followed by key="value" attributes, with and without classes.
        assert_header(
            "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
            "### Subsection",
            Some("with-attrs"),
        );
        assert_header(
            "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
            "#### Complex",
            Some("complex"),
        );

        // Escaped quotes inside an attribute value.
        assert_header(
            "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
            "##### Quotes",
            Some("quotes"),
        );
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        assert_untouched("# Header {: .class-only }");
        assert_untouched("# Header { no-hash }");
        assert_untouched("# Header {: # }");
        assert_untouched("# Header {: #middle } with more text");
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        // Lines that are nothing but an attribute block carrying an ID.
        assert!(is_standalone_attr_list("{#custom-id}"));
        assert!(is_standalone_attr_list("{ #spaced-id }"));
        assert!(is_standalone_attr_list("{:#colon-id}"));
        assert!(is_standalone_attr_list("{: #full-format }"));
        assert!(is_standalone_attr_list("{: #with-class .highlight }"));
        assert!(is_standalone_attr_list("{: #multi .class1 .class2 }"));
        assert!(is_standalone_attr_list("{: #complex .highlight data-test=\"value\" }"));

        // Surrounding text, blank input, or a block without an ID.
        assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
        assert!(!is_standalone_attr_list("Text before {#id}"));
        assert!(!is_standalone_attr_list("{#id} text after"));
        assert!(!is_standalone_attr_list(""));
        assert!(!is_standalone_attr_list(" "));
        assert!(!is_standalone_attr_list("{: .class-only }"));
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        assert_eq!(extract_standalone_attr_list_id("{#simple}").as_deref(), Some("simple"));
        assert_eq!(extract_standalone_attr_list_id("{ #spaced }").as_deref(), Some("spaced"));
        assert_eq!(extract_standalone_attr_list_id("{:#colon}").as_deref(), Some("colon"));
        assert_eq!(extract_standalone_attr_list_id("{: #full }").as_deref(), Some("full"));
        assert_eq!(
            extract_standalone_attr_list_id("{: #with-class .highlight }").as_deref(),
            Some("with-class")
        );
        assert_eq!(
            extract_standalone_attr_list_id("{: #complex .class1 .class2 data=\"value\" }").as_deref(),
            Some("complex")
        );

        assert_eq!(extract_standalone_attr_list_id("Not an attr-list"), None);
        assert_eq!(extract_standalone_attr_list_id("Text {#not-standalone}"), None);
        assert_eq!(extract_standalone_attr_list_id("{: .class-only }"), None);
        assert_eq!(extract_standalone_attr_list_id(""), None);
    }

    #[test]
    fn test_backward_compatibility() {
        let cases = [
            ("# Header {#a}", "# Header", "a"),
            ("# Header {#simple-id}", "# Header", "simple-id"),
            ("## Heading {#heading-2}", "## Heading", "heading-2"),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", "with-hyphens"),
        ];

        for (input, expected_text, expected_id) in cases {
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id.as_deref(), Some(expected_id), "ID mismatch for input: {input}");
        }
    }

    #[test]
    fn test_invalid_id_with_dots() {
        assert_untouched("## Another. {#id.with.dots}");
        assert_untouched("## Another. {#id.more.dots}");
    }

    #[test]
    fn test_html_anchor_stripping() {
        // Anchors are removed even when no attribute block follows.
        assert_header("<a name=\"cheatsheets\"></a>Cheat Sheets", "Cheat Sheets", None);
        assert_header(
            "<a id=\"tools\"></a>Tools and session management",
            "Tools and session management",
            None,
        );
        assert_header("<a name=\"foo\"></a> Heading with space", "Heading with space", None);

        // Anchor removal and ID extraction combine.
        assert_header(
            "<a name=\"old\"></a>My Section {#my-custom-id}",
            "My Section",
            Some("my-custom-id"),
        );
    }
}