use regex::Regex;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
use super::common::MAX_INPUT_LENGTH;
/// Matches an HTML/JSX-style tag: `<`, an optional `/`, a letter `a`-`z`
/// (matched case-insensitively), then every character up to and including the
/// next `>`. Used by `heading_to_fragment` to drop tags from heading text.
static HTML_TAG_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"(?i)</?[a-z][^>]*>";
    // The pattern is a fixed literal, so compilation is not expected to fail.
    Regex::new(pattern).unwrap()
});
/// Matches a backtick-delimited code span: one or more backticks, a lazily
/// matched run of non-backtick characters (capture group 1), then one or more
/// backticks. `heading_to_fragment` keeps only the captured inner text.
static CODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"`+([^`]*?)`+";
    // The pattern is a fixed literal, so compilation is not expected to fail.
    Regex::new(pattern).unwrap()
});
/// Matches any single character that is not a word character (`\w`),
/// whitespace (`\s`) or a hyphen. `heading_to_fragment` deletes every match.
static STRIP_NON_WORD: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^\w\s-]").unwrap());
/// Matches a run of one or more hyphens and/or whitespace characters.
/// `heading_to_fragment` replaces each run with a single `-`.
static COLLAPSE_SEPARATORS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[-\s]+").unwrap());
pub fn heading_to_fragment(heading: &str) -> String {
if heading.is_empty() {
return String::new();
}
let input = if heading.len() > MAX_INPUT_LENGTH {
let mut end = MAX_INPUT_LENGTH;
while end < heading.len() && !heading.is_char_boundary(end) {
end += 1;
}
&heading[..end.min(heading.len())]
} else {
heading
};
let mut code_extracts: Vec<String> = Vec::new();
let input = CODE_PATTERN
.replace_all(input, |caps: ®ex::Captures| {
let idx = code_extracts.len();
let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
code_extracts.push(content.to_string());
format!("\x00CODE{idx}\x00")
})
.to_string();
let input = HTML_TAG_PATTERN.replace_all(&input, "");
let mut input = input.to_string();
for (idx, content) in code_extracts.into_iter().enumerate() {
input = input.replace(&format!("\x00CODE{idx}\x00"), &content);
}
let nfkd: String = input.nfkd().collect();
let ascii_only: String = nfkd.chars().filter(|c| c.is_ascii()).collect();
let cleaned = STRIP_NON_WORD.replace_all(&ascii_only, "");
let lowered = cleaned.to_lowercase();
let stripped = lowered.trim();
let result = COLLAPSE_SEPARATORS.replace_all(stripped, "-");
result.into_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `heading` is converted to the fragment `expected`.
    #[track_caller]
    fn check(heading: &str, expected: &str) {
        assert_eq!(heading_to_fragment(heading), expected);
    }

    // Spaces become hyphens and letters are lowercased.
    #[test]
    fn test_basic() {
        check("Hello World", "hello-world");
    }

    // The slash is removed and the whitespace around it collapses to one hyphen.
    #[test]
    fn test_slash_collapsing() {
        check(
            "Cross-references to other projects / inventories",
            "cross-references-to-other-projects-inventories",
        );
    }

    // Underscores count as word characters and are kept.
    #[test]
    fn test_underscores_preserved() {
        check("test_with_underscores", "test_with_underscores");
    }

    // A single hyphen between words is kept as-is.
    #[test]
    fn test_hyphens_preserved() {
        check("well-known", "well-known");
    }

    // Runs of hyphens collapse to a single hyphen.
    #[test]
    fn test_consecutive_hyphens_collapsed() {
        check("test--double", "test-double");
        check("test---triple", "test-triple");
    }

    // Tags are stripped, but angle-bracket text inside a code span keeps its content.
    #[test]
    fn test_html_jsx_tag_stripping() {
        check("retentionPolicy<Component />", "retentionpolicy");
        check("Test <span>extra</span>", "test-extra");
        check("Generic<T>", "generic");
        check("`import <FILE>`", "import-file");
    }

    // Punctuation that is neither a word character nor a separator is dropped.
    #[test]
    fn test_special_characters_removed() {
        check("Hello & World", "hello-world");
        check("C++ Guide", "c-guide");
        check("Q&A", "qa");
    }

    // Accented letters are reduced to their ASCII base letters.
    #[test]
    fn test_unicode_decomposed() {
        check("café", "cafe");
        check("résumé", "resume");
    }

    // Characters without an ASCII decomposition are removed entirely.
    #[test]
    fn test_non_ascii_removed() {
        check("日本語 Test", "test");
    }

    // Empty input yields an empty fragment.
    #[test]
    fn test_empty() {
        check("", "");
    }

    // Whitespace-only input yields an empty fragment.
    #[test]
    fn test_whitespace_only() {
        check(" ", "");
    }

    // Digits are kept; the colon is dropped.
    #[test]
    fn test_numbers() {
        check("Step 1: Setup", "step-1-setup");
    }

    // Arrow punctuation degenerates into a single separator.
    #[test]
    fn test_arrows() {
        check("A --> B", "a-b");
        check("A -> B", "a-b");
    }

    // Surrounding whitespace does not produce leading or trailing hyphens.
    #[test]
    fn test_leading_trailing_stripped() {
        check(" Hello ", "hello");
    }

    // Mixed runs of spaces and hyphens collapse to one hyphen each.
    #[test]
    fn test_mixed_separators() {
        check("a - b - c", "a-b-c");
        check("a - b", "a-b");
    }
}