use regex::Regex;
use std::sync::LazyLock;
/// Matches an HTML/JSX-like tag, case-insensitively: `<`, an optional `/`,
/// an ASCII letter, then everything up to and including the next `>`.
/// Compiled once on first use.
static HTML_TAG_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)</?[a-z][^>]*>").unwrap());
/// Matches a backtick-delimited code span: one or more backticks, a lazily
/// matched run of non-backtick characters (capture group 1), then one or more
/// backticks. The opening and closing runs are not required to have the same
/// length. Compiled once on first use.
static CODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+([^`]*?)`+").unwrap());
pub fn heading_to_fragment(heading: &str) -> String {
if heading.is_empty() {
return String::new(); }
const MAX_INPUT_SIZE: usize = 10 * 1024; if heading.len() > MAX_INPUT_SIZE {
let mut truncate_pos = MAX_INPUT_SIZE;
while truncate_pos > 0 && !heading.is_char_boundary(truncate_pos) {
truncate_pos -= 1;
}
if truncate_pos == 0 {
truncate_pos = heading
.char_indices()
.take_while(|(i, _)| *i < MAX_INPUT_SIZE)
.last()
.map(|(i, c)| i + c.len_utf8())
.unwrap_or(0);
}
return heading_to_fragment(&heading[..truncate_pos]);
}
let text = heading.trim();
if text.is_empty() {
return "section".to_string();
}
let normalized = normalize_and_filter_unicode(text);
if normalized.is_empty() {
return "section".to_string();
}
let mut code_extracts: Vec<String> = Vec::new();
let normalized = CODE_PATTERN
.replace_all(&normalized, |caps: ®ex::Captures| {
let idx = code_extracts.len();
let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
code_extracts.push(content.to_string());
format!("\x00CODE{idx}\x00")
})
.to_string();
let normalized = HTML_TAG_PATTERN.replace_all(&normalized, "").to_string();
let mut normalized = normalized;
for (idx, content) in code_extracts.into_iter().enumerate() {
normalized = normalized.replace(&format!("\x00CODE{idx}\x00"), &content);
}
let mut filtered = String::new();
for c in normalized.chars() {
if c.is_ascii_alphabetic() || c.is_ascii_digit() || c == ' ' || c == '-' || c == '>' || c == '&' {
filtered.push(c);
}
}
filtered = filtered.replace(" --> ", "----"); filtered = filtered.replace(" -> ", "---"); filtered = filtered.replace(" & ", "--"); filtered = filtered.replace(" > ", "--");
filtered = transform_unspaced_arrows(&filtered);
let mut start_pos = 0;
let mut found_letter = false;
for (i, c) in filtered.char_indices() {
if c.is_ascii_alphabetic() {
start_pos = i;
found_letter = true;
break;
}
}
if !found_letter {
return "section".to_string();
}
let trimmed = &filtered[start_pos..];
let mut result = String::new();
for c in trimmed.chars() {
if c.is_ascii_alphabetic() {
result.push(c.to_ascii_lowercase());
} else if c.is_ascii_digit() {
result.push(c);
} else {
result.push('-');
}
}
let result = result.trim_start_matches('-').to_string();
if result.is_empty() {
"section".to_string()
} else {
result
}
}
/// Applies NFC normalization to `text`, keeps only the characters accepted by
/// `is_safe_unicode_char`, and passes the result through
/// `limit_consecutive_chars`.
fn normalize_and_filter_unicode(text: &str) -> String {
    use unicode_normalization::UnicodeNormalization;

    let safe_only: String = text.nfc().filter(|&c| is_safe_unicode_char(c)).collect();
    limit_consecutive_chars(&safe_only)
}
/// Decides whether `c` may be kept in heading text.
///
/// Printable ASCII (U+0020..=U+007E) and U+00A0..=U+00FF are accepted
/// immediately. Any other character is accepted only when none of the
/// control, zero-width, bidi or private-use predicates reject it.
fn is_safe_unicode_char(c: char) -> bool {
    let code = u32::from(c);
    match code {
        // Fast path: accepted without consulting the rejection predicates.
        0x20..=0x7E | 0xA0..=0xFF => true,
        _ => {
            let rejected = is_control_character(code)
                || is_zero_width_character(code)
                || is_bidi_character(code)
                || is_private_use_character(code);
            !rejected
        }
    }
}
/// Returns `true` for code points treated as control characters: the C0 range
/// U+0000..=U+001F except tab (U+0009), line feed (U+000A) and carriage return
/// (U+000D); DEL (U+007F); the C1 range U+0080..=U+009F; and the line and
/// paragraph separators U+2028 and U+2029.
fn is_control_character(code: u32) -> bool {
    matches!(
        code,
        0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F | 0x7F | 0x80..=0x9F | 0x2028 | 0x2029
    )
}
/// Returns `true` for the invisible code points this module strips:
/// U+200B, U+200C, U+200D, U+2060, U+FEFF, U+061C and U+034F.
fn is_zero_width_character(code: u32) -> bool {
    matches!(
        code,
        0x034F | 0x061C | 0x200B..=0x200D | 0x2060 | 0xFEFF
    )
}
/// Returns `true` for bidirectional formatting code points:
/// U+202A..=U+202E (embeddings and overrides) and U+2066..=U+2069 (isolates).
fn is_bidi_character(code: u32) -> bool {
    matches!(code, 0x202A..=0x202E | 0x2066..=0x2069)
}
/// Returns `true` for code points in the private-use ranges
/// (U+E000..=U+F8FF, U+F0000..=U+FFFFD, U+100000..=U+10FFFD) and for
/// U+FFFD, U+FFFE and U+FFFF.
fn is_private_use_character(code: u32) -> bool {
    matches!(
        code,
        0xE000..=0xF8FF | 0xFFFD..=0xFFFF | 0xF_0000..=0xF_FFFD | 0x10_0000..=0x10_FFFD
    )
}
/// Caps every run of identical consecutive characters at `MAX_CONSECUTIVE`
/// (50) characters, dropping the excess.
///
/// Runs are compared per `char`; characters that differ are never merged.
/// Input without long runs is returned unchanged.
fn limit_consecutive_chars(text: &str) -> String {
    const MAX_CONSECUTIVE: usize = 50;

    let mut result = String::with_capacity(text.len());
    let mut last_char = None;
    let mut run_len = 0usize;

    for c in text.chars() {
        if last_char == Some(c) {
            run_len += 1;
        } else {
            last_char = Some(c);
            run_len = 1;
        }
        // Keep the first MAX_CONSECUTIVE characters of a run, including the
        // MAX_CONSECUTIVE-th itself; only the excess is dropped.
        if run_len <= MAX_CONSECUTIVE {
            result.push(c);
        }
    }

    result
}
/// Rewrites arrow and symbol characters that are not surrounded by spaces.
///
/// Scanning left to right: `-->` becomes `->`, `->` becomes `-`, and a lone
/// `>` or `&` is removed. Every other character is copied unchanged.
fn transform_unspaced_arrows(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::new();
    let mut rest = chars.as_slice();

    loop {
        rest = match rest {
            [] => break,
            // Longest arrow first so `-->` is not consumed as `-` + `->`.
            ['-', '-', '>', tail @ ..] => {
                out.push_str("->");
                tail
            }
            ['-', '>', tail @ ..] => {
                out.push('-');
                tail
            }
            ['>' | '&', tail @ ..] => tail,
            [other, tail @ ..] => {
                out.push(*other);
                tail
            }
        };
    }

    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks every `(heading, expected fragment)` pair against `heading_to_fragment`.
    #[track_caller]
    fn assert_fragments(cases: &[(&str, &str)]) {
        for &(heading, expected) in cases {
            assert_eq!(heading_to_fragment(heading), expected, "heading: {heading:?}");
        }
    }

    /// Length of the longest run of identical consecutive characters in `text`.
    fn longest_run(text: &str) -> usize {
        let mut longest = 0;
        let mut current = 0;
        let mut previous = None;
        for c in text.chars() {
            current = if previous == Some(c) { current + 1 } else { 1 };
            longest = longest.max(current);
            previous = Some(c);
        }
        longest
    }

    #[test]
    fn test_kramdown_basic_cases() {
        assert_fragments(&[
            ("Hello World", "hello-world"),
            ("Test Case", "test-case"),
            ("", ""),
        ]);
    }

    #[test]
    fn test_kramdown_underscores_removed() {
        assert_fragments(&[
            ("test_with_underscores", "testwithunderscores"),
            ("Update login_type", "update-logintype"),
            ("__dunder__", "dunder"),
        ]);
    }

    #[test]
    fn test_kramdown_character_filtering() {
        // Non-ASCII letters are dropped rather than transliterated.
        assert_fragments(&[
            ("API::Response", "apiresponse"),
            ("Café René", "caf-ren"),
            ("über uns", "ber-uns"),
            ("naïve", "nave"),
        ]);
    }

    #[test]
    fn test_kramdown_hyphen_preservation() {
        // Hyphen runs inside a heading keep their exact length.
        assert_fragments(&[
            ("Test-Hyphen", "test-hyphen"),
            ("Test--Handling", "test--handling"),
            ("Test---Multiple", "test---multiple"),
            ("Test----Four", "test----four"),
            ("Test-----Five", "test-----five"),
            ("Test------Six", "test------six"),
        ]);
    }

    #[test]
    fn test_kramdown_arrows_issue_39() {
        assert_fragments(&[
            ("cbrown --> sbrown: --unsafe-paths", "cbrown----sbrown---unsafe-paths"),
            ("cbrown -> sbrown", "cbrown---sbrown"),
            ("respect_gitignore", "respectgitignore"),
            ("test-->more", "test--more"),
            ("test->more", "test-more"),
            ("test>more", "testmore"),
            ("a->b->c", "a-b-c"),
            ("cmd-->output", "cmd--output"),
        ]);
    }

    #[test]
    fn test_kramdown_html_jsx_tag_stripping() {
        assert_fragments(&[
            ("retentionPolicy<Component />", "retentionpolicy"),
            ("Test <span>extra</span>", "test-extra"),
            ("Generic<T>", "generic"),
            ("`import <FILE>`", "import-file"),
        ]);
    }

    #[test]
    fn test_kramdown_symbol_replacements() {
        assert_fragments(&[
            ("Testing & Coverage", "testing--coverage"),
            ("Compare > Results", "compare--results"),
            ("Arrow --> Test", "arrow----test"),
        ]);
    }

    #[test]
    fn test_kramdown_leading_trimming() {
        // Leading hyphens are dropped; trailing ones are kept.
        assert_fragments(&[
            ("---leading", "leading"),
            ("trailing---", "trailing---"),
            ("---both---", "both---"),
            ("----both----", "both----"),
        ]);
    }

    #[test]
    fn test_kramdown_numbers() {
        assert_fragments(&[
            ("Step 1: Getting Started", "step-1-getting-started"),
            ("Version 2.1.0", "version-210"),
            ("123 Numbers", "numbers"),
        ]);
    }

    #[test]
    fn test_kramdown_comprehensive_verified() {
        let verified: [(&str, &str); 8] = [
            ("cbrown --> sbrown: --unsafe-paths", "cbrown----sbrown---unsafe-paths"),
            ("Update login_type", "update-logintype"),
            ("API::Response > Error--Handling", "apiresponse--error--handling"),
            ("Test---with---multiple---hyphens", "test---with---multiple---hyphens"),
            ("respect_gitignore", "respectgitignore"),
            ("Simple test case", "simple-test-case"),
            ("Testing & Coverage", "testing--coverage"),
            ("test_with_underscores", "testwithunderscores"),
        ];
        for &(input, expected) in verified.iter() {
            let actual = heading_to_fragment(input);
            assert_eq!(
                actual, expected,
                "Kramdown verified test failed for input: '{input}'\nExpected: '{expected}'\nActual: '{actual}'"
            );
        }

        let arrows: [(&str, &str); 5] = [
            ("test-->more", "test--more"),
            ("test->more", "test-more"),
            ("test > more", "test--more"),
            ("test -> more", "test---more"),
            ("test --> more", "test----more"),
        ];
        for &(input, expected) in arrows.iter() {
            let actual = heading_to_fragment(input);
            assert_eq!(
                actual, expected,
                "Arrow processing test failed for input: '{input}'\nExpected: '{expected}'\nActual: '{actual}'"
            );
        }
    }

    #[test]
    fn test_kramdown_edge_cases() {
        // Headings without any ASCII letter fall back to "section".
        assert_fragments(&[
            ("123", "section"),
            ("!!!", "section"),
            (" ", "section"),
            ("a", "a"),
            ("1a", "a"),
        ]);
    }

    #[test]
    fn test_kramdown_unicode_security() {
        assert_fragments(&[
            ("café", "caf"),
            ("cafe\u{0301}", "caf"),
            ("word\u{200B}break", "wordbreak"),
            ("test\u{200C}ing", "testing"),
            ("test\u{0000}null", "testnull"),
            ("test\u{001B}escape", "testescape"),
            ("safe\u{202E}attack", "safeattack"),
            ("test\u{E000}private", "testprivate"),
        ]);
    }

    #[test]
    fn test_kramdown_performance_protection() {
        // Oversized input is truncated but still yields a fragment.
        let large_input = "a".repeat(20000);
        let fragment = heading_to_fragment(&large_input);
        assert!(!fragment.is_empty());
        assert!(fragment.len() < large_input.len());

        // A long run of one character is capped while its surroundings survive.
        let bomb = format!("test{}end", "a".repeat(1000));
        let fragment = heading_to_fragment(&bomb);
        assert!(fragment.starts_with("test"));
        assert!(fragment.ends_with("end"));
        let max_consecutive = longest_run(&fragment);
        assert!(max_consecutive <= 50, "Too many consecutive chars: {max_consecutive}");

        let mixed_stress = "word🎉-中文".repeat(100);
        let fragment = heading_to_fragment(&mixed_stress);
        assert!(!fragment.is_empty());
    }

    #[test]
    fn test_kramdown_arrow_symbol_replacement_order() {
        assert_fragments(&[
            ("test --> more > info", "test----more--info"),
            ("cmd -> output & result", "cmd---output--result"),
            ("a > b --> c & d", "a--b----c--d"),
            ("a->b --> c>d", "a-b----cd"),
        ]);
    }
}
#[cfg(test)]
mod security_tests {
    use super::*;

    /// Length of the longest run of identical consecutive characters in `text`.
    fn longest_run(text: &str) -> usize {
        let mut longest = 0;
        let mut current = 0;
        let mut previous = None;
        for c in text.chars() {
            current = if previous == Some(c) { current + 1 } else { 1 };
            longest = longest.max(current);
            previous = Some(c);
        }
        longest
    }

    #[test]
    fn test_input_size_limits() {
        let huge_input = "a".repeat(100_000);
        let fragment = heading_to_fragment(&huge_input);
        assert!(!fragment.is_empty());
        assert!(fragment.len() < huge_input.len());
    }

    #[test]
    fn test_unicode_normalization() {
        // Precomposed and decomposed spellings must produce the same fragment.
        let composed = "é";
        let decomposed = "e\u{0301}";
        assert_eq!(heading_to_fragment(composed), heading_to_fragment(decomposed));
    }

    #[test]
    fn test_dangerous_unicode_filtering() {
        let dangerous_cases = [
            ("test\u{202E}attack", "RTL override"),
            ("safe\u{200B}split", "Zero-width space"),
            ("ctrl\u{0001}char", "Control character"),
            ("private\u{E000}use", "Private use area"),
            ("nonchar\u{FFFE}test", "Non-character"),
        ];
        for &(input, description) in dangerous_cases.iter() {
            let fragment = heading_to_fragment(input);
            assert!(!fragment.is_empty(), "Failed to handle: {description}");
            assert!(!fragment.contains('\u{202E}'), "RTL override not filtered");
            assert!(!fragment.contains('\u{200B}'), "Zero-width space not filtered");
        }
    }

    #[test]
    fn test_consecutive_character_limits() {
        let bomb_cases = [
            (format!("start{}end", "a".repeat(200)), "a-bomb"),
            (format!("begin{}-finish", "-".repeat(100)), "hyphen-bomb"),
            (format!("test{}more", " ".repeat(150)), "space-bomb"),
        ];
        for (input, description) in bomb_cases.iter() {
            let fragment = heading_to_fragment(input);
            assert!(!fragment.is_empty(), "Failed to handle: {description}");

            let max_consecutive = longest_run(&fragment);
            assert!(
                max_consecutive <= 50,
                "Consecutive character limit exceeded for {description}: {max_consecutive} consecutive chars"
            );
        }
    }

    #[test]
    fn test_performance_bounds() {
        use std::time::{Duration, Instant};

        let pathological_cases = [
            "a".repeat(10_000),
            "-".repeat(5_000),
            "🎉".repeat(1_000),
            "test".repeat(2_500),
            format!(
                "{} -> {} --> {}",
                "word".repeat(1000),
                "more".repeat(1000),
                "text".repeat(1000)
            ),
        ];
        for input in pathological_cases.iter() {
            let started = Instant::now();
            let fragment = heading_to_fragment(input);
            let duration = started.elapsed();

            assert!(
                duration < Duration::from_secs(1),
                "Performance test failed: took {:?} for input of length {}",
                duration,
                input.len()
            );
            assert!(!fragment.is_empty() || fragment == "section");
        }
    }
}