use regex::Regex;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
use super::common::{
CONTROL_CHARS, MAX_INPUT_SIZE_LARGE, UnicodeLetterMode, WHITESPACE_NORMALIZE, is_emoji_or_symbol,
is_safe_unicode_letter,
};
/// Opening or closing HTML/JSX-like tag: `<`, an optional `/`, an ASCII letter
/// (matched case-insensitively), then everything up to and including the next `>`.
static HTML_TAG_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?i)</?[a-z][^>]*>").unwrap());
/// Asterisk-delimited span: one or more `*`, a lazily matched run of non-`*`
/// characters (capture group 1), then one or more `*`.
static ASTERISK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*+([^*]*?)\*+").unwrap());
/// Underscore-delimited span anchored on word boundaries at both ends. Capture
/// group 1 is the inner text, which must start with a character that is neither
/// `_` nor whitespace and contains no `_`.
static UNDERSCORE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b_+([^_\s][^_]*?)_+\b").unwrap());
/// Backtick-delimited code span: one or more backticks, a lazily matched run of
/// non-backtick characters (capture group 1), then one or more backticks.
static CODE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+([^`]*?)`+").unwrap());
/// Inline image `![alt](target)`; capture group 1 is the alt text.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
/// Inline link `[text](target)` or reference link `[text][ref]`. Capture group 1
/// is the link text, which may itself contain one level of nested `[...]`.
static LINK_PATTERN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[((?:[^\[\]]|\[[^\]]*\])*)\](?:\([^)]*\)|\[[^\]]*\])").unwrap());
pub fn heading_to_fragment(heading: &str) -> String {
if heading.is_empty() || heading.len() > MAX_INPUT_SIZE_LARGE {
return if heading.is_empty() {
String::new()
} else {
"section".to_string()
};
}
let normalized: String = heading.nfc().collect();
let text = CONTROL_CHARS.replace_all(&normalized, "");
let text = WHITESPACE_NORMALIZE.replace_all(&text, " ");
let mut text = text.to_string();
text = text
.replace(" & ", " -- ")
.replace(" < ", " -- ")
.replace(" > ", " -- ")
.replace(" = ", " -- ");
for _ in 0..3 {
let prev = text.clone();
text = ASTERISK_PATTERN.replace_all(&text, "$1").to_string();
text = UNDERSCORE_PATTERN.replace_all(&text, "$1").to_string();
if text == prev {
break;
} }
let mut code_extracts: Vec<String> = Vec::new();
text = CODE_PATTERN
.replace_all(&text, |caps: ®ex::Captures| {
let idx = code_extracts.len();
let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
code_extracts.push(content.to_string());
format!("\x00CODE{idx}\x00")
})
.to_string();
text = IMAGE_PATTERN.replace_all(&text, "$1").to_string();
text = LINK_PATTERN.replace_all(&text, "$1").to_string();
text = HTML_TAG_PATTERN.replace_all(&text, "").to_string();
for (idx, content) in code_extracts.into_iter().enumerate() {
text = text.replace(&format!("\x00CODE{idx}\x00"), &content);
}
#[cfg(test)]
if heading.contains('_') {
eprintln!("DEBUG: Before character filtering:");
eprintln!(" text: '{text}'");
eprintln!(" contains underscores: {}", text.chars().any(|c| c == '_'));
}
let mut filtered = String::with_capacity(text.len());
let mut had_leading_emoji = false;
let mut seen_non_emoji = false;
for c in text.chars() {
if is_safe_unicode_letter(c, UnicodeLetterMode::Permissive)
|| c.is_ascii_digit()
|| c == ' '
|| c == '_'
|| c == '-'
{
filtered.push(c);
seen_non_emoji = true;
} else if is_emoji_or_symbol(c) {
if !seen_non_emoji && filtered.is_empty() {
had_leading_emoji = true;
}
}
}
#[cfg(test)]
if heading.contains("==>") {
eprintln!("DEBUG: Processing '{heading}', after filtering: '{filtered}'");
}
let mut start_pos = 0;
let first_char = filtered.chars().next();
if let Some(c) = first_char {
if !c.is_ascii_digit() && !is_safe_unicode_letter(c, UnicodeLetterMode::Permissive) {
let mut found_alnum = false;
for (i, ch) in filtered.char_indices() {
if is_safe_unicode_letter(ch, UnicodeLetterMode::Permissive) || ch.is_ascii_digit() {
start_pos = i;
found_alnum = true;
break;
}
}
if !found_alnum {
return "section".to_string();
}
}
} else {
return "section".to_string();
}
let trimmed = &filtered[start_pos..];
#[cfg(test)]
if trimmed.contains('_') {
eprintln!("DEBUG: After trimming, contains underscores: '{trimmed}'");
}
#[cfg(test)]
if heading.contains("==>") {
eprintln!("DEBUG: Before smart typography, trimmed: '{trimmed}'");
}
let trimmed = trimmed
.replace(" --- ", "§EMDASH§") .replace(" -- ", "§ENDASH§") .replace(" == ", "§EQUALS§") .replace(" - ", "§HYPHEN§");
#[cfg(test)]
if heading.contains("==>") {
eprintln!("DEBUG: After smart typography replacements, trimmed: '{trimmed}'");
}
let trimmed_chars: Vec<char> = trimmed.chars().collect();
let mut result = String::new();
let mut i = 0;
while i < trimmed_chars.len() {
let c = trimmed_chars[i];
if i + 2 < trimmed_chars.len() && c == ' ' {
let next1 = trimmed_chars[i + 1];
let next2 = trimmed_chars[i + 2];
if (next1 == '-' && next2 == '-') || (next1 == '=' && next2 == '=') {
if i + 3 < trimmed_chars.len() && trimmed_chars[i + 3].is_alphabetic() {
result.push_str("§REDUCEHYPHEN§");
i += 3; continue;
}
}
}
#[cfg(test)]
if c == '_' {
eprintln!("DEBUG: Pushing underscore at position {i}");
}
result.push(c);
i += 1;
}
let trimmed = result;
#[cfg(test)]
if heading.contains('_') && !trimmed.contains('_') {
eprintln!("DEBUG: Underscores lost in smart typography loop!");
eprintln!(" Original: '{heading}'");
eprintln!(" After loop: '{trimmed}'");
}
let mut result = String::with_capacity(trimmed.len());
#[cfg(test)]
if heading.contains("==>") {
eprintln!("DEBUG: Starting processing of: '{trimmed}'");
}
let mut i = 0;
let chars: Vec<char> = trimmed.chars().collect();
while i < chars.len() {
let c = chars[i];
if c == '§' {
let remaining = &trimmed[trimmed.char_indices().nth(i).unwrap().0..];
if remaining.starts_with("§ENDASH§") {
result.push_str("§ENDASH§"); i += "§ENDASH§".chars().count();
continue;
} else if remaining.starts_with("§EMDASH§") {
result.push_str("§EMDASH§"); i += "§EMDASH§".chars().count();
continue;
} else if remaining.starts_with("§HYPHEN§") {
result.push_str("§HYPHEN§"); i += "§HYPHEN§".chars().count();
continue;
} else if remaining.starts_with("§REDUCEHYPHEN§") {
result.push_str("§REDUCEHYPHEN§"); i += "§REDUCEHYPHEN§".chars().count();
continue;
} else if remaining.starts_with("§EQUALS§") {
result.push_str("§EQUALS§"); i += "§EQUALS§".chars().count();
continue;
}
}
if is_safe_unicode_letter(c, UnicodeLetterMode::Permissive) {
for lowercase_c in c.to_lowercase() {
result.push(lowercase_c);
}
} else if c.is_ascii_digit() || c == '-' || c == '_' {
result.push(c);
} else if c == ' ' {
result.push('§');
result.push('S');
result.push('§');
}
i += 1;
}
#[cfg(test)]
if heading.contains("==>") {
eprintln!("DEBUG: After processing, result: '{result}'");
}
static HYPHEN_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-{2,}").unwrap());
let result = HYPHEN_PATTERN
.replace_all(&result, |caps: ®ex::Captures| {
let hyphen_count = caps[0].len();
if hyphen_count >= 4 && (hyphen_count % 3) == 1 {
"-".to_string()
} else {
"".to_string()
}
})
.to_string();
let mut result = result
.replace("§ENDASH§", "--") .replace("§EMDASH§", "--") .replace("§EQUALS§", "--") .replace("§HYPHEN§", "---") .replace("§REDUCEHYPHEN§", "-") .replace("§S§", "-");
result = result.trim_start_matches('-').to_string();
if had_leading_emoji && !result.is_empty() {
result = format!("-{result}");
}
if result.is_empty() {
"section".to_string()
} else {
result
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks each `(heading, expected_fragment)` pair against `heading_to_fragment`.
    fn assert_fragments(cases: &[(&str, &str)]) {
        for &(heading, expected) in cases {
            assert_eq!(heading_to_fragment(heading), expected);
        }
    }

    #[test]
    fn test_jekyll_basic_cases() {
        assert_fragments(&[
            ("Hello World", "hello-world"),
            ("Test Case", "test-case"),
            ("", ""),
        ]);
    }

    #[test]
    fn test_jekyll_underscores() {
        assert_fragments(&[
            ("test_with_underscores", "test_with_underscores"),
            ("Update login_type", "update-login_type"),
            ("__dunder__", "dunder"),
        ]);
    }

    #[test]
    fn test_jekyll_arrows_issue_39() {
        assert_fragments(&[
            ("cbrown --> sbrown: --unsafe-paths", "cbrown--sbrown-unsafe-paths"),
            ("cbrown -> sbrown", "cbrown---sbrown"),
            ("test-->more", "testmore"),
            ("test->more", "test-more"),
        ]);
    }

    #[test]
    fn test_jekyll_character_filtering() {
        assert_fragments(&[
            ("API::Response", "apiresponse"),
            ("Café René", "café-rené"),
            ("über uns", "über-uns"),
        ]);
    }

    #[test]
    fn test_jekyll_symbol_replacements() {
        assert_fragments(&[
            ("Testing & Coverage", "testing--coverage"),
            ("Compare > Results", "compare--results"),
            ("Arrow --> Test", "arrow--test"),
            ("Arrow ==> Test", "arrow--test"),
        ]);
    }

    #[test]
    fn test_jekyll_hyphens() {
        assert_fragments(&[
            ("Double--Hyphen", "doublehyphen"),
            ("Pre-existing-hyphens", "pre-existing-hyphens"),
            ("Test---Multiple", "testmultiple"),
            ("Single-Hyphen", "single-hyphen"),
        ]);
    }

    #[test]
    fn test_jekyll_leading_trailing_trimming() {
        assert_fragments(&[
            ("---leading", "leading"),
            ("trailing---", "trailing"),
            ("---both---", "both"),
        ]);
    }

    #[test]
    fn test_jekyll_numbers() {
        assert_fragments(&[
            ("Step 1: Getting Started", "step-1-getting-started"),
            ("Version 2.1.0", "version-210"),
            ("123 Numbers", "123-numbers"),
            ("123", "123"),
        ]);
    }

    #[test]
    fn test_jekyll_markdown_removal() {
        assert_fragments(&[
            ("*emphasized* text", "emphasized-text"),
            ("`code` in heading", "code-in-heading"),
            ("[link text](url)", "link-text"),
            ("**bold *italic* text**", "bold-italic-text"),
            ("_underline **bold** mix_", "underline-bold-mix"),
        ]);
    }

    #[test]
    fn test_jekyll_emojis() {
        assert_fragments(&[("🎉 emoji test", "-emoji-test")]);
    }

    #[test]
    fn test_jekyll_comprehensive_verified() {
        // Cross-section of the cases above, reported with a fuller failure message.
        const VERIFIED: [(&str, &str); 11] = [
            ("cbrown --> sbrown: --unsafe-paths", "cbrown--sbrown-unsafe-paths"),
            ("test_with_underscores", "test_with_underscores"),
            ("Update login_type", "update-login_type"),
            ("[link text](url)", "link-text"),
            ("trailing---", "trailing"),
            ("---both---", "both"),
            ("Double--Hyphen", "doublehyphen"),
            ("Test---Multiple", "testmultiple"),
            ("test-->more", "testmore"),
            ("123", "123"),
            ("🎉 emoji test", "-emoji-test"),
        ];
        VERIFIED.iter().for_each(|&(input, expected)| {
            let actual = heading_to_fragment(input);
            assert_eq!(
                actual, expected,
                "Jekyll verified test failed for input: '{input}'\nExpected: '{expected}'\nActual: '{actual}'"
            );
        });
    }

    #[test]
    fn test_jekyll_edge_cases() {
        assert_fragments(&[
            ("123", "123"),
            ("!!!", "section"),
            (" ", "section"),
            ("a", "a"),
            ("1a", "1a"),
        ]);
    }

    #[test]
    fn test_jekyll_html_jsx_tag_stripping() {
        assert_fragments(&[
            ("retentionPolicy<Component />", "retentionpolicy"),
            ("Test <span>extra</span>", "test-extra"),
            ("Generic<T>", "generic"),
            ("`import <FILE>`", "import-file"),
        ]);
    }

    #[test]
    fn test_security_features() {
        // One byte over the size limit must fall back to the generic fragment.
        let oversized = "a".repeat(MAX_INPUT_SIZE_LARGE + 1);
        assert_eq!(heading_to_fragment(&oversized), "section");

        assert_fragments(&[
            ("Test\x00\x1F\x7FContent", "testcontent"),
            ("Test\u{200B}\u{FEFF}Content", "testcontent"),
            ("café", "café"),
            ("Test\tTab\u{00A0}Space", "testtab-space"),
        ]);
    }

    #[test]
    fn test_unicode_safety() {
        assert_fragments(&[
            ("Café", "café"),
            ("Naïve", "naïve"),
            ("Résumé", "résumé"),
            ("Test ∑ Math ∞ Symbols", "test--math--symbols"),
            ("Test 🚀 Emoji 💡 Content", "test--emoji--content"),
            ("Price €100 ¥200 $300", "price-100-200-300"),
        ]);
    }
}