use html_escape::decode_html_entities;
use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
/// The different textual forms derived from a single heading segment.
///
/// Values of this type are produced by `segment_variants`.
#[derive(Debug, Clone)]
pub struct HeadingSegmentVariants {
/// Human-readable text: markdown links and `<a>` tags stripped, HTML
/// entities decoded, surrounding whitespace trimmed.
pub display: String,
/// Search-normalized form of `display` (lowercased, combining marks
/// removed, separators collapsed to single spaces).
pub normalized: String,
/// The whitespace-separated pieces of `normalized`, in order.
pub tokens: Vec<String>,
}
/// Derives the display, normalized and tokenized forms of one raw heading
/// segment.
///
/// The raw text first has markdown links and anchor tags removed, then HTML
/// entities are decoded and the result is trimmed to form `display`. The
/// `normalized` form is the search normalization of `display`, and `tokens`
/// are its whitespace-separated words.
pub fn segment_variants(raw: &str) -> HeadingSegmentVariants {
    let without_links = strip_links_and_anchors(raw);
    let decoded = decode_html_entities(&without_links);
    let display = decoded.trim().to_owned();

    let normalized = normalize_for_search(&display);

    // `split_whitespace` never yields empty pieces, so no extra filtering is
    // required here.
    let tokens: Vec<String> = normalized
        .split_whitespace()
        .map(str::to_owned)
        .collect();

    HeadingSegmentVariants {
        display,
        normalized,
        tokens,
    }
}
/// Aggregated textual forms for a whole heading path (a sequence of
/// segments).
///
/// Values of this type are produced by `path_variants`.
#[derive(Debug, Clone)]
pub struct HeadingPathVariants {
/// One display string per input segment; falls back to the raw segment
/// when the cleaned display text is empty.
pub display_segments: Vec<String>,
/// One normalized string per input segment; falls back to the lowercased
/// display string when normalization yields nothing.
pub normalized_segments: Vec<String>,
/// Unique search tokens across all segments in first-seen order,
/// including tokens from the ae/oe/ue/ss transliteration of each segment.
pub tokens: Vec<String>,
}
pub fn path_variants(path: &[String]) -> HeadingPathVariants {
let mut display_segments = Vec::with_capacity(path.len());
let mut normalized_segments = Vec::with_capacity(path.len());
let mut tokens = Vec::new();
for segment in path {
let HeadingSegmentVariants {
display,
normalized,
tokens: mut seg_tokens,
} = segment_variants(segment);
let display_segment = if display.is_empty() {
segment.clone()
} else {
display
};
let normalized_segment = if normalized.is_empty() {
display_segment.to_lowercase()
} else {
normalized
};
if seg_tokens.is_empty() {
seg_tokens = normalized_segment
.split_whitespace()
.map(std::string::ToString::to_string)
.collect();
}
display_segments.push(display_segment.clone());
normalized_segments.push(normalized_segment);
for token in &seg_tokens {
if !tokens.contains(token) {
tokens.push(token.clone());
}
}
if let Some(transliterated) = transliterate_latin_extensions(&display_segment) {
let transliterated_tokens: Vec<String> = normalize_for_search(&transliterated)
.split_whitespace()
.map(std::string::ToString::to_string)
.collect();
for token in transliterated_tokens {
if !token.is_empty() && !tokens.contains(&token) {
tokens.push(token);
}
}
}
}
HeadingPathVariants {
display_segments,
normalized_segments,
tokens,
}
}
/// Rewrites German umlauts and sharp s into their ASCII digraphs.
///
/// `Ä/ä` becomes `ae`, `Ö/ö` becomes `oe`, `Ü/ü` becomes `ue` and `ẞ/ß`
/// becomes `ss`; the replacement is always lowercase. Every other character
/// is copied through unchanged.
///
/// Returns `None` when the input contains none of these characters, so
/// callers can skip work when transliteration would be a no-op.
fn transliterate_latin_extensions(input: &str) -> Option<String> {
    let mut result = String::with_capacity(input.len());
    let mut replaced_any = false;

    for ch in input.chars() {
        let digraph = match ch {
            'Ä' | 'ä' => Some("ae"),
            'Ö' | 'ö' => Some("oe"),
            'Ü' | 'ü' => Some("ue"),
            'ẞ' | 'ß' => Some("ss"),
            _ => None,
        };

        match digraph {
            Some(ascii) => {
                result.push_str(ascii);
                replaced_any = true;
            }
            None => result.push(ch),
        }
    }

    replaced_any.then_some(result)
}
/// Removes markdown link syntax and HTML anchor tags from `input`, keeping
/// the visible text.
///
/// * `[label](target)` is replaced by `label`.
/// * Opening and closing `a` tags (`<a ...>`, `<a>`, `</a>`) are dropped;
///   the text between them is kept.
/// * Everything else, including other HTML tags and unmatched `[` / `<`, is
///   copied through unchanged.
fn strip_links_and_anchors(input: &str) -> String {
    // Decides whether `tag` (the text between `<` and `>`) is an opening or
    // closing anchor element. The element name must be exactly `a`
    // (case-insensitive): a prefix test would also swallow `</abbr>`,
    // `</article>`, `</aside>` and friends, and would miss a bare `<a>`.
    fn is_anchor_tag(tag: &str) -> bool {
        let tag = tag.trim();
        let tag = tag.strip_prefix('/').unwrap_or(tag);
        let name = tag
            .split(|c: char| c.is_ascii_whitespace() || c == '/')
            .next()
            .unwrap_or("");
        name.eq_ignore_ascii_case("a")
    }

    let mut output = String::with_capacity(input.len());
    let bytes = input.as_bytes();
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            b'[' => {
                if let Some((label_end, link_end)) = find_markdown_link(bytes, i) {
                    // Keep only the label; skip past the closing `)`.
                    output.push_str(&input[i + 1..label_end]);
                    i = link_end + 1;
                } else {
                    output.push('[');
                    i += 1;
                }
            }
            b'<' => {
                let tag_end = memchr::memchr(b'>', &bytes[i + 1..]).map(|pos| pos + i + 1);
                match tag_end {
                    Some(end) if is_anchor_tag(&input[i + 1..end]) => i = end + 1,
                    _ => {
                        output.push('<');
                        i += 1;
                    }
                }
            }
            _ => {
                // Copy the whole run up to the next `[` or `<` in one go.
                // Both delimiters are ASCII, so the run ends on a char
                // boundary and slicing the `&str` is safe.
                let run_end = bytes[i..]
                    .iter()
                    .position(|&b| b == b'[' || b == b'<')
                    .map_or(bytes.len(), |offset| i + offset);
                output.push_str(&input[i..run_end]);
                i = run_end;
            }
        }
    }

    output
}
/// Tries to recognise an inline markdown link whose `[` sits at `start`.
///
/// Scans forward for the first unescaped `]`. If that bracket is directly
/// followed by `(` and a balancing `)` exists, returns
/// `Some((index_of_closing_bracket, index_of_closing_paren))`. In every
/// other case (no `]`, `]` not followed by `(`, or unbalanced parentheses)
/// returns `None`.
///
/// A backslash skips the byte that follows it, so `\]` does not terminate
/// the label.
fn find_markdown_link(bytes: &[u8], start: usize) -> Option<(usize, usize)> {
    let mut cursor = start + 1;

    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\\' => cursor += 2,
            b']' => {
                // Only the first closing bracket is considered.
                if bytes.get(cursor + 1) != Some(&b'(') {
                    return None;
                }
                return find_matching_paren(bytes, cursor + 2)
                    .map(|close_paren| (cursor, close_paren));
            }
            _ => cursor += 1,
        }
    }

    None
}
/// Finds the `)` that closes a parenthesis opened just before `pos`.
///
/// Scanning starts at `pos` with one parenthesis already open. Nested
/// `(`/`)` pairs are balanced, and a backslash skips the byte that follows
/// it. Returns the index of the balancing `)`, or `None` if the input ends
/// first.
fn find_matching_paren(bytes: &[u8], mut pos: usize) -> Option<usize> {
    let mut nesting = 1;

    while let Some(&byte) = bytes.get(pos) {
        let step = match byte {
            // Escaped byte: skip the backslash and whatever follows it.
            b'\\' => 2,
            b'(' => {
                nesting += 1;
                1
            }
            b')' => {
                nesting -= 1;
                if nesting == 0 {
                    return Some(pos);
                }
                1
            }
            _ => 1,
        };
        pos += step;
    }

    None
}
/// Produces the search-normalized form of `display`.
///
/// The text is NFKD-decomposed, combining marks are dropped, and every
/// remaining character is lowercased. Characters are then either kept or
/// treated as separators:
///
/// * ASCII characters are kept only when they are letters or digits;
/// * non-ASCII characters are kept only when they are alphanumeric;
/// * everything else (whitespace, punctuation, symbols) is a separator.
///
/// Runs of separators collapse into a single space, and the result carries
/// no leading or trailing whitespace.
fn normalize_for_search(display: &str) -> String {
    let mut out = String::with_capacity(display.len());
    // Starts as `true` so that leading separators never emit a space.
    let mut last_was_space = true;

    let folded = display
        .nfkd()
        .filter(|ch| !is_combining_mark(*ch))
        .flat_map(char::to_lowercase);

    for ch in folded {
        let keep = if ch.is_ascii() {
            ch.is_ascii_alphanumeric()
        } else {
            ch.is_alphanumeric()
        };

        if keep {
            out.push(ch);
            last_was_space = false;
        } else {
            push_space(&mut out, &mut last_was_space);
        }
    }

    out.trim().to_string()
}
/// Public entry point for search normalization of arbitrary text.
///
/// Delegates directly to `normalize_for_search`, so the result is the same
/// normalized form used for heading segments.
#[must_use]
pub fn normalize_text_for_search(text: &str) -> String {
normalize_for_search(text)
}
/// Appends a single separating space to `normalized` unless one would be
/// redundant.
///
/// Nothing happens when the previous output was already a space
/// (`*prev_was_space` is `true`) or when `normalized` is still empty.
/// Otherwise a space is pushed and `*prev_was_space` is set to `true`.
fn push_space(normalized: &mut String, prev_was_space: &mut bool) {
    if *prev_was_space || normalized.is_empty() {
        return;
    }
    normalized.push(' ');
    *prev_was_space = true;
}