blz_core/
heading.rs

1use html_escape::decode_html_entities;
2use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
3
/// Variants derived from a raw heading segment.
///
/// Produced by [`segment_variants`]; `tokens` is the whitespace-split form of
/// `normalized`, suitable for indexing.
#[derive(Debug, Clone)]
pub struct HeadingSegmentVariants {
    /// Human-friendly display text (markdown links reduced to their labels,
    /// `<a>` anchor tags removed, HTML entities decoded, whitespace trimmed).
    pub display: String,
    /// Normalized search string (lowercase, punctuation stripped, diacritics removed).
    pub normalized: String,
    /// Tokenized representation of the normalized string (whitespace-split).
    pub tokens: Vec<String>,
}
14
15/// Compute display and normalized variants for a raw heading segment.
16///
17/// - Markdown links (`[Label](url)`) are reduced to `Label`
18/// - Empty HTML anchors (`<a id=\"foo\"></a>`) and surrounding tags are removed
19/// - HTML entities are decoded
20/// - Normalized form lowercases, strips diacritics/punctuation, and collapses whitespace
21pub fn segment_variants(raw: &str) -> HeadingSegmentVariants {
22    let stripped = strip_links_and_anchors(raw);
23    let display = decode_html_entities(&stripped).trim().to_string();
24    let normalized = normalize_for_search(&display);
25
26    let tokens: Vec<String> = normalized
27        .split_whitespace()
28        .filter(|token| !token.is_empty())
29        .map(std::string::ToString::to_string)
30        .collect();
31
32    HeadingSegmentVariants {
33        display,
34        normalized,
35        tokens,
36    }
37}
38
/// Aggregated heading path variants for display and search.
///
/// Produced by [`path_variants`]; segment vectors are index-aligned with the
/// input path, while `tokens` is a flat, order-preserving, de-duplicated pool
/// drawn from every segment (including transliterated forms).
#[derive(Debug, Clone)]
pub struct HeadingPathVariants {
    /// Sanitized segments suitable for presentation (falls back to the raw
    /// segment when sanitization strips everything).
    pub display_segments: Vec<String>,
    /// Lowercased, punctuation-stripped segments per heading level.
    pub normalized_segments: Vec<String>,
    /// Tokenized representation of the normalized path for indexing.
    pub tokens: Vec<String>,
}
49
50/// Compute display + normalized variants for an entire heading path.
51pub fn path_variants(path: &[String]) -> HeadingPathVariants {
52    let mut display_segments = Vec::with_capacity(path.len());
53    let mut normalized_segments = Vec::with_capacity(path.len());
54    let mut tokens = Vec::new();
55
56    for segment in path {
57        let HeadingSegmentVariants {
58            display,
59            normalized,
60            tokens: mut seg_tokens,
61        } = segment_variants(segment);
62
63        let display_segment = if display.is_empty() {
64            segment.clone()
65        } else {
66            display
67        };
68
69        let normalized_segment = if normalized.is_empty() {
70            display_segment.to_lowercase()
71        } else {
72            normalized
73        };
74
75        if seg_tokens.is_empty() {
76            seg_tokens = normalized_segment
77                .split_whitespace()
78                .map(std::string::ToString::to_string)
79                .collect();
80        }
81
82        display_segments.push(display_segment.clone());
83        normalized_segments.push(normalized_segment);
84
85        for token in &seg_tokens {
86            if !tokens.contains(token) {
87                tokens.push(token.clone());
88            }
89        }
90
91        if let Some(transliterated) = transliterate_latin_extensions(&display_segment) {
92            let transliterated_tokens: Vec<String> = normalize_for_search(&transliterated)
93                .split_whitespace()
94                .map(std::string::ToString::to_string)
95                .collect();
96            for token in transliterated_tokens {
97                if !token.is_empty() && !tokens.contains(&token) {
98                    tokens.push(token);
99                }
100            }
101        }
102    }
103
104    HeadingPathVariants {
105        display_segments,
106        normalized_segments,
107        tokens,
108    }
109}
110
/// Transliterate German umlauts and sharp S to their ASCII digraphs
/// (`ä` -> `ae`, `ö` -> `oe`, `ü` -> `ue`, `ß`/`ẞ` -> `ss`).
///
/// Returns `None` when the input contains none of these characters, so the
/// caller can skip re-tokenizing unchanged text.
fn transliterate_latin_extensions(input: &str) -> Option<String> {
    let mut replaced_any = false;
    let mut result = String::with_capacity(input.len());

    for ch in input.chars() {
        let replacement = match ch {
            'Ä' | 'ä' => Some("ae"),
            'Ö' | 'ö' => Some("oe"),
            'Ü' | 'ü' => Some("ue"),
            'ẞ' | 'ß' => Some("ss"),
            _ => None,
        };
        if let Some(digraph) = replacement {
            result.push_str(digraph);
            replaced_any = true;
        } else {
            result.push(ch);
        }
    }

    replaced_any.then_some(result)
}
139
140fn strip_links_and_anchors(input: &str) -> String {
141    let mut output = String::with_capacity(input.len());
142    let bytes = input.as_bytes();
143    let mut i = 0;
144
145    while i < bytes.len() {
146        match bytes[i] {
147            b'[' => {
148                if let Some((label_end, link_end)) = find_markdown_link(bytes, i) {
149                    output.push_str(&input[i + 1..label_end]);
150                    i = link_end + 1;
151                    continue;
152                }
153                output.push('[');
154                i += 1;
155            },
156            b'<' => {
157                if let Some(next_gt) = memchr::memchr(b'>', &bytes[i + 1..]).map(|pos| pos + i + 1)
158                {
159                    let tag = &input[i + 1..next_gt];
160                    let tag_lower = tag.trim().to_ascii_lowercase();
161                    if tag_lower.starts_with("a ") || tag_lower.starts_with("a>") {
162                        i = next_gt + 1;
163                        continue;
164                    }
165                    if tag_lower.starts_with("/a") {
166                        i = next_gt + 1;
167                        continue;
168                    }
169                }
170                output.push('<');
171                i += 1;
172            },
173            _ => {
174                if let Some(ch) = input[i..].chars().next() {
175                    output.push(ch);
176                    i += ch.len_utf8();
177                } else {
178                    i += 1;
179                }
180            },
181        }
182    }
183
184    output
185}
186
187fn find_markdown_link(bytes: &[u8], start: usize) -> Option<(usize, usize)> {
188    let mut idx = start + 1;
189    while idx < bytes.len() {
190        match bytes[idx] {
191            b'\\' => idx += 2,
192            b']' => {
193                if idx + 1 < bytes.len() && bytes[idx + 1] == b'(' {
194                    if let Some(close_paren) = find_matching_paren(bytes, idx + 2) {
195                        return Some((idx, close_paren));
196                    }
197                }
198                return None;
199            },
200            _ => idx += 1,
201        }
202    }
203    None
204}
205
/// Given `pos` just past an opening `(`, return the index of the matching `)`.
///
/// Nested parentheses are balanced and backslash-escaped characters are
/// skipped; `None` when the parenthesis is never closed.
fn find_matching_paren(bytes: &[u8], mut pos: usize) -> Option<usize> {
    let mut depth: usize = 1;
    while let Some(&byte) = bytes.get(pos) {
        pos += match byte {
            b'\\' => 2, // jump over the escaped character as well
            b'(' => {
                depth += 1;
                1
            },
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(pos);
                }
                1
            },
            _ => 1,
        };
    }
    None
}
227
228fn normalize_for_search(display: &str) -> String {
229    let mut normalized = String::with_capacity(display.len());
230    let mut prev_was_space = true;
231
232    for ch in display.nfkd() {
233        if is_combining_mark(ch) {
234            continue;
235        }
236
237        for lower in ch.to_lowercase() {
238            if lower.is_ascii_alphanumeric() {
239                normalized.push(lower);
240                prev_was_space = false;
241            } else if lower.is_whitespace()
242                || matches!(
243                    lower,
244                    '-' | '_'
245                        | '/'
246                        | '.'
247                        | '#'
248                        | ':'
249                        | '`'
250                        | '\''
251                        | '"'
252                        | '('
253                        | ')'
254                        | '['
255                        | ']'
256                        | '{'
257                        | '}'
258                )
259                || lower.is_ascii()
260            {
261                push_space(&mut normalized, &mut prev_was_space);
262            } else if lower.is_alphanumeric() {
263                normalized.push(lower);
264                prev_was_space = false;
265            } else {
266                push_space(&mut normalized, &mut prev_was_space);
267            }
268        }
269    }
270
271    normalized.trim().to_string()
272}
273
/// Public helper to normalize arbitrary text using the same rules as headings:
/// lowercase, NFKD diacritic removal, and punctuation collapsed to single
/// spaces (see [`segment_variants`] for the full pipeline).
#[must_use]
pub fn normalize_text_for_search(text: &str) -> String {
    normalize_for_search(text)
}
279
/// Append a single separating space to `normalized`, unless the buffer is
/// still empty or the previously written character was already a space.
fn push_space(normalized: &mut String, prev_was_space: &mut bool) {
    let suppress = *prev_was_space || normalized.is_empty();
    if suppress {
        return;
    }
    normalized.push(' ');
    *prev_was_space = true;
}