1use html_escape::decode_html_entities;
2use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
3
/// The different textual forms derived from a single heading segment.
///
/// Values are produced by [`segment_variants`]. Besides `Debug` and `Clone`, the type is
/// comparable and default-constructible so results can be asserted on and used as placeholders.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HeadingSegmentVariants {
    /// Human-readable text: link/anchor markup removed, HTML entities decoded, trimmed.
    pub display: String,
    /// Search form of `display` as produced by the module's search normalization.
    pub normalized: String,
    /// Whitespace-separated words of `normalized`, in order of appearance.
    pub tokens: Vec<String>,
}
14
15pub fn segment_variants(raw: &str) -> HeadingSegmentVariants {
22 let stripped = strip_links_and_anchors(raw);
23 let display = decode_html_entities(&stripped).trim().to_string();
24 let normalized = normalize_for_search(&display);
25
26 let tokens: Vec<String> = normalized
27 .split_whitespace()
28 .filter(|token| !token.is_empty())
29 .map(std::string::ToString::to_string)
30 .collect();
31
32 HeadingSegmentVariants {
33 display,
34 normalized,
35 tokens,
36 }
37}
38
/// The different textual forms derived from a whole heading path.
///
/// Values are produced by [`path_variants`]. Besides `Debug` and `Clone`, the type is
/// comparable and default-constructible so results can be asserted on and used as placeholders.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HeadingPathVariants {
    /// One human-readable entry per path segment, in path order.
    pub display_segments: Vec<String>,
    /// One search-normalized entry per path segment, in path order.
    pub normalized_segments: Vec<String>,
    /// Search tokens collected across all segments, without duplicates, in first-seen order.
    pub tokens: Vec<String>,
}
49
50pub fn path_variants(path: &[String]) -> HeadingPathVariants {
52 let mut display_segments = Vec::with_capacity(path.len());
53 let mut normalized_segments = Vec::with_capacity(path.len());
54 let mut tokens = Vec::new();
55
56 for segment in path {
57 let HeadingSegmentVariants {
58 display,
59 normalized,
60 tokens: mut seg_tokens,
61 } = segment_variants(segment);
62
63 let display_segment = if display.is_empty() {
64 segment.clone()
65 } else {
66 display
67 };
68
69 let normalized_segment = if normalized.is_empty() {
70 display_segment.to_lowercase()
71 } else {
72 normalized
73 };
74
75 if seg_tokens.is_empty() {
76 seg_tokens = normalized_segment
77 .split_whitespace()
78 .map(std::string::ToString::to_string)
79 .collect();
80 }
81
82 display_segments.push(display_segment.clone());
83 normalized_segments.push(normalized_segment);
84
85 for token in &seg_tokens {
86 if !tokens.contains(token) {
87 tokens.push(token.clone());
88 }
89 }
90
91 if let Some(transliterated) = transliterate_latin_extensions(&display_segment) {
92 let transliterated_tokens: Vec<String> = normalize_for_search(&transliterated)
93 .split_whitespace()
94 .map(std::string::ToString::to_string)
95 .collect();
96 for token in transliterated_tokens {
97 if !token.is_empty() && !tokens.contains(&token) {
98 tokens.push(token);
99 }
100 }
101 }
102 }
103
104 HeadingPathVariants {
105 display_segments,
106 normalized_segments,
107 tokens,
108 }
109}
110
/// Rewrites German umlauts and sharp s into their ASCII digraphs.
///
/// `Ä/ä`, `Ö/ö`, `Ü/ü` become `ae`, `oe`, `ue` and `ẞ/ß` becomes `ss` (the replacement is
/// always lowercase). Both the precomposed characters and the canonically equivalent
/// decomposed spelling (base letter `a`/`o`/`u` followed by U+0308 COMBINING DIAERESIS) are
/// recognised, so the result does not depend on the normalization form of the input.
///
/// Returns `Some(rewritten)` when at least one replacement was made and `None` when the input
/// contains nothing to transliterate.
fn transliterate_latin_extensions(input: &str) -> Option<String> {
    const COMBINING_DIAERESIS: char = '\u{0308}';

    // Every replacement is at most as long (in UTF-8 bytes) as what it replaces.
    let mut output = String::with_capacity(input.len());
    let mut changed = false;
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        let followed_by_diaeresis = chars.peek() == Some(&COMBINING_DIAERESIS);

        let replacement = match ch {
            // Precomposed forms, written as escapes so the source is immune to editor
            // normalization: Ä ä / Ö ö / Ü ü / ẞ ß.
            '\u{00C4}' | '\u{00E4}' => Some("ae"),
            '\u{00D6}' | '\u{00F6}' => Some("oe"),
            '\u{00DC}' | '\u{00FC}' => Some("ue"),
            '\u{1E9E}' | '\u{00DF}' => Some("ss"),
            // Decomposed forms: consume the combining mark together with its base letter.
            'A' | 'a' if followed_by_diaeresis => {
                chars.next();
                Some("ae")
            },
            'O' | 'o' if followed_by_diaeresis => {
                chars.next();
                Some("oe")
            },
            'U' | 'u' if followed_by_diaeresis => {
                chars.next();
                Some("ue")
            },
            _ => None,
        };

        match replacement {
            Some(ascii) => {
                output.push_str(ascii);
                changed = true;
            },
            None => output.push(ch),
        }
    }

    changed.then_some(output)
}
139
140fn strip_links_and_anchors(input: &str) -> String {
141 let mut output = String::with_capacity(input.len());
142 let bytes = input.as_bytes();
143 let mut i = 0;
144
145 while i < bytes.len() {
146 match bytes[i] {
147 b'[' => {
148 if let Some((label_end, link_end)) = find_markdown_link(bytes, i) {
149 output.push_str(&input[i + 1..label_end]);
150 i = link_end + 1;
151 continue;
152 }
153 output.push('[');
154 i += 1;
155 },
156 b'<' => {
157 if let Some(next_gt) = memchr::memchr(b'>', &bytes[i + 1..]).map(|pos| pos + i + 1)
158 {
159 let tag = &input[i + 1..next_gt];
160 let tag_lower = tag.trim().to_ascii_lowercase();
161 if tag_lower.starts_with("a ") || tag_lower.starts_with("a>") {
162 i = next_gt + 1;
163 continue;
164 }
165 if tag_lower.starts_with("/a") {
166 i = next_gt + 1;
167 continue;
168 }
169 }
170 output.push('<');
171 i += 1;
172 },
173 _ => {
174 if let Some(ch) = input[i..].chars().next() {
175 output.push(ch);
176 i += ch.len_utf8();
177 } else {
178 i += 1;
179 }
180 },
181 }
182 }
183
184 output
185}
186
187fn find_markdown_link(bytes: &[u8], start: usize) -> Option<(usize, usize)> {
188 let mut idx = start + 1;
189 while idx < bytes.len() {
190 match bytes[idx] {
191 b'\\' => idx += 2,
192 b']' => {
193 if idx + 1 < bytes.len() && bytes[idx + 1] == b'(' {
194 if let Some(close_paren) = find_matching_paren(bytes, idx + 2) {
195 return Some((idx, close_paren));
196 }
197 }
198 return None;
199 },
200 _ => idx += 1,
201 }
202 }
203 None
204}
205
/// Finds the `)` that closes a parenthesis assumed to be already open before `pos`.
///
/// Scanning starts at byte offset `pos` with a nesting depth of one. Nested `(`/`)` pairs are
/// balanced, and a backslash skips the byte that follows it. Returns the offset of the closing
/// `)` or `None` when the input ends first.
fn find_matching_paren(bytes: &[u8], mut pos: usize) -> Option<usize> {
    let mut open_parens = 1_u32;

    while let Some(&byte) = bytes.get(pos) {
        match byte {
            b'\\' => {
                // Skip the escaped byte as well.
                pos += 2;
                continue;
            },
            b'(' => open_parens += 1,
            b')' => {
                open_parens -= 1;
                if open_parens == 0 {
                    return Some(pos);
                }
            },
            _ => {},
        }
        pos += 1;
    }

    None
}
227
228fn normalize_for_search(display: &str) -> String {
229 let mut normalized = String::with_capacity(display.len());
230 let mut prev_was_space = true;
231
232 for ch in display.nfkd() {
233 if is_combining_mark(ch) {
234 continue;
235 }
236
237 for lower in ch.to_lowercase() {
238 if lower.is_ascii_alphanumeric() {
239 normalized.push(lower);
240 prev_was_space = false;
241 } else if lower.is_whitespace()
242 || matches!(
243 lower,
244 '-' | '_'
245 | '/'
246 | '.'
247 | '#'
248 | ':'
249 | '`'
250 | '\''
251 | '"'
252 | '('
253 | ')'
254 | '['
255 | ']'
256 | '{'
257 | '}'
258 )
259 || lower.is_ascii()
260 {
261 push_space(&mut normalized, &mut prev_was_space);
262 } else if lower.is_alphanumeric() {
263 normalized.push(lower);
264 prev_was_space = false;
265 } else {
266 push_space(&mut normalized, &mut prev_was_space);
267 }
268 }
269 }
270
271 normalized.trim().to_string()
272}
273
274#[must_use]
276pub fn normalize_text_for_search(text: &str) -> String {
277 normalize_for_search(text)
278}
279
/// Appends a single separator space to `normalized` unless one would be redundant.
///
/// Nothing is written when the previous character was already a separator
/// (`*prev_was_space` is `true`) or when `normalized` is still empty. When a space is
/// written, `*prev_was_space` is set to `true`.
fn push_space(normalized: &mut String, prev_was_space: &mut bool) {
    if *prev_was_space || normalized.is_empty() {
        return;
    }

    normalized.push(' ');
    *prev_was_space = true;
}