1use phf::phf_map;
7
8static LATEX_ACCENTS: phf::Map<&'static str, &'static str> = phf_map! {
10 "\\'a" => "á", "\\\\'a" => "á",
12 "\\'e" => "é", "\\\\'e" => "é",
13 "\\'i" => "í", "\\\\'i" => "í",
14 "\\'o" => "ó", "\\\\'o" => "ó",
15 "\\'u" => "ú", "\\\\'u" => "ú",
16 "\\'A" => "Á", "\\\\'A" => "Á",
17 "\\'E" => "É", "\\\\'E" => "É",
18 "\\'I" => "Í", "\\\\'I" => "Í",
19 "\\'O" => "Ó", "\\\\'O" => "Ó",
20 "\\'U" => "Ú", "\\\\'U" => "Ú",
21 "\\'y" => "ý", "\\\\'y" => "ý",
22 "\\'Y" => "Ý", "\\\\'Y" => "Ý",
23
24 "\\`a" => "à", "\\\\`a" => "à",
26 "\\`e" => "è", "\\\\`e" => "è",
27 "\\`i" => "ì", "\\\\`i" => "ì",
28 "\\`o" => "ò", "\\\\`o" => "ò",
29 "\\`u" => "ù", "\\\\`u" => "ù",
30 "\\`A" => "À", "\\\\`A" => "À",
31 "\\`E" => "È", "\\\\`E" => "È",
32 "\\`I" => "Ì", "\\\\`I" => "Ì",
33 "\\`O" => "Ò", "\\\\`O" => "Ò",
34 "\\`U" => "Ù", "\\\\`U" => "Ù",
35
36 "\\^a" => "â", "\\\\^a" => "â",
38 "\\^e" => "ê", "\\\\^e" => "ê",
39 "\\^i" => "î", "\\\\^i" => "î",
40 "\\^o" => "ô", "\\\\^o" => "ô",
41 "\\^u" => "û", "\\\\^u" => "û",
42 "\\^A" => "Â", "\\\\^A" => "Â",
43 "\\^E" => "Ê", "\\\\^E" => "Ê",
44 "\\^I" => "Î", "\\\\^I" => "Î",
45 "\\^O" => "Ô", "\\\\^O" => "Ô",
46 "\\^U" => "Û", "\\\\^U" => "Û",
47
48 "\\\"a" => "ä", "\\\\\"a" => "ä", "\\\\\\\"a" => "ä",
50 "\\\"e" => "ë", "\\\\\"e" => "ë", "\\\\\\\"e" => "ë",
51 "\\\"i" => "ï", "\\\\\"i" => "ï", "\\\\\\\"i" => "ï",
52 "\\\"o" => "ö", "\\\\\"o" => "ö", "\\\\\\\"o" => "ö",
53 "\\\"u" => "ü", "\\\\\"u" => "ü", "\\\\\\\"u" => "ü",
54 "\\\"A" => "Ä", "\\\\\"A" => "Ä", "\\\\\\\"A" => "Ä",
55 "\\\"E" => "Ë", "\\\\\"E" => "Ë", "\\\\\\\"E" => "Ë",
56 "\\\"I" => "Ï", "\\\\\"I" => "Ï", "\\\\\\\"I" => "Ï",
57 "\\\"O" => "Ö", "\\\\\"O" => "Ö", "\\\\\\\"O" => "Ö",
58 "\\\"U" => "Ü", "\\\\\"U" => "Ü", "\\\\\\\"U" => "Ü",
59 "\\\"y" => "ÿ", "\\\\\"y" => "ÿ", "\\\\\\\"y" => "ÿ",
60 "\\\"Y" => "Ÿ", "\\\\\"Y" => "Ÿ", "\\\\\\\"Y" => "Ÿ",
61
62 "\\~a" => "ã", "\\\\~a" => "ã",
64 "\\~n" => "ñ", "\\\\~n" => "ñ",
65 "\\~o" => "õ", "\\\\~o" => "õ",
66 "\\~A" => "Ã", "\\\\~A" => "Ã",
67 "\\~N" => "Ñ", "\\\\~N" => "Ñ",
68 "\\~O" => "Õ", "\\\\~O" => "Õ",
69
70 "\\c c" => "ç", "\\\\c c" => "ç",
72 "\\c C" => "Ç", "\\\\c C" => "Ç",
73
74 "\\r a" => "å", "\\\\r a" => "å",
76 "\\r A" => "Å", "\\\\r A" => "Å",
77};
78
79static LATEX_BRACED: phf::Map<&'static str, &'static str> = phf_map! {
81 "\\'{a}" => "á", "\\\\'{a}" => "á",
83 "\\'{e}" => "é", "\\\\'{e}" => "é",
84 "\\'{i}" => "í", "\\\\'{i}" => "í",
85 "\\'{o}" => "ó", "\\\\'{o}" => "ó",
86 "\\'{u}" => "ú", "\\\\'{u}" => "ú",
87 "\\'{A}" => "Á", "\\\\'{A}" => "Á",
88 "\\'{E}" => "É", "\\\\'{E}" => "É",
89 "\\'{I}" => "Í", "\\\\'{I}" => "Í",
90 "\\'{O}" => "Ó", "\\\\'{O}" => "Ó",
91 "\\'{U}" => "Ú", "\\\\'{U}" => "Ú",
92 "\\'{y}" => "ý", "\\\\'{y}" => "ý",
93 "\\'{Y}" => "Ý", "\\\\'{Y}" => "Ý",
94
95 "\\`{a}" => "à", "\\\\`{a}" => "à",
97 "\\`{e}" => "è", "\\\\`{e}" => "è",
98 "\\`{i}" => "ì", "\\\\`{i}" => "ì",
99 "\\`{o}" => "ò", "\\\\`{o}" => "ò",
100 "\\`{u}" => "ù", "\\\\`{u}" => "ù",
101 "\\`{A}" => "À", "\\\\`{A}" => "À",
102 "\\`{E}" => "È", "\\\\`{E}" => "È",
103 "\\`{I}" => "Ì", "\\\\`{I}" => "Ì",
104 "\\`{O}" => "Ò", "\\\\`{O}" => "Ò",
105 "\\`{U}" => "Ù", "\\\\`{U}" => "Ù",
106
107 "\\^{a}" => "â", "\\\\^{a}" => "â",
109 "\\^{e}" => "ê", "\\\\^{e}" => "ê",
110 "\\^{i}" => "î", "\\\\^{i}" => "î",
111 "\\^{o}" => "ô", "\\\\^{o}" => "ô",
112 "\\^{u}" => "û", "\\\\^{u}" => "û",
113 "\\^{A}" => "Â", "\\\\^{A}" => "Â",
114 "\\^{E}" => "Ê", "\\\\^{E}" => "Ê",
115 "\\^{I}" => "Î", "\\\\^{I}" => "Î",
116 "\\^{O}" => "Ô", "\\\\^{O}" => "Ô",
117 "\\^{U}" => "Û", "\\\\^{U}" => "Û",
118
119 "\\\"{a}" => "ä", "\\\\\"{a}" => "ä", "\\\\\\\"{a}" => "ä",
121 "\\\"{e}" => "ë", "\\\\\"{e}" => "ë", "\\\\\\\"{e}" => "ë",
122 "\\\"{i}" => "ï", "\\\\\"{i}" => "ï", "\\\\\\\"{i}" => "ï",
123 "\\\"{o}" => "ö", "\\\\\"{o}" => "ö", "\\\\\\\"{o}" => "ö",
124 "\\\"{u}" => "ü", "\\\\\"{u}" => "ü", "\\\\\\\"{u}" => "ü",
125 "\\\"{A}" => "Ä", "\\\\\"{A}" => "Ä", "\\\\\\\"{A}" => "Ä",
126 "\\\"{E}" => "Ë", "\\\\\"{E}" => "Ë", "\\\\\\\"{E}" => "Ë",
127 "\\\"{I}" => "Ï", "\\\\\"{I}" => "Ï", "\\\\\\\"{I}" => "Ï",
128 "\\\"{O}" => "Ö", "\\\\\"{O}" => "Ö", "\\\\\\\"{O}" => "Ö",
129 "\\\"{U}" => "Ü", "\\\\\"{U}" => "Ü", "\\\\\\\"{U}" => "Ü",
130 "\\\"{y}" => "ÿ", "\\\\\"{y}" => "ÿ", "\\\\\\\"{y}" => "ÿ",
131 "\\\"{Y}" => "Ÿ", "\\\\\"{Y}" => "Ÿ", "\\\\\\\"{Y}" => "Ÿ",
132
133 "\\~{a}" => "ã", "\\\\~{a}" => "ã",
135 "\\~{n}" => "ñ", "\\\\~{n}" => "ñ",
136 "\\~{o}" => "õ", "\\\\~{o}" => "õ",
137 "\\~{A}" => "Ã", "\\\\~{A}" => "Ã",
138 "\\~{N}" => "Ñ", "\\\\~{N}" => "Ñ",
139 "\\~{O}" => "Õ", "\\\\~{O}" => "Õ",
140
141 "\\c{c}" => "ç", "\\\\c{c}" => "ç",
143 "\\c{C}" => "Ç", "\\\\c{C}" => "Ç",
144
145 "\\r{a}" => "å", "\\\\r{a}" => "å",
147 "\\r{A}" => "Å", "\\\\r{A}" => "Å",
148};
149
150static LATEX_SYMBOLS: phf::Map<&'static str, &'static str> = phf_map! {
152 "\\ae" => "æ", "\\AE" => "Æ", "\\\\ae" => "æ", "\\\\AE" => "Æ",
154 "\\oe" => "œ", "\\OE" => "Œ", "\\\\oe" => "œ", "\\\\OE" => "Œ",
155 "\\ss" => "ß", "\\\\ss" => "ß",
156 "\\o " => "ø", "\\O " => "Ø", "\\\\o " => "ø", "\\\\O " => "Ø", "\\o" => "ø", "\\O" => "Ø", "\\\\o" => "ø", "\\\\O" => "Ø", "\\aa" => "å", "\\AA" => "Å", "\\\\aa" => "å", "\\\\AA" => "Å",
159
160 "\\alpha" => "α", "\\\\alpha" => "α",
162 "\\beta" => "β", "\\\\beta" => "β",
163 "\\gamma" => "γ", "\\\\gamma" => "γ",
164 "\\delta" => "δ", "\\\\delta" => "δ",
165 "\\epsilon" => "ε", "\\\\epsilon" => "ε",
166 "\\varepsilon" => "ε", "\\\\varepsilon" => "ε",
167 "\\zeta" => "ζ", "\\\\zeta" => "ζ",
168 "\\eta" => "η", "\\\\eta" => "η",
169 "\\theta" => "θ", "\\\\theta" => "θ",
170 "\\vartheta" => "θ", "\\\\vartheta" => "θ",
171 "\\iota" => "ι", "\\\\iota" => "ι",
172 "\\kappa" => "κ", "\\\\kappa" => "κ",
173 "\\lambda" => "λ", "\\\\lambda" => "λ",
174 "\\mu" => "μ", "\\\\mu" => "μ",
175 "\\nu" => "ν", "\\\\nu" => "ν",
176 "\\xi" => "ξ", "\\\\xi" => "ξ",
177 "\\pi" => "π", "\\\\pi" => "π",
178 "\\varpi" => "π", "\\\\varpi" => "π",
179 "\\rho" => "ρ", "\\\\rho" => "ρ",
180 "\\varrho" => "ρ", "\\\\varrho" => "ρ",
181 "\\sigma" => "σ", "\\\\sigma" => "σ",
182 "\\varsigma" => "ς", "\\\\varsigma" => "ς",
183 "\\tau" => "τ", "\\\\tau" => "τ",
184 "\\upsilon" => "υ", "\\\\upsilon" => "υ",
185 "\\phi" => "φ", "\\\\phi" => "φ",
186 "\\varphi" => "φ", "\\\\varphi" => "φ",
187 "\\chi" => "χ", "\\\\chi" => "χ",
188 "\\psi" => "ψ", "\\\\psi" => "ψ",
189 "\\omega" => "ω", "\\\\omega" => "ω",
190
191 "\\Gamma" => "Γ", "\\\\Gamma" => "Γ",
193 "\\Delta" => "Δ", "\\\\Delta" => "Δ",
194 "\\Theta" => "Θ", "\\\\Theta" => "Θ",
195 "\\Lambda" => "Λ", "\\\\Lambda" => "Λ",
196 "\\Xi" => "Ξ", "\\\\Xi" => "Ξ",
197 "\\Pi" => "Π", "\\\\Pi" => "Π",
198 "\\Sigma" => "Σ", "\\\\Sigma" => "Σ",
199 "\\Upsilon" => "Υ", "\\\\Upsilon" => "Υ",
200 "\\Phi" => "Φ", "\\\\Phi" => "Φ",
201 "\\Psi" => "Ψ", "\\\\Psi" => "Ψ",
202 "\\Omega" => "Ω", "\\\\Omega" => "Ω",
203
204 "\\infty" => "∞", "\\\\infty" => "∞",
206 "\\partial" => "∂", "\\\\partial" => "∂",
207 "\\nabla" => "∇", "\\\\nabla" => "∇",
208 "\\pm" => "±", "\\\\pm" => "±",
209 "\\mp" => "∓", "\\\\mp" => "∓",
210 "\\sim" => "∼", "\\\\sim" => "∼",
211 "\\times" => "×", "\\\\times" => "×",
212 "\\div" => "÷", "\\\\div" => "÷",
213 "\\leq" => "≤", "\\\\leq" => "≤",
214 "\\geq" => "≥", "\\\\geq" => "≥",
215 "\\neq" => "≠", "\\\\neq" => "≠",
216 "\\approx" => "≈", "\\\\approx" => "≈",
217 "\\equiv" => "≡", "\\\\equiv" => "≡",
218 "\\subset" => "⊂", "\\\\subset" => "⊂",
219 "\\supset" => "⊃", "\\\\supset" => "⊃",
220 "\\subseteq" => "⊆", "\\\\subseteq" => "⊆",
221 "\\supseteq" => "⊇", "\\\\supseteq" => "⊇",
222 "\\in" => "∈", "\\\\in" => "∈",
223 "\\notin" => "∉", "\\\\notin" => "∉",
224 "\\cup" => "∪", "\\\\cup" => "∪",
225 "\\cap" => "∩", "\\\\cap" => "∩",
226 "\\rightarrow" => "→", "\\\\rightarrow" => "→",
227 "\\leftarrow" => "←", "\\\\leftarrow" => "←",
228 "\\leftrightarrow" => "↔", "\\\\leftrightarrow" => "↔",
229 "\\Rightarrow" => "⇒", "\\\\Rightarrow" => "⇒",
230 "\\Leftarrow" => "⇐", "\\\\Leftarrow" => "⇐",
231 "\\Leftrightarrow" => "⇔", "\\\\Leftrightarrow" => "⇔",
232
233 "\\hbar" => "ℏ", "\\\\hbar" => "ℏ",
235 "\\hat{H}" => "Ĥ", "\\\\hat{H}" => "Ĥ",
236
237 "\\frac{\\partial}{\\partial t}" => "∂/∂t ",
239 "\\\\frac{\\\\partial}{\\\\partial t}" => "∂/∂t ",
240
241 "\\ldots" => "…", "\\\\ldots" => "…",
243 "\\dots" => "…", "\\\\dots" => "…",
244 "\\cdots" => "⋯", "\\\\cdots" => "⋯",
245 "\\&" => "&", "\\\\&" => "&",
246 "\\%" => "%", "\\\\%" => "%",
247 "\\$" => "$", "\\\\$" => "$",
248 "\\#" => "#", "\\\\#" => "#",
249 "\\{" => "{", "\\\\{" => "{",
250 "\\}" => "}", "\\\\}" => "}",
251 "\\textbackslash" => "\\", "\\\\textbackslash" => "\\",
252 "\\_" => "_", "\\\\_" => "_",
253
254 "\\\\\\\\" => "\\\\",
256
257 "\\lq " => "'", "\\\\lq " => "'", "\\lq" => "'", "\\\\lq" => "'", "\\rq" => "'", "\\\\rq" => "'", "\\lqq " => "\u{201c}", "\\\\lqq " => "\u{201c}", "\\lqq" => "\u{201c}", "\\\\lqq" => "\u{201c}", "\\rqq" => "\u{201d}", "\\\\rqq" => "\u{201d}", "\\," => " ", "\\\\," => " ",
267 "\\degree" => "°", "\\\\degree" => "°",
271 "\\textdegree" => "°", "\\\\textdegree" => "°",
272
273 "\\copyright" => "©", "\\\\copyright" => "©",
275 "\\textcopyright" => "©", "\\\\textcopyright" => "©",
276 "\\textregistered" => "®", "\\\\textregistered" => "®",
277 "\\texttrademark" => "™", "\\\\texttrademark" => "™",
278
279 "\\pounds" => "£", "\\\\pounds" => "£",
281 "\\textsterling" => "£", "\\\\textsterling" => "£",
282};
283
284#[must_use]
294pub fn latex_to_unicode(input: &str) -> String {
295 if !input.contains('\\') && !input.contains('~') {
297 return input.to_string();
298 }
299
300 let mut result = String::with_capacity(input.len());
301 let mut chars = input.char_indices();
302
303 while let Some((pos, ch)) = chars.next() {
304 if ch == '\\' {
305 let remaining = &input[pos..];
307
308 let mut best_match: Option<(&str, &str)> = None;
310
311 for (pattern, replacement) in LATEX_BRACED.entries() {
314 if remaining.starts_with(pattern)
315 && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
316 {
317 best_match = Some((pattern, replacement));
318 }
319 }
320
321 for (pattern, replacement) in LATEX_ACCENTS.entries() {
323 if remaining.starts_with(pattern)
324 && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
325 {
326 best_match = Some((pattern, replacement));
327 }
328 }
329
330 for (pattern, replacement) in LATEX_SYMBOLS.entries() {
332 if remaining.starts_with(pattern)
333 && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
334 {
335 best_match = Some((pattern, replacement));
336 }
337 }
338
339 if let Some((pattern, replacement)) = best_match {
340 result.push_str(replacement);
341
342 for _ in 1..pattern.len() {
344 chars.next();
345 }
346 } else {
347 result.push(ch);
349 }
350 } else if ch == '~' {
351 if is_url_path_tilde(input, pos) {
353 result.push(ch);
354 } else if pos == 0 || !input[..pos].ends_with('\\') {
355 result.push(' ');
357 } else {
358 result.push(ch);
360 }
361 } else {
362 result.push(ch);
363 }
364 }
365
366 result
367}
368
/// Reports whether the character at byte offset `pos` sits inside a URL-like
/// token.
///
/// The token is the run of non-whitespace characters that ends immediately
/// before `pos`; it counts as URL-like when it contains `://`.
///
/// Returns `false` when `pos` is past the end of `input` or not on a character
/// boundary.
fn is_url_path_tilde(input: &str, pos: usize) -> bool {
    input.get(..pos).map_or(false, |before| {
        // `rsplit` hands back whole substrings, so the token is sliced
        // correctly even when the preceding whitespace is multi-byte
        // (U+00A0, U+3000, ...). Adding 1 to the whitespace's byte index would
        // land inside such a character and panic.
        before
            .rsplit(char::is_whitespace)
            .next()
            .map_or(false, |token| token.contains("://"))
    })
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `latex_to_unicode` over every `(input, expected)` pair and fails on
    /// the first mismatch, naming the offending input.
    fn assert_conversions(cases: &[(&str, &str)]) {
        for &(latex, expected) in cases {
            assert_eq!(latex_to_unicode(latex), expected, "input: {:?}", latex);
        }
    }

    #[test]
    fn test_basic_accents() {
        assert_conversions(&[
            ("\\'e", "é"),
            ("\\'{e}", "é"),
            ("\\\"o", "ö"),
            ("\\\"{o}", "ö"),
            ("\\~n", "ñ"),
            ("\\^a", "â"),
            ("\\`u", "ù"),
        ]);
    }

    #[test]
    fn test_cedilla_and_ring() {
        assert_conversions(&[
            ("\\c{c}", "ç"),
            ("\\c C", "Ç"),
            ("\\r{a}", "å"),
            ("\\r A", "Å"),
            ("\\aa", "å"),
            ("\\AA", "Å"),
        ]);
    }

    #[test]
    fn test_ligatures() {
        assert_conversions(&[
            ("\\ae", "æ"),
            ("\\AE", "Æ"),
            ("\\oe", "œ"),
            ("\\ss", "ß"),
            ("\\o", "ø"),
            ("\\O", "Ø"),
        ]);
    }

    #[test]
    fn test_mixed_text() {
        assert_conversions(&[
            ("Fran\\c{c}ois R\\'emi", "François Rémi"),
            ("M\\\"uller and Schr\\\"{o}dinger", "Müller and Schrödinger"),
            ("Jos\\'e Garc\\'ia", "José García"),
        ]);
    }

    #[test]
    fn test_no_latex() {
        let plain = "This has no LaTeX";
        assert_conversions(&[(plain, plain)]);
    }

    #[test]
    fn test_greek_letters() {
        assert_conversions(&[
            ("\\alpha-\\beta", "α-β"),
            ("\\gamma \\delta", "γ δ"),
            ("\\Gamma\\Delta", "ΓΔ"),
        ]);
    }

    #[test]
    fn test_symbols() {
        assert_conversions(&[
            ("\\ldots", "…"),
            ("\\&", "&"),
            ("\\%", "%"),
            ("\\copyright", "©"),
        ]);
    }

    #[test]
    fn test_mathematical_symbols() {
        assert_conversions(&[
            ("\\leq", "≤"),
            ("\\geq", "≥"),
            ("\\neq", "≠"),
            ("\\pm", "±"),
            ("\\times", "×"),
        ]);
    }

    #[test]
    fn test_tildes() {
        assert_conversions(&[
            // A bare tilde is a non-breaking space.
            ("word~word", "word word"),
            // Tildes inside URLs stay literal.
            (
                "https://example.org/~user/paper.pdf",
                "https://example.org/~user/paper.pdf",
            ),
            // The tilde accent still wins over the space rule.
            ("\\~n", "ñ"),
            ("Se\\~nor~Garc\\'ia", "Señor García"),
        ]);
    }

    #[test]
    fn test_performance_fast_path() {
        let plain = "This is plain ASCII text with no LaTeX sequences whatsoever";
        assert_conversions(&[(plain, plain)]);
    }

    #[test]
    fn test_complex_scientific_text() {
        let input = "The \\alpha-particle decay rate follows \\lambda \\propto e^{-\\gamma t}";
        let converted = latex_to_unicode(input);
        for expected in ["α", "λ"] {
            assert!(converted.contains(expected));
        }
    }

    #[test]
    fn test_edge_cases() {
        assert_conversions(&[
            // Incomplete commands pass through untouched.
            ("\\", "\\"),
            ("\\'", "\\'"),
            ("\\'{", "\\'{"),
            // Unknown commands pass through untouched.
            ("\\xyz", "\\xyz"),
            ("\\unknown{test}", "\\unknown{test}"),
            // Several commands in running text.
            ("\\alpha and \\beta particles", "α and β particles"),
        ]);
    }
}