bibtex_parser/
latex_unicode.rs

1//! LaTeX to Unicode conversion for common escape sequences
2//!
3//! This module provides conversion from LaTeX escape sequences to Unicode
4//! characters for improved readability of BibTeX data.
5
6use phf::phf_map;
7
/// Common LaTeX accent commands to Unicode (direct, unbraced format like \'e).
///
/// Keys are written as Rust string literals, so every `\\` in the source is a
/// single backslash in the key at runtime. Most accents are listed twice: once
/// with one leading backslash and once with two. The umlaut rows additionally
/// carry a three-backslash spelling.
///
/// `latex_to_unicode` does not look keys up directly; it iterates over
/// `entries()` and tests each key as a prefix of the remaining input.
// NOTE(review): the multi-backslash spellings presumably cover input whose
// backslashes were already escaped upstream — confirm against the callers.
static LATEX_ACCENTS: phf::Map<&'static str, &'static str> = phf_map! {
    // Acute accent: \' (both single and double backslash versions)
    "\\'a" => "á", "\\\\'a" => "á",
    "\\'e" => "é", "\\\\'e" => "é",
    "\\'i" => "í", "\\\\'i" => "í",
    "\\'o" => "ó", "\\\\'o" => "ó",
    "\\'u" => "ú", "\\\\'u" => "ú",
    "\\'A" => "Á", "\\\\'A" => "Á",
    "\\'E" => "É", "\\\\'E" => "É",
    "\\'I" => "Í", "\\\\'I" => "Í",
    "\\'O" => "Ó", "\\\\'O" => "Ó",
    "\\'U" => "Ú", "\\\\'U" => "Ú",
    "\\'y" => "ý", "\\\\'y" => "ý",
    "\\'Y" => "Ý", "\\\\'Y" => "Ý",

    // Grave accent: \` (both single and double backslash versions)
    "\\`a" => "à", "\\\\`a" => "à",
    "\\`e" => "è", "\\\\`e" => "è",
    "\\`i" => "ì", "\\\\`i" => "ì",
    "\\`o" => "ò", "\\\\`o" => "ò",
    "\\`u" => "ù", "\\\\`u" => "ù",
    "\\`A" => "À", "\\\\`A" => "À",
    "\\`E" => "È", "\\\\`E" => "È",
    "\\`I" => "Ì", "\\\\`I" => "Ì",
    "\\`O" => "Ò", "\\\\`O" => "Ò",
    "\\`U" => "Ù", "\\\\`U" => "Ù",

    // Circumflex: \^ (both single and double backslash versions)
    "\\^a" => "â", "\\\\^a" => "â",
    "\\^e" => "ê", "\\\\^e" => "ê",
    "\\^i" => "î", "\\\\^i" => "î",
    "\\^o" => "ô", "\\\\^o" => "ô",
    "\\^u" => "û", "\\\\^u" => "û",
    "\\^A" => "Â", "\\\\^A" => "Â",
    "\\^E" => "Ê", "\\\\^E" => "Ê",
    "\\^I" => "Î", "\\\\^I" => "Î",
    "\\^O" => "Ô", "\\\\^O" => "Ô",
    "\\^U" => "Û", "\\\\^U" => "Û",

    // Umlaut/Diaeresis: \" (single, double, and triple backslash versions).
    // The `\"` inside each literal is the Rust escape for the quote character.
    "\\\"a" => "ä", "\\\\\"a" => "ä", "\\\\\\\"a" => "ä",
    "\\\"e" => "ë", "\\\\\"e" => "ë", "\\\\\\\"e" => "ë",
    "\\\"i" => "ï", "\\\\\"i" => "ï", "\\\\\\\"i" => "ï",
    "\\\"o" => "ö", "\\\\\"o" => "ö", "\\\\\\\"o" => "ö",
    "\\\"u" => "ü", "\\\\\"u" => "ü", "\\\\\\\"u" => "ü",
    "\\\"A" => "Ä", "\\\\\"A" => "Ä", "\\\\\\\"A" => "Ä",
    "\\\"E" => "Ë", "\\\\\"E" => "Ë", "\\\\\\\"E" => "Ë",
    "\\\"I" => "Ï", "\\\\\"I" => "Ï", "\\\\\\\"I" => "Ï",
    "\\\"O" => "Ö", "\\\\\"O" => "Ö", "\\\\\\\"O" => "Ö",
    "\\\"U" => "Ü", "\\\\\"U" => "Ü", "\\\\\\\"U" => "Ü",
    "\\\"y" => "ÿ", "\\\\\"y" => "ÿ", "\\\\\\\"y" => "ÿ",
    "\\\"Y" => "Ÿ", "\\\\\"Y" => "Ÿ", "\\\\\\\"Y" => "Ÿ",

    // Tilde: \~ (both single and double backslash versions)
    "\\~a" => "ã", "\\\\~a" => "ã",
    "\\~n" => "ñ", "\\\\~n" => "ñ",
    "\\~o" => "õ", "\\\\~o" => "õ",
    "\\~A" => "Ã", "\\\\~A" => "Ã",
    "\\~N" => "Ñ", "\\\\~N" => "Ñ",
    "\\~O" => "Õ", "\\\\~O" => "Õ",

    // Cedilla: \c with space (both single and double backslash versions).
    // The space separates the command name from the letter it modifies.
    "\\c c" => "ç", "\\\\c c" => "ç",
    "\\c C" => "Ç", "\\\\c C" => "Ç",

    // Ring: \r with space (both single and double backslash versions)
    "\\r a" => "å", "\\\\r a" => "å",
    "\\r A" => "Å", "\\\\r A" => "Å",
};
78
/// LaTeX commands with braces like \'{e}, \"{o}, etc.
///
/// Braced counterpart of the unbraced accent table: the accented letter is
/// wrapped in `{}` instead of following the accent directly or after a space.
/// As in the other tables, each `\\` in a literal is one backslash in the key
/// at runtime, and every command is listed with one and with two leading
/// backslashes (the umlaut also with three).
///
/// `latex_to_unicode` scans this table first, via `entries()`, testing each
/// key as a prefix of the remaining input.
static LATEX_BRACED: phf::Map<&'static str, &'static str> = phf_map! {
    // Acute accent (both single and double backslash versions)
    "\\'{a}" => "á", "\\\\'{a}" => "á",
    "\\'{e}" => "é", "\\\\'{e}" => "é",
    "\\'{i}" => "í", "\\\\'{i}" => "í",
    "\\'{o}" => "ó", "\\\\'{o}" => "ó",
    "\\'{u}" => "ú", "\\\\'{u}" => "ú",
    "\\'{A}" => "Á", "\\\\'{A}" => "Á",
    "\\'{E}" => "É", "\\\\'{E}" => "É",
    "\\'{I}" => "Í", "\\\\'{I}" => "Í",
    "\\'{O}" => "Ó", "\\\\'{O}" => "Ó",
    "\\'{U}" => "Ú", "\\\\'{U}" => "Ú",
    "\\'{y}" => "ý", "\\\\'{y}" => "ý",
    "\\'{Y}" => "Ý", "\\\\'{Y}" => "Ý",

    // Grave accent (both single and double backslash versions)
    "\\`{a}" => "à", "\\\\`{a}" => "à",
    "\\`{e}" => "è", "\\\\`{e}" => "è",
    "\\`{i}" => "ì", "\\\\`{i}" => "ì",
    "\\`{o}" => "ò", "\\\\`{o}" => "ò",
    "\\`{u}" => "ù", "\\\\`{u}" => "ù",
    "\\`{A}" => "À", "\\\\`{A}" => "À",
    "\\`{E}" => "È", "\\\\`{E}" => "È",
    "\\`{I}" => "Ì", "\\\\`{I}" => "Ì",
    "\\`{O}" => "Ò", "\\\\`{O}" => "Ò",
    "\\`{U}" => "Ù", "\\\\`{U}" => "Ù",

    // Circumflex (both single and double backslash versions)
    "\\^{a}" => "â", "\\\\^{a}" => "â",
    "\\^{e}" => "ê", "\\\\^{e}" => "ê",
    "\\^{i}" => "î", "\\\\^{i}" => "î",
    "\\^{o}" => "ô", "\\\\^{o}" => "ô",
    "\\^{u}" => "û", "\\\\^{u}" => "û",
    "\\^{A}" => "Â", "\\\\^{A}" => "Â",
    "\\^{E}" => "Ê", "\\\\^{E}" => "Ê",
    "\\^{I}" => "Î", "\\\\^{I}" => "Î",
    "\\^{O}" => "Ô", "\\\\^{O}" => "Ô",
    "\\^{U}" => "Û", "\\\\^{U}" => "Û",

    // Umlaut (single, double, and triple backslash versions).
    // The `\"` inside each literal is the Rust escape for the quote character.
    "\\\"{a}" => "ä", "\\\\\"{a}" => "ä", "\\\\\\\"{a}" => "ä",
    "\\\"{e}" => "ë", "\\\\\"{e}" => "ë", "\\\\\\\"{e}" => "ë",
    "\\\"{i}" => "ï", "\\\\\"{i}" => "ï", "\\\\\\\"{i}" => "ï",
    "\\\"{o}" => "ö", "\\\\\"{o}" => "ö", "\\\\\\\"{o}" => "ö",
    "\\\"{u}" => "ü", "\\\\\"{u}" => "ü", "\\\\\\\"{u}" => "ü",
    "\\\"{A}" => "Ä", "\\\\\"{A}" => "Ä", "\\\\\\\"{A}" => "Ä",
    "\\\"{E}" => "Ë", "\\\\\"{E}" => "Ë", "\\\\\\\"{E}" => "Ë",
    "\\\"{I}" => "Ï", "\\\\\"{I}" => "Ï", "\\\\\\\"{I}" => "Ï",
    "\\\"{O}" => "Ö", "\\\\\"{O}" => "Ö", "\\\\\\\"{O}" => "Ö",
    "\\\"{U}" => "Ü", "\\\\\"{U}" => "Ü", "\\\\\\\"{U}" => "Ü",
    "\\\"{y}" => "ÿ", "\\\\\"{y}" => "ÿ", "\\\\\\\"{y}" => "ÿ",
    "\\\"{Y}" => "Ÿ", "\\\\\"{Y}" => "Ÿ", "\\\\\\\"{Y}" => "Ÿ",

    // Tilde (both single and double backslash versions)
    "\\~{a}" => "ã", "\\\\~{a}" => "ã",
    "\\~{n}" => "ñ", "\\\\~{n}" => "ñ",
    "\\~{o}" => "õ", "\\\\~{o}" => "õ",
    "\\~{A}" => "Ã", "\\\\~{A}" => "Ã",
    "\\~{N}" => "Ñ", "\\\\~{N}" => "Ñ",
    "\\~{O}" => "Õ", "\\\\~{O}" => "Õ",

    // Cedilla with braces (both single and double backslash versions)
    "\\c{c}" => "ç", "\\\\c{c}" => "ç",
    "\\c{C}" => "Ç", "\\\\c{C}" => "Ç",

    // Ring with braces (both single and double backslash versions)
    "\\r{a}" => "å", "\\\\r{a}" => "å",
    "\\r{A}" => "Å", "\\\\r{A}" => "Å",
};
149
/// Special LaTeX symbols and commands.
///
/// Covers ligatures and special letters, Greek letters, mathematical and
/// physics symbols, escaped punctuation, quotes, spacing, and a few text
/// symbols (degree, copyright, currency). Each `\\` in a literal is one
/// backslash in the key at runtime; most commands are listed with one and
/// with two leading backslashes.
///
/// `latex_to_unicode` scans this table via `entries()` and keeps the longest
/// key that prefixes the remaining input, which is why a short key such as
/// `\o` can coexist with longer keys such as `\omega` and `\oe`.
static LATEX_SYMBOLS: phf::Map<&'static str, &'static str> = phf_map! {
    // Special ligatures and characters (both single and double backslash versions)
    "\\ae" => "æ", "\\AE" => "Æ", "\\\\ae" => "æ", "\\\\AE" => "Æ",
    "\\oe" => "œ", "\\OE" => "Œ", "\\\\oe" => "œ", "\\\\OE" => "Œ",
    "\\ss" => "ß", "\\\\ss" => "ß",
    // The spaced keys are longer, so they win and swallow the trailing space.
    "\\o " => "ø", "\\O " => "Ø", "\\\\o " => "ø", "\\\\O " => "Ø",  // With space absorption
    "\\o" => "ø", "\\O" => "Ø", "\\\\o" => "ø", "\\\\O" => "Ø",      // Without space
    "\\aa" => "å", "\\AA" => "Å", "\\\\aa" => "å", "\\\\AA" => "Å",

    // Greek letters (both single and double backslash versions).
    // The \var… forms map to the same glyph as the plain command, except
    // \varsigma, which maps to the final sigma.
    "\\alpha" => "α", "\\\\alpha" => "α",
    "\\beta" => "β", "\\\\beta" => "β",
    "\\gamma" => "γ", "\\\\gamma" => "γ",
    "\\delta" => "δ", "\\\\delta" => "δ",
    "\\epsilon" => "ε", "\\\\epsilon" => "ε",
    "\\varepsilon" => "ε", "\\\\varepsilon" => "ε",
    "\\zeta" => "ζ", "\\\\zeta" => "ζ",
    "\\eta" => "η", "\\\\eta" => "η",
    "\\theta" => "θ", "\\\\theta" => "θ",
    "\\vartheta" => "θ", "\\\\vartheta" => "θ",
    "\\iota" => "ι", "\\\\iota" => "ι",
    "\\kappa" => "κ", "\\\\kappa" => "κ",
    "\\lambda" => "λ", "\\\\lambda" => "λ",
    "\\mu" => "μ", "\\\\mu" => "μ",
    "\\nu" => "ν", "\\\\nu" => "ν",
    "\\xi" => "ξ", "\\\\xi" => "ξ",
    "\\pi" => "π", "\\\\pi" => "π",
    "\\varpi" => "π", "\\\\varpi" => "π",
    "\\rho" => "ρ", "\\\\rho" => "ρ",
    "\\varrho" => "ρ", "\\\\varrho" => "ρ",
    "\\sigma" => "σ", "\\\\sigma" => "σ",
    "\\varsigma" => "ς", "\\\\varsigma" => "ς",
    "\\tau" => "τ", "\\\\tau" => "τ",
    "\\upsilon" => "υ", "\\\\upsilon" => "υ",
    "\\phi" => "φ", "\\\\phi" => "φ",
    "\\varphi" => "φ", "\\\\varphi" => "φ",
    "\\chi" => "χ", "\\\\chi" => "χ",
    "\\psi" => "ψ", "\\\\psi" => "ψ",
    "\\omega" => "ω", "\\\\omega" => "ω",

    // Capital Greek letters (both single and double backslash versions)
    "\\Gamma" => "Γ", "\\\\Gamma" => "Γ",
    "\\Delta" => "Δ", "\\\\Delta" => "Δ",
    "\\Theta" => "Θ", "\\\\Theta" => "Θ",
    "\\Lambda" => "Λ", "\\\\Lambda" => "Λ",
    "\\Xi" => "Ξ", "\\\\Xi" => "Ξ",
    "\\Pi" => "Π", "\\\\Pi" => "Π",
    "\\Sigma" => "Σ", "\\\\Sigma" => "Σ",
    "\\Upsilon" => "Υ", "\\\\Upsilon" => "Υ",
    "\\Phi" => "Φ", "\\\\Phi" => "Φ",
    "\\Psi" => "Ψ", "\\\\Psi" => "Ψ",
    "\\Omega" => "Ω", "\\\\Omega" => "Ω",

    // Mathematical symbols (both single and double backslash versions)
    "\\infty" => "∞", "\\\\infty" => "∞",
    "\\partial" => "∂", "\\\\partial" => "∂",
    "\\nabla" => "∇", "\\\\nabla" => "∇",
    "\\pm" => "±", "\\\\pm" => "±",
    "\\mp" => "∓", "\\\\mp" => "∓",
    "\\sim" => "∼", "\\\\sim" => "∼",
    "\\times" => "×", "\\\\times" => "×",
    "\\div" => "÷", "\\\\div" => "÷",
    "\\leq" => "≤", "\\\\leq" => "≤",
    "\\geq" => "≥", "\\\\geq" => "≥",
    "\\neq" => "≠", "\\\\neq" => "≠",
    "\\approx" => "≈", "\\\\approx" => "≈",
    "\\equiv" => "≡", "\\\\equiv" => "≡",
    "\\subset" => "⊂", "\\\\subset" => "⊂",
    "\\supset" => "⊃", "\\\\supset" => "⊃",
    "\\subseteq" => "⊆", "\\\\subseteq" => "⊆",
    "\\supseteq" => "⊇", "\\\\supseteq" => "⊇",
    "\\in" => "∈", "\\\\in" => "∈",
    "\\notin" => "∉", "\\\\notin" => "∉",
    "\\cup" => "∪", "\\\\cup" => "∪",
    "\\cap" => "∩", "\\\\cap" => "∩",
    "\\rightarrow" => "→", "\\\\rightarrow" => "→",
    "\\leftarrow" => "←", "\\\\leftarrow" => "←",
    "\\leftrightarrow" => "↔", "\\\\leftrightarrow" => "↔",
    "\\Rightarrow" => "⇒", "\\\\Rightarrow" => "⇒",
    "\\Leftarrow" => "⇐", "\\\\Leftarrow" => "⇐",
    "\\Leftrightarrow" => "⇔", "\\\\Leftrightarrow" => "⇔",

    // Physics and advanced math symbols
    "\\hbar" => "ℏ", "\\\\hbar" => "ℏ",
    "\\hat{H}" => "Ĥ", "\\\\hat{H}" => "Ĥ",

    // Special mathematical expressions (specific patterns).
    // Matched only as these exact strings; the replacement ends with a space.
    "\\frac{\\partial}{\\partial t}" => "∂/∂t ",
    "\\\\frac{\\\\partial}{\\\\partial t}" => "∂/∂t ",

    // Punctuation and symbols (both single and double backslash versions)
    "\\ldots" => "…", "\\\\ldots" => "…",
    "\\dots" => "…", "\\\\dots" => "…",
    "\\cdots" => "⋯", "\\\\cdots" => "⋯",
    "\\&" => "&", "\\\\&" => "&",
    "\\%" => "%", "\\\\%" => "%",
    "\\$" => "$", "\\\\$" => "$",
    "\\#" => "#", "\\\\#" => "#",
    "\\{" => "{", "\\\\{" => "{",
    "\\}" => "}", "\\\\}" => "}",
    "\\textbackslash" => "\\", "\\\\textbackslash" => "\\",
    "\\_" => "_", "\\\\_" => "_",

    // Special case for four backslashes representing escaped backslash:
    // four backslashes in the input become two in the output.
    "\\\\\\\\" => "\\\\",

    // Quotes (both single and double backslash versions).
    // \lq and \rq both map to the ASCII apostrophe; \lqq and \rqq map to the
    // curly double quotes U+201C and U+201D.
    "\\lq " => "'", "\\\\lq " => "'",  // Opening quotes with space absorption
    "\\lq" => "'", "\\\\lq" => "'",    // Opening quotes without space
    "\\rq" => "'", "\\\\rq" => "'",    // Closing quotes (no space absorption)
    "\\lqq " => "\u{201c}", "\\\\lqq " => "\u{201c}",  // Opening quotes with space absorption
    "\\lqq" => "\u{201c}", "\\\\lqq" => "\u{201c}",    // Opening quotes without space
    "\\rqq" => "\u{201d}", "\\\\rqq" => "\u{201d}",    // Closing quotes (no space absorption)

    // Spacing commands (both single and double backslash versions).
    // The thin-space command is rendered as an ordinary space.
    "\\," => " ", "\\\\," => " ",
    // Note: removed "\\ " pattern as it interferes with literal backslashes

    // Degree symbol (both single and double backslash versions)
    "\\degree" => "°", "\\\\degree" => "°",
    "\\textdegree" => "°", "\\\\textdegree" => "°",

    // Copyright and related (both single and double backslash versions)
    "\\copyright" => "©", "\\\\copyright" => "©",
    "\\textcopyright" => "©", "\\\\textcopyright" => "©",
    "\\textregistered" => "®", "\\\\textregistered" => "®",
    "\\texttrademark" => "™", "\\\\texttrademark" => "™",

    // Currency (both single and double backslash versions)
    "\\pounds" => "£", "\\\\pounds" => "£",
    "\\textsterling" => "£", "\\\\textsterling" => "£",
};
283
284/// Convert LaTeX escape sequences to Unicode
285///
286/// This function performs a single pass through the string, replacing
287/// known LaTeX sequences with their Unicode equivalents.
288///
289/// # Performance
290///
291/// Uses a fast path for strings without LaTeX sequences to avoid
292/// unnecessary scanning and allocation.
293#[must_use]
294pub fn latex_to_unicode(input: &str) -> String {
295    // Fast path: if no backslashes or tildes, no LaTeX to convert
296    if !input.contains('\\') && !input.contains('~') {
297        return input.to_string();
298    }
299
300    let mut result = String::with_capacity(input.len());
301    let mut chars = input.char_indices();
302
303    while let Some((pos, ch)) = chars.next() {
304        if ch == '\\' {
305            // Look for the longest matching pattern starting at this position
306            let remaining = &input[pos..];
307
308            // Try to find the longest match
309            let mut best_match: Option<(&str, &str)> = None;
310
311            // Check all patterns, keeping the longest match
312            // First check LATEX_BRACED (usually longest)
313            for (pattern, replacement) in LATEX_BRACED.entries() {
314                if remaining.starts_with(pattern)
315                    && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
316                {
317                    best_match = Some((pattern, replacement));
318                }
319            }
320
321            // Then check LATEX_ACCENTS
322            for (pattern, replacement) in LATEX_ACCENTS.entries() {
323                if remaining.starts_with(pattern)
324                    && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
325                {
326                    best_match = Some((pattern, replacement));
327                }
328            }
329
330            // Then check LATEX_SYMBOLS
331            for (pattern, replacement) in LATEX_SYMBOLS.entries() {
332                if remaining.starts_with(pattern)
333                    && (best_match.is_none() || pattern.len() > best_match.unwrap().0.len())
334                {
335                    best_match = Some((pattern, replacement));
336                }
337            }
338
339            if let Some((pattern, replacement)) = best_match {
340                result.push_str(replacement);
341
342                // Skip the matched characters
343                for _ in 1..pattern.len() {
344                    chars.next();
345                }
346            } else {
347                // No pattern matched, keep the backslash
348                result.push(ch);
349            }
350        } else if ch == '~' {
351            // Check if this is a standalone tilde (not part of \~)
352            if is_url_path_tilde(input, pos) {
353                result.push(ch);
354            } else if pos == 0 || !input[..pos].ends_with('\\') {
355                // Non-breaking space
356                result.push(' ');
357            } else {
358                // Part of \~ sequence, keep it
359                result.push(ch);
360            }
361        } else {
362            result.push(ch);
363        }
364    }
365
366    result
367}
368
/// Report whether the `~` at byte offset `pos` sits inside a URL-like token.
///
/// The token is the text between the last whitespace character before `pos`
/// (or the start of `input`) and `pos` itself. It counts as URL-like when it
/// contains `://`.
///
/// `pos` must lie on a char boundary of `input`; slicing panics otherwise.
fn is_url_path_tilde(input: &str, pos: usize) -> bool {
    let before = &input[..pos];
    // `rsplit` cuts on whole characters, so multi-byte whitespace such as
    // U+00A0 or U+3000 can never leave the token starting inside a code point
    // (adding 1 to the byte index of such a character would panic on slicing).
    before
        .rsplit(char::is_whitespace)
        .next()
        .map_or(false, |token| token.contains("://"))
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(input, expected)` pair converts exactly as stated.
    fn check_all(cases: &[(&str, &str)]) {
        for &(source, expected) in cases {
            assert_eq!(latex_to_unicode(source), expected, "input: {:?}", source);
        }
    }

    #[test]
    fn test_basic_accents() {
        check_all(&[
            ("\\'e", "é"),
            ("\\'{e}", "é"),
            ("\\\"o", "ö"),
            ("\\\"{o}", "ö"),
            ("\\~n", "ñ"),
            ("\\^a", "â"),
            ("\\`u", "ù"),
        ]);
    }

    #[test]
    fn test_cedilla_and_ring() {
        check_all(&[
            ("\\c{c}", "ç"),
            ("\\c C", "Ç"),
            ("\\r{a}", "å"),
            ("\\r A", "Å"),
            ("\\aa", "å"),
            ("\\AA", "Å"),
        ]);
    }

    #[test]
    fn test_ligatures() {
        check_all(&[
            ("\\ae", "æ"),
            ("\\AE", "Æ"),
            ("\\oe", "œ"),
            ("\\ss", "ß"),
            ("\\o", "ø"),
            ("\\O", "Ø"),
        ]);
    }

    #[test]
    fn test_mixed_text() {
        check_all(&[
            ("Fran\\c{c}ois R\\'emi", "François Rémi"),
            (
                "M\\\"uller and Schr\\\"{o}dinger",
                "Müller and Schrödinger",
            ),
            ("Jos\\'e Garc\\'ia", "José García"),
        ]);
    }

    #[test]
    fn test_no_latex() {
        let plain = "This has no LaTeX";
        check_all(&[(plain, plain)]);
    }

    #[test]
    fn test_greek_letters() {
        check_all(&[
            ("\\alpha-\\beta", "α-β"),
            ("\\gamma \\delta", "γ δ"),
            ("\\Gamma\\Delta", "ΓΔ"),
        ]);
    }

    #[test]
    fn test_symbols() {
        check_all(&[
            ("\\ldots", "…"),
            ("\\&", "&"),
            ("\\%", "%"),
            ("\\copyright", "©"),
        ]);
    }

    #[test]
    fn test_mathematical_symbols() {
        check_all(&[
            ("\\leq", "≤"),
            ("\\geq", "≥"),
            ("\\neq", "≠"),
            ("\\pm", "±"),
            ("\\times", "×"),
        ]);
    }

    #[test]
    fn test_tildes() {
        check_all(&[
            // A lone tilde turns into a space
            ("word~word", "word word"),
            // ... unless it is part of a URL path
            (
                "https://example.org/~user/paper.pdf",
                "https://example.org/~user/paper.pdf",
            ),
            // A backslash-tilde is an accent command
            ("\\~n", "ñ"),
            // Both kinds in one string
            ("Se\\~nor~Garc\\'ia", "Señor García"),
        ]);
    }

    #[test]
    fn test_performance_fast_path() {
        // No backslash and no tilde: the input must come back unchanged
        let plain = "This is plain ASCII text with no LaTeX sequences whatsoever";
        check_all(&[(plain, plain)]);
    }

    #[test]
    fn test_complex_scientific_text() {
        let input = "The \\alpha-particle decay rate follows \\lambda \\propto e^{-\\gamma t}";
        // Braced math is not fully supported; this only pins the simple
        // command substitutions that do happen.
        let converted = latex_to_unicode(input);
        assert!(converted.contains("α"));
        assert!(converted.contains("λ"));
    }

    #[test]
    fn test_edge_cases() {
        check_all(&[
            // Truncated sequences pass through untouched
            ("\\", "\\"),
            ("\\'", "\\'"),
            ("\\'{", "\\'{"),
            // So do commands that are in none of the tables
            ("\\xyz", "\\xyz"),
            ("\\unknown{test}", "\\unknown{test}"),
            // Regression case: commands separated by plain words
            ("\\alpha and \\beta particles", "α and β particles"),
        ]);
    }
}
bibtex_parser/latex_unicode.rs

bibtex_parser/
latex_unicode.rs