1use std::ops::Range;
9
10use crate::error::{LatexError, LatexErrorKind, SourceSpan};
11use crate::parser::{
12 Accent, AccentKind, Atom, Delimited, Delimiter, Fraction, Group, MathBody, Node, NodeKind, ParseDiagnostic,
13 ParseDiagnosticKind, Script, ScriptArgument, ScriptBase, Sqrt, parse_math_body,
14};
15use crate::registry::{
16 LatexSourceFragment, OperatorWordKind, latex_symbol, lookup_command, math_alphabet_latex_command,
17 operator_word_latex_source, styled_operator_word_latex_source, unicode_sub_latex, unicode_sub_str,
18 unicode_super_latex, unicode_super_str, unicode_symbol_latex_source,
19};
20use crate::unicode_lexer::CombiningAccent;
21use crate::unicode_parser::{
22 Accent as UnicodeAccent, AccentTarget as UnicodeAccentTarget, ArrowDirection as UnicodeArrowDirection,
23 Group as UnicodeGroup, GroupDelimiter as UnicodeGroupDelimiter, LinearArrow as UnicodeLinearArrow,
24 Root as UnicodeRoot, Script as UnicodeScript, ScriptArgument as UnicodeScriptArgument,
25 ScriptBase as UnicodeScriptBase, UnicodeMathBody, UnicodeNode, UnicodeNodeKind, UnicodeParseDiagnostic,
26 UnicodeParseDiagnosticKind, parse_unicode_math_body_with_diagnostics,
27};
28
/// A record that a span of source could not be translated faithfully.
///
/// Each loss pairs the affected source span with a human-readable reason.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TranslationLoss {
    /// Source span the loss refers to.
    span: SourceSpan,
    /// Human-readable explanation of why the translation is lossy.
    reason: String,
}
35
36impl TranslationLoss {
37 fn new(span: SourceSpan, reason: impl Into<String>) -> Self {
38 Self {
39 span,
40 reason: reason.into(),
41 }
42 }
43
44 #[must_use]
46 pub const fn span(&self) -> SourceSpan {
47 self.span
48 }
49
50 #[must_use]
52 pub fn reason(&self) -> &str {
53 &self.reason
54 }
55}
56
/// Coarse outcome of a translation, as reported by `Translation::status`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TranslationStatus {
    /// No losses or diagnostics were recorded and the edit count is zero.
    Unchanged,
    /// No losses or diagnostics were recorded and at least one edit was made.
    Lossless,
    /// At least one loss or diagnostic was recorded.
    Lossy,
}
69
/// Result of translating a piece of math source between LaTeX and Unicode.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Translation {
    /// The translated text (equal to the input when nothing was changed).
    text: String,
    /// Number of edits applied: 0 or 1 for a whole-source translation, or the
    /// number of changed ranges for a range-based translation.
    edit_count: usize,
    /// Spans that could not be translated faithfully.
    losses: Vec<TranslationLoss>,
    /// Parse or validation errors reported while translating.
    diagnostics: Vec<LatexError>,
}
78
79impl Translation {
80 fn with_diagnostics(source: &str, diagnostics: Vec<LatexError>) -> Self {
81 Self {
82 text: source.to_owned(),
83 edit_count: 0,
84 losses: Vec::new(),
85 diagnostics,
86 }
87 }
88
89 fn from_parts(source: &str, text: String, losses: Vec<TranslationLoss>, diagnostics: Vec<LatexError>) -> Self {
90 Self {
91 edit_count: usize::from(text != source),
92 text,
93 losses,
94 diagnostics,
95 }
96 }
97
98 #[must_use]
100 pub fn text(&self) -> &str {
101 &self.text
102 }
103
104 #[must_use]
106 pub const fn edit_count(&self) -> usize {
107 self.edit_count
108 }
109
110 #[must_use]
112 pub fn losses(&self) -> &[TranslationLoss] {
113 &self.losses
114 }
115
116 #[must_use]
118 pub fn diagnostics(&self) -> &[LatexError] {
119 &self.diagnostics
120 }
121
122 #[must_use]
124 pub fn is_lossless(&self) -> bool {
125 self.losses.is_empty() && self.diagnostics.is_empty()
126 }
127
128 #[must_use]
130 pub fn status(&self) -> TranslationStatus {
131 if !self.is_lossless() {
132 TranslationStatus::Lossy
133 } else if self.edit_count == 0 {
134 TranslationStatus::Unchanged
135 } else {
136 TranslationStatus::Lossless
137 }
138 }
139}
140
141#[must_use]
143pub fn translate_latex_to_unicode(source: &str) -> Translation {
144 let body = match parse_math_body(source) {
145 Ok(body) => body,
146 Err(diagnostics) => return Translation::with_diagnostics(source, parse_errors(&diagnostics)),
147 };
148 let mut ctx = TranslateContext::new(source);
149 let text = ctx.translate_body_preserving_gaps(&body, 0, source.len());
150 Translation::from_parts(source, text, ctx.losses, ctx.diagnostics)
151}
152
153#[must_use]
155pub fn translate_unicode_to_latex(source: &str) -> Translation {
156 let parsed = parse_unicode_math_body_with_diagnostics(source);
157 let mut ctx = UnicodeEmitContext::new(source);
158 ctx.diagnostics
159 .extend(parsed.diagnostics.iter().map(unicode_parse_error));
160 let text = ctx.emit_body_preserving_gaps(&parsed.body, 0, source.len());
161 Translation::from_parts(source, text, ctx.losses, ctx.diagnostics)
162}
163
/// Working state for emitting LaTeX from a parsed Unicode math body.
struct UnicodeEmitContext<'src> {
    /// Original Unicode source; spans on parsed nodes index into this text.
    source: &'src str,
    /// Losses recorded for nodes that could not be emitted.
    losses: Vec<TranslationLoss>,
    /// Diagnostics collected for the translation.
    diagnostics: Vec<LatexError>,
}
169
170impl<'src> UnicodeEmitContext<'src> {
171 fn new(source: &'src str) -> Self {
172 Self {
173 source,
174 losses: Vec::new(),
175 diagnostics: Vec::new(),
176 }
177 }
178
179 fn emit_body_preserving_gaps(&mut self, body: &UnicodeMathBody<'_>, start: usize, end: usize) -> String {
180 let mut out = String::new();
181 let mut cursor = start;
182 let mut index = 0;
183 while let Some(node) = body.elements.get(index) {
184 if cursor < node.span.start() {
185 out.push_str(slice_or_empty(self.source, cursor..node.span.start()));
186 }
187 if preserves_following_group(&node.kind)
188 && let Some(next) = body.elements.get(index.saturating_add(1))
189 && let Some((preserved, span_end)) = self.emit_preserved_latex_argument(node, next)
190 {
191 out.push_str(&preserved);
192 cursor = span_end;
193 index = index.saturating_add(2);
194 continue;
195 }
196 out.push_str(&self.emit_node(node));
197 cursor = node.span.end();
198 index = index.saturating_add(1);
199 }
200 if cursor < end {
201 out.push_str(slice_or_empty(self.source, cursor..end));
202 }
203 out
204 }
205
206 fn emit_preserved_latex_argument(
207 &mut self,
208 command: &UnicodeNode<'_>,
209 argument: &UnicodeNode<'_>,
210 ) -> Option<(String, usize)> {
211 let mut out = self.emit_node(command);
212 match &argument.kind {
213 UnicodeNodeKind::Group(_) => {
214 if command.span.end() < argument.span.start() {
215 out.push_str(slice_or_empty(self.source, command.span.end()..argument.span.start()));
216 }
217 out.push_str(slice_or_empty(self.source, argument.span.as_range()));
218 Some((out, argument.span.end()))
219 }
220 UnicodeNodeKind::Script(script) => {
221 let base = script_group_base(script)?;
222 if command.span.end() < base.span.start() {
223 out.push_str(slice_or_empty(self.source, command.span.end()..base.span.start()));
224 }
225 out.push_str(slice_or_empty(self.source, base.span.as_range()));
226 out.push_str(&self.emit_script_suffix(script).ok()?);
227 Some((out, argument.span.end()))
228 }
229 UnicodeNodeKind::Plain(_)
230 | UnicodeNodeKind::Number(_)
231 | UnicodeNodeKind::Punctuation(_)
232 | UnicodeNodeKind::CanonicalSource(_)
233 | UnicodeNodeKind::DirectSymbol(_)
234 | UnicodeNodeKind::ExistingLatex(_)
235 | UnicodeNodeKind::StyledRun(_)
236 | UnicodeNodeKind::Accent(_)
237 | UnicodeNodeKind::Root(_)
238 | UnicodeNodeKind::LinearArrow(_)
239 | UnicodeNodeKind::Unknown(_) => None,
240 }
241 }
242
243 fn emit_node(&mut self, node: &UnicodeNode<'_>) -> String {
244 match self.try_emit_node(node) {
245 Ok(text) => text,
246 Err(reason) => {
247 self.losses.push(TranslationLoss::new(node.span, reason));
248 slice_or_empty(self.source, node.span.as_range()).to_owned()
249 }
250 }
251 }
252
253 fn try_emit_node(&mut self, node: &UnicodeNode<'_>) -> Result<String, String> {
254 match &node.kind {
255 UnicodeNodeKind::Plain(text) => Ok(self.emit_plain(text, node.span)),
256 UnicodeNodeKind::Number(text) | UnicodeNodeKind::Punctuation(text) => {
257 Ok(prime_source(text).unwrap_or(text).to_owned())
258 }
259 UnicodeNodeKind::CanonicalSource(text) => Ok(translate_unicode_to_latex(text).text().to_owned()),
260 UnicodeNodeKind::DirectSymbol(text) => self.emit_direct_symbol(text, node.span),
261 UnicodeNodeKind::ExistingLatex(text) => Ok(canonical_latex_passthrough(text).to_owned()),
262 UnicodeNodeKind::StyledRun(run) => Ok(self.emit_styled_run(run.style, &run.base, node.span)),
263 UnicodeNodeKind::Script(script) => self.emit_script(script),
264 UnicodeNodeKind::Accent(accent) => self.emit_accent(accent),
265 UnicodeNodeKind::Group(group) => Ok(self.emit_group(group)),
266 UnicodeNodeKind::Root(root) => self.emit_root(root),
267 UnicodeNodeKind::LinearArrow(arrow) => Ok(self.emit_linear_arrow(arrow)),
268 UnicodeNodeKind::Unknown(text) => Err(format!("unknown Unicode math source {text:?}")),
269 }
270 }
271
272 fn emit_plain(&self, text: &str, span: SourceSpan) -> String {
273 if let Some(source) = prime_source(text) {
274 return source.to_owned();
275 }
276 if let Some(info) = operator_word_latex_source(text) {
277 let source = match info.kind {
278 OperatorWordKind::BuiltInCommand | OperatorWordKind::OperatorName => info.source,
279 };
280 return self.latex_fragment(source, span.end());
281 }
282 text.to_owned()
283 }
284
285 fn emit_direct_symbol(&self, text: &str, span: SourceSpan) -> Result<String, String> {
286 if let Some(source) = prime_source(text) {
287 return Ok(source.to_owned());
288 }
289 let Some(fragment) = unicode_symbol_latex_source(text) else {
290 return Err(format!("Unicode symbol {text:?} has no canonical LaTeX source"));
291 };
292 Ok(self.latex_fragment(fragment, span.end()))
293 }
294
295 fn emit_styled_run(&self, style: crate::registry::MathAlphabetStyle, base: &str, span: SourceSpan) -> String {
296 if let Some(info) = styled_operator_word_latex_source(style, base) {
297 let source = match info.kind {
298 OperatorWordKind::BuiltInCommand | OperatorWordKind::OperatorName => info.source,
299 };
300 return self.latex_fragment(source, span.end());
301 }
302 if base.chars().any(|ch| !ch.is_ascii_alphanumeric()) {
303 return translate_unicode_to_latex(base).text().to_owned();
304 }
305 if let Some(command) = math_alphabet_latex_command(style) {
306 return format!(r"\{command}{{{base}}}");
307 }
308 base.to_owned()
309 }
310
311 fn emit_script(&mut self, script: &UnicodeScript<'_>) -> Result<String, String> {
312 if let Some(command) = directional_limit_script_command(script) {
313 return Ok(command.to_owned());
314 }
315 let mut out = self.emit_script_base(&script.base);
316 out.push_str(&self.emit_script_suffix(script)?);
317 Ok(out)
318 }
319
320 fn emit_script_suffix(&mut self, script: &UnicodeScript<'_>) -> Result<String, String> {
321 let mut out = String::new();
322 if let Some(superscript) = &script.superscript {
323 out.push_str("^{");
324 out.push_str(&self.emit_script_argument(superscript)?);
325 out.push('}');
326 }
327 if let Some(subscript) = &script.subscript {
328 out.push_str("_{");
329 out.push_str(&self.emit_script_argument(subscript)?);
330 out.push('}');
331 }
332 Ok(out)
333 }
334
335 fn emit_script_base(&mut self, base: &UnicodeScriptBase<'_>) -> String {
336 match base {
337 UnicodeScriptBase::Node(node) => self.emit_node(node),
338 UnicodeScriptBase::Empty(_) => "{}".to_owned(),
339 }
340 }
341
342 fn emit_script_argument(&mut self, argument: &UnicodeScriptArgument<'_>) -> Result<String, String> {
343 match argument {
344 UnicodeScriptArgument::Node(node) => Ok(self.emit_node(node)),
345 UnicodeScriptArgument::Group(group) => {
346 Ok(self.emit_body_preserving_gaps(&group.body, group.body.span.start(), group.body.span.end()))
347 }
348 UnicodeScriptArgument::ScriptRun { source, .. } => Ok(source.clone()),
349 }
350 }
351
352 fn emit_accent(&mut self, accent: &UnicodeAccent<'_>) -> Result<String, String> {
353 if let Some(command) = directional_limit_command(accent) {
354 return Ok(command.to_owned());
355 }
356 let command = accent_command(accent.accent);
357 let body = match &accent.target {
358 UnicodeAccentTarget::Node(node) => self.emit_node(node),
359 UnicodeAccentTarget::Group(group) => {
360 self.emit_body_preserving_gaps(&group.body, group.body.span.start(), group.body.span.end())
361 }
362 };
363 Ok(format!(r"\{command}{{{body}}}"))
364 }
365
366 fn emit_group(&mut self, group: &UnicodeGroup<'_>) -> String {
367 let mut out = String::new();
368 let (open, close) = group_delimiters(group.delimiter);
369 out.push(open);
370 out.push_str(&self.emit_body_preserving_gaps(&group.body, group.body.span.start(), group.body.span.end()));
371 out.push(close);
372 out
373 }
374
375 fn emit_root(&mut self, root: &UnicodeRoot<'_>) -> Result<String, String> {
376 let mut out = String::from(r"\sqrt");
377 if let Some(degree) = &root.degree {
378 out.push('[');
379 out.push_str(&self.emit_script_argument(degree)?);
380 out.push(']');
381 }
382 out.push('{');
383 out.push_str(&self.emit_node(&root.body));
384 out.push('}');
385 Ok(out)
386 }
387
388 fn emit_linear_arrow(&mut self, arrow: &UnicodeLinearArrow<'_>) -> String {
389 let unlabelled = match arrow.direction {
390 UnicodeArrowDirection::Left => r"\leftarrow",
391 UnicodeArrowDirection::Right => r"\to",
392 };
393 let labelled = match arrow.direction {
394 UnicodeArrowDirection::Left => r"\xleftarrow",
395 UnicodeArrowDirection::Right => r"\xrightarrow",
396 };
397 let Some(label) = &arrow.label else {
398 return unlabelled.to_owned();
399 };
400 let label_text = self.emit_arrow_label(label);
401 if label_text.is_empty() {
402 unlabelled.to_owned()
403 } else {
404 format!("{labelled}{{{label_text}}}")
405 }
406 }
407
408 fn emit_arrow_label(&mut self, label: &UnicodeMathBody<'_>) -> String {
409 if let [node] = label.elements.as_slice()
410 && let UnicodeNodeKind::Script(script) = &node.kind
411 && matches!(script.base, UnicodeScriptBase::Empty(_))
412 && script.subscript.is_none()
413 && let Some(superscript) = &script.superscript
414 {
415 return self
416 .emit_script_argument(superscript)
417 .unwrap_or_else(|_| self.emit_body_preserving_gaps(label, label.span.start(), label.span.end()));
418 }
419 self.emit_body_preserving_gaps(label, label.span.start(), label.span.end())
420 }
421
422 fn latex_fragment(&self, fragment: LatexSourceFragment, span_end: usize) -> String {
423 match fragment {
424 LatexSourceFragment::Command(command) => {
425 let mut out = String::from('\\');
426 out.push_str(command);
427 if next_char(self.source, span_end).is_some_and(needs_command_separator) {
428 out.push(' ');
429 }
430 out
431 }
432 LatexSourceFragment::Raw(source) => source.to_owned(),
433 }
434 }
435}
436
/// Maps a Unicode prime glyph (single through quadruple) to the matching run
/// of ASCII apostrophes; any other text yields `None`.
fn prime_source(text: &str) -> Option<&'static str> {
    const PRIMES: [(&str, &str); 4] = [("′", "'"), ("″", "''"), ("‴", "'''"), ("⁗", "''''")];
    PRIMES
        .iter()
        .find(|(glyph, _)| *glyph == text)
        .map(|&(_, apostrophes)| apostrophes)
}
446
/// Passes existing LaTeX through, except that a relation command followed by
/// a combining long solidus (U+0338) is rewritten to its negated command.
fn canonical_latex_passthrough(source: &str) -> &str {
    let negated = source
        .strip_prefix('\\')
        .and_then(|command| command.strip_suffix('\u{0338}'));
    match negated {
        Some("in") => r"\notin",
        Some("leq" | "le" | "leqslant") => r"\nleq",
        Some("geq" | "ge" | "geqslant") => r"\ngeq",
        Some("subset") => r"\nsubset",
        Some("supset") => r"\nsupset",
        Some("subseteq") => r"\nsubseteq",
        Some("supseteq") => r"\nsupseteq",
        _ => source,
    }
}
462
463fn preserves_following_group(kind: &UnicodeNodeKind<'_>) -> bool {
464 matches!(
465 kind,
466 UnicodeNodeKind::ExistingLatex(
467 r"\operatorname"
468 | r"\mathrm"
469 | r"\mathbf"
470 | r"\mathit"
471 | r"\mathsf"
472 | r"\mathtt"
473 | r"\mathcal"
474 | r"\mathfrak"
475 | r"\mathbb"
476 | r"\text"
477 )
478 )
479}
480
481fn script_group_base<'a>(script: &'a UnicodeScript<'_>) -> Option<&'a UnicodeNode<'a>> {
482 let UnicodeScriptBase::Node(base) = &script.base else {
483 return None;
484 };
485 if matches!(base.kind, UnicodeNodeKind::Group(_)) {
486 Some(base)
487 } else {
488 None
489 }
490}
491
492fn directional_limit_script_command(script: &UnicodeScript<'_>) -> Option<&'static str> {
493 if script.superscript.is_some() {
494 return None;
495 }
496 let UnicodeScriptBase::Node(base) = &script.base else {
497 return None;
498 };
499 if !matches!(&base.kind, UnicodeNodeKind::Plain("lim")) {
500 return None;
501 }
502 script
503 .subscript
504 .as_ref()
505 .and_then(directional_limit_script_argument_command)
506}
507
508fn directional_limit_script_argument_command(argument: &UnicodeScriptArgument<'_>) -> Option<&'static str> {
509 match argument {
510 UnicodeScriptArgument::Node(node) => directional_limit_node_command(node),
511 UnicodeScriptArgument::Group(group) => match group.body.elements.as_slice() {
512 [node] => directional_limit_node_command(node),
513 _ => None,
514 },
515 UnicodeScriptArgument::ScriptRun { source, .. } => directional_limit_source_command(source),
516 }
517}
518
519fn directional_limit_node_command(node: &UnicodeNode<'_>) -> Option<&'static str> {
520 match &node.kind {
521 UnicodeNodeKind::DirectSymbol(source)
522 | UnicodeNodeKind::Plain(source)
523 | UnicodeNodeKind::ExistingLatex(source) => directional_limit_source_command(source),
524 UnicodeNodeKind::CanonicalSource(source) => directional_limit_source_command(source),
525 UnicodeNodeKind::Number(_)
526 | UnicodeNodeKind::Punctuation(_)
527 | UnicodeNodeKind::StyledRun(_)
528 | UnicodeNodeKind::Script(_)
529 | UnicodeNodeKind::Accent(_)
530 | UnicodeNodeKind::Group(_)
531 | UnicodeNodeKind::Root(_)
532 | UnicodeNodeKind::LinearArrow(_)
533 | UnicodeNodeKind::Unknown(_) => None,
534 }
535}
536
/// Maps a right-arrow spelling to `\varinjlim` and a left-arrow spelling to
/// `\varprojlim`; any other source yields `None`.
fn directional_limit_source_command(source: &str) -> Option<&'static str> {
    let is_one_of = |spellings: &[&str]| spellings.iter().any(|spelling| *spelling == source);
    if is_one_of(&["→", r"\to", r"\rightarrow"]) {
        Some(r"\varinjlim")
    } else if is_one_of(&["←", r"\gets", r"\leftarrow"]) {
        Some(r"\varprojlim")
    } else {
        None
    }
}
544
545fn directional_limit_command(accent: &UnicodeAccent<'_>) -> Option<&'static str> {
546 let UnicodeAccentTarget::Node(node) = &accent.target else {
547 return None;
548 };
549 if !matches!(&node.kind, UnicodeNodeKind::Plain("lim")) {
550 return None;
551 }
552 match accent.accent {
553 CombiningAccent::Vec => Some(r"\varinjlim"),
554 CombiningAccent::Overleftarrow => Some(r"\varprojlim"),
555 CombiningAccent::Tilde
556 | CombiningAccent::Hat
557 | CombiningAccent::Check
558 | CombiningAccent::Bar
559 | CombiningAccent::Breve
560 | CombiningAccent::Dot
561 | CombiningAccent::Ddot
562 | CombiningAccent::Acute
563 | CombiningAccent::Grave
564 | CombiningAccent::Overleftrightarrow
565 | CombiningAccent::Overline => None,
566 }
567}
568
569fn accent_command(accent: CombiningAccent) -> &'static str {
570 match accent {
571 CombiningAccent::Tilde => "tilde",
572 CombiningAccent::Hat => "hat",
573 CombiningAccent::Check => "check",
574 CombiningAccent::Bar => "bar",
575 CombiningAccent::Breve => "breve",
576 CombiningAccent::Dot => "dot",
577 CombiningAccent::Ddot => "ddot",
578 CombiningAccent::Acute => "acute",
579 CombiningAccent::Grave => "grave",
580 CombiningAccent::Vec => "vec",
581 CombiningAccent::Overleftarrow => "overleftarrow",
582 CombiningAccent::Overleftrightarrow => "overleftrightarrow",
583 CombiningAccent::Overline => "overline",
584 }
585}
586
587fn group_delimiters(delimiter: UnicodeGroupDelimiter) -> (char, char) {
588 match delimiter {
589 UnicodeGroupDelimiter::Brace => ('{', '}'),
590 UnicodeGroupDelimiter::Bracket => ('[', ']'),
591 UnicodeGroupDelimiter::Parenthesis => ('(', ')'),
592 }
593}
594
595fn unicode_parse_error(diagnostic: &UnicodeParseDiagnostic) -> LatexError {
596 let kind = match diagnostic.kind() {
597 UnicodeParseDiagnosticKind::Lexical
598 | UnicodeParseDiagnosticKind::DetachedCombiningMark
599 | UnicodeParseDiagnosticKind::UnknownUnicodeSourceShape => LatexErrorKind::Lexical,
600 UnicodeParseDiagnosticKind::UnsupportedAccentTarget | UnicodeParseDiagnosticKind::UnprovenLinearArrow => {
601 LatexErrorKind::Unsupported
602 }
603 UnicodeParseDiagnosticKind::UnexpectedToken
604 | UnicodeParseDiagnosticKind::ScriptWithoutRepresentableBase
605 | UnicodeParseDiagnosticKind::DuplicateScript
606 | UnicodeParseDiagnosticKind::MalformedGroupedScript
607 | UnicodeParseDiagnosticKind::UnclosedGroup => LatexErrorKind::Syntax,
608 };
609 LatexError::new(kind, diagnostic.span(), diagnostic.message())
610}
611
/// Returns the character starting at byte offset `index`, or `None` when the
/// offset is past the end, at the end, or not on a character boundary.
fn next_char(source: &str, index: usize) -> Option<char> {
    source.get(index..).and_then(|rest| rest.chars().next())
}
615
616#[must_use]
621pub fn translate_latex_ranges_to_unicode(source: &str, ranges: &[Range<usize>]) -> Translation {
622 translate_ranges(source, ranges, translate_latex_to_unicode)
623}
624
625#[must_use]
630pub fn translate_unicode_ranges_to_latex(source: &str, ranges: &[Range<usize>]) -> Translation {
631 translate_ranges(source, ranges, translate_unicode_to_latex)
632}
633
/// Working state for translating a parsed LaTeX math body into Unicode.
struct TranslateContext<'src> {
    /// Original LaTeX source; spans on parsed nodes index into this text.
    source: &'src str,
    /// Losses recorded for nodes that could not be translated faithfully.
    losses: Vec<TranslationLoss>,
    /// Diagnostics collected for the translation.
    diagnostics: Vec<LatexError>,
}
639
640impl<'src> TranslateContext<'src> {
641 fn new(source: &'src str) -> Self {
642 Self {
643 source,
644 losses: Vec::new(),
645 diagnostics: Vec::new(),
646 }
647 }
648
649 fn translate_body_preserving_gaps(&mut self, body: &MathBody<'_>, start: usize, end: usize) -> String {
650 let mut out = String::new();
651 let mut cursor = start;
652 for node in &body.elements {
653 if cursor < node.span.start() {
654 out.push_str(slice_or_empty(self.source, cursor..node.span.start()));
655 }
656 out.push_str(&self.translate_node(node));
657 cursor = node.span.end();
658 }
659 if cursor < end {
660 out.push_str(slice_or_empty(self.source, cursor..end));
661 }
662 out
663 }
664
665 fn translate_node(&mut self, node: &Node<'_>) -> String {
666 match self.try_translate_node(node) {
667 Ok(text) => text,
668 Err(reason) => {
669 self.losses.push(TranslationLoss::new(node.span, reason));
670 slice_or_empty(self.source, node.span.as_range()).to_owned()
671 }
672 }
673 }
674
675 fn try_translate_node(&mut self, node: &Node<'_>) -> Result<String, String> {
676 match &node.kind {
677 NodeKind::Atom(atom) => self.translate_atom(*atom, node.span),
678 NodeKind::Group(group) => Ok(self.translate_group_preserving_delimiters(group)),
679 NodeKind::Fraction(fraction) => Self::translate_fraction(fraction),
680 NodeKind::Sqrt(sqrt) => self.translate_sqrt(sqrt),
681 NodeKind::Accent(accent) => self.translate_accent(accent),
682 NodeKind::Script(script) => self.translate_script(script),
683 NodeKind::Delimited(delimited) => Ok(self.translate_delimited(delimited)),
684 NodeKind::Environment(_) => Err("environment has no editable Unicode source form".to_owned()),
685 }
686 }
687
688 fn translate_atom(&mut self, atom: Atom<'_>, span: SourceSpan) -> Result<String, String> {
689 match atom {
690 Atom::Identifier(text) | Atom::Number(text) | Atom::Punctuation(text) | Atom::UnicodeSymbol(text) => {
691 Ok(text.to_owned())
692 }
693 Atom::ControlSymbol(text) => Ok(control_symbol_text(text).to_owned()),
694 Atom::Delimiter(delimiter) => Ok(delimiter_text(delimiter).to_owned()),
695 Atom::CommandSymbol(name) => {
696 let Some(symbol) = latex_symbol(name) else {
697 return Ok(String::new());
698 };
699 if let Some(command) = lookup_command(name)
700 && command.preferred() != name
701 {
702 self.losses.push(TranslationLoss::new(
703 span,
704 format!(
705 "alias `\\{name}` canonicalises to `\\{}` in reverse translation",
706 command.preferred()
707 ),
708 ));
709 }
710 Ok(symbol.to_owned())
711 }
712 }
713 }
714
715 fn translate_group_preserving_delimiters(&mut self, group: &Group<'_>) -> String {
716 let mut out = String::new();
717 out.push_str(slice_or_empty(self.source, group.span.start()..group.body.span.start()));
718 out.push_str(&self.translate_body_preserving_gaps(&group.body, group.body.span.start(), group.body.span.end()));
719 out.push_str(slice_or_empty(self.source, group.body.span.end()..group.span.end()));
720 out
721 }
722
723 fn translate_fraction(_fraction: &Fraction<'_>) -> Result<String, String> {
724 Err("fraction has no unambiguous editable Unicode source form".to_owned())
725 }
726
727 fn translate_sqrt(&mut self, sqrt: &Sqrt<'_>) -> Result<String, String> {
728 let mut out = String::new();
729 if let Some(degree) = &sqrt.degree {
730 let degree = self.translate_body_plain(°ree.body)?;
731 let Some(script) = unicode_super_str(°ree) else {
732 return Err("root degree has no Unicode superscript form".to_owned());
733 };
734 out.push_str(&script);
735 }
736 out.push('√');
737 out.push_str(&self.translate_body_plain(&sqrt.body.body)?);
738 Ok(out)
739 }
740
741 fn translate_accent(&mut self, accent: &Accent<'_>) -> Result<String, String> {
742 let body = self.translate_body_plain(&accent.body.body)?;
743 let mark = match accent.accent {
744 AccentKind::Hat => '\u{302}',
745 AccentKind::Bar => '\u{305}',
746 AccentKind::Tilde => '\u{303}',
747 AccentKind::Vec => '\u{20d7}',
748 };
749 if needs_grouped_unicode_accent_target(&body) {
750 return Ok(format!("{{{body}}}{mark}"));
751 }
752 let mut out = String::new();
753 for ch in body.chars() {
754 out.push(ch);
755 if !ch.is_whitespace() {
756 out.push(mark);
757 }
758 }
759 Ok(out)
760 }
761
762 fn translate_script(&mut self, script: &Script<'_>) -> Result<String, String> {
763 let mut out = self.translate_script_base(&script.base)?;
764 if let Some(subscript) = &script.subscript {
765 let text = self.translate_script_argument(subscript)?;
766 let Some(rendered) = unicode_sub_str(&text) else {
767 return Err(format!("subscript {text:?} has no Unicode source form"));
768 };
769 out.push_str(&rendered);
770 }
771 if let Some(superscript) = &script.superscript {
772 let text = self.translate_script_argument(superscript)?;
773 let Some(rendered) = unicode_super_str(&text) else {
774 return Err(format!("superscript {text:?} has no Unicode source form"));
775 };
776 out.push_str(&rendered);
777 }
778 Ok(out)
779 }
780
781 fn translate_script_base(&mut self, base: &ScriptBase<'_>) -> Result<String, String> {
782 match base {
783 ScriptBase::Atom(atom) => self.translate_atom(*atom, SourceSpan::new(0, 0)),
784 ScriptBase::Group(group) => self.translate_body_plain(&group.body),
785 ScriptBase::Sqrt(sqrt) => self.translate_sqrt(sqrt),
786 ScriptBase::Accent(accent) => self.translate_accent(accent),
787 ScriptBase::Delimited(delimited) => Ok(self.translate_delimited(delimited)),
788 ScriptBase::Fraction(_) => Err("scripted fraction has no Unicode source form".to_owned()),
789 }
790 }
791
792 fn translate_script_argument(&mut self, argument: &ScriptArgument<'_>) -> Result<String, String> {
793 match argument {
794 ScriptArgument::Atom { atom, span } => self.translate_atom(*atom, *span),
795 ScriptArgument::Group(group) => self.translate_body_plain(&group.body),
796 }
797 }
798
799 fn translate_delimited(&mut self, delimited: &Delimited<'_>) -> String {
800 let mut out = String::new();
801 out.push_str(delimiter_text(delimited.opener));
802 out.push_str(&self.translate_body_preserving_gaps(
803 &delimited.body,
804 delimited.body.span.start(),
805 delimited.body.span.end(),
806 ));
807 out.push_str(delimiter_text(delimited.closer));
808 out
809 }
810
811 fn translate_body_plain(&mut self, body: &MathBody<'_>) -> Result<String, String> {
812 let translated = self.translate_body_preserving_gaps(body, body.span.start(), body.span.end());
813 if translated.contains('\n') {
814 Err("multi-line math body has no compact Unicode source form".to_owned())
815 } else {
816 Ok(translated)
817 }
818 }
819}
820
/// Returns `true` when `body` has more than one non-whitespace character, in
/// which case an accent target is wrapped in braces.
fn needs_grouped_unicode_accent_target(body: &str) -> bool {
    let mut visible = body.chars().filter(|ch| !ch.is_whitespace());
    // A second visible character exists exactly when the count exceeds one.
    visible.nth(1).is_some()
}
824
825fn translate_ranges(source: &str, ranges: &[Range<usize>], translate_body: fn(&str) -> Translation) -> Translation {
826 if let Some(error) = validate_ranges(source, ranges) {
827 return Translation::with_diagnostics(source, vec![error]);
828 }
829
830 let mut out = String::with_capacity(source.len());
831 let mut cursor = 0usize;
832 let mut edit_count = 0usize;
833 let mut losses = Vec::new();
834 let mut diagnostics = Vec::new();
835 for range in ranges {
836 out.push_str(slice_or_empty(source, cursor..range.start));
837 let body = slice_or_empty(source, range.clone());
838 let translated = translate_body(body);
839 if translated.text() != body {
840 edit_count = edit_count.saturating_add(1);
841 }
842 losses.extend(shift_losses(translated.losses(), range.start));
843 diagnostics.extend(shift_diagnostics(translated.diagnostics(), range.start));
844 out.push_str(translated.text());
845 cursor = range.end;
846 }
847 out.push_str(slice_or_empty(source, cursor..source.len()));
848 Translation {
849 text: out,
850 edit_count,
851 losses,
852 diagnostics,
853 }
854}
855
856fn validate_ranges(source: &str, ranges: &[Range<usize>]) -> Option<LatexError> {
857 let mut end = 0usize;
858 for range in ranges {
859 if range.start < end {
860 return Some(LatexError::new(
861 LatexErrorKind::Syntax,
862 SourceSpan::new(range.start, range.end.min(source.len())),
863 "math body ranges must be sorted and non-overlapping",
864 ));
865 }
866 if range.start > range.end
867 || range.end > source.len()
868 || !source.is_char_boundary(range.start)
869 || !source.is_char_boundary(range.end)
870 {
871 return Some(LatexError::new(
872 LatexErrorKind::Syntax,
873 SourceSpan::new(range.start.min(source.len()), range.end.min(source.len())),
874 "math body range is not a valid UTF-8 source range",
875 ));
876 }
877 end = range.end;
878 }
879 None
880}
881
882fn shift_losses(losses: &[TranslationLoss], base: usize) -> impl Iterator<Item = TranslationLoss> + '_ {
883 losses.iter().map(move |loss| {
884 TranslationLoss::new(
885 SourceSpan::new(
886 loss.span.start().saturating_add(base),
887 loss.span.end().saturating_add(base),
888 ),
889 loss.reason.clone(),
890 )
891 })
892}
893
894fn shift_diagnostics(diagnostics: &[LatexError], base: usize) -> impl Iterator<Item = LatexError> + '_ {
895 diagnostics.iter().map(move |diagnostic| {
896 LatexError::new(
897 diagnostic.kind().clone(),
898 SourceSpan::new(
899 diagnostic.span().start().saturating_add(base),
900 diagnostic.span().end().saturating_add(base),
901 ),
902 diagnostic.message(),
903 )
904 })
905}
906
907fn parse_errors(diagnostics: &[ParseDiagnostic]) -> Vec<LatexError> {
908 diagnostics.iter().map(parse_error).collect()
909}
910
911fn parse_error(diagnostic: &ParseDiagnostic) -> LatexError {
912 let kind = match diagnostic.kind() {
913 ParseDiagnosticKind::Lexical => LatexErrorKind::Lexical,
914 ParseDiagnosticKind::UnsupportedCommand | ParseDiagnosticKind::UnsupportedEnvironment => {
915 LatexErrorKind::Unsupported
916 }
917 ParseDiagnosticKind::UnexpectedToken
918 | ParseDiagnosticKind::MissingRequiredArgument
919 | ParseDiagnosticKind::UnbalancedGroup
920 | ParseDiagnosticKind::UnmatchedEnvironmentEnd
921 | ParseDiagnosticKind::ScriptWithoutBase
922 | ParseDiagnosticKind::DuplicateSubscript
923 | ParseDiagnosticKind::DuplicateSuperscript => LatexErrorKind::Syntax,
924 };
925 LatexError::new(kind, diagnostic.span(), diagnostic.message())
926}
927
928fn delimiter_text(delimiter: Delimiter<'_>) -> &str {
929 match delimiter {
930 Delimiter::Source(".") => "",
931 Delimiter::Source(source) => source,
932 }
933}
934
/// Returns `source` without one leading backslash, or unchanged when it has none.
fn control_symbol_text(source: &str) -> &str {
    match source.strip_prefix('\\') {
        Some(unescaped) => unescaped,
        None => source,
    }
}
938
939fn needs_command_separator(next: char) -> bool {
940 next.is_alphanumeric() && unicode_super_latex(next).is_none() && unicode_sub_latex(next).is_none()
941}
942
/// Returns `source[range]`, or the empty string when the range is out of
/// bounds or does not fall on character boundaries.
fn slice_or_empty(source: &str, range: Range<usize>) -> &str {
    source.get(range).unwrap_or_default()
}
946
947#[cfg(test)]
948mod tests {
949 #![allow(
950 clippy::indexing_slicing,
951 clippy::literal_string_with_formatting_args,
952 clippy::unicode_not_nfc,
953 reason = "translation tests inspect exact source output"
954 )]
955
956 use super::*;
957
958 #[test]
959 fn latex_to_unicode_translates_direct_commands_and_scripts() {
960 let translated = translate_latex_to_unicode(r"\alpha_i + x^{2} \to \beta");
961
962 assert_eq!(translated.text(), "αᵢ + x² → β");
963 assert_eq!(translated.status(), TranslationStatus::Lossless);
964 assert_eq!(translated.edit_count(), 1);
965 }
966
967 #[test]
968 fn unicode_to_latex_uses_preferred_spellings_and_script_groups() {
969 let translated = translate_unicode_to_latex("αᵢ ≤ x² → β");
970
971 assert_eq!(translated.text(), r"\alpha_{i} \leq x^{2} \to \beta");
972 assert_eq!(translated.status(), TranslationStatus::Lossless);
973 }
974
975 #[test]
976 fn script_translation_laws_hold_for_supported_vocabulary() {
977 let unicode = translate_latex_to_unicode(r"\alpha_i");
978 assert_eq!(unicode.text(), "αᵢ");
979
980 let latex = translate_unicode_to_latex(unicode.text());
981 assert_eq!(latex.text(), r"\alpha_{i}");
982
983 let unicode_again = translate_latex_to_unicode(latex.text());
984 assert_eq!(unicode_again.text(), "αᵢ");
985 }
986
987 #[test]
988 fn latex_accents_over_compound_targets_preserve_unicode_ownership() {
989 let unicode = translate_latex_to_unicode(r"\bar{Y'}");
990 assert_eq!(unicode.text(), "{Y'}\u{305}");
991
992 let latex = translate_unicode_to_latex(unicode.text());
993 assert_eq!(latex.text(), r"\bar{Y'}");
994 }
995
996 #[test]
997 fn aliases_translate_to_unicode_with_reverse_canonicalisation_loss() {
998 let translated = translate_latex_to_unicode(r"\le");
999
1000 assert_eq!(translated.text(), "≤");
1001 assert_eq!(translated.status(), TranslationStatus::Lossy);
1002 assert_eq!(
1003 translated.losses()[0].reason(),
1004 r"alias `\le` canonicalises to `\leq` in reverse translation"
1005 );
1006 }
1007
1008 #[test]
1009 fn unsupported_commands_return_diagnostics_without_regex_replacement() {
1010 let translated = translate_latex_to_unicode(r"\alphabeta + \color{red}{x}");
1011
1012 assert_eq!(translated.text(), r"\alphabeta + \color{red}{x}");
1013 assert_eq!(translated.status(), TranslationStatus::Lossy);
1014 assert_eq!(translated.diagnostics().len(), 2);
1015 assert!(
1016 translated
1017 .diagnostics()
1018 .iter()
1019 .all(|diagnostic| diagnostic.kind() == &LatexErrorKind::Unsupported)
1020 );
1021 }
1022
1023 #[test]
1024 fn structural_forms_without_honest_source_shape_remain_visible() {
1025 let translated = translate_latex_to_unicode(r"\frac{a}{b} + \sqrt[n]{x}");
1026
1027 assert_eq!(translated.text(), r"\frac{a}{b} + ⁿ√x");
1028 assert_eq!(translated.status(), TranslationStatus::Lossy);
1029 assert_eq!(
1030 translated.losses()[0].reason(),
1031 "fraction has no unambiguous editable Unicode source form"
1032 );
1033 }
1034
1035 #[test]
1036 fn unicode_roots_translate_to_preferred_latex_source() {
1037 assert_eq!(translate_unicode_to_latex("√x").text(), r"\sqrt{x}");
1038 assert_eq!(translate_unicode_to_latex("ⁿ√x").text(), r"\sqrt[n]{x}");
1039 assert_eq!(translate_unicode_to_latex("√x²").text(), r"\sqrt{x^{2}}");
1040 }
1041
1042 #[test]
1043 fn unicode_to_latex_preserves_visible_latex_fallback_syntax() {
1044 assert_eq!(translate_unicode_to_latex(r"\frac{a}{b}").text(), r"\frac{a}{b}");
1045 assert_eq!(
1046 translate_unicode_to_latex(r"x_{n} + \color{red}{x}").text(),
1047 r"x_{n} + \color{red}{x}"
1048 );
1049 assert_eq!(
1050 translate_unicode_to_latex(r"\mathrm{Hom}(X,Y)").text(),
1051 r"\mathrm{Hom}(X,Y)"
1052 );
1053 assert_eq!(
1054 translate_unicode_to_latex(r"\mathrm{Div}^+_X").text(),
1055 r"\mathrm{Div}^{+}_{X}"
1056 );
1057 }
1058
1059 #[test]
1060 fn unicode_to_latex_normalizes_ascii_style_bracket_scripts() {
1061 assert_eq!(translate_unicode_to_latex("M_[φ]").text(), r"M_{\phi}");
1062 assert_eq!(translate_unicode_to_latex("x^(n)").text(), r"x^{n}");
1063 }
1064
1065 #[test]
1066 fn unicode_to_latex_normalizes_prime_suffixes() {
1067 assert_eq!(translate_unicode_to_latex("A′").text(), "A'");
1068 assert_eq!(translate_unicode_to_latex("𝔭′").text(), r"\mathfrak{p}'");
1069 assert_eq!(translate_unicode_to_latex("A″").text(), "A''");
1070 }
1071
1072 #[test]
1073 fn unicode_to_latex_normalizes_math_alphabets() {
1074 assert_eq!(translate_unicode_to_latex("𝒪_X").text(), r"\mathcal{O}_{X}");
1075 assert_eq!(translate_unicode_to_latex("ℱ(U)").text(), r"\mathcal{F}(U)");
1076 assert_eq!(translate_unicode_to_latex("𝔭").text(), r"\mathfrak{p}");
1077 assert_eq!(translate_unicode_to_latex("𝔏").text(), r"\mathfrak{L}");
1078 assert_eq!(translate_unicode_to_latex("𝔍").text(), r"\mathfrak{J}");
1079 assert_eq!(translate_unicode_to_latex("ℤ").text(), r"\mathbb{Z}");
1080 assert_eq!(translate_unicode_to_latex("𝓗𝓸𝓶").text(), r"\operatorname{Hom}");
1081 assert_eq!(translate_unicode_to_latex("𝓟𝓻𝓸𝓳").text(), r"\operatorname{Proj}");
1082 assert_eq!(translate_unicode_to_latex("𝒟ℯ𝓇").text(), r"\operatorname{Der}");
1083 assert_eq!(translate_unicode_to_latex("𝚪_*").text(), r"\Gamma_{*}");
1084 assert_eq!(translate_unicode_to_latex("𝐒").text(), r"\mathbf{S}");
1085 assert_eq!(translate_unicode_to_latex("𝐕").text(), r"\mathbf{V}");
1086 assert_eq!(translate_unicode_to_latex("𝐗").text(), r"\mathbf{X}");
1087 assert_eq!(translate_unicode_to_latex("𝐟𝐠").text(), r"\mathbf{fg}");
1088 assert_eq!(translate_unicode_to_latex("𝔭𝔮").text(), r"\mathfrak{pq}");
1089 assert_eq!(translate_unicode_to_latex("𝓕𝓸𝓸").text(), r"\mathcal{Foo}");
1090 }
1091
1092 #[test]
1093 fn unicode_to_latex_normalizes_operator_words() {
1094 assert_eq!(translate_unicode_to_latex("log(q)").text(), r"\log(q)");
1095 assert_eq!(translate_unicode_to_latex("6 · log(q)").text(), r"6 \cdot \log(q)");
1096 assert_eq!(translate_unicode_to_latex("Spec(A)").text(), r"\operatorname{Spec}(A)");
1097 assert_eq!(translate_unicode_to_latex("Proj(A)").text(), r"\operatorname{Proj}(A)");
1098 assert_eq!(
1099 translate_unicode_to_latex("Hom(A,B)").text(),
1100 r"\operatorname{Hom}(A,B)"
1101 );
1102 assert_eq!(
1103 translate_unicode_to_latex("Gal(L/K)").text(),
1104 r"\operatorname{Gal}(L/K)"
1105 );
1106 assert_eq!(translate_unicode_to_latex("Idem(A)").text(), r"\operatorname{Idem}(A)");
1107 assert_eq!(translate_unicode_to_latex("Thing(A)").text(), "Thing(A)");
1108 assert_eq!(translate_unicode_to_latex("𝓣𝓱𝓲𝓷𝓰").text(), r"\mathcal{Thing}");
1109 }
1110
    /// Accented letters and accented braced groups map to LaTeX accent commands.
    ///
    /// The literals below are deliberately kept byte-for-byte (the module allows
    /// `clippy::unicode_not_nfc`), because the exact code-point sequence is what
    /// the translator is being tested on.
    #[test]
    fn unicode_to_latex_normalizes_combining_accents() {
        // An accent on a single letter or on a braced group wraps that target.
        assert_eq!(translate_unicode_to_latex("M̃").text(), r"\tilde{M}");
        assert_eq!(translate_unicode_to_latex("Ω̂").text(), r"\hat{\Omega}");
        assert_eq!(translate_unicode_to_latex("Ĉ").text(), r"\hat{C}");
        assert_eq!(translate_unicode_to_latex("{x}̅").text(), r"\bar{x}");
        // A prime after the accented letter stays outside the accent; a prime
        // inside the braced group stays inside it.
        assert_eq!(translate_unicode_to_latex("Ȳ'").text(), r"\bar{Y}'");
        assert_eq!(translate_unicode_to_latex("{Y'}̄").text(), r"\bar{Y'}");
        // A bar placed after an unbraced prime is not translated: the input is
        // returned unchanged and reported as unsupported.
        let prime_bar = translate_unicode_to_latex("Y'̄");
        assert_eq!(prime_bar.text(), "Y'̄");
        assert_eq!(prime_bar.status(), TranslationStatus::Lossy);
        assert_eq!(prime_bar.diagnostics()[0].kind(), &LatexErrorKind::Unsupported);
        assert_eq!(translate_unicode_to_latex("ũ").text(), r"\tilde{u}");
        assert_eq!(translate_unicode_to_latex("ẑ").text(), r"\hat{z}");
        assert_eq!(translate_unicode_to_latex("c̄").text(), r"\bar{c}");
        // Arrow marks on `lim` select the directed-limit commands.
        assert_eq!(translate_unicode_to_latex("lim⃗").text(), r"\varinjlim");
        assert_eq!(translate_unicode_to_latex("lim⃖").text(), r"\varprojlim");
    }
1129
    /// `lim` with an arrow subscript, or with an arrow mark attached, becomes
    /// `\varinjlim` (rightward) or `\varprojlim` (leftward); the rest of the
    /// expression is translated as usual.
    #[test]
    fn unicode_to_latex_normalizes_directional_limit_scripts() {
        // Unicode arrow as the subscript.
        assert_eq!(translate_unicode_to_latex("lim_→ A_t").text(), r"\varinjlim A_{t}");
        assert_eq!(translate_unicode_to_latex("lim_← H^n").text(), r"\varprojlim H^{n}");
        assert_eq!(
            translate_unicode_to_latex("lim_← H^n(𝔛, ℱ_k)").text(),
            r"\varprojlim H^{n}(\mathfrak{X}, \mathcal{F}_{k})"
        );
        // LaTeX arrow command as the subscript.
        assert_eq!(translate_unicode_to_latex(r"lim_\to A_t").text(), r"\varinjlim A_{t}");
        assert_eq!(
            translate_unicode_to_latex(r"lim_\leftarrow H^n").text(),
            r"\varprojlim H^{n}"
        );
        // Arrow mark attached directly to `lim`; these literals must stay
        // byte-for-byte as written.
        assert_eq!(translate_unicode_to_latex("lim⃗ M_n").text(), r"\varinjlim M_{n}");
        assert_eq!(translate_unicode_to_latex("lim⃖ M_n").text(), r"\varprojlim M_{n}");
    }
1146
1147 #[test]
1148 fn unicode_to_latex_normalizes_extended_scripts() {
1149 assert_eq!(translate_unicode_to_latex("iˢ_A").text(), r"i^{s}_{A}");
1150 assert_eq!(translate_unicode_to_latex("iᵀ_M").text(), r"i^{T}_{M}");
1151 assert_eq!(translate_unicode_to_latex(r"ᵃ\phi").text(), r"{}^{a}\phi");
1152 assert_eq!(translate_unicode_to_latex("D₊").text(), r"D_{+}");
1153 assert_eq!(translate_unicode_to_latex("xᵐ").text(), r"x^{m}");
1154 assert_eq!(translate_unicode_to_latex("Aᵖ").text(), r"A^{p}");
1155 assert_eq!(translate_unicode_to_latex("A_𝔭").text(), r"A_{\mathfrak{p}}");
1156 assert_eq!(translate_unicode_to_latex("A_𝔭𝔮").text(), r"A_{\mathfrak{pq}}");
1157 assert_eq!(translate_unicode_to_latex("C/𝔏").text(), r"C/\mathfrak{L}");
1158
1159 let dangling = translate_unicode_to_latex("((A'_i)_{𝔪'})^");
1160 assert_eq!(dangling.text(), r"((A'_{i})_{\mathfrak{m}'})^");
1161 assert_eq!(dangling.status(), TranslationStatus::Lossy);
1162 assert_eq!(dangling.diagnostics()[0].kind(), &LatexErrorKind::Syntax);
1163 }
1164
1165 #[test]
1166 fn unicode_to_latex_normalizes_common_operator_fragments() {
1167 assert_eq!(translate_unicode_to_latex("a · m").text(), r"a \cdot m");
1168 assert_eq!(translate_unicode_to_latex("A − 𝔭").text(), r"A - \mathfrak{p}");
1169 assert_eq!(translate_unicode_to_latex("A ⥲ B").text(), r"A \xrightarrow{\sim} B");
1170 assert_eq!(translate_unicode_to_latex("A ⟺ B").text(), r"A \Longleftrightarrow B");
1171 assert_eq!(
1172 translate_unicode_to_latex("a ⩾ b ⩽ c").text(),
1173 r"a \geqslant b \leqslant c"
1174 );
1175 assert_eq!(translate_unicode_to_latex("a ⋯ b … c").text(), r"a \cdots b \cdots c");
1176 assert_eq!(
1177 translate_unicode_to_latex("⨁ A □ ∁B").text(),
1178 r"\bigoplus A \square \complement B"
1179 );
1180 assert_eq!(translate_unicode_to_latex("A ↔ B").text(), r"A \leftrightarrow B");
1181 assert_eq!(
1182 translate_unicode_to_latex("f♯ ⊠ g♭ ⊔ h♮").text(),
1183 r"f\sharp \boxtimes g\flat \sqcup h\natural"
1184 );
1185 assert_eq!(
1186 translate_unicode_to_latex("A ↠ B ≀ C").text(),
1187 r"A \twoheadrightarrow B \wr C"
1188 );
1189 assert_eq!(
1190 translate_unicode_to_latex("A ⊉ B ⊄ C ⊀ D").text(),
1191 r"A \nsupseteq B \nsubset C \nprec D"
1192 );
1193 assert_eq!(
1194 translate_unicode_to_latex(concat!(
1195 "a ",
1196 "\u{227A}\u{0338}",
1197 " b ",
1198 "\u{2A7D}\u{0338}",
1199 " c ≽ d ≼ e ≫ f"
1200 ))
1201 .text(),
1202 r"a \nprec b \nleqslant c \succeq d \preceq e \gg f"
1203 );
1204 assert_eq!(
1205 translate_unicode_to_latex("codim(‾{x}, S)").text(),
1206 r"codim(\overline{x}, S)"
1207 );
1208 assert_eq!(translate_unicode_to_latex("‾K").text(), r"\overline{K}");
1209 assert_eq!(translate_unicode_to_latex("ℎ^{q} ℴ").text(), r"h^{q} o");
1210 assert_eq!(translate_unicode_to_latex("X°_{y}").text(), r"X^{\circ}_{y}");
1211 assert_eq!(
1212 translate_unicode_to_latex("A ⨂ B ↝ C").text(),
1213 r"A \bigotimes B \rightsquigarrow C"
1214 );
1215 assert_eq!(translate_unicode_to_latex("⋂ A").text(), r"\bigcap A");
1216 assert_eq!(
1217 translate_unicode_to_latex(concat!("\\supset", "\u{0338}", " S")).text(),
1218 r"\nsupset S"
1219 );
1220 assert_eq!(
1221 translate_unicode_to_latex(concat!("\\leqslant", "\u{0338}", r" \lambda")).text(),
1222 r"\nleq \lambda"
1223 );
1224 }
1225
1226 #[test]
1227 fn unicode_to_latex_normalizes_linear_arrow_notation() {
1228 assert_eq!(translate_unicode_to_latex("A ─u→ B").text(), r"A \xrightarrow{u} B");
1229 assert_eq!(translate_unicode_to_latex("A ←u─ B").text(), r"A \xleftarrow{u} B");
1230 assert_eq!(translate_unicode_to_latex("A ——→ B").text(), r"A \to B");
1231 assert_eq!(translate_unicode_to_latex("A ─^{u}→ B").text(), r"A \xrightarrow{u} B");
1232 assert_eq!(translate_unicode_to_latex("A ←^{u}─ B").text(), r"A \xleftarrow{u} B");
1233 assert_eq!(
1234 translate_unicode_to_latex("A ──φ^{S′}──→ B").text(),
1235 r"A \xrightarrow{\phi^{S'}} B"
1236 );
1237 assert_eq!(translate_unicode_to_latex("A ─────→ B").text(), r"A \to B");
1238 assert_eq!(
1239 translate_unicode_to_latex("Ω ──u──▸ Ω'").text(),
1240 r"\Omega \xrightarrow{u} \Omega'"
1241 );
1242 assert_eq!(
1243 translate_unicode_to_latex("U ─j × 1→ X").text(),
1244 r"U \xrightarrow{j \times 1} X"
1245 );
1246 assert_eq!(translate_unicode_to_latex("A ⤏ B").text(), r"A \dashrightarrow B");
1247 }
1248
1249 #[test]
1250 fn unsupported_unicode_remains_visible_and_lossy() {
1251 let translated = translate_unicode_to_latex("A ⥪ B");
1252
1253 assert_eq!(translated.text(), "A ⥪ B");
1254 assert_eq!(translated.status(), TranslationStatus::Lossy);
1255 assert_eq!(translated.losses().len(), 1);
1256 assert_eq!(translated.diagnostics()[0].kind(), &LatexErrorKind::Lexical);
1257 }
1258
1259 #[test]
1260 fn unicode_to_latex_corpus_forms_reach_latex_fixed_point() {
1261 for source in [
1262 "M_[φ]",
1263 "A′",
1264 "S⁻¹A",
1265 "𝒪_X",
1266 "a · m",
1267 "A ⥲ B",
1268 "A_𝔭",
1269 "C/𝔏",
1270 "lim_← H^n(𝔛, ℱ_k)",
1271 "U ─j × 1→ X",
1272 "f♯ ⊠ g♭",
1273 "codim(‾{x}, S)",
1274 ] {
1275 let latex = translate_unicode_to_latex(source);
1276 let unicode = translate_latex_to_unicode(latex.text());
1277 let latex_again = translate_unicode_to_latex(unicode.text());
1278 assert_eq!(latex_again.text(), latex.text(), "source {source:?}");
1279 }
1280 }
1281
1282 #[test]
1283 fn span_translation_preserves_markdown_delimiters() {
1284 let source = r"Inline \( \alpha_i \) and \( x^{2} \).";
1285 let first = r"Inline \( ".len();
1286 let second = r"Inline \( \alpha_i \) and \[ ".len();
1287 let translated = translate_latex_ranges_to_unicode(
1288 source,
1289 &[first..first + r"\alpha_i".len(), second..second + "x^{2}".len()],
1290 );
1291
1292 assert_eq!(translated.text(), r"Inline \( αᵢ \) and \( x² \).");
1293 assert_eq!(translated.edit_count(), 2);
1294 assert_eq!(translated.status(), TranslationStatus::Lossless);
1295 }
1296
1297 #[test]
1298 fn invalid_span_sets_do_not_rewrite_source() {
1299 let range = 1..2;
1300 let translated = translate_latex_ranges_to_unicode("αβ", std::slice::from_ref(&range));
1301
1302 assert_eq!(translated.text(), "αβ");
1303 assert_eq!(translated.status(), TranslationStatus::Lossy);
1304 assert_eq!(translated.diagnostics()[0].kind(), &LatexErrorKind::Syntax);
1305 }
1306
1307 #[test]
1308 fn scanner_backed_unicode_normalizer_is_deleted() {
1309 let deleted_type = ["Unicode", "Latex", "Normalizer"].concat();
1310 assert!(!include_str!("translation.rs").contains(&deleted_type));
1311 }
1312}