libmathcat/braille.rs
1#![allow(clippy::needless_return)]
2use strum_macros::Display;
3use sxd_document::dom::{Element, ChildOfElement};
4use sxd_document::Package;
5use crate::definitions::SPEECH_DEFINITIONS;
6use crate::errors::*;
7use crate::pretty_print::mml_to_string;
8use crate::prefs::PreferenceManager;
9use std::cell::Ref;
10use regex::{Captures, Regex, RegexSet};
11use phf::{phf_map, phf_set};
12use crate::speech::{BRAILLE_RULES, SpeechRulesWithContext, braille_replace_chars, make_quoted_string};
13use crate::canonicalize::get_parent;
14use std::borrow::Cow;
15use std::ops::Range;
16
/// Braille cells that `i_start_ueb()` counts as indicator prefixes when it walks backwards from the
/// first highlighted cell to find where the highlight should start.
static UEB_PREFIXES: phf::Set<char> = phf_set! {
    '⠼', '⠈', '⠘', '⠸', '⠐', '⠨', '⠰', '⠠',
};
20
/// Fetch the braille cell that begins at byte offset `index` of `braille`.
///
/// Chars in the Unicode braille block always take 3 bytes in UTF-8, so the cell occupies
/// `index..index+3`. Panics if that range is not a char-aligned slice of `braille`.
fn braille_at(braille: &str, index: usize) -> char {
    let cell = &braille[index..index + 3];
    cell.chars().next().unwrap()
}
27
28/// braille the MathML
29/// If 'nav_node_id' is not an empty string, then the element with that id will have dots 7 & 8 turned on as per the pref
30/// Returns the braille string (highlighted) along with the *character* start/end of the highlight (whole string if no highlight)
31pub fn braille_mathml(mathml: Element, nav_node_id: &str) -> Result<(String, usize, usize)> {
32 return BRAILLE_RULES.with(|rules| {
33 rules.borrow_mut().read_files()?;
34 let rules = rules.borrow();
35 let new_package = Package::new();
36 let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id);
37 let braille_string = rules_with_context.match_pattern::<String>(mathml)
38 .chain_err(|| "Pattern match/replacement failure!")?;
39 // debug!("braille_mathml: braille string: {}", &braille_string);
40 let braille_string = braille_string.replace(' ', "");
41 let pref_manager = rules_with_context.get_rules().pref_manager.borrow();
42 let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight");
43 let braille_code = pref_manager.pref_to_string("BrailleCode");
44 let braille = match braille_code.as_str() {
45 "Nemeth" => nemeth_cleanup(pref_manager, braille_string),
46 "UEB" => ueb_cleanup(pref_manager, braille_string),
47 "Vietnam" => vietnam_cleanup(pref_manager, braille_string),
48 "CMU" => cmu_cleanup(pref_manager, braille_string),
49 "Finnish" => finnish_cleanup(pref_manager, braille_string),
50 "Swedish" => swedish_cleanup(pref_manager, braille_string),
51 "LaTeX" => LaTeX_cleanup(pref_manager, braille_string),
52 "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string),
53 _ => braille_string.trim_matches('⠀').to_string(), // probably needs cleanup if someone has another code, but this will have to get added by hand
54 };
55
56 return Ok(
57 if highlight_style != "Off" {
58 highlight_braille_chars(braille, &braille_code, highlight_style == "All")
59 } else {
60 let end = braille.len()/3;
61 (braille, 0, end)
62 }
63 );
64 });
65
66 /// highlight with dots 7 & 8 based on the highlight style
67 /// both the start and stop points will be extended to deal with indicators such as capitalization
68 /// if 'fill_range' is true, the interior will be highlighted
69 /// Returns the braille string (highlighted) along with the [start, end) *character* of the highlight (whole string if no highlight)
70 fn highlight_braille_chars(braille: String, braille_code: &str, fill_range: bool) -> (String, usize, usize) {
71 let mut braille = braille;
72 // some special (non-braille) chars weren't converted to having dots 7 & 8 to indicate navigation position
73 // they need to be added to the start
74
75 // find start and end (byte) indexes of the highlighted region (braille chars have length=3 bytes)
76 let start = braille.find(is_highlighted);
77 let end = braille.rfind(is_highlighted);
78 if start.is_none() {
79 assert!(end.is_none());
80 let end = braille.len();
81 return (braille, 0, end/3);
82 };
83
84 let start = start.unwrap();
85 let mut end = end.unwrap() + 3; // always exists if start exists ('end' is exclusive)
86 // debug!("braille highlight: start/end={}/{}; braille={}", start/3, end/3, braille);
87 let mut start = highlight_first_indicator(&mut braille, braille_code, start, end);
88 if let Some(new_range) = expand_highlight(&mut braille, braille_code, start, end) {
89 (start, end) = new_range
90 }
91
92 if start == end {
93 return (braille, start/3, end/3);
94 }
95
96 if !fill_range {
97 return (braille, start/3, end/3);
98 }
99
100 let mut result = String::with_capacity(braille.len());
101 result.push_str(&braille[..start]);
102 let highlight_region =&mut braille[start..end];
103 for ch in highlight_region.chars() {
104 result.push( highlight(ch) );
105 };
106 result.push_str(&braille[end..]);
107 return (result, start/3, end/3);
108
109 /// Return the byte index of the first place to highlight
110 fn highlight_first_indicator(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> usize {
111 // chars in the braille block range use 3 bytes -- we can use that to optimize the code some
112 let first_ch = unhighlight(braille_at(braille, start_index));
113
114 // need to highlight (optional) capital/number, language, and style (max 2 chars) also in that (rev) order
115 let mut prefix_ch_index = std::cmp::max(0, start_index as isize - 5*3) as usize;
116 if prefix_ch_index == 0 && braille_code == "UEB" {
117 // don't count the word or passage mode as part of a indicator
118 if braille.starts_with("⠰⠰⠰") {
119 prefix_ch_index = 9;
120 } else if braille.starts_with("⠰⠰") {
121 prefix_ch_index = 6;
122 }
123 }
124 let indicators = &braille[prefix_ch_index..start_index]; // chars to be examined
125 let i_byte_start = start_index - 3 * match braille_code {
126 "Nemeth" => i_start_nemeth(indicators, first_ch),
127 _ => i_start_ueb(indicators), // treat all the other like UEB because they probably have similar number and letter prefixes
128 };
129 if i_byte_start < start_index {
130 // remove old highlight as long as we don't wipe out the end highlight
131 if start_index < end_index {
132 let old_first_char_bytes = start_index..start_index+3;
133 let replacement_str = unhighlight(braille_at(braille, start_index)).to_string();
134 braille.replace_range(old_first_char_bytes, &replacement_str);
135 }
136
137 // add new highlight
138 let new_first_char_bytes = i_byte_start..i_byte_start+3;
139 let replacement_str = highlight(braille_at(braille, i_byte_start)).to_string();
140 braille.replace_range(new_first_char_bytes, &replacement_str);
141 }
142
143 return i_byte_start;
144 }
145
146 /// Return the byte indexes of the first and last place to highlight
147 /// Currently, this only does something for CMU braille
148 fn expand_highlight(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> Option<(usize, usize)> {
149 // For CMU, we want to expand mrows to include the opening and closing grouping indicators if they exist
150 if start_index == 0 || end_index == braille.len() || braille_code != "CMU" {
151 return None;
152 }
153
154 let first_ch = unhighlight(braille_at(braille, start_index));
155 let last_ch = unhighlight(braille_at(braille, end_index-3));
156 // We need to be careful not to expand the selection if we are already on a grouping indicator
157 if first_ch == '⠢' && last_ch == '⠔'{
158 return None;
159 }
160 let preceding_ch = braille_at(braille, start_index-3);
161 if preceding_ch != '⠢' {
162 return None;
163 }
164
165 let following_ch = braille_at(braille, end_index);
166 if following_ch != '⠔' {
167 return None;
168 }
169
170 let preceding_ch = highlight(preceding_ch);
171 braille.replace_range(start_index-3..start_index+3, format!("{preceding_ch}{first_ch}").as_str());
172 let following_ch = highlight(following_ch);
173 braille.replace_range(end_index-3..end_index+3, format!("{last_ch}{following_ch}").as_str());
174 return Some( (start_index-3, end_index + 3) );
175 }
176 }
177
/// Given the braille cells that precede a highlighted Nemeth cell, return how many of them (counting
/// backwards from the highlighted cell) are indicators that start it (e.g, 1 for the prev char of a capital letter).
///
/// `braille_prefix` holds the cells immediately before the highlighted cell and `first_ch` is the
/// (unhighlighted) cell the highlight currently starts on.
/// The value returned is never larger than the number of cells in `braille_prefix`.
fn i_start_nemeth(braille_prefix: &str, first_ch: char) -> usize {
    // 1, 2, ...9, 0, decimal pt
    const NEMETH_NUMBERS: [char; 11] = ['⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠴', '⠨'];

    let mut n_chars = 0;
    let mut prefix = braille_prefix.chars().rev().peekable();

    // the cell right before the highlighted cell
    let is_single_cell_indicator = match prefix.peek().copied() {
        Some('⠠') => true,                                  // cap indicator
        Some('⠼') => NEMETH_NUMBERS.contains(&first_ch),    // number indicator
        Some('⠸' | '⠈' | '⠨') => true,                      // bold, script/blackboard, italic indicator
        _ => false,
    };
    if is_single_cell_indicator {
        n_chars += 1;
        prefix.next();
    }

    match prefix.peek().copied() {
        Some('⠰' | '⠸' | '⠨') => n_chars += 1,              // English, German, Greek
        Some('⠈') => {
            // Russian (⠈⠈) or Greek Variant (⠨⠈): both are two cells, so step over this cell and
            // test the cell *before* it (testing this cell again would always succeed)
            prefix.next();
            if matches!(prefix.peek(), Some('⠈' | '⠨')) {
                n_chars += 2;
            }
        },
        Some('⠠') => {
            // Hebrew (⠠⠠): also two cells
            prefix.next();
            if prefix.peek() == Some(&'⠠') {
                n_chars += 2;
            }
        },
        _ => (),
    }
    return n_chars;
}
207
208 /// Given a position in a UEB string, what is the position character that starts it (e.g, the prev char for capital letter)
209 fn i_start_ueb(braille_prefix: &str) -> usize {
210 let prefix = &mut braille_prefix.chars().rev().peekable();
211 let mut n_chars = 0;
212 while let Some(ch) = prefix.next() {
213 if UEB_PREFIXES.contains(&ch) {
214 n_chars += 1;
215 } else if ch == '⠆' {
216 let n_typeform_chars = check_for_typeform(prefix);
217 if n_typeform_chars > 0 {
218 n_chars += n_typeform_chars;
219 } else {
220 break;
221 }
222 } else {
223 break;
224 }
225 }
226 return n_chars;
227 }
228
229
/// Called after a '⠆' has been read from `prefix` (which yields cells in reverse order).
///
/// Returns the total number of cells (including the '⠆') that make up a typeform indicator:
/// 2 if the next cell is one of '⠈', '⠘', '⠸', '⠨'; 3 if the next cell is '⠼' and the one after
/// that is one of those four cells or '⠐'; otherwise 0.
/// One cell is always consumed from `prefix` (two if the first one was '⠼').
fn check_for_typeform(prefix: &mut dyn std::iter::Iterator<Item=char>) -> usize {
    fn is_typeform_prefix(cell: char) -> bool {
        matches!(cell, '⠈' | '⠘' | '⠸' | '⠨')
    }

    match prefix.next() {
        Some(cell) if is_typeform_prefix(cell) => 2,
        Some('⠼') => match prefix.next() {
            Some(cell) if is_typeform_prefix(cell) || cell == '⠐' => 3,
            _ => 0,
        },
        _ => 0,
    }
}
248}
249
250// FIX: if 8-dot braille is needed, perhaps the highlights can be shifted to a "highlighted" 256 char block in private space
251// they would need to be unshifted for the external world
/// Returns true if `ch` is a braille cell with both dot 7 and dot 8 on (U+28C0 through U+28FF),
/// or is '𝑏', the stand-in for a highlighted baseline indicator.
fn is_highlighted(ch: char) -> bool {
    // the range has to be inclusive: U+28FF (⣿) is the highlighted form of ⠿
    return ('\u{28C0}'..='\u{28FF}').contains(&ch) || ch == '𝑏';
}
256
/// Returns `ch` with dots 7 & 8 turned on (for a braille cell, the result is in U+28C0 through U+28FF).
fn highlight(ch: char) -> char {
    // Setting bits 6 and 7 can't produce a surrogate or a value past U+10FFFF, so the conversion
    // always succeeds; 'unwrap_or' is only there to avoid needing 'unsafe'.
    return char::from_u32(ch as u32 | 0xC0).unwrap_or(ch);
}
260
/// Returns `ch` with dots 7 & 8 turned off if `ch` is a braille cell that has both of them on
/// (U+28C0 through U+28FF); any other char is returned unchanged.
fn unhighlight(ch: char) -> char {
    let ch_as_u32 = ch as u32;
    // the range has to be inclusive: U+28FF (⣿) is the highlighted form of ⠿
    if (0x28C0..=0x28FF).contains(&ch_as_u32) {
        // masking leaves a value in U+2800..=U+283F, which is always a valid char
        return char::from_u32(ch_as_u32 & 0x283F).unwrap_or(ch);
    } else {
        return ch;
    }
}
269
270use std::cell::RefCell;
thread_local!{
    /// Count number of probes -- get a sense of how well algorithm is working (for debugging).
    /// It is reset to 0 at the start of `get_navigation_node_from_braille_position()` and is
    /// incremented on every call to that function's inner `find_navigation_node()`.
    static N_PROBES: RefCell<usize> = const { RefCell::new(0) };
}
275
276
277/// Given a 0-based braille position, return the id of the smallest MathML node enclosing it.
278/// This node might be a leaf with an offset.
279pub fn get_navigation_node_from_braille_position(mathml: Element, position: usize) -> Result<(String, usize)> {
280 // This works via a "smart" binary search (the trees aren't binary or balanced, we estimate the child to look in):
281 // braille the mathml with a nav node and see where 'position' is in relation to the start/end of the nav node
282 // Each call to find_navigation_node() returns a search state that tell us where to look next if not found
283 #[derive(Debug, Display)]
284 enum SearchStatus {
285 LookInParent, // look up a level for exact match
286 LookLeft, // went too far, backup
287 LookRight, // continue searching right
288 Found,
289 }
290
291 struct SearchState<'e> {
292 status: SearchStatus,
293 node: Element<'e>,
294 highlight_start: usize, // if status is Found, then this is the offset within a leaf node
295 highlight_end: usize, // if status is Found, this is ignored
296 }
297
298 // save the current highlight state, set the state to be the end points so we can find the braille, then restore the state
299 // FIX: this can fail if there is 8-dot braille
300 use crate::interface::{get_preference, set_preference};
301 let saved_highlight_style = get_preference("BrailleNavHighlight".to_string()).unwrap();
302 set_preference("BrailleNavHighlight".to_string(), "EndPoints".to_string()).unwrap();
303
304 N_PROBES.with(|n| {*n.borrow_mut() = 0});
305 // dive into the child of the <math> element (should only be one)
306 let search_state = find_navigation_node(mathml, as_element(mathml.children()[0]), position)?;
307 set_preference("BrailleNavHighlight".to_string(), saved_highlight_style.to_string()).unwrap();
308
309 // we know the attr value exists because it was found internally
310 // FIX: what should be done if we never did the search?
311 match search_state.status {
312 SearchStatus::Found | SearchStatus::LookInParent => {
313 return Ok( (search_state.node.attribute_value("id").unwrap().to_string(), search_state.highlight_start) )
314 },
315 _ => {
316 // weird state -- return the entire expr
317 match mathml.attribute_value("id") {
318 None => bail!("'id' is not present on mathml: {}", mml_to_string(mathml)),
319 Some(id) => return Ok( (id.to_string(), 0) ),
320 }
321 }
322 }
323
324 /// find the navigation node that most tightly encapsulates the target position (0-based)
325 /// 'node' is the current node we are on inside of 'mathml'
326 fn find_navigation_node<'e>(mathml: Element<'e>, node: Element<'e>, target_position: usize) -> Result<SearchState<'e>> {
327 let node_id = match node.attribute_value("id") {
328 Some(id) => id,
329 None => bail!("'id' is not present on mathml: {}", mml_to_string(node)),
330 };
331 N_PROBES.with(|n| {*n.borrow_mut() += 1});
332 let (braille, char_start, char_end) = braille_mathml(mathml, node_id)?;
333 let mut status = None;
334 // debug!("find_navigation_node ({}, id={}): highlight=[{}, {}); target={}", name(node), node_id, char_start, char_end, target_position);
335 if is_leaf(node) {
336 if char_start == 0 && char_end == braille.len()/3 {
337 // nothing highlighted -- probably invisible char not represented in braille -- continue looking to the right
338 // debug!(" return due invisible char (?)' ");
339 status = Some(SearchStatus::LookRight);
340 } else if char_start <= target_position && target_position < char_end {
341 // FIX: need to handle multi-char leaves and set the offset (char_start) appropriately
342 // debug!(" return due to target_position inside leaf: {} <= {} < {}", char_start, target_position, char_end);
343 return Ok( SearchState {
344 status: SearchStatus::Found,
345 node,
346 highlight_start: target_position - char_start,
347 highlight_end: 0,
348 });
349 } else if name(node) == "mo" {
350 // if there is whitespace before or after the operator, consider the operator to be a match
351 if (char_start > 0 && target_position == char_start - 1 &&
352 braille_at(&braille, 3*(char_start - 1)) == '⠀' && is_operator_that_adds_whitespace(node)) ||
353 (3*char_end < braille.len() && target_position == char_end &&
354 braille_at(&braille, 3*char_end) == '⠀' && is_operator_that_adds_whitespace(node)) {
355 return Ok( SearchState {
356 status: SearchStatus::Found,
357 node,
358 highlight_start: 0,
359 highlight_end: 0,
360 } );
361 }
362 }
363 }
364 if status.is_none() {
365 if target_position < char_start {
366 // debug!(" return due to target_position {} < start {}", target_position, char_start);
367 status = Some(SearchStatus::LookLeft);
368 } else if target_position >= char_end {
369 // debug!(" return due to target_position {} >= end {}", target_position, char_end);
370 status = Some(SearchStatus::LookRight);
371 }
372 }
373 if let Some(status) = status {
374 return Ok( SearchState {
375 status,
376 node,
377 highlight_start: char_start,
378 highlight_end: char_end,
379 } );
380 }
381
382 let children = node.children();
383 let mut i_left_child = 0; // inclusive
384 let mut i_right_child = children.len(); // exclusive
385 let mut call_start = char_start;
386 let mut guess_fn: Box<dyn Fn(usize, usize, usize, usize) -> usize> = Box::new(|i_left, i_right, start, target: usize| guess_child_node_ltr(&children, i_left, i_right, start, target));
387 while i_left_child < i_right_child {
388 let i_guess_child = guess_fn(i_left_child, i_right_child, call_start, target_position);
389 let status = find_navigation_node(mathml, as_element(children[i_guess_child]), target_position)?;
390 // debug!(" in {} loop: status: {}, child: left/guess/right {}/({},{})/{}; highlight=[{}, {})",
391 // name(node), status.status,
392 // i_left_child, i_guess_child, name(as_element(children[i_guess_child])),i_right_child,
393 // status.highlight_start, status.highlight_end);
394 match status.status {
395 SearchStatus::Found => {
396 return Ok(status);
397 },
398 SearchStatus::LookInParent => {
399 let (_, start, end) = braille_mathml(mathml, node_id)?;
400 // debug!(" parent ({}) braille: start/end={}/{}; target_position={}", name(node), start, end, target_position);
401 if start <= target_position && target_position < end {
402 // debug!(" ..found: id={}", node_id);
403 return Ok( SearchState{
404 status: SearchStatus::Found,
405 node,
406 highlight_start: 0,
407 highlight_end: 0,
408 } ); // done or look up another level
409 }
410 return Ok(status); // look up a level
411 },
412 SearchStatus::LookLeft => {
413 i_right_child = if i_guess_child == 0 {0} else {i_guess_child}; // exclusive
414 call_start = status.highlight_start-1;
415 guess_fn = Box::new(|i_left, i_right, start, target| guess_child_node_rtl(&children, i_left, i_right, start, target));
416 },
417 SearchStatus::LookRight => {
418 i_left_child = i_guess_child+1;
419 call_start = status.highlight_end+1;
420 guess_fn = Box::new(|i_left, i_right, start, target| guess_child_node_ltr(&children, i_left, i_right, start, target));
421 },
422 }
423 }
424 // debug!("Didn't child in node {}: left/right={}/{}; target_position={}", name(node), i_left_child, i_right_child, target_position);
425
426 // if we get here, we didn't find it in the children
427 // debug!("..end of loop: look in parent of {} has start/end={}/{}", name(node), char_start, char_end);
428 return Ok( SearchState{
429 status: if char_start <= target_position && target_position <= char_end {SearchStatus::Found} else {SearchStatus::LookInParent},
430 node,
431 highlight_start: 0,
432 highlight_end: 0,
433 } );
434 }
435
436 fn is_operator_that_adds_whitespace(node: Element) -> bool {
437 use crate::definitions::BRAILLE_DEFINITIONS;
438 if PreferenceManager::get().borrow().pref_to_string("UseSpacesAroundAllOperators") == "true" {
439 return true;
440 }
441
442 return BRAILLE_DEFINITIONS.with(|definitions| {
443 let definitions = definitions.borrow();
444 let comparison_operators = definitions.get_hashset("ComparisonOperators").unwrap();
445 return comparison_operators.contains(as_text(node));
446 });
447 }
448
449 /// look in children[i_left..i_right] for a count that exceeds target
450 fn guess_child_node_ltr(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize {
451 let mut estimated_position = start;
452 // number of chars to add for number indicators
453 let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" {0} else {1}; // Nemeth doesn't typically need number or letter indicators
454 #[allow(clippy::needless_range_loop)] // I don't like enumerate/take/skip here
455 for i in i_left..i_right {
456 estimated_position += estimate_braille_chars(children[i], n_number_indicator);
457 if estimated_position >= target {
458 return i;
459 }
460 }
461 return i_right-1; // estimate was too large, return the last child as a guess
462 }
463
464 /// look in children[i_left..i_right].rev for a count that is less than target
465 fn guess_child_node_rtl(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize {
466 let mut estimated_position = start;
467 let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" {0} else {1}; // Nemeth doesn't typically need number or letter indicators
468 for i in (i_left..i_right).rev() {
469 estimated_position -= estimate_braille_chars(children[i], n_number_indicator);
470 if estimated_position <= target {
471 return i;
472 }
473 }
474 return i_left; // estimate was too small, return the first child as a guess
475 }
476
477 fn estimate_braille_chars(child: ChildOfElement, n_number_indicator: usize) -> usize {
478 let node = as_element(child);
479 let leaf_name = name(node);
480 if is_leaf(node) {
481 let text = as_text(node);
482 // len() is close since mn's probably have ASCII digits and lower case vars are common (count as) and other chars need extra braille chars
483 // don't want to count invisible chars since they don't display and would give a length = 3
484 if text == "\u{2061}" || text == "\u{2062}" { // invisible function apply/times (most common by far)
485 return 0;
486 }
487 // FIX: this assumption is bad for 8-dot braille
488 return match leaf_name {
489 "mn" => n_number_indicator + text.len(),
490 "mo" => 2, // could do better by actually brailling char, but that is more expensive
491 _ => text.len(),
492 }
493 }
494 let mut estimate = if leaf_name == "mrow" {0} else {node.children().len() + 1}; // guess extra chars need for mfrac, msub, etc (start+intermediate+end).
495 if leaf_name == "msup" || leaf_name == "msub" || leaf_name == "msubsup" {
496 estimate -= 1; // opening superscript/subscript indicator not needed
497 }
498 for child in node.children() {
499 estimate += estimate_braille_chars(child, n_number_indicator);
500 }
501 // debug!("estimate_braille_chars for {}: {}", crate::canonicalize::element_summary(as_element(child)), estimate);
502 return estimate;
503 }
504}
505
506fn nemeth_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
507 // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
508 // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
509 // Indicators: C: capital, N: number, P: punctuation, M: multipurpose
510 // Others:
511 // W -- whitespace that should be kept (e.g, in a numeral)
512 // 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly
513 // SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet)
514 // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
515 static NEMETH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
516 "S" => "⠠⠨", // sans-serif
517 "B" => "⠸", // bold
518 "𝔹" => "⠨", // blackboard
519 "T" => "⠈", // script
520 "I" => "⠨", // italic (mapped to be the same a blackboard)
521 "R" => "", // roman
522 "E" => "⠰", // English
523 "D" => "⠸", // German (Deutsche)
524 "G" => "⠨", // Greek
525 "V" => "⠨⠈", // Greek Variants
526 "H" => "⠠⠠", // Hebrew
527 "U" => "⠈⠈", // Russian
528 "C" => "⠠", // capital
529 "P" => "⠸", // punctuation
530 "𝐏" => "⠸", // hack for punctuation after a roman numeral -- never removed
531 "L" => "", // letter
532 "l" => "", // letter inside enclosed list
533 "M" => "", // multipurpose indicator
534 "m" => "⠐", // required multipurpose indicator
535 "N" => "", // potential number indicator before digit
536 "n" => "⠼", // required number indicator before digit
537 "𝑁" => "", // hack for special case of a lone decimal pt -- not considered a number but follows rules mostly
538 "W" => "⠀", // whitespace
539 "w" => "⠀", // whitespace from comparison operator
540 "," => "⠠⠀", // comma
541 "b" => "⠐", // baseline
542 "𝑏" => "⣐", // highlight baseline (it's a hack)
543 "↑" => "⠘", // superscript
544 "↓" => "⠰", // subscript
545 };
546
547 lazy_static! {
548 // Add an English Letter indicator. This involves finding "single letters".
549 // The green book has a complicated set of cases, but the Nemeth UEB Rule book (May 2020), 4.10 has a much shorter explanation:
550 // punctuation or whitespace on the left and right ignoring open/close chars
551 // https://nfb.org/sites/www.nfb.org/files/files-pdf/braille-certification/lesson-4--provisional-5-9-20.pdf
552 static ref ADD_ENGLISH_LETTER_INDICATOR: Regex =
553 Regex::new(r"(?P<start>^|W|P.[\u2800-\u28FF]?|,)(?P<open>[\u2800-\u28FF]?⠷)?(?P<letter>C?L.)(?P<close>[\u2800-\u28FF]?⠾)?(?P<end>W|P|,|$)").unwrap();
554
555 // Trim braille spaces before and after braille indicators
556 // In order: fraction, /, cancellation, letter, baseline
557 // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
558 static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex =
559 Regex::new(r"(⠄⠄⠄|⠤⠤⠤⠤)[Ww]+([⠼⠸⠪])").unwrap();
560 static ref REMOVE_SPACE_AFTER_BRAILLE_INDICATORS: Regex =
561 Regex::new(r"([⠹⠻Llb])[Ww]+(⠄⠄⠄|⠤⠤⠤⠤)").unwrap();
562
563 // Hack to convert non-numeric '.' to numeric '.'
564 // The problem is that the numbers are hidden inside of mover -- this might be more general than rule 99_2.
565 static ref DOTS_99_A_2: Regex = Regex::new(r"𝑁⠨mN").unwrap();
566
567 // Punctuation is one or two chars. There are (currently) only 3 2-char punct chars (—‘’) -- we explicitly list them below
568 static ref REMOVE_SPACE_BEFORE_PUNCTUATION_151: Regex =
569 Regex::new(r"w(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠾)").unwrap();
570 static ref REMOVE_SPACE_AFTER_PUNCTUATION_151: Regex =
571 Regex::new(r"(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠷)w").unwrap();
572
573 // Multipurpose indicator insertion
574 // 149 -- consecutive comparison operators have no space -- instead a multipurpose indicator is used (doesn't require a regex)
575
576 // 177.2 -- add after a letter and before a digit (or decimal pt) -- digits will start with N
577 static ref MULTI_177_2: Regex =
578 Regex::new(r"([Ll].)[N𝑁]").unwrap();
579
580 // keep between numeric subscript and digit ('M' added by subscript rule)
581 static ref MULTI_177_3: Regex =
582 Regex::new(r"([N𝑁].)M([N𝑁].)").unwrap();
583
584 // Add after decimal pt for non-digits except for comma and punctuation
585 // Note: since "." can be in the middle of a number, there is not necessarily a "N"
586 // Although not mentioned in 177_5, don't add an 'M' before an 'm'
587 static ref MULTI_177_5: Regex =
588 Regex::new(r"([N𝑁]⠨)([^⠂⠆⠒⠲⠢⠖⠶⠦⠔N𝑁,Pm])").unwrap();
589
590
591 // Pattern for rule II.9a (add numeric indicator at start of line or after a space)
592 // 1. start of line
593 // 2. optional minus sign (⠤)
594 // 3. optional typeface indicator
595 // 4. number (N)
596 static ref NUM_IND_9A: Regex =
597 Regex::new(r"(?P<start>^|[,Ww])(?P<minus>⠤?)N").unwrap();
598
599 // Needed after section mark(§), paragraph mark(¶), #, or *
600 static ref NUM_IND_9C: Regex =
601 Regex::new(r"(⠤?)(⠠⠷|⠠⠳|⠠⠈⠷)N").unwrap();
602
603 // Needed after section mark(§), paragraph mark(¶), #, or *
604 static ref NUM_IND_9D: Regex =
605 Regex::new(r"(⠈⠠⠎|⠈⠠⠏|⠨⠼|⠈⠼)N").unwrap();
606
607 // Needed after a typeface change or interior shape modifier indicator
608 static ref NUM_IND_9E: Regex = Regex::new(r"(?P<face>[SB𝔹TIR]+?)N").unwrap();
609 static ref NUM_IND_9E_SHAPE: Regex = Regex::new(r"(?P<mod>⠸⠫)N").unwrap();
610
611 // Needed after hyphen that follows a word, abbreviation, or punctuation (caution about rule 11d)
612 // Note -- hyphen might encode as either "P⠤" or "⠤" depending on the tag used
613 static ref NUM_IND_9F: Regex = Regex::new(r"([Ll].[Ll].|P.)(P?⠤)N").unwrap();
614
615 // Enclosed list exception
616 // Normally we don't add numeric indicators in enclosed lists (done in get_braille_nemeth_chars).
617 // The green book says "at the start" of an item, don't add the numeric indicator.
618 // The NFB list exceptions after function abbreviations and angles, but what this really means is "after a space"
619 static ref NUM_IND_ENCLOSED_LIST: Regex = Regex::new(r"w([⠂⠆⠒⠲⠢⠖⠶⠦⠔⠴])").unwrap();
620
621 // Punctuation chars (Rule 38.6 says don't use before ",", "hyphen", "-", "…")
622 // Never use punctuation indicator before these (38-6)
623 // "…": "⠀⠄⠄⠄"
624 // "-": "⠸⠤" (hyphen and dash)
625 // ",": "⠠⠀" -- spacing already added
626 // Rule II.9b (add numeric indicator after punctuation [optional minus[optional .][digit]
627 // because this is run after the above rule, some cases are already caught, so don't
628 // match if there is already a numeric indicator
629 static ref NUM_IND_9B: Regex = Regex::new(r"(?P<punct>P..?)(?P<minus>⠤?)N").unwrap();
630
631 // Before 79b (punctuation)
632 static ref REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT: Regex = Regex::new(r"(?:[↑↓]+[b𝑏]?|[b𝑏])([Ww,P]|$)").unwrap();
633
634 // Most commas have a space after them, but not when followed by a close quote (others?)
635 static ref NO_SPACE_AFTER_COMMA: Regex = Regex::new(r",P⠴").unwrap(); // captures both single and double close quote
636 static ref REMOVE_LEVEL_IND_BEFORE_BASELINE: Regex = Regex::new(r"(?:[↑↓mb𝑏]+)([b𝑏])").unwrap();
637
638 // Except for the four chars above, the unicode rules always include a punctuation indicator.
639 // The cases to remove them (that seem relevant to MathML) are:
640 // Beginning of line or after a space (V 38.1)
641 // After a word (38.4)
642 // 2nd or subsequent punctuation (includes, "-", etc) (38.7)
643 static ref REMOVE_AFTER_PUNCT_IND: Regex = Regex::new(r"(^|[Ww]|[Ll].[Ll].)P(.)").unwrap();
644 static ref REPLACE_INDICATORS: Regex =Regex::new(r"([SB𝔹TIREDGVHUP𝐏CLlMmb𝑏↑↓Nn𝑁Ww,])").unwrap();
645 static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
646 }
647
648// debug!("Before: \"{}\"", raw_braille);
649 // replacements might overlap at boundaries (e.g., whitespace) -- need to repeat
650 let mut start = 0;
651 let mut result = String::with_capacity(raw_braille.len()+ raw_braille.len()/4); // likely upper bound
652 while let Some(matched) = ADD_ENGLISH_LETTER_INDICATOR.find_at(&raw_braille, start) {
653 result.push_str(&raw_braille[start..matched.start()]);
654 let replacement = ADD_ENGLISH_LETTER_INDICATOR.replace(
655 &raw_braille[matched.start()..matched.end()], "${start}${open}E${letter}${close}");
656 // debug!("matched='{}', start/end={}/{}; replacement: {}", &raw_braille[matched.start()..matched.end()], matched.start(), matched.end(), replacement);
657 result.push_str(&replacement);
658 // put $end back on because needed for next match (e.g., whitespace at end and then start of next match)
659 // but it could also match because it was at the end, in which case "-1" is wrong -- tested after loop for that
660 start = matched.end() - 1;
661 }
662 if !raw_braille.is_empty() && ( start < raw_braille.len()-1 || "WP,".contains(raw_braille.chars().nth_back(0).unwrap()) ) { // see comment about $end above
663 result.push_str(&raw_braille[start..]);
664 }
665// debug!("ELIs: \"{}\"", result);
666
667 let result = NUM_IND_ENCLOSED_LIST.replace_all(&result, "wn${1}");
668
669 // Remove blanks before and after braille indicators
670 let result = REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS.replace_all(&result, "$1$2");
671 let result = REMOVE_SPACE_AFTER_BRAILLE_INDICATORS.replace_all(&result, "$1$2");
672
673 let result = REMOVE_SPACE_BEFORE_PUNCTUATION_151.replace_all(&result, "$1");
674 let result = REMOVE_SPACE_AFTER_PUNCTUATION_151.replace_all(&result, "$1");
675// debug!("spaces: \"{}\"", result);
676
677 let result = DOTS_99_A_2.replace_all(&result, "N⠨mN");
678
679 // Multipurpose indicator
680 let result = result.replace("ww", "m"); // 149
681 let result = MULTI_177_2.replace_all(&result, "${1}m${2}");
682 let result = MULTI_177_3.replace_all(&result, "${1}m$2");
683 let result = MULTI_177_5.replace_all(&result, "${1}m$2");
684// debug!("MULTI: \"{}\"", result);
685
686 let result = NUM_IND_9A.replace_all(&result, "${start}${minus}n");
687 // debug!("IND_9A: \"{}\"", result);
688 let result = NUM_IND_9C.replace_all(&result, "${1}${2}n");
689 let result = NUM_IND_9D.replace_all(&result, "${1}n");
690 let result = NUM_IND_9E.replace_all(&result, "${face}n");
691 let result = NUM_IND_9E_SHAPE.replace_all(&result, "${mod}n");
692 let result = NUM_IND_9F.replace_all(&result, "${1}${2}n");
693
694// debug!("IND_9F: \"{}\"", result);
695
696 // 9b: insert after punctuation (optional minus sign)
697 // common punctuation adds a space, so 9a handled it. Here we deal with other "punctuation"
698 // FIX other punctuation and reference symbols (9d)
699 let result = NUM_IND_9B.replace_all(&result, "$punct${minus}n");
700// debug!("A PUNCT: \"{}\"", &result);
701
702 // strip level indicators
703 // check first to remove level indicators before baseline, then potentially remove the baseline
704 let mut result = REMOVE_LEVEL_IND_BEFORE_BASELINE.replace_all(&result, "$1");
705// debug!("Punct : \"{}\"", &result);
706 // checks for punctuation char, so needs to before punctuation is stripped.
707 // if '𝑏' is removed, then the highlight needs to be shifted to the left in some cases
708 let result = remove_baseline_before_space_or_punctuation(&mut result);
709// debug!("Removed: \"{}\"", &result);
710
711 let result = NO_SPACE_AFTER_COMMA.replace_all(&result, "⠠P⠴");
712
713 let result = REMOVE_AFTER_PUNCT_IND.replace_all(&result, "$1$2");
714// debug!("Punct38: \"{}\"", &result);
715
716 // these typeforms need to get pulled from user-prefs as they are transcriber-defined
717 let sans_serif = pref_manager.pref_to_string("Nemeth_SansSerif");
718 let bold = pref_manager.pref_to_string("Nemeth_Bold");
719 let double_struck = pref_manager.pref_to_string("Nemeth_DoubleStruck");
720 let script = pref_manager.pref_to_string("Nemeth_Script");
721 let italic = pref_manager.pref_to_string("Nemeth_Italic");
722
723 let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
724 let matched_char = &cap[0];
725 match matched_char {
726 "S" => &sans_serif,
727 "B" => &bold,
728 "𝔹" => &double_struck,
729 "T" => &script,
730 "I" => &italic,
731 _ => match NEMETH_INDICATOR_REPLACEMENTS.get(&cap[0]) {
732 None => {error!("REPLACE_INDICATORS and NEMETH_INDICATOR_REPLACEMENTS are not in sync"); ""},
733 Some(&ch) => ch,
734 }
735 }
736 });
737
738 // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
739 let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
740 let result = COLLAPSE_SPACES.replace_all(result, "⠀");
741
742 return result.to_string();
743
744 fn remove_baseline_before_space_or_punctuation<'a>(braille: &'a mut Cow<'a, str>) -> Cow<'a, str> {
745 // If the baseline highlight is at the end of the string and it is going to be deleted by the regex,
746 // then we need to shift the highlight to the left if what is to it's left is not whitespace (which should never be a highlight end)
747 // This only happens when BrailleNavHighlight == "EndPoints".
748 let highlight_style = PreferenceManager::get().borrow().pref_to_string("BrailleNavHighlight");
749 if highlight_style == "EndPoints" {
750 if let Some(last_highlighted) = braille.rfind(is_highlighted) {
751 if braille[last_highlighted..].starts_with('𝑏') {
752 let i_after_baseline = last_highlighted + '𝑏'.len_utf8();
753 if i_after_baseline == braille.len() || braille[i_after_baseline..].starts_with(['W', 'w', ',', 'P']) {
754 // shift the highlight to the left after doing just the replacement (if any) that the regex below does
755 // the shift runs until a non blank braille char is found
756 let mut bytes_deleted = 0;
757 let mut char_to_highlight = "".to_string(); // illegal value
758 for ch in braille[..last_highlighted].chars().rev() {
759 bytes_deleted += ch.len_utf8();
760 if (0x2801..0x28FF).contains(&(ch as u32)) {
761 char_to_highlight = highlight(ch).to_string();
762 break;
763 }
764 }
765 braille.to_mut().replace_range(last_highlighted-bytes_deleted..last_highlighted+'𝑏'.len_utf8(),
766 &char_to_highlight);
767 }
768 }
769 }
770 }
771 return REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT.replace_all(braille, "$1");
772
773 }
774}
775
776// Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
777// Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
778// Indicators: C: capital, N: number, P: punctuation, M: multipurpose
779// Others:
780// W -- whitespace that should be kept (e.g, in a numeral)
781// 𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly
782// Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
/// Maps each single-character marker of the intermediate UEB string to the braille cells
/// that `ueb_cleanup` emits for it.
///
/// `ueb_cleanup` takes the cells for "𝔹", "S", "D" and "V" from user preferences before it
/// consults this table, so the values listed here for those keys are not used by it
/// ("XXX" is just a placeholder).
/// Markers that map to "" are removed from the output.
static UEB_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠘",     // bold
    "𝔹" => "XXX",     // blackboard -- from prefs
    "T" => "⠈",     // script
    "I" => "⠨",     // italic
    "R" => "",      // roman
    // "E" => "⠰",     // English
    "1" => "⠰",      // Grade 1 symbol
    "𝟙" => "⠰⠰",     // Grade 1 word
    "L" => "",       // Letter left in to assist in locating letters
    "D" => "XXX",    // German (Deutsche) -- from prefs
    "G" => "⠨",      // Greek
    "V" => "⠨⠈",     // Greek Variants
    // "H" => "⠠⠠",  // Hebrew
    // "U" => "⠈⠈",  // Russian
    "C" => "⠠",      // capital
    "𝐶" => "⠠",      // capital that never should get word indicator (from chemical element)
    "N" => "⠼",     // number indicator
    "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace (hard break -- basically, it separates exprs)
    "s" => "⠆",     // typeface single char indicator
    "w" => "⠂",     // typeface word indicator
    "e" => "⠄",     // typeface & capital terminator
    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    "c" => "",       // flag that what follows is an close indicator (used for standing alone rule)
    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "⠂",     // comma
    "." => "⠲",     // period
    "-" => "-",     // hyphen
    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "",      // signals end of script
    // '(', '{', '[', '"', '\'', '“', '‘', '«',    // opening chars
    // ')', '}', ']', '\"', '\'', '”', '’', '»',   // closing chars
    // ',', ';', ':', '.', '…', '!', '?'           // punctuation

};
822
823// static LETTERS: phf::Set<char> = phf_set! {
824// '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍',
825// '⠝', '⠕', '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵',
826// };
827
/// The braille cells for the letters a-j.
/// The cleanup code uses this set to decide when a grade 1 indicator is needed
/// to distinguish one of these letters from a digit (e.g., when the letter follows a number).
static LETTER_NUMBERS: phf::Set<char> = phf_set! {
    '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚',
};
831
/// Letter sequences that `is_short_form` recognizes.
/// Each entry is written in the intermediate form used during cleanup:
/// every braille cell is preceded by the letter marker 'L'.
static SHORT_FORMS: phf::Set<&str> = phf_set! {
    "L⠁L⠃", "L⠁L⠃L⠧", "L⠁L⠉", "L⠁L⠉L⠗", "L⠁L⠋",
    "L⠁L⠋L⠝", "L⠁L⠋L⠺", "L⠁L⠛", "L⠁L⠛L⠌", "L⠁L⠇",
    "L⠁L⠇L⠍", "L⠁L⠇L⠗", "L⠁L⠇L⠞", "L⠁L⠇L⠹", "L⠁L⠇L⠺",
    "L⠃L⠇", "L⠃L⠗L⠇", "L⠉L⠙", "L⠙L⠉L⠇", "L⠙L⠉L⠇L⠛",
    "L⠙L⠉L⠧", "L⠙L⠉L⠧L⠛", "L⠑L⠊", "L⠋L⠗", "L⠋L⠌", "L⠛L⠙",
    "L⠛L⠗L⠞", "L⠓L⠍", "L⠓L⠍L⠋", "L⠓L⠻L⠋", "L⠊L⠍L⠍", "L⠇L⠇", "L⠇L⠗",
    "L⠍L⠽L⠋", "L⠍L⠡", "L⠍L⠌", "L⠝L⠑L⠉", "L⠝L⠑L⠊", "L⠏L⠙",
    "L⠏L⠻L⠉L⠧", "L⠏L⠻L⠉L⠧L⠛", "L⠏L⠻L⠓", "L⠟L⠅", "L⠗L⠉L⠧",
    "L⠗L⠉L⠧L⠛", "L⠗L⠚L⠉", "L⠗L⠚L⠉L⠛", "L⠎L⠙", "L⠎L⠡", "L⠞L⠙",
    "L⠞L⠛L⠗", "L⠞L⠍", "L⠞L⠝", "L⠭L⠋", "L⠭L⠎", "L⠽L⠗", "L⠽L⠗L⠋",
    "L⠽L⠗L⠧L⠎", "L⠮L⠍L⠧L⠎", "L⠡L⠝", "L⠩L⠙", "L⠹L⠽L⠋", "L⠳L⠗L⠧L⠎",
    "L⠺L⠙", "L⠆L⠉", "L⠆L⠋", "L⠆L⠓", "L⠆L⠇", "L⠆L⠝", "L⠆L⠎", "L⠆L⠞",
    "L⠆L⠽", "L⠒L⠉L⠧", "L⠒L⠉L⠧L⠛", "L⠐L⠕L⠋"
};
847
/// Marker characters that can appear in front of a letter's 'L' marker (typeface and capital markers).
/// Scans such as `find_next_char` step over these and stop at any char that is not in this set.
static LETTER_PREFIXES: phf::Set<char> = phf_set! {
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', '𝑐',
};
851
852lazy_static! {
853 // Trim braille spaces before and after braille indicators
854 // In order: fraction, /, cancellation, letter, baseline
855 // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
856 // static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex =
857 // Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap();
858 static ref REPLACE_INDICATORS: Regex =Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb])").unwrap();
859 static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
860}
861
862fn is_short_form(chars: &[char]) -> bool {
863 let chars_as_string = chars.iter().map(|ch| ch.to_string()).collect::<String>();
864 return SHORT_FORMS.contains(&chars_as_string);
865}
866
/// Turns the intermediate UEB string (braille cells mixed with single-character markers)
/// into the final braille string.
///
/// * `pref_manager` -- supplies "UEB_START_MODE" and the transcriber-defined typeform cells
/// * `raw_braille` -- the intermediate string; '𝐖' separates independent expressions
///
/// Typeface and capital indicators are first rewritten by `typeface_to_word_mode` and `capitals_to_word_mode`,
/// then a start mode is picked separately for each '𝐖'-separated expression, the markers are replaced
/// by their braille cells, and finally runs of blank cells are collapsed to a single blank cell.
fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
    // debug!("ueb_cleanup: start={}", raw_braille);
    let result = typeface_to_word_mode(&raw_braille);
    let result = capitals_to_word_mode(&result);

    let use_only_grade1 = pref_manager.pref_to_string("UEB_START_MODE").as_str() == "Grade1";

    // '𝐖' is a hard break -- basically, it separates exprs
    // each expression is processed on its own and the pieces are rejoined with an ordinary 'W'
    let mut result = result.split('𝐖')
                        .map(|str| pick_start_mode(str, use_only_grade1) + "W")
                        .collect::<String>();
    result.pop();   // we added a 'W' at the end that needs to be removed.

    // a 't' marker directly in front of whitespace is dropped
    let result = result.replace("tW", "W");

    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
    let double_struck = pref_manager.pref_to_string("UEB_DoubleStruck");
    let sans_serif = pref_manager.pref_to_string("UEB_SansSerif");
    let fraktur = pref_manager.pref_to_string("UEB_Fraktur");
    let greek_variant = pref_manager.pref_to_string("UEB_GreekVariant");

    // replace every remaining marker: the four pref-defined typeforms first, everything else from the table
    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
        let matched_char = &cap[0];
        match matched_char {
            "𝔹" => &double_struck,
            "S" => &sans_serif,
            "D" => &fraktur,
            "V" => &greek_variant,
            _ => match UEB_INDICATOR_REPLACEMENTS.get(matched_char) {
                // a matched marker without a table entry is logged and removed from the output
                None => {error!("REPLACE_INDICATORS and UEB_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
                Some(&ch) => ch,
            },
        }
    });

    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");

    return result.to_string();

    /// Chooses the start mode for one expression and returns it with unneeded mode changes removed.
    ///
    /// If `use_only_grade1` is true, the expression is processed in grade 1 passage mode (no indicators are added here).
    /// Otherwise grade 2 is tried first, then a grade 1 word indicator on a single word,
    /// and as a last resort the expression is wrapped in grade 1 passage indicators.
    /// Before any of that, the expression is converted to capital passage mode if `is_cap_passage_mode_good` says so.
    fn pick_start_mode(raw_braille: &str, use_only_grade1: bool) -> String {
        // Need to decide what the start mode should be
        // From http://www.brailleauthority.org/ueb/ueb_math_guidance/final_for_posting_ueb_math_guidance_may_2019_102419.pdf
        //   Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
        //   or before a single letter standing alone anywhere in the expression,
        //   begin the expression with a grade 1 word indicator (or a passage indicator if the expression includes spaces)
        // Apparently "only a grade 1 symbol..." means at most one grade 1 symbol based on some examples (GTM 6.4, example 4)
        // debug!("before determining mode:  '{}'", raw_braille);

        // a bit ugly because we need to store the string if we have cap passage mode
        let raw_braille_string = if is_cap_passage_mode_good(raw_braille) {convert_to_cap_passage_mode(raw_braille)} else {String::default()};
        let raw_braille = if raw_braille_string.is_empty() {raw_braille} else {&raw_braille_string};
        if use_only_grade1 {
            return remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
        }
        let grade2 = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade2, UEB_Duration::Symbol);
        // debug!("Symbol mode:  '{}'", grade2);

        if is_grade2_string_ok(&grade2) {
            return grade2;
        } else {
            // BANA says use g1 word mode if spaces are present, but that's not what their examples do
            // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks)
            // The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7)
            let grade1_word = try_grade1_word_mode(raw_braille);
            // debug!("Word mode:    '{}'", grade1_word);
            if !grade1_word.is_empty() {
                return grade1_word;
            } else {
                // fall back to passage mode: passage indicator + contents + terminator
                let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
                return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄";
            }
        }

        /// Return true if at least five (= # of cap passage indicators) cap indicators and no lower case letters
        ///
        /// Scanning stops early (without failing) at a '1' that is seen while in cap mode.
        /// Panics (via `assert!`) if whitespace is reached while a single-letter capital indicator is still pending.
        fn is_cap_passage_mode_good(braille: &str) -> bool {
            let mut n_caps = 0;
            let mut is_cap_mode = false;
            let mut cap_mode = UEB_Duration::Symbol;    // real value set when is_cap_mode is set to true
            let mut chars = braille.chars();

            // look CL or CCL for caps (CC runs until we get whitespace)
            // if we find an L not in caps mode, we return false
            // Note: caps can be C𝐶, whitespace can be W𝐖
            while let Some(ch) = chars.next() {
                if ch == 'L' {
                    if !is_cap_mode {
                        return false;
                    }
                    chars.next();       // skip letter
                    if cap_mode == UEB_Duration::Symbol {
                        is_cap_mode = false;
                    }
                } else if ch == 'C' || ch == '𝐶' {
                    if is_cap_mode {
                        // second cap indicator in a row: the capitals extend over the word
                        if cap_mode == UEB_Duration::Symbol {
                            cap_mode = UEB_Duration::Word;
                        }
                    } else {
                        is_cap_mode = true;
                        cap_mode = UEB_Duration::Symbol;
                    }
                    n_caps += 1;
                } else if ch == 'W' || ch == '𝐖' {
                    if is_cap_mode {
                        assert!(cap_mode == UEB_Duration::Word);
                    }
                    is_cap_mode = false;
                } else if ch == '1' && is_cap_mode {
                    break;
                }
            }
            return n_caps > 4;
        }

        /// Removes every 'C' and '𝐶' marker from `braille` and wraps the result
        /// between "⠠⠠⠠" and "⠠⠄".
        fn convert_to_cap_passage_mode(braille: &str) -> String {
            return "⠠⠠⠠".to_string() + &braille.replace(['C', '𝐶'], "") + "⠠⠄";
        }

        /// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2
        fn is_grade2_string_ok(grade2_braille: &str) -> bool {
            // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone.
            // The exact quote from their guidance:
            //    Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
            //    or before a single letter standing alone anywhere in the expression,
            //    begin the expression with a grade 1 word indicator
            // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems
            //    like it is a reasonable thing to do.
            // Another modification is allow a single G1 indicator to occur after whitespace later on
            //    because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator

            // Because of the 'L's which go away, we have to put a little more work into finding the first three chars
            let chars = grade2_braille.chars().collect::<Vec<char>>();
            let mut n_real_chars = 0;  // actually number of chars
            let mut found_g1 = false;
            let mut i = 0;
            // first pass: the first three "real" chars may contain at most one unforced '1'
            while i < chars.len() {
                let ch = chars[i];
                if ch == '1' && !is_forced_grade1(&chars, i) {
                    if found_g1 {
                        return false;
                    }
                    found_g1 = true;
                } else if !"𝐶CLobc".contains(ch) {
                    if n_real_chars == 2 {
                        i += 1;
                        break;              // this is the third real char
                    };
                    n_real_chars += 1;
                }
                i += 1
            }

            // if we find *another* g1 that isn't forced and isn't standing alone, we are done
            // I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule
            // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it
            // Same for GTM 1_7_3_1 (passage mode is mentioned also)
            let mut is_standing_alone_already_encountered = false;
            let mut is_after_whitespace = false;
            while i < chars.len() {
                let ch = chars[i];
                if ch == 'W' {
                    is_after_whitespace = true;
                } else if ch == '1' && !is_forced_grade1(&chars, i) {
                    if is_standing_alone_already_encountered ||
                       ((found_g1 || !is_after_whitespace) && !is_single_letter_on_right(&chars, i)) {
                        return false;
                    }
                    found_g1 = true;
                    is_standing_alone_already_encountered = true;
                }
                i += 1;
            }
            return true;
        }

        /// Return true if the sequence of chars forces a '1' at the `i`th position
        /// Note: `chars[i]` should be '1'
        fn is_forced_grade1(chars: &[char], i: usize) -> bool {
            // A '1' is forced if 'a-j' follows a digit
            assert_eq!(chars[i], '1', "'is_forced_grade1' didn't start with '1'");
            // check that a-j follows the '1' -- we have '1Lx' where 'x' is the letter to check
            if i+2 < chars.len() && LETTER_NUMBERS.contains(&unhighlight(chars[i+2])) {
                // check for a number before the '1'
                // this will be 'N' followed by LETTER_NUMBERS or the number ".", ",", or " "
                // walk backwards over the digits/separators; the first other char must be the 'N' marker
                for j in (0..i).rev() {
                    let ch = chars[j];
                    if !(LETTER_NUMBERS.contains(&unhighlight(ch)) || ".,W𝐖".contains(ch)) {
                        return ch == 'N'
                    }
                }
            }
            return false;
        }

        /// Return true if what follows position `i` (after skipping typeface/capital indicators) is exactly one
        /// 'L'-marked letter before the next other char.
        /// Also returns true if the end of `chars` is reached without finding a second letter.
        fn is_single_letter_on_right(chars: &[char], i: usize) -> bool {
            static SKIP_CHARS: phf::Set<char> = phf_set! {
                'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w'   // indicators
            };

            // find the first char (if any)
            let mut count = 0;      // how many letters
            let mut i = i+1;
            while i < chars.len() {
                let ch = chars[i];
                if !SKIP_CHARS.contains(&ch) {
                    if ch == 'L' {
                        if count == 1 {
                            return false;   // found a second letter in the sequence
                        }
                        count += 1;
                    } else {
                        return count==1;
                    }
                    i += 2;     // eat 'L' and actual letter
                } else {
                    i += 1;
                }
            }
            return true;
        }

        /// Tries to represent `raw_braille` with a single grade 1 word indicator.
        /// Returns the converted string, or an empty string if no word or more than one word needs the indicator.
        fn try_grade1_word_mode(raw_braille: &str) -> String {
            // this isn't quite right, but pretty close -- try splitting at 'W' (words)
            // only one of the parts can be in word mode and none of the others can have '1' unless forced
            let mut g1_words = Vec::default();
            let mut found_word_mode = false;
            for raw_word in raw_braille.split('W') {
                let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol);
                // debug!("try_grade1_word_mode: word='{}'", word);
                let word_chars = word.chars().collect::<Vec<char>>();
                let needs_word_mode = word_chars.iter().enumerate()
                    .any(|(i, &ch) | ch == '1' && !is_forced_grade1(&word_chars, i));
                if needs_word_mode {
                    if found_word_mode {
                        return "".to_string();
                    }
                    found_word_mode = true;
                    // redo this word in grade 1 word mode and prefix it with the word indicator
                    g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word)
                    );
                } else {
                    g1_words.push(word);
                }
            }
            return if found_word_mode {g1_words.join("W")} else {"".to_string()};
        }
    }
}
1116
1117fn typeface_to_word_mode(braille: &str) -> String {
1118 lazy_static! {
1119 static ref HAS_TYPEFACE: Regex = Regex::new("[BI𝔹STD]").unwrap();
1120 }
1121 // debug!("before typeface fix: '{}'", braille);
1122
1123 let mut result = "".to_string();
1124 let chars = braille.chars().collect::<Vec<char>>();
1125 let mut word_mode = Vec::with_capacity(5);
1126 let mut word_mode_end = Vec::with_capacity(5);
1127 let mut i = 0;
1128 while i < chars.len() {
1129 let ch = chars[i];
1130 if HAS_TYPEFACE.is_match(ch.to_string().as_str()) {
1131 let i_next_char_target = find_next_char(&chars[i+1..], ch);
1132 if word_mode.contains(&ch) {
1133 if i_next_char_target.is_none() {
1134 word_mode.retain(|&item| item!=ch); // drop the char since word mode is done
1135 word_mode_end.push(ch); // add the char to signal to add end sequence
1136 }
1137 } else {
1138 result.push(ch);
1139 if i_next_char_target.is_some() {
1140 result.push('w'); // typeface word indicator
1141 word_mode.push(ch); // starting word mode for this char
1142 } else {
1143 result.push('s'); // typeface single char indicator
1144 }
1145 }
1146 i += 1; // eat "B", etc
1147 } else if ch == 'L' || ch == 'N' {
1148 result.push(chars[i]);
1149 result.push(chars[i+1]);
1150 if !word_mode_end.is_empty() && i+2 < chars.len() && !(chars[i+2] == 'W'|| chars[i+2] == '𝐖') {
1151 // add terminator unless word sequence is terminated by end of string or whitespace
1152 for &ch in &word_mode_end {
1153 result.push(ch);
1154 result.push('e');
1155 };
1156 word_mode_end.clear();
1157 }
1158 i += 2; // eat Ll/Nd
1159 } else {
1160 result.push(ch);
1161 i += 1;
1162 }
1163 }
1164 return result;
1165
1166}
1167
/// Rewrites runs of capital letters so that they use a capital word indicator.
///
/// A sequence such as CLxCLy... becomes CCLxLy...; a lowercase letter that directly follows such a run
/// is preceded by an 'e' (terminator), while any other non-letter char ends the run without a terminator.
/// If the capital letter that comes next is followed by a section 12 modifier
/// (see `is_next_char_start_of_section_12_modifier`), word mode is ended and the current capital keeps its own 'C'.
///
/// Returns the rewritten string.
fn capitals_to_word_mode(braille: &str) -> String {
    use std::iter::FromIterator;
    // debug!("before capitals fix:  '{}'", braille);

    let mut result = "".to_string();
    let chars = braille.chars().collect::<Vec<char>>();
    let mut is_word_mode = false;
    let mut i = 0;
    // look for a sequence of CLxCLy... and create CCLxLy...
    while i < chars.len() {
        let ch = chars[i];
        if ch == 'C' {
            // '𝑐' should only occur after a 'C', so we don't have top-level check for it
            let mut next_non_cap = i+1;
            while let Some(i_next) = find_next_char(&chars[next_non_cap..], '𝑐') {
                next_non_cap += i_next + 1; // C/𝑐, L, letter
            }
            if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..."
                if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) {
                    // to me this is tricky -- section 12 modifiers apply to the previous item
                    // the last clause of the "item" def is the previous indivisible symbol" which ICEB 2.1 say is:
                    //   braille sign: one or more consecutive braille characters comprising a unit,
                    //     consisting of a root on its own or a root preceded by one or more
                    //     prefixes (also referred to as braille symbol)
                    // this means the capital indicator needs to be stated and can't be part of a word or passage
                    is_word_mode = false;
                    result.push_str(String::from_iter(&chars[i..next_non_cap]).as_str());
                    i = next_non_cap;
                    continue;
                }
                if is_word_mode {
                    i += 1;     // skip the 'C'
                } else {
                    // start word mode -- need an extra 'C'
                    result.push('C');
                    is_word_mode = true;
                }
            } else if is_word_mode {
                i += 1;         // skip the 'C'
            }
            // NOTE(review): `chars[next_non_cap]` is indexed without a bounds check -- a 'C' at the very end would panic
            if chars[next_non_cap] == 'G' {
                // Greek letters are a bit exceptional in that the pattern is "CGLx" -- bump 'i'
                next_non_cap += 1;
            }
            if chars[next_non_cap] != 'L' {
                error!("capitals_to_word_mode: internal error: didn't find L after C in '{}'.",
                       chars[i..next_non_cap+2].iter().collect::<String>().as_str());
            }
            // copy everything up to and including the letter that follows the 'L'
            let i_braille_char = next_non_cap + 2;
            result.push_str(String::from_iter(&chars[i..i_braille_char]).as_str());
            i = i_braille_char;
        } else if ch == 'L' {       // must be lowercase -- uppercase consumed above
            // assert!(LETTERS.contains(&unhighlight(chars[i+1]))); not true for other alphabets
            if is_word_mode {
                result.push('e');       // terminate Word mode (letter after caps)
                is_word_mode = false;
            }
            result.push('L');
            result.push(chars[i+1]);
            i += 2; // eat L, letter
        } else {
            is_word_mode = false;   // non-letters terminate cap word mode
            result.push(ch);
            i += 1;
        }
    }
    return result;

    /// Returns true if the letter that follows the first 'C' in `chars` is followed (after optional letter prefixes)
    /// by a '1' and one of the cell sequences "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱", "⠨⠸⠱".
    ///
    /// Panics if `chars` does not contain a 'C' (the caller only calls this when one is known to be present).
    fn is_next_char_start_of_section_12_modifier(chars: &[char]) -> bool {
        // first find the L and eat the char so that we are at the potential start of where the target lies
        let chars_len = chars.len();
        let mut i_cap = 0;
        while chars[i_cap] != 'C' {     // we know 'C' is in the string, so no need to check for exceeding chars_len
            i_cap += 1;
        }
        for i_end in i_cap+1..chars_len {
            if chars[i_end] == 'L' {
                // skip the next char to get to the real start, and then look for the modifier string or next L/N
                // debug!("   after L '{}'", chars[i_end+2..].iter().collect::<String>());
                for i in i_end+2..chars_len {
                    let ch = chars[i];
                    if ch == '1' {
                        // Fix: there's probably a much better way to check if we have a match against one of "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱ ", "⠨⠸⠱"
                        // NOTE(review): `chars[i+1]` is read without a bounds check -- a '1' as the last char would panic
                        if chars[i+1] == '⠱' {
                            return true;
                        } else if i+2 < chars_len {
                            let mut str = chars[i+1].to_string();
                            str.push(chars[i+2]);
                            if str == "⠘⠱" || str == "⠘⠲" || str == "⠸⠱" || str == "⠐⠱" {
                                return true;
                            } else if i+3 < chars_len {
                                str.push(chars[i+3]);
                                return str == "⠨⠸⠱";
                            }
                            return false;
                        }
                    }
                    if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
                        return false;
                    }
                }
            }
        }
        return false;
    }
}
1274
1275fn find_next_char(chars: &[char], target: char) -> Option<usize> {
1276 // first find the L or N and eat the char so that we are at the potential start of where the target lies
1277 // debug!("Looking for '{}' in '{}'", target, chars.iter().collect::<String>());
1278 for i_end in 0..chars.len() {
1279 if chars[i_end] == 'L' || chars[i_end] == 'N' {
1280 // skip the next char to get to the real start, and then look for the target
1281 // stop when L/N signals past potential target or we hit some non L/N char (actual braille)
1282 // debug!(" after L/N '{}'", chars[i_end+2..].iter().collect::<String>());
1283 for (i, &ch) in chars.iter().enumerate().skip(i_end+2) {
1284 if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
1285 return None;
1286 } else if ch == target {
1287 // debug!(" found target");
1288 return Some(i);
1289 }
1290 }
1291 }
1292 }
1293 return None;
1294}
1295
/// The braille mode in effect while `remove_unneeded_mode_changes` walks through the intermediate string.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Mode {
    Numeric,        // also includes Grade1
    Grade1,
    Grade2,
}
1303
/// How far a mode (or a capital indicator) extends: a single symbol, a word, or a passage.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Duration {
    // Standing alone: A braille symbol that is standing alone may have a contracted (grade 2) meaning.
    // A letter or unbroken sequence of letters is “standing alone” if the symbols before and after the letter or
    //   sequence are spaces, hyphens, dashes or any combination thereof, including some common punctuation.
    // Item: An “item” is defined as the next symbol or one of seven groupings listed in Rules of Unified English Braille, §11.4.1.
    Symbol,

    // The grade 1 word indicator sets grade 1 mode for the next word or symbol sequence.
    // A symbol sequence in UEB is defined as an unbroken string of braille signs,
    //   whether alphabetic or non-alphabetic, preceded and followed by a space.
    Word,
    Passage,
}
1319
// used to determine standing alone (on left side)
/// Indicator markers that may sit between a letter and what precedes it
/// without affecting whether the letter counts as standing alone.
static LEFT_INTERVENING_CHARS: phf::Set<char> = phf_set! {  // see RUEB 2.6.2
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w',     // indicators
    // opening chars have prefix 'o', so not in set ['(', '{', '[', '"', '\'', '“', '‘', '«']
};
1325
/// Return value for use_g1_word_mode()
///
/// Tells the caller whether a grade 1 word indicator ('𝟙') applies to the current/next word.
#[derive(Debug, PartialEq)]
enum Grade1WordIndicator {
    NotInWord,         // no '𝟙' in the current/next word
    InWord,            // '𝟙' in the current/next word
    NotInChars,        // no '𝟙' in the entire string (optimization for common case)
}
1333
1334fn remove_unneeded_mode_changes(raw_braille: &str, start_mode: UEB_Mode, start_duration: UEB_Duration) -> String {
1335 // FIX: need to be smarter about moving on wrt to typeforms/typefaces, caps, bold/italic. [maybe just let them loop through the default?]
1336 let mut mode = start_mode;
1337 let mut duration = start_duration;
1338 let mut start_g2_letter = None; // used for start of contraction checks
1339 let mut i_g2_start = None; // set to 'i' when entering G2 mode; None in other modes. '1' indicator goes here if standing alone
1340 let mut cap_word_mode = false; // only set to true in G2 to prevent contractions
1341 let mut result = String::default();
1342 let chars = raw_braille.chars().collect::<Vec<char>>();
1343 let mut g1_word_indicator = Grade1WordIndicator::NotInChars; // almost always true (and often irrelevant)
1344 if mode == UEB_Mode::Grade2 || duration == UEB_Duration::Symbol {
1345 g1_word_indicator = use_g1_word_mode(&chars);
1346 if g1_word_indicator == Grade1WordIndicator::InWord {
1347 mode = UEB_Mode::Grade1;
1348 if duration == UEB_Duration::Symbol {
1349 duration = UEB_Duration::Word; // if Passage mode, leave as is
1350 result.push('𝟙')
1351 }
1352 }
1353 }
1354 let mut i = 0;
1355 while i < chars.len() {
1356 let ch = chars[i];
1357 match mode {
1358 UEB_Mode::Numeric => {
1359 // Numeric Mode: (from https://uebmath.aphtech.org/lesson1.0 and lesson4.0)
1360 // Symbols that can appear within numeric mode include the ten digits, comma, period, simple fraction line,
1361 // line continuation indicator, and numeric space digit symbols.
1362 // A space or any other symbol not listed here terminates numeric mode.
1363 // Numeric mode is also terminated by the "!" -- used after a script
1364 //
1365 // The numeric indicator also turns on grade 1 mode.
1366 // When grade 1 mode is set by the numeric indicator,
1367 // grade 1 indicators are not used unless a single lower-case letter a-j immediately follows a digit.
1368 // Grade 1 mode when set by the numeric indicator is terminated by a space, hyphen, dash, or a grade 1 indicator.
1369 i_g2_start = None;
1370 // debug!("Numeric: ch={}, duration: {:?}", ch, duration);
1371 match ch {
1372 'L' => {
1373 // terminate numeric mode -- duration doesn't change
1374 // let the default case handle pushing on the chars for the letter
1375 if LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
1376 result.push('1'); // need to distinguish a-j from a digit
1377 }
1378 result.push(ch);
1379 i += 1;
1380 mode = UEB_Mode::Grade1;
1381 // duration remains Word
1382 },
1383 '1' | '𝟙' => {
1384 // numeric mode implies grade 1, so don't output indicator;
1385 i += 1;
1386 mode = UEB_Mode::Grade1;
1387 if start_duration == UEB_Duration::Passage {
1388 duration = UEB_Duration::Passage; // otherwise it remains at Word
1389 }
1390 },
1391 '#' => {
1392 // terminate numeric mode -- duration doesn't change
1393 i += 1;
1394 if i+1 < chars.len() && chars[i] == 'L' && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
1395 // special case where the script was numeric and a letter follows, so need to put out G1 indicator
1396 result.push('1');
1397 // the G1 case should work with 'L' now
1398 }
1399 mode = UEB_Mode::Grade1;
1400 },
1401 'N' => {
1402 // stay in the same mode (includes numeric "," and "." space) -- don't let default get these chars
1403 result.push(chars[i+1]);
1404 i += 2;
1405 },
1406 _ => {
1407 // moving out of numeric mode
1408 result.push(ch);
1409 i += 1;
1410 if "W𝐖-—―".contains(ch) {
1411 mode = start_mode;
1412 if mode == UEB_Mode::Grade2 {
1413 start_g2_letter = None; // will be set to real letter
1414 }
1415 if start_duration != UEB_Duration::Passage {
1416 duration = UEB_Duration::Symbol;
1417 }
1418 } else {
1419 mode = UEB_Mode::Grade1
1420 }
1421 },
1422 }
1423 },
1424 UEB_Mode::Grade1 => {
1425 // Grade 1 Mode:
1426 // The numeric indicator also sets grade 1 mode.
1427 // Grade 1 mode, when initiated by the numeric indicator, is terminated by a space, hyphen, dash or grade 1 terminator.
1428 // Grade 1 mode is also set by grade 1 indicators.
1429 i_g2_start = None;
1430 // debug!("Grade 1: ch={}, duration: {:?}", ch, duration);
1431 match ch {
1432 'L' => {
1433 // note: be aware of '#' case for Numeric because '1' might already be generated
1434 // let prev_ch = if i > 1 {chars[i-1]} else {'1'}; // '1' -- anything beside ',' or '.'
1435 // if duration == UEB_Duration::Symbol ||
1436 // ( ",. ".contains(prev_ch) && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) ) {
1437 // result.push('1'); // need to retain grade 1 indicator (RUEB 6.5.2)
1438 // }
1439 // let the default case handle pushing on the chars for the letter
1440 result.push(ch);
1441 i += 1;
1442 },
1443 '1' | '𝟙' => {
1444 assert!(ch == '1' || duration != UEB_Duration::Symbol); // if '𝟙', should be Word or Passage duration
1445 // nothing to do -- let the default case handle the following chars
1446 i += 1;
1447 },
1448 'N' => {
1449 result.push(ch);
1450 result.push(chars[i+1]);
1451 i += 2;
1452 mode = UEB_Mode::Numeric;
1453 duration = UEB_Duration::Word;
1454 },
1455 'W' | '𝐖' => {
1456 // this terminates a word mode if there was one
1457 result.push(ch);
1458 i += 1;
1459 if start_duration != UEB_Duration::Passage {
1460 duration = UEB_Duration::Symbol;
1461 mode = UEB_Mode::Grade2;
1462 }
1463 },
1464 _ => {
1465 result.push(ch);
1466 i += 1;
1467 if duration == UEB_Duration::Symbol && !LETTER_PREFIXES.contains(&ch) {
1468 mode = start_mode;
1469 }
1470 }
1471 }
1472 if mode == UEB_Mode::Grade2 {
1473 start_g2_letter = None; // will be set to real letter
1474 }
1475
1476 },
1477 UEB_Mode::Grade2 => {
1478 // note: if we ended up using a '1', it only extends to the next char, which is also dealt with, so mode doesn't change
1479 if i_g2_start.is_none() {
1480 i_g2_start = Some(i);
1481 cap_word_mode = false;
1482 }
1483 // debug!("Grade 2: ch={}, duration: {:?}", ch, duration);
1484 match ch {
1485 'L' => {
1486 if start_g2_letter.is_none() {
1487 start_g2_letter = Some(i);
1488 }
1489 let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, i);
1490 // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
1491 if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
1492 // debug!(" is_alone -- pushing '1'");
1493 result.push('1');
1494 mode = UEB_Mode::Grade1;
1495 }
1496 // debug!(" pushing {:?}", right_matched_chars);
1497 right_matched_chars.iter().for_each(|&ch| result.push(ch));
1498 i += right_matched_chars.len();
1499 },
1500 'C' => {
1501 // Want 'C' before 'L'; Could be CC for word cap -- if so, eat it and move on
1502 // Note: guaranteed that there is a char after the 'C', so chars[i+1] is safe
1503 if chars[i+1] == 'C' {
1504 cap_word_mode = true;
1505 i += 1;
1506 } else {
1507 let is_greek = chars[i+1] == 'G';
1508 let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, if is_greek {i+2} else {i+1});
1509 // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
1510 if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
1511 // debug!(" is_alone -- pushing '1'");
1512 result.push('1');
1513 mode = UEB_Mode::Grade1;
1514 }
1515 if cap_word_mode {
1516 result.push('C'); // first 'C' if cap word
1517 }
1518 result.push('C');
1519 if is_greek {
1520 result.push('G');
1521 i += 1;
1522 }
1523 start_g2_letter = Some(i);
1524 // debug!(" pushing 'C' + {:?}", right_matched_chars);
1525 right_matched_chars.iter().for_each(|&ch| result.push(ch));
1526 i += 1 + right_matched_chars.len();
1527 }
1528 },
1529 '1' => {
1530 result.push(ch);
1531 i += 1;
1532 mode = UEB_Mode::Grade1;
1533 duration = UEB_Duration::Symbol;
1534 },
1535 '𝟙' => {
1536 // '𝟙' should have forced G1 Word mode
1537 error!("Internal error: '𝟙' found in G2 mode: index={i} in '{raw_braille}'");
1538 i += 1;
1539 }
1540 'N' => {
1541 result.push(ch);
1542 result.push(chars[i+1]);
1543 i += 2;
1544 mode = UEB_Mode::Numeric;
1545 duration = UEB_Duration::Word;
1546 },
1547 _ => {
1548 if let Some(start) = start_g2_letter {
1549 if !cap_word_mode {
1550 result = handle_contractions(&chars[start..i], result);
1551 }
1552 cap_word_mode = false;
1553 start_g2_letter = None; // not start of char sequence
1554 }
1555 result.push(ch);
1556 i += 1;
1557 if !LEFT_INTERVENING_CHARS.contains(&ch) {
1558 cap_word_mode = false;
1559 i_g2_start = Some(i);
1560 }
1561
1562 }
1563 }
1564 if mode != UEB_Mode::Grade2 && !cap_word_mode {
1565 if let Some(start) = start_g2_letter {
1566 result = handle_contractions(&chars[start..i], result);
1567 start_g2_letter = None; // not start of char sequence
1568 }
1569 }
1570 },
1571 }
1572
1573 if (ch == 'W' || ch == '𝐖') && g1_word_indicator != Grade1WordIndicator::NotInChars &&
1574 (mode == UEB_Mode::Grade2 || duration == UEB_Duration::Symbol) {
1575 g1_word_indicator = use_g1_word_mode(&chars[i..]);
1576 if g1_word_indicator == Grade1WordIndicator::InWord {
1577 mode = UEB_Mode::Grade1;
1578 if duration == UEB_Duration::Symbol {
1579 duration = UEB_Duration::Word; // if Passage mode, leave as is
1580 result.push('𝟙')
1581 }
1582 }
1583 }
1584 }
1585 if mode == UEB_Mode::Grade2 {
1586 if let Some(start) = start_g2_letter {
1587 result = handle_contractions(&chars[start..i], result);
1588 }
1589 }
1590
1591 return result;
1592
1593
1594 fn use_g1_word_mode(chars: &[char]) -> Grade1WordIndicator {
1595 // debug!("use_g1_word_mode: chars='{:?}'", chars);
1596 for &ch in chars {
1597 if ch == 'W' || ch == '𝐖' {
1598 return Grade1WordIndicator::NotInWord; // reached a word boundary
1599 }
1600 if ch == '𝟙' {
1601 return Grade1WordIndicator::InWord; // need word mode in this "word"
1602 }
1603 }
1604 return Grade1WordIndicator::NotInChars; //
1605 }
1606}
1607
1608/// Returns a tuple:
1609/// true if the ith char "stands alone" (UEB 2.6)
1610/// the chars on the right that are part of the standing alone sequence
1611/// the number of letters in that sequence
1612/// This basically means a letter sequence surrounded by white space with some potentially intervening chars
1613/// The intervening chars can be typeform/cap indicators, along with various forms of punctuation
1614/// The ith char should be an "L"
1615/// This assumes that there is whitespace before and after the character string
1616fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) {
1617 // scan backward and check the conditions for "standing-alone"
1618 // we scan forward and check the conditions for "standing-alone"
1619 assert_eq!(chars[i], 'L', "'stands_alone' starts with non 'L'");
1620 // debug!("stands_alone: i={}, chars: {:?}", i, chars);
1621 if !left_side_stands_alone(&chars[0..i]) {
1622 return (false, &chars[i..i+2], 0);
1623 }
1624
1625 let (mut is_alone, n_letters, n_right_matched) = right_side_stands_alone(&chars[i+2..]);
1626 // debug!("left is alone, right is alone: {}, : n_letters={}, n_right_matched={}", is_alone, n_letters, n_right_matched);
1627
1628 if is_alone && n_letters == 1 {
1629 let ch = chars[i+1];
1630 if ch=='⠁' || ch=='⠊' || ch=='⠕' { // a, i, o
1631 is_alone = false;
1632 }
1633 }
1634 return (is_alone, &chars[i..i+2+n_right_matched], n_letters);
1635
1636 /// chars before before 'L'
1637 fn left_side_stands_alone(chars: &[char]) -> bool {
1638 // scan backwards to skip letters and intervening chars
1639 // once we hit an intervening char, only intervening chars are allowed if standing alone
1640 let mut intervening_chars_mode = false; // true when we are on the final stretch
1641 let mut i = chars.len();
1642 while i > 0 {
1643 i -= 1;
1644 let ch = chars[i];
1645 let prev_ch = if i > 0 {chars[i-1]} else {' '}; // ' ' is a char not in input
1646 // debug!(" left alone: prev/ch {}/{}", prev_ch, ch);
1647 if (!intervening_chars_mode && prev_ch == 'L') ||
1648 (prev_ch == 'o' || prev_ch == 'b') {
1649 intervening_chars_mode = true;
1650 i -= 1; // ignore 'Lx' and also ignore 'ox'
1651 } else if LEFT_INTERVENING_CHARS.contains(&ch) {
1652 intervening_chars_mode = true;
1653 } else {
1654 return "W𝐖-—―".contains(ch);
1655 }
1656 }
1657
1658 return true;
1659 }
1660
1661 // chars after character we are testing
1662 fn right_side_stands_alone(chars: &[char]) -> (bool, usize, usize) {
1663 // see RUEB 2.6.3
1664 static RIGHT_INTERVENING_CHARS: phf::Set<char> = phf_set! {
1665 'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w', 'e', // indicators
1666 // ')', '}', ']', '\"', '\'', '”', '’', '»', // closing chars
1667 // ',', ';', ':', '.', '…', '!', '?' // punctuation
1668 };
1669 // scan forward to skip letters and intervening chars
1670 // once we hit an intervening char, only intervening chars are allowed if standing alone ('c' and 'b' are part of them)
1671 let mut intervening_chars_mode = false; // true when we are on the final stretch
1672 let mut i = 0;
1673 let mut n_letters = 1; // we have skipped the first letter
1674 while i < chars.len() {
1675 let ch = chars[i];
1676 // debug!(" right alone: ch/next {}/{}", ch, if i+1<chars.len() {chars[i+1]} else {' '});
1677 if !intervening_chars_mode && ch == 'L' {
1678 n_letters += 1;
1679 i += 1; // ignore 'Lx' and also ignore 'ox'
1680 } else if ch == 'c' || ch == 'b' {
1681 i += 1; // ignore 'Lx' and also ignore 'ox'
1682 } else if RIGHT_INTERVENING_CHARS.contains(&ch) {
1683 intervening_chars_mode = true;
1684 } else {
1685 return if "W𝐖-—―".contains(ch) {(true, n_letters, i)} else {(false, n_letters, i)};
1686 }
1687 i += 1;
1688 }
1689
1690 return (true, n_letters, chars.len());
1691 }
1692}
1693
1694
1695/// Return a modified result if chars can be contracted.
1696/// Otherwise, the original string is returned
1697fn handle_contractions(chars: &[char], mut result: String) -> String {
1698 struct Replacement {
1699 pattern: String,
1700 replacement: &'static str
1701 }
1702
1703 const ASCII_TO_UNICODE: &[char] = &[
1704 '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌',
1705 '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹',
1706 '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕',
1707 '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸',
1708 ];
1709
1710 fn to_unicode_braille(ascii: &str) -> String {
1711 let mut unicode = String::with_capacity(4*ascii.len()); // 'L' + 3 bytes for braille char
1712 for ch in ascii.as_bytes() {
1713 unicode.push('L');
1714 unicode.push(ASCII_TO_UNICODE[(ch.to_ascii_uppercase() - 32) as usize])
1715 }
1716 return unicode;
1717 }
1718
1719 // It would be much better from an extensibility point of view to read the table in from a file
1720 lazy_static! {
1721 static ref CONTRACTIONS: Vec<Replacement> = vec![
1722 // 10.3: Strong contractions
1723 Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"},
1724 Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"},
1725 Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"},
1726 Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"},
1727 Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"},
1728
1729 // 10.8: final-letter group signs (this need to precede 'en' and any other shorter contraction)
1730 Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment
1731 Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion
1732
1733 // 10.4: Strong group signs
1734 Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"},
1735 Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"},
1736 Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"},
1737 Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"},
1738 Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"},
1739 Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"},
1740 Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"},
1741 Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"},
1742 Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"},
1743 Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"},
1744 Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" }, // 'ing', not at start
1745 Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"},
1746
1747 // 10.6.5: Lower group signs preceded and followed by letters
1748 // FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package
1749 // Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words
1750 // Add it back after implementing a lookup dictionary of exceptions
1751 Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" }, // ea
1752 Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" }, // bb
1753 // Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" }, // cc
1754 Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" }, // ff
1755 Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" }, // gg
1756
1757 // 10.6.8: Lower group signs ("in" also 10.5.4 lower word signs)
1758 // FIX: these need restrictions about only applying when upper dots are present
1759 Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"},
1760 Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"},
1761
1762 ];
1763
1764 static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(&CONTRACTIONS);
1765 static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(&CONTRACTIONS);
1766 }
1767
1768 let mut chars_as_str = chars.iter().collect::<String>();
1769 // debug!(" handle_contractions: examine '{}'", &chars_as_str);
1770 let matches = CONTRACTION_PATTERNS.matches(&chars_as_str);
1771 for i in matches.iter() {
1772 let element = &CONTRACTIONS[i];
1773 // debug!(" replacing '{}' with '{}' in '{}'", element.pattern, element.replacement, &chars_as_str);
1774 result.truncate(result.len() - chars_as_str.len());
1775 chars_as_str = CONTRACTION_REGEX[i].replace_all(&chars_as_str, element.replacement).to_string();
1776 result.push_str(&chars_as_str);
1777 // debug!(" result after replace '{}'", result);
1778 }
1779 return result;
1780
1781
1782
1783 fn init_patterns(contractions: &[Replacement]) -> RegexSet {
1784 let mut vec: Vec<&str> = Vec::with_capacity(contractions.len());
1785 for contraction in contractions {
1786 vec.push(&contraction.pattern);
1787 }
1788 return RegexSet::new(&vec).unwrap();
1789 }
1790
1791 fn init_regex(contractions: &[Replacement]) -> Vec<Regex> {
1792 let mut vec = Vec::with_capacity(contractions.len());
1793 for contraction in contractions {
1794 vec.push(Regex::new(&contraction.pattern).unwrap());
1795 }
1796 return vec;
1797 }
1798}
1799
1800
1801
1802
/// Maps the single-character indicators of the intermediate braille string to their
/// Vietnamese braille output (an empty string drops the indicator).
///
/// The "XXX" values are placeholders: `vietnam_cleanup` substitutes user preference values
/// for "S", "𝔹", "D", and "V" before it consults this table.
static VIETNAM_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠘",     // bold
    "𝔹" => "XXX",     // blackboard -- from prefs
    "T" => "⠈",     // script
    "I" => "⠨",     // italic
    "R" => "",      // roman
    // "E" => "⠰",     // English
    "1" => "⠠",      // Grade 1 symbol
    "L" => "",     // Letter left in to assist in locating letters
    "D" => "XXX",    // German (Deutsche) -- from prefs
    "G" => "⠰",     // Greek
    "V" => "XXX",    // Greek Variants
    // "H" => "⠠⠠",    // Hebrew
    // "U" => "⠈⠈",    // Russian
    "C" => "⠨",      // capital
    "𝑐" => "",       // second or later braille cell of a capital letter
    "𝐶" => "⠨",      // capital that never should get word indicator (from chemical element)
    "N" => "⠼",     // number indicator
    "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace
    "s" => "⠆",     // typeface single char indicator
    "w" => "",     // typeface word indicator
    "e" => "",     // typeface & capital terminator
    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    "c" => "",       // flag that what follows is a close indicator (used for standing alone rule)
    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "⠂",     // comma
    "." => "⠲",     // period
    "-" => "-",     // hyphen
    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "",      // signals end of script
    "!" => "",      // Hack used to prevent some regular expression matches
};
1839
1840fn vietnam_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
1841 lazy_static! {
1842 // Deal with Vietnamese "rhymes" -- moving accents around
1843 // See "Vietnamese Uncontracted Braille Update in MathCAT" or maybe https://icanreadvietnamese.com/blog/14-rule-of-tone-mark-placement
1844 // Note: I don't know how to write (for example) I_E_RULE so that it excludes "qu" and "gi", so I use two rules
1845 // The first rule rewrites the patterns with "qu" and "gi" to add "!" to prevent a match of the second rule -- "!" is dropped later
1846 static ref QU_GI_RULE_EXCEPTION: Regex = Regex::new(r"(L⠟L⠥|L⠛L⠊)").unwrap();
1847 static ref IUOY_E_RULE: Regex = Regex::new(r"L(⠊|⠥|⠕|⠽)(L[⠔⠰⠢⠤⠠])L(⠑|⠣)").unwrap(); // ie, ue, oe, and ye rule
1848 static ref UO_A_RULE: Regex = Regex::new(r"L(⠥|⠕)(L[⠔⠰⠢⠤⠠])L(⠁|⠡|⠜)").unwrap(); // ua, oa rule
1849 static ref UU_O_RULE: Regex = Regex::new(r"L(⠥|⠳)(L[⠔⠰⠢⠤⠠])L(⠪|⠹)").unwrap(); // uo, ưo rule
1850 static ref UYE_RULE: Regex = Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽L⠣").unwrap(); // uo, ưo rule
1851 static ref UY_RULE: Regex = Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽").unwrap(); // uo, ưo rule
1852 static ref REPLACE_INDICATORS: Regex =Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb!])").unwrap();
1853
1854 }
1855 // debug!("vietnam_cleanup: start={}", raw_braille);
1856 let result = typeface_to_word_mode(&raw_braille);
1857 let result = capitals_to_word_mode(&result);
1858
1859 let result = result.replace("tW", "W");
1860 let result = result.replace("CG", "⠸"); // capital Greek letters are problematic in Vietnam braille
1861 let result = result.replace("CC", "⠸"); // capital word more is the same as capital Greek letters
1862 // debug!(" after typeface/caps={}", &result);
1863
1864 // deal with "rhymes"
1865 let result = QU_GI_RULE_EXCEPTION.replace_all(&result, "${1}!");
1866 // debug!(" after except={}", &result);
1867 let result = IUOY_E_RULE.replace_all(&result, "${2}L${1}L${3}");
1868 // debug!(" after IUOY_E={}", &result);
1869 let result = UO_A_RULE.replace_all(&result, "${2}L${1}L${3}");
1870 // debug!(" after UO_A={}", &result);
1871 let result = UU_O_RULE.replace_all(&result, "${2}L${1}L${3}");
1872 // debug!(" after UO_O={}", &result);
1873 let result = UYE_RULE.replace_all(&result, "${1}L⠥L⠽L⠣"); // longer match first
1874 // debug!(" after UYE={}", &result);
1875 let result = UY_RULE.replace_all(&result, "${1}L⠥L⠽");
1876 // debug!(" after UY={}", &result);
1877
1878 // these typeforms need to get pulled from user-prefs as they are transcriber-defined
1879 let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
1880 let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
1881 let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
1882 let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
1883
1884 // This reuses the code just for getting rid of unnecessary "L"s and "N"s
1885 let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
1886
1887
1888 let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
1889 let matched_char = &cap[0];
1890 match matched_char {
1891 "𝔹" => &double_struck,
1892 "S" => &sans_serif,
1893 "D" => &fraktur,
1894 "V" => &greek_variant,
1895 _ => match VIETNAM_INDICATOR_REPLACEMENTS.get(matched_char) {
1896 None => {error!("REPLACE_INDICATORS and VIETNAM_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
1897 Some(&ch) => ch,
1898 },
1899 }
1900 });
1901
1902 // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
1903 // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
1904 let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
1905
1906 return result.to_string();
1907}
1908
1909
/// Maps the single-character indicators of the intermediate braille string to their
/// CMU braille output (an empty string drops the indicator).
///
/// `cmu_cleanup` looks every matched indicator up here directly; no preference values are involved.
/// Entries that are commented out are indicators that currently have no mapping in this table.
static CMU_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    // "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠔",     // bold
    "𝔹" => "⠬",     // blackboard
    // "T" => "⠈",     // script
    "I" => "⠔",     // italic -- same as bold
    // "R" => "",      // roman
    // "E" => "⠰",     // English
    "1" => "⠐",      // Grade 1 symbol -- used here for a-j after number
    "L" => "",     // Letter left in to assist in locating letters
    "D" => "⠠",     // German (Gothic)
    "G" => "⠈",     // Greek
    "V" => "⠈⠬",    // Greek Variants
    // "H" => "⠠⠠",    // Hebrew
    // "U" => "⠈⠈",    // Russian
    "C" => "⠨",      // capital
    "𝐶" => "⠨",      // capital that never should get word indicator (from chemical element)
    "N" => "⠼",     // number indicator
    "𝑁" => "",      // continue number
    // "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace
    // "𝘄" => "⠀",     // conditional whitespace -- handled separately by a special rule in cmu_cleanup, so commented out
    "s" => "",     // typeface single char indicator
    // "w" => "⠂",     // typeface word indicator
    // "e" => "⠄",     // typeface & capital terminator
    // "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    // "c" => "",       // flag that what follows is a close indicator (used for standing alone rule)
    // "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "⠂",     // comma
    "." => "⠄",     // period
    "-" => "⠤",     // hyphen
    "—" => "⠤⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    // "―" => "⠐⠤⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "⠼",      // signals to end/restart of numeric mode (mixed fractions)
};
1946
1947
1948fn cmu_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
1949 lazy_static! {
1950 static ref ADD_WHITE_SPACE: Regex = Regex::new(r"𝘄(.)|𝘄$").unwrap();
1951 }
1952
1953 // debug!("cmu_cleanup: start={}", raw_braille);
1954 // let result = typeface_to_word_mode(&raw_braille);
1955
1956 // let result = result.replace("tW", "W");
1957 let result = raw_braille.replace("CG", "⠘")
1958 .replace("𝔹C", "⠩")
1959 .replace("DC", "⠰");
1960 // let result = result.replace("CC", "⠸");
1961
1962 // these typeforms need to get pulled from user-prefs as they are transcriber-defined
1963 // let double_struck = pref_manager.pref_to_string("CMU_DoubleStruck");
1964 // let sans_serif = pref_manager.pref_to_string("CMU_SansSerif");
1965 // let fraktur = pref_manager.pref_to_string("CMU_Fraktur");
1966
1967 // debug!("Before remove mode changes: '{}'", &result);
1968 // This reuses the code just for getting rid of unnecessary "L"s and "N"s
1969 let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
1970 let result = result.replace("𝑁N", "");
1971 // debug!(" After remove mode changes: '{}'", &result);
1972
1973 let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
1974 match CMU_INDICATOR_REPLACEMENTS.get(&cap[0]) {
1975 None => {error!("REPLACE_INDICATORS and CMU_INDICATOR_REPLACEMENTS are not in sync"); ""},
1976 Some(&ch) => ch,
1977 }
1978 });
1979 let result = ADD_WHITE_SPACE.replace_all(&result, |cap: &Captures| {
1980 if cap.get(1).is_none() {
1981 return "⠀".to_string();
1982 } else {
1983 // debug!("ADD_WHITE_SPACE match='{}', has left dots = {}", &cap[1], has_left_dots(cap[1].chars().next().unwrap()));
1984 let mut next_chars = cap[1].chars();
1985 let next_char = next_chars.next().unwrap();
1986 assert!(next_chars.next().is_none());
1987 return (if has_left_dots(next_char) {"⠀"} else {""}).to_string() + &cap[1];
1988 }
1989 });
1990
1991 // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
1992 let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
1993 let result = result.trim_start_matches('⠀'); // don't trip end (e.g., see once::vector_11_2_5)
1994 return result.to_string();
1995
1996 fn has_left_dots(ch: char) -> bool {
1997 // Unicode braille is set up so dot 1 is 2^0, dot 2 is 2^1, etc
1998 return ( (ch as u32 - 0x2800) >> 4 ) > 0;
1999 }
2000}
2001
2002
2003
/// Maps the single-character indicators of the intermediate braille string to their
/// Swedish braille output (an empty string drops the indicator).
///
/// The "XXX" values are placeholders: `swedish_cleanup` substitutes user preference values
/// for "S", "𝔹", "D", and "V" before it consults this table.
static SWEDISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    // FIX: this needs cleaning up -- not all of these are used
    "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠨",     // bold
    "𝔹" => "XXX",     // blackboard -- from prefs
    "T" => "⠈",     // script
    "I" => "⠨",     // italic
    "R" => "",      // roman
    "1" => "⠱",      // Grade 1 symbol (used for number followed by a letter)
    "L" => "",     // Letter left in to assist in locating letters
    "D" => "XXX",    // German (Deutsche) -- from prefs
    "G" => "⠰",     // Greek
    "V" => "XXX",    // Greek Variants
    // "H" => "⠠⠠",    // Hebrew
    // "U" => "⠈⠈",    // Russian
    "C" => "⠠",      // capital
    "𝑐" => "",       // second or later braille cell of a capital letter
    "𝐶" => "⠠",      // capital that never should get word indicator (from chemical element)
    "N" => "⠼",     // number indicator
    "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace
    "w" => "⠀",     // whitespace after function name
    "s" => "",     // typeface single char indicator
    "e" => "",     // typeface & capital terminator
    "E" => "⠱",     // empty base -- see index of radical
    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    "c" => "",       // flag that what follows is a close indicator (used for standing alone rule)
    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "⠂",     // comma
    "." => "⠲",     // period
    "-" => "-",     // hyphen
    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "",      // signals end of script

};
2041
2042
/// Maps the single-character indicators of the intermediate braille string to their
/// Finnish braille output (an empty string drops the indicator).
///
/// The "XXX" values are placeholders: `finnish_cleanup` substitutes user preference values
/// for "S", "𝔹", "D", and "V" before it consults this table.
static FINNISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    // FIX: this needs cleaning up -- not all of these are used
    "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠨",     // bold
    "𝔹" => "XXX",     // blackboard -- from prefs
    "T" => "⠈",     // script
    "I" => "⠨",     // italic
    "R" => "",      // roman
    "E" => "⠰",     // English
    "1" => "⠀",      // Grade 1 symbol (used for number followed by a letter)
    "L" => "",     // Letter left in to assist in locating letters
    "D" => "XXX",    // German (Deutsche) -- from prefs
    "G" => "⠨",     // Greek
    "V" => "XXX",    // Greek Variants
    // "H" => "⠠⠠",    // Hebrew
    // "U" => "⠈⠈",    // Russian
    "C" => "⠠",      // capital
    "𝑐" => "",       // second or later braille cell of a capital letter
    "𝐶" => "⠠",      // capital that never should get whitespace in front (from chemical element)
    "N" => "⠼",     // number indicator
    "n" => "⠼",     // number indicator for drop numbers (special case with close parens)
    "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace
    "s" => "⠆",     // typeface single char indicator
    "w" => "",     // typeface word indicator
    "e" => "",     // typeface & capital terminator
    "," => "⠂",     // comma
    "." => "⠲",     // period
    "-" => "-",     // hyphen
    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "(" => "⠦",     // Not really needed, but done for consistency with ")"
    ")" => "⠴",     // Needed for rules with drop numbers to avoid mistaking for dropped 0
    "↑" => "⠬",     // superscript
    "↓" => "⠡",     // subscript
    "#" => "",      // signals end of script
    "Z" => "⠐",     // signals end of index of root, integrand/lim from function ("zone change")

};
2083
2084fn finnish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2085 lazy_static! {
2086 static ref REPLACE_INDICATORS: Regex =Regex::new(r"([SB𝔹TIREDGVHUP𝐏C𝐶LlMmb↑↓Nn𝑁WwZ,()])").unwrap();
2087 // Numbers need to end with a space, but sometimes there is one there for other reasons
2088 static ref DROP_NUMBER_SEPARATOR: Regex = Regex::new(r"(n.)\)").unwrap();
2089 static ref NUMBER_MATCH: Regex = Regex::new(r"((N.)+[^WN𝐶#↑↓Z])").unwrap();
2090 }
2091
2092 // debug!("finnish_cleanup: start={}", raw_braille);
2093 let result = DROP_NUMBER_SEPARATOR.replace_all(&raw_braille, |cap: &Captures| {
2094 // match includes the char after the number -- insert the whitespace before it
2095 // debug!("DROP_NUMBER_SEPARATOR match='{}'", &cap[1]);
2096 return cap[1].to_string() + "𝐶)"; // hack to use "𝐶" instead of dot 6 directly, but works for NUMBER_MATCH
2097 });
2098 let result = result.replace('n', "N"); // avoids having to modify remove_unneeded_mode_changes()
2099 let result = NUMBER_MATCH.replace_all(&result, |cap: &Captures| {
2100 // match includes the char after the number -- insert the whitespace before it
2101 // debug!("NUMBER_MATCH match='{}'", &cap[1]);
2102 let mut chars = cap[0].chars();
2103 let last_char = chars.next_back().unwrap(); // unwrap safe since several chars were matched
2104 return chars.as_str().to_string() + "W" + &last_char.to_string();
2105 });
2106
2107 // FIX: need to implement this -- this is just a copy of the Vietnam code
2108 let result = result.replace("CG", "⠘")
2109 .replace("𝔹C", "⠩")
2110 .replace("DC", "⠰");
2111
2112 // debug!(" after typeface/caps={}", &result);
2113
2114 // these typeforms need to get pulled from user-prefs as they are transcriber-defined
2115 let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
2116 let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
2117 let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
2118 let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
2119
2120 // This reuses the code just for getting rid of unnecessary "L"s and "N"s
2121 let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
2122 // debug!(" remove_unneeded_mode_changes={}", &result);
2123
2124
2125 let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
2126 let matched_char = &cap[0];
2127 match matched_char {
2128 "𝔹" => &double_struck,
2129 "S" => &sans_serif,
2130 "D" => &fraktur,
2131 "V" => &greek_variant,
2132 _ => match FINNISH_INDICATOR_REPLACEMENTS.get(matched_char) {
2133 None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
2134 Some(&ch) => ch,
2135 },
2136 }
2137 });
2138
2139 // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
2140 // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
2141 let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
2142
2143 return result.to_string();
2144}
2145
2146
2147fn swedish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2148 // FIX: need to implement this -- this is just a copy of the Vietnam code
2149 lazy_static! {
2150 // Empty bases are ok if they follow whitespace
2151 static ref EMPTY_BASE: Regex = Regex::new(r"(^|[W𝐖w])E").unwrap();
2152 }
2153 // debug!("swedish_cleanup: start={}", raw_braille);
2154 let result = typeface_to_word_mode(&raw_braille);
2155 let result = capitals_to_word_mode(&result);
2156
2157 let result = result.replace("CG", "⠘")
2158 .replace("𝔹C", "⠩")
2159 .replace("DC", "⠰");
2160
2161 // debug!(" after typeface/caps={}", &result);
2162
2163 // these typeforms need to get pulled from user-prefs as they are transcriber-defined
2164 let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
2165 let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
2166 let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
2167 let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
2168
2169 // This reuses the code just for getting rid of unnecessary "L"s and "N"s
2170 let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
2171 // debug!(" after removing mode changes={}", &result);
2172
2173
2174 let result = EMPTY_BASE.replace_all(&result, "$1");
2175 let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
2176 let matched_char = &cap[0];
2177 match matched_char {
2178 "𝔹" => &double_struck,
2179 "S" => &sans_serif,
2180 "D" => &fraktur,
2181 "V" => &greek_variant,
2182 _ => match SWEDISH_INDICATOR_REPLACEMENTS.get(matched_char) {
2183 None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
2184 Some(&ch) => ch,
2185 },
2186 }
2187 });
2188
2189 // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
2190 // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
2191 let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
2192
2193 return result.to_string();
2194}
2195
2196#[allow(non_snake_case)]
2197fn LaTeX_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2198 lazy_static! {
2199 static ref REMOVE_SPACE: Regex =Regex::new(r" ([\^_,;)\]}])").unwrap(); // '^', '_', ',', ';', ')', ']', '}'
2200 static ref COLLAPSE_SPACES: Regex = Regex::new(r" +").unwrap();
2201 }
2202 // debug!("LaTeX_cleanup: start={}", raw_braille);
2203 let result = raw_braille.replace('𝐖', " ");
2204 // let result = COLLAPSE_SPACES.replace_all(&raw_braille, "⠀");
2205 let result = COLLAPSE_SPACES.replace_all(&result, " ");
2206 // debug!("After collapse: {}", &result);
2207 let result = REMOVE_SPACE.replace_all(&result, "$1");
2208 // debug!("After remove: {}", &result);
2209 // let result = result.trim_matches('⠀');
2210 let result = result.trim_matches(' ');
2211
2212 return result.to_string();
2213}
2214
2215#[allow(non_snake_case)]
2216fn ASCIIMath_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2217 lazy_static! {
2218 static ref REMOVE_SPACE_BEFORE_OP: Regex = Regex::new(r#"([\w\d]) +([^\w\d"]|[\^_,;)\]}])"#).unwrap();
2219 static ref REMOVE_SPACE_AFTER_OP: Regex = Regex::new(r#"([^\^_,;)\]}\w\d"]) +([\w\d])"#).unwrap();
2220 static ref COLLAPSE_SPACES: Regex = Regex::new(r" +").unwrap();
2221 }
2222 // debug!("ASCIIMath_cleanup: start={}", raw_braille);
2223 let result = raw_braille.replace("|𝐖__|", "|𝐰__|"); // protect the whitespace to prevent misinterpretation as lfloor
2224 let result = result.replace('𝐖', " ");
2225 let result = COLLAPSE_SPACES.replace_all(&result, " ");
2226 // debug!("After collapse: {}", &result);
2227 let result = REMOVE_SPACE_BEFORE_OP.replace_all(&result, "$1$2");
2228 let result = REMOVE_SPACE_AFTER_OP.replace_all(&result, "$1$2");
2229 let result = result.replace('𝐰', " "); // spaces around relational operators
2230 let result = COLLAPSE_SPACES.replace_all(&result, " ");
2231 // debug!("After remove: {}", &result);
2232 // let result = result.trim_matches('⠀');
2233 let result = result.trim_matches(' ');
2234
2235 return result.to_string();
2236}
2237
2238
2239/************** Braille xpath functionality ***************/
2240use crate::canonicalize::{name, as_element, as_text};
2241use crate::xpath_functions::{is_leaf, IsBracketed, validate_one_node};
2242use sxd_document::dom::ParentOfChild;
2243use sxd_xpath::{Value, context, nodeset::*};
2244use sxd_xpath::function::{Function, Args};
2245use sxd_xpath::function::Error as XPathError;
2246use std::result::Result as StdResult;
2247
2248pub struct NemethNestingChars;
2249const NEMETH_FRAC_LEVEL: &str = "data-nemeth-frac-level"; // name of attr where value is cached
2250const FIRST_CHILD_ONLY: &[&str] = &["mroot", "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"];
2251impl NemethNestingChars {
2252 // returns a 'repeat_char' corresponding to the Nemeth rules for nesting
2253 // note: this value is likely one char too long because the starting fraction is counted
2254 fn nemeth_frac_value(node: Element, repeat_char: &str) -> String {
2255 let children = node.children();
2256 let name = name(node);
2257 if is_leaf(node) {
2258 return "".to_string();
2259 } else if name == "mfrac" {
2260 // have we already computed the value?
2261 if let Some(value) = node.attribute_value(NEMETH_FRAC_LEVEL) {
2262 return value.to_string();
2263 }
2264
2265 let num_value = NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char);
2266 let denom_value = NemethNestingChars::nemeth_frac_value(as_element(children[1]), repeat_char);
2267 let mut max_value = if num_value.len() > denom_value.len() {num_value} else {denom_value};
2268 max_value += repeat_char;
2269 node.set_attribute_value(NEMETH_FRAC_LEVEL, &max_value);
2270 return max_value;
2271 } else if FIRST_CHILD_ONLY.contains(&name) {
2272 // only look at the base -- ignore scripts/index
2273 return NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char);
2274 } else {
2275 let mut result = "".to_string();
2276 for child in children {
2277 let value = NemethNestingChars::nemeth_frac_value(as_element(child), repeat_char);
2278 if value.len() > result.len() {
2279 result = value;
2280 }
2281 }
2282 return result;
2283 }
2284 }
2285
2286 fn nemeth_root_value(node: Element, repeat_char: &str) -> StdResult<String, XPathError> {
2287 // returns the correct number of repeat_chars to use
2288 // note: because the highest count is toward the leaves and
2289 // because this is a loop and not recursive, caching doesn't work without a lot of overhead
2290 let parent = node.parent().unwrap();
2291 if let ParentOfChild::Element(e) = parent {
2292 let mut parent = e;
2293 let mut result = "".to_string();
2294 loop {
2295 let name = name(parent);
2296 if name == "math" {
2297 return Ok( result );
2298 }
2299 if name == "msqrt" || name == "mroot" {
2300 result += repeat_char;
2301 }
2302 let parent_of_child = parent.parent().unwrap();
2303 if let ParentOfChild::Element(e) = parent_of_child {
2304 parent = e;
2305 } else {
2306 return Err( sxd_xpath::function::Error::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
2307 }
2308 }
2309 }
2310 return Err( XPathError::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
2311 }
2312}
2313
2314impl Function for NemethNestingChars {
2315/**
2316 * Returns a string with the correct number of nesting chars (could be an empty string)
2317 * @param(node) -- current node
2318 * @param(char) -- char (string) that should be repeated
2319 * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
2320 */
2321 fn evaluate<'d>(&self,
2322 _context: &context::Evaluation<'_, 'd>,
2323 args: Vec<Value<'d>>)
2324 -> StdResult<Value<'d>, XPathError>
2325 {
2326 let mut args = Args(args);
2327 args.exactly(2)?;
2328 let repeat_char = args.pop_string()?;
2329 let node = crate::xpath_functions::validate_one_node(args.pop_nodeset()?, "NestingChars")?;
2330 if let Node::Element(el) = node {
2331 let name = name(el);
2332 // it is likely a bug to call this one a non mfrac
2333 if name == "mfrac" {
2334 // because it is called on itself, the fraction is counted one too many times -- chop one off
2335 // this is slightly messy because we are chopping off a char, not a byte
2336 const BRAILLE_BYTE_LEN: usize = "⠹".len(); // all Unicode braille symbols have the same number of bytes
2337 return Ok( Value::String( NemethNestingChars::nemeth_frac_value(el, &repeat_char)[BRAILLE_BYTE_LEN..].to_string() ) );
2338 } else if name == "msqrt" || name == "mroot" {
2339 return Ok( Value::String( NemethNestingChars::nemeth_root_value(el, &repeat_char)? ) );
2340 } else {
2341 panic!("NestingChars chars should be used only on 'mfrac'. '{}' was passed in", name);
2342 }
2343 } else {
2344 // not an element, so nothing to do
2345 return Ok( Value::String("".to_string()) );
2346 }
2347 }
2348}
2349
2350pub struct BrailleChars;
2351impl BrailleChars {
2352 // returns a string for the chars in the *leaf* node.
2353 // this string follows the Nemeth rules typefaces and deals with mathvariant
2354 // which has partially turned chars to the alphanumeric block
2355 fn get_braille_chars(node: Element, code: &str, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> {
2356 let result = match code {
2357 "Nemeth" => BrailleChars::get_braille_nemeth_chars(node, text_range),
2358 "UEB" => BrailleChars:: get_braille_ueb_chars(node, text_range),
2359 "CMU" => BrailleChars:: get_braille_cmu_chars(node, text_range),
2360 "Vietnam" => BrailleChars:: get_braille_vietnam_chars(node, text_range),
2361 "Swedish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement
2362 "Finnish" => BrailleChars:: get_braille_ueb_chars(node, text_range), // FIX: need to figure out what to implement
2363 _ => return Err(sxd_xpath::function::Error::Other(format!("get_braille_chars: unknown braille code '{code}'")))
2364 };
2365 return match result {
2366 Ok(string) => Ok(make_quoted_string(string)),
2367 Err(err) => return Err(sxd_xpath::function::Error::Other(err.to_string())),
2368 }
2369 }
2370
2371 fn get_braille_nemeth_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2372 lazy_static! {
2373 // To greatly simplify typeface/language generation, the chars have unique ASCII chars for them:
2374 // Typeface: S: sans-serif, B: bold, 𝔹: blackboard, T: script, I: italic, R: Roman
2375 // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
2376 // Indicators: C: capital, L: letter, N: number, P: punctuation, M: multipurpose
2377 static ref PICK_APART_CHAR: Regex =
2378 Regex::new(r"(?P<face>[SB𝔹TIR]*)(?P<lang>[EDGVHU]?)(?P<cap>C?)(?P<letter>L?)(?P<num>[N]?)(?P<char>.)").unwrap();
2379 }
2380 let math_variant = node.attribute_value("mathvariant");
2381 // FIX: cover all the options -- use phf::Map
2382 let attr_typeface = match math_variant {
2383 None => "R",
2384 Some(variant) => match variant {
2385 "bold" => "B",
2386 "italic" => "I",
2387 "double-struck" => "𝔹",
2388 "script" => "T",
2389 "fraktur" => "D",
2390 "sans-serif" => "S",
2391 _ => "R", // normal and unknown
2392 },
2393 };
2394 let text = BrailleChars::substring(as_text(node), &text_range);
2395 let braille_chars = braille_replace_chars(&text, node)?;
2396 // debug!("Nemeth chars: text='{}', braille_chars='{}'", &text, &braille_chars);
2397
2398 // we want to pull the prefix (typeface, language) out to the front until a change happens
2399 // the same is true for number indicator
2400 // also true (sort of) for capitalization -- if all caps, use double cap in front (assume abbr or Roman Numeral)
2401
2402 // we only care about this for numbers and identifiers/text, so we filter for only those
2403 let node_name = name(node);
2404 let is_in_enclosed_list = node_name != "mo" && BrailleChars::is_in_enclosed_list(node);
2405 let is_mn_in_enclosed_list = is_in_enclosed_list && node_name == "mn";
2406 let mut typeface = "R".to_string(); // assumption is "R" and if attr or letter is different, something happens
2407 let mut is_all_caps = true;
2408 let mut is_all_caps_valid = false; // all_caps only valid if we did a replacement
2409 let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2410 // debug!(" face: {:?}, lang: {:?}, num {:?}, letter: {:?}, cap: {:?}, char: {:?}",
2411 // &caps["face"], &caps["lang"], &caps["num"], &caps["letter"], &caps["cap"], &caps["char"]);
2412 let mut nemeth_chars = "".to_string();
2413 let char_face = if caps["face"].is_empty() {attr_typeface} else {&caps["face"]};
2414 let typeface_changed = typeface != char_face;
2415 if typeface_changed {
2416 typeface = char_face.to_string(); // needs to outlast this instance of the loop
2417 nemeth_chars += &typeface;
2418 nemeth_chars += &caps["lang"];
2419 } else {
2420 nemeth_chars += &caps["lang"];
2421 }
2422 // debug!(" typeface changed: {}, is_in_list: {}; num: {}", typeface_changed, is_in_enclosed_list, !caps["num"].is_empty());
2423 if !caps["num"].is_empty() && (typeface_changed || !is_mn_in_enclosed_list) {
2424 nemeth_chars += "N";
2425 }
2426 is_all_caps_valid = true;
2427 is_all_caps &= !&caps["cap"].is_empty();
2428 nemeth_chars += &caps["cap"]; // will be stripped later if all caps
2429 if is_in_enclosed_list {
2430 nemeth_chars += &caps["letter"].replace('L', "l");
2431 } else {
2432 nemeth_chars += &caps["letter"];
2433 }
2434 nemeth_chars += &caps["char"];
2435 return nemeth_chars;
2436 });
2437 // debug!(" result: {}", &result);
2438 let mut text_chars = text.chars(); // see if more than one char
2439 if is_all_caps_valid && is_all_caps && text_chars.next().is_some() && text_chars.next().is_some() {
2440 return Ok( "CC".to_string() + &result.replace('C', ""));
2441 } else {
2442 return Ok( result.to_string() );
2443 }
2444 }
2445
2446 fn get_braille_ueb_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2447 // Because in UEB typeforms and caps may extend for multiple tokens,
2448 // this routine merely deals with the mathvariant attr.
2449 // Canonicalize has already transformed all chars it can to math alphanumerics, but not all have bold/italic
2450 // The typeform/caps transforms to (potentially) word mode are handled later.
2451 lazy_static! {
2452 static ref HAS_TYPEFACE: Regex = Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap();
2453 static ref PICK_APART_CHAR: Regex =
2454 Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap();
2455 }
2456
2457 let math_variant = node.attribute_value("mathvariant");
2458 let text = BrailleChars::substring(as_text(node), &text_range);
2459 let mut braille_chars = braille_replace_chars(&text, node)?;
2460
2461 // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars);
2462 if math_variant.is_none() { // nothing we need to do
2463 return Ok(braille_chars);
2464 }
2465 // mathvariant could be "sans-serif-bold-italic" -- get the parts
2466 let math_variant = math_variant.unwrap();
2467 let italic = math_variant.contains("italic");
2468 if italic & !braille_chars.contains('I') {
2469 braille_chars = "I".to_string() + &braille_chars;
2470 }
2471 let bold = math_variant.contains("bold");
2472 if bold & !braille_chars.contains('B') {
2473 braille_chars = "B".to_string() + &braille_chars;
2474 }
2475 let typeface = match HAS_TYPEFACE.find(math_variant) {
2476 None => "",
2477 Some(m) => match m.as_str() {
2478 "double-struck" => "𝔹",
2479 "script" => "T",
2480 "fraktur" => "D",
2481 "sans-serif" => "S",
2482 // don't consider monospace as a typeform
2483 _ => "",
2484 },
2485 };
2486 let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2487 // debug!("captures: {:?}", caps);
2488 // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}",
2489 // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]);
2490 if bold || !caps["bold"].is_empty() {"B"} else {""}.to_string()
2491 + if italic || !caps["italic"].is_empty() {"I"} else {""}
2492 + if !&caps["face"].is_empty() {&caps["face"]} else {typeface}
2493 + &caps["cap"]
2494 + &caps["greek"]
2495 + &caps["char"]
2496 });
2497 // debug!("get_braille_ueb_chars: '{}'", &result);
2498 return Ok(result.to_string())
2499 }
2500
2501 fn get_braille_cmu_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2502 // In CMU, we need to replace spaces used for number blocks with "."
2503 // For other numbers, we need to add "." to create digit blocks
2504
2505 lazy_static! {
2506 static ref HAS_TYPEFACE: Regex = Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap();
2507 static ref PICK_APART_CHAR: Regex =
2508 Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap();
2509 }
2510
2511 let math_variant = node.attribute_value("mathvariant");
2512 let text = BrailleChars::substring(as_text(node), &text_range);
2513 let text = add_separator(text);
2514
2515 let braille_chars = braille_replace_chars(&text, node)?;
2516
2517 // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars);
2518 if math_variant.is_none() { // nothing we need to do
2519 return Ok(braille_chars);
2520 }
2521 // mathvariant could be "sans-serif-bold-italic" -- get the parts
2522 let math_variant = math_variant.unwrap();
2523 let bold = math_variant.contains("bold");
2524 let italic = math_variant.contains("italic");
2525 let typeface = match HAS_TYPEFACE.find(math_variant) {
2526 None => "",
2527 Some(m) => match m.as_str() {
2528 "double-struck" => "𝔹",
2529 "script" => "T",
2530 "fraktur" => "D",
2531 "sans-serif" => "S",
2532 // don't consider monospace as a typeform
2533 _ => "",
2534 },
2535 };
2536 let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2537 // debug!("captures: {:?}", caps);
2538 // debug!(" bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}",
2539 // &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]);
2540 if bold || !caps["bold"].is_empty() {"B"} else {""}.to_string()
2541 + if italic || !caps["italic"].is_empty() {"I"} else {""}
2542 + if !&caps["face"].is_empty() {&caps["face"]} else {typeface}
2543 + &caps["cap"]
2544 + &caps["greek"]
2545 + &caps["char"]
2546 });
2547 return Ok(result.to_string());
2548
2549 fn add_separator(text: String) -> String {
2550 use crate::definitions::BRAILLE_DEFINITIONS;
2551 if let Some(text_without_arc) = text.strip_prefix("arc") {
2552 // "." after arc (7.5.3)
2553 let is_function_name = BRAILLE_DEFINITIONS.with(|definitions| {
2554 let definitions = definitions.borrow();
2555 let set = definitions.get_hashset("CMUFunctionNames").unwrap();
2556 return set.contains(&text);
2557 });
2558 if is_function_name {
2559 return "arc.".to_string() + text_without_arc;
2560 }
2561 }
2562 return text;
2563 }
2564 }
2565
2566 fn get_braille_vietnam_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2567 // this is basically the same as for ueb except:
2568 // 1. we deal with switching '.' and ',' if in English style for numbers
2569 // 2. if it is identified as a Roman Numeral, we make all but the first char lower case because they shouldn't get a cap indicator
2570 // 3. double letter chemical elements should NOT be part of a cap word sequence
2571 if name(node) == "mn" {
2572 // text of element is modified by these if needed
2573 lower_case_roman_numerals(node);
2574 switch_if_english_style_number(node);
2575 }
2576 let result = BrailleChars::get_braille_ueb_chars(node, text_range)?;
2577 return Ok(result);
2578
2579 fn lower_case_roman_numerals(mn_node: Element) {
2580 if mn_node.attribute("data-roman-numeral").is_some() {
2581 // if a roman numeral, all ASCII so we can optimize
2582 let text = as_text(mn_node);
2583 let mut new_text = String::from(&text[..1]);
2584 new_text.push_str(text[1..].to_ascii_lowercase().as_str()); // works for single char too
2585 mn_node.set_text(&new_text);
2586 }
2587 }
2588 fn switch_if_english_style_number(mn_node: Element) {
2589 let text = as_text(mn_node);
2590 let dot = text.find('.');
2591 let comma = text.find(',');
2592 match (dot, comma) {
2593 (None, None) => (),
2594 (Some(dot), Some(comma)) => {
2595 if comma < dot {
2596 // switch dot/comma -- using "\x01" as a temp when switching the the two chars
2597 let switched = text.replace('.', "\x01").replace(',', ".").replace('\x01', ",");
2598 mn_node.set_text(&switched);
2599 }
2600 },
2601 (Some(dot), None) => {
2602 // If it starts with a '.', a leading 0, or if there is only one '.' and not three chars after it
2603 if dot==0 ||
2604 (dot==1 && text.starts_with('0')) ||
2605 (text[dot+1..].find('.').is_none() && text[dot+1..].len()!=3) {
2606 mn_node.set_text(&text.replace('.', ","));
2607 }
2608 },
2609 (None, Some(comma)) => {
2610 // if there is more than one ",", than it can't be a decimal separator
2611 if text[comma+1..].find(',').is_some() {
2612 mn_node.set_text(&text.replace(',', "."));
2613 }
2614 },
2615 }
2616 }
2617
2618 }
2619
2620
2621 fn is_in_enclosed_list(node: Element) -> bool {
2622 // Nemeth Rule 10 defines an enclosed list:
2623 // 1: begins and ends with fence
2624 // 2: FIX: not implemented -- must contain no word, abbreviation, ordinal or plural ending
2625 // 3: function names or signs of shape and the signs which follow them are a single item (not a word)
2626 // 4: an item of the list may be an ellipsis or any sign used for omission
2627 // 5: no relational operator may appear within the list
2628 // 6: the list must have at least 2 items.
2629 // Items are separated by commas, can not have other punctuation (except ellipsis and dash)
2630 let mut parent = get_parent(node); // safe since 'math' is always at root
2631 while name(parent) == "mrow" {
2632 if IsBracketed::is_bracketed(parent, "", "", true, false) {
2633 for child in parent.children() {
2634 if !child_meets_conditions(as_element(child)) {
2635 return false;
2636 }
2637 }
2638 return true;
2639 }
2640 parent = get_parent(parent);
2641 }
2642 return false;
2643
2644 fn child_meets_conditions(node: Element) -> bool {
2645 let name = name(node);
2646 return match name {
2647 "mi" | "mn" => true,
2648 "mo" => !crate::canonicalize::is_relational_op(node),
2649 "mtext" => {
2650 let text = as_text(node).trim();
2651 return text=="?" || text=="-?-" || text.is_empty(); // various forms of "fill in missing content" (see also Nemeth_RULEs.yaml, "omissions")
2652 },
2653 "mrow" => {
2654 if IsBracketed::is_bracketed(node, "", "", false, false) {
2655 return child_meets_conditions(as_element(node.children()[1]));
2656 } else {
2657 for child in node.children() {
2658 if !child_meets_conditions(as_element(child)) {
2659 return false;
2660 }
2661 }
2662 }
2663 true
2664 },
2665 "menclose" => {
2666 if let Some(notation) = node.attribute_value("notation") {
2667 if notation != "bottom" || notation != "box" {
2668 return false;
2669 }
2670 let child = as_element(node.children()[0]); // menclose has exactly one child
2671 return is_leaf(child) && as_text(child) == "?";
2672 }
2673 return false;
2674 },
2675 _ => {
2676 for child in node.children() {
2677 if !child_meets_conditions(as_element(child)) {
2678 return false;
2679 }
2680 }
2681 true
2682 },
2683 }
2684 }
2685 }
2686
/// Extract the `char`s from `str` within `range` (these are chars, not byte offsets).
///
/// With no range, the whole string is returned. A range that extends past the end of the
/// string is clipped to the string, and a range whose end is before its start yields an
/// empty string (rather than overflowing when computing the length).
fn substring(str: &str, text_range: &Option<Range<usize>>) -> String {
    return match text_range {
        None => str.to_string(),
        Some(range) => {
            let n_chars = range.end.saturating_sub(range.start);
            str.chars().skip(range.start).take(n_chars).collect()
        },
    }
}
2694}
2695
2696impl Function for BrailleChars {
2697 /**
2698 * Returns a string with the correct number of nesting chars (could be an empty string)
2699 * @param(node) -- current node or string
2700 * @param(char) -- char (string) that should be repeated
2701 * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
2702 */
2703 fn evaluate<'d>(&self,
2704 context: &context::Evaluation<'_, 'd>,
2705 args: Vec<Value<'d>>)
2706 -> StdResult<Value<'d>, XPathError>
2707 {
2708 use crate::canonicalize::create_mathml_element;
2709 let mut args = Args(args);
2710 if let Err(e) = args.exactly(2).or_else(|_| args.exactly(4)) {
2711 return Err( XPathError::Other(format!("BrailleChars requires 2 or 4 args: {e}")));
2712 };
2713
2714 let range = if args.len() == 4 {
2715 let end = args.pop_number()? as usize - 1; // non-inclusive at end, 0-based
2716 let start = args.pop_number()? as usize - 1; // inclusive at start, a 0-based
2717 Some(start..end)
2718 } else {
2719 None
2720 };
2721 let braille_code = args.pop_string()?;
2722 let v: Value<'_> = args.0.pop().ok_or(XPathError::ArgumentMissing)?;
2723 let node = match v {
2724 Value::Nodeset(nodes) => {
2725 validate_one_node(nodes, "BrailleChars")?.element().unwrap()
2726 },
2727 Value::Number(n) => {
2728 let new_node = create_mathml_element(&context.node.document(), "mn");
2729 new_node.set_text(&n.to_string());
2730 new_node
2731 },
2732 Value::String(s) => {
2733 let new_node = create_mathml_element(&context.node.document(), "mi"); // FIX: try to guess mi vs mo???
2734 new_node.set_text(&s);
2735 new_node
2736 },
2737 _ => {
2738 return Ok( Value::String("".to_string()) ) // not an element, so nothing to do
2739 },
2740 };
2741
2742 if !is_leaf(node) {
2743 return Err( XPathError::Other(format!("BrailleChars called on non-leaf element '{}'", mml_to_string(node))) );
2744 }
2745 return Ok( Value::String( BrailleChars::get_braille_chars(node, &braille_code, range)? ) );
2746 }
2747}
2748
2749pub struct NeedsToBeGrouped;
2750impl NeedsToBeGrouped {
2751 // ordinals often have an irregular start (e.g., "half") before becoming regular.
2752 // if the number is irregular, return the ordinal form, otherwise return 'None'.
2753 fn needs_grouping_for_cmu(element: Element, _is_base: bool) -> bool {
2754 let node_name = name(element);
2755 let children = element.children();
2756 if node_name == "mrow" {
2757 // check for bracketed exprs
2758 if IsBracketed::is_bracketed(element, "", "", false, true) {
2759 return false;
2760 }
2761
2762 // check for prefix and postfix ops at start or end (=> len()==2, prefix is first op, postfix is last op)
2763 if children.len() == 2 &&
2764 (name(as_element(children[0])) == "mo" || name(as_element(children[1])) == "mo") {
2765 return false;
2766 }
2767
2768 if children.len() != 3 { // ==3, need to check if it a linear fraction
2769 return true;
2770 }
2771 let operator = as_element(children[1]);
2772 if name(operator) != "mo" || as_text(operator) != "/" {
2773 return true;
2774 }
2775 }
2776
2777 if !(node_name == "mrow" || node_name == "mfrac") {
2778 return false;
2779 }
2780 // check for numeric fractions (regular fractions need brackets, not numeric fractions), either as an mfrac or with "/"
2781 // if the fraction starts with a "-", it is still a numeric fraction that doesn't need parens
2782 let mut numerator = as_element(children[0]);
2783 let denominator = as_element(children[children.len()-1]);
2784 let decimal_separator = crate::interface::get_preference("DecimalSeparators".to_string()).unwrap()
2785 .chars().next().unwrap_or('.');
2786 if is_integer(denominator, decimal_separator) {
2787 // check numerator being either an integer "- integer"
2788 if name(numerator) == "mrow" {
2789 let numerator_children = numerator.children();
2790 if !(numerator_children.len() == 2 &&
2791 name(as_element(numerator_children[0])) == "mo" &&
2792 as_text(as_element(numerator_children[0])) == "-") {
2793 return true;
2794 }
2795 numerator = as_element(numerator_children[1]);
2796 }
2797 return !is_integer(numerator, decimal_separator);
2798 }
2799 return true;
2800
2801 fn is_integer(mathml: Element, decimal_separator: char) -> bool {
2802 return name(mathml) == "mn" && !as_text(mathml).contains(decimal_separator)
2803 }
2804 }
2805
2806 /// FIX: what needs to be implemented?
2807 fn needs_grouping_for_finnish(mathml: Element, is_base: bool) -> bool {
2808 use crate::xpath_functions::IsInDefinition;
2809 let mut node_name = name(mathml);
2810 if mathml.attribute_value("data-roman-numeral").is_some() {
2811 node_name = "mi"; // roman numerals don't follow number rules
2812 }
2813
2814 // FIX: the leaf rules are from UEB -- check the Swedish rules
2815 match node_name {
2816 "mn" => {
2817 if !is_base {
2818 return false;
2819 } // clause 1
2820 // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204)
2821 let parent = get_parent(mathml); // there is always a "math" node
2822 let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)};
2823 if name(grandparent) != "mrow" {
2824 return false;
2825 }
2826 let preceding = parent.preceding_siblings();
2827 if preceding.len() < 2 {
2828 return false;
2829 }
2830 // any 'mn' would be separated from this node by invisible times
2831 let previous_child = as_element(preceding[preceding.len()-1]);
2832 if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" {
2833 let previous_child = as_element(preceding[preceding.len()-2]);
2834 return name(previous_child) == "mn"
2835 } else {
2836 return false;
2837 }
2838 },
2839 "mi" | "mo" | "mtext" => {
2840 let text = as_text(mathml);
2841 let parent = get_parent(mathml); // there is always a "math" node
2842 let parent_name = name(parent); // there is always a "math" node
2843 if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
2844 return false;
2845 }
2846 let mut chars = text.chars();
2847 let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty;
2848 let is_one_char = chars.next().is_none();
2849 // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
2850 return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8
2851 // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
2852 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
2853 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4
2854 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5
2855 },
2856 "mrow" => {
2857 // check for bracketed exprs
2858 if IsBracketed::is_bracketed(mathml, "", "", false, true) {
2859 return false;
2860 }
2861
2862 let parent = get_parent(mathml); // safe since 'math' is always at root
2863 if name(parent) == "mfrac" {
2864 let children = mathml.children();
2865 if mathml.preceding_siblings().is_empty() {
2866 // numerator: check for multiplication -- doesn't need grouping in numerator
2867 if children.len() >= 3 {
2868 let operator = as_element(children[1]);
2869 if name(operator) == "mo" {
2870 let ch = as_text(operator);
2871 if ch == "\u{2062}" || ch == "⋅" || ch == "×" {
2872 return false;
2873 }
2874 }
2875 }
2876 return true;
2877 } else {
2878 // denominator
2879 return true;
2880 }
2881
2882 }
2883 // check for prefix at start
2884 // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops
2885 let children = mathml.children();
2886 if children.len() == 2 &&
2887 (name(as_element(children[0])) == "mo") {
2888 return false;
2889 }
2890 return true;
2891 },
2892 _ => return false,
2893 }
2894 }
2895
2896 // ordinals often have an irregular start (e.g., "half") before becoming regular.
2897 // if the number is irregular, return the ordinal form, otherwise return 'None'.
2898 fn needs_grouping_for_swedish(mathml: Element, is_base: bool) -> bool {
2899 use crate::xpath_functions::IsInDefinition;
2900 let mut node_name = name(mathml);
2901 if mathml.attribute_value("data-roman-numeral").is_some() {
2902 node_name = "mi"; // roman numerals don't follow number rules
2903 }
2904
2905 match node_name {
2906 "mn" => return false,
2907 "mi" | "mo" | "mtext" => {
2908 let text = as_text(mathml);
2909 let parent = get_parent(mathml); // there is always a "math" node
2910 let parent_name = name(parent); // there is always a "math" node
2911 if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
2912 return false;
2913 }
2914 let mut chars = text.chars();
2915 let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty;
2916 let is_one_char = chars.next().is_none();
2917 // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
2918 return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8
2919 // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
2920 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
2921 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4
2922 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5
2923 },
2924 "mrow" => {
2925 // check for bracketed exprs
2926 if IsBracketed::is_bracketed(mathml, "", "", false, true) {
2927 return false;
2928 }
2929
2930 // check for prefix at start
2931 // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops
2932 let children = mathml.children();
2933 if children.len() == 2 &&
2934 (name(as_element(children[0])) == "mo") {
2935 return false;
2936 }
2937 return true;
2938 },
2939 "mfrac" => {
2940 // exclude simple fractions -- they are not bracketed with start/end marks
2941 let children = mathml.children();
2942 return !(NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true) ||
2943 NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true));
2944 },
2945 // At least for msup (Ex 7.7, and 7.32 and maybe more), spec seems to feel grouping is not needed.
2946 // "msub" | "msup" | "msubsup" | "munder" | "mover" | "munderover" => return true,
2947 "mtable" => return true, // Fix: should check for trivial cases that don't need grouping
2948 _ => return false,
2949 }
2950 }
2951
2952 /// Returns true if the element needs grouping symbols
2953 /// Bases need extra attention because if they are a number and the item to the left is one, that needs distinguishing
2954 fn needs_grouping_for_ueb(mathml: Element, is_base: bool) -> bool {
2955 // From GTM 7.1
2956 // 1. An entire number, i.e. the initiating numeric symbol and all succeeding symbols within the numeric mode thus
2957 // established (which would include any interior decimal points, commas, separator spaces, or simple numeric fraction lines).
2958 // 2. An entire general fraction, enclosed in fraction indicators.
2959 // 3. An entire radical expression, enclosed in radical indicators.
2960 // 4. An arrow.
2961 // 5. An arbitrary shape.
2962 // 6. Any expression enclosed in matching pairs of round parentheses, square brackets or curly braces.
2963 // 7. Any expression enclosed in the braille grouping indicators. [Note: not possible here]
2964 // 8. If none of the foregoing apply, the item is simply the [this element's] individual symbol.
2965
2966 use crate::xpath_functions::IsInDefinition;
2967 let mut node_name = name(mathml);
2968 if mathml.attribute_value("data-roman-numeral").is_some() {
2969 node_name = "mi"; // roman numerals don't follow number rules
2970 }
2971 match node_name {
2972 "mn" => {
2973 if !is_base {
2974 return false;
2975 } // clause 1
2976 // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204)
2977 let parent = get_parent(mathml); // there is always a "math" node
2978 let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)};
2979 if name(grandparent) != "mrow" {
2980 return false;
2981 }
2982 let preceding = parent.preceding_siblings();
2983 if preceding.len() < 2 {
2984 return false;
2985 }
2986 // any 'mn' would be separated from this node by invisible times
2987 let previous_child = as_element(preceding[preceding.len()-1]);
2988 if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" {
2989 let previous_child = as_element(preceding[preceding.len()-2]);
2990 return name(previous_child) == "mn"
2991 } else {
2992 return false;
2993 }
2994 },
2995 "mi" | "mo" | "mtext" => {
2996 let text = as_text(mathml);
2997 let parent = get_parent(mathml); // there is always a "math" node
2998 let parent_name = name(parent); // there is always a "math" node
2999 if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
3000 return false;
3001 }
3002 let mut chars = text.chars();
3003 let first_char = chars.next().unwrap(); // canonicalization assures it isn't empty;
3004 let is_one_char = chars.next().is_none();
3005 // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
3006 return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) || // clause 8
3007 // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
3008 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
3009 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() || // clause 4
3010 IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap()); // clause 5
3011 },
3012 "mfrac" => return false, // clause 2 (test GTM 8.2(4) shows numeric fractions are not special)
3013 "msqrt" | "mroot" => return false, // clause 3
3014 // clause 6 only mentions three grouping chars, I'm a little suspicious of that, but that's what it says
3015 "mrow" => return !(IsBracketed::is_bracketed(mathml, "(", ")", false, false) ||
3016 IsBracketed::is_bracketed(mathml, "[", "]", false, false) ||
3017 IsBracketed::is_bracketed(mathml, "{", "}", false, false) ),
3018 "msub" | "msup" | "msubsup" => {
3019 // I'm a little dubious about the false value, but see GTM 7.7(2)
3020 if !is_base {
3021 return true;
3022 }
3023 // need to group nested scripts in base -- see GTM 12.2(2)
3024 let parent = get_parent(mathml); // there is always a "math" node
3025 let parent_name = name(parent); // there is always a "math" node
3026 return parent_name == "munder" || parent_name == "mover" || parent_name == "munderover";
3027 },
3028 _ => return true,
3029 }
3030
3031 }
3032}
3033
3034impl Function for NeedsToBeGrouped {
3035 // convert a node to an ordinal number
3036 fn evaluate<'d>(&self,
3037 _context: &context::Evaluation<'_, 'd>,
3038 args: Vec<Value<'d>>)
3039 -> StdResult<Value<'d>, XPathError>
3040 {
3041 let mut args = Args(args);
3042 args.exactly(3)?;
3043 let is_base = args.pop_boolean()?;
3044 let braille_code = args.pop_string()?;
3045 let node = validate_one_node(args.pop_nodeset()?, "NeedsToBeGrouped")?;
3046 if let Node::Element(e) = node {
3047 let answer = match braille_code.as_str() {
3048 "CMU" => NeedsToBeGrouped::needs_grouping_for_cmu(e, is_base),
3049 "UEB" => NeedsToBeGrouped::needs_grouping_for_ueb(e, is_base),
3050 "Finnish" => NeedsToBeGrouped::needs_grouping_for_finnish(e, is_base),
3051 "Swedish" => NeedsToBeGrouped::needs_grouping_for_swedish(e, is_base),
3052 _ => return Err(XPathError::Other(format!("NeedsToBeGrouped: braille code arg '{braille_code:?}' is not a known code ('UEB', 'CMU', or 'Swedish')"))),
3053 };
3054 return Ok( Value::Boolean( answer ) );
3055 }
3056
3057 return Err(XPathError::Other(format!("NeedsToBeGrouped: first arg '{node:?}' is not a node")));
3058 }
3059}
3060
3061
3062
#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use crate::init_logger;
    use crate::interface::*;

    /// Walks over every character position of `braille`, asks the interface which node that
    /// position belongs to, and checks the answer against `expected_ids` (indexed by position).
    /// `code_name` only appears in the assertion message.
    fn probe_every_position(code_name: &str, braille: &str, expected_ids: &[String]) -> Result<()> {
        use std::time::Instant;
        for i in 0..braille.chars().count() {
            debug!("\n=== i={} ===", i);
            let timer = Instant::now();
            let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i)?;
            N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, n.borrow())});
            debug!("Time taken: {}ms", timer.elapsed().as_millis());
            assert_eq!(expected_ids[i], id, "\n{} test ith position={}", code_name, i);
        }
        Ok(())
    }

    /// Issue 24: with "All" highlighting in UEB, the highlighted cells and the reported
    /// braille position must follow the navigation node.
    #[test]
    fn ueb_highlight_24() -> Result<()> {
        let mathml_str = "<math display='block' id='id-0'>
            <mrow id='id-1'>
              <mn id='id-2'>4</mn>
              <mo id='id-3'>⁢</mo>
              <mi id='id-4'>a</mi>
              <mo id='id-5'>⁢</mo>
              <mi id='id-6'>c</mi>
            </mrow>
        </math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(mathml_str)).unwrap();
        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();
        set_preference(String::from("BrailleNavHighlight"), String::from("All")).unwrap();

        // highlight on the number
        let highlighted = get_braille(String::from("id-2"))?;
        assert_eq!("⣼⣙⠰⠁⠉", highlighted);
        set_navigation_node(String::from("id-2"), 0)?;
        assert_eq!( get_braille_position()?, (0,2));

        // highlight on the first identifier
        let highlighted = get_braille(String::from("id-4"))?;
        assert_eq!("⠼⠙⣰⣁⠉", highlighted);
        set_navigation_node(String::from("id-4"), 0)?;
        assert_eq!( get_braille_position()?, (2,4));
        Ok(())
    }

    /// Checks the braille-position -> MathML-id mapping for the quadratic formula in
    /// Nemeth, UEB, and CMU.
    // This test probably should be repeated for each braille code and be taken out of here
    #[test]
    fn find_mathml_from_braille() -> Result<()> {
        let mathml_str = "<math id='id-0'>
            <mrow data-changed='added' id='id-1'>
              <mi id='id-2'>x</mi>
              <mo id='id-3'>=</mo>
              <mfrac id='id-4'>
                <mrow id='id-5'>
                  <mrow data-changed='added' id='id-6'>
                    <mo id='id-7'>-</mo>
                    <mi id='id-8'>b</mi>
                  </mrow>
                  <mo id='id-9'>±</mo>
                  <msqrt id='id-10'>
                    <mrow data-changed='added' id='id-11'>
                      <msup id='id-12'>
                        <mi id='id-13'>b</mi>
                        <mn id='id-14'>2</mn>
                      </msup>
                      <mo id='id-15'>-</mo>
                      <mrow data-changed='added' id='id-16'>
                        <mn id='id-17'>4</mn>
                        <mo data-changed='added' id='id-18'>⁢</mo>
                        <mi id='id-19'>a</mi>
                        <mo data-changed='added' id='id-20'>⁢</mo>
                        <mi id='id-21'>c</mi>
                      </mrow>
                    </mrow>
                  </msqrt>
                </mrow>
                <mrow id='id-22'>
                  <mn id='id-23'>2</mn>
                  <mo data-changed='added' id='id-24'>⁢</mo>
                  <mi id='id-25'>a</mi>
                </mrow>
              </mfrac>
            </mrow>
        </math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(mathml_str)).unwrap();
        set_preference(String::from("BrailleNavHighlight"), String::from("Off")).unwrap();

        set_preference(String::from("BrailleCode"), String::from("Nemeth")).unwrap();
        let braille = get_braille(String::new())?;
        let expected_ids = [2, 3, 3, 3, 3, 4, 7, 8, 9, 9, 10, 13, 12, 14, 12, 15, 17, 19, 21, 10, 4, 23, 25, 4]
            .map(|num| format!("id-{}", num));
        debug!("\n*** Testing Nemeth ***");
        probe_every_position("Nemeth", &braille, &expected_ids)?;

        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();
        let braille = get_braille(String::new())?;
        let expected_ids = [0, 0, 0, 2, 3, 3, 3, 3, 4, 7, 7, 8, 9, 9, 10, 13, 12, 14, 14, 15, 15, 17, 17, 19, 19, 21, 10, 4, 4, 23, 23, 25, 25, 4, 0, 0]
            .map(|num| format!("id-{}", num));
        debug!("\n\n*** Testing UEB ***");
        probe_every_position("UEB", &braille, &expected_ids)?;

        set_preference(String::from("BrailleCode"), String::from("CMU")).unwrap();
        let braille = get_braille(String::new())?;
        let expected_ids = [2, 3, 5, 7, 8, 9, 9, 9, 10, 10, 11, 13, 12, 14, 14, 15, 17, 17, 19, 19, 21, 11, 5, 4, 22, 23, 23, 25, 25, 22]
            .map(|num| format!("id-{}", num));
        debug!("\n\n*** Testing CMU ***");
        debug!("Braille: {}", braille);
        probe_every_position("CMU", &braille, &expected_ids)?;
        Ok(())
    }

    /// The `UEB_START_MODE` preference changes whether a grade 1 indicator is emitted
    /// before the superscript of `x^n`.
    #[test]
    #[allow(non_snake_case)]
    fn test_UEB_start_mode() -> Result<()> {
        let mathml_str = "<math><msup><mi>x</mi><mi>n</mi></msup></math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(mathml_str)).unwrap();
        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();

        set_preference(String::from("UEB_START_MODE"), String::from("Grade2")).unwrap();
        let grade2_braille = get_braille(String::new())?;
        assert_eq!("⠭⠰⠔⠝", grade2_braille, "Grade2");

        set_preference(String::from("UEB_START_MODE"), String::from("Grade1")).unwrap();
        let grade1_braille = get_braille(String::new())?;
        assert_eq!("⠭⠔⠝", grade1_braille, "Grade1");
        Ok(())
    }
}