libmathcat/
braille.rs

1#![allow(clippy::needless_return)]
2use strum_macros::Display;
3use sxd_document::dom::{Element, ChildOfElement};
4use sxd_document::Package;
5use crate::definitions::SPEECH_DEFINITIONS;
6use crate::errors::*;
7use crate::pretty_print::mml_to_string;
8use crate::prefs::PreferenceManager;
9use std::cell::Ref;
10use regex::{Captures, Regex, RegexSet};
11use phf::{phf_map, phf_set};
12use crate::speech::{BRAILLE_RULES, SpeechRulesWithContext, braille_replace_chars, make_quoted_string};
13use crate::canonicalize::get_parent;
14use std::borrow::Cow;
15use std::ops::Range;
16
17static UEB_PREFIXES: phf::Set<char> = phf_set! {
18    '⠼', '⠈', '⠘', '⠸', '⠐', '⠨', '⠰', '⠠',
19};
20
/// Returns the braille *char* that starts at byte offset `index` of `braille`.
///
/// Relies on every braille cell taking exactly 3 bytes in UTF-8.
/// Panics if `index..index+3` is not a valid char-aligned range of `braille`.
fn braille_at(braille: &str, index: usize) -> char {
    let cell = &braille[index..index + 3];
    return cell.chars().next().unwrap();
}
27
28/// braille the MathML
29/// If 'nav_node_id' is not an empty string, then the element with that id will have dots 7 & 8 turned on as per the pref
30/// Returns the braille string (highlighted) along with the *character* start/end of the highlight (whole string if no highlight)
31pub fn braille_mathml(mathml: Element, nav_node_id: &str) -> Result<(String, usize, usize)> {
32    return BRAILLE_RULES.with(|rules| {
33        rules.borrow_mut().read_files()?;
34        let rules = rules.borrow();
35        let new_package = Package::new();
36        let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id);
37        let braille_string = rules_with_context.match_pattern::<String>(mathml)
38                        .chain_err(|| "Pattern match/replacement failure!")?;
39        // debug!("braille_mathml: braille string: {}", &braille_string);
40        let braille_string = braille_string.replace(' ', "");
41        let pref_manager = rules_with_context.get_rules().pref_manager.borrow();
42        let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight");
43        let braille_code = pref_manager.pref_to_string("BrailleCode");
44        let braille = match braille_code.as_str() {
45            "Nemeth" => nemeth_cleanup(pref_manager, braille_string),
46            "UEB" => ueb_cleanup(pref_manager, braille_string),
47            "Vietnam" => vietnam_cleanup(pref_manager, braille_string),
48            "CMU" => cmu_cleanup(pref_manager, braille_string), 
49            "Finnish" => finnish_cleanup(pref_manager, braille_string),
50            "Swedish" => swedish_cleanup(pref_manager, braille_string),
51            "LaTeX" => LaTeX_cleanup(pref_manager, braille_string),
52            "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string),
53            _ => braille_string.trim_matches('⠀').to_string(),    // probably needs cleanup if someone has another code, but this will have to get added by hand
54        };
55
56        return Ok(
57            if highlight_style != "Off" {
58                highlight_braille_chars(braille, &braille_code, highlight_style == "All")
59            } else {
60                let end = braille.len()/3;
61                (braille, 0, end)
62            }
63        );
64    });
65
66    /// highlight with dots 7 & 8 based on the highlight style
67    /// both the start and stop points will be extended to deal with indicators such as capitalization
68    /// if 'fill_range' is true, the interior will be highlighted
69    /// Returns the braille string (highlighted) along with the [start, end) *character* of the highlight (whole string if no highlight)
70    fn highlight_braille_chars(braille: String, braille_code: &str, fill_range: bool) -> (String, usize, usize) {
71        let mut braille = braille;
72        // some special (non-braille) chars weren't converted to having dots 7 & 8 to indicate navigation position
73        // they need to be added to the start
74
75        // find start and end (byte) indexes of the highlighted region (braille chars have length=3 bytes)
76        let start = braille.find(is_highlighted);
77        let end = braille.rfind(is_highlighted);
78        if start.is_none() {
79            assert!(end.is_none());
80            let end = braille.len();
81            return (braille, 0, end/3);
82        };
83
84        let start = start.unwrap();
85        let mut end = end.unwrap() + 3;         // always exists if start exists ('end' is exclusive)
86        // debug!("braille highlight: start/end={}/{}; braille={}", start/3, end/3, braille);
87        let mut start = highlight_first_indicator(&mut braille, braille_code, start, end);
88        if let Some(new_range) = expand_highlight(&mut braille, braille_code, start, end) {
89            (start, end) = new_range
90        }
91
92        if start == end {
93            return (braille, start/3, end/3);
94        }
95
96        if !fill_range {
97            return (braille, start/3, end/3);
98        }
99
100        let mut result = String::with_capacity(braille.len());
101        result.push_str(&braille[..start]);
102        let highlight_region =&mut braille[start..end];
103        for ch in highlight_region.chars() {
104            result.push( highlight(ch) );
105        };
106        result.push_str(&braille[end..]);
107        return (result, start/3, end/3);
108
109        /// Return the byte index of the first place to highlight
110        fn highlight_first_indicator(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> usize {
111            // chars in the braille block range use 3 bytes -- we can use that to optimize the code some
112            let first_ch = unhighlight(braille_at(braille, start_index));
113
114            // need to highlight (optional) capital/number, language, and style (max 2 chars) also in that (rev) order
115            let mut prefix_ch_index = std::cmp::max(0, start_index as isize - 5*3) as usize;
116            if prefix_ch_index == 0 && braille_code == "UEB" {
117                // don't count the word or passage mode as part of a indicator
118                if braille.starts_with("⠰⠰⠰") {
119                    prefix_ch_index = 9;
120                } else if braille.starts_with("⠰⠰") {
121                    prefix_ch_index = 6;
122                }
123            }
124            let indicators = &braille[prefix_ch_index..start_index];   // chars to be examined
125            let i_byte_start = start_index - 3 * match braille_code {
126                "Nemeth" => i_start_nemeth(indicators, first_ch),
127                _ => i_start_ueb(indicators),               // treat all the other like UEB because they probably have similar number and letter prefixes
128            };
129            if i_byte_start < start_index {
130                // remove old highlight as long as we don't wipe out the end highlight
131                if start_index < end_index {
132                    let old_first_char_bytes = start_index..start_index+3;
133                    let replacement_str = unhighlight(braille_at(braille, start_index)).to_string();
134                    braille.replace_range(old_first_char_bytes, &replacement_str);
135                }
136
137                // add new highlight
138                let new_first_char_bytes = i_byte_start..i_byte_start+3;
139                let replacement_str = highlight(braille_at(braille, i_byte_start)).to_string();
140                braille.replace_range(new_first_char_bytes, &replacement_str);
141            }
142
143            return i_byte_start;
144        }
145
146        /// Return the byte indexes of the first and last place to highlight
147        /// Currently, this only does something for CMU braille
148        fn expand_highlight(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> Option<(usize, usize)> {
149            // For CMU, we want to expand mrows to include the opening and closing grouping indicators if they exist
150            if start_index == 0 || end_index == braille.len() || braille_code != "CMU" {
151                return None;
152            }
153
154            let first_ch = unhighlight(braille_at(braille, start_index));
155            let last_ch = unhighlight(braille_at(braille, end_index-3));
156            // We need to be careful not to expand the selection if we are already on a grouping indicator
157            if first_ch == '⠢' && last_ch == '⠔'{
158                return None;
159            }
160            let preceding_ch = braille_at(braille, start_index-3);
161            if preceding_ch != '⠢' {
162                return None;
163            }
164
165            let following_ch = braille_at(braille, end_index);
166            if following_ch != '⠔' {
167                return None;
168            }
169
170            let preceding_ch = highlight(preceding_ch);
171            braille.replace_range(start_index-3..start_index+3, format!("{preceding_ch}{first_ch}").as_str());
172            let following_ch = highlight(following_ch);
173            braille.replace_range(end_index-3..end_index+3, format!("{last_ch}{following_ch}").as_str());
174            return Some( (start_index-3, end_index + 3) );
175        }
176    }
177
178    /// Given a position in a Nemeth string, what is the position character that starts it (e.g, the prev char for capital letter)
179    fn i_start_nemeth(braille_prefix: &str, first_ch: char) -> usize {
180        static NEMETH_NUMBERS: phf::Set<char> = phf_set! {
181            '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠴', '⠨' // 1, 2, ...9, 0, decimal pt
182        };
183        let mut n_chars = 0;
184        let prefix = &mut braille_prefix.chars().rev().peekable();
185        if prefix.peek() == Some(&'⠠') ||  // cap indicator
186           (prefix.peek() == Some(&'⠼') && NEMETH_NUMBERS.contains(&first_ch)) ||  // number indicator
187           [Some(&'⠸'), Some(&'⠈'), Some(&'⠨')].contains(&prefix.peek()) {         // bold, script/blackboard, italic indicator
188            n_chars += 1;
189            prefix.next();
190        } 
191
192        if [Some(&'⠰'), Some(&'⠸'), Some(&'⠨')].contains(&prefix.peek()) {   // English, German, Greek
193            n_chars += 1;
194        } else if prefix.peek() == Some(&'⠈') {  
195            let ch = prefix.next();                              // Russian/Greek Variant
196            if ch == Some('⠈') || ch == Some('⠨') {
197                n_chars += 2;
198            }
199        } else if prefix.peek() == Some(&'⠠')  { // Hebrew 
200            let ch = prefix.next();                              // Russian/Greek Variant
201            if ch == Some('⠠') {
202                n_chars += 2;
203            }
204        };
205        return n_chars;
206    }
207
208    /// Given a position in a UEB string, what is the position character that starts it (e.g, the prev char for capital letter)
209    fn i_start_ueb(braille_prefix: &str) -> usize {
210        let prefix = &mut braille_prefix.chars().rev().peekable();
211        let mut n_chars = 0;
212        while let Some(ch) = prefix.next() {
213            if UEB_PREFIXES.contains(&ch) {
214                n_chars += 1;
215            } else if ch == '⠆' {
216                let n_typeform_chars = check_for_typeform(prefix);
217                if n_typeform_chars > 0 {
218                    n_chars += n_typeform_chars;
219                } else {
220                    break;
221                }
222            } else {
223                break;
224            }
225        }
226        return n_chars;
227    }
228
229    
230    fn check_for_typeform(prefix: &mut dyn std::iter::Iterator<Item=char>) -> usize {
231        static UEB_TYPEFORM_PREFIXES: phf::Set<char> = phf_set! {
232            '⠈', '⠘', '⠸', '⠨',
233        };
234
235        if let Some(typeform_indicator) = prefix.next() {
236            if UEB_TYPEFORM_PREFIXES.contains(&typeform_indicator) {
237                return 2;
238            } else if typeform_indicator == '⠼' {
239                if let Some(user_defined_typeform_indicator) = prefix.next() {
240                    if UEB_TYPEFORM_PREFIXES.contains(&user_defined_typeform_indicator) || user_defined_typeform_indicator == '⠐' {
241                        return 3;
242                    }
243                }
244            }
245        }
246        return 0;
247    }
248}
249
// FIX: if 8-dot braille is needed, perhaps the highlights can be shifted to a "highlighted" 256 char block in private space
//   they would need to be unshifted for the external world
/// Returns true if `ch` is a braille cell with dots 7 & 8 on (U+28C0 through U+28FF *inclusive*),
/// or is the '𝑏' placeholder char.
fn is_highlighted(ch: char) -> bool {
    // The range must include U+28FF: that is what highlighting the full cell '⠿' (U+283F) produces.
    return ('\u{28C0}'..='\u{28FF}').contains(&ch) || ch == '𝑏';
}
256
/// Returns `ch` with dots 7 & 8 turned on (for a braille cell, the result is in U+28C0..=U+28FF).
fn highlight(ch: char) -> char {
    // OR-ing in 0xC0 only changes the low byte, so the result stays in the same 256-code-point block as 'ch'.
    // That block can't be a surrogate block or lie beyond U+10FFFF, so the conversion always succeeds;
    // the fallback is never taken and only exists to avoid 'unsafe'.
    return char::from_u32(ch as u32 | 0xC0).unwrap_or(ch);
}
260
/// Returns `ch` with dots 7 & 8 turned off if it is a highlighted braille cell (U+28C0 through U+28FF *inclusive*).
/// Any other char is returned unchanged.
fn unhighlight(ch: char) -> char {
    // The range must include U+28FF, otherwise a highlighted full cell ('⠿' -> U+28FF) is never unhighlighted.
    if ('\u{28C0}'..='\u{28FF}').contains(&ch) {
        // Clearing dots 7 & 8 gives a value in U+2800..=U+283F, which is always a valid char.
        return char::from_u32(ch as u32 & !0xC0).unwrap_or(ch);
    }
    return ch;
}
269
270use std::cell::RefCell;
thread_local! {
    /// Number of `find_navigation_node` probes made during a braille-position lookup (for debugging).
    /// It is reset to 0 when a lookup starts and incremented on every probe, which gives a sense of
    /// how well the child-guessing algorithm is working.
    static N_PROBES: RefCell<usize> = const { RefCell::new(0) };
}
275
276
277/// Given a 0-based braille position, return the id of the smallest MathML node enclosing it.
278/// This node might be a leaf with an offset.
279pub fn get_navigation_node_from_braille_position(mathml: Element, position: usize) -> Result<(String, usize)> {
280    // This works via a "smart" binary search (the trees aren't binary or balanced, we estimate the child to look in):
281    //   braille the mathml with a nav node and see where 'position' is in relation to the start/end of the nav node
282    // Each call to find_navigation_node() returns a search state that tell us where to look next if not found
283    #[derive(Debug, Display)]
284    enum SearchStatus {
285        LookInParent,       // look up a level for exact match
286        LookLeft,           // went too far, backup
287        LookRight,          // continue searching right
288        Found,
289    }
290
291    struct SearchState<'e> {
292        status: SearchStatus,
293        node: Element<'e>,
294        highlight_start: usize,     // if status is Found, then this is the offset within a leaf node
295        highlight_end: usize,       // if status is Found, this is ignored
296    }
297
298    // save the current highlight state, set the state to be the end points so we can find the braille, then restore the state
299    // FIX: this can fail if there is 8-dot braille
300    use crate::interface::{get_preference, set_preference};
301    let saved_highlight_style = get_preference("BrailleNavHighlight".to_string()).unwrap();
302    set_preference("BrailleNavHighlight".to_string(), "EndPoints".to_string()).unwrap();
303
304    N_PROBES.with(|n| {*n.borrow_mut() = 0});
305    // dive into the child of the <math> element (should only be one)
306    let search_state = find_navigation_node(mathml, as_element(mathml.children()[0]), position)?;
307    set_preference("BrailleNavHighlight".to_string(), saved_highlight_style.to_string()).unwrap();
308
309    // we know the attr value exists because it was found internally
310    // FIX: what should be done if we never did the search?
311    match search_state.status {
312        SearchStatus::Found | SearchStatus::LookInParent => {
313            return Ok( (search_state.node.attribute_value("id").unwrap().to_string(), search_state.highlight_start) )
314        },
315        _ => {
316            // weird state -- return the entire expr
317            match mathml.attribute_value("id") {
318                None => bail!("'id' is not present on mathml: {}", mml_to_string(mathml)),
319                Some(id) => return Ok( (id.to_string(), 0) ),
320            }
321        }
322    } 
323
324    /// find the navigation node that most tightly encapsulates the target position (0-based)
325    /// 'node' is the current node we are on inside of 'mathml'
326    fn find_navigation_node<'e>(mathml: Element<'e>, node: Element<'e>, target_position: usize) -> Result<SearchState<'e>> {
327        let node_id = match node.attribute_value("id") {
328            Some(id) => id,
329            None => bail!("'id' is not present on mathml: {}", mml_to_string(node)),
330        };
331        N_PROBES.with(|n| {*n.borrow_mut() += 1});
332        let (braille, char_start, char_end) = braille_mathml(mathml, node_id)?;
333        let mut status = None;
334        // debug!("find_navigation_node ({}, id={}): highlight=[{}, {});  target={}", name(node), node_id, char_start, char_end, target_position);
335        if is_leaf(node) {
336            if char_start == 0 && char_end == braille.len()/3 {
337                // nothing highlighted -- probably invisible char not represented in braille -- continue looking to the right
338                // debug!("  return due invisible char (?)' ");
339                status = Some(SearchStatus::LookRight);
340            } else if char_start <= target_position && target_position < char_end {
341                // FIX: need to handle multi-char leaves and set the offset (char_start) appropriately
342                // debug!("  return due to target_position inside leaf: {} <= {} < {}", char_start, target_position, char_end);
343                return Ok( SearchState {
344                    status: SearchStatus::Found,
345                    node,
346                    highlight_start: target_position - char_start,
347                    highlight_end: 0,
348                });
349            } else if name(node) == "mo" {
350                // if there is whitespace before or after the operator, consider the operator to be a match
351                if (char_start > 0 && target_position == char_start - 1 && 
352                    braille_at(&braille, 3*(char_start - 1)) == '⠀' && is_operator_that_adds_whitespace(node)) ||
353                   (3*char_end < braille.len() && target_position == char_end &&
354                    braille_at(&braille, 3*char_end) == '⠀' && is_operator_that_adds_whitespace(node)) {
355                    return Ok( SearchState {
356                        status: SearchStatus::Found,
357                        node,
358                        highlight_start: 0,
359                        highlight_end: 0,
360                    } );
361                }
362            }
363        }
364        if status.is_none() {
365            if target_position < char_start {
366                // debug!("  return due to target_position {} < start {}", target_position, char_start);
367                status = Some(SearchStatus::LookLeft);
368            } else if target_position >= char_end {
369                // debug!("  return due to target_position {} >= end {}", target_position, char_end);
370                status = Some(SearchStatus::LookRight);
371            }
372        }
373        if let Some(status) = status {
374            return Ok( SearchState {
375                status,
376                node,
377                highlight_start: char_start,
378                highlight_end: char_end,
379            } );
380        }
381
382        let children = node.children();
383        let mut i_left_child = 0;                         // inclusive
384        let mut i_right_child = children.len();           // exclusive
385        let mut call_start = char_start;
386        let mut guess_fn: Box<dyn Fn(usize, usize, usize, usize) -> usize> = Box::new(|i_left, i_right, start, target: usize| guess_child_node_ltr(&children, i_left, i_right, start, target));
387        while i_left_child < i_right_child {
388            let i_guess_child = guess_fn(i_left_child, i_right_child, call_start, target_position);
389            let status = find_navigation_node(mathml, as_element(children[i_guess_child]), target_position)?;
390            // debug!("  in {} loop: status: {}, child: left/guess/right {}/({},{})/{}; highlight=[{}, {})", 
391            //         name(node), status.status,
392            //         i_left_child, i_guess_child, name(as_element(children[i_guess_child])),i_right_child,
393            //         status.highlight_start, status.highlight_end);
394            match status.status {
395                SearchStatus::Found => {
396                    return Ok(status);
397                },
398                SearchStatus::LookInParent => {
399                    let (_, start, end) = braille_mathml(mathml, node_id)?;
400                    // debug!("  parent ({}) braille: start/end={}/{};  target_position={}", name(node), start, end, target_position);
401                    if start <= target_position && target_position < end {
402                        // debug!("  ..found: id={}", node_id);
403                        return Ok( SearchState{
404                            status: SearchStatus::Found,
405                            node,
406                            highlight_start: 0,
407                            highlight_end: 0,
408                        } );      // done or look up another level
409                    }
410                    return Ok(status);  // look up a level
411                },
412                SearchStatus::LookLeft => {
413                    i_right_child = if i_guess_child == 0 {0} else {i_guess_child};         // exclusive
414                    call_start = status.highlight_start-1;
415                    guess_fn = Box::new(|i_left, i_right, start, target| guess_child_node_rtl(&children, i_left, i_right, start, target));
416                },
417                SearchStatus::LookRight => {
418                    i_left_child = i_guess_child+1;
419                    call_start = status.highlight_end+1;
420                    guess_fn = Box::new(|i_left, i_right, start, target| guess_child_node_ltr(&children, i_left, i_right, start, target));
421                },
422            }
423        }
424        // debug!("Didn't child in node {}: left/right={}/{};  target_position={}", name(node), i_left_child, i_right_child, target_position);
425
426        // if we get here, we didn't find it in the children
427        // debug!("..end of loop: look in parent of {} has start/end={}/{}", name(node), char_start, char_end);
428        return Ok( SearchState{
429            status: if char_start <= target_position && target_position <= char_end {SearchStatus::Found} else {SearchStatus::LookInParent},
430            node,
431            highlight_start: 0,
432            highlight_end: 0,
433        } );
434    }
435
436    fn is_operator_that_adds_whitespace(node: Element) -> bool {
437        use crate::definitions::BRAILLE_DEFINITIONS;
438        if PreferenceManager::get().borrow().pref_to_string("UseSpacesAroundAllOperators") == "true" {
439            return true;
440        } 
441
442        return BRAILLE_DEFINITIONS.with(|definitions| {
443            let definitions = definitions.borrow();
444            let comparison_operators = definitions.get_hashset("ComparisonOperators").unwrap();
445            return comparison_operators.contains(as_text(node));
446        });        
447    }
448
449    /// look in children[i_left..i_right] for a count that exceeds target
450    fn guess_child_node_ltr(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize {
451        let mut estimated_position = start;
452        // number of chars to add for number indicators
453        let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" {0} else {1};   // Nemeth doesn't typically need number or letter indicators
454        #[allow(clippy::needless_range_loop)]  // I don't like enumerate/take/skip here
455        for i in i_left..i_right {
456            estimated_position += estimate_braille_chars(children[i], n_number_indicator);
457            if estimated_position >= target {
458                return i;
459            }
460        }
461        return i_right-1;       // estimate was too large, return the last child as a guess
462    }
463
464    /// look in children[i_left..i_right].rev for a count that is less than target
465    fn guess_child_node_rtl(children: &[ChildOfElement], i_left: usize, i_right: usize, start: usize, target: usize) -> usize {
466        let mut estimated_position = start;
467        let n_number_indicator = if PreferenceManager::get().borrow().pref_to_string("BrailleCode") == "Nemeth" {0} else {1};   // Nemeth doesn't typically need number or letter indicators
468        for i in (i_left..i_right).rev() {
469            estimated_position -= estimate_braille_chars(children[i], n_number_indicator);
470            if estimated_position <= target {
471                return i;
472            }
473        }
474        return i_left;       // estimate was too small, return the first child as a guess
475    }
476
477    fn estimate_braille_chars(child: ChildOfElement, n_number_indicator: usize) -> usize {
478        let node = as_element(child);
479        let leaf_name = name(node);
480        if is_leaf(node) {
481            let text = as_text(node);
482            // len() is close since mn's probably have ASCII digits and lower case vars are common (count as) and other chars need extra braille chars
483            // don't want to count invisible chars since they don't display and would give a length = 3
484            if text == "\u{2061}" || text == "\u{2062}"  {       // invisible function apply/times (most common by far)
485                return 0;
486            }
487            // FIX: this assumption is bad for 8-dot braille
488            return match leaf_name {
489                "mn" => n_number_indicator + text.len(),
490                "mo" => 2,  // could do better by actually brailling char, but that is more expensive
491                _ => text.len(),
492            }
493        }
494        let mut estimate = if leaf_name == "mrow" {0} else {node.children().len() + 1};     // guess extra chars need for mfrac, msub, etc (start+intermediate+end).
495        if leaf_name == "msup" || leaf_name == "msub" || leaf_name == "msubsup" {
496            estimate -= 1;   // opening superscript/subscript indicator not needed
497        }
498        for child in node.children() {
499            estimate += estimate_braille_chars(child, n_number_indicator);
500        }
501        // debug!("estimate_braille_chars for {}: {}", crate::canonicalize::element_summary(as_element(child)), estimate);
502        return estimate;
503    }
504}
505
506fn nemeth_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
507    // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
508    // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
509    // Indicators: C: capital, N: number, P: punctuation, M: multipurpose
510    // Others:
511    //      W -- whitespace that should be kept (e.g, in a numeral)
512    //      𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly 
513    // SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet)
514    // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
515    static NEMETH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
516        "S" => "⠠⠨",    // sans-serif
517        "B" => "⠸",     // bold
518        "𝔹" => "⠨",     // blackboard
519        "T" => "⠈",     // script
520        "I" => "⠨",     // italic (mapped to be the same a blackboard)
521        "R" => "",      // roman
522        "E" => "⠰",     // English
523        "D" => "⠸",     // German (Deutsche)
524        "G" => "⠨",     // Greek
525        "V" => "⠨⠈",    // Greek Variants
526        "H" => "⠠⠠",    // Hebrew
527        "U" => "⠈⠈",    // Russian
528        "C" => "⠠",     // capital
529        "P" => "⠸",     // punctuation
530        "𝐏" => "⠸",     // hack for punctuation after a roman numeral -- never removed
531        "L" => "",      // letter
532        "l" => "",      // letter inside enclosed list
533        "M" => "",      // multipurpose indicator
534        "m" => "⠐",     // required multipurpose indicator
535        "N" => "",      // potential number indicator before digit
536        "n" => "⠼",     // required number indicator before digit
537        "𝑁" => "",      // hack for special case of a lone decimal pt -- not considered a number but follows rules mostly
538        "W" => "⠀",     // whitespace
539        "w" => "⠀",     // whitespace from comparison operator
540        "," => "⠠⠀",    // comma
541        "b" => "⠐",     // baseline
542        "𝑏" => "⣐",     // highlight baseline (it's a hack)
543        "↑" => "⠘",     // superscript
544        "↓" => "⠰",     // subscript
545    };
546
547    lazy_static! {
548        // Add an English Letter indicator. This involves finding "single letters".
549        // The green book has a complicated set of cases, but the Nemeth UEB Rule book (May 2020), 4.10 has a much shorter explanation:
550        //   punctuation or whitespace on the left and right ignoring open/close chars
551        //   https://nfb.org/sites/www.nfb.org/files/files-pdf/braille-certification/lesson-4--provisional-5-9-20.pdf
552        static ref ADD_ENGLISH_LETTER_INDICATOR: Regex = 
553            Regex::new(r"(?P<start>^|W|P.[\u2800-\u28FF]?|,)(?P<open>[\u2800-\u28FF]?⠷)?(?P<letter>C?L.)(?P<close>[\u2800-\u28FF]?⠾)?(?P<end>W|P|,|$)").unwrap();
554        
555        // Trim braille spaces before and after braille indicators
556        // In order: fraction, /, cancellation, letter, baseline
557        // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
558        static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = 
559            Regex::new(r"(⠄⠄⠄|⠤⠤⠤⠤)[Ww]+([⠼⠸⠪])").unwrap();
560        static ref REMOVE_SPACE_AFTER_BRAILLE_INDICATORS: Regex = 
561            Regex::new(r"([⠹⠻Llb])[Ww]+(⠄⠄⠄|⠤⠤⠤⠤)").unwrap();
562
563        // Hack to convert non-numeric '.' to numeric '.'
564        // The problem is that the numbers are hidden inside of mover -- this might be more general than rule 99_2.
565        static ref DOTS_99_A_2: Regex = Regex::new(r"𝑁⠨mN").unwrap();
566
567        // Punctuation is one or two chars. There are (currently) only 3 2-char punct chars (—‘’) -- we explicitly list them below
568        static ref REMOVE_SPACE_BEFORE_PUNCTUATION_151: Regex = 
569            Regex::new(r"w(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠾)").unwrap();
570        static ref REMOVE_SPACE_AFTER_PUNCTUATION_151: Regex = 
571            Regex::new(r"(P.[⠤⠦⠠]?|[\u2800-\u28FF]?⠷)w").unwrap();
572
573        // Multipurpose indicator insertion
574        // 149 -- consecutive comparison operators have no space -- instead a multipurpose indicator is used (doesn't require a regex)
575
576        // 177.2 -- add after a letter and before a digit (or decimal pt) -- digits will start with N
577        static ref MULTI_177_2: Regex = 
578            Regex::new(r"([Ll].)[N𝑁]").unwrap();
579
580        // keep between numeric subscript and digit ('M' added by subscript rule)
581        static ref MULTI_177_3: Regex = 
582            Regex::new(r"([N𝑁].)M([N𝑁].)").unwrap();
583
584        // Add after decimal pt for non-digits except for comma and punctuation
585        // Note: since "." can be in the middle of a number, there is not necessarily a "N"
586        // Although not mentioned in 177_5, don't add an 'M' before an 'm'
587        static ref MULTI_177_5: Regex = 
588            Regex::new(r"([N𝑁]⠨)([^⠂⠆⠒⠲⠢⠖⠶⠦⠔N𝑁,Pm])").unwrap();
589
590
591        // Pattern for rule II.9a (add numeric indicator at start of line or after a space)
592        // 1. start of line
593        // 2. optional minus sign (⠤)
594        // 3. optional typeface indicator
595        // 4. number (N)
596        static ref NUM_IND_9A: Regex = 
597            Regex::new(r"(?P<start>^|[,Ww])(?P<minus>⠤?)N").unwrap();
598
599        // Needed after section mark(§), paragraph mark(¶), #, or *
600        static ref NUM_IND_9C: Regex = 
601            Regex::new(r"(⠤?)(⠠⠷|⠠⠳|⠠⠈⠷)N").unwrap();
602
603        // Needed after section mark(§), paragraph mark(¶), #, or *
604        static ref NUM_IND_9D: Regex = 
605            Regex::new(r"(⠈⠠⠎|⠈⠠⠏|⠨⠼|⠈⠼)N").unwrap();
606
607        // Needed after a typeface change or interior shape modifier indicator
608        static ref NUM_IND_9E: Regex = Regex::new(r"(?P<face>[SB𝔹TIR]+?)N").unwrap();
609        static ref NUM_IND_9E_SHAPE: Regex = Regex::new(r"(?P<mod>⠸⠫)N").unwrap();
610
611        // Needed after hyphen that follows a word, abbreviation, or punctuation (caution about rule 11d)
612        // Note -- hyphen might encode as either "P⠤" or "⠤" depending on the tag used
613        static ref NUM_IND_9F: Regex = Regex::new(r"([Ll].[Ll].|P.)(P?⠤)N").unwrap();
614
615        // Enclosed list exception
616        // Normally we don't add numeric indicators in enclosed lists (done in get_braille_nemeth_chars).
617        // The green book says "at the start" of an item, don't add the numeric indicator.
618        // The NFB list exceptions after function abbreviations and angles, but what this really means is "after a space"
619        static ref NUM_IND_ENCLOSED_LIST: Regex = Regex::new(r"w([⠂⠆⠒⠲⠢⠖⠶⠦⠔⠴])").unwrap();
620
621        // Punctuation chars (Rule 38.6 says don't use before ",", "hyphen", "-", "…")
622        // Never use punctuation indicator before these (38-6)
623        //      "…": "⠀⠄⠄⠄"
624        //      "-": "⠸⠤" (hyphen and dash)
625        //      ",": "⠠⠀"     -- spacing already added
626        // Rule II.9b (add numeric indicator after punctuation [optional minus[optional .][digit]
627        //  because this is run after the above rule, some cases are already caught, so don't
628        //  match if there is already a numeric indicator
629        static ref NUM_IND_9B: Regex = Regex::new(r"(?P<punct>P..?)(?P<minus>⠤?)N").unwrap();
630
631        // Before 79b (punctuation)
632        static ref REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT: Regex = Regex::new(r"(?:[↑↓]+[b𝑏]?|[b𝑏])([Ww,P]|$)").unwrap();
633
634        // Most commas have a space after them, but not when followed by a close quote (others?)
635        static ref NO_SPACE_AFTER_COMMA: Regex = Regex::new(r",P⠴").unwrap();      // captures both single and double close quote
636        static ref REMOVE_LEVEL_IND_BEFORE_BASELINE: Regex = Regex::new(r"(?:[↑↓mb𝑏]+)([b𝑏])").unwrap();
637
638        // Except for the four chars above, the unicode rules always include a punctuation indicator.
639        // The cases to remove them (that seem relevant to MathML) are:
640        //   Beginning of line or after a space (V 38.1)
641        //   After a word (38.4)
642        //   2nd or subsequent punctuation (includes, "-", etc) (38.7)
643        static ref REMOVE_AFTER_PUNCT_IND: Regex = Regex::new(r"(^|[Ww]|[Ll].[Ll].)P(.)").unwrap();
644        static ref REPLACE_INDICATORS: Regex =Regex::new(r"([SB𝔹TIREDGVHUP𝐏CLlMmb𝑏↑↓Nn𝑁Ww,])").unwrap();
645        static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
646    }
647
648//   debug!("Before:  \"{}\"", raw_braille);
649    // replacements might overlap at boundaries (e.g., whitespace) -- need to repeat
650    let mut start = 0;
651    let mut result = String::with_capacity(raw_braille.len()+ raw_braille.len()/4);  // likely upper bound
652    while let Some(matched) = ADD_ENGLISH_LETTER_INDICATOR.find_at(&raw_braille, start) {
653        result.push_str(&raw_braille[start..matched.start()]);
654        let replacement = ADD_ENGLISH_LETTER_INDICATOR.replace(
655                &raw_braille[matched.start()..matched.end()], "${start}${open}E${letter}${close}");
656        // debug!("matched='{}', start/end={}/{}; replacement: {}", &raw_braille[matched.start()..matched.end()], matched.start(), matched.end(), replacement);
657        result.push_str(&replacement);
658        // put $end back on because needed for next match (e.g., whitespace at end and then start of next match)
659        // but it could also match because it was at the end, in which case "-1" is wrong -- tested after loop for that
660        start = matched.end() - 1;
661    }
662    if !raw_braille.is_empty() && ( start < raw_braille.len()-1 || "WP,".contains(raw_braille.chars().nth_back(0).unwrap()) ) {       // see comment about $end above
663        result.push_str(&raw_braille[start..]);
664    }
665//   debug!("ELIs:    \"{}\"", result);
666
667    let result = NUM_IND_ENCLOSED_LIST.replace_all(&result, "wn${1}");
668
669    // Remove blanks before and after braille indicators
670    let result = REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS.replace_all(&result, "$1$2");
671    let result = REMOVE_SPACE_AFTER_BRAILLE_INDICATORS.replace_all(&result, "$1$2");
672
673    let result = REMOVE_SPACE_BEFORE_PUNCTUATION_151.replace_all(&result, "$1");
674    let result = REMOVE_SPACE_AFTER_PUNCTUATION_151.replace_all(&result, "$1");
675//   debug!("spaces:  \"{}\"", result);
676
677    let result = DOTS_99_A_2.replace_all(&result, "N⠨mN");
678
679    // Multipurpose indicator
680    let result = result.replace("ww", "m"); // 149
681    let result = MULTI_177_2.replace_all(&result, "${1}m${2}");
682    let result = MULTI_177_3.replace_all(&result, "${1}m$2");
683    let result = MULTI_177_5.replace_all(&result, "${1}m$2");
684//   debug!("MULTI:   \"{}\"", result);
685
686    let result = NUM_IND_9A.replace_all(&result, "${start}${minus}n");
687    // debug!("IND_9A:  \"{}\"", result);
688    let result = NUM_IND_9C.replace_all(&result, "${1}${2}n");
689    let result = NUM_IND_9D.replace_all(&result, "${1}n");
690    let result = NUM_IND_9E.replace_all(&result, "${face}n");
691    let result = NUM_IND_9E_SHAPE.replace_all(&result, "${mod}n");
692    let result = NUM_IND_9F.replace_all(&result, "${1}${2}n");
693
694//   debug!("IND_9F:  \"{}\"", result);
695
696    // 9b: insert after punctuation (optional minus sign)
697    // common punctuation adds a space, so 9a handled it. Here we deal with other "punctuation" 
698    // FIX other punctuation and reference symbols (9d)
699    let result = NUM_IND_9B.replace_all(&result, "$punct${minus}n");
700//   debug!("A PUNCT: \"{}\"", &result);
701
702    // strip level indicators
703    // check first to remove level indicators before baseline, then potentially remove the baseline
704    let mut result = REMOVE_LEVEL_IND_BEFORE_BASELINE.replace_all(&result, "$1");
705//   debug!("Punct  : \"{}\"", &result);
    // checks for a punctuation char, so this needs to run before punctuation is stripped.
707    // if '𝑏' is removed, then the highlight needs to be shifted to the left in some cases
708    let result = remove_baseline_before_space_or_punctuation(&mut result);
709//   debug!("Removed: \"{}\"", &result);
710
711    let result = NO_SPACE_AFTER_COMMA.replace_all(&result, "⠠P⠴");
712
713    let result = REMOVE_AFTER_PUNCT_IND.replace_all(&result, "$1$2");
714//   debug!("Punct38: \"{}\"", &result);
715
716    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
717    let sans_serif = pref_manager.pref_to_string("Nemeth_SansSerif");
718    let bold = pref_manager.pref_to_string("Nemeth_Bold");
719    let double_struck = pref_manager.pref_to_string("Nemeth_DoubleStruck");
720    let script = pref_manager.pref_to_string("Nemeth_Script");
721    let italic = pref_manager.pref_to_string("Nemeth_Italic");
722
723    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
724        let matched_char = &cap[0];
725        match matched_char {
726            "S" => &sans_serif,
727            "B" => &bold,
728            "𝔹" => &double_struck,
729            "T" => &script,
730            "I" => &italic,
731            _ => match NEMETH_INDICATOR_REPLACEMENTS.get(&cap[0]) {
732                None => {error!("REPLACE_INDICATORS and NEMETH_INDICATOR_REPLACEMENTS are not in sync"); ""},
733                Some(&ch) => ch,
734            }
735        }
736    });
737
738    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
739    let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
740    let result = COLLAPSE_SPACES.replace_all(result, "⠀");
741   
742    return result.to_string();
743
744    fn remove_baseline_before_space_or_punctuation<'a>(braille: &'a mut Cow<'a, str>) -> Cow<'a, str> {
745        // If the baseline highlight is at the end of the string and it is going to be deleted by the regex,
746        //   then we need to shift the highlight to the left if what is to it's left is not whitespace (which should never be a highlight end)
747        // This only happens when BrailleNavHighlight == "EndPoints".
748        let highlight_style = PreferenceManager::get().borrow().pref_to_string("BrailleNavHighlight");
749        if highlight_style == "EndPoints" {
750            if let Some(last_highlighted) = braille.rfind(is_highlighted) {
751                if braille[last_highlighted..].starts_with('𝑏') {
752                    let i_after_baseline = last_highlighted + '𝑏'.len_utf8();
753                    if i_after_baseline == braille.len() || braille[i_after_baseline..].starts_with(['W', 'w', ',', 'P']) {
754                        // shift the highlight to the left after doing just the replacement (if any) that the regex below does
755                        // the shift runs until a non blank braille char is found
756                        let mut bytes_deleted = 0;
757                        let mut char_to_highlight = "".to_string();   // illegal value
758                        for ch in braille[..last_highlighted].chars().rev() {
759                            bytes_deleted += ch.len_utf8();
760                            if (0x2801..0x28FF).contains(&(ch as u32)) {
761                                char_to_highlight = highlight(ch).to_string();
762                                break;
763                            }
764                        }
765                        braille.to_mut().replace_range(last_highlighted-bytes_deleted..last_highlighted+'𝑏'.len_utf8(),
766                                                        &char_to_highlight);
767                    }
768                }
769            }
770        }
771        return REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT.replace_all(braille, "$1");
772
773    }
774}
775
// Maps the single-char markers left in the raw UEB string to the braille that replaces them.
// `ueb_cleanup` looks up every char matched by `REPLACE_INDICATORS` in this table, except that
// "𝔹", "S", "D" and "V" are taken from user preferences there, so their values here are never used.
//
// Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
// Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
// Indicators: C: capital, N: number, P: punctuation, M: multipurpose
// Others:
//      W -- whitespace that should be kept (e.g, in a numeral)
//      𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly 
// Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
static UEB_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    "S" => "XXX",    // sans-serif -- from prefs
    "B" => "⠘",     // bold
    "𝔹" => "XXX",     // blackboard -- from prefs
    "T" => "⠈",     // script
    "I" => "⠨",     // italic
    "R" => "",      // roman
    // "E" => "⠰",     // English
    "1" => "⠰",      // Grade 1 symbol
    "𝟙" => "⠰⠰",     // Grade 1 word
    "L" => "",       // Letter left in to assist in locating letters
    "D" => "XXX",    // German (Deutsche) -- from prefs
    "G" => "⠨",      // Greek
    "V" => "⠨⠈",     // Greek Variants
    // "H" => "⠠⠠",  // Hebrew
    // "U" => "⠈⠈",  // Russian
    "C" => "⠠",      // capital
    "𝐶" => "⠠",      // capital that never should get word indicator (from chemical element)
    "N" => "⠼",     // number indicator
    "t" => "⠱",     // shape terminator
    "W" => "⠀",     // whitespace
    "𝐖"=> "⠀",     // whitespace (hard break -- basically, it separates exprs)
    "s" => "⠆",     // typeface single char indicator
    "w" => "⠂",     // typeface word indicator
    "e" => "⠄",     // typeface & capital terminator 
    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    "c" => "",       // flag that what follows is a close indicator (used for standing alone rule)
    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "⠂",     // comma
    "." => "⠲",     // period
    "-" => "-",     // hyphen
    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "",      // signals end of script
    // '(', '{', '[', '"', '\'', '“', '‘', '«',    // opening chars
    // ')', '}', ']', '\"', '\'', '”', '’', '»',           // closing chars
    // ',', ';', ':', '.', '…', '!', '?'                    // punctuation           

};
822
823// static LETTERS: phf::Set<char> = phf_set! {
824//     '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', 
825//     '⠝', '⠕', '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵',
826// };
827
// Braille cells for the letters a-j.
// `is_forced_grade1` uses this set both to test whether the letter after a '1' is one of a-j
// and to scan back over the cells of a preceding number.
static LETTER_NUMBERS: phf::Set<char> = phf_set! {
    '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚',
};
831
// Letter sequences that `is_short_form` recognizes.
// Each entry is written the way it appears in the raw braille string: every braille cell is preceded by the 'L' (letter) marker.
// NOTE(review): presumably these are the UEB shortform words -- confirm against the UEB rulebook before adding entries.
static SHORT_FORMS: phf::Set<&str> = phf_set! {
    "L⠁L⠃", "L⠁L⠃L⠧", "L⠁L⠉", "L⠁L⠉L⠗", "L⠁L⠋",
    "L⠁L⠋L⠝", "L⠁L⠋L⠺", "L⠁L⠛", "L⠁L⠛L⠌", "L⠁L⠇",
     "L⠁L⠇L⠍", "L⠁L⠇L⠗", "L⠁L⠇L⠞", "L⠁L⠇L⠹", "L⠁L⠇L⠺",
     "L⠃L⠇", "L⠃L⠗L⠇", "L⠉L⠙", "L⠙L⠉L⠇", "L⠙L⠉L⠇L⠛",
     "L⠙L⠉L⠧", "L⠙L⠉L⠧L⠛", "L⠑L⠊", "L⠋L⠗", "L⠋L⠌", "L⠛L⠙",
     "L⠛L⠗L⠞", "L⠓L⠍", "L⠓L⠍L⠋", "L⠓L⠻L⠋", "L⠊L⠍L⠍", "L⠇L⠇", "L⠇L⠗",
     "L⠍L⠽L⠋", "L⠍L⠡", "L⠍L⠌", "L⠝L⠑L⠉", "L⠝L⠑L⠊", "L⠏L⠙",
     "L⠏L⠻L⠉L⠧", "L⠏L⠻L⠉L⠧L⠛", "L⠏L⠻L⠓", "L⠟L⠅", "L⠗L⠉L⠧",
     "L⠗L⠉L⠧L⠛", "L⠗L⠚L⠉", "L⠗L⠚L⠉L⠛", "L⠎L⠙", "L⠎L⠡", "L⠞L⠙",
     "L⠞L⠛L⠗", "L⠞L⠍", "L⠞L⠝", "L⠭L⠋", "L⠭L⠎", "L⠽L⠗", "L⠽L⠗L⠋",
     "L⠽L⠗L⠧L⠎", "L⠮L⠍L⠧L⠎", "L⠡L⠝", "L⠩L⠙", "L⠹L⠽L⠋", "L⠳L⠗L⠧L⠎",
     "L⠺L⠙", "L⠆L⠉", "L⠆L⠋", "L⠆L⠓", "L⠆L⠇", "L⠆L⠝", "L⠆L⠎", "L⠆L⠞",
     "L⠆L⠽", "L⠒L⠉L⠧", "L⠒L⠉L⠧L⠛", "L⠐L⠕L⠋"
};
847
// Marker chars that can sit in front of a letter: the typeface markers (B, I, 𝔹, S, T, D)
// and the capital markers (C, 𝐶, 𝑐).
// NOTE(review): the code that consults this set is outside the part of the file reviewed here -- confirm usage there.
static LETTER_PREFIXES: phf::Set<char> = phf_set! {
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', '𝑐',
};
851
lazy_static! {
    // Trim braille spaces before and after braille indicators
    // In order: fraction, /, cancellation, letter, baseline
    // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
    // static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = 
    //     Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap();

    // Matches each single marker char that `ueb_cleanup` replaces (from prefs or from `UEB_INDICATOR_REPLACEMENTS`).
    // NOTE(review): inside the character class, `.-—` is a *range* (U+002E ..= U+2014), not the three chars '.', '-', '—'.
    //   Consequences visible from this file:
    //     * the "t" (shape terminator) key of `UEB_INDICATOR_REPLACEMENTS` is matched only because 't' falls in that range;
    //     * '-' (U+002D) lies just below the range, so it is never matched and the "-" map entry is never consulted;
    //     * any other char in the range (digits, other ASCII letters, ...) is matched, hits the
    //       "not in sync" error branch in `ueb_cleanup`, and is replaced by "".
    //   If the range is ever narrowed to literals (e.g., by escaping the '-'), 't' must be added explicitly.
    static ref REPLACE_INDICATORS: Regex =Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb])").unwrap();

    // Matches a run of two or more braille blanks (U+2800)
    static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
}
861
862fn is_short_form(chars: &[char]) -> bool {
863    let chars_as_string = chars.iter().map(|ch| ch.to_string()).collect::<String>();
864    return SHORT_FORMS.contains(&chars_as_string);
865}
866
/// Turns the raw UEB string (braille cells mixed with single-char indicator markers) into the final braille string.
///
/// Steps, in order:
/// 1. `typeface_to_word_mode` and `capitals_to_word_mode` rewrite the typeface and capital markers.
/// 2. The string is split at the hard break '𝐖'; `pick_start_mode` is run on each piece and the pieces are rejoined with 'W'.
/// 3. A shape terminator ('t') directly before whitespace ('W') is dropped.
/// 4. Every char matched by `REPLACE_INDICATORS` is replaced: "𝔹", "S", "D", "V" come from the
///    "UEB_DoubleStruck", "UEB_SansSerif", "UEB_Fraktur", "UEB_GreekVariant" preferences, everything else from
///    `UEB_INDICATOR_REPLACEMENTS` (a char missing from the table is logged as an error and replaced by "").
/// 5. Runs of two or more braille blanks are collapsed into one.
///
/// If the "UEB_START_MODE" preference is "Grade1", `pick_start_mode` is told to use only grade 1.
fn ueb_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
    // debug!("ueb_cleanup: start={}", raw_braille);
    let result = typeface_to_word_mode(&raw_braille);
    let result = capitals_to_word_mode(&result);

    let use_only_grade1 = pref_manager.pref_to_string("UEB_START_MODE").as_str() == "Grade1";
    
    // '𝐖' is a hard break -- basically, it separates exprs
    let mut result = result.split('𝐖')
                        .map(|str| pick_start_mode(str, use_only_grade1) + "W")
                        .collect::<String>();
    result.pop();   // we added a 'W' at the end that needs to be removed.

    let result = result.replace("tW", "W");

    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
    let double_struck = pref_manager.pref_to_string("UEB_DoubleStruck");
    let sans_serif = pref_manager.pref_to_string("UEB_SansSerif");
    let fraktur = pref_manager.pref_to_string("UEB_Fraktur");
    let greek_variant = pref_manager.pref_to_string("UEB_GreekVariant");

    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
        let matched_char = &cap[0];
        match matched_char {
            "𝔹" => &double_struck,
            "S" => &sans_serif,
            "D" => &fraktur,
            "V" => &greek_variant,
            _ => match UEB_INDICATOR_REPLACEMENTS.get(matched_char) {
                None => {error!("REPLACE_INDICATORS and UEB_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
                Some(&ch) => ch,
            },
        }
    });

    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
   
    return result.to_string();

    /// Decides the start mode for one hard-break-delimited piece of the raw string and returns the piece rewritten for that mode.
    ///
    /// If `is_cap_passage_mode_good` accepts the piece, it is first converted with `convert_to_cap_passage_mode`.
    /// Then:
    /// * `use_only_grade1`: the piece is returned as processed for grade 1 passage mode (no indicators are added here);
    /// * otherwise the grade 2 (symbol duration) version is returned if `is_grade2_string_ok` accepts it;
    /// * otherwise the result of `try_grade1_word_mode` is returned if it is not empty;
    /// * otherwise the grade 1 passage version is returned wrapped in "⠰⠰⠰" ... "⠰⠄".
    fn pick_start_mode(raw_braille: &str, use_only_grade1: bool) -> String {
        // Need to decide what the start mode should be
        // From http://www.brailleauthority.org/ueb/ueb_math_guidance/final_for_posting_ueb_math_guidance_may_2019_102419.pdf
        //   Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
        //   or before a single letter standing alone anywhere in the expression,
        //   begin the expression with a grade 1 word indicator (or a passage indicator if the expression includes spaces)
        // Apparently "only a grade 1 symbol..." means at most one grade 1 symbol based on some examples (GTM 6.4, example 4)
        // debug!("before determining mode:  '{}'", raw_braille);

        // a bit ugly because we need to store the string if we have cap passage mode
        let raw_braille_string = if is_cap_passage_mode_good(raw_braille) {convert_to_cap_passage_mode(raw_braille)} else {String::default()};
        let raw_braille = if raw_braille_string.is_empty() {raw_braille} else {&raw_braille_string};
        if use_only_grade1 {
            return remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
        }
        let grade2 = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade2, UEB_Duration::Symbol);
        // debug!("Symbol mode:  '{}'", grade2);

        if is_grade2_string_ok(&grade2) {
            return grade2;
        } else {
            // BANA says use g1 word mode if spaces are present, but that's not what their examples do
            // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks)
            // The G1 Word mode might not be at the start (iceb.rs:omission_3_6_7)
            let grade1_word = try_grade1_word_mode(raw_braille);
            // debug!("Word mode:    '{}'", grade1_word);
            if !grade1_word.is_empty() {
                return grade1_word;
            } else {
                let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
                return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄";
            }
        }

        /// Return true if at least five (= # of cap passage indicators) cap indicators and no lower case letters
        /// Scanning stops early (and only the cap count so far is tested) if a '1' is seen while in caps mode.
        /// Panics (via `assert!`) if whitespace is reached while in caps mode and that mode is not word mode.
        fn is_cap_passage_mode_good(braille: &str) -> bool {
            let mut n_caps = 0;
            let mut is_cap_mode = false;
            let mut cap_mode = UEB_Duration::Symbol;    // real value set when is_cap_mode is set to true
            let mut chars = braille.chars();

            // look CL or CCL for caps (CC runs until we get whitespace)
            // if we find an L not in caps mode, we return false
            // Note: caps can be C𝐶, whitespace can be W𝐖
            while let Some(ch) = chars.next() {
                if ch == 'L' {
                    if !is_cap_mode {
                        return false;
                    }
                    chars.next();       // skip letter
                    if cap_mode == UEB_Duration::Symbol {
                        is_cap_mode = false;
                    }
                } else if ch == 'C' || ch == '𝐶' {
                    if is_cap_mode {
                        // second cap indicator in a row: symbol mode becomes word mode
                        if cap_mode == UEB_Duration::Symbol {
                            cap_mode = UEB_Duration::Word;
                        }
                    } else {
                        is_cap_mode = true;
                        cap_mode = UEB_Duration::Symbol;
                    }
                    n_caps += 1;
                } else if ch == 'W' || ch == '𝐖' {
                    if is_cap_mode {
                        assert!(cap_mode == UEB_Duration::Word);
                    }
                    is_cap_mode = false;
                } else if ch == '1' && is_cap_mode {
                    break;
                }
            }
            return n_caps > 4;
        }

        /// Removes every 'C' and '𝐶' marker from `braille` and wraps the result in "⠠⠠⠠" ... "⠠⠄".
        fn convert_to_cap_passage_mode(braille: &str) -> String {
            return "⠠⠠⠠".to_string() + &braille.replace(['C', '𝐶'], "") + "⠠⠄";
        }

        /// Return true if the BANA or ICEB guidelines say it is ok to start with grade 2
        fn is_grade2_string_ok(grade2_braille: &str) -> bool {
            // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone.
            // The exact quote from their guidance:
            //    Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
            //    or before a single letter standing alone anywhere in the expression,
            //    begin the expression with a grade 1 word indicator
            // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems
            //    like it is a reasonable thing to do.
            // Another modification is allow a single G1 indicator to occur after whitespace later on
            //    because ICEB examples show it and it seems better than going to passage mode if it is the only G1 indicator

            // Because of the 'L's which go away, we have to put a little more work into finding the first three chars
            let chars = grade2_braille.chars().collect::<Vec<char>>();
            let mut n_real_chars = 0;  // number of chars seen so far that are neither an unforced '1' nor one of "𝐶CLobc"
            let mut found_g1 = false;
            let mut i = 0;
            // first pass: the first three "real" chars -- at most one unforced '1' is allowed among them
            while i < chars.len() {
                let ch = chars[i];
                if ch == '1' && !is_forced_grade1(&chars, i) {
                    if found_g1 {
                        return false;
                    }
                    found_g1 = true;
                } else if !"𝐶CLobc".contains(ch) {
                    if n_real_chars == 2 {
                        i += 1;
                        break;              // this is the third real char
                    };
                    n_real_chars += 1;
                }
                i += 1
            }

            // if we find *another* g1 that isn't forced and isn't standing alone, we are done
            // I've added a 'follows whitespace' clause for test iceb.rs:omission_3_6_2 to the standing alone rule
            // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it
            // Same for GTM 1_7_3_1 (passage mode is mentioned also)
            let mut is_standing_alone_already_encountered = false;
            let mut is_after_whitespace = false;
            while i < chars.len() {
                let ch = chars[i];
                if ch == 'W' {
                    is_after_whitespace = true;
                } else if ch == '1' && !is_forced_grade1(&chars, i) {
                    if is_standing_alone_already_encountered ||
                       ((found_g1 || !is_after_whitespace) && !is_single_letter_on_right(&chars, i)) {
                        return false;
                    }
                    found_g1 = true;
                    is_standing_alone_already_encountered = true;
                }
                i += 1;
            }
            return true;
        }

        /// Return true if the sequence of chars forces a '1' at the `i`th position
        /// Note: `chars[i]` should be '1'
        fn is_forced_grade1(chars: &[char], i: usize) -> bool {
            // A '1' is forced if 'a-j' follows a digit
            assert_eq!(chars[i], '1', "'is_forced_grade1' didn't start with '1'");
            // check that a-j follows the '1' -- we have '1Lx' where 'x' is the letter to check
            if i+2 < chars.len() && LETTER_NUMBERS.contains(&unhighlight(chars[i+2])) {
                // check for a number before the '1'
                // this will be 'N' followed by LETTER_NUMBERS or the number ".", ",", or " "
                for j in (0..i).rev() {
                    let ch = chars[j];
                    if !(LETTER_NUMBERS.contains(&unhighlight(ch)) || ".,W𝐖".contains(ch)) {
                        // first char (going left) that can't be part of a number: forced only if it is the numeric indicator
                        return ch == 'N'
                    }
                }
            }
            return false;
        }

        /// Looks to the right of position `i` (skipping typeface/capital markers) for letters ('L' + cell pairs).
        /// Returns false as soon as a second letter is found; at the first char that is neither a skipped marker
        /// nor an 'L', returns whether exactly one letter was seen; returns true if the end of `chars` is reached.
        fn is_single_letter_on_right(chars: &[char], i: usize) -> bool {
            static SKIP_CHARS: phf::Set<char> = phf_set! {
                'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w'   // indicators
            };

            // find the first char (if any)
            let mut count = 0;      // how many letters
            let mut i = i+1;
            while i < chars.len() {
                let ch = chars[i];
                if !SKIP_CHARS.contains(&ch) {
                    if ch == 'L' {
                        if count == 1 {
                            return false;   // found a second letter in the sequence
                        }
                        count += 1;
                    } else {
                        return count==1;
                    }
                    i += 2;   // eat 'L' and actual letter
                } else {
                    i += 1;
                }
            }
            return true;
        }

        /// Tries to express `raw_braille` with a single grade 1 word indicator.
        /// The string is split at 'W'; exactly one of the words may need grade 1 (contain an unforced '1').
        /// That word is prefixed with "⠰⠰" and processed in grade 1 word mode; the words are rejoined with 'W'.
        /// Returns an empty string if no word or more than one word needs grade 1.
        fn try_grade1_word_mode(raw_braille: &str) -> String {
            // this isn't quite right, but pretty close -- try splitting at 'W' (words)
            // only one of the parts can be in word mode and none of the others can have '1' unless forced
            let mut g1_words = Vec::default();
            let mut found_word_mode = false;
            for raw_word in raw_braille.split('W') {
                let word = remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade2, UEB_Duration::Symbol);
                // debug!("try_grade1_word_mode: word='{}'", word);
                let word_chars = word.chars().collect::<Vec<char>>();
                let needs_word_mode = word_chars.iter().enumerate()
                    .any(|(i, &ch) | ch == '1' && !is_forced_grade1(&word_chars, i));
                if needs_word_mode {
                    if found_word_mode {
                        return "".to_string();
                    }
                    found_word_mode = true;
                    g1_words.push("⠰⠰".to_string() + &remove_unneeded_mode_changes(raw_word, UEB_Mode::Grade1, UEB_Duration::Word)
                    );
                } else {
                    g1_words.push(word);
                }
            }
            return if found_word_mode {g1_words.join("W")} else {"".to_string()};
        }
    }
}
1116
1117fn typeface_to_word_mode(braille: &str) -> String {
1118    lazy_static! {
1119        static ref HAS_TYPEFACE: Regex = Regex::new("[BI𝔹STD]").unwrap();
1120    }
1121    // debug!("before typeface fix:  '{}'", braille);
1122
1123    let mut result = "".to_string();
1124    let chars = braille.chars().collect::<Vec<char>>();
1125    let mut word_mode = Vec::with_capacity(5);
1126    let mut word_mode_end = Vec::with_capacity(5);
1127    let mut i = 0;
1128    while i < chars.len() {
1129        let ch = chars[i];
1130        if HAS_TYPEFACE.is_match(ch.to_string().as_str()) {
1131            let i_next_char_target = find_next_char(&chars[i+1..], ch);
1132            if word_mode.contains(&ch) {
1133                if i_next_char_target.is_none() {
1134                    word_mode.retain(|&item| item!=ch);  // drop the char since word mode is done
1135                    word_mode_end.push(ch);   // add the char to signal to add end sequence
1136                }
1137            } else {
1138                result.push(ch);
1139                if i_next_char_target.is_some() {
1140                    result.push('w');     // typeface word indicator
1141                    word_mode.push(ch);      // starting word mode for this char
1142                } else {
1143                    result.push('s');     // typeface single char indicator
1144                }
1145            }
1146            i += 1; // eat "B", etc
1147        } else if ch == 'L' || ch == 'N' {
1148            result.push(chars[i]);
1149            result.push(chars[i+1]);
1150            if !word_mode_end.is_empty() && i+2 < chars.len() && !(chars[i+2] == 'W'|| chars[i+2] == '𝐖') {
1151                // add terminator unless word sequence is terminated by end of string or whitespace
1152                for &ch in &word_mode_end {
1153                    result.push(ch);
1154                    result.push('e');
1155                };
1156                word_mode_end.clear();
1157            }
1158            i += 2; // eat Ll/Nd
1159        } else {
1160            result.push(ch);
1161            i += 1;
1162        }
1163    }
1164    return result;
1165
1166}
1167
1168fn capitals_to_word_mode(braille: &str) -> String {
1169    use std::iter::FromIterator;
1170    // debug!("before capitals fix:  '{}'", braille);
1171
1172    let mut result = "".to_string();
1173    let chars = braille.chars().collect::<Vec<char>>();
1174    let mut is_word_mode = false;
1175    let mut i = 0;
1176    // look for a sequence of CLxCLy... and create CCLxLy...
1177    while i < chars.len() {
1178        let ch = chars[i];
1179        if ch == 'C' {
1180            // '𝑐' should only occur after a 'C', so we don't have top-level check for it
1181            let mut next_non_cap = i+1;
1182            while let Some(i_next) = find_next_char(&chars[next_non_cap..], '𝑐') {
1183                next_non_cap += i_next + 1; // C/𝑐, L, letter
1184            }
1185            if find_next_char(&chars[next_non_cap..], 'C').is_some() { // next letter sequence "C..."
1186                if is_next_char_start_of_section_12_modifier(&chars[next_non_cap+1..]) {
1187                    // to me this is tricky -- section 12 modifiers apply to the previous item
1188                    // the last clause of the "item" def is the previous indivisible symbol" which ICEB 2.1 say is:
1189                    //   braille sign: one or more consecutive braille characters comprising a unit,
1190                    //     consisting of a root on its own or a root preceded by one or more
1191                    //     prefixes (also referred to as braille symbol)
1192                    // this means the capital indicator needs to be stated and can't be part of a word or passage
1193                    is_word_mode = false;
1194                    result.push_str(String::from_iter(&chars[i..next_non_cap]).as_str());
1195                    i = next_non_cap;
1196                    continue;
1197                }
1198                if is_word_mode {
1199                    i += 1;     // skip the 'C'
1200                } else {
1201                    // start word mode -- need an extra 'C'
1202                    result.push('C');
1203                    is_word_mode = true;
1204                }
1205            } else if is_word_mode {
1206                i += 1;         // skip the 'C'
1207            }
1208            if chars[next_non_cap] == 'G' {
1209                // Greek letters are a bit exceptional in that the pattern is "CGLx" -- bump 'i'
1210                next_non_cap += 1;
1211            }
1212            if chars[next_non_cap] != 'L' {
1213                error!("capitals_to_word_mode: internal error: didn't find L after C in '{}'.",
1214                       chars[i..next_non_cap+2].iter().collect::<String>().as_str());
1215            }
1216            let i_braille_char = next_non_cap + 2;
1217            result.push_str(String::from_iter(&chars[i..i_braille_char]).as_str());
1218            i = i_braille_char;
1219        } else if ch == 'L' {       // must be lowercase -- uppercase consumed above
1220            // assert!(LETTERS.contains(&unhighlight(chars[i+1]))); not true for other alphabets
1221            if is_word_mode {
1222                result.push('e');       // terminate Word mode (letter after caps)
1223                is_word_mode = false;
1224            }
1225            result.push('L');
1226            result.push(chars[i+1]);
1227            i += 2; // eat L, letter
1228        } else {
1229            is_word_mode = false;   // non-letters terminate cap word mode
1230            result.push(ch);
1231            i += 1;
1232        }
1233    }
1234    return result;
1235
1236    fn is_next_char_start_of_section_12_modifier(chars: &[char]) -> bool {
1237        // first find the L and eat the char so that we are at the potential start of where the target lies
1238        let chars_len = chars.len();
1239        let mut i_cap = 0;
1240        while chars[i_cap] != 'C' {     // we know 'C' is in the string, so no need to check for exceeding chars_len
1241            i_cap += 1;
1242        }
1243        for i_end in i_cap+1..chars_len {
1244            if chars[i_end] == 'L' {
1245                // skip the next char to get to the real start, and then look for the modifier string or next L/N
1246                // debug!("   after L '{}'", chars[i_end+2..].iter().collect::<String>());
1247                for i in i_end+2..chars_len {
1248                    let ch = chars[i];
1249                    if ch == '1' {
1250                        // Fix: there's probably a much better way to check if we have a match against one of "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱ ", "⠨⠸⠱"
1251                        if chars[i+1] == '⠱' {
1252                            return true;
1253                        } else if i+2 < chars_len {
1254                            let mut str = chars[i+1].to_string();
1255                            str.push(chars[i+2]);
1256                            if str == "⠘⠱" || str == "⠘⠲" || str == "⠸⠱" || str == "⠐⠱" {
1257                                return true;
1258                            } else if i+3 < chars_len {
1259                                str.push(chars[i+3]);
1260                                return str == "⠨⠸⠱";
1261                            }
1262                            return false;
1263                        }
1264                    }
1265                    if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
1266                        return false;
1267                    }
1268                }
1269            }
1270        }
1271        return false;
1272    }    
1273}
1274
1275fn find_next_char(chars: &[char], target: char) -> Option<usize> {        
1276    // first find the L or N and eat the char so that we are at the potential start of where the target lies
1277    // debug!("Looking for '{}' in '{}'", target, chars.iter().collect::<String>());
1278    for i_end in 0..chars.len() {
1279        if chars[i_end] == 'L' || chars[i_end] == 'N' {
1280            // skip the next char to get to the real start, and then look for the target
1281            // stop when L/N signals past potential target or we hit some non L/N char (actual braille)
1282            // debug!("   after L/N '{}'", chars[i_end+2..].iter().collect::<String>());
1283            for (i, &ch) in chars.iter().enumerate().skip(i_end+2) {
1284                if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
1285                    return None;
1286                } else if ch == target {
1287                    // debug!("   found target");
1288                    return Some(i);
1289                }
1290            }
1291        }
1292    }
1293    return None;
1294}
1295
/// The braille mode tracked by `remove_unneeded_mode_changes` as it scans the intermediate string.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Mode {
    /// Entered when a number ('N') is seen.
    Numeric,        // also includes Grade1
    /// Grade 1 indicators found while in this mode are dropped rather than copied to the output.
    Grade1,
    /// Letters are checked for "standing alone" and letter runs are handed to `handle_contractions`.
    Grade2,
}
1303
/// How long the current `UEB_Mode` stays in effect.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Duration {
    // Standing alone: A braille symbol that is standing alone may have a contracted (grade 2) meaning.
    // A letter or unbroken sequence of letters is “standing alone” if the symbols before and after the letter or
    //   sequence are spaces, hyphens, dashes or any combination thereof, including some common punctuation.
    // Item: An “item” is defined as the next symbol or one of seven groupings listed in Rules of Unified English Braille, §11.4.1.
    /// The mode applies to the next symbol only.
    Symbol,

    // The grade 1 word indicator sets grade 1 mode for the next word or symbol sequence.
    // A symbol sequence in UEB is defined as an unbroken string of braille signs,
    //   whether alphabetic or non-alphabetic, preceded and followed by a space.
    /// The mode applies to the next word / symbol sequence.
    Word,
    /// The mode applies to a passage; in `remove_unneeded_mode_changes` a `Passage` start duration
    /// is not reset to `Symbol` when whitespace is reached.
    Passage,
}
1319
/// Indicator chars that may sit between a letter sequence and what precedes it without
/// preventing the sequence from "standing alone".
/// Consulted by `left_side_stands_alone` when scanning backwards, and by the Grade 2 default
/// case of `remove_unneeded_mode_changes` when deciding whether to restart its Grade 2 bookkeeping.
// used to determine standing alone (on left side)
static LEFT_INTERVENING_CHARS: phf::Set<char> = phf_set! {  // see RUEB 2.6.2
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w',     // indicators
    // opening chars have prefix 'o', so not in set ['(', '{', '[', '"', '\'', '“', '‘', '«'] 
};
1325
/// Return value for use_g1_word_mode()
/// Says whether a grade 1 word indicator ('𝟙') was found before the next whitespace char
/// ('W' or '𝐖'), or whether the scanned chars contain no '𝟙' at all.
#[derive(Debug, PartialEq)]
enum Grade1WordIndicator {
    /// Whitespace was reached before any '𝟙'.
    NotInWord,        // no '𝟙' in the current/next word
    /// A '𝟙' was found before any whitespace.
    InWord,           // '𝟙' in the current/next word
    /// The end of the chars was reached without seeing whitespace or '𝟙'; callers use this to skip later re-scans.
    NotInChars,       // no '𝟙' in the entire string (optimization for common case)
}
1333
/// Copies `raw_braille` to a new string while tracking the UEB mode (numeric, grade 1, grade 2),
/// dropping grade 1 indicators that the current mode already implies and adding the ones that are needed.
///
/// `raw_braille` is the intermediate string: a letter is 'L' followed by its cell, a digit is 'N' followed
/// by its cell, '1' is a grade 1 symbol indicator, '𝟙' is a grade 1 word indicator, 'W'/'𝐖' are whitespace,
/// and '#' ends numeric mode.
///
/// * `start_mode` -- the mode in effect at the start; it is also the mode restored when a mode ends.
/// * `start_duration` -- the duration in effect at the start; when it is `Passage`, whitespace does not
///   reset the duration to `Symbol`.
///
/// While in grade 2 mode, a '1' is added in front of a letter sequence that stands alone (single letter or
/// short form), and completed letter runs are passed to `handle_contractions` (unless in capital word mode).
///
/// Returns the rewritten string.
///
/// Panics if a '𝟙' is met in grade 1 mode while the duration is `Symbol` (the `assert!` below), or if the
/// input is malformed so that an indicator is not followed by the char it announces (index out of range).
fn remove_unneeded_mode_changes(raw_braille: &str, start_mode: UEB_Mode, start_duration: UEB_Duration) -> String {
    // FIX: need to be smarter about moving on wrt to typeforms/typefaces, caps, bold/italic. [maybe just let them loop through the default?]
    let mut mode = start_mode;
    let mut duration = start_duration;
    let mut start_g2_letter = None;    // used for start of contraction checks
    let mut i_g2_start = None;  // set to 'i' when entering G2 mode; None in other modes. '1' indicator goes here if standing alone
    let mut cap_word_mode = false;     // only set to true in G2 to prevent contractions
    let mut result = String::default();
    let chars = raw_braille.chars().collect::<Vec<char>>();
    let mut g1_word_indicator = Grade1WordIndicator::NotInChars;        // almost always true (and often irrelevant)
    // if the first word contains a '𝟙', start out in grade 1 mode (emitting the word indicator if one isn't already implied)
    if mode == UEB_Mode::Grade2 || duration == UEB_Duration::Symbol {
        g1_word_indicator = use_g1_word_mode(&chars);
        if g1_word_indicator == Grade1WordIndicator::InWord {
            mode = UEB_Mode::Grade1;
            if duration == UEB_Duration::Symbol {
                duration = UEB_Duration::Word;     // if Passage mode, leave as is
                result.push('𝟙')
            }
        }
    }
    let mut i = 0;
    // each arm below is responsible for advancing 'i' past whatever it consumed
    while i < chars.len() {
        let ch = chars[i];
        match mode {
            UEB_Mode::Numeric => {
                // Numeric Mode: (from https://uebmath.aphtech.org/lesson1.0 and lesson4.0)
                // Symbols that can appear within numeric mode include the ten digits, comma, period, simple fraction line,
                // line continuation indicator, and numeric space digit symbols.
                // A space or any other symbol not listed here terminates numeric mode.
                // Numeric mode is also terminated by the "#" -- used after a script
                //
                // The numeric indicator also turns on grade 1 mode.
                // When grade 1 mode is set by the numeric indicator,
                //   grade 1 indicators are not used unless a single lower-case letter a-j immediately follows a digit.
                // Grade 1 mode when set by the numeric indicator is terminated by a space, hyphen, dash, or a grade 1 indicator.
                i_g2_start = None;
                // debug!("Numeric: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        // terminate numeric mode -- duration doesn't change
                        // let the default case handle pushing on the chars for the letter
                        if LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
                            result.push('1');   // need to distinguish a-j from a digit
                        }
                        result.push(ch);
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        // duration remains Word
                    },
                    '1' | '𝟙' => {
                        // numeric mode implies grade 1, so don't output indicator;
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        if start_duration == UEB_Duration::Passage {
                            duration = UEB_Duration::Passage;      // otherwise it remains at Word
                        }
                    },
                    '#' => {
                        // terminate numeric mode -- duration doesn't change
                        // the '#' itself is not copied to the output
                        i += 1;
                        if i+1 < chars.len() && chars[i] == 'L' && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
                            // special case where the script was numeric and a letter follows, so need to put out G1 indicator
                            result.push('1');
                            // the G1 case should work with 'L' now
                        }
                        mode = UEB_Mode::Grade1;
                    },
                    'N' => {
                        // stay in the same mode (includes numeric "," and "." space) -- don't let default get these chars
                        // only the char after the 'N' is copied; the 'N' is dropped
                        result.push(chars[i+1]);
                        i += 2;
                    },
                    _ => {
                        // moving out of numeric mode
                        result.push(ch);
                        i += 1;
                        if "W𝐖-—―".contains(ch) {
                            // whitespace, hyphen, or dash: go back to the mode we started with
                            mode = start_mode;
                            if mode == UEB_Mode::Grade2 {
                                start_g2_letter = None;        // will be set to real letter
                            }
                            if start_duration != UEB_Duration::Passage {
                                duration = UEB_Duration::Symbol;
                            }
                        } else {
                            mode = UEB_Mode::Grade1
                        }
                    },
                }
            },
            UEB_Mode::Grade1 => {
                // Grade 1 Mode:
                // The numeric indicator also sets grade 1 mode.
                // Grade 1 mode, when initiated by the numeric indicator, is terminated by a space, hyphen, dash or grade 1 terminator.
                // Grade 1 mode is also set by grade 1 indicators.
                i_g2_start = None;
                // debug!("Grade 1: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        // note: be aware of '#' case for Numeric because '1' might already be generated
                        // let prev_ch = if i > 1 {chars[i-1]} else {'1'};   // '1' -- anything beside ',' or '.'
                        // if duration == UEB_Duration::Symbol || 
                        //     ( ",. ".contains(prev_ch) && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) ) {
                        //     result.push('1');        // need to retain grade 1 indicator (RUEB 6.5.2)
                        // }
                        // let the default case handle pushing on the chars for the letter
                        result.push(ch);
                        i += 1;
                    },
                    '1' | '𝟙' => {
                        assert!(ch == '1' || duration != UEB_Duration::Symbol);     // if '𝟙', should be Word or Passage duration
                        // nothing to do -- let the default case handle the following chars
                        // (the indicator is redundant in grade 1 mode, so it is not copied)
                        i += 1;
                    },
                    'N' => {
                        result.push(ch);
                        result.push(chars[i+1]);
                        i += 2;
                        mode = UEB_Mode::Numeric;
                        duration = UEB_Duration::Word;
                    },
                    'W' | '𝐖' => {
                        // this terminates a word mode if there was one
                        result.push(ch);
                        i += 1;
                        if start_duration != UEB_Duration::Passage {
                            duration = UEB_Duration::Symbol;
                            mode = UEB_Mode::Grade2;
                        }
                    },
                    _ => {
                        result.push(ch);
                        i += 1;
                        // a symbol-duration grade 1 mode ends once a char that isn't a letter prefix has been copied
                        if duration == UEB_Duration::Symbol && !LETTER_PREFIXES.contains(&ch) {
                            mode = start_mode;
                        }
                    }
                }
                if mode == UEB_Mode::Grade2 {
                    start_g2_letter = None;        // will be set to real letter
                }

            },
            UEB_Mode::Grade2 => {
                // note: if we ended up using a '1', it only extends to the next char, which is also dealt with, so mode doesn't change
               if i_g2_start.is_none() {
                   i_g2_start = Some(i);
                   cap_word_mode = false;
               }
                // debug!("Grade 2: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        if start_g2_letter.is_none() {
                            start_g2_letter = Some(i);
                        }
                        let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, i);
                        // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
                        if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
                            // debug!("  is_alone -- pushing '1'");
                            result.push('1');
                            mode = UEB_Mode::Grade1;
                        }
                        // debug!("  pushing {:?}", right_matched_chars);
                        right_matched_chars.iter().for_each(|&ch| result.push(ch));
                        i += right_matched_chars.len();
                    },
                    'C' => {
                        // Want 'C' before 'L'; Could be CC for word cap -- if so, eat it and move on
                        // Note: guaranteed that there is a char after the 'C', so chars[i+1] is safe
                        if chars[i+1] == 'C' {
                            cap_word_mode = true;
                            i += 1;
                        } else {
                            let is_greek = chars[i+1] == 'G';
                            let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, if is_greek {i+2} else {i+1});
                            // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
                            if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
                                // debug!("  is_alone -- pushing '1'");
                                result.push('1');
                                mode = UEB_Mode::Grade1;
                            }
                            if cap_word_mode {
                                result.push('C');   // first 'C' if cap word
                            }
                            result.push('C');
                            if is_greek {
                                result.push('G');
                                i += 1;
                            }
                            start_g2_letter = Some(i);
                            // debug!("  pushing 'C' + {:?}", right_matched_chars);
                            right_matched_chars.iter().for_each(|&ch| result.push(ch));
                            i += 1 + right_matched_chars.len();
                        }
                    },
                    '1' => {
                        result.push(ch);
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        duration = UEB_Duration::Symbol;
                    },
                    '𝟙' => {
                        // '𝟙' should have forced G1 Word mode
                        error!("Internal error: '𝟙' found in G2 mode: index={i} in '{raw_braille}'");
                        i += 1;
                    }
                    'N' => {
                        result.push(ch);
                        result.push(chars[i+1]);
                        i += 2;
                        mode = UEB_Mode::Numeric;
                        duration = UEB_Duration::Word;
                    },
                    _ => {
                        // a non-letter ends the current letter run: contract it (it is the tail of 'result') before copying 'ch'
                        if let Some(start) = start_g2_letter {
                            if !cap_word_mode {
                                result = handle_contractions(&chars[start..i], result);
                            }
                            cap_word_mode = false;
                            start_g2_letter = None;     // not start of char sequence
                        }
                        result.push(ch);
                        i += 1;
                        if !LEFT_INTERVENING_CHARS.contains(&ch) {
                            cap_word_mode = false;
                            i_g2_start = Some(i);
                        }

                    }
                }
                // leaving grade 2 mode: contract the pending letter run now since the other modes don't do contractions
                if mode != UEB_Mode::Grade2 && !cap_word_mode {
                    if let Some(start) = start_g2_letter {
                        result = handle_contractions(&chars[start..i], result);
                        start_g2_letter = None;     // not start of char sequence
                    }
                }
            },
        }

        // 'ch' is the char this iteration started on: after whitespace, check whether the next word needs grade 1 word mode
        if (ch == 'W' || ch == '𝐖') && g1_word_indicator != Grade1WordIndicator::NotInChars &&
           (mode == UEB_Mode::Grade2 || duration == UEB_Duration::Symbol) {
            g1_word_indicator = use_g1_word_mode(&chars[i..]);
            if g1_word_indicator == Grade1WordIndicator::InWord {
                mode = UEB_Mode::Grade1;
                if duration == UEB_Duration::Symbol {
                    duration = UEB_Duration::Word;     // if Passage mode, leave as is
                    result.push('𝟙')
                }
            }
        }
    }
    // the input ended while a grade 2 letter run was still pending
    if mode == UEB_Mode::Grade2 {
        if let Some(start) = start_g2_letter {
            result = handle_contractions(&chars[start..i], result);
        }
    }

    return result;


    /// Scans `chars` up to the first whitespace char ('W' or '𝐖') and reports whether a
    /// grade 1 word indicator ('𝟙') occurs before it.
    fn use_g1_word_mode(chars: &[char]) -> Grade1WordIndicator {
        // debug!("use_g1_word_mode: chars='{:?}'", chars);
        for &ch in chars {
            if ch == 'W' || ch == '𝐖' {
                return Grade1WordIndicator::NotInWord;       // reached a word boundary
            }
            if ch == '𝟙' {
                return Grade1WordIndicator::InWord;        // need word mode in this "word"
            }
        }
        return Grade1WordIndicator::NotInChars;               // hit the end without seeing whitespace or '𝟙'
    }
}
1607
1608/// Returns a tuple:
1609///   true if the ith char "stands alone" (UEB 2.6)
1610///   the chars on the right that are part of the standing alone sequence
1611///   the number of letters in that sequence
1612/// This basically means a letter sequence surrounded by white space with some potentially intervening chars
1613/// The intervening chars can be typeform/cap indicators, along with various forms of punctuation
1614/// The ith char should be an "L"
1615/// This assumes that there is whitespace before and after the character string
1616fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) {
1617    // scan backward and check the conditions for "standing-alone"
1618    // we scan forward and check the conditions for "standing-alone"
1619    assert_eq!(chars[i], 'L', "'stands_alone' starts with non 'L'");
1620    // debug!("stands_alone: i={}, chars: {:?}", i, chars);
1621    if !left_side_stands_alone(&chars[0..i]) {
1622        return (false, &chars[i..i+2], 0);
1623    }
1624
1625    let (mut is_alone, n_letters, n_right_matched) = right_side_stands_alone(&chars[i+2..]);
1626    // debug!("left is alone, right is alone: {}, : n_letters={}, n_right_matched={}", is_alone, n_letters, n_right_matched);
1627
1628    if is_alone && n_letters == 1 {
1629        let ch = chars[i+1];
1630        if ch=='⠁' || ch=='⠊' || ch=='⠕' {      // a, i, o
1631            is_alone = false;
1632        }
1633    }
1634    return (is_alone, &chars[i..i+2+n_right_matched], n_letters);
1635
1636    /// chars before before 'L'
1637    fn left_side_stands_alone(chars: &[char]) -> bool {
1638        // scan backwards to skip letters and intervening chars
1639        // once we hit an intervening char, only intervening chars are allowed if standing alone
1640        let mut intervening_chars_mode = false; // true when we are on the final stretch
1641        let mut i = chars.len();
1642        while i > 0 {
1643            i -= 1;
1644            let ch = chars[i];
1645            let prev_ch = if i > 0 {chars[i-1]} else {' '};  // ' ' is a char not in input
1646            // debug!("  left alone: prev/ch {}/{}", prev_ch, ch);
1647            if (!intervening_chars_mode && prev_ch == 'L') ||
1648               (prev_ch == 'o' || prev_ch == 'b') {
1649                intervening_chars_mode = true;
1650                i -= 1;       // ignore 'Lx' and also ignore 'ox'
1651            } else if LEFT_INTERVENING_CHARS.contains(&ch) {
1652                intervening_chars_mode = true;
1653            } else {
1654                return "W𝐖-—―".contains(ch);
1655            }
1656        }
1657
1658        return true;
1659    }
1660
1661    // chars after character we are testing
1662    fn right_side_stands_alone(chars: &[char]) -> (bool, usize, usize) {
1663        // see RUEB 2.6.3
1664        static RIGHT_INTERVENING_CHARS: phf::Set<char> = phf_set! {
1665            'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w', 'e',   // indicators
1666            // ')', '}', ']', '\"', '\'', '”', '’', '»',      // closing chars
1667            // ',', ';', ':', '.', '…', '!', '?'              // punctuation           
1668        };
1669        // scan forward to skip letters and intervening chars
1670        // once we hit an intervening char, only intervening chars are allowed if standing alone ('c' and 'b' are part of them)
1671        let mut intervening_chars_mode = false; // true when we are on the final stretch
1672        let mut i = 0;
1673        let mut n_letters = 1;      // we have skipped the first letter
1674        while i < chars.len() {
1675            let ch = chars[i];
1676            // debug!("  right alone: ch/next {}/{}", ch, if i+1<chars.len() {chars[i+1]} else {' '});
1677            if !intervening_chars_mode && ch == 'L' {
1678                n_letters += 1;
1679                i += 1;       // ignore 'Lx' and also ignore 'ox'
1680            } else if ch == 'c' || ch == 'b' {
1681                i += 1;       // ignore 'Lx' and also ignore 'ox'
1682            } else if RIGHT_INTERVENING_CHARS.contains(&ch) {  
1683                intervening_chars_mode = true;
1684            } else {
1685                return if "W𝐖-—―".contains(ch) {(true, n_letters, i)} else {(false, n_letters, i)};
1686            }
1687            i += 1;
1688        }
1689
1690        return (true, n_letters, chars.len());
1691    }
1692}
1693
1694
1695/// Return a modified result if chars can be contracted.
1696/// Otherwise, the original string is returned
1697fn handle_contractions(chars: &[char], mut result: String) -> String {
1698    struct Replacement {
1699        pattern: String,
1700        replacement: &'static str
1701    }
1702
1703    const ASCII_TO_UNICODE: &[char] = &[
1704        '⠀', '⠮', '⠐', '⠼', '⠫', '⠩', '⠯', '⠄', '⠷', '⠾', '⠡', '⠬', '⠠', '⠤', '⠨', '⠌',
1705        '⠴', '⠂', '⠆', '⠒', '⠲', '⠢', '⠖', '⠶', '⠦', '⠔', '⠱', '⠰', '⠣', '⠿', '⠜', '⠹',
1706        '⠈', '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', '⠝', '⠕',
1707        '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵', '⠪', '⠳', '⠻', '⠘', '⠸',
1708    ];
1709
1710    fn to_unicode_braille(ascii: &str) -> String {
1711        let mut unicode = String::with_capacity(4*ascii.len());   // 'L' + 3 bytes for braille char
1712        for ch in ascii.as_bytes() {
1713            unicode.push('L');
1714            unicode.push(ASCII_TO_UNICODE[(ch.to_ascii_uppercase() - 32) as usize])
1715        }
1716        return unicode;
1717    }
1718
1719    // It would be much better from an extensibility point of view to read the table in from a file
1720    lazy_static! {
1721        static ref CONTRACTIONS: Vec<Replacement> = vec![
1722            // 10.3: Strong contractions
1723            Replacement{ pattern: to_unicode_braille("and"), replacement: "L⠯"},
1724            Replacement{ pattern: to_unicode_braille("for"), replacement: "L⠿"},
1725            Replacement{ pattern: to_unicode_braille("of"), replacement: "L⠷"},
1726            Replacement{ pattern: to_unicode_braille("the"), replacement: "L⠮"},
1727            Replacement{ pattern: to_unicode_braille("with"), replacement: "L⠾"},
1728            
1729            // 10.8: final-letter group signs (this need to precede 'en' and any other shorter contraction)
1730            Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞".to_string(), replacement: "${s}L⠰L⠞" }, // ment
1731            Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝".to_string(), replacement: "${s}L⠰L⠝" } ,// tion
1732
1733            // 10.4: Strong group signs
1734            Replacement{ pattern: to_unicode_braille("ch"), replacement: "L⠡"},
1735            Replacement{ pattern: to_unicode_braille("gh"), replacement: "L⠣"},
1736            Replacement{ pattern: to_unicode_braille("sh"), replacement: "L⠩"},
1737            Replacement{ pattern: to_unicode_braille("th"), replacement: "L⠹"},
1738            Replacement{ pattern: to_unicode_braille("wh"), replacement: "L⠱"},
1739            Replacement{ pattern: to_unicode_braille("ed"), replacement: "L⠫"},
1740            Replacement{ pattern: to_unicode_braille("er"), replacement: "L⠻"},
1741            Replacement{ pattern: to_unicode_braille("ou"), replacement: "L⠳"},
1742            Replacement{ pattern: to_unicode_braille("ow"), replacement: "L⠪"},
1743            Replacement{ pattern: to_unicode_braille("st"), replacement: "L⠌"},
1744            Replacement{ pattern: "(?P<s>L.)L⠊L⠝L⠛".to_string(), replacement: "${s}L⠬" },  // 'ing', not at start
1745            Replacement{ pattern: to_unicode_braille("ar"), replacement: "L⠜"},
1746
1747            // 10.6.5: Lower group signs preceded and followed by letters
1748            // FIX: don't match if after/before a cap letter -- can't use negative pattern (?!...) in regex package
1749            // Note: removed cc because "arccos" shouldn't be contracted (10.11.1), but there is no way to know about compound words
1750            // Add it back after implementing a lookup dictionary of exceptions
1751            Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)".to_string(), replacement: "${s}L⠂${e}" },  // ea
1752            Replacement{ pattern: "(?P<s>L.)L⠃L⠃(?P<e>L.)".to_string(), replacement: "${s}L⠆${e}" },  // bb
1753            // Replacement{ pattern: "(?P<s>L.)L⠉L⠉(?P<e>L.)".to_string(), replacement: "${s}L⠒${e}" },  // cc
1754            Replacement{ pattern: "(?P<s>L.)L⠋L⠋(?P<e>L.)".to_string(), replacement: "${s}L⠖${e}" },  // ff
1755            Replacement{ pattern: "(?P<s>L.)L⠛L⠛(?P<e>L.)".to_string(), replacement: "${s}L⠶${e}" },  // gg
1756
1757            // 10.6.8: Lower group signs ("in" also 10.5.4 lower word signs)
1758            // FIX: these need restrictions about only applying when upper dots are present
1759            Replacement{ pattern: to_unicode_braille("en"), replacement: "⠢"},
1760            Replacement{ pattern: to_unicode_braille("in"), replacement: "⠔"},
1761           
1762        ];
1763
1764        static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(&CONTRACTIONS);
1765        static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(&CONTRACTIONS);
1766    }
1767
1768    let mut chars_as_str = chars.iter().collect::<String>();
1769    // debug!("  handle_contractions: examine '{}'", &chars_as_str);
1770    let matches = CONTRACTION_PATTERNS.matches(&chars_as_str);
1771    for i in matches.iter() {
1772        let element = &CONTRACTIONS[i];
1773        // debug!("  replacing '{}' with '{}' in '{}'", element.pattern, element.replacement, &chars_as_str);
1774        result.truncate(result.len() - chars_as_str.len());
1775        chars_as_str = CONTRACTION_REGEX[i].replace_all(&chars_as_str, element.replacement).to_string();
1776        result.push_str(&chars_as_str);
1777        // debug!("  result after replace '{}'", result);
1778    }
1779    return result;
1780
1781
1782
1783    fn init_patterns(contractions: &[Replacement]) -> RegexSet {
1784        let mut vec: Vec<&str> = Vec::with_capacity(contractions.len());
1785        for contraction in contractions {
1786            vec.push(&contraction.pattern);
1787        }
1788        return RegexSet::new(&vec).unwrap();
1789    }
1790
1791    fn init_regex(contractions: &[Replacement]) -> Vec<Regex> {
1792        let mut vec = Vec::with_capacity(contractions.len());
1793        for contraction in contractions {
1794            vec.push(Regex::new(&contraction.pattern).unwrap());
1795        }
1796        return vec;
1797    }
1798}
1799
1800
1801
1802
1803static VIETNAM_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
1804    "S" => "XXX",    // sans-serif -- from prefs
1805    "B" => "⠘",     // bold
1806    "𝔹" => "XXX",     // blackboard -- from prefs
1807    "T" => "⠈",     // script
1808    "I" => "⠨",     // italic
1809    "R" => "",      // roman
1810    // "E" => "⠰",     // English
1811    "1" => "⠠",     // Grade 1 symbol
1812    "L" => "",     // Letter left in to assist in locating letters
1813    "D" => "XXX",     // German (Deutsche) -- from prefs
1814    "G" => "⠰",     // Greek
1815    "V" => "XXX",    // Greek Variants
1816    // "H" => "⠠⠠",    // Hebrew
1817    // "U" => "⠈⠈",    // Russian
1818    "C" => "⠨",      // capital
1819    "𝑐" => "",       // second or latter braille cell of a capital letter
1820    "𝐶" => "⠨",      // capital that never should get word indicator (from chemical element)
1821    "N" => "⠼",     // number indicator
1822    "t" => "⠱",     // shape terminator
1823    "W" => "⠀",     // whitespace"
1824    "𝐖"=> "⠀",     // whitespace
1825    "s" => "⠆",     // typeface single char indicator
1826    "w" => "",     // typeface word indicator
1827    "e" => "",     // typeface & capital terminator 
1828    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
1829    "c" => "",     // flag that what follows is an close indicator (used for standing alone rule)
1830    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
1831    "," => "⠂",     // comma
1832    "." => "⠲",     // period
1833    "-" => "-",     // hyphen
1834    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
1835    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
1836    "#" => "",      // signals end of script
1837    "!" => "",      // Hack used to prevent some regular expression matches
1838};
1839
1840fn vietnam_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
1841    lazy_static! {
1842        // Deal with Vietnamese "rhymes" -- moving accents around
1843        // See "Vietnamese Uncontracted Braille Update in MathCAT" or maybe https://icanreadvietnamese.com/blog/14-rule-of-tone-mark-placement
1844        // Note: I don't know how to write (for example) I_E_RULE so that it excludes "qu" and "gi", so I use two rules
1845        // The first rule rewrites the patterns with "qu" and "gi" to add "!" to prevent a match of the second rule -- "!" is dropped later
1846        static ref QU_GI_RULE_EXCEPTION: Regex = Regex::new(r"(L⠟L⠥|L⠛L⠊)").unwrap();
1847        static ref IUOY_E_RULE: Regex = Regex::new(r"L(⠊|⠥|⠕|⠽)(L[⠔⠰⠢⠤⠠])L(⠑|⠣)").unwrap();     // ie, ue, oe, and ye rule
1848        static ref UO_A_RULE: Regex = Regex::new(r"L(⠥|⠕)(L[⠔⠰⠢⠤⠠])L(⠁|⠡|⠜)").unwrap();     // ua, oa rule
1849        static ref UU_O_RULE: Regex = Regex::new(r"L(⠥|⠳)(L[⠔⠰⠢⠤⠠])L(⠪|⠹)").unwrap();     // uo, ưo rule
1850        static ref UYE_RULE: Regex = Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽L⠣").unwrap();     // uo, ưo rule
1851        static ref UY_RULE: Regex = Regex::new(r"L⠥L([⠔⠰⠢⠤⠠])L⠽").unwrap();     // uo, ưo rule
1852        static ref REPLACE_INDICATORS: Regex =Regex::new(r"([1𝟙SB𝔹TIREDGVHP𝐶𝑐CLMNW𝐖swe,.-—―#ocb!])").unwrap();
1853
1854    }
1855    // debug!("vietnam_cleanup: start={}", raw_braille);
1856    let result = typeface_to_word_mode(&raw_braille);
1857    let result = capitals_to_word_mode(&result);
1858
1859    let result = result.replace("tW", "W");
1860    let result = result.replace("CG", "⠸");    // capital Greek letters are problematic in Vietnam braille
1861    let result = result.replace("CC", "⠸");    // capital word more is the same as capital Greek letters
1862    // debug!("   after typeface/caps={}", &result);
1863
1864    // deal with "rhymes"
1865    let result = QU_GI_RULE_EXCEPTION.replace_all(&result, "${1}!");
1866    // debug!("          after except={}", &result);
1867    let result = IUOY_E_RULE.replace_all(&result, "${2}L${1}L${3}");
1868    // debug!("          after IUOY_E={}", &result);
1869    let result = UO_A_RULE.replace_all(&result, "${2}L${1}L${3}");
1870    // debug!("          after   UO_A={}", &result);
1871    let result = UU_O_RULE.replace_all(&result, "${2}L${1}L${3}");
1872    // debug!("          after   UO_O={}", &result);
1873    let result = UYE_RULE.replace_all(&result, "${1}L⠥L⠽L⠣");  // longer match first
1874    // debug!("          after    UYE={}", &result);
1875    let result = UY_RULE.replace_all(&result, "${1}L⠥L⠽");
1876    // debug!("          after     UY={}", &result);
1877
1878    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
1879    let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
1880    let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
1881    let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
1882    let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
1883
1884    // This reuses the code just for getting rid of unnecessary "L"s and "N"s
1885    let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
1886
1887
1888    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
1889        let matched_char = &cap[0];
1890        match matched_char {
1891            "𝔹" => &double_struck,
1892            "S" => &sans_serif,
1893            "D" => &fraktur,
1894            "V" => &greek_variant,
1895            _ => match VIETNAM_INDICATOR_REPLACEMENTS.get(matched_char) {
1896                None => {error!("REPLACE_INDICATORS and VIETNAM_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
1897                Some(&ch) => ch,
1898            },
1899        }
1900    });
1901
1902    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
1903    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
1904    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
1905   
1906    return result.to_string();
1907}
1908
1909
1910static CMU_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
1911    // "S" => "XXX",    // sans-serif -- from prefs
1912    "B" => "⠔",     // bold
1913    "𝔹" => "⠬",     // blackboard -- from prefs
1914    // "T" => "⠈",     // script
1915    "I" => "⠔",     // italic -- same as bold
1916    // "R" => "",      // roman
1917    // "E" => "⠰",     // English
1918    "1" => "⠐",     // Grade 1 symbol -- used here for a-j after number
1919    "L" => "",     // Letter left in to assist in locating letters
1920    "D" => "⠠",     // German (Gothic)
1921    "G" => "⠈",     // Greek
1922    "V" => "⠈⠬",    // Greek Variants
1923    // "H" => "⠠⠠",    // Hebrew
1924    // "U" => "⠈⠈",    // Russian
1925    "C" => "⠨",      // capital
1926    "𝐶" => "⠨",      // capital that never should get word indicator (from chemical element)
1927    "N" => "⠼",     // number indicator
1928    "𝑁" => "",      // continue number
1929    // "t" => "⠱",     // shape terminator
1930    "W" => "⠀",     // whitespace"
1931    "𝐖"=> "⠀",     // whitespace
1932    // "𝘄" => "⠀",    // add whitespace if char to the left has dots 1, 2, or 3 -- special rule handled separately, so commented out
1933    "s" => "",     // typeface single char indicator
1934    // "w" => "⠂",     // typeface word indicator
1935    // "e" => "⠄",     // typeface & capital terminator 
1936    // "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
1937    // "c" => "",       // flag that what follows is an close indicator (used for standing alone rule)
1938    // "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
1939    "," => "⠂",     // comma
1940    "." => "⠄",     // period
1941    "-" => "⠤",     // hyphen
1942    "—" => "⠤⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
1943    // "―" => "⠐⠤⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
1944    "#" => "⠼",      // signals to end/restart of numeric mode (mixed fractions)
1945};
1946
1947
1948fn cmu_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
1949    lazy_static! {
1950        static ref ADD_WHITE_SPACE: Regex = Regex::new(r"𝘄(.)|𝘄$").unwrap();
1951    }
1952
1953    // debug!("cmu_cleanup: start={}", raw_braille);
1954    // let result = typeface_to_word_mode(&raw_braille);
1955
1956    // let result = result.replace("tW", "W");
1957    let result = raw_braille.replace("CG", "⠘")
1958                                .replace("𝔹C", "⠩")
1959                                .replace("DC", "⠰");
1960    // let result = result.replace("CC", "⠸");
1961
1962    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
1963    // let double_struck = pref_manager.pref_to_string("CMU_DoubleStruck");
1964    // let sans_serif = pref_manager.pref_to_string("CMU_SansSerif");
1965    // let fraktur = pref_manager.pref_to_string("CMU_Fraktur");
1966
1967    // debug!("Before remove mode changes: '{}'", &result);
1968    // This reuses the code just for getting rid of unnecessary "L"s and "N"s
1969    let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
1970    let result = result.replace("𝑁N", "");
1971    // debug!(" After remove mode changes: '{}'", &result);
1972
1973    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
1974        match CMU_INDICATOR_REPLACEMENTS.get(&cap[0]) {
1975            None => {error!("REPLACE_INDICATORS and CMU_INDICATOR_REPLACEMENTS are not in sync"); ""},
1976            Some(&ch) => ch,
1977        }
1978    });
1979    let result = ADD_WHITE_SPACE.replace_all(&result, |cap: &Captures| {
1980        if cap.get(1).is_none() {
1981            return "⠀".to_string();
1982        } else {
1983            // debug!("ADD_WHITE_SPACE match='{}', has left dots = {}", &cap[1], has_left_dots(cap[1].chars().next().unwrap()));
1984            let mut next_chars = cap[1].chars();
1985            let next_char = next_chars.next().unwrap();
1986            assert!(next_chars.next().is_none());
1987            return (if has_left_dots(next_char) {"⠀"} else {""}).to_string() + &cap[1];
1988        }
1989    });
1990    
1991    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
1992    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
1993    let result = result.trim_start_matches('⠀');            // don't trip end (e.g., see once::vector_11_2_5)
1994    return result.to_string();
1995
1996    fn has_left_dots(ch: char) -> bool {
1997        // Unicode braille is set up so dot 1 is 2^0, dot 2 is 2^1, etc
1998        return ( (ch as u32 - 0x2800) >> 4 ) > 0;
1999    }
2000}
2001
2002
2003
2004static SWEDISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
2005    // FIX: this needs cleaning up -- not all of these are used
2006    "S" => "XXX",    // sans-serif -- from prefs
2007    "B" => "⠨",     // bold
2008    "𝔹" => "XXX",     // blackboard -- from prefs
2009    "T" => "⠈",     // script
2010    "I" => "⠨",     // italic
2011    "R" => "",      // roman
2012    "1" => "⠱",     // Grade 1 symbol (used for number followed by a letter)
2013    "L" => "",     // Letter left in to assist in locating letters
2014    "D" => "XXX",     // German (Deutsche) -- from prefs
2015    "G" => "⠰",     // Greek
2016    "V" => "XXX",    // Greek Variants
2017    // "H" => "⠠⠠",    // Hebrew
2018    // "U" => "⠈⠈",    // Russian
2019    "C" => "⠠",      // capital
2020    "𝑐" => "",       // second or latter braille cell of a capital letter
2021    "𝐶" => "⠠",      // capital that never should get word indicator (from chemical element)
2022    "N" => "⠼",     // number indicator
2023    "t" => "⠱",     // shape terminator
2024    "W" => "⠀",     // whitespace"
2025    "𝐖"=> "⠀",     // whitespace
2026    "w" => "⠀",     // whitespace after function name
2027    "s" => "",     // typeface single char indicator
2028    "e" => "",     // typeface & capital terminator 
2029    "E" => "⠱",     // empty base -- see index of radical
2030    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
2031    "c" => "",     // flag that what follows is an close indicator (used for standing alone rule)
2032    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
2033    "," => "⠂",     // comma
2034    "." => "⠲",     // period
2035    "-" => "-",     // hyphen
2036    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
2037    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
2038    "#" => "",      // signals end of script
2039
2040};
2041
2042
2043static FINNISH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
2044    // FIX: this needs cleaning up -- not all of these are used
2045    "S" => "XXX",    // sans-serif -- from prefs
2046    "B" => "⠨",     // bold
2047    "𝔹" => "XXX",     // blackboard -- from prefs
2048    "T" => "⠈",     // script
2049    "I" => "⠨",     // italic
2050    "R" => "",      // roman
2051    "E" => "⠰",     // English
2052    "1" => "⠀",     // Grade 1 symbol (used for number followed by a letter)
2053    "L" => "",     // Letter left in to assist in locating letters
2054    "D" => "XXX",     // German (Deutsche) -- from prefs
2055    "G" => "⠨",     // Greek
2056    "V" => "XXX",    // Greek Variants
2057    // "H" => "⠠⠠",    // Hebrew
2058    // "U" => "⠈⠈",    // Russian
2059    "C" => "⠠",      // capital
2060    "𝑐" => "",       // second or latter braille cell of a capital letter
2061    "𝐶" => "⠠",      // capital that never should get whitespace in front (from chemical element)
2062    "N" => "⠼",     // number indicator
2063    "n" => "⠼",     // number indicator for drop numbers (special case with close parens)
2064    "t" => "⠱",     // shape terminator
2065    "W" => "⠀",     // whitespace"
2066    "𝐖"=> "⠀",     // whitespace
2067    "s" => "⠆",     // typeface single char indicator
2068    "w" => "",     // typeface word indicator
2069    "e" => "",     // typeface & capital terminator 
2070    "," => "⠂",     // comma
2071    "." => "⠲",     // period
2072    "-" => "-",     // hyphen
2073    "—" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
2074    "―" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
2075    "(" => "⠦",     // Not really needed, but done for consistency with ")"
2076    ")" => "⠴",     // Needed for rules with drop numbers to avoid mistaking for dropped 0
2077    "↑" => "⠬",     // superscript
2078    "↓" => "⠡",     // subscript
2079    "#" => "",      // signals end of script
2080    "Z" => "⠐",     // signals end of index of root, integrand/lim from function ("zone change")
2081
2082};
2083
2084fn finnish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2085    lazy_static! {
2086        static ref REPLACE_INDICATORS: Regex =Regex::new(r"([SB𝔹TIREDGVHUP𝐏C𝐶LlMmb↑↓Nn𝑁WwZ,()])").unwrap();
2087        // Numbers need to end with a space, but sometimes there is one there for other reasons
2088        static ref DROP_NUMBER_SEPARATOR: Regex = Regex::new(r"(n.)\)").unwrap();
2089        static ref NUMBER_MATCH: Regex = Regex::new(r"((N.)+[^WN𝐶#↑↓Z])").unwrap();
2090    }
2091
2092    // debug!("finnish_cleanup: start={}", raw_braille);
2093    let result = DROP_NUMBER_SEPARATOR.replace_all(&raw_braille, |cap: &Captures| {
2094        // match includes the char after the number -- insert the whitespace before it
2095        // debug!("DROP_NUMBER_SEPARATOR match='{}'", &cap[1]);
2096        return cap[1].to_string() + "𝐶)";       // hack to use "𝐶" instead of dot 6 directly, but works for NUMBER_MATCH
2097    });
2098    let result = result.replace('n', "N");  // avoids having to modify remove_unneeded_mode_changes()
2099    let result = NUMBER_MATCH.replace_all(&result, |cap: &Captures| {
2100        // match includes the char after the number -- insert the whitespace before it
2101        // debug!("NUMBER_MATCH match='{}'", &cap[1]);
2102        let mut chars = cap[0].chars();
2103        let last_char = chars.next_back().unwrap(); // unwrap safe since several chars were matched
2104        return chars.as_str().to_string() + "W" + &last_char.to_string();
2105    });
2106
2107    // FIX: need to implement this -- this is just a copy of the Vietnam code
2108    let result = result.replace("CG", "⠘")
2109                                    .replace("𝔹C", "⠩")
2110                                    .replace("DC", "⠰");
2111
2112    // debug!("   after typeface/caps={}", &result);
2113
2114    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
2115    let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
2116    let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
2117    let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
2118    let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
2119
2120    // This reuses the code just for getting rid of unnecessary "L"s and "N"s
2121    let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
2122    // debug!("   remove_unneeded_mode_changes={}", &result);
2123
2124
2125    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
2126        let matched_char = &cap[0];
2127        match matched_char {
2128            "𝔹" => &double_struck,
2129            "S" => &sans_serif,
2130            "D" => &fraktur,
2131            "V" => &greek_variant,
2132            _ => match FINNISH_INDICATOR_REPLACEMENTS.get(matched_char) {
2133                None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
2134                Some(&ch) => ch,
2135            },
2136        }
2137    });
2138
2139    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
2140    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
2141    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
2142   
2143    return result.to_string();
2144}
2145
2146
2147fn swedish_cleanup(pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2148    // FIX: need to implement this -- this is just a copy of the Vietnam code
2149    lazy_static! {
2150        // Empty bases are ok if they follow whitespace
2151        static ref EMPTY_BASE: Regex = Regex::new(r"(^|[W𝐖w])E").unwrap();
2152    }
2153    // debug!("swedish_cleanup: start={}", raw_braille);
2154    let result = typeface_to_word_mode(&raw_braille);
2155    let result = capitals_to_word_mode(&result);
2156
2157    let result = result.replace("CG", "⠘")
2158                                    .replace("𝔹C", "⠩")
2159                                    .replace("DC", "⠰");
2160
2161    // debug!("   after typeface/caps={}", &result);
2162
2163    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
2164    let double_struck = pref_manager.pref_to_string("Vietnam_DoubleStruck");
2165    let sans_serif = pref_manager.pref_to_string("Vietnam_SansSerif");
2166    let fraktur = pref_manager.pref_to_string("Vietnam_Fraktur");
2167    let greek_variant = pref_manager.pref_to_string("Vietnam_GreekVariant");
2168
2169    // This reuses the code just for getting rid of unnecessary "L"s and "N"s
2170    let result = remove_unneeded_mode_changes(&result, UEB_Mode::Grade1, UEB_Duration::Passage);
2171    // debug!("   after removing mode changes={}", &result);
2172
2173
2174    let result = EMPTY_BASE.replace_all(&result, "$1");
2175    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
2176        let matched_char = &cap[0];
2177        match matched_char {
2178            "𝔹" => &double_struck,
2179            "S" => &sans_serif,
2180            "D" => &fraktur,
2181            "V" => &greek_variant,
2182            _ => match SWEDISH_INDICATOR_REPLACEMENTS.get(matched_char) {
2183                None => {error!("REPLACE_INDICATORS and SWEDISH_INDICATOR_REPLACEMENTS are not in sync: missing '{matched_char}'"); ""},
2184                Some(&ch) => ch,
2185            },
2186        }
2187    });
2188
2189    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
2190    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
2191    let result = COLLAPSE_SPACES.replace_all(&result, "⠀");
2192   
2193    return result.to_string();
2194}
2195
2196#[allow(non_snake_case)]
2197fn LaTeX_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2198    lazy_static! {
2199        static ref REMOVE_SPACE: Regex =Regex::new(r" ([\^_,;)\]}])").unwrap();          // '^', '_', ',', ';', ')', ']', '}'
2200        static ref COLLAPSE_SPACES: Regex = Regex::new(r" +").unwrap();
2201    }
2202    // debug!("LaTeX_cleanup: start={}", raw_braille);
2203    let result = raw_braille.replace('𝐖', " ");
2204    // let result = COLLAPSE_SPACES.replace_all(&raw_braille, "⠀");
2205    let result = COLLAPSE_SPACES.replace_all(&result, " ");
2206    // debug!("After collapse: {}", &result);
2207    let result = REMOVE_SPACE.replace_all(&result, "$1");
2208    // debug!("After remove: {}", &result);
2209    // let result = result.trim_matches('⠀');
2210    let result = result.trim_matches(' ');
2211   
2212    return result.to_string();
2213}
2214
2215#[allow(non_snake_case)]
2216fn ASCIIMath_cleanup(_pref_manager: Ref<PreferenceManager>, raw_braille: String) -> String {
2217    lazy_static! {
2218        static ref REMOVE_SPACE_BEFORE_OP: Regex = Regex::new(r#"([\w\d]) +([^\w\d"]|[\^_,;)\]}])"#).unwrap();
2219        static ref REMOVE_SPACE_AFTER_OP: Regex =  Regex::new(r#"([^\^_,;)\]}\w\d"]) +([\w\d])"#).unwrap();
2220        static ref COLLAPSE_SPACES: Regex = Regex::new(r" +").unwrap();
2221    }
2222    // debug!("ASCIIMath_cleanup: start={}", raw_braille);
2223    let result  = raw_braille.replace("|𝐖__|", "|𝐰__|");    // protect the whitespace to prevent misinterpretation as lfloor
2224    let result = result.replace('𝐖', " ");
2225    let result = COLLAPSE_SPACES.replace_all(&result, " ");
2226    // debug!("After collapse: {}", &result);
2227    let result = REMOVE_SPACE_BEFORE_OP.replace_all(&result, "$1$2");
2228    let result = REMOVE_SPACE_AFTER_OP.replace_all(&result, "$1$2");
2229    let result = result.replace('𝐰', " ");     // spaces around relational operators
2230    let result = COLLAPSE_SPACES.replace_all(&result, " ");
2231    // debug!("After remove: {}", &result);
2232    // let result = result.trim_matches('⠀');
2233    let result = result.trim_matches(' ');
2234   
2235    return result.to_string();
2236}
2237
2238
2239/************** Braille xpath functionality ***************/
2240use crate::canonicalize::{name, as_element, as_text};
2241use crate::xpath_functions::{is_leaf, IsBracketed, validate_one_node};
2242use sxd_document::dom::ParentOfChild;
2243use sxd_xpath::{Value, context, nodeset::*};
2244use sxd_xpath::function::{Function, Args};
2245use sxd_xpath::function::Error as XPathError;
2246use std::result::Result as StdResult;
2247
2248pub struct NemethNestingChars;
2249const NEMETH_FRAC_LEVEL: &str = "data-nemeth-frac-level";    // name of attr where value is cached
2250const FIRST_CHILD_ONLY: &[&str] = &["mroot", "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"];
2251impl NemethNestingChars {
2252    // returns a 'repeat_char' corresponding to the Nemeth rules for nesting
2253    // note: this value is likely one char too long because the starting fraction is counted
2254    fn nemeth_frac_value(node: Element, repeat_char: &str) -> String {
2255        let children = node.children();
2256        let name = name(node);
2257        if is_leaf(node) {
2258            return "".to_string();
2259        } else if name == "mfrac" {
2260            // have we already computed the value?
2261            if let Some(value) = node.attribute_value(NEMETH_FRAC_LEVEL) {
2262                return value.to_string();
2263            }
2264
2265            let num_value = NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char);
2266            let denom_value = NemethNestingChars::nemeth_frac_value(as_element(children[1]), repeat_char);
2267            let mut max_value = if num_value.len() > denom_value.len() {num_value} else {denom_value};
2268            max_value += repeat_char;
2269            node.set_attribute_value(NEMETH_FRAC_LEVEL, &max_value);
2270            return max_value;
2271        } else if FIRST_CHILD_ONLY.contains(&name) {
2272            // only look at the base -- ignore scripts/index
2273            return NemethNestingChars::nemeth_frac_value(as_element(children[0]), repeat_char);
2274        } else {
2275            let mut result = "".to_string();
2276            for child in children {
2277                let value = NemethNestingChars::nemeth_frac_value(as_element(child), repeat_char);
2278                if value.len() > result.len() {
2279                    result = value;
2280                }
2281            }
2282            return result;
2283        }
2284    }
2285
2286    fn nemeth_root_value(node: Element, repeat_char: &str) -> StdResult<String, XPathError> {
2287        // returns the correct number of repeat_chars to use
2288        // note: because the highest count is toward the leaves and
2289        //    because this is a loop and not recursive, caching doesn't work without a lot of overhead
2290        let parent = node.parent().unwrap();
2291        if let ParentOfChild::Element(e) =  parent {
2292            let mut parent = e;
2293            let mut result = "".to_string();
2294            loop {
2295                let name = name(parent);
2296                if name == "math" {
2297                    return Ok( result );
2298                }
2299                if name == "msqrt" || name == "mroot" {
2300                    result += repeat_char;
2301                }
2302                let parent_of_child = parent.parent().unwrap();
2303                if let ParentOfChild::Element(e) =  parent_of_child {
2304                    parent = e;
2305                } else {
2306                    return Err( sxd_xpath::function::Error::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
2307                }
2308            }
2309        }
2310        return Err( XPathError::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
2311    }
2312}
2313
2314impl Function for NemethNestingChars {
2315/**
2316 * Returns a string with the correct number of nesting chars (could be an empty string)
2317 * @param(node) -- current node
2318 * @param(char) -- char (string) that should be repeated
2319 * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
2320 */
2321 fn evaluate<'d>(&self,
2322                        _context: &context::Evaluation<'_, 'd>,
2323                        args: Vec<Value<'d>>)
2324                        -> StdResult<Value<'d>, XPathError>
2325    {
2326        let mut args = Args(args);
2327        args.exactly(2)?;
2328        let repeat_char = args.pop_string()?;
2329        let node = crate::xpath_functions::validate_one_node(args.pop_nodeset()?, "NestingChars")?;
2330        if let Node::Element(el) = node {
2331            let name = name(el);
2332            // it is likely a bug to call this one a non mfrac
2333            if name == "mfrac" {
2334                // because it is called on itself, the fraction is counted one too many times -- chop one off
2335                // this is slightly messy because we are chopping off a char, not a byte
2336                const BRAILLE_BYTE_LEN: usize = "⠹".len();      // all Unicode braille symbols have the same number of bytes
2337                return Ok( Value::String( NemethNestingChars::nemeth_frac_value(el, &repeat_char)[BRAILLE_BYTE_LEN..].to_string() ) );
2338            } else if name == "msqrt" || name == "mroot" {
2339                return Ok( Value::String( NemethNestingChars::nemeth_root_value(el, &repeat_char)? ) );
2340            } else {
2341                panic!("NestingChars chars should be used only on 'mfrac'. '{}' was passed in", name);
2342            }
2343        } else {
2344            // not an element, so nothing to do
2345            return Ok( Value::String("".to_string()) );
2346        }
2347    }
2348}
2349
2350pub struct BrailleChars;
2351impl BrailleChars {
2352    // returns a string for the chars in the *leaf* node.
2353    // this string follows the Nemeth rules typefaces and deals with mathvariant
2354    //  which has partially turned chars to the alphanumeric block
2355    fn get_braille_chars(node: Element, code: &str, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> {
2356        let result = match code {
2357            "Nemeth" => BrailleChars::get_braille_nemeth_chars(node, text_range),
2358            "UEB" => BrailleChars:: get_braille_ueb_chars(node, text_range),
2359            "CMU" => BrailleChars:: get_braille_cmu_chars(node, text_range),
2360            "Vietnam" => BrailleChars:: get_braille_vietnam_chars(node, text_range),
2361            "Swedish" => BrailleChars:: get_braille_ueb_chars(node, text_range),    // FIX: need to figure out what to implement
2362            "Finnish" => BrailleChars:: get_braille_ueb_chars(node, text_range),    // FIX: need to figure out what to implement
2363            _ => return Err(sxd_xpath::function::Error::Other(format!("get_braille_chars: unknown braille code '{code}'")))
2364        };
2365        return match result {
2366            Ok(string) => Ok(make_quoted_string(string)),
2367            Err(err) => return Err(sxd_xpath::function::Error::Other(err.to_string())),
2368        }
2369    }
2370
2371    fn get_braille_nemeth_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2372        lazy_static! {
2373            // To greatly simplify typeface/language generation, the chars have unique ASCII chars for them:
2374            // Typeface: S: sans-serif, B: bold, 𝔹: blackboard, T: script, I: italic, R: Roman
2375            // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
2376            // Indicators: C: capital, L: letter, N: number, P: punctuation, M: multipurpose
2377            static ref PICK_APART_CHAR: Regex = 
2378                Regex::new(r"(?P<face>[SB𝔹TIR]*)(?P<lang>[EDGVHU]?)(?P<cap>C?)(?P<letter>L?)(?P<num>[N]?)(?P<char>.)").unwrap();
2379        }
2380        let math_variant = node.attribute_value("mathvariant");
2381        // FIX: cover all the options -- use phf::Map
2382        let  attr_typeface = match math_variant {
2383            None => "R",
2384            Some(variant) => match variant {
2385                "bold" => "B",
2386                "italic" => "I",
2387                "double-struck" => "𝔹",
2388                "script" => "T",
2389                "fraktur" => "D",
2390                "sans-serif" => "S",
2391                _ => "R",       // normal and unknown
2392            },
2393        };
2394        let text = BrailleChars::substring(as_text(node), &text_range);
2395        let braille_chars = braille_replace_chars(&text, node)?;
2396        // debug!("Nemeth chars: text='{}', braille_chars='{}'", &text, &braille_chars);
2397        
2398        // we want to pull the prefix (typeface, language) out to the front until a change happens
2399        // the same is true for number indicator
2400        // also true (sort of) for capitalization -- if all caps, use double cap in front (assume abbr or Roman Numeral)
2401        
2402        // we only care about this for numbers and identifiers/text, so we filter for only those
2403        let node_name = name(node);
2404        let is_in_enclosed_list = node_name != "mo" && BrailleChars::is_in_enclosed_list(node);
2405        let is_mn_in_enclosed_list = is_in_enclosed_list && node_name == "mn";
2406        let mut typeface = "R".to_string();     // assumption is "R" and if attr or letter is different, something happens
2407        let mut is_all_caps = true;
2408        let mut is_all_caps_valid = false;      // all_caps only valid if we did a replacement
2409        let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2410            // debug!("  face: {:?}, lang: {:?}, num {:?}, letter: {:?}, cap: {:?}, char: {:?}",
2411            //        &caps["face"], &caps["lang"], &caps["num"], &caps["letter"], &caps["cap"], &caps["char"]);
2412            let mut nemeth_chars = "".to_string();
2413            let char_face = if caps["face"].is_empty() {attr_typeface} else {&caps["face"]};
2414            let typeface_changed =  typeface != char_face;
2415            if typeface_changed {
2416                typeface = char_face.to_string();   // needs to outlast this instance of the loop
2417                nemeth_chars += &typeface;
2418                nemeth_chars +=  &caps["lang"];
2419            } else {
2420                nemeth_chars +=  &caps["lang"];
2421            }
2422            // debug!("  typeface changed: {}, is_in_list: {}; num: {}", typeface_changed, is_in_enclosed_list, !caps["num"].is_empty());
2423            if !caps["num"].is_empty() && (typeface_changed || !is_mn_in_enclosed_list) {
2424                nemeth_chars += "N";
2425            }
2426            is_all_caps_valid = true;
2427            is_all_caps &= !&caps["cap"].is_empty();
2428            nemeth_chars += &caps["cap"];       // will be stripped later if all caps
2429            if is_in_enclosed_list {
2430                nemeth_chars += &caps["letter"].replace('L', "l");
2431            } else {
2432                nemeth_chars += &caps["letter"];
2433            }
2434            nemeth_chars += &caps["char"];
2435            return nemeth_chars;
2436        });
2437        // debug!("  result: {}", &result);
2438        let mut text_chars = text.chars();     // see if more than one char
2439        if is_all_caps_valid && is_all_caps && text_chars.next().is_some() &&  text_chars.next().is_some() {
2440            return Ok( "CC".to_string() + &result.replace('C', ""));
2441        } else {
2442            return Ok( result.to_string() );
2443        }
2444    }
2445
2446    fn get_braille_ueb_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2447        // Because in UEB typeforms and caps may extend for multiple tokens,
2448        //   this routine merely deals with the mathvariant attr.
2449        // Canonicalize has already transformed all chars it can to math alphanumerics, but not all have bold/italic 
2450        // The typeform/caps transforms to (potentially) word mode are handled later.
2451        lazy_static! {
2452            static ref HAS_TYPEFACE: Regex = Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap();
2453            static ref PICK_APART_CHAR: Regex = 
2454                 Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap();
2455        }
2456    
2457        let math_variant = node.attribute_value("mathvariant");
2458        let text = BrailleChars::substring(as_text(node), &text_range);
2459        let mut braille_chars = braille_replace_chars(&text, node)?;
2460
2461        // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars);
2462        if math_variant.is_none() {         // nothing we need to do
2463            return Ok(braille_chars);
2464        }
2465        // mathvariant could be "sans-serif-bold-italic" -- get the parts
2466        let math_variant = math_variant.unwrap();
2467        let italic = math_variant.contains("italic");
2468        if italic & !braille_chars.contains('I') {
2469            braille_chars = "I".to_string() + &braille_chars;
2470        }
2471        let bold = math_variant.contains("bold");
2472        if bold & !braille_chars.contains('B') {
2473            braille_chars = "B".to_string() + &braille_chars;
2474        }
2475        let typeface = match HAS_TYPEFACE.find(math_variant) {
2476            None => "",
2477            Some(m) => match m.as_str() {
2478                "double-struck" => "𝔹",
2479                "script" => "T",
2480                "fraktur" => "D",
2481                "sans-serif" => "S",
2482                //  don't consider monospace as a typeform
2483                _ => "",
2484            },
2485        };
2486        let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2487            // debug!("captures: {:?}", caps);
2488            // debug!("  bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}",
2489            //        &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]);
2490            if bold || !caps["bold"].is_empty() {"B"} else {""}.to_string()
2491                + if italic || !caps["italic"].is_empty() {"I"} else {""}
2492                + if !&caps["face"].is_empty() {&caps["face"]} else {typeface}
2493                + &caps["cap"]
2494                + &caps["greek"]
2495                + &caps["char"]
2496        });
2497        // debug!("get_braille_ueb_chars: '{}'", &result);
2498        return Ok(result.to_string())
2499    }
2500
2501    fn get_braille_cmu_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2502        // In CMU, we need to replace spaces used for number blocks with "."
2503        // For other numbers, we need to add "." to create digit blocks
2504
2505        lazy_static! {
2506            static ref HAS_TYPEFACE: Regex = Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap();
2507            static ref PICK_APART_CHAR: Regex = 
2508                 Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap();
2509        }
2510    
2511        let math_variant = node.attribute_value("mathvariant");
2512        let text = BrailleChars::substring(as_text(node), &text_range);
2513        let text = add_separator(text);
2514
2515        let braille_chars = braille_replace_chars(&text, node)?;
2516
2517        // debug!("get_braille_ueb_chars: before/after unicode.yaml: '{}'/'{}'", text, braille_chars);
2518        if math_variant.is_none() {         // nothing we need to do
2519            return Ok(braille_chars);
2520        }
2521        // mathvariant could be "sans-serif-bold-italic" -- get the parts
2522        let math_variant = math_variant.unwrap();
2523        let bold = math_variant.contains("bold");
2524        let italic = math_variant.contains("italic");
2525        let typeface = match HAS_TYPEFACE.find(math_variant) {
2526            None => "",
2527            Some(m) => match m.as_str() {
2528                "double-struck" => "𝔹",
2529                "script" => "T",
2530                "fraktur" => "D",
2531                "sans-serif" => "S",
2532                //  don't consider monospace as a typeform
2533                _ => "",
2534            },
2535        };
2536        let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
2537            // debug!("captures: {:?}", caps);
2538            // debug!("  bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}",
2539            //        &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]);
2540            if bold || !caps["bold"].is_empty() {"B"} else {""}.to_string()
2541                + if italic || !caps["italic"].is_empty() {"I"} else {""}
2542                + if !&caps["face"].is_empty() {&caps["face"]} else {typeface}
2543                + &caps["cap"]
2544                + &caps["greek"]
2545                + &caps["char"]
2546        });
2547        return Ok(result.to_string());
2548
2549        fn add_separator(text: String) -> String {
2550            use crate::definitions::BRAILLE_DEFINITIONS;
2551            if let Some(text_without_arc) = text.strip_prefix("arc") {
2552                // "." after arc (7.5.3)
2553                let is_function_name = BRAILLE_DEFINITIONS.with(|definitions| {
2554                    let definitions = definitions.borrow();
2555                    let set = definitions.get_hashset("CMUFunctionNames").unwrap();
2556                    return set.contains(&text);
2557                });
2558                if is_function_name {
2559                    return "arc.".to_string() + text_without_arc;
2560                }
2561            } 
2562            return text;
2563        }
2564    }
2565
2566    fn get_braille_vietnam_chars(node: Element, text_range: Option<Range<usize>>) -> Result<String> {
2567        // this is basically the same as for ueb except:
2568        // 1. we deal with switching '.' and ',' if in English style for numbers
2569        // 2. if it is identified as a Roman Numeral, we make all but the first char lower case because they shouldn't get a cap indicator
2570        // 3. double letter chemical elements should NOT be part of a cap word sequence
2571        if name(node) == "mn" {
2572            // text of element is modified by these if needed
2573            lower_case_roman_numerals(node);
2574            switch_if_english_style_number(node);
2575        }
2576        let result = BrailleChars::get_braille_ueb_chars(node, text_range)?;
2577        return Ok(result);
2578
2579        fn lower_case_roman_numerals(mn_node: Element) {
2580            if mn_node.attribute("data-roman-numeral").is_some() {
2581                // if a roman numeral, all ASCII so we can optimize
2582                let text = as_text(mn_node);
2583                let mut new_text = String::from(&text[..1]);
2584                new_text.push_str(text[1..].to_ascii_lowercase().as_str());    // works for single char too
2585                mn_node.set_text(&new_text);
2586            }
2587        }
2588        fn switch_if_english_style_number(mn_node: Element) {
2589            let text = as_text(mn_node);
2590            let dot = text.find('.');
2591            let comma = text.find(',');
2592            match (dot, comma) {
2593                (None, None) => (),
2594                (Some(dot), Some(comma)) => {
2595                    if comma < dot {
2596                        // switch dot/comma -- using "\x01" as a temp when switching the the two chars
2597                        let switched = text.replace('.', "\x01").replace(',', ".").replace('\x01', ",");
2598                        mn_node.set_text(&switched);
2599                    }
2600                },
2601                (Some(dot), None) => {
2602                    // If it starts with a '.', a leading 0, or if there is only one '.' and not three chars after it
2603                    if dot==0 ||
2604                       (dot==1 && text.starts_with('0')) ||
2605                       (text[dot+1..].find('.').is_none() && text[dot+1..].len()!=3) {
2606                        mn_node.set_text(&text.replace('.', ","));
2607                    }
2608                },
2609                (None, Some(comma)) => {
2610                    // if there is more than one ",", than it can't be a decimal separator
2611                    if text[comma+1..].find(',').is_some() {
2612                        mn_node.set_text(&text.replace(',', "."));
2613                    }
2614                },
2615            }
2616        }
2617
2618    }
2619
2620
2621    fn is_in_enclosed_list(node: Element) -> bool {
2622        // Nemeth Rule 10 defines an enclosed list:
2623        // 1: begins and ends with fence
2624        // 2: FIX: not implemented -- must contain no word, abbreviation, ordinal or plural ending
2625        // 3: function names or signs of shape and the signs which follow them are a single item (not a word)
2626        // 4: an item of the list may be an ellipsis or any sign used for omission
2627        // 5: no relational operator may appear within the list
2628        // 6: the list must have at least 2 items.
2629        //       Items are separated by commas, can not have other punctuation (except ellipsis and dash)
2630        let mut parent = get_parent(node); // safe since 'math' is always at root
2631        while name(parent) == "mrow" {
2632            if IsBracketed::is_bracketed(parent, "", "", true, false) {
2633                for child in parent.children() {
2634                    if !child_meets_conditions(as_element(child)) {
2635                        return false;
2636                    }
2637                }
2638                return true;
2639            }
2640            parent = get_parent(parent);
2641        }
2642        return false;
2643
2644        fn child_meets_conditions(node: Element) -> bool {
2645            let name = name(node);
2646            return match name {
2647                "mi" | "mn" => true,
2648                "mo"  => !crate::canonicalize::is_relational_op(node),
2649                "mtext" => {
2650                    let text = as_text(node).trim();
2651                    return text=="?" || text=="-?-" || text.is_empty();   // various forms of "fill in missing content" (see also Nemeth_RULEs.yaml, "omissions")
2652                },
2653                "mrow" => {
2654                    if IsBracketed::is_bracketed(node, "", "", false, false) {
2655                        return child_meets_conditions(as_element(node.children()[1]));
2656                    } else {
2657                        for child in node.children() {
2658                            if !child_meets_conditions(as_element(child)) {
2659                                return false;
2660                            }
2661                        }
2662                    }  
2663                    true      
2664                },
2665                "menclose" => {
2666                    if let Some(notation) = node.attribute_value("notation") {
2667                        if notation != "bottom" || notation != "box" {
2668                            return false;
2669                        }
2670                        let child = as_element(node.children()[0]);     // menclose has exactly one child
2671                        return is_leaf(child) && as_text(child) == "?";
2672                    }
2673                    return false;
2674                },
2675                _ => {
2676                    for child in node.children() {
2677                        if !child_meets_conditions(as_element(child)) {
2678                            return false;
2679                        }
2680                    }
2681                    true
2682                },
2683            }
2684        }
2685    }
2686
2687    /// Extract the `char`s from `str` within `range` (these are chars, not byte offsets)
2688    fn substring(str: &str, text_range: &Option<Range<usize>>) -> String {
2689        return match text_range {
2690            None => str.to_string(),
2691            Some(range) => str.chars().skip(range.start).take(range.end - range.start).collect(),
2692        }
2693    }
2694}
2695
2696impl Function for BrailleChars {
2697    /**
2698     * Returns a string with the correct number of nesting chars (could be an empty string)
2699     * @param(node) -- current node or string
2700     * @param(char) -- char (string) that should be repeated
2701     * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
2702     */
2703    fn evaluate<'d>(&self,
2704                        context: &context::Evaluation<'_, 'd>,
2705                        args: Vec<Value<'d>>)
2706                        -> StdResult<Value<'d>, XPathError>
2707    {
2708        use crate::canonicalize::create_mathml_element;
2709        let mut args = Args(args);
2710        if let Err(e) = args.exactly(2).or_else(|_| args.exactly(4)) {
2711            return Err( XPathError::Other(format!("BrailleChars requires 2 or 4 args: {e}")));
2712        };
2713
2714        let range = if args.len() == 4 {
2715            let end = args.pop_number()? as usize - 1;      // non-inclusive at end, 0-based
2716            let start = args.pop_number()? as usize - 1;    // inclusive at start, a 0-based
2717            Some(start..end)
2718        } else {
2719            None
2720        };
2721        let braille_code = args.pop_string()?;
2722        let v: Value<'_> = args.0.pop().ok_or(XPathError::ArgumentMissing)?;
2723        let node = match v {
2724            Value::Nodeset(nodes) => {
2725                validate_one_node(nodes, "BrailleChars")?.element().unwrap()
2726            },
2727            Value::Number(n) => {
2728                let new_node = create_mathml_element(&context.node.document(), "mn");
2729                new_node.set_text(&n.to_string());
2730                new_node
2731            },
2732            Value::String(s) => {
2733                let new_node = create_mathml_element(&context.node.document(), "mi");   // FIX: try to guess mi vs mo???
2734                new_node.set_text(&s);
2735                new_node
2736            },
2737            _ => {
2738                return Ok( Value::String("".to_string()) ) // not an element, so nothing to do
2739            },
2740        };
2741
2742        if !is_leaf(node) {
2743            return Err( XPathError::Other(format!("BrailleChars called on non-leaf element '{}'", mml_to_string(node))) );
2744        }
2745        return Ok( Value::String( BrailleChars::get_braille_chars(node, &braille_code, range)? ) );
2746    }
2747}
2748
2749pub struct NeedsToBeGrouped;
2750impl NeedsToBeGrouped {
2751    // ordinals often have an irregular start (e.g., "half") before becoming regular.
2752    // if the number is irregular, return the ordinal form, otherwise return 'None'.
2753    fn needs_grouping_for_cmu(element: Element, _is_base: bool) -> bool {
2754        let node_name = name(element);
2755        let children = element.children();
2756        if node_name == "mrow" {
2757            // check for bracketed exprs
2758            if IsBracketed::is_bracketed(element, "", "", false, true) {
2759                return false;
2760            }
2761
2762            // check for prefix and postfix ops at start or end (=> len()==2, prefix is first op, postfix is last op)
2763            if children.len() == 2 &&
2764                (name(as_element(children[0])) == "mo" || name(as_element(children[1])) == "mo") {
2765                return false;
2766            }
2767
2768            if children.len() != 3 {  // ==3, need to check if it a linear fraction
2769                return true;
2770            }
2771            let operator = as_element(children[1]);
2772            if name(operator) != "mo" || as_text(operator) != "/" {
2773                return true;
2774            }
2775        }
2776
2777        if !(node_name == "mrow" || node_name == "mfrac") {
2778            return false;
2779        }
2780        // check for numeric fractions (regular fractions need brackets, not numeric fractions), either as an mfrac or with "/"
2781        // if the fraction starts with a "-", it is still a numeric fraction that doesn't need parens
2782        let mut numerator = as_element(children[0]);
2783        let denominator = as_element(children[children.len()-1]);
2784        let decimal_separator = crate::interface::get_preference("DecimalSeparators".to_string()).unwrap()
2785                                                        .chars().next().unwrap_or('.');
2786        if is_integer(denominator, decimal_separator) {
2787            // check numerator being either an integer "- integer"
2788            if name(numerator) == "mrow" {
2789                let numerator_children = numerator.children();
2790                if !(numerator_children.len() == 2 &&
2791                        name(as_element(numerator_children[0])) == "mo" &&
2792                        as_text(as_element(numerator_children[0])) == "-") {
2793                    return true;
2794                }
2795                numerator = as_element(numerator_children[1]);
2796            }
2797            return !is_integer(numerator, decimal_separator);
2798        }
2799        return true;
2800
2801        fn is_integer(mathml: Element, decimal_separator: char) -> bool {
2802            return name(mathml) == "mn" && !as_text(mathml).contains(decimal_separator)
2803        }
2804    }
2805
2806    /// FIX: what needs to be implemented?
2807    fn needs_grouping_for_finnish(mathml: Element, is_base: bool) -> bool {
2808        use crate::xpath_functions::IsInDefinition;
2809        let mut node_name = name(mathml);
2810        if mathml.attribute_value("data-roman-numeral").is_some() {
2811            node_name = "mi";           // roman numerals don't follow number rules
2812        }
2813
2814        // FIX: the leaf rules are from UEB -- check the Swedish rules
2815        match node_name {
2816            "mn" => {   
2817                if !is_base {
2818                    return false;
2819                }                                                                                        // clause 1
2820                // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204)
2821                let parent = get_parent(mathml);   // there is always a "math" node
2822                let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)};
2823                if name(grandparent) != "mrow" {
2824                    return false;
2825                }
2826                let preceding = parent.preceding_siblings();
2827                if preceding.len()  < 2 {
2828                    return false;
2829                }
2830                // any 'mn' would be separated from this node by invisible times
2831                let previous_child = as_element(preceding[preceding.len()-1]);
2832                if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" {
2833                    let previous_child = as_element(preceding[preceding.len()-2]);
2834                    return name(previous_child) == "mn"
2835                } else {
2836                    return false;
2837                }
2838            },
2839            "mi" | "mo" | "mtext" => {
2840                let text = as_text(mathml);
2841                let parent = get_parent(mathml);   // there is always a "math" node
2842                let parent_name = name(parent);   // there is always a "math" node
2843                if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
2844                    return false;
2845                }
2846                let mut chars = text.chars();
2847                let first_char = chars.next().unwrap();             // canonicalization assures it isn't empty;
2848                let is_one_char = chars.next().is_none();
2849                // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
2850                return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) ||                       // clause 8
2851                            // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
2852                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
2853                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() ||          // clause 4
2854                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap());   // clause 5
2855            },
2856            "mrow" => {
2857                // check for bracketed exprs
2858                if IsBracketed::is_bracketed(mathml, "", "", false, true) {
2859                    return false;
2860                }
2861
2862                let parent = get_parent(mathml); // safe since 'math' is always at root
2863                if name(parent) == "mfrac" {
2864                    let children = mathml.children();
2865                    if mathml.preceding_siblings().is_empty() {
2866                        // numerator: check for multiplication -- doesn't need grouping in numerator
2867                        if children.len() >= 3 {
2868                            let operator = as_element(children[1]);
2869                            if name(operator) == "mo" {
2870                                let ch = as_text(operator);
2871                                if ch == "\u{2062}" || ch == "⋅" || ch == "×"  {
2872                                    return false;
2873                                }
2874                            }
2875                        }
2876                        return true;
2877                    } else {
2878                        // denominator
2879                        return true;
2880                    }
2881
2882                }
2883                // check for prefix at start
2884                // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops
2885                let children = mathml.children();
2886                if children.len() == 2 &&
2887                    (name(as_element(children[0])) == "mo") {
2888                    return false;
2889                }
2890                return true;
2891            },
2892            _ => return false,
2893        }
2894    }
2895
2896    // ordinals often have an irregular start (e.g., "half") before becoming regular.
2897    // if the number is irregular, return the ordinal form, otherwise return 'None'.
2898    fn needs_grouping_for_swedish(mathml: Element, is_base: bool) -> bool {
2899        use crate::xpath_functions::IsInDefinition;
2900        let mut node_name = name(mathml);
2901        if mathml.attribute_value("data-roman-numeral").is_some() {
2902            node_name = "mi";           // roman numerals don't follow number rules
2903        }
2904
2905        match node_name {
2906            "mn" => return false,
2907            "mi" | "mo" | "mtext" => {
2908                let text = as_text(mathml);
2909                let parent = get_parent(mathml);   // there is always a "math" node
2910                let parent_name = name(parent);   // there is always a "math" node
2911                if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
2912                    return false;
2913                }
2914                let mut chars = text.chars();
2915                let first_char = chars.next().unwrap();             // canonicalization assures it isn't empty;
2916                let is_one_char = chars.next().is_none();
2917                // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
2918                return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) ||                       // clause 8
2919                            // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
2920                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
2921                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() ||          // clause 4
2922                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap());   // clause 5
2923            },
2924            "mrow" => {
2925                // check for bracketed exprs
2926                if IsBracketed::is_bracketed(mathml, "", "", false, true) {
2927                    return false;
2928                }
2929
2930                // check for prefix at start
2931                // example 7.12 has "2-" in superscript and is grouped, so we don't consider postfix ops
2932                let children = mathml.children();
2933                if children.len() == 2 &&
2934                    (name(as_element(children[0])) == "mo") {
2935                    return false;
2936                }
2937                return true;
2938            },
2939            "mfrac" => {
2940                // exclude simple fractions -- they are not bracketed with start/end marks
2941                let children = mathml.children();
2942                return !(NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true) ||
2943                         NeedsToBeGrouped::needs_grouping_for_swedish(as_element(children[0]), true));
2944            },
2945            // At least for msup (Ex 7.7, and 7.32 and maybe more), spec seems to feel grouping is not needed.
2946            // "msub" | "msup" | "msubsup" | "munder" | "mover" | "munderover" => return true,
2947            "mtable" => return true,    // Fix: should check for trivial cases that don't need grouping
2948            _ => return false,
2949        }
2950    }
2951
2952    /// Returns true if the element needs grouping symbols
2953    /// Bases need extra attention because if they are a number and the item to the left is one, that needs distinguishing
2954    fn needs_grouping_for_ueb(mathml: Element, is_base: bool) -> bool {
2955        // From GTM 7.1
2956        // 1. An entire number, i.e. the initiating numeric symbol and all succeeding symbols within the numeric mode thus
2957        //     established (which would include any interior decimal points, commas, separator spaces, or simple numeric fraction lines).
2958        // 2. An entire general fraction, enclosed in fraction indicators.
2959        // 3. An entire radical expression, enclosed in radical indicators.
2960        // 4. An arrow.
2961        // 5. An arbitrary shape.
2962        // 6. Any expression enclosed in matching pairs of round parentheses, square brackets or curly braces.
2963        // 7. Any expression enclosed in the braille grouping indicators.   [Note: not possible here]
2964        // 8. If none of the foregoing apply, the item is simply the [this element's] individual symbol.
2965
2966        use crate::xpath_functions::IsInDefinition;
2967        let mut node_name = name(mathml);
2968        if mathml.attribute_value("data-roman-numeral").is_some() {
2969            node_name = "mi";           // roman numerals don't follow number rules
2970        }
2971        match node_name {
2972            "mn" => {   
2973                if !is_base {
2974                    return false;
2975                }                                                                                        // clause 1
2976                // two 'mn's can be adjacent, in which case we need to group the 'mn' to make it clear it is separate (see bug #204)
2977                let parent = get_parent(mathml);   // there is always a "math" node
2978                let grandparent = if name(parent) == "math" {parent} else {get_parent(parent)};
2979                if name(grandparent) != "mrow" {
2980                    return false;
2981                }
2982                let preceding = parent.preceding_siblings();
2983                if preceding.len()  < 2 {
2984                    return false;
2985                }
2986                // any 'mn' would be separated from this node by invisible times
2987                let previous_child = as_element(preceding[preceding.len()-1]);
2988                if name(previous_child) == "mo" && as_text(previous_child) == "\u{2062}" {
2989                    let previous_child = as_element(preceding[preceding.len()-2]);
2990                    return name(previous_child) == "mn"
2991                } else {
2992                    return false;
2993                }
2994            },
2995            "mi" | "mo" | "mtext" => {
2996                let text = as_text(mathml);
2997                let parent = get_parent(mathml);   // there is always a "math" node
2998                let parent_name = name(parent);   // there is always a "math" node
2999                if is_base && (parent_name == "msub" || parent_name == "msup" || parent_name == "msubsup") && !text.contains([' ', '\u{00A0}']) {
3000                    return false;
3001                }
3002                let mut chars = text.chars();
3003                let first_char = chars.next().unwrap();             // canonicalization assures it isn't empty;
3004                let is_one_char = chars.next().is_none();
3005                // '¨', etc., brailles as two chars -- there probably is some exception list but I haven't found it -- these are the ones I know about
3006                return !((is_one_char && !['¨', '″', '‴', '⁗'].contains(&first_char)) ||                       // clause 8
3007                            // "lim", "cos", etc., appear not to get parens, but the rules don't mention it (tests show it)
3008                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "FunctionNames").unwrap() ||
3009                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "Arrows").unwrap() ||          // clause 4
3010                            IsInDefinition::is_defined_in(text, &SPEECH_DEFINITIONS, "GeometryShapes").unwrap());   // clause 5
3011            },
3012            "mfrac" => return false,                                                     // clause 2 (test GTM 8.2(4) shows numeric fractions are not special)                                 
3013            "msqrt" | "mroot" => return false,                                           // clause 3
3014                    // clause 6 only mentions three grouping chars, I'm a little suspicious of that, but that's what it says
3015            "mrow" => return !(IsBracketed::is_bracketed(mathml, "(", ")", false, false) ||  
3016                                IsBracketed::is_bracketed(mathml, "[", "]", false, false) || 
3017                                IsBracketed::is_bracketed(mathml, "{", "}", false, false) ),
3018            "msub" | "msup" | "msubsup" => {
3019                // I'm a little dubious about the false value, but see GTM 7.7(2)
3020                if !is_base {
3021                    return true;
3022                } 
3023                // need to group nested scripts in base -- see GTM 12.2(2)                                         
3024                let parent = get_parent(mathml);   // there is always a "math" node
3025                let parent_name = name(parent);   // there is always a "math" node
3026                return parent_name == "munder" || parent_name == "mover" || parent_name == "munderover";
3027            },
3028            _ => return true,
3029        }
3030
3031    }
3032}
3033
3034impl Function for NeedsToBeGrouped {
3035    // convert a node to an ordinal number
3036    fn evaluate<'d>(&self,
3037                        _context: &context::Evaluation<'_, 'd>,
3038                        args: Vec<Value<'d>>)
3039                        -> StdResult<Value<'d>, XPathError>
3040    {
3041        let mut args = Args(args);
3042        args.exactly(3)?;
3043        let is_base = args.pop_boolean()?;
3044        let braille_code = args.pop_string()?;
3045        let node = validate_one_node(args.pop_nodeset()?, "NeedsToBeGrouped")?;
3046        if let Node::Element(e) = node {
3047            let answer = match braille_code.as_str() {
3048                "CMU" => NeedsToBeGrouped::needs_grouping_for_cmu(e, is_base),
3049                "UEB" => NeedsToBeGrouped::needs_grouping_for_ueb(e, is_base),
3050                "Finnish" => NeedsToBeGrouped::needs_grouping_for_finnish(e, is_base),
3051                "Swedish" => NeedsToBeGrouped::needs_grouping_for_swedish(e, is_base),
3052                _ => return Err(XPathError::Other(format!("NeedsToBeGrouped: braille code arg '{braille_code:?}' is not a known code ('UEB', 'CMU', or 'Swedish')"))),
3053            };
3054            return Ok( Value::Boolean( answer ) );
3055        }
3056
3057        return Err(XPathError::Other(format!("NeedsToBeGrouped: first arg '{node:?}' is not a node")));
3058    }
3059}
3060    
3061    
3062    
#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use crate::init_logger;
    use crate::interface::*;

    /// Probes every braille cell position of `braille` and asserts that it maps back to the MathML node whose
    /// id is "id-N", where N is the entry of `expected_id_numbers` at that position.
    /// `code_name` is only used to label a failing assertion.
    fn assert_positions_map_to_ids(code_name: &str, braille: &str, expected_id_numbers: &[i32]) -> Result<()> {
        use std::time::Instant;
        let expected_ids: Vec<String> = expected_id_numbers.iter().map(|num| format!("id-{}", num)).collect();
        for i in 0..braille.chars().count() {
            debug!("\n===  i={}  ===", i);
            let timer = Instant::now();
            // fully qualified on purpose: both glob imports above are in scope here
            let (id, _offset) = crate::interface::get_navigation_node_from_braille_position(i)?;
            N_PROBES.with(|n| {debug!("test {:2} #probes = {}", i, n.borrow())});
            debug!("Time taken: {}ms", timer.elapsed().as_millis());
            assert_eq!(expected_ids[i], id, "\n{} test ith position={}", code_name, i);
        }
        return Ok( () );
    }

    /// Highlighting a navigation node in UEB: checks both the highlighted braille and the reported span (issue 24)
    #[test]
    fn ueb_highlight_24() -> Result<()> {
        let expr = "<math display='block' id='id-0'>
            <mrow id='id-1'>
                <mn id='id-2'>4</mn>
                <mo id='id-3'>&#x2062;</mo>
                <mi id='id-4'>a</mi>
                <mo id='id-5'>&#x2062;</mo>
                <mi id='id-6'>c</mi>
            </mrow>
        </math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(expr)).unwrap();
        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();
        set_preference(String::from("BrailleNavHighlight"), String::from("All")).unwrap();

        // (node to highlight, expected braille with the highlight, expected highlight span)
        let cases = [
            ("id-2", "⣼⣙⠰⠁⠉", (0,2)),
            ("id-4", "⠼⠙⣰⣁⠉", (2,4)),
        ];
        for (nav_id, highlighted, span) in cases {
            let braille = get_braille(String::from(nav_id))?;
            assert_eq!(highlighted, braille);
            set_navigation_node(String::from(nav_id), 0)?;
            assert_eq!( get_braille_position()?, span);
        }
        return Ok( () );
    }

    /// Maps each braille cell of the quadratic formula back to a MathML id for Nemeth, UEB, and CMU.
    // This test probably should be repeated for each braille code and be taken out of here
    #[test]
    fn find_mathml_from_braille() -> Result<()> {
        let expr = "<math id='id-0'>
        <mrow data-changed='added' id='id-1'>
          <mi id='id-2'>x</mi>
          <mo id='id-3'>=</mo>
          <mfrac id='id-4'>
            <mrow id='id-5'>
              <mrow data-changed='added' id='id-6'>
                <mo id='id-7'>-</mo>
                <mi id='id-8'>b</mi>
              </mrow>
              <mo id='id-9'>±</mo>
              <msqrt id='id-10'>
                <mrow data-changed='added' id='id-11'>
                  <msup id='id-12'>
                    <mi id='id-13'>b</mi>
                    <mn id='id-14'>2</mn>
                  </msup>
                  <mo id='id-15'>-</mo>
                  <mrow data-changed='added' id='id-16'>
                    <mn id='id-17'>4</mn>
                    <mo data-changed='added' id='id-18'>&#x2062;</mo>
                    <mi id='id-19'>a</mi>
                    <mo data-changed='added' id='id-20'>&#x2062;</mo>
                    <mi id='id-21'>c</mi>
                  </mrow>
                </mrow>
              </msqrt>
            </mrow>
            <mrow id='id-22'>
              <mn id='id-23'>2</mn>
              <mo data-changed='added' id='id-24'>&#x2062;</mo>
              <mi id='id-25'>a</mi>
            </mrow>
          </mfrac>
        </mrow>
       </math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(expr)).unwrap();
        set_preference(String::from("BrailleNavHighlight"), String::from("Off")).unwrap();

        set_preference(String::from("BrailleCode"), String::from("Nemeth")).unwrap();
        let nemeth = get_braille(String::new())?;
        debug!("\n*** Testing Nemeth ***");
        assert_positions_map_to_ids("Nemeth", &nemeth,
            &[2, 3, 3, 3, 3, 4, 7, 8, 9, 9,   10, 13, 12, 14, 12, 15, 17, 19, 21, 10,   4, 23, 25, 4])?;

        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();
        let ueb = get_braille(String::new())?;
        debug!("\n\n*** Testing UEB ***");
        assert_positions_map_to_ids("UEB", &ueb,
            &[0, 0, 0, 2, 3, 3, 3, 3, 4, 7,   7, 8, 9, 9, 10, 13, 12, 14, 14, 15,   15, 17, 17, 19, 19, 21, 10, 4, 4, 23,   23, 25, 25, 4, 0, 0])?;

        set_preference(String::from("BrailleCode"), String::from("CMU")).unwrap();
        let cmu = get_braille(String::new())?;
        debug!("\n\n*** Testing CMU ***");
        debug!("Braille: {}", cmu);
        assert_positions_map_to_ids("CMU", &cmu,
            &[2, 3, 5, 7, 8, 9, 9, 9, 10, 10,   11, 13, 12, 14, 14, 15, 17, 17, 19, 19,   21, 11, 5, 4, 22, 23, 23, 25, 25, 22,])?;
        return Ok( () );
    }

    /// The UEB_START_MODE pref changes the braille produced for a simple superscript
    #[test]
    #[allow(non_snake_case)]
    fn test_UEB_start_mode() -> Result<()> {
        let expr = "<math><msup><mi>x</mi><mi>n</mi></msup></math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(String::from(expr)).unwrap();
        set_preference(String::from("BrailleCode"), String::from("UEB")).unwrap();

        // (start mode, expected braille)
        for (start_mode, expected) in [("Grade2", "⠭⠰⠔⠝"), ("Grade1", "⠭⠔⠝")] {
            set_preference(String::from("UEB_START_MODE"), String::from(start_mode)).unwrap();
            let braille = get_braille(String::new())?;
            assert_eq!(expected, braille, "{}", start_mode);
        }
        return Ok( () );
    }
}
libmathcat/braille.rs

libmathcat/
braille.rs