mathcat 0.1.26

MathCAT: Math Capable Assistive Technology ('Speech and braille from MathML')
Documentation
#![allow(clippy::needless_return)]
use sxd_document::dom::Element;
use sxd_document::Package;
use crate::errors::*;
use regex::{Captures, Regex, RegexSet};
use phf::{phf_map, phf_set};
use crate::speech::{BRAILLE_RULES, SpeechRulesWithContext};
use std::ops::Range;

static UEB_PREFIXES: phf::Set<char> = phf_set! {
    '', '', '', '', '', '', '', '',
};


/// braille the MathML
/// If 'nav_node_id' is not an empty string, then the element with that id will have dots 7 & 8 turned on as per the pref
pub fn braille_mathml(mathml: Element, nav_node_id: String) -> Result<String> {
    crate::speech::SpeechRules::update();
    return BRAILLE_RULES.with(|rules| {
        rules.borrow_mut().read_files()?;
        let rules = rules.borrow();
        let new_package = Package::new();
        let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id);
        let braille_string = rules_with_context.match_pattern::<String>(mathml)
                        .chain_err(|| "Pattern match/replacement failure!")?;
        let braille_string = braille_string.replace(' ', "");
        let pref_manager = rules_with_context.get_rules().pref_manager.borrow();
        let highlight_style = pref_manager.get_user_prefs().to_string("BrailleNavHighlight");
        let braille_code = pref_manager.get_user_prefs().to_string("BrailleCode");
        let braille = match braille_code.as_str() {
            "UEB" => ueb_cleanup(braille_string),
            "Nemeth" => nemeth_cleanup(braille_string),
            _ => braille_string,    // probably needs cleanup if someone has another code, but this will have to get added by hand
        };

        return Ok(
            if highlight_style != "Off" {
                highlight_braille_chars(braille, &braille_code, highlight_style == "All")
            } else {
             braille
            }
        );
    });

    // highlight with dots 7 & 8 based on the highlight style
    // both the start and stop points will be extended to deal with indicators such as capitalization
    // if 'fill_range' is true, the interior will be highlighted
    fn highlight_braille_chars(braille: String, braille_code: &str, fill_range: bool) -> String {
        let mut braille = braille;
        // some special (non-braille) chars weren't converted to having dots 7 & 8 to indicate navigation position
        // they need to be added to the start

        // find start and end indexes of the highlighted region
        let start = braille.find(is_highlighted);
        let end = braille.rfind(is_highlighted);
        if start.is_none() {
            assert!(end.is_none());
            return braille;
        };

        let end = end.unwrap();         // always exists if start exists
        let start = highlight_first_indicator(&mut braille, braille_code, start.unwrap(), end);

        if start == end {
            return braille;
        }

        if !fill_range {
            return braille;
        }

        let mut result = String::with_capacity(braille.len());
        result.push_str(&braille[..start]);
        let highlight_region =&mut braille[start..end];
        for ch in highlight_region.chars() {
            result.push( highlight(ch) );
        };
        result.push_str(&braille[end..]);
        return result;

        fn highlight_first_indicator(braille: &mut String, braille_code: &str, start_index: usize, end_index: usize) -> usize {
            // chars in the braille block range use 3 bytes -- we can use that to optimize the code some
            let first_ch = unhighlight(braille[start_index..start_index+3].chars().next().unwrap());

            // need to highlight (optional) capital/number, language, and style (max 2 chars) also in that (rev) order
            let prefix_ch_index = std::cmp::max(0, start_index as isize - 5*3) as usize;
            let indicators = &braille[prefix_ch_index..start_index];   // chars to be examined
            let i_byte_start = start_index - 3 * match braille_code {
                "Nemeth" => i_start_nemeth(indicators, first_ch),
                "UEB" => i_start_ueb(indicators),
                _ => {
                    error!("highlight_first_indicator: Unknown braille code '{}'", braille);
                    0
                },
            };
            if i_byte_start < start_index {
                // remove old highlight as long as we don't wipe out the end highlight
                if start_index < end_index {
                    let old_first_char_bytes = start_index..start_index+3;
                    let replacement_str = unhighlight(braille[old_first_char_bytes.clone()].chars().next().unwrap()).to_string();
                    braille.replace_range(old_first_char_bytes, &replacement_str);
                }

                // add new highlight
                let new_first_char_bytes = i_byte_start..i_byte_start+3;
                let replacement_str = highlight(braille[new_first_char_bytes.clone()].chars().next().unwrap()).to_string();
                braille.replace_range(new_first_char_bytes, &replacement_str);
            }

            return i_byte_start;
        }

    }

    /// Given a position in a Nemeth string, what is the position character that starts it (e.g, the prev char for capital letter)
    fn i_start_nemeth(braille_prefix: &str, first_ch: char) -> usize {
        static NEMETH_NUMBERS: phf::Set<char> = phf_set! {
            '', '', '', '', '', '', '', '', '', '', '' // 1, 2, ...9, 0, decimal pt
        };
        let mut n_chars = 0;
        let prefix = &mut braille_prefix.chars().rev().peekable();
        if prefix.peek() == Some(&'') ||  // cap indicator
           (prefix.peek() == Some(&'') && NEMETH_NUMBERS.contains(&first_ch)) ||  // number indicator
           [Some(&''), Some(&''), Some(&'')].contains(&prefix.peek()) {         // bold, script/blackboard, italic indicator
            n_chars += 1;
            prefix.next();
        } 

        if [Some(&''), Some(&''), Some(&'')].contains(&prefix.peek()) {   // English, German, Greek
            n_chars += 1;
        } else if prefix.peek() == Some(&'') {  
            let ch = prefix.next();                              // Russian/Greek Variant
            if ch == Some('') || ch == Some('') {
                n_chars += 2;
            }
        } else if prefix.peek() == Some(&'')  { // Hebrew 
            let ch = prefix.next();                              // Russian/Greek Variant
            if ch == Some('') {
                n_chars += 2;
            }
        };
        return n_chars;
    }

    /// Given a position in a UEB string, what is the position character that starts it (e.g, the prev char for capital letter)
    fn i_start_ueb(braille_prefix: &str) -> usize {
        let prefix = &mut braille_prefix.chars().rev().peekable();
        let mut n_chars = 0;
        while let Some(ch) = prefix.next() {
            if UEB_PREFIXES.contains(&ch) {
                n_chars += 1;
            } else if ch == '' {
                let n_typeform_chars = check_for_typeform(prefix);
                if n_typeform_chars > 0 {
                    n_chars += n_typeform_chars;
                } else {
                    break;
                }
            } else {
                break;
            }
        }
        return n_chars;
    }

    fn check_for_typeform(prefix: &mut dyn std::iter::Iterator<Item=char>) -> usize {
        static UEB_TYPEFORM_PREFIXES: phf::Set<char> = phf_set! {
            '', '', '', '',
        };

        if let Some(typeform_indicator) = prefix.next() {
            if UEB_TYPEFORM_PREFIXES.contains(&typeform_indicator) {
                return 2;
            } else if typeform_indicator == '' {
                if let Some(user_defined_typeform_indicator) = prefix.next() {
                    if UEB_TYPEFORM_PREFIXES.contains(&user_defined_typeform_indicator) || user_defined_typeform_indicator == '' {
                        return 3;
                    }
                }
            }
        }
        return 0;
    }
}

fn is_highlighted(ch: char) -> bool {
    let ch_as_u32 = ch as u32;
    return (0x28C0..0x28FF).contains(&ch_as_u32);
}

fn highlight(ch: char) -> char {
    return unsafe{char::from_u32_unchecked(ch as u32 | 0xC0)};      
}

fn unhighlight(ch: char) -> char {
    let ch_as_u32 = ch as u32;
    if (0x28C0..0x28FF).contains(&ch_as_u32) {
        return unsafe{char::from_u32_unchecked(ch_as_u32 & 0x283F)};  
    } else {
        return ch;
    }
}


fn nemeth_cleanup(raw_braille: String) -> String {
    // Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
    // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
    // Indicators: C: capital, N: number, P: punctuation, M: multipurpose
    // Others:
    //      W -- whitespace that should be kept (e.g, in a numeral)
    //      𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly 
    // SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet)
    // Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
    static NEMETH_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
        "S" => "⠈⠰",    // sans-serif
        "B" => "",     // bold
        "𝔹" => "",     // blackboard
        "T" => "",     // script (mapped to be the same a blackboard)
        "I" => "",     // italic
        "R" => "",      // roman
        "E" => "",     // English
        "D" => "",     // German (Deutsche)
        "G" => "",     // Greek
        "V" => "⠨⠈",    // Greek Variants
        "H" => "⠠⠠",    // Hebrew
        "U" => "⠈⠈",    // Russian
        "C" => "",     // capital
        "P" => "",     // punctuation
        "L" => "",      // letter
        "M" => "",      // multipurpose indicator
        "m" => "",     // required multipurpose indicator
        "N" => "",       // digit
        "n" => "",     // required number indicator
        "𝑁" => "",      // hack for special case of a lone decimal pt -- not considered a number but follows rules mostly
        "W" => "",     // whitespace
        "," => "⠠⠀",     // comma
        "b" => "",     // baseline
        "" => "",     // superscript
        "" => "",     // supscript
    };

    lazy_static! {
        // Trim braille spaces before and after braille indicators
        // In order: fraction, /, cancellation, letter, baseline
        // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
        static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = 
            Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap();
        static ref REMOVE_SPACE_AFTER_BRAILLE_INDICATORS: Regex = 
            Regex::new(r"([⠹⠻Lb])W+(⠄⠄⠄)").unwrap();

        // Hack to convert non-numeric '.' to numeric '.'
        // The problem is that the numbers are hidden inside of mover -- this might be more general than rule 99_2.
        static ref DOTS_99_A_2: Regex = Regex::new(r"𝑁⠨mN").unwrap();

        // Multipurpose indicator insertion
        // 177.2 -- add after a letter and before a digit (or decimal pt) -- digits will start with N
        static ref MULTI_177_2: Regex = 
            Regex::new(r"(L.)[N𝑁]").unwrap();

        // keep between numeric subscript and digit ('M' added by subscript rule)
        static ref MULTI_177_3: Regex = 
            Regex::new(r"([N𝑁].)M([N𝑁].)").unwrap(); 

        // Add after decimal pt for non-digits except for comma and punctuation
        // Note: since "." can be in the middle of a number, there is not necessarily a "N"
        // Although not mentioned in 177_5, don't add an 'M' before an 'm'
        static ref MULTI_177_5: Regex = 
            Regex::new(r"([N𝑁]⠨)([^⠂⠆⠒⠲⠢⠖⠶⠦⠔N𝑁,Pm])").unwrap(); 


        // Pattern for rule II.9a (add numeric indicator at start of line or after a space) and 9a (add after typeface)
        // 1. start of line
        // 2. optional minus sign (⠤)
        // 3. optional typeface indicator
        // 4. number (N)
        static ref NUM_IND_9A: Regex = 
            Regex::new(r"(?P<start>^|[,W])(?P<minus>⠤?)(?P<face>[SBTIR]*?)N").unwrap();  

        // Needed after section mark(§), paragraph mark(¶), #, or *
        static ref NUM_IND_9D: Regex = 
            Regex::new(r"(⠈⠠⠎|⠈⠠⠏|⠨⠼|⠈⠼)N").unwrap();  

        // Needed after a typeface change or interior shape modifier indicator
        static ref NUM_IND_9E: Regex = Regex::new(r"(?P<face>[SBTIR]+?)N").unwrap();  
        static ref NUM_IND_9E_SHAPE: Regex = Regex::new(r"(?P<mod>⠸⠫)N").unwrap();  

        // Needed after hyphen that follows a word, abbreviation, or punctuation (caution about rule 11d)
        // Note -- hyphen might encode as either "P⠤" or "⠤" depending on the tag used
        static ref NUM_IND_9F: Regex = Regex::new(r"(L.L.|P.)(P?⠤)N").unwrap();  

        // Punctuation chars (Rule 38.6 says don't use before ",", "hyphen", "-", "…")
        // Never use punctuation indicator before these (38-6)
        //      "…": "⠀⠄⠄⠄"
        //      "-": "⠸⠤" (hyphen and dash)
        //      ",": "⠠⠀"     -- spacing already added
        // Rule II.9b (add numeric indicator after punctuation [optional minus[optional .][digit]
        //  because this is run after the above rule, some cases are already caught, so don't
        //  match if there is already a numeric indicator
        static ref NUM_IND_9B: Regex = Regex::new(r"(?P<punct>P.)(?P<minus>⠤?)N").unwrap();  

        // Before 79b (punctuation)
        static ref REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT: Regex = Regex::new(r"(?:[↑↓]+b?|b)([W,P]|$)").unwrap();

        static ref REMOVE_LEVEL_IND_BEFORE_BASELINE: Regex = Regex::new(r"(?:[↑↓]+b)").unwrap();

        // Except for the four chars above, the unicode rules always include a punctuation indicator.
        // The cases to remove them (that seem relevant to MathML) are:
        //   Beginning of line or after a space (V 38.1)
        //   After a word (38.4)
        //   2nd or subsequent punctuation (includes, "-", etc) (38.7)
        static ref REMOVE_PUNCT_IND: Regex = Regex::new(r"(^|W|L.L.)P(.)").unwrap();  

        static ref REPLACE_INDICATORS: Regex =Regex::new(r"([SB𝔹TIREDGVHPCLMmb↑↓Nn𝑁W,])").unwrap();  
            
        static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
    }

  debug!("Before:  \"{}\"", raw_braille);

    // Remove blanks before and after braille indicators
    let result = REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS.replace_all(&raw_braille, "$1$2");
    let result = REMOVE_SPACE_AFTER_BRAILLE_INDICATORS.replace_all(&result, "$1$2");
  debug!("spaces:  \"{}\"", result);

    let result = DOTS_99_A_2.replace_all(&result, "N⠨mN");

    // Multipurpose indicator
    let result = MULTI_177_2.replace_all(&result, "${1}m${2}");
    let result = MULTI_177_3.replace_all(&result, "${1}m$2");
    let result = MULTI_177_5.replace_all(&result, "${1}m$2");
  debug!("MULTI:   \"{}\"", result);

    let result = NUM_IND_9A.replace_all(&result, "$start$minus${face}n");
  debug!("IND_9A:  \"{}\"", result);

    let result = NUM_IND_9D.replace_all(&result, "${1}n");
    let result = NUM_IND_9E.replace_all(&result, "${face}n");
    let result = NUM_IND_9E_SHAPE.replace_all(&result, "${mod}n");
    let result = NUM_IND_9F.replace_all(&result, "${1}${2}n");

//   debug!("IND_9E:  \"{}\"", result);

    // 9b: insert after punctuation (optional minus sign)
    // common punctuation adds a space, so 9a handled it. Here we deal with other "punctuation" 
    // FIX other punctuation and reference symbols (9d)
    let result = NUM_IND_9B.replace_all(&result, "$punct${minus}n");
  debug!("A PUNCT: \"{}\"", &result);

    // strip level indicators
    // checks for punctuation char, so needs to before punctuation is stripped.
    
    let result = REMOVE_LEVEL_IND_BEFORE_SPACE_COMMA_PUNCT.replace_all(&result, "$1");
//   debug!("Punct  : \"{}\"", &result);
    let result = REMOVE_LEVEL_IND_BEFORE_BASELINE.replace_all(&result, "b");
  debug!("Bseline: \"{}\"", &result);

    let result = REMOVE_PUNCT_IND.replace_all(&result, "$1$2");
//   debug!("Punct38: \"{}\"", &result);

    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
        match NEMETH_INDICATOR_REPLACEMENTS.get(&cap[0]) {
            None => {error!("REPLACE_INDICATORS and NEMETH_INDICATOR_REPLACEMENTS are not in sync"); ""},
            Some(&ch) => ch,
        }
    });

    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
    let result = result.trim_start_matches('').trim_end_matches('');
    let result = COLLAPSE_SPACES.replace_all(result, "");
   
    return result.to_string();

}

// Typeface: S: sans-serif, B: bold, T: script/blackboard, I: italic, R: Roman
// Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
// Indicators: C: capital, N: number, P: punctuation, M: multipurpose
// Others:
//      W -- whitespace that should be kept (e.g, in a numeral)
//      𝑁 -- hack for special case of a lone decimal pt -- not considered a number but follows rules mostly 
// SRE doesn't have H: Hebrew or U: Russian, so not encoded (yet)
// Note: some "positive" patterns find cases to keep the char and transform them to the lower case version
static UEB_INDICATOR_REPLACEMENTS: phf::Map<&str, &str> = phf_map! {
    "S" => "XXX",    // sans-serif
    "B" => "",     // bold
    "𝔹" => "⠈XXX",     // blackboard
    "T" => "",     // script
    "I" => "",     // italic
    "R" => "",      // roman
    // "E" => "⠰",     // English
    "1" => "",     // Grade 1 symbol
    "L" => "",     // Letter left in to assist in locating letters
    "D" => "XXX",     // German (Deutsche)
    "G" => "",     // Greek
    // "V" => "⠨⠈",    // Greek Variants
    // "H" => "⠠⠠",    // Hebrew
    // "U" => "⠈⠈",    // Russian
    "C" => "",      // capital
    "𝐶" => "",      // capital that never should get word indicator (from chemical element)
    "N" => "",     // number indicator
    "t" => "",     // shape terminator
    "W" => "",     // whitespace
    "𝐖"=> "",     // whitespace
    "s" => "",     // typeface single char indicator
    "w" => "",     // typeface word indicator
    "e" => "",     // typeface & capital terminator 
    "o" => "",       // flag that what follows is an open indicator (used for standing alone rule)
    "c" => "",       // flag that what follows is an close indicator (used for standing alone rule)
    "b" => "",       // flag that what follows is an open or close indicator (used for standing alone rule)
    "," => "",     // comma
    "." => "",     // period
    "-" => "-",     // hyphen
    "" => "⠠⠤",   // normal dash (2014) -- assume all normal dashes are unified here [RUEB appendix 3]
    "" => "⠐⠠⠤",  // long dash (2015) -- assume all long dashes are unified here [RUEB appendix 3]
    "#" => "",      // signals end of script
    // '(', '{', '[', '"', '\'', '“', '‘', '«',    // opening chars
    // ')', '}', ']', '\"', '\'', '”', '’', '»',           // closing chars
    // ',', ';', ':', '.', '…', '!', '?'                    // punctuation           

};

// static LETTERS: phf::Set<char> = phf_set! {
//     '⠁', '⠃', '⠉', '⠙', '⠑', '⠋', '⠛', '⠓', '⠊', '⠚', '⠅', '⠇', '⠍', 
//     '⠝', '⠕', '⠏', '⠟', '⠗', '⠎', '⠞', '⠥', '⠧', '⠺', '⠭', '⠽', '⠵',
// };

static LETTER_NUMBERS: phf::Set<char> = phf_set! {
    '', '', '', '', '', '', '', '', '', '',
};

static SHORT_FORMS: phf::Set<&str> = phf_set! {
    "L⠁L⠃", "L⠁L⠃L⠧", "L⠁L⠉", "L⠁L⠉L⠗", "L⠁L⠋",
    "L⠁L⠋L⠝", "L⠁L⠋L⠺", "L⠁L⠛", "L⠁L⠛L⠌", "L⠁L⠇",
     "L⠁L⠇L⠍", "L⠁L⠇L⠗", "L⠁L⠇L⠞", "L⠁L⠇L⠹", "L⠁L⠇L⠺",
     "L⠃L⠇", "L⠃L⠗L⠇", "L⠉L⠙", "L⠙L⠉L⠇", "L⠙L⠉L⠇L⠛",
     "L⠙L⠉L⠧", "L⠙L⠉L⠧L⠛", "L⠑L⠊", "L⠋L⠗", "L⠋L⠌", "L⠛L⠙",
     "L⠛L⠗L⠞", "L⠓L⠍", "L⠓L⠍L⠋", "L⠓L⠻L⠋", "L⠊L⠍L⠍", "L⠇L⠇", "L⠇L⠗",
     "L⠍L⠽L⠋", "L⠍L⠡", "L⠍L⠌", "L⠝L⠑L⠉", "L⠝L⠑L⠊", "L⠏L⠙",
     "L⠏L⠻L⠉L⠧", "L⠏L⠻L⠉L⠧L⠛", "L⠏L⠻L⠓", "L⠟L⠅", "L⠗L⠉L⠧",
     "L⠗L⠉L⠧L⠛", "L⠗L⠚L⠉", "L⠗L⠚L⠉L⠛", "L⠎L⠙", "L⠎L⠡", "L⠞L⠙",
     "L⠞L⠛L⠗", "L⠞L⠍", "L⠞L⠝", "L⠭L⠋", "L⠭L⠎", "L⠽L⠗", "L⠽L⠗L⠋",
     "L⠽L⠗L⠧L⠎", "L⠮L⠍L⠧L⠎", "L⠡L⠝", "L⠩L⠙", "L⠹L⠽L⠋", "L⠳L⠗L⠧L⠎",
     "L⠺L⠙", "L⠆L⠉", "L⠆L⠋", "L⠆L⠓", "L⠆L⠇", "L⠆L⠝", "L⠆L⠎", "L⠆L⠞",
     "L⠆L⠽", "L⠒L⠉L⠧", "L⠒L⠉L⠧L⠛", "L⠐L⠕L⠋"
};

static LETTER_PREFIXES: phf::Set<char> = phf_set! {
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶'
};

lazy_static! {
    // Trim braille spaces before and after braille indicators
    // In order: fraction, /, cancellation, letter, baseline
    // Note: fraction over is not listed due to example 42(4) which shows a space before the "/"
    // static ref REMOVE_SPACE_BEFORE_BRAILLE_INDICATORS: Regex = 
    //     Regex::new(r"(⠄⠄⠄|⠤⠤⠤)W+([⠼⠸⠪])").unwrap();
    static ref REPLACE_INDICATORS: Regex =Regex::new(r"([1SB𝔹TIREDGVHP𝐶CLMNW𝐖swe,.-—―#ocb])").unwrap();  
    static ref COLLAPSE_SPACES: Regex = Regex::new(r"⠀⠀+").unwrap();
}

fn is_short_form(chars: &[char]) -> bool {
    let chars_as_string = chars.iter().map(|ch| ch.to_string()).collect::<String>();
    return SHORT_FORMS.contains(&chars_as_string);
}

fn ueb_cleanup(raw_braille: String) -> String {
    let result = typeface_to_word_mode(&raw_braille);
    let result = capitals_to_word_mode(&result);

    
    // '𝐖' is a hard break -- basically, it separates exprs
    let mut result = result.split('𝐖')
                        .map(|str| pick_start_mode(str) + "W")
                        .collect::<String>();
    result.pop();   // we added a 'W' at the end that needs to be removed.

    let result = result.replace("tW", "W");

    // these typeforms need to get pulled from user-prefs as they are transcriber-defined
    let pref_manager = crate::prefs::PreferenceManager::get();
    let pref_manager = pref_manager.borrow();
    let prefs = pref_manager.get_user_prefs();
    let double_struck = prefs.to_string("UEB_DoubleStruck");
    let sans_serif = prefs.to_string("UEB_SansSerif");
    let fraktur = prefs.to_string("UEB_Fraktur");

    let result = REPLACE_INDICATORS.replace_all(&result, |cap: &Captures| {
        let matched_char = &cap[0];
        match matched_char {
            "𝔹" => &double_struck,
            "S" => &sans_serif,
            "D" => &fraktur,
            _ => match UEB_INDICATOR_REPLACEMENTS.get(matched_char) {
                None => {error!("REPLACE_INDICATORS and UEB_INDICATOR_REPLACEMENTS are not in sync: missing '{}'", matched_char); ""},
                Some(&ch) => ch,
            },
        }
    });

    // Remove unicode blanks at start and end -- do this after the substitutions because ',' introduces spaces
    // let result = result.trim_start_matches('⠀').trim_end_matches('⠀');
    let result = COLLAPSE_SPACES.replace_all(&result, "");
   
    return result.to_string();

    fn typeface_to_word_mode(braille: &str) -> String {
        lazy_static! {
            static ref HAS_TYPEFACE: Regex = Regex::new("[BI𝔹STD]").unwrap();
        }
        // debug!("before typeface fix:  '{}'", braille);

        let mut result = "".to_string();
        let chars = braille.chars().collect::<Vec<char>>();
        let mut word_mode = Vec::with_capacity(5);
        let mut word_mode_end = Vec::with_capacity(5);
        let mut i = 0;
        while i < chars.len() {
            let ch = chars[i];
            if HAS_TYPEFACE.is_match(ch.to_string().as_str()) {
                let is_next_char_target = is_next_char(&chars[i+1..], ch);
                if word_mode.contains(&ch) {
                    if !is_next_char_target {
                        word_mode.retain(|&item| item!=ch);  // drop the char since word mode is done
                        word_mode_end.push(ch);   // add the char to signal to add end sequence
                    }
                } else {
                    result.push(ch);
                    if is_next_char_target {
                        result.push('w');     // typeface word indicator
                        word_mode.push(ch);      // starting word mode for this char
                    } else {
                        result.push('s');     // typeface single char indicator
                    }
                }
                i += 1; // eat "B", etc
            } else if ch == 'L' || ch == 'N' {
                result.push(chars[i]);
                result.push(chars[i+1]);
                if !word_mode_end.is_empty() && i+2 < chars.len() && !(chars[i+2] == 'W'|| chars[i+2] == '𝐖') {
                    // add terminator unless word sequence is terminated by end of string or whitespace
                    for &ch in &word_mode_end {
                        result.push(ch);
                        result.push('e');
                    };
                    word_mode_end.clear();
                }
                i += 2; // eat Ll/Nd
            } else {
                result.push(ch);
                i += 1;
            }
        }
        return result;

    }

    fn capitals_to_word_mode(braille: &str) -> String {
        debug!("before capitals fix:  '{}'", braille);

        let mut result = "".to_string();
        let chars = braille.chars().collect::<Vec<char>>();
        let mut is_word_mode = false;
        let mut i = 0;
        // look for a sequence of CLxCLy... and create CCLxLy...
        while i < chars.len() {
            let ch = chars[i];
            if ch == 'C' {
                let is_next_char_cap_c = is_next_char(&chars[i+1..], 'C');  // next letter sequence "C..."
                if is_next_char_cap_c {
                    if is_next_char_start_of_section_12_modifier(&chars[i+1..]) {
                        // to me this is tricky -- section 12 modifiers apply to the previous item
                        // the last clause of the "item" def is the previous "individual symbol" which ICEB 2.1 say is:
                        //   braille sign: one or more consecutive braille characters comprising a unit,
                        //     consisting of a root on its own or a root preceded by one or more
                        //     prefixes (also referred to as braille symbol)
                        // this means the capital indicator needs to be stated and can't be part of a word or passage
                        is_word_mode = false;
                        result.push('C');
                        i += 1;
                        continue;
                    }
                    if !is_word_mode {
                        // start word mode
                        result.push('C');
                        result.push('C');
                        is_word_mode = true;
                    } // else if word mode, don't emit the 'C'
                } else if !is_word_mode {
                    result.push('C');
                }
                if chars[i+1] == 'G' {
                    // Greek letters are a bit exceptional in that the pattern is "CGLx" -- push and bump 'i'
                    result.push('G');
                    i += 1;
                }
                if chars[i+1] != 'L' {
                    error!("capitals_to_word_mode: internal error: didn't find L after C.");
                }
                if i+2 < chars.len() {
                    result.push(chars[i+1]);    // eat 'L'
                    result.push(chars[i+2]);    // eat letter
                }
                i += 3 // eat "C", etc
            } else if ch == 'L' {       // must be lowercase -- uppercase consumed above
                // assert!(LETTERS.contains(&unhighlight(chars[i+1]))); not true for other alphabets
                if is_word_mode {
                    result.push('e');       // terminate Word mode (letter after caps)
                    is_word_mode = false;
                }
                result.push('L');
                result.push(chars[i+1]);
                i += 2; // eat L, letter
            } else {
                is_word_mode = false;   // non-letters terminate cap word mode
                result.push(ch);
                i += 1;
            }
        }
        return result;
    }

    fn is_next_char(chars: &[char], target: char) -> bool {        
        // first find the L or N and eat the char so that we are at the potential start of where the target lies
        // debug!("Looking for '{}' in '{}'", target, chars.iter().collect::<String>());
        for i_end in 0..chars.len() {
            if chars[i_end] == 'L' || chars[i_end] == 'N' {
                // skip the next char to get to the real start, and then look for the target
                // stop when L/N signals past potential target or we hit some non L/N char (actual braille)
                // debug!("   after L/N '{}'", chars[i_end+2..].iter().collect::<String>());
                for &ch in chars.iter().skip(i_end+2) {
                    if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
                        return false;
                    } else if ch == target {
                        // debug!("   found target");
                        return true;
                    }
                }
            }
        }
        return false;
    }

    fn is_next_char_start_of_section_12_modifier(chars: &[char]) -> bool {
        // first find the L and eat the char so that we are at the potential start of where the target lies
        let chars_len = chars.len();
        let mut i_cap = 0;
        while chars[i_cap] != 'C' {     // we know 'C' is in the string, so no need to check for exceeding chars_len
            i_cap += 1;
        }
        for i_end in i_cap+1..chars_len {
            if chars[i_end] == 'L' {
                // skip the next char to get to the real start, and then look for the modifier string or next L/N
                // debug!("   after L '{}'", chars[i_end+2..].iter().collect::<String>());
                for i in i_end+2..chars_len {
                    let ch = chars[i]; 
                    if ch == '1' {
                        // Fix: there's probably a much better way to check if we have a match against one of "⠱", "⠘⠱", "⠘⠲", "⠸⠱", "⠐⠱ ", "⠨⠸⠱"
                        if chars[i+1] == '' {
                            return true;
                        } else if i+2 < chars_len {
                            let mut str = chars[i+1].to_string();
                            str.push(chars[i+2]);
                            if str == "⠘⠱" || str == "⠘⠲" || str == "⠸⠱" || str == "⠐⠱" {
                                return true;
                            } else if i+3 < chars_len {
                                str.push(chars[i+3]);
                                return str == "⠨⠸⠱";
                            }
                            return false;
                        }
                    }
                    if ch == 'L' || ch == 'N' || !LETTER_PREFIXES.contains(&ch) {
                        return false;
                    }
                }
            }
        }
        return false;
    }

    fn pick_start_mode(raw_braille: &str) -> String {
        // Need to decide what the start mode should be
        // From http://www.brailleauthority.org/ueb/ueb_math_guidance/final_for_posting_ueb_math_guidance_may_2019_102419.pdf
        //   Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
        //   or before a single letter standing alone anywhere in the expression,
        //   begin the expression with a grade 1 word indicator (or a passage indicator if the expression includes spaces)
        // Apparently "only a grade 1 symbol..." means at most one grade 1 symbol based on some examples (GTM 6.4, example 4)
        debug!("before determining mode:  '{}'", raw_braille);
        let grade2 = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade2, UEB_Duration::Symbol);
        debug!("Symbol mode:  '{}'", grade2);

        if is_grade2_string_ok(&grade2) {
            return grade2;
        } else {
            let grade1_word = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Word);
            debug!("Word mode:    '{}'", grade1_word);
            
            // BANA says use g1 word mode if spaces are present, but that's not what their examples do
            // A conversation with Ms. DeAndrea from BANA said that they mean use passage mode if ≥3 "segments" (≥2 blanks)
            let mut n_blanks = 0;
            if grade1_word.chars().any(|ch| {
                if ch == 'W' {
                    n_blanks += 1;
                }
                n_blanks == 2
            }) {
                let grade1_passage = remove_unneeded_mode_changes(raw_braille, UEB_Mode::Grade1, UEB_Duration::Passage);
                debug!("Passage mode: '{}'", &grade1_passage);
                return "⠰⠰⠰".to_string() + &grade1_passage + "⠰⠄";
            } else {
                return "⠰⠰".to_string() + &grade1_word;
            }
        }

        /// Return true if the BANA guidelines say it is ok to start with grade 2
        fn is_grade2_string_ok(grade2_braille: &str) -> bool {
            // BANA says use grade 2 if there is not more than one grade one symbol or single letter standing alone.
            // The exact quote from their guidance:
            //    Unless a math expression can be correctly represented with only a grade 1 symbol indicator in the first three cells
            //    or before a single letter standing alone anywhere in the expression,
            //    begin the expression with a grade 1 word indicator
            // Note: I modified this slightly to exclude the cap indicator in the count. That allows three more ICEB rule to pass and seems
            //    like it is a reasonable thing to do.

            // Because of the 'L's which go away, we have to put a little more work into finding the first three chars
            let chars = grade2_braille.chars().collect::<Vec<char>>();
            let mut n_real_chars = 0;  // actually number of chars
            let mut found_g1 = false;
            let mut i = 0;      // chars starts on the 4th char
            while i < chars.len() {
                let ch = chars[i];
                if ch == '1' {
                    if found_g1 {
                        return false;
                    }
                    found_g1 = true;
                } else if !"𝐶CLobc".contains(ch) {
                    if n_real_chars == 2 {
                        i += 1;
                        break;      // this is the third real char
                    };
                    n_real_chars += 1;
                }
                i += 1
            }

            // if we find another g1 that isn't forced and isn't standing alone, we are done
            // we only allow one standing alone example -- not sure if BANA guidance has this limit, but GTM 11_5_5_3 seems better with it
            let mut is_standing_alone_already_encountered = false;
            while i < chars.len() {
                let ch = chars[i];
                if ch == '1' && !is_forced_grade1(&chars, i) {
                    if !is_single_letter_on_right(&chars, i) || is_standing_alone_already_encountered {
                        return false;
                    }
                    is_standing_alone_already_encountered = true; 
                }
                i += 1;
            }
            return true;
        }

        /// Return true if the sequence of chars forces a '1' at the `i`th position
        /// Note: `chars[i]` should be '1'
        fn is_forced_grade1(chars: &[char], i: usize) -> bool {
            // A '1' is forced if 'a-j' follows a digit
            assert_eq!(chars[i], '1', "'is_forced_grade1' didn't start with '1'");
            // check that a-j follows the '1'
            if i+1 < chars.len() && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
                // check for a number before the '1'
                // this will be 'N' followed by LETTER_NUMBERS or the number ".", ",", or " "
                for j in (0..i).rev() {
                    let ch = chars[j];
                    if !(LETTER_NUMBERS.contains(&unhighlight(ch)) || ".,W𝐖".contains(ch)) {
                        return ch == 'N'
                    }
                }
            }
            return false;
        }

        fn is_single_letter_on_right(chars: &[char], i: usize) -> bool {
            static SKIP_CHARS: phf::Set<char> = phf_set! {
                'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w'   // indicators
            };

            // find the first char (if any)
            let mut count = 0;      // how many letters
            let mut i = i+1;
            while i < chars.len() {
                let ch = chars[i];
                if !SKIP_CHARS.contains(&ch) {
                    if ch == 'L' {
                        if count == 1 {
                            return false;   // found a second letter in the sequence
                        }
                        count += 1;
                    } else {
                        return count==1;
                    }
                    i += 2;   // eat 'L' and actual letter
                } else {
                    i += 1;
                }
            }
            return true;
        }
    }
}
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Mode {
    Numeric,        // also includes Grade1
    Grade1,
    Grade2,
}

#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Copy, Clone)]
enum UEB_Duration {
    // Standing alone: A braille symbol that is standing alone may have a contracted (grade 2) meaning.
    // A letter or unbroken sequence of letters is “standing alone” if the symbols before and after the letter or
    //   sequence are spaces, hyphens, dashes or any combination thereof, including some common punctuation.
    // Item: An “item” is defined as the next symbol or one of seven groupings listed in Rules of Unified English Braille, §11.4.1.
    Symbol,

    // The grade 1 word indicator sets grade 1 mode for the next word or symbol sequence.
    // A symbol sequence in UEB is defined as an unbroken string of braille signs,
    //   whether alphabetic or non-alphabetic, preceded and followed by a space.
    Word,
    Passage,
}

// used to determine standing alone (on left side)
static LEFT_INTERVENING_CHARS: phf::Set<char> = phf_set! {  // see RUEB 2.6.2
    'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w',     // indicators
    // opening chars have prefix 'o', so not in set ['(', '{', '[', '"', '\'', '“', '‘', '«'] 
};

fn remove_unneeded_mode_changes(raw_braille: &str, start_mode: UEB_Mode, start_duration: UEB_Duration) -> String {

    // FIX: need to be smarter about moving on wrt to typeforms/typefaces, caps, bold/italic. [maybe just let them loop through the default?]
    let mut mode = start_mode;
    let mut duration = start_duration;
    let mut start_g2_letter = None;    // used for start of contraction checks
    let mut i_g2_start = None;  // set to 'i' when entering G2 mode; None in other modes. '1' indicator goes here if standing alone
    let mut cap_word_mode = false;     // only set to true in G2 to prevent contractions
    let mut result = String::default();
    let chars = raw_braille.chars().collect::<Vec<char>>();
    let mut i = 0;
    while i < chars.len() {
        let ch = chars[i];
        match mode {
            UEB_Mode::Numeric => {
                // Numeric Mode: (from https://uebmath.aphtech.org/lesson1.0 and lesson4.0)
                // Symbols that can appear within numeric mode include the ten digits, comma, period, simple fraction line,
                // line continuation indicator, and numeric space digit symbols.
                // A space or any other symbol not listed here terminates numeric mode.
                // Numeric mode is also terminated by the "!" -- used after a script
                //
                // The numeric indicator also turns on grade 1 mode.
                // When grade 1 mode is set by the numeric indicator,
                //   grade 1 indicators are not used unless a single lower-case letter a-j immediately follows a digit.
                // Grade 1 mode when set by the numeric indicator is terminated by a space, hyphen, dash, or a grade 1 indicator.
                i_g2_start = None;
                // debug!("Numeric: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        // terminate numeric mode -- duration doesn't change
                        // let the default case handle pushing on the chars for the letter
                        if LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
                            result.push('1');   // need to distinguish a-j from a digit
                        }
                        result.push(ch);
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        // duration remains Word
                    },
                    '1' | '𝟙' => {
                        // numeric mode implies grade 1, so don't output indicator;
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        if start_duration == UEB_Duration::Passage {
                            duration = UEB_Duration::Passage;      // otherwise it remains at Word
                        }
                    },
                    '#' => {
                        // terminate numeric mode -- duration doesn't change
                        i += 1;
                        if i+1 < chars.len() && chars[i] == 'L' && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) {
                            // special case where the script was numeric and a letter follows, so need to put out G1 indicator
                            result.push('1');
                            // the G1 case should work with 'L' now
                        }
                        mode = UEB_Mode::Grade1;
                    },
                    'N' => {
                        // stay in the same mode (includes numeric "," and "." space) -- don't let default get these chars
                        result.push(chars[i+1]);
                        i += 2;
                    },
                    _ => {
                        // moving out of numeric mode
                        result.push(ch);
                        i += 1;
                        mode = if "W𝐖-—―".contains(ch) {start_mode} else {UEB_Mode::Grade1};     // space, hyphen, dash(short & long) RUEB 6.5.1
                        if mode == UEB_Mode::Grade2 {
                            start_g2_letter = None;        // will be set to real letter
                        }
                    },
                }
            },
            UEB_Mode::Grade1 => {
                // Grade 1 Mode:
                // The numeric indicator also sets grade 1 mode.
                // Grade 1 mode, when initiated by the numeric indicator, is terminated by a space, hyphen, dash or grade 1 terminator.
                // Grade 1 mode is also set by grade 1 indicators.
                i_g2_start = None;
                // debug!("Grade 1: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        // note: be aware of '#' case for Numeric because '1' might already be generated
                        // let prev_ch = if i > 1 {chars[i-1]} else {'1'};   // '1' -- anything beside ',' or '.'
                        // if duration == UEB_Duration::Symbol || 
                        //     ( ",. ".contains(prev_ch) && LETTER_NUMBERS.contains(&unhighlight(chars[i+1])) ) {
                        //     result.push('1');        // need to retain grade 1 indicator (RUEB 6.5.2)
                        // }
                        // let the default case handle pushing on the chars for the letter
                        result.push(ch);
                        i += 1;
                    },
                    '1' => {
                        // nothing to do -- let the default case handle the following chars
                        i += 1;
                    },
                    'N' => {
                        result.push(ch);
                        result.push(chars[i+1]);
                        i += 2;
                        mode = UEB_Mode::Numeric;
                        duration = UEB_Duration::Word;
                    },
                    'W' | '𝐖' => {
                        // this terminates a word mode if there was one
                        result.push(ch);
                        i += 1;
                        if start_duration != UEB_Duration::Passage {
                            duration = UEB_Duration::Symbol;
                            mode = UEB_Mode::Grade2;
                        }
                    },
                    _ => {
                        result.push(ch);
                        i += 1;
                        if duration == UEB_Duration::Symbol && !LETTER_PREFIXES.contains(&ch) {
                            mode = start_mode;
                        }
                    }
                }
                if mode == UEB_Mode::Grade2 {
                    start_g2_letter = None;        // will be set to real letter
                }

            },
            UEB_Mode::Grade2 => {
                // note: if we ended up using a '1', it only extends to the next char, which is also dealt with, so mode doesn't change
               if i_g2_start.is_none() {
                   i_g2_start = Some(i);
                   cap_word_mode = false;
               }
                // debug!("Grade 2: ch={}, duration: {:?}", ch, duration);
                match ch {
                    'L' => {
                        if start_g2_letter.is_none() {
                            start_g2_letter = Some(i);
                        }
                        let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, i);
                        // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
                        if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
                            // debug!("  is_alone -- pushing '1'");
                            result.push('1');
                            mode = UEB_Mode::Grade1;
                        }
                        // debug!("  pushing {:?}", right_matched_chars);
                        right_matched_chars.iter().for_each(|&ch| result.push(ch));
                        i += right_matched_chars.len();
                    },
                    'C' => {
                        // Want 'C' before 'L'; Could be CC for word cap -- if so, eat it and move on
                        // Note: guaranteed that there is a char after the 'C', so chars[i+1] is safe
                        if chars[i+1] == 'C' {
                            cap_word_mode = true;
                            i += 1;
                        } else {
                            let is_greek = chars[i+1] == 'G';
                            let (is_alone, right_matched_chars, n_letters) = stands_alone(&chars, if is_greek {i+2} else {i+1});
                            // GTM 1.2.1 says we only need to use G1 for single letters or sequences that are a shortform (e.g, "ab")
                            if is_alone && (n_letters == 1 || is_short_form(&right_matched_chars[..2*n_letters])) {
                                // debug!("  is_alone -- pushing '1'");
                                result.push('1');
                                mode = UEB_Mode::Grade1;
                            }
                            if cap_word_mode {
                                result.push('C');   // first 'C' if cap word
                            }
                            result.push('C');
                            if is_greek {
                                result.push('G');
                                i += 1;
                            }
                            start_g2_letter = Some(i);
                            // debug!("  pushing 'C' + {:?}", right_matched_chars);
                            right_matched_chars.iter().for_each(|&ch| result.push(ch));
                            i += 1 + right_matched_chars.len();
                        }
                    },
                    '1' => {
                        result.push(ch);
                        i += 1;
                        mode = UEB_Mode::Grade1;
                        duration = UEB_Duration::Symbol;
                    },
                    'N' => {
                        result.push(ch);
                        result.push(chars[i+1]);
                        i += 2;
                        mode = UEB_Mode::Numeric;
                        duration = UEB_Duration::Word;
                    },
                    _ => {
                        if let Some(start) = start_g2_letter {
                            if !cap_word_mode {
                                result = handle_contractions(&chars[start..i], result);
                            }
                            cap_word_mode = false;
                            start_g2_letter = None;     // not start of char sequence
                        }
                        result.push(ch);
                        i += 1;
                        if !LEFT_INTERVENING_CHARS.contains(&ch) {
                            cap_word_mode = false;
                            i_g2_start = Some(i);
                        }

                    }
                }
                if mode != UEB_Mode::Grade2 && !cap_word_mode {
                    if let Some(start) = start_g2_letter {
                        result = handle_contractions(&chars[start..i], result);
                        start_g2_letter = None;     // not start of char sequence
                    }
                }
            },
        }
    }
    if mode == UEB_Mode::Grade2 {
        if let Some(start) = start_g2_letter {
            result = handle_contractions(&chars[start..i], result);
        }
    }

    return result;
}

/// Returns a tuple:
///   true if the ith char "stands alone" (UEB 2.6)
///   the chars on the right that are part of the standing alone sequence
///   the number of letters in that sequence
/// This basically means a letter sequence surrounded by white space with some potentially intervening chars
/// The intervening chars can be typeform/cap indicators, along with various forms of punctuation
/// The ith char should be an "L"
/// This assumes that there is whitespace before and after the character string
fn stands_alone(chars: &[char], i: usize) -> (bool, &[char], usize) {
    // scan backward and check the conditions for "standing-alone"
    // we scan forward and check the conditions for "standing-alone"
    assert_eq!(chars[i], 'L', "'stands_alone' starts with non 'L'");
    if !left_side_stands_alone(&chars[0..i]) {
        return (false, &chars[i..i+2], 0);
    }

    let (mut is_alone, n_letters, n_right_matched) = right_side_stands_alone(&chars[i+2..]);
    if is_alone && n_letters == 1 {
        let ch = chars[i+1];
        if ch=='' || ch=='' || ch=='' {      // a, i, o
            is_alone = false;
        }
    }
    return (is_alone, &chars[i..i+2+n_right_matched], n_letters);

    /// chars before before 'L'
    fn left_side_stands_alone(chars: &[char]) -> bool {
        // scan backwards to skip letters and intervening chars
        // once we hit an intervening char, only intervening chars are allowed if standing alone
        let mut intervening_chars_mode = false; // true when we are on the final stretch
        let mut i = chars.len();
        while i > 0 {
            i -= 1;
            let ch = chars[i];
            let prev_ch = if i > 0 {chars[i-1]} else {' '};  // ' ' is a char not in input
            // debug!("  left alone: prev/ch {}/{}", prev_ch, ch);
            if (!intervening_chars_mode && prev_ch == 'L') ||
               (ch == 'o' || ch == 'b') {
                i -= 1;       // ignore 'Lx' and also ignore 'ox'
            } else if LEFT_INTERVENING_CHARS.contains(&ch) {
                intervening_chars_mode = true;
            } else {
                return "W𝐖-—―".contains(ch);
            }
        }

        return true;
    }

    // chars after character we are testing
    fn right_side_stands_alone(chars: &[char]) -> (bool, usize, usize) {
        // see RUEB 2.6.3
        static RIGHT_INTERVENING_CHARS: phf::Set<char> = phf_set! {
            'B', 'I', '𝔹', 'S', 'T', 'D', 'C', '𝐶', 's', 'w', 'e',   // indicators
            // ')', '}', ']', '\"', '\'', '”', '’', '»',      // closing chars
            // ',', ';', ':', '.', '…', '!', '?'              // punctuation           
        };
        // scan forward to skip letters and intervening chars
        // once we hit an intervening char, only intervening chars are allowed if standing alone ('c' and 'b' are part of them)
        let mut intervening_chars_mode = false; // true when we are on the final stretch
        let mut i = 0;
        let mut n_letters = 1;      // we have skipped the first letter
        while i < chars.len() {
            let ch = chars[i];
            // debug!("  right alone: ch/next {}/{}", ch, if i+1<chars.len() {chars[i+1]} else {' '});
            if !intervening_chars_mode && ch == 'L' {
                n_letters += 1;
                i += 1;       // ignore 'Lx' and also ignore 'ox'
            } else if ch == 'c' || ch == 'b' {
                i += 1;       // ignore 'Lx' and also ignore 'ox'
            } else if RIGHT_INTERVENING_CHARS.contains(&ch) {  
                intervening_chars_mode = true;
            } else {
                return if "W𝐖-—―".contains(ch) {(true, n_letters, i)} else {(false, n_letters, i)};
            }
            i += 1;
        }

        return (true, n_letters, chars.len());
    }
}

/// Return a modified result if chars can be contracted.
/// Otherwise, the original string is returned
fn handle_contractions(chars: &[char], mut result: String) -> String {
    struct Replacement {
        pattern: &'static str,
        replacement: &'static str
    }

    // It would be much better from an extensibility point of view to read the table in from a file
    // FIX: this would be much easier to read/maintain if ASCII braille were used
    // FIX:   (without the "L"s) and the CONTRACTIONS table built as a lazy static
    static CONTRACTIONS: &[Replacement] = &[
        Replacement{ pattern: "L⠁L⠝L⠙", replacement: "L⠯" },           // and
        Replacement{ pattern: "L⠋L⠕L⠗", replacement: "L⠿" },           // for
        Replacement{ pattern: "L⠕L⠋", replacement: "L⠷" },             // of
        Replacement{ pattern: "L⠞L⠓L⠑", replacement: "L⠮" },           // the
        Replacement{ pattern: "L⠺L⠊L⠞L⠓", replacement: "L⠾" },         // with
        Replacement{ pattern: "L⠉L⠓", replacement: "L⠡" },              // ch
        Replacement{ pattern: "L⠊L⠝", replacement: "L⠔" },              // in

        // cc -- don't match if after/before a cap letter -- no/can't use negative pattern (?!...) in regex package
        // figure this out -- also applies to ea, bb, ff, and gg (not that they matter)
        // cc may be important for "arccos", but RUEB doesn't apply it to "arccosine", so maybe not
        // Replacement{ pattern: "L⠉L⠉", replacement: "L⠒" },              // cc -- don't match if after/before a cap letter
        
        Replacement{ pattern: "L⠎L⠓", replacement: "L⠩" },              // sh
        Replacement{ pattern: "L⠁L⠗", replacement: "L⠜" },              // ar
        Replacement{ pattern: "L⠑L⠗", replacement: "L⠻" },              // er
        Replacement{ pattern: "(?P<s>L.)L⠍L⠑L⠝L⠞", replacement: "${s}L⠰L⠞" }, // ment
        Replacement{ pattern: "(?P<s>L.)L⠞L⠊L⠕L⠝", replacement: "${s}L⠰L⠝" } ,// tion
        Replacement{ pattern: "(?P<s>L.)L⠑L⠁(?P<e>L.)", replacement: "${s}L⠂${e}" },  // ea
    ];

    lazy_static! {
        static ref CONTRACTION_PATTERNS: RegexSet = init_patterns(CONTRACTIONS);
        static ref CONTRACTION_REGEX: Vec<Regex> = init_regex(CONTRACTIONS);
    }

    let mut chars_as_str = chars.iter().collect::<String>();
    // debug!("  handle_contractions: examine '{}'", &chars_as_str);
    let matches = CONTRACTION_PATTERNS.matches(&chars_as_str);
    for i in matches.iter() {
        let element = &CONTRACTIONS[i];
        // debug!("  replacing '{}' with '{}' in '{}'", element.pattern, element.replacement, &chars_as_str);
        result.truncate(result.len() - chars_as_str.len());
        chars_as_str = CONTRACTION_REGEX[i].replace_all(&chars_as_str, element.replacement).to_string();
        result.push_str(&chars_as_str);
        // debug!("  result after replace '{}'", result);
    }
    return result;



    fn init_patterns(contractions: &[Replacement]) -> RegexSet {
        let mut vec = Vec::with_capacity(contractions.len());
        for contraction in contractions {
            vec.push(contraction.pattern);
        }
        return RegexSet::new(&vec).unwrap();
    }

    fn init_regex(contractions: &[Replacement]) -> Vec<Regex> {
        let mut vec = Vec::with_capacity(contractions.len());
        for contraction in contractions {
            vec.push(Regex::new(contraction.pattern).unwrap());
        }
        return vec;
    }
}



/************** Braille xpath functionality ***************/
use crate::canonicalize::{name, as_element, as_text};
use crate::xpath_functions::{is_leaf, IsBracketed};
use sxd_document::dom::ParentOfChild;
use sxd_xpath::{Value, context, nodeset::*};
use sxd_xpath::function::{Function, Args};
use sxd_xpath::function::Error as XPathError;
use std::result::Result as StdResult;

pub struct NemethNestingChars;
const NEMETH_FRAC_LEVEL: &str = "nemeth-frac-level";    // name of attr where value is cached
const FIRST_CHILD_ONLY: &[&str] = &["mroot", "msub", "msup", "msubsup", "munder", "mover", "munderover", "mmultiscripts"];
impl NemethNestingChars {
    // returns a 'repeat_char' corresponding to the Nemeth rules for nesting
    // note: this value is likely one char too long because the starting fraction is counted
    fn nemeth_frac_value<'a>(node: &'a Element, repeat_char: &'a str) -> String {
        let children = node.children();
        let name = name(node);
        if is_leaf(*node) {
            return "".to_string();
        } else if name == "mfrac" {
            // have we already computed the value?
            if let Some(value) = node.attribute_value(NEMETH_FRAC_LEVEL) {
                return value.to_string();
            }

            let num_value = NemethNestingChars::nemeth_frac_value(&as_element(children[0]), repeat_char);
            let denom_value = NemethNestingChars::nemeth_frac_value(&as_element(children[1]), repeat_char);
            let mut max_value = if num_value.len() > denom_value.len() {num_value} else {denom_value};
            max_value += repeat_char;
            node.set_attribute_value(NEMETH_FRAC_LEVEL, &max_value);
            return max_value;
        } else if FIRST_CHILD_ONLY.contains(&name) {
            // only look at the base -- ignore scripts/index
            return NemethNestingChars::nemeth_frac_value(&as_element(children[0]), repeat_char);
        } else {
            let mut result = "".to_string();
            for child in children {
                let value = NemethNestingChars::nemeth_frac_value(&as_element(child), repeat_char);
                if value.len() > result.len() {
                    result = value;
                }
            }
            return result;
        }
    }

    fn nemeth_root_value<'a>(node: &'a Element, repeat_char: &'a str) -> StdResult<String, XPathError> {
        // returns the correct number of repeat_chars to use
        // note: because the highest count is toward the leaves and
        //    because this is a loop and not recursive, caching doesn't work without a lot of overhead
        let parent = node.parent().unwrap();
        if let ParentOfChild::Element(e) =  parent {
            let mut parent = e;
            let mut result = "".to_string();
            loop {
                let name = name(&parent);
                if name == "math" {
                    return Ok( result );
                }
                if name == "msqrt" || name == "mroot" {
                    result += repeat_char;
                }
                let parent_of_child = parent.parent().unwrap();
                if let ParentOfChild::Element(e) =  parent_of_child {
                    parent = e;
                } else {
                    return Err( sxd_xpath::function::Error::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
                }
            }
        }
        return Err( XPathError::Other("Internal error in nemeth_root_value: didn't find 'math' tag".to_string()) );
    }
}

impl Function for NemethNestingChars {
/**

 * Returns a string with the correct number of nesting chars (could be an empty string)
 * @param(node) -- current node
 * @param(char) -- char (string) that should be repeated
 * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
 */
 fn evaluate<'c, 'd>(&self,
                        _context: &context::Evaluation<'c, 'd>,
                        args: Vec<Value<'d>>)
                        -> StdResult<Value<'d>, XPathError>
    {
        let mut args = Args(args);
        args.exactly(2)?;
        let repeat_char = args.pop_string()?;
        let node = crate::xpath_functions::validate_one_node(args.pop_nodeset()?, "NestingChars")?;
        if let Node::Element(el) = node {
            let name = name(&el);
            // it is likely a bug to call this one a non mfrac
            if name == "mfrac" {
                // because it is called on itself, the fraction is counted one too many times -- chop one off
                // this is slightly messy because we are chopping off a char, not a byte
                const BRAILLE_BYTE_LEN: usize = "".len();      // all Unicode braille symbols have the same number of bytes
                return Ok( Value::String( NemethNestingChars::nemeth_frac_value(&el, &repeat_char)[BRAILLE_BYTE_LEN..].to_string() ) );
            } else if name == "msqrt" || name == "mroot" {
                return Ok( Value::String( NemethNestingChars::nemeth_root_value(&el, &repeat_char)? ) );
            } else {
                panic!("NestingChars chars should be used only on 'mfrac'. '{}' was passed in", name);
            }
        } else {
            // not an element, so nothing to do
            return Ok( Value::String("".to_string()) );
        }
    }
}

pub struct BrailleChars;
impl BrailleChars {
    // returns a string for the chars in the *leaf* node.
    // this string follows the Nemeth rules typefaces and deals with mathvariant
    //  which has partially turned chars to the alphanumeric block
    fn get_braille_chars(node: &Element, code: &str, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> {
        match code {
            "Nemeth" => return BrailleChars::get_braille_nemeth_chars(node, text_range),
            "UEB" => return BrailleChars:: get_braille_ueb_chars(node, text_range),
            _ => {
                warn!("get_braille_chars: unknown braille code '{}'", code);
                return Ok( as_text(*node).to_string() );
            },
        };
    }

    fn get_braille_nemeth_chars(node: &Element, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> {
        lazy_static! {
            // To greatly simplify typeface/language generation, the chars have unique ASCII chars for them:
            // Typeface: S: sans-serif, B: bold, 𝔹: blackboard, T: script, I: italic, R: Roman
            // Language: E: English, D: German, G: Greek, V: Greek variants, H: Hebrew, U: Russian
            // Indicators: C: capital, L: letter, N: number, P: punctuation, M: multipurpose
            static ref PICK_APART_CHAR: Regex = 
                Regex::new(r"(?P<face>[SB𝔹TIR]*)(?P<lang>[EDGVHU]?)(?P<cap>C?)(?P<letter>L?)(?P<num>[N]?)(?P<char>.)").unwrap();
        }
    
        let math_variant = node.attribute_value("mathvariant");
        // FIX: cover all the options -- use phf::Map
        let  attr_typeface = match math_variant {
            None => "R",
            Some(variant) => match variant {
                "bold" => "B",
                "italic" => "I",
                "double-struck" => "𝔹",
                "script" => "T",
                "fraktur" => "D",
                "sans-serif" => "S",
                _ => "R",       // normal and unknown
            },
        };
        let text = BrailleChars::substring(as_text(*node), text_range);
        let braille_chars = crate::speech::braille_replace_chars(&text, *node).unwrap_or_else(|_| "".to_string());
        // debug!("Nemeth chars: text='{}', braille_chars='{}'", &text, &braille_chars);
        
        // we want to pull the prefix (typeface, language) out to the front until a change happens
        // the same is true for number indicator
        // also true (sort of) for capitalization -- if all caps, use double cap in front (assume abbr or Roman Numeral)
        let is_in_enclosed_list = name(node) == "mn" && BrailleChars::is_in_enclosed_list(*node);
        let mut typeface = "R".to_string();     // assumption is "R" and if attr or letter is different, something happens
        let mut is_all_caps = true;
        let mut is_all_caps_valid = false;      // all_caps only valid if we did a replacement
        let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
            // debug!("  face: {:?}, lang: {:?}, num {:?}, cap: {:?}, char: {:?}",
            //        &caps["face"], &caps["lang"], &caps["num"], &caps["cap"], &caps["char"]);
            let mut nemeth_chars = "".to_string();
            let char_face = if caps["face"].is_empty() {attr_typeface} else {&caps["face"]};
            let typeface_changed =  typeface != char_face;
            if typeface_changed {
                typeface = char_face.to_string();   // needs to outlast this instance of the loop
                nemeth_chars += &typeface;
                nemeth_chars +=  &caps["lang"];
            } else {
                nemeth_chars +=  &caps["lang"];
            }
            // debug!("  typeface changed: {}, is_in_list: {}; num: {}", typeface_changed, is_in_enclosed_list, !caps["num"].is_empty());
            if !caps["num"].is_empty() && (typeface_changed || !is_in_enclosed_list) {
                nemeth_chars += "N";
            }
            is_all_caps_valid = true;
            is_all_caps &= !&caps["cap"].is_empty();
            nemeth_chars += &caps["cap"];       // will be stripped later if all caps
            nemeth_chars += &caps["letter"];
            nemeth_chars += &caps["char"];
            return nemeth_chars;
        });
        // debug!("  result: {}", &result);
        let mut text_chars = text.chars();     // see if more than one char
        if is_all_caps_valid && is_all_caps && text_chars.next().is_some() &&  text_chars.next().is_some() {
            return Ok( "CC".to_string() + &result.replace('C', ""));
        } else {
            return Ok( result.to_string() );
        }
    }

    fn get_braille_ueb_chars(node: &Element, text_range: Option<Range<usize>>) -> StdResult<String, XPathError> {
        // Because in UEB typeforms and caps may extend for multiple tokens,
        //   this routine merely deals with the mathvariant attr.
        // Canonicalize has already transformed all chars it can to math alphanumerics, but not all have bold/italic 
        // The typeform/caps transforms to (potentially) word mode are handled later.
        lazy_static! {
            static ref HAS_TYPEFACE: Regex = Regex::new(".*?(double-struck|script|fraktur|sans-serif).*").unwrap();
            static ref PICK_APART_CHAR: Regex = 
                 Regex::new(r"(?P<bold>B??)(?P<italic>I??)(?P<face>[S𝔹TD]??)s??(?P<cap>C??)(?P<greek>G??)(?P<char>[NL].)").unwrap();
        }
    
        let math_variant = node.attribute_value("mathvariant");
        let text = BrailleChars::substring(as_text(*node), text_range);
        let braille_chars = crate::speech::braille_replace_chars(&text, *node).unwrap_or_else(|_| "".to_string());

        if math_variant.is_none() {         // nothing we need to do
            return Ok(braille_chars);
        }
        // mathvariant could be "sans-serif-bold-italic" -- get the parts
        let math_variant = math_variant.unwrap();
        let bold = math_variant.contains("bold");
        let italic = math_variant.contains("italic");
        let typeface = match HAS_TYPEFACE.find(math_variant) {
            None => "",
            Some(m) => match m.as_str() {
                "double-struck" => "𝔹",
                "script" => "T",
                "fraktur" => "D",
                "sans-serif" => "S",
                //  don't consider monospace as a typeform
                _ => "",
            },
        };
        let result = PICK_APART_CHAR.replace_all(&braille_chars, |caps: &Captures| {
            // debug!("captures: {:?}", caps);
            // debug!("  bold: {:?}, italic: {:?}, face: {:?}, cap: {:?}, char: {:?}",
            //        &caps["bold"], &caps["italic"], &caps["face"], &caps["cap"], &caps["char"]);
            if bold || !caps["bold"].is_empty() {"B"} else {""}.to_string()
                + if italic || !caps["italic"].is_empty() {"I"} else {""}
                + if !&caps["face"].is_empty() {&caps["face"]} else {typeface}
                + &caps["cap"]
                + &caps["greek"]
                + &caps["char"]
        });
        return Ok(result.to_string())
    }

    fn is_in_enclosed_list(node: Element) -> bool {
        // Nemeth Rule 10 defines an enclosed list:
        // 1: begins and ends with fence
        // 2: FIX: not implemented -- must contain no word, abbreviation, ordinal or plural ending
        // 3: function names or signs of shape and the signs which follow them are a single item (not a word)
        // 4: an item of the list may be an ellipsis or any sign used for mission
        // 5: no relational operator may appear within the list
        // 6: the list must have at least 2 items.
        //       Items are separated by commas, can not have other punctuation (except ellipsis and dash)
        let mut parent = node.parent().unwrap().element().unwrap(); // safe since 'math' is always at root
        while name(&parent) == "mrow" {
            if IsBracketed::is_bracketed(&parent, "", "", true, false) {
                for child in parent.children() {
                    if !child_meets_conditions(as_element(child)) {
                        return false;
                    }
                }
                return true;
            }
            parent = parent.parent().unwrap().element().unwrap();
        }
        return false;

        fn child_meets_conditions(node: Element) -> bool {
            let name = name(&node);
            return match name {
                "mi" | "mn" => true,
                "mo"  => !crate::canonicalize::is_relational_op(node),
                "mtext" => false, // FIX -- should be more nuanced,
                "mrow" => {
                    if IsBracketed::is_bracketed(&node, "", "", false, false) {
                        return child_meets_conditions(as_element(node.children()[1]));
                    } else {
                        for child in node.children() {
                            if !child_meets_conditions(as_element(child)) {
                                return false;
                            }
                        }
                    }  
                    true      
                },
                _ => {
                    for child in node.children() {
                        if !child_meets_conditions(as_element(child)) {
                            return false;
                        }
                    }
                    true
                },
            }
        }
    }

    /// Extract the `char`s from `str` within `range` (these are chars, not byte offsets)
    fn substring(str: &str, text_range: Option<Range<usize>>) -> String {
        return match text_range {
            None => str.to_string(),
            Some(range) => str.chars().skip(range.start).take(range.end - range.start).collect(),
        }
    }
}

impl Function for BrailleChars {
    /**

     * Returns a string with the correct number of nesting chars (could be an empty string)
     * @param(node) -- current node
     * @param(char) -- char (string) that should be repeated
     * Note: as a side effect, an attribute with the value so repeated calls to this or a child will be fast
     */
     fn evaluate<'c, 'd>(&self,
                            _context: &context::Evaluation<'c, 'd>,
                            args: Vec<Value<'d>>)
                            -> StdResult<Value<'d>, XPathError>
        {
            let mut args = Args(args);
            if let Err(e) = args.exactly(2).or_else(|_| args.exactly(4)) {
                return Err( XPathError::Other(format!("BrailleChars requires 2 or 4 args: {}", e)));
            };

            let range = if args.len() == 4 {
                let end = args.pop_number()? as usize;   // non-inclusive at end
                let start = args.pop_number()? as usize - 1;    // adjust to 0-based
                Some(start..end)
            } else {
                None
            };
            let braille_code = args.pop_string()?;
            let node = crate::xpath_functions::validate_one_node(args.pop_nodeset()?, "BrailleChars")?;
            if let Node::Element(el) = node {
                assert!( is_leaf(el) );
                return Ok( Value::String( BrailleChars::get_braille_chars(&el, &braille_code, range)? ) );
            } else {
                // not an element, so nothing to do
                return Ok( Value::String("".to_string()) );
            }
        }
    }
    
#[cfg(test)]
mod tests {
    use super::*;
    #[allow(unused_imports)]
    use crate::init_logger;
    use crate::interface::*;
    
    #[test]
    fn ueb_highlight_24() -> Result<()> {       // issue 24
        let mathml_str = "<math display='block' id='M8o41h70-0'>
                <mrow id='M8o41h70-1'>
                <mn id='M8o41h70-2'>4</mn>
                <mo id='M8o41h70-3'>&#x2062;</mo>
                <mi id='M8o41h70-4'>a</mi>
                <mo id='M8o41h70-5'>&#x2062;</mo>
                <mi id='M8o41h70-6'>c</mi>
            </mrow>
        </math>";
        crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
        set_mathml(mathml_str.to_string()).unwrap();
        set_preference("BrailleCode".to_string(), "UEB".to_string()).unwrap();
        set_preference("BrailleNavHighlight".to_string(), "All".to_string()).unwrap();
        let braille = get_braille("M8o41h70-2".to_string())?;
        assert_eq!("⣼⣙⠰⠁⠉", braille);
        let braille = get_braille("M8o41h70-4".to_string())?;
        assert_eq!("⠼⠙⣰⣁⠉", braille);
        return Ok( () );
    }
}