braillify 2.0.0

Rust 기반 크로스플랫폼 한국어 점역 라이브러리
Documentation
use std::borrow::Cow;

use crate::rules::token::{Token, WordMeta, WordToken};
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};

const ROMAN_INDICATOR: u8 = 52; //const ROMAN_CONTINUATION: u8 = 48; //const UPPERCASE_SIGN: u8 = 32; //const ROMAN_TERMINATOR: u8 = 50; //const HYPHEN: u8 = crate::unicode::decode_unicode('');

/// Token rule that recognises Roman numerals written with the letters
/// `I`, `V` and `X` (either case) and replaces them with pre-encoded cells.
///
/// The type carries no state; all behaviour lives in its `TokenRule`
/// implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct RomanNumeralRule;

/// Returns `true` when `c` is one of the upper-case Roman numeral letters
/// `I`, `V` or `X`.
fn is_upper_roman_char(c: char) -> bool {
    ['I', 'V', 'X'].contains(&c)
}

/// Returns `true` when `c` is one of the lower-case Roman numeral letters
/// `i`, `v` or `x`.
fn is_lower_roman_char(c: char) -> bool {
    c == 'i' || c == 'v' || c == 'x'
}

fn roman_case(text: &str) -> Option<bool> {
    if text.chars().all(is_upper_roman_char) {
        return Some(true);
    }
    if text.chars().all(is_lower_roman_char) {
        return Some(false);
    }
    None
}

/// Checks whether `text` is a well-formed Roman numeral in the range 1..=39.
///
/// The text must be non-empty and accepted by [`roman_case`] (single case,
/// only `I`/`V`/`X` letters). It is then read as up to three leading `X`
/// followed by an optional ones part (`I` through `IX`).
fn is_valid_roman_1_to_39(text: &str) -> bool {
    if text.is_empty() || roman_case(text).is_none() {
        return false;
    }

    // Compare case-insensitively by normalising to upper case once.
    let normalized = text.to_ascii_uppercase();

    // Everything after the run of leading `X` is the ones part. `X` is a
    // single byte, so the length difference equals the number of tens.
    let ones = normalized.trim_start_matches('X');
    let tens = normalized.len() - ones.len();
    if tens > 3 {
        return false;
    }

    let ones_valid = [
        "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX",
    ]
    .contains(&ones);

    // At least one of the two parts has to be present.
    ones_valid && !(tens == 0 && ones.is_empty())
}

/// Splits `text` into its longest leading run of Roman numeral letters
/// (either case, possibly mixed) and the remainder.
///
/// The first element is empty when `text` does not start with such a letter;
/// the second is empty when the whole string consists of them.
fn split_roman_prefix(text: &str) -> (&str, &str) {
    // Byte offset of the first character that is not a Roman letter, or the
    // end of the string when there is none.
    let boundary = text
        .find(|ch: char| !(is_upper_roman_char(ch) || is_lower_roman_char(ch)))
        .unwrap_or(text.len());
    text.split_at(boundary)
}

/// Parses a `-` followed by a run of Roman numeral letters.
///
/// Returns `Some((roman, rest))` where `roman` is the non-empty letter run
/// right after the hyphen and `rest` is whatever follows it. Returns `None`
/// when `text` does not start with `-` or no Roman letter follows the hyphen.
fn split_after_hyphen(text: &str) -> Option<(&str, &str)> {
    let tail = text.strip_prefix('-')?;
    match split_roman_prefix(tail) {
        ("", _) => None,
        parts => Some(parts),
    }
}

/// Returns `true` when `text` is non-empty and its first character is an
/// ASCII letter (`a`-`z` or `A`-`Z`).
fn starts_with_ascii_alpha(text: &str) -> bool {
    // An ASCII character is always a single byte, and the first byte of a
    // multi-byte character is never ASCII, so the first byte is sufficient.
    match text.as_bytes().first() {
        Some(byte) => byte.is_ascii_alphabetic(),
        None => false,
    }
}

/// Reports whether the nearest `Token::Word` before `index` has its
/// `has_ascii_alphabetic` metadata flag set.
///
/// Non-word tokens are skipped while scanning backwards. Returns `false`
/// when no word token precedes `index`.
fn has_prev_ascii_word<'a>(tokens: &[Token<'a>], index: usize) -> bool {
    for token in tokens[..index].iter().rev() {
        // Only the closest preceding word decides the outcome.
        if let Token::Word(word) = token {
            return word.meta.has_ascii_alphabetic;
        }
    }
    false
}

/// Builds an owned `Token::Word` from `text`.
///
/// The token stores its own copy of the text, the text's characters, and
/// the metadata that `WordMeta::from_chars` derives from those characters.
fn build_word_token<'a>(text: &str) -> Token<'a> {
    let chars: Vec<char> = text.chars().collect();
    // Derive the metadata before moving `chars` into the token so the vector
    // does not need to be cloned.
    let meta = WordMeta::from_chars(&chars);
    Token::Word(WordToken {
        text: Cow::Owned(text.to_owned()),
        chars,
        meta,
    })
}

/// Encodes one Roman-numeral segment into braille cell bytes.
///
/// The output is, in order: the `entry` byte, `UPPERCASE_SIGN` (once for a
/// single upper-case letter, twice for two or more; none for lower case),
/// the English encoding of each letter in lower case, and finally
/// `ROMAN_TERMINATOR` when `with_terminator` is set.
///
/// # Errors
///
/// Returns an error when [`roman_case`] rejects `text`, or when
/// `crate::english::encode_english` fails for one of the letters.
fn encode_roman_segment(text: &str, entry: u8, with_terminator: bool) -> Result<Vec<u8>, String> {
    let Some(is_upper) = roman_case(text) else {
        return Err(format!("invalid roman case: {text}"));
    };

    let mut out = vec![entry];

    if is_upper {
        out.push(UPPERCASE_SIGN);
        // A second sign is added when the segment has more than one letter.
        if text.chars().nth(1).is_some() {
            out.push(UPPERCASE_SIGN);
        }
    }

    // Letters are always encoded in lower case; case is carried by the signs.
    for letter in text.chars() {
        let cell = crate::english::encode_english(letter.to_ascii_lowercase())?;
        out.push(cell);
    }

    if with_terminator {
        out.push(ROMAN_TERMINATOR);
    }

    Ok(out)
}

impl TokenRule for RomanNumeralRule {
    fn phase(&self) -> TokenPhase {
        TokenPhase::ModeEntry
    }

    fn priority(&self) -> u16 {
        5
    }

    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        state: &mut crate::rules::context::EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let Some(Token::Word(word)) = tokens.get(index) else {
            return Ok(TokenAction::Noop);
        };

        let text = word.text.as_ref();

        let allow_standalone = state.english_indicator || text.chars().count() >= 2;
        if allow_standalone && is_valid_roman_1_to_39(text) {
            let bytes = encode_roman_segment(text, ROMAN_INDICATOR, true)?;
            return Ok(TokenAction::Replace(Token::PreEncoded(bytes)));
        }

        if !state.english_indicator {
            return Ok(TokenAction::Noop);
        }

        let (first, rest) = split_roman_prefix(text);
        if first.is_empty() || !is_valid_roman_1_to_39(first) {
            return Ok(TokenAction::Noop);
        }

        if let Some((second, suffix)) = split_after_hyphen(rest)
            && is_valid_roman_1_to_39(second)
            && !starts_with_ascii_alpha(suffix)
        {
            let first_entry = if has_prev_ascii_word(tokens, index) {
                ROMAN_CONTINUATION
            } else {
                ROMAN_INDICATOR
            };
            let mut bytes = encode_roman_segment(first, first_entry, false)?;
            bytes.push(HYPHEN);
            bytes.extend(encode_roman_segment(second, ROMAN_CONTINUATION, true)?);

            if suffix.is_empty() {
                return Ok(TokenAction::Replace(Token::PreEncoded(bytes)));
            }

            return Ok(TokenAction::ReplaceMany(vec![
                Token::PreEncoded(bytes),
                build_word_token(suffix),
            ]));
        }

        if starts_with_ascii_alpha(rest) {
            return Ok(TokenAction::Noop);
        }

        let entry = if has_prev_ascii_word(tokens, index) {
            ROMAN_CONTINUATION
        } else {
            ROMAN_INDICATOR
        };

        let bytes = encode_roman_segment(first, entry, true)?;
        if rest.is_empty() {
            return Ok(TokenAction::Replace(Token::PreEncoded(bytes)));
        }

        Ok(TokenAction::ReplaceMany(vec![
            Token::PreEncoded(bytes),
            build_word_token(rest),
        ]))
    }
}