use crate::english::encode_english;
use crate::number::encode_number;
use crate::rules::context::EncoderState;
use crate::rules::token::Token;
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
use crate::unicode::decode_unicode;
/// Token rule that rewrites a citation year carrying a letter suffix and one trailing
/// punctuation mark (four ASCII digits, one lowercase ASCII letter, then `,`, `;` or `.`,
/// e.g. `1998a,`) into a single pre-encoded braille token.
pub struct Rule33CitationYearSuffixRule;
/// Reports whether `bytes` has the exact shape of a year-suffix citation emitted by this rule.
///
/// The accepted layout, position by position, is:
/// - byte 0: `60`;
/// - bytes 1..=4: four digit cells, each one of 1, 3, 9, 10, 11, 17, 19, 25, 26, 27;
/// - byte 5: `48` or `52`;
/// - byte 6: any value in `1..=63`;
/// - tail: exactly `[2]`, `[50]` or `[48, 6]`.
///
/// Anything shorter, longer, or deviating in any position yields `false`.
fn is_rule33_emission(bytes: &[u8]) -> bool {
    const DIGIT_CELLS: [u8; 10] = [1, 3, 9, 10, 11, 17, 19, 25, 26, 27];

    // The fixed prefix and indicator are validated by the pattern itself; the tail pattern
    // below pins the overall length to 8 or 9 bytes.
    let [60, y0, y1, y2, y3, 48 | 52, letter, tail @ ..] = bytes else {
        return false;
    };

    [*y0, *y1, *y2, *y3]
        .iter()
        .all(|cell| DIGIT_CELLS.contains(cell))
        && (1..=63).contains(letter)
        && matches!(tail, [2] | [50] | [48, 6])
}
/// Splits a citation token of the form `YYYYx<punct>` into its parts.
///
/// A match requires exactly six characters: four ASCII digits, one lowercase ASCII letter and
/// one of `,`, `;` or `.`. On success the result is
/// `(year_digits, suffix_letter, punctuation)`, where `year_digits` borrows from `text`.
/// Any other input returns `None`.
///
/// Every character of a matching token is ASCII, so the check works directly on the UTF-8
/// bytes: six bytes that all pass the ASCII predicates are necessarily six characters. This
/// avoids allocating a `Vec<char>` for every word token inspected.
fn match_year_suffix(text: &str) -> Option<(&str, char, char)> {
    match text.as_bytes() {
        [y0, y1, y2, y3, letter, punct @ (b',' | b';' | b'.')]
            if [y0, y1, y2, y3].iter().all(|digit| digit.is_ascii_digit())
                && letter.is_ascii_lowercase() =>
        {
            // All six bytes are ASCII here, so byte offset 4 is a valid char boundary.
            Some((&text[..4], char::from(*letter), char::from(*punct)))
        }
        _ => None,
    }
}
impl TokenRule for Rule33CitationYearSuffixRule {
    /// This rule runs in the normalization phase.
    fn phase(&self) -> TokenPhase {
        TokenPhase::Normalization
    }

    /// Priority value reported for this rule.
    fn priority(&self) -> u16 {
        50
    }

    /// Replaces a year-suffix citation word at `index` with its pre-encoded cells.
    ///
    /// Returns `TokenAction::Noop` when the token at `index` is missing, is not a word, or does
    /// not match the `YYYYx<punct>` shape. Otherwise the emitted sequence is: `⠼`, the four
    /// encoded digits, an indicator cell (`⠰` when the nearest preceding non-space token is
    /// itself a year-suffix citation, `⠴` otherwise), the encoded suffix letter, and the
    /// punctuation cells (`⠂` for `,`, `⠰⠆` for `;`, `⠲` for `.`).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `encode_number` or `encode_english`.
    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        _state: &mut EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let (year, suffix, mark) = match tokens.get(index) {
            Some(Token::Word(word)) => match match_year_suffix(word.text.as_ref()) {
                Some(parts) => parts,
                None => return Ok(TokenAction::Noop),
            },
            _ => return Ok(TokenAction::Noop),
        };

        // A citation directly following another one gets the continuation indicator.
        let indicator = if check_prev_is_same_pattern(tokens, index) {
            '⠰'
        } else {
            '⠴'
        };

        let mut cells = Vec::new();
        cells.push(decode_unicode('⠼'));
        for digit in year.chars() {
            cells.push(encode_number(digit)?);
        }
        cells.push(decode_unicode(indicator));
        cells.push(encode_english(suffix)?);

        match mark {
            ',' => cells.push(decode_unicode('⠂')),
            ';' => {
                cells.push(decode_unicode('⠰'));
                cells.push(decode_unicode('⠆'));
            }
            // `match_year_suffix` only admits `,`, `;` and `.`, so this arm is the period.
            _ => cells.push(decode_unicode('⠲')),
        }

        Ok(TokenAction::Replace(Token::PreEncoded(cells)))
    }
}
/// Reports whether the nearest non-space token before `index` is a year-suffix citation.
///
/// Space tokens are skipped while scanning backwards. The first other token decides the
/// result: a word counts when it matches `match_year_suffix`, a pre-encoded token counts when
/// it passes `is_rule33_emission`, and any other token kind yields `false`. The result is also
/// `false` when nothing precedes `index` or when `index` lies beyond the end of `tokens`.
fn check_prev_is_same_pattern(tokens: &[Token<'_>], index: usize) -> bool {
    let Some(preceding) = tokens.get(..index) else {
        return false;
    };

    let nearest = preceding
        .iter()
        .rev()
        .find(|token| !matches!(token, Token::Space(_)));

    match nearest {
        Some(Token::Word(word)) => match_year_suffix(word.text.as_ref()).is_some(),
        Some(Token::PreEncoded(cells)) => is_rule33_emission(cells),
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::token::{SpaceKind, WordMeta, WordToken};
    use std::borrow::Cow;

    /// Builds an owned `Token::Word` for `text`.
    fn word_token<'a>(text: &str) -> Token<'a> {
        let chars: Vec<char> = text.chars().collect();
        let meta = WordMeta::from_chars(&chars);
        Token::Word(WordToken {
            text: Cow::Owned(text.to_string()),
            chars,
            meta,
        })
    }

    /// Applies the rule at `index` with a fresh `EncoderState::new(false)` and unwraps the
    /// result.
    fn run_rule<'a>(tokens: &[Token<'a>], index: usize) -> TokenAction<'a> {
        let mut state = EncoderState::new(false);
        Rule33CitationYearSuffixRule
            .apply(tokens, index, &mut state)
            .unwrap()
    }

    /// Returns the indicator cell (position 5) of a replacement, panicking on any other action.
    fn indicator_cell(action: TokenAction<'_>) -> u8 {
        match action {
            TokenAction::Replace(Token::PreEncoded(bytes)) => bytes[5],
            _ => panic!("expected Replace"),
        }
    }

    #[test]
    fn rule_phase_priority() {
        let rule = Rule33CitationYearSuffixRule;
        assert!(matches!(rule.phase(), TokenPhase::Normalization));
        assert_eq!(rule.priority(), 50);
    }

    #[rstest::rstest]
    #[case::valid_with_comma("1998a,", true)]
    #[case::valid_with_semicolon("2024z;", true)]
    #[case::valid_with_period("1900b.", true)]
    #[case::missing_punctuation("1998a", false)]
    #[case::too_many_letters("1998abc,", false)]
    #[case::non_digit_in_year("199xa,", false)]
    #[case::uppercase_letter("1998A,", false)]
    #[case::wrong_punctuation("1998a!", false)]
    fn match_year_suffix_paths(#[case] input: &str, #[case] is_match: bool) {
        assert_eq!(match_year_suffix(input).is_some(), is_match);
    }

    #[test]
    fn is_rule33_emission_detects_own_output() {
        // Shapes the rule itself can emit: both indicators and all three punctuation tails.
        let accepted: [&[u8]; 4] = [
            &[60, 1, 11, 11, 27, 52, 1, 2],
            &[60, 1, 11, 11, 27, 48, 1, 2],
            &[60, 1, 11, 11, 27, 52, 1, 50],
            &[60, 1, 11, 11, 27, 52, 1, 48, 6],
        ];
        for bytes in accepted.iter() {
            assert!(is_rule33_emission(bytes));
        }

        // One deviation per entry: empty, too short, wrong prefix, bad digit, bad indicator,
        // bad letter, bad punctuation.
        let rejected: [&[u8]; 7] = [
            &[],
            &[60, 1, 11, 11, 27, 52, 1],
            &[59, 1, 11, 11, 27, 52, 1, 2],
            &[60, 1, 11, 11, 99, 52, 1, 2],
            &[60, 1, 11, 11, 27, 99, 1, 2],
            &[60, 1, 11, 11, 27, 52, 99, 2],
            &[60, 1, 11, 11, 27, 52, 1, 99],
        ];
        for bytes in rejected.iter() {
            assert!(!is_rule33_emission(bytes));
        }
    }

    #[test]
    fn apply_non_word_noop() {
        let tokens = [Token::Space(SpaceKind::Regular)];
        assert!(matches!(run_rule(&tokens, 0), TokenAction::Noop));
    }

    #[test]
    fn apply_plain_word_noop() {
        let tokens = [word_token("hello")];
        assert!(matches!(run_rule(&tokens, 0), TokenAction::Noop));
    }

    #[test]
    fn apply_year_suffix_comma() {
        let tokens = [word_token("1998a,")];
        assert!(matches!(
            run_rule(&tokens, 0),
            TokenAction::Replace(Token::PreEncoded(_))
        ));
    }

    #[test]
    fn apply_year_suffix_semicolon() {
        let tokens = [word_token("1998a;")];
        assert!(matches!(
            run_rule(&tokens, 0),
            TokenAction::Replace(Token::PreEncoded(_))
        ));
    }

    #[test]
    fn apply_year_suffix_period() {
        let tokens = [word_token("1998a.")];
        assert!(matches!(
            run_rule(&tokens, 0),
            TokenAction::Replace(Token::PreEncoded(_))
        ));
    }

    #[test]
    fn apply_continuation_after_year_word() {
        let tokens = [
            word_token("1998a,"),
            Token::Space(SpaceKind::Regular),
            word_token("1998b,"),
        ];
        // A preceding citation word switches the indicator to the continuation cell.
        assert_eq!(indicator_cell(run_rule(&tokens, 2)), 48);
    }

    #[test]
    fn apply_continuation_after_preencoded() {
        let tokens = [
            Token::PreEncoded(vec![60u8, 1, 11, 11, 27, 52, 1, 2]),
            Token::Space(SpaceKind::Regular),
            word_token("1998b,"),
        ];
        // An already-rewritten citation is recognised by its byte shape.
        assert_eq!(indicator_cell(run_rule(&tokens, 2)), 48);
    }

    #[test]
    fn citation_with_mode_token_before_breaks_false() {
        use crate::rules::token::ModeEvent;

        let tokens = [Token::Mode(ModeEvent::EnterEnglish), word_token("1998a,")];
        // A non-space, non-citation predecessor stops the backward scan.
        assert_eq!(indicator_cell(run_rule(&tokens, 1)), 52);
    }
}