jpreprocess_core/
accent_rule.rs

1use std::{
2    fmt::{Debug, Display},
3    str::FromStr,
4};
5
6use once_cell::sync::Lazy;
7use regex::Regex;
8use serde::{Deserialize, Serialize};
9
10use crate::JPreprocessResult;
11
12use super::pos::POS;
13
/// Errors produced while parsing a single accent chain rule string.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum AccentRuleParseError {
    /// The part-of-speech prefix is not one that `POSMatch::from_str` recognizes.
    /// Carries the offending POS text.
    #[error("Unknown part of speech (POS) {0}")]
    UnknownPOS(String),
    /// The rule text did not match the chain rule pattern.
    /// Carries the whole rule text.
    #[error("Unrecognized syntax {0}")]
    SyntaxError(String),
}
21
/// Lazily compiled pattern describing one chain rule.
///
/// A rule is made of three parts, in this order, each of them optional:
/// * `<pos>%`  - a part-of-speech prefix (named group `pos`),
/// * an accent type such as `F1`-`F5`, `C1`-`C5`, `P1`, `P2`, `P6` or `P14`
///   (named group `accent`),
/// * `@<add>`  - a suffix made of digits and `-` (named group `add`).
///
/// Because every part is optional, the empty string matches as well.
///
/// Compilation failure panics with the `expect` message on first use.
// NOTE(review): `特殊助動詞` is accepted by the `pos` group, but
// `POSMatch::from_str` has no arm for it, so such a rule ends up as
// `UnknownPOS` - confirm that this is intended.
static PARSE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new("^((?P<pos>名詞|形容詞|助詞|特殊助動詞|動詞)%)?(?P<accent>[FC][1-5]|P1|P2|P6|P14)?(@(?P<add>[-0-9]+))?$")
        .expect("Failed to compile accent rule regex")
});
26
/// Accent type part of a chain rule.
///
/// The textual form of each variant is its own name (see the `FromStr` and
/// `Display` implementations), except for [`AccentType::None`], which is
/// parsed from `""` or `"*"` and displayed as `*`.
///
/// The commented-out variants below are not accepted by the parser.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum AccentType {
    F1,
    F2,
    F3,
    F4,
    F5,
    //F6,
    C1,
    C2,
    C3,
    C4,
    C5,
    P1,
    P2,
    //P4,
    P6,
    //P13,
    P14,
    /// No accent type was given.
    None,
}
48
49impl FromStr for AccentType {
50    type Err = ();
51    fn from_str(s: &str) -> Result<Self, Self::Err> {
52        match s {
53            "F1" => Ok(Self::F1),
54            "F2" => Ok(Self::F2),
55            "F3" => Ok(Self::F3),
56            "F4" => Ok(Self::F4),
57            "F5" => Ok(Self::F5),
58            "C1" => Ok(Self::C1),
59            "C2" => Ok(Self::C2),
60            "C3" => Ok(Self::C3),
61            "C4" => Ok(Self::C4),
62            "C5" => Ok(Self::C5),
63            "P1" => Ok(Self::P1),
64            "P2" => Ok(Self::P2),
65            "P6" => Ok(Self::P6),
66            "P14" => Ok(Self::P14),
67            "" | "*" => Ok(Self::None),
68            _ => Err(()),
69        }
70    }
71}
72
73impl Display for AccentType {
74    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
75        f.write_str(match &self {
76            Self::F1 => "F1",
77            Self::F2 => "F2",
78            Self::F3 => "F3",
79            Self::F4 => "F4",
80            Self::F5 => "F5",
81            Self::C1 => "C1",
82            Self::C2 => "C2",
83            Self::C3 => "C3",
84            Self::C4 => "C4",
85            Self::C5 => "C5",
86            Self::P1 => "P1",
87            Self::P2 => "P2",
88            Self::P6 => "P6",
89            Self::P14 => "P14",
90            Self::None => "*",
91        })
92    }
93}
94
/// Accent sandhi rule: an accent type together with a signed offset.
///
/// Its textual form is `<accent_type>` when `add_type` is zero and
/// `<accent_type>@<add_type>` otherwise (see the `Display` implementation).
#[derive(Clone, PartialEq, Serialize, Deserialize, Debug)]
pub struct ChainRule {
    /// Accent type of the rule; [`AccentType::None`] when the rule text names none.
    pub accent_type: AccentType,
    /// Value written after `@` in the rule text; `0` when there is no `@` part.
    pub add_type: isize,
}
101
102impl ChainRule {
103    pub fn new(accent_type: AccentType, add_type: isize) -> Self {
104        Self {
105            accent_type,
106            add_type,
107        }
108    }
109}
110
111impl Display for ChainRule {
112    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
113        if self.add_type == 0 {
114            write!(f, "{}", self.accent_type)
115        } else {
116            write!(f, "{}@{}", self.accent_type, self.add_type)
117        }
118    }
119}
120
/// Selects which slot of [`ChainRules`] a parsed rule belongs to.
///
/// The enum is a plain, field-less tag, so it is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum POSMatch {
    /// The rule had no part-of-speech prefix.
    Default,
    /// The rule was prefixed with `動詞` (verb).
    Doushi,
    /// The rule was prefixed with `助詞` (particle).
    Joshi,
    /// The rule was prefixed with `形容詞` (adjective).
    Keiyoushi,
    /// The rule was prefixed with `名詞` (noun).
    Meishi,
}
129
130impl FromStr for POSMatch {
131    type Err = AccentRuleParseError;
132    fn from_str(s: &str) -> Result<Self, Self::Err> {
133        match s {
134            "動詞" => Ok(Self::Doushi),
135            "助詞" => Ok(Self::Joshi),
136            "形容詞" => Ok(Self::Keiyoushi),
137            "名詞" => Ok(Self::Meishi),
138            _ => Err(AccentRuleParseError::UnknownPOS(s.to_string())),
139        }
140    }
141}
142
/// Set of chain rules, one optional rule per supported part of speech plus
/// an optional default.
///
/// The `Default` value has every slot empty and is displayed as `*`.
#[derive(Clone, PartialEq, Serialize, Deserialize, Debug, Default)]
pub struct ChainRules {
    /// Rule given without a part-of-speech prefix; `get_rule` falls back to it
    /// when no POS-specific rule applies.
    pub default: Option<ChainRule>,
    /// Rule given with the `動詞` prefix.
    pub doushi: Option<ChainRule>,
    /// Rule given with the `助詞` prefix.
    pub joshi: Option<ChainRule>,
    /// Rule given with the `形容詞` prefix.
    pub keiyoushi: Option<ChainRule>,
    /// Rule given with the `名詞` prefix.
    pub meishi: Option<ChainRule>,
}
151
152impl ChainRules {
153    pub fn new(rules: &str) -> Self {
154        let mut result = Self::default();
155        if rules == "*" {
156            return result;
157        }
158
159        for rule in rules.split('/') {
160            if result.push_rule(rule).is_err() {
161                eprintln!("WARN: accent rule parsing has failed in {}. Skipped.", rule);
162            }
163        }
164        result
165    }
166
167    fn push_rule(&mut self, rule_str: &str) -> JPreprocessResult<()> {
168        let (pos, rule) = Self::parse_rule(rule_str)?;
169        match pos {
170            POSMatch::Doushi => self.doushi.replace(rule),
171            POSMatch::Joshi => self.joshi.replace(rule),
172            POSMatch::Keiyoushi => self.keiyoushi.replace(rule),
173            POSMatch::Meishi => self.meishi.replace(rule),
174            POSMatch::Default => self.default.replace(rule),
175        };
176        Ok(())
177    }
178
179    fn parse_rule(rule: &str) -> JPreprocessResult<(POSMatch, ChainRule)> {
180        let capture = PARSE_REGEX
181            .captures(rule)
182            .ok_or_else(|| AccentRuleParseError::SyntaxError(rule.to_string()))?;
183
184        let pos = {
185            if let Some(pos) = capture.name("pos") {
186                POSMatch::from_str(pos.as_str())?
187            } else {
188                POSMatch::Default
189            }
190        };
191
192        let accent_type = if let Some(matched) = capture.name("accent") {
193            // This is guaranteed to success by regex
194            AccentType::from_str(matched.as_str()).unwrap()
195        } else {
196            AccentType::None
197        };
198
199        let add_type = capture
200            .name("add")
201            .and_then(|matched| matched.as_str().parse().ok())
202            .unwrap_or(0);
203
204        Ok((pos, ChainRule::new(accent_type, add_type)))
205    }
206
207    pub fn get_rule(&self, pos: &POS) -> Option<&ChainRule> {
208        let rule = match pos {
209            POS::Doushi(_) => self.doushi.as_ref(),
210            POS::Joshi(_) => self.joshi.as_ref(),
211            POS::Keiyoushi(_) => self.keiyoushi.as_ref(),
212            POS::Meishi(_) => self.meishi.as_ref(),
213            _ => None,
214        };
215        rule.or(self.default.as_ref())
216    }
217
218    pub fn unset(&mut self) {
219        self.default = None;
220        self.doushi = None;
221        self.joshi = None;
222        self.keiyoushi = None;
223        self.meishi = None;
224    }
225}
226
227impl Display for ChainRules {
228    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
229        let text = &[
230            ("", &self.default),
231            ("動詞", &self.doushi),
232            ("助詞", &self.joshi),
233            ("形容詞", &self.keiyoushi),
234            ("名詞", &self.meishi),
235        ]
236        .iter()
237        .filter(|(_name, chainrule_option)| chainrule_option.is_some())
238        .fold(String::new(), |acc, (pos, chainrule_option)| {
239            let chainrule = chainrule_option.as_ref().unwrap();
240            let delim = if acc.is_empty() { "" } else { "/" };
241            if pos.is_empty() {
242                format!("{}{}{}", acc, delim, chainrule)
243            } else {
244                format!("{}{}{}%{}", acc, delim, pos, chainrule)
245            }
246        });
247        if text.is_empty() {
248            f.write_str("*")
249        } else {
250            f.write_str(text)
251        }
252    }
253}
254
/// Unit tests for parsing, lookup and serialization of [`ChainRules`].
#[cfg(test)]
mod tests {
    use crate::{accent_rule::AccentType, pos::*};

    use super::ChainRules;

    /// A rule without a POS prefix is stored as the default rule and is
    /// returned for a POS that has no dedicated slot.
    #[test]
    fn simple_rule() {
        let rules = ChainRules::new("C3");
        let rule = rules.get_rule(&POS::Others).unwrap();
        assert_eq!(rule.accent_type, AccentType::C3);
        assert_eq!(rule.add_type, 0);

        assert_eq!(rules.to_string(), "C3");
    }

    /// A rule with POS prefix, accent type and negative offset round-trips.
    #[test]
    fn single_complex_rule() {
        let rules = ChainRules::new("形容詞%F2@-1");
        let rule = rules.get_rule(&POS::Keiyoushi(Keiyoushi::Jiritsu)).unwrap();
        assert_eq!(rule.accent_type, AccentType::F2);
        assert_eq!(rule.add_type, -1);

        assert_eq!(rules.to_string(), "形容詞%F2@-1");
    }

    /// Several `/`-separated rules are stored per POS.
    #[test]
    fn multiple_complex_rule() {
        let rules = ChainRules::new("形容詞%F2@0/動詞%F5");
        let rule1 = rules.get_rule(&POS::Keiyoushi(Keiyoushi::Jiritsu)).unwrap();
        assert_eq!(rule1.accent_type, AccentType::F2);
        assert_eq!(rule1.add_type, 0);
        let rule2 = rules.get_rule(&POS::Doushi(Doushi::Jiritsu)).unwrap();
        assert_eq!(rule2.accent_type, AccentType::F5);
        assert_eq!(rule2.add_type, 0);

        // Serialization uses a fixed slot order (動詞 before 形容詞) and omits
        // a zero offset, so the text differs from the input.
        assert_eq!(rules.to_string(), "動詞%F5/形容詞%F2");
    }

    /// `特殊助詞` is not among the POS names accepted by the rule pattern, so
    /// parsing the rule fails.
    #[test]
    fn reject_invalid_pos() {
        assert!(ChainRules::parse_rule("特殊助詞%F2@0").is_err());
    }

    /// An offset without the leading `@` does not match the rule pattern;
    /// `ChainRules::new` must skip it without panicking.
    #[test]
    fn add_type_only() {
        ChainRules::new("-1");
    }

    /// The default rule is used for a POS that has no rule of its own, even
    /// when it is given after a POS-specific rule.
    #[test]
    fn default_rule_1() {
        let rules = ChainRules::new("形容詞%F2/F5");
        let rule1 = rules.get_rule(&POS::Keiyoushi(Keiyoushi::Jiritsu)).unwrap();
        assert_eq!(rule1.accent_type, AccentType::F2);
        let rule2 = rules.get_rule(&POS::Doushi(Doushi::Jiritsu)).unwrap();
        assert_eq!(rule2.accent_type, AccentType::F5);

        // The default rule is always serialized first.
        assert_eq!(rules.to_string(), "F5/形容詞%F2");
    }

    /// Same as `default_rule_1`, with the default rule given first.
    #[test]
    fn default_rule_2() {
        let rules = ChainRules::new("F5/形容詞%F2");
        let rule1 = rules.get_rule(&POS::Keiyoushi(Keiyoushi::Jiritsu)).unwrap();
        assert_eq!(rule1.accent_type, AccentType::F2);
        let rule2 = rules.get_rule(&POS::Doushi(Doushi::Jiritsu)).unwrap();
        assert_eq!(rule2.accent_type, AccentType::F5);

        assert_eq!(rules.to_string(), "F5/形容詞%F2");
    }

    /// `*` parses to a rule set with every slot empty and serializes back to `*`.
    #[test]
    fn empty() {
        let rules = ChainRules::new("*");
        assert_eq!(rules.default, None);
        assert_eq!(rules.doushi, None);
        assert_eq!(rules.joshi, None);
        assert_eq!(rules.keiyoushi, None);
        assert_eq!(rules.meishi, None);

        assert_eq!(rules.to_string(), "*");
    }
}