jpreprocess_core/pronunciation/
mod.rs

1pub mod mora;
2mod mora_dict;
3mod mora_enum;
4pub mod phoneme;
5
6use serde::{Deserialize, Serialize};
7use std::{borrow::Cow, fmt::Display, ops::Range};
8
9pub use mora::*;
10pub use mora_enum::*;
11
/// The ideographic comma.
// NOTE(review): presumably the textual form of `MoraEnum::Touten`; it is not
// referenced by the code in this part of the module — confirm against `mora`.
pub const TOUTEN: &str = "、";
/// A pronunciation string equal to this is parsed as a single `MoraEnum::Question` mora.
pub const QUESTION: &str = "?";
/// Suffix that, when it directly follows a mora in a pronunciation string,
/// makes the parser set that mora's `is_voiced` to `false`.
pub const QUOTATION: &str = "’";
15
/// Builds a `Pronunciation` literal from a list of `MoraEnum` variant names.
///
/// The first argument is a bracketed, comma-separated list of `MoraEnum`
/// variant identifiers; the second is the accent expression. Every mora is
/// created with `is_voiced: true`, and the mora array literal is wrapped in
/// `Cow::Borrowed`, so no `Vec` is built.
///
/// A trailing comma is accepted both inside the mora list and after the
/// accent expression.
///
/// ```ignore
/// let p = pron!([O, Tsu, Ka, Re], 0);
/// let q = pron!([O, Tsu, Ka, Re,], 0,);
/// assert_eq!(p, q);
/// ```
#[macro_export]
macro_rules! pron {
    ([$($x:ident),* $(,)?], $acc:expr $(,)?) => {
        {
            $crate::pronunciation::Pronunciation {
                moras: ::std::borrow::Cow::Borrowed(&[
                    $(
                        $crate::pronunciation::Mora {
                            mora_enum: $crate::pronunciation::MoraEnum::$x,
                            is_voiced: true,
                        },
                    )*
                ]),
                accent: $acc,
            }
        }
    };
}
34
35#[derive(Debug, thiserror::Error)]
36pub enum PronunciationParseError {
37    #[error("`{0}` could not be parsed as mora")]
38    UnknownMora(String),
39    #[error("Provided mora size {0} is different from that of calculated from pronunciation {1}")]
40    MoraSizeMismatch(usize, usize),
41    #[error("Failed to parse as integer: {0}")]
42    NumberParseError(#[from] std::num::ParseIntError),
43}
44
45/// Pronunciation.
46///
47/// Do not access moras and accent directly unless through [`pron`] macro.
48#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Debug, Default)]
49pub struct Pronunciation {
50    #[doc(hidden)]
51    pub moras: Cow<'static, [Mora]>,
52    #[doc(hidden)]
53    pub accent: usize,
54}
55
56impl Pronunciation {
57    pub fn new(moras: Vec<Mora>, accent: usize) -> Self {
58        Self {
59            moras: Cow::Owned(moras),
60            accent,
61        }
62    }
63
64    pub fn mora_size(&self) -> usize {
65        self.moras
66            .iter()
67            .filter(|mora| !matches!(mora.mora_enum, MoraEnum::Question | MoraEnum::Touten))
68            .count()
69    }
70
71    pub fn is_empty(&self) -> bool {
72        self.moras.is_empty()
73    }
74
75    pub fn mora_matches(&self, mora_enum: MoraEnum) -> bool {
76        let Some((first, rest)) = self.moras.split_first() else {
77            return false;
78        };
79        rest.is_empty() && first.mora_enum == mora_enum
80    }
81    pub fn is_question(&self) -> bool {
82        self.mora_matches(MoraEnum::Question)
83    }
84    pub fn is_touten(&self) -> bool {
85        self.mora_matches(MoraEnum::Touten)
86    }
87
88    pub fn is_mora_convertable(s: &str) -> bool {
89        mora_dict::MORA_STR_LIST.contains(&s)
90    }
91
92    pub fn to_pure_string(&self) -> String {
93        self.moras
94            .iter()
95            .map(|mora| mora.to_string())
96            .fold(String::new(), |a, b| a + &b)
97    }
98
99    #[inline]
100    pub fn moras(&self) -> &[Mora] {
101        self.moras.as_ref()
102    }
103    #[inline]
104    pub fn moras_mut(&mut self) -> &mut [Mora] {
105        self.moras.to_mut()
106    }
107
108    pub fn accent(&self) -> usize {
109        self.accent
110    }
111    pub fn set_accent(&mut self, accent: usize) {
112        self.accent = accent;
113    }
114
115    pub fn transfer_from(&mut self, from: &Self) {
116        let moras = self
117            .moras()
118            .iter()
119            .chain(from.moras())
120            .cloned()
121            .collect::<Vec<_>>();
122        self.moras = Cow::Owned(moras);
123    }
124}
125
126impl Pronunciation {
127    pub(crate) fn parse_csv_pron(
128        pron: &str,
129        acc_morasize: &str,
130    ) -> Result<Self, PronunciationParseError> {
131        let (accent, mora_size) = match acc_morasize.split_once('/') {
132            Some(("*" | "", "*" | "")) => (None, None),
133            Some((acc, mora_size)) => (Some(acc.parse()?), Some(mora_size.parse()?)),
134            None => match acc_morasize {
135                "*" | "" => (None, None),
136                acc => (Some(acc.parse()?), None),
137            },
138        };
139        let pronunciation = Self::parse(pron, accent.unwrap_or(0))?;
140
141        if let Some(mora_size) = mora_size {
142            if pronunciation.mora_size() != mora_size {
143                return Err(PronunciationParseError::MoraSizeMismatch(
144                    mora_size,
145                    pronunciation.mora_size(),
146                ));
147            }
148        }
149
150        Ok(pronunciation)
151    }
152
153    pub fn parse(moras: &str, accent: usize) -> Result<Self, PronunciationParseError> {
154        let parsed = Self::parse_mora_str(moras);
155        let result = if parsed.len() > 1 {
156            let range = parsed[1].0.clone();
157            return Err(PronunciationParseError::UnknownMora(
158                moras[range].to_string(),
159            ));
160        } else {
161            parsed.first().cloned().unwrap_or_default().1
162        };
163
164        Ok(Self::new(result, accent))
165    }
166
167    pub fn parse_mora_str(s: &str) -> Vec<(Range<usize>, Vec<Mora>)> {
168        if s == "*" {
169            return vec![];
170        } else if s == QUESTION {
171            return vec![(
172                0..QUESTION.len(),
173                vec![Mora {
174                    mora_enum: MoraEnum::Question,
175                    is_voiced: true,
176                }],
177            )];
178        }
179
180        let mut result = Vec::new();
181
182        let mut segment_start_point = 0;
183        let mut current_moras = Vec::new();
184        let mut current_position = 0;
185        for match_result in mora_dict::MORA_DICT_AHO_CORASICK.find_iter(s) {
186            if current_position != match_result.start() {
187                if !current_moras.is_empty() {
188                    result.push((segment_start_point..current_position, current_moras.clone()));
189                    current_moras.clear();
190                    segment_start_point = current_position;
191                }
192
193                result.push((
194                    segment_start_point..match_result.start(),
195                    vec![Mora {
196                        mora_enum: MoraEnum::Touten,
197                        is_voiced: true,
198                    }],
199                ));
200                segment_start_point = match_result.start();
201            }
202
203            let quotation = s[match_result.end()..].starts_with(QUOTATION);
204
205            current_moras.extend(
206                mora_dict::get_mora_enum(match_result.pattern().as_usize())
207                    .into_iter()
208                    .map(|mora_enum| Mora {
209                        mora_enum,
210                        is_voiced: !quotation,
211                    }),
212            );
213
214            current_position = match_result.end();
215            if quotation {
216                current_position += QUOTATION.len();
217            }
218        }
219
220        if !current_moras.is_empty() {
221            result.push((segment_start_point..current_position, current_moras));
222        }
223        if current_position != s.len() {
224            result.push((
225                current_position..s.len(),
226                vec![Mora {
227                    mora_enum: MoraEnum::Touten,
228                    is_voiced: true,
229                }],
230            ));
231        }
232
233        result
234    }
235}
236
237impl Display for Pronunciation {
238    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
239        f.write_str(
240            &self
241                .moras
242                .iter()
243                .fold(String::new(), |acc, mora| format!("{}{}", acc, mora)),
244        )
245    }
246}
247
#[cfg(test)]
mod test {
    use super::{Mora, MoraEnum, Pronunciation};

    /// Shorthand for a mora with `is_voiced: true`.
    fn voiced(mora_enum: MoraEnum) -> Mora {
        Mora {
            mora_enum,
            is_voiced: true,
        }
    }

    /// Shorthand for a mora with `is_voiced: false`.
    fn unvoiced(mora_enum: MoraEnum) -> Mora {
        Mora {
            mora_enum,
            is_voiced: false,
        }
    }

    #[test]
    fn parse_normal() {
        let input = "オツカレサマデシ’タ";
        assert_eq!(
            Pronunciation::parse_mora_str(input),
            vec![(
                // The quotation mark is part of the segment, so the single
                // segment spans the whole input.
                0..input.len(),
                vec![
                    voiced(MoraEnum::O),
                    voiced(MoraEnum::Tsu),
                    voiced(MoraEnum::Ka),
                    voiced(MoraEnum::Re),
                    voiced(MoraEnum::Sa),
                    voiced(MoraEnum::Ma),
                    voiced(MoraEnum::De),
                    unvoiced(MoraEnum::Shi),
                    voiced(MoraEnum::Ta),
                ]
            )]
        )
    }

    #[test]
    fn parse_symbol() {
        // The expected range is derived from the input so that it stays
        // correct whatever the byte width of the symbol is.
        let symbol = ";";
        assert_eq!(
            Pronunciation::parse_mora_str(symbol),
            vec![(0..symbol.len(), vec![voiced(MoraEnum::Touten)])]
        )
    }

    #[test]
    fn parse_empty() {
        assert_eq!(Pronunciation::parse_mora_str(""), vec![])
    }

    #[test]
    fn parse_multiple_segments() {
        let (first, separator, second, trailer) = ("バリー", "・", "ペーン", ",");
        let input = [first, separator, second, trailer].concat();

        // Segment boundaries in bytes, computed from the pieces above.
        let first_end = first.len();
        let separator_end = first_end + separator.len();
        let second_end = separator_end + second.len();

        assert_eq!(
            Pronunciation::parse_mora_str(&input),
            vec![
                (
                    0..first_end,
                    vec![
                        voiced(MoraEnum::Ba),
                        voiced(MoraEnum::Ri),
                        voiced(MoraEnum::Long),
                    ]
                ),
                (first_end..separator_end, vec![voiced(MoraEnum::Touten)]),
                (
                    separator_end..second_end,
                    vec![
                        voiced(MoraEnum::Pe),
                        voiced(MoraEnum::Long),
                        voiced(MoraEnum::N),
                    ]
                ),
                (second_end..input.len(), vec![voiced(MoraEnum::Touten)]),
            ]
        )
    }

    #[test]
    fn to_string() {
        assert_eq!(
            Pronunciation::parse("オツカレサマデシ’タ", 0)
                .unwrap()
                .to_string(),
            "オツカレサマデシ’タ"
        );
        assert_eq!(Pronunciation::parse("?", 0).unwrap().to_string(), "?");
        assert_eq!(Pronunciation::parse(".?", 0).unwrap().to_string(), "、");
        assert_eq!(Pronunciation::parse("*", 0).unwrap().to_string(), "");
    }
}