jpreprocess_core/pronunciation/
mod.rs1pub mod mora;
2mod mora_dict;
3mod mora_enum;
4pub mod phoneme;
5
6use serde::{Deserialize, Serialize};
7use std::{borrow::Cow, fmt::Display, ops::Range};
8
9pub use mora::*;
10pub use mora_enum::*;
11
/// Textual form of the touten (Japanese comma).
// NOTE(review): presumably the string `MoraEnum::Touten` is rendered as — confirm in `mora_enum`.
pub const TOUTEN: &str = "、";
/// Question-mark string. An input consisting solely of this string is parsed by
/// `Pronunciation::parse_mora_str` into a single `MoraEnum::Question` mora.
pub const QUESTION: &str = "?";
/// Marker that may directly follow a mora in a pronunciation string. The parser
/// consumes it and records the preceding mora with `is_voiced: false`.
pub const QUOTATION: &str = "’";
15
/// Builds a `Pronunciation` from a bracketed, comma-separated list of
/// `MoraEnum` variant names and an accent expression.
///
/// Every listed mora is created with `is_voiced: true`. The moras are stored
/// as a borrowed slice (`Cow::Borrowed`) rather than an owned `Vec`.
///
/// ```ignore
/// let pron = pron!([O, Tsu, Ka, Re], 0);
/// ```
#[macro_export]
macro_rules! pron {
    ([$($mora:ident),*], $accent:expr) => {{
        $crate::pronunciation::Pronunciation {
            moras: ::std::borrow::Cow::Borrowed(&[$(
                $crate::pronunciation::Mora {
                    mora_enum: $crate::pronunciation::MoraEnum::$mora,
                    is_voiced: true,
                },
            )*]),
            accent: $accent,
        }
    }};
}
34
/// Errors produced while parsing a pronunciation string and its
/// accent / mora-size field.
#[derive(Debug, thiserror::Error)]
pub enum PronunciationParseError {
    /// A part of the input could not be interpreted as mora; carries that
    /// part of the input text.
    #[error("`{0}` could not be parsed as mora")]
    UnknownMora(String),
    /// The mora size given alongside the pronunciation (first value) differs
    /// from the mora size computed from the parsed pronunciation (second value).
    #[error("Provided mora size {0} is different from that of calculated from pronunciation {1}")]
    MoraSizeMismatch(usize, usize),
    /// The accent or mora-size text was not a valid integer.
    #[error("Failed to parse as integer: {0}")]
    NumberParseError(#[from] std::num::ParseIntError),
}
44
/// A sequence of moras together with an accent value.
///
/// The moras live in a `Cow`, so a value can either borrow a `'static` slice
/// (as the `pron!` macro does) or own a `Vec` (as `Pronunciation::new` does).
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Debug, Default)]
pub struct Pronunciation {
    // The moras making up this pronunciation, in order.
    #[doc(hidden)]
    pub moras: Cow<'static, [Mora]>,
    // NOTE(review): presumably the accent nucleus position counted in moras — confirm with callers.
    #[doc(hidden)]
    pub accent: usize,
}
55
56impl Pronunciation {
57 pub fn new(moras: Vec<Mora>, accent: usize) -> Self {
58 Self {
59 moras: Cow::Owned(moras),
60 accent,
61 }
62 }
63
64 pub fn mora_size(&self) -> usize {
65 self.moras
66 .iter()
67 .filter(|mora| !matches!(mora.mora_enum, MoraEnum::Question | MoraEnum::Touten))
68 .count()
69 }
70
71 pub fn is_empty(&self) -> bool {
72 self.moras.is_empty()
73 }
74
75 pub fn mora_matches(&self, mora_enum: MoraEnum) -> bool {
76 let Some((first, rest)) = self.moras.split_first() else {
77 return false;
78 };
79 rest.is_empty() && first.mora_enum == mora_enum
80 }
81 pub fn is_question(&self) -> bool {
82 self.mora_matches(MoraEnum::Question)
83 }
84 pub fn is_touten(&self) -> bool {
85 self.mora_matches(MoraEnum::Touten)
86 }
87
88 pub fn is_mora_convertable(s: &str) -> bool {
89 mora_dict::MORA_STR_LIST.contains(&s)
90 }
91
92 pub fn to_pure_string(&self) -> String {
93 self.moras
94 .iter()
95 .map(|mora| mora.to_string())
96 .fold(String::new(), |a, b| a + &b)
97 }
98
99 #[inline]
100 pub fn moras(&self) -> &[Mora] {
101 self.moras.as_ref()
102 }
103 #[inline]
104 pub fn moras_mut(&mut self) -> &mut [Mora] {
105 self.moras.to_mut()
106 }
107
108 pub fn accent(&self) -> usize {
109 self.accent
110 }
111 pub fn set_accent(&mut self, accent: usize) {
112 self.accent = accent;
113 }
114
115 pub fn transfer_from(&mut self, from: &Self) {
116 let moras = self
117 .moras()
118 .iter()
119 .chain(from.moras())
120 .cloned()
121 .collect::<Vec<_>>();
122 self.moras = Cow::Owned(moras);
123 }
124}
125
126impl Pronunciation {
127 pub(crate) fn parse_csv_pron(
128 pron: &str,
129 acc_morasize: &str,
130 ) -> Result<Self, PronunciationParseError> {
131 let (accent, mora_size) = match acc_morasize.split_once('/') {
132 Some(("*" | "", "*" | "")) => (None, None),
133 Some((acc, mora_size)) => (Some(acc.parse()?), Some(mora_size.parse()?)),
134 None => match acc_morasize {
135 "*" | "" => (None, None),
136 acc => (Some(acc.parse()?), None),
137 },
138 };
139 let pronunciation = Self::parse(pron, accent.unwrap_or(0))?;
140
141 if let Some(mora_size) = mora_size {
142 if pronunciation.mora_size() != mora_size {
143 return Err(PronunciationParseError::MoraSizeMismatch(
144 mora_size,
145 pronunciation.mora_size(),
146 ));
147 }
148 }
149
150 Ok(pronunciation)
151 }
152
153 pub fn parse(moras: &str, accent: usize) -> Result<Self, PronunciationParseError> {
154 let parsed = Self::parse_mora_str(moras);
155 let result = if parsed.len() > 1 {
156 let range = parsed[1].0.clone();
157 return Err(PronunciationParseError::UnknownMora(
158 moras[range].to_string(),
159 ));
160 } else {
161 parsed.first().cloned().unwrap_or_default().1
162 };
163
164 Ok(Self::new(result, accent))
165 }
166
167 pub fn parse_mora_str(s: &str) -> Vec<(Range<usize>, Vec<Mora>)> {
168 if s == "*" {
169 return vec![];
170 } else if s == QUESTION {
171 return vec![(
172 0..QUESTION.len(),
173 vec![Mora {
174 mora_enum: MoraEnum::Question,
175 is_voiced: true,
176 }],
177 )];
178 }
179
180 let mut result = Vec::new();
181
182 let mut segment_start_point = 0;
183 let mut current_moras = Vec::new();
184 let mut current_position = 0;
185 for match_result in mora_dict::MORA_DICT_AHO_CORASICK.find_iter(s) {
186 if current_position != match_result.start() {
187 if !current_moras.is_empty() {
188 result.push((segment_start_point..current_position, current_moras.clone()));
189 current_moras.clear();
190 segment_start_point = current_position;
191 }
192
193 result.push((
194 segment_start_point..match_result.start(),
195 vec![Mora {
196 mora_enum: MoraEnum::Touten,
197 is_voiced: true,
198 }],
199 ));
200 segment_start_point = match_result.start();
201 }
202
203 let quotation = s[match_result.end()..].starts_with(QUOTATION);
204
205 current_moras.extend(
206 mora_dict::get_mora_enum(match_result.pattern().as_usize())
207 .into_iter()
208 .map(|mora_enum| Mora {
209 mora_enum,
210 is_voiced: !quotation,
211 }),
212 );
213
214 current_position = match_result.end();
215 if quotation {
216 current_position += QUOTATION.len();
217 }
218 }
219
220 if !current_moras.is_empty() {
221 result.push((segment_start_point..current_position, current_moras));
222 }
223 if current_position != s.len() {
224 result.push((
225 current_position..s.len(),
226 vec![Mora {
227 mora_enum: MoraEnum::Touten,
228 is_voiced: true,
229 }],
230 ));
231 }
232
233 result
234 }
235}
236
237impl Display for Pronunciation {
238 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
239 f.write_str(
240 &self
241 .moras
242 .iter()
243 .fold(String::new(), |acc, mora| format!("{}{}", acc, mora)),
244 )
245 }
246}
247
#[cfg(test)]
mod test {
    use super::{Mora, MoraEnum, Pronunciation};

    /// Shorthand for a mora with `is_voiced: true`.
    fn voiced(mora_enum: MoraEnum) -> Mora {
        Mora {
            mora_enum,
            is_voiced: true,
        }
    }

    /// Shorthand for a mora with `is_voiced: false`.
    fn unvoiced(mora_enum: MoraEnum) -> Mora {
        Mora {
            mora_enum,
            is_voiced: false,
        }
    }

    /// A run of known moras forms one segment; the mora followed by the
    /// quotation marker is unvoiced and the marker counts towards the range.
    #[test]
    fn parse_normal() {
        let pron = Pronunciation::parse_mora_str("オツカレサマデシ’タ");
        assert_eq!(
            pron,
            vec![(
                0..30,
                vec![
                    voiced(MoraEnum::O),
                    voiced(MoraEnum::Tsu),
                    voiced(MoraEnum::Ka),
                    voiced(MoraEnum::Re),
                    voiced(MoraEnum::Sa),
                    voiced(MoraEnum::Ma),
                    voiced(MoraEnum::De),
                    unvoiced(MoraEnum::Shi),
                    voiced(MoraEnum::Ta),
                ]
            )]
        )
    }

    /// Text unknown to the dictionary becomes a single `Touten` segment.
    #[test]
    fn parse_symbol() {
        // Fullwidth semicolon: 3 bytes in UTF-8, hence the byte range 0..3.
        assert_eq!(
            Pronunciation::parse_mora_str(";"),
            vec![(0..3, vec![voiced(MoraEnum::Touten)])]
        )
    }

    /// The empty string yields no segments.
    #[test]
    fn parse_empty() {
        assert_eq!(Pronunciation::parse_mora_str(""), vec![])
    }

    /// Unknown text in the middle and at the end splits the input into
    /// alternating mora and `Touten` segments.
    #[test]
    fn parse_multiple_segments() {
        // The trailing character is a fullwidth comma (3 bytes in UTF-8),
        // which is why the last range is 21..24.
        let pron = Pronunciation::parse_mora_str("バリー・ペーン,");
        assert_eq!(
            pron,
            vec![
                (
                    0..9,
                    vec![
                        voiced(MoraEnum::Ba),
                        voiced(MoraEnum::Ri),
                        voiced(MoraEnum::Long),
                    ]
                ),
                (9..12, vec![voiced(MoraEnum::Touten)]),
                (
                    12..21,
                    vec![
                        voiced(MoraEnum::Pe),
                        voiced(MoraEnum::Long),
                        voiced(MoraEnum::N),
                    ]
                ),
                (21..24, vec![voiced(MoraEnum::Touten)]),
            ]
        )
    }

    /// Round trip through `parse` and `Display`.
    #[test]
    fn to_string() {
        assert_eq!(
            Pronunciation::parse("オツカレサマデシ’タ", 0)
                .unwrap()
                .to_string(),
            "オツカレサマデシ’タ"
        );
        assert_eq!(Pronunciation::parse("?", 0).unwrap().to_string(), "?");
        assert_eq!(Pronunciation::parse(".?", 0).unwrap().to_string(), "、");
        assert_eq!(Pronunciation::parse("*", 0).unwrap().to_string(), "");
    }
}