1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
//! PUA (Private Use Area) マッピング.
//!
//! Python (token_mapper.py) および C++ (各 *_phonemize.cpp) と同一のテーブル。
//! 学習済みモデルの重みに依存するため変更不可。
use std::collections::HashMap;
use std::sync::LazyLock;
/// Version number of the PUA mapping table implemented by this module.
///
/// Increment when new PUA mappings are added.
pub const PUA_COMPAT_VERSION: u32 = 1;

/// Verifies that a model's declared PUA version matches [`PUA_COMPAT_VERSION`].
///
/// `model_version` is the version recorded by the model, or `None` when the
/// model carries no version; a missing version is accepted.
///
/// # Errors
///
/// Returns a human-readable message naming both versions when the model
/// declares a version different from [`PUA_COMPAT_VERSION`].
pub fn check_pua_compat(model_version: Option<u32>) -> Result<(), String> {
    // No declared version: nothing to compare against, so accept.
    let Some(v) = model_version else {
        return Ok(());
    };
    if v == PUA_COMPAT_VERSION {
        return Ok(());
    }
    Err(format!(
        "PUA version mismatch: model has pua_compat_version={v}, \
         but piper-g2p expects version {PUA_COMPAT_VERSION}"
    ))
}
/// Fixed PUA mapping table (96 entries).
///
/// Each entry pairs a multi-character phoneme token with the Unicode Private
/// Use Area code point that stands in for it. Entries are grouped by language
/// and listed in ascending code point order; 0xE01F and 0xE053 are reserved,
/// unused gaps.
pub static FIXED_PUA_MAP: LazyLock<Vec<(&'static str, u32)>> = LazyLock::new(|| {
    // === Japanese (U+E000-E01C) ===
    const JAPANESE: &[(&str, u32)] = &[
        // Long vowels
        ("a:", 0xE000),
        ("i:", 0xE001),
        ("u:", 0xE002),
        ("e:", 0xE003),
        ("o:", 0xE004),
        // Special consonants
        ("cl", 0xE005),
        // Palatalized consonants
        ("ky", 0xE006),
        ("kw", 0xE007),
        ("gy", 0xE008),
        ("gw", 0xE009),
        ("ty", 0xE00A),
        ("dy", 0xE00B),
        ("py", 0xE00C),
        ("by", 0xE00D),
        // Affricates and special sounds
        ("ch", 0xE00E),
        ("ts", 0xE00F),
        ("sh", 0xE010),
        ("zy", 0xE011),
        ("hy", 0xE012),
        // Palatalized nasals/liquids
        ("ny", 0xE013),
        ("my", 0xE014),
        ("ry", 0xE015),
        // Question type markers (Issue #204)
        ("?!", 0xE016),
        ("?.", 0xE017),
        ("?~", 0xE018),
        // N phoneme variants (Issue #207)
        ("N_m", 0xE019),
        ("N_n", 0xE01A),
        ("N_ng", 0xE01B),
        ("N_uvular", 0xE01C),
    ];

    // === Multilingual shared (U+E01D-E01E) ===
    // 0xE01F is reserved (unused gap).
    const SHARED: &[(&str, u32)] = &[
        ("rr", 0xE01D),      // Spanish trill r
        ("y_vowel", 0xE01E), // Close front rounded vowel [y] (ZH pinyin ü, FR lune)
    ];

    // === Chinese (U+E020-E04A) ===
    const CHINESE: &[(&str, u32)] = &[
        // --- Initials (aspirated/affricate) ---
        ("p\u{02b0}", 0xE020),         // pʰ aspirated bilabial (pinyin p)
        ("t\u{02b0}", 0xE021),         // tʰ aspirated alveolar (pinyin t)
        ("k\u{02b0}", 0xE022),         // kʰ aspirated velar (pinyin k)
        ("t\u{0255}", 0xE023),         // tɕ alveolo-palatal affricate (pinyin j)
        ("t\u{0255}\u{02b0}", 0xE024), // tɕʰ aspirated alveolo-palatal (pinyin q)
        ("t\u{0282}", 0xE025),         // tʂ retroflex affricate (pinyin zh)
        ("t\u{0282}\u{02b0}", 0xE026), // tʂʰ aspirated retroflex (pinyin ch)
        ("ts\u{02b0}", 0xE027),        // tsʰ aspirated alveolar affricate (pinyin c)
        // --- Diphthongs ---
        ("a\u{026a}", 0xE028), // aɪ (pinyin ai)
        ("e\u{026a}", 0xE029), // eɪ (pinyin ei)
        ("a\u{028a}", 0xE02A), // aʊ (pinyin ao)
        ("o\u{028a}", 0xE02B), // oʊ (pinyin ou)
        // --- Nasal finals ---
        ("an", 0xE02C),               // an (pinyin an)
        ("\u{0259}n", 0xE02D),        // ən (pinyin en)
        ("a\u{014b}", 0xE02E),        // aŋ (pinyin ang)
        ("\u{0259}\u{014b}", 0xE02F), // əŋ (pinyin eng)
        ("u\u{014b}", 0xE030),        // uŋ (pinyin ong)
        // --- i-compound finals (qichihu) ---
        ("ia", 0xE031),         // ia (pinyin ia/ya)
        ("i\u{025b}", 0xE032),  // iɛ (pinyin ie/ye)
        ("iou", 0xE033),        // iou (pinyin iu/you)
        ("ia\u{028a}", 0xE034), // iaʊ (pinyin iao/yao)
        ("i\u{025b}n", 0xE035), // iɛn (pinyin ian/yan)
        ("in", 0xE036),         // in (pinyin in/yin)
        ("ia\u{014b}", 0xE037), // iaŋ (pinyin iang/yang)
        ("i\u{014b}", 0xE038),  // iŋ (pinyin ing/ying)
        ("iu\u{014b}", 0xE039), // iuŋ (pinyin iong/yong)
        // --- u-compound finals (hekouhu) ---
        ("ua", 0xE03A),                // ua (pinyin ua/wa)
        ("uo", 0xE03B),                // uo (pinyin uo/wo)
        ("ua\u{026a}", 0xE03C),        // uaɪ (pinyin uai/wai)
        ("ue\u{026a}", 0xE03D),        // ueɪ (pinyin ui/wei)
        ("uan", 0xE03E),               // uan (pinyin uan/wan)
        ("u\u{0259}n", 0xE03F),        // uən (pinyin un/wen)
        ("ua\u{014b}", 0xE040),        // uaŋ (pinyin uang/wang)
        ("u\u{0259}\u{014b}", 0xE041), // uəŋ (pinyin ueng/weng)
        // --- ü-compound finals (cuokouhu) ---
        ("y\u{025b}", 0xE042),  // yɛ (pinyin üe/yue)
        ("y\u{025b}n", 0xE043), // yɛn (pinyin üan/yuan)
        ("yn", 0xE044),         // yn (pinyin ün/yun)
        // --- Syllabic consonants ---
        ("\u{027b}\u{0329}", 0xE045), // ɻ̩ syllabic retroflex (zhi/chi/shi/ri)
        // --- Tone markers ---
        ("tone1", 0xE046),
        ("tone2", 0xE047),
        ("tone3", 0xE048),
        ("tone4", 0xE049),
        ("tone5", 0xE04A),
    ];

    // === Korean (U+E04B-E052) ===
    // 0xE053 is reserved (unused gap).
    const KOREAN: &[(&str, u32)] = &[
        // --- Tense consonants (fortis) ---
        ("p\u{0348}", 0xE04B),         // p͈ tense bilabial (ㅃ)
        ("t\u{0348}", 0xE04C),         // t͈ tense alveolar (ㄸ)
        ("k\u{0348}", 0xE04D),         // k͈ tense velar (ㄲ)
        ("s\u{0348}", 0xE04E),         // s͈ tense sibilant (ㅆ)
        ("t\u{0348}\u{0255}", 0xE04F), // t͈ɕ tense alveolo-palatal affricate (ㅉ)
        // --- Unreleased finals ---
        ("k\u{031a}", 0xE050), // k̚ unreleased velar
        ("t\u{031a}", 0xE051), // t̚ unreleased alveolar
        ("p\u{031a}", 0xE052), // p̚ unreleased bilabial
    ];

    // === Spanish/Portuguese (U+E054-E055) ===
    const SPANISH_PORTUGUESE: &[(&str, u32)] = &[
        ("t\u{0283}", 0xE054), // tʃ voiceless postalveolar affricate
        ("d\u{0292}", 0xE055), // dʒ voiced postalveolar affricate
    ];

    // === French (U+E056-E058) ===
    const FRENCH: &[(&str, u32)] = &[
        // --- Nasal vowels ---
        ("\u{025b}\u{0303}", 0xE056), // ɛ̃ nasal open-mid front unrounded
        ("\u{0251}\u{0303}", 0xE057), // ɑ̃ nasal open back unrounded
        ("\u{0254}\u{0303}", 0xE058), // ɔ̃ nasal open-mid back rounded
    ];

    // === Swedish (U+E059-E061) ===
    const SWEDISH: &[(&str, u32)] = &[
        // --- Long vowels (Complementary Quantity) ---
        ("i\u{02D0}", 0xE059),        // iː close front unrounded long
        ("y\u{02D0}", 0xE05A),        // yː close front rounded long
        ("e\u{02D0}", 0xE05B),        // eː close-mid front unrounded long
        ("\u{025B}\u{02D0}", 0xE05C), // ɛː open-mid front unrounded long
        ("\u{00F8}\u{02D0}", 0xE05D), // øː close-mid front rounded long
        ("\u{0251}\u{02D0}", 0xE05E), // ɑː open back unrounded long
        ("o\u{02D0}", 0xE05F),        // oː close-mid back rounded long
        ("u\u{02D0}", 0xE060),        // uː close back rounded long
        ("\u{0289}\u{02D0}", 0xE061), // ʉː close central rounded long
    ];

    // Concatenate the groups in code point order to form the full table.
    [
        JAPANESE,
        SHARED,
        CHINESE,
        KOREAN,
        SPANISH_PORTUGUESE,
        FRENCH,
        SWEDISH,
    ]
    .concat()
});
/// トークン→PUA 文字の前方マッピング
pub static TOKEN_TO_PUA: LazyLock<HashMap<&'static str, char>> = LazyLock::new(|| {
FIXED_PUA_MAP
.iter()
.filter_map(|(token, code)| char::from_u32(*code).map(|c| (*token, c)))
.collect()
});
/// PUA 文字→トークンの逆方向マッピング
pub static PUA_TO_TOKEN: LazyLock<HashMap<char, &'static str>> = LazyLock::new(|| {
FIXED_PUA_MAP
.iter()
.filter_map(|(token, code)| char::from_u32(*code).map(|c| (c, *token)))
.collect()
});
/// Converts a multi-character phoneme token to its PUA character.
///
/// Returns `None` when `token` has no entry in [`TOKEN_TO_PUA`].
pub fn token_to_pua(token: &str) -> Option<char> {
    let pua = TOKEN_TO_PUA.get(token)?;
    Some(*pua)
}
/// Converts a PUA character back to the phoneme token it stands for.
///
/// Returns `None` when `ch` has no entry in [`PUA_TO_TOKEN`].
pub fn pua_to_token(ch: char) -> Option<&'static str> {
    let token = PUA_TO_TOKEN.get(&ch)?;
    Some(*token)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// The table size must match the Python `token_mapper.py`
    /// `FIXED_PUA_MAPPING` count exactly.
    #[test]
    fn test_fixed_pua_count() {
        assert_eq!(FIXED_PUA_MAP.len(), 96);
        // Both lookup maps must retain every entry: a smaller map would mean a
        // duplicate token/code or a code that is not a valid `char`.
        assert_eq!(TOKEN_TO_PUA.len(), FIXED_PUA_MAP.len());
        assert_eq!(PUA_TO_TOKEN.len(), FIXED_PUA_MAP.len());
    }

    /// Pins every assignment in E00A-E015 to the canonical order.
    #[test]
    fn test_japanese_palatalized_order() {
        assert_eq!(token_to_pua("ty"), Some('\u{E00A}'));
        assert_eq!(token_to_pua("dy"), Some('\u{E00B}'));
        assert_eq!(token_to_pua("py"), Some('\u{E00C}'));
        assert_eq!(token_to_pua("by"), Some('\u{E00D}'));
        assert_eq!(token_to_pua("ch"), Some('\u{E00E}'));
        assert_eq!(token_to_pua("ts"), Some('\u{E00F}'));
        assert_eq!(token_to_pua("sh"), Some('\u{E010}'));
        assert_eq!(token_to_pua("zy"), Some('\u{E011}'));
        assert_eq!(token_to_pua("hy"), Some('\u{E012}'));
        assert_eq!(token_to_pua("ny"), Some('\u{E013}'));
        assert_eq!(token_to_pua("my"), Some('\u{E014}'));
        assert_eq!(token_to_pua("ry"), Some('\u{E015}'));
        // "fw" must NOT exist in the mapping
        assert_eq!(token_to_pua("fw"), None);
    }

    #[test]
    fn test_chinese_compound_finals() {
        // i-compound finals
        assert_eq!(token_to_pua("iou"), Some('\u{E033}'));
        assert_eq!(token_to_pua("in"), Some('\u{E036}'));
        // u-compound finals
        assert_eq!(token_to_pua("uan"), Some('\u{E03E}'));
        // ü-compound finals
        assert_eq!(token_to_pua("yn"), Some('\u{E044}'));
        // Syllabic consonant: U+027B followed by combining U+0329
        assert_eq!(token_to_pua("\u{027b}\u{0329}"), Some('\u{E045}'));
        // Tokens that must NOT exist in the mapping.
        assert_eq!(token_to_pua("syl"), None);
        assert_eq!(token_to_pua("\u{0265}"), None); // ɥ
        assert_eq!(token_to_pua("\u{027b}"), None); // ɻ without the syllabic mark
        assert_eq!(token_to_pua("io\u{028a}"), None); // ioʊ
        assert_eq!(token_to_pua("y\u{014b}"), None); // yŋ
        assert_eq!(token_to_pua("yan"), None);
    }

    #[test]
    fn test_japanese_pua() {
        assert_eq!(token_to_pua("a:"), Some('\u{E000}'));
        assert_eq!(token_to_pua("N_m"), Some('\u{E019}'));
        assert_eq!(token_to_pua("?!"), Some('\u{E016}'));
    }

    #[test]
    fn test_chinese_pua() {
        assert_eq!(token_to_pua("tone1"), Some('\u{E046}'));
        // Escapes keep the literal immune to editor Unicode normalisation.
        assert_eq!(token_to_pua("t\u{0255}"), Some('\u{E023}')); // tɕ
    }

    #[test]
    fn test_reverse_mapping() {
        assert_eq!(pua_to_token('\u{E000}'), Some("a:"));
        assert_eq!(pua_to_token('\u{E056}'), Some("\u{025b}\u{0303}")); // ɛ̃
        // Reserved gaps and characters outside the table have no token.
        assert_eq!(pua_to_token('\u{E01F}'), None);
        assert_eq!(pua_to_token('\u{E053}'), None);
        assert_eq!(pua_to_token('a'), None);
    }

    /// No PUA code may be assigned to more than one token.
    #[test]
    fn test_no_collisions() {
        let mut seen_codes: HashSet<u32> = HashSet::new();
        for (_, code) in FIXED_PUA_MAP.iter() {
            assert!(
                seen_codes.insert(*code),
                "duplicate PUA code: 0x{:04X}",
                code
            );
        }
    }

    /// No token may appear twice; a duplicate would be silently overwritten
    /// when `TOKEN_TO_PUA` is built.
    #[test]
    fn test_no_duplicate_tokens() {
        let mut seen_tokens: HashSet<&str> = HashSet::new();
        for (token, _) in FIXED_PUA_MAP.iter() {
            assert!(seen_tokens.insert(*token), "duplicate token: {token:?}");
        }
    }

    /// Every code must lie in the BMP Private Use Area (U+E000-U+F8FF).
    #[test]
    fn test_codes_in_private_use_area() {
        for (token, code) in FIXED_PUA_MAP.iter() {
            assert!(
                (0xE000..=0xF8FF).contains(code),
                "code 0x{code:04X} for token {token:?} is outside the PUA"
            );
        }
    }

    /// Forward and reverse lookups must be inverses for every table entry.
    #[test]
    fn test_round_trip() {
        for &(token, code) in FIXED_PUA_MAP.iter() {
            let pua = token_to_pua(token);
            assert_eq!(pua, char::from_u32(code), "forward lookup of {token:?}");
            assert_eq!(
                pua.and_then(pua_to_token),
                Some(token),
                "reverse lookup of 0x{code:04X}"
            );
        }
    }

    #[test]
    fn test_check_pua_compat() {
        assert!(check_pua_compat(None).is_ok());
        assert!(check_pua_compat(Some(PUA_COMPAT_VERSION)).is_ok());

        let other = PUA_COMPAT_VERSION + 1;
        let message = check_pua_compat(Some(other)).unwrap_err();
        assert!(message.contains(&format!("pua_compat_version={other}")));
        assert!(message.contains(&format!("expects version {PUA_COMPAT_VERSION}")));
    }
}