phonologist 1.0.1

Parse phonemes in the International Phonetic Alphabet
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
//! A library for parsing IPA phonemes into their features and modifiers.
//! The main function is `Phoneme::parse`, which takes an IPA string and
//! returns a `Phoneme` struct containing the features and modifiers of
//! the phoneme, as well as any warnings that were generated during parsing.
//! 
//! Note that the parsing is a best-effort attempt, and may not be perfect.
//! Phonological notation is a method of communication, not a formal system,
//! and so care must be taken with the results of parsing.
//! 
//! `PartialOrd` and `Ord` are implemented primarily for the purpose of sorting
//! features for generating names via `Phoneme::name`; however, the order also
//! should line up with the IPA charts generally.
//! 
//! ## Examples
//! ```
//! use phonologist::{Phoneme, feature::{Feature, ConsonantFeature, Manner}};
//! let (phoneme, warnings) = Phoneme::parse("t");
//! assert!(warnings.is_empty());
//! assert_eq!(phoneme.name(), "voiceless alveolar stop");
//! assert!(
//!     phoneme
//!         .features()
//!         .contains(&Feature::Consonant(ConsonantFeature::Manner(Manner::Stop)))
//! );
//! ```

use std::{collections::{HashSet, VecDeque}, fmt::Display};
use unicode_normalization::UnicodeNormalization;

// Unit struct that only exists under `cfg(doctest)`. Its documentation is the
// README itself, so the code examples in the README are compiled and run as
// doctests.
#[doc = include_str!("../README.md")]
#[cfg(doctest)]
pub struct ReadmeDoctests;

// data is not stable between releases; here be dragons
#[doc(hidden)]
pub mod data;
pub mod feature;
use data::{PHONEMES, POLYPHTHONG_COMPONENTS, POSTFIX_MODIFIERS, PREFIX_MODIFIERS, TONE_LETTERS};
use feature::{ConsonantFeature, Depth, Feature, Height, Manner, Modifier, Place, VowelFeature};

use crate::{
    data::{CLICKS, IMPLIED_MODIFIERS, PRENASALIZED_STOP_CONFUSABLE, PRENASALIZED_STOPS},
    feature::{PhonemeClass, Tone},
};

#[derive(Clone, Debug)]
/// A struct representing a single parsed phoneme.
///
/// Values are created with `Phoneme::parse` (or the `From<&str>` / `From<String>`
/// conversions, which discard parse warnings).
///
/// Not all features and modifiers are guaranteed to be present, and some may be mutually exclusive.
/// The parsed version of a phoneme is not guaranteed to be stable across different versions of this library.
pub struct Phoneme {
    /// Phonological features of the base phoneme.
    features: HashSet<Feature>,
    /// Modifiers, usually diacritics. Does not include tone letters, which are stored separately.
    modifiers: HashSet<Modifier>,
    /// If this is a polyphthong, on-glides will be stored here (one feature slice per glide).
    /// Polyphthongs must be represented with on-glides having the non-syllabic marker.
    on_glides: Vec<&'static [Feature]>,
    /// If this is a polyphthong, off-glides will be stored here (one feature slice per glide).
    /// Polyphthongs must be represented with off-glides having the non-syllabic marker.
    off_glides: Vec<&'static [Feature]>,
    /// Tone letters will be stored here, in the order they appear in the IPA string.
    /// This does not include all possible tone modifiers - just the set ˩ ˨ ˧ ˦ ˥.
    tone_letters: Vec<Tone>,
    /// Whether this is a consonant or vowel.
    /// (If we couldn't find any features at all, this won't be known.)
    phoneme_class: Option<PhonemeClass>,
    /// The original IPA representation of the phoneme, exactly as passed to the parser.
    representation: String,
}

impl Phoneme {
    /// Parses an IPA string into a `Phoneme` struct in a best-effort manner.
    /// Returns any warnings generated during parsing.
    pub fn parse(ipa: impl Into<String>) -> (Self, Vec<&'static str>) {
        let mut features = HashSet::new();
        let mut modifiers = HashSet::new();
        let mut on_glides = Vec::new();
        let mut off_glides = Vec::new();
        let mut warnings = Vec::new();
        let representation = ipa.into();

        // decompose the string as much as possible
        let mut ipa: &str = &representation.clone().nfd().collect::<String>();

        // let's pray that the compiler is smart enough to make this a `goto`
        let mut found_phoneme = false;

        // strip prefix modifiers
        'prefix_loop: loop {
            for (prefix, modifier) in PREFIX_MODIFIERS {
                if let Some(rest) = ipa.strip_prefix(prefix) {
                    if (*prefix == "" || *prefix == "")
                        && CLICKS.iter().any(|c| rest.starts_with(c))
                    {
                        break 'prefix_loop;
                    }
                    if PRENASALIZED_STOP_CONFUSABLE.contains(prefix) {
                        for (prenasalized_stop, f) in PRENASALIZED_STOPS {
                            if let Some(r) = ipa.strip_prefix(prenasalized_stop) {
                                features.extend(*f);
                                modifiers.insert(Modifier::PreNasalized);
                                ipa = r;
                                found_phoneme = true;
                                break 'prefix_loop;
                            }
                        }
                    }
                    modifiers.insert(*modifier);
                    ipa = rest;
                    continue 'prefix_loop;
                }
            }
            break;
        }

        if !found_phoneme {
            // strip on-glides
            'onglide_loop: loop {
                for (onglide, features) in POLYPHTHONG_COMPONENTS {
                    if let Some(rest) = ipa.strip_prefix(onglide) {
                        on_glides.push(*features);
                        ipa = rest;
                        continue 'onglide_loop;
                    }
                }
                break;
            }

            // find base phoneme
            for (phoneme, f) in PHONEMES {
                if let Some(rest) = ipa.strip_prefix(phoneme) {
                    features.extend(f.iter().copied());
                    ipa = rest;
                    if let Some(m) = IMPLIED_MODIFIERS.get(phoneme) {
                        modifiers.extend(m.iter());
                    }
                    break;
                }
            }
        }

        'offglide_loop_1: loop {
            for (offglide, features) in POLYPHTHONG_COMPONENTS {
                if let Some(rest) = ipa.strip_suffix(offglide) {
                    off_glides.push(*features);
                    ipa = rest;
                    continue 'offglide_loop_1;
                }
            }
            break;
        }

        'postfix_loop: loop {
            for (postfix, modifier) in POSTFIX_MODIFIERS {
                if let Some(rest) = ipa.strip_suffix(postfix) {
                    modifiers.insert(*modifier);
                    ipa = rest;
                    continue 'postfix_loop;
                }
            }
            break;
        }

        'offglide_loop_2: loop {
            for (offglide, features) in POLYPHTHONG_COMPONENTS {
                if let Some(rest) = ipa.strip_suffix(offglide) {
                    off_glides.push(*features);
                    ipa = rest;
                    continue 'offglide_loop_2;
                }
            }
            break;
        }

        let mut tone_letters = Vec::new();
        'tone_letter_loop: loop {
            for (tone_letter, modifier) in TONE_LETTERS {
                if let Some(rest) = ipa.strip_suffix(tone_letter) {
                    tone_letters.push(*modifier);
                    ipa = rest;
                    continue 'tone_letter_loop;
                }
            }
            break;
        }
        // since we strip the tone letters as suffixes, they will be in reverse order, so reverse them back
        tone_letters.reverse();

        if !ipa.is_empty() {
            warnings.push("leftover characters after parsing phoneme");
        }

        let phoneme_class = features.iter().next().map(Feature::phoneme_class);

        let phoneme = Self {
            features,
            modifiers,
            on_glides,
            off_glides,
            representation,
            tone_letters,
            phoneme_class,
        };

        (phoneme, warnings)
    }

    /// Get the phonological features of this phoneme.
    ///
    /// The set is empty when no base phoneme could be recognized during parsing.
    pub fn features(&self) -> &HashSet<Feature> {
        &self.features
    }

    /// Get the modifiers of this phoneme.
    ///
    /// Tone letters are not part of this set; see `Phoneme::tone_letters`.
    pub fn modifiers(&self) -> &HashSet<Modifier> {
        &self.modifiers
    }

    /// Get the original IPA representation of this phoneme.
    ///
    /// This is _always_ the same as the input to `Phoneme::parse` (and therefore
    /// to `Phoneme::from`); the Unicode decomposition used while parsing is not
    /// applied to it.
    pub fn representation(&self) -> &str {
        &self.representation
    }

    /// Get the on-glides of this phoneme, if it is a polyphthong.
    ///
    /// Each entry is the feature list of one glide; the slice is empty when the
    /// phoneme has no on-glides.
    pub fn on_glides(&self) -> &[&'static [Feature]] {
        self.on_glides.as_slice()
    }

    /// Get the off-glides of this phoneme, if it is a polyphthong.
    ///
    /// Each entry is the feature list of one glide; the slice is empty when the
    /// phoneme has no off-glides.
    pub fn off_glides(&self) -> &[&'static [Feature]] {
        self.off_glides.as_slice()
    }

    /// Get the tone letters of this phoneme, if any.
    ///
    /// The letters are returned in the order they appear in the IPA string.
    /// Tone modifiers which appear on the vowel itself are not included.
    ///
    /// Notation varies on what tone letters represent: for example, ExtraHigh may
    /// be used for a high level tone, or the notation may be based on Chinese tone
    /// marking, in which e.g. "3" means a falling-rising tone. Such a letter is
    /// still parsed as Mid.
    // (which is sad, because that's the best tone...)
    pub fn tone_letters(&self) -> &[Tone] {
        self.tone_letters.as_slice()
    }

    /// Get the class of this phoneme (consonant or vowel).
    ///
    /// Returns `None` when no features were found during parsing, in which case
    /// the class is unknown.
    pub fn class(&self) -> Option<PhonemeClass> {
        self.phoneme_class
    }

    /// Returns `true` if this phoneme was classified as a consonant.
    ///
    /// A phoneme whose class is unknown is neither a consonant nor a vowel.
    pub fn is_consonant(&self) -> bool {
        matches!(self.phoneme_class, Some(PhonemeClass::Consonant))
    }

    /// Returns `true` if this phoneme was classified as a vowel.
    ///
    /// A phoneme whose class is unknown is neither a consonant nor a vowel.
    pub fn is_vowel(&self) -> bool {
        matches!(self.phoneme_class, Some(PhonemeClass::Vowel))
    }

    pub fn name(&self) -> String {
        let expected_parts = self.features.len() + self.modifiers.len() + 1;
        let mut parts = VecDeque::with_capacity(expected_parts);

        let has_voicing_modifier = self.modifiers.iter().any(|m| matches!(m, Modifier::Voice(_)));

        let mut features: Vec<_> = if has_voicing_modifier {
            self.features
                .iter()
                .copied()
                .filter(|f| !matches!(f, Feature::Consonant(ConsonantFeature::Voiced)))
                .collect()
        } else {
            self.features.iter().copied().collect()
        };
        features.sort();
        for feature in features {
            parts.push_back(feature.into());
        }

        if !has_voicing_modifier && self.is_consonant() && !self.features.iter().any(|f| matches!(f, Feature::Consonant(ConsonantFeature::Voiced))) {
            parts.push_front("voiceless");
        }

        if self.phoneme_class == Some(PhonemeClass::Vowel) {
            parts.push_back("vowel")
        }

        for modifier in &self.modifiers {
            modifier.apply_modifier(&mut parts);
        }
        let mut name = parts.make_contiguous().join(" ");

        if !self.tone_letters.is_empty() {
            name.push_str(" with tone pattern ");
            for tone_letter in &self.tone_letters {
                name.push_str(tone_letter.as_number_str());
            }
        }

        name
    }
}

impl Display for Phoneme {
    /// Writes the descriptive name of the phoneme, as produced by `Phoneme::name`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = self.name();
        f.write_str(&name)
    }
}

impl From<&str> for Phoneme {
    /// Parses `value` with `Phoneme::parse`, discarding any warnings.
    fn from(value: &str) -> Self {
        let (phoneme, _warnings) = Self::parse(value);
        phoneme
    }
}

impl From<String> for Phoneme {
    /// Parses `value` with `Phoneme::parse`, discarding any warnings.
    fn from(value: String) -> Self {
        let (phoneme, _warnings) = Self::parse(value);
        phoneme
    }
}

#[cfg(feature = "serde")]
impl serde::Serialize for Phoneme {
    /// Serializes the phoneme as its original IPA string; the parsed features are
    /// not written out.
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let ipa = self.representation.as_str();
        serializer.serialize_str(ipa)
    }
}

#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Phoneme {
    /// Deserializes a string and parses it with `Phoneme::parse`.
    ///
    /// Parsing is best-effort and never fails; any parse warnings are discarded.
    /// An error is returned only if the input cannot be read as a string.
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        // Fully-qualified call: this file does not import `serde::Deserialize`, so
        // the short form `String::deserialize` would not resolve.
        let s = <String as serde::Deserialize<'de>>::deserialize(deserializer)?;
        let (phoneme, _warnings) = Self::parse(s);
        Ok(phoneme)
    }
}

#[cfg(test)]
mod tests {
    use crate::feature::Tone;

    use super::*;

    /// A base consonant followed by a postfix modifier is split into features and
    /// modifiers, and the input is preserved as the representation.
    #[test]
    fn parsing_phoneme() {
        // The input literal was empty, which cannot satisfy the assertions below
        // (an empty input yields no features). They describe an aspirated alveolar
        // stop, so the input is restored to `tʰ`.
        let input = "tʰ";
        let (phoneme, warnings) = Phoneme::parse(input);
        assert!(warnings.is_empty());
        assert!(
            phoneme
                .features()
                .contains(&Feature::Consonant(ConsonantFeature::Manner(Manner::Stop)))
        );
        assert!(
            phoneme
                .features()
                .contains(&Feature::Consonant(ConsonantFeature::Place(
                    Place::Alveolar
                )))
        );
        assert!(phoneme.modifiers().contains(&Modifier::Aspirated));
        assert_eq!(phoneme.representation(), input);
    }

    /// A vowel followed by a non-syllabic vowel is parsed as a base vowel plus one
    /// off-glide.
    #[test]
    fn parsing_polyphthong() {
        let input = "aɪ̯";
        let (phoneme, warnings) = Phoneme::parse(input);
        assert!(warnings.is_empty());

        let base_features = phoneme.features();
        assert!(base_features.contains(&Feature::Vowel(VowelFeature::Height(Height::Open))));
        assert!(base_features.contains(&Feature::Vowel(VowelFeature::Depth(Depth::Front))));

        assert!(phoneme.on_glides().is_empty());
        assert_eq!(
            phoneme.off_glides(),
            &vec![&[
                Feature::Vowel(VowelFeature::Height(Height::NearClose)),
                Feature::Vowel(VowelFeature::Depth(Depth::NearFront))
            ]]
        );
        assert_eq!(phoneme.representation(), input);
    }

    /// Trailing tone letters are collected separately from the vowel's features.
    #[test]
    fn parsing_tone_letters() {
        let input = "a˧˥";
        let (phoneme, warnings) = Phoneme::parse(input);
        assert!(warnings.is_empty());

        let base_features = phoneme.features();
        assert!(base_features.contains(&Feature::Vowel(VowelFeature::Height(Height::Open))));
        assert!(base_features.contains(&Feature::Vowel(VowelFeature::Depth(Depth::Front))));

        let tones = phoneme.tone_letters();
        assert!(tones.contains(&Tone::Mid));
        assert!(tones.contains(&Tone::ExtraHigh));

        assert_eq!(phoneme.representation(), input);
    }

    /// A pre-nasalized, doubly articulated stop is recognized as a single phoneme
    /// carrying the `PreNasalized` modifier.
    #[test]
    fn parsing_prenasalized_double_articulation() {
        let (phoneme, warnings) = Phoneme::parse("ŋ͡mg͡b");
        assert!(warnings.is_empty());
        assert!(phoneme.modifiers().contains(&Modifier::PreNasalized));

        let features = phoneme.features();
        let double_articulation = Feature::Consonant(ConsonantFeature::DoubleArticulation(
            Place::Bilabial,
            Place::Velar,
        ));
        let stop = Feature::Consonant(ConsonantFeature::Manner(Manner::Stop));
        assert!(features.contains(&double_articulation));
        assert!(features.contains(&stop));
    }

    /// A plain consonant without a voiced feature is named as voiceless.
    #[test]
    fn name_simple() {
        let expected = "voiceless alveolar stop";
        let (phoneme, warnings) = Phoneme::parse("t");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), expected);
    }

    /// A modifier contributes its own word to the generated name.
    #[test]
    fn name_with_modifiers() {
        // The input literal was empty, which cannot produce the expected name; the
        // name describes an aspirated alveolar stop, so the input is restored to `tʰ`.
        let (phoneme, warnings) = Phoneme::parse("tʰ");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), "aspirated voiceless alveolar stop");
    }

    /// A tone diacritic on a vowel is rendered after the vowel description.
    #[test]
    fn name_with_suffix() {
        let expected = "open front vowel with high tone";
        let (phoneme, warnings) = Phoneme::parse("á");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), expected);
    }

    /// Checks the name generated for a voiceless alveolar stop.
    // NOTE(review): the IPA literal passed to `parse` is empty in this copy of the
    // source, so the expected name cannot be produced as written. Presumably the
    // input was a voiced stop carrying a voiceless diacritic — TODO confirm and
    // restore from upstream.
    #[test]
    fn name_voiceless() {
        let (phoneme, warnings) = Phoneme::parse("");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), "voiceless alveolar stop");
    }

    /// Tone letters are rendered as a numeric tone pattern at the end of the name.
    #[test]
    fn name_tone_letters() {
        let expected = "open front vowel with tone pattern 35";
        let (phoneme, warnings) = Phoneme::parse("a˧˥");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), expected);
    }

    /// The name of a pre-nasalized, doubly articulated stop includes the
    /// pre-nasalization and the combined place of articulation.
    #[test]
    fn name_prenasalized_double_articulation() {
        let expected = "pre-nasalized voiced labial-velar stop";
        let (phoneme, warnings) = Phoneme::parse("ŋ͡mg͡b");
        assert!(warnings.is_empty());
        assert_eq!(phoneme.name(), expected);
    }
}