runestr_pancjkv/
lib.rs

1#![deny(warnings, missing_docs, missing_debug_implementations)]
2//! `rune`-based PanCJKV IVD Collection support
3//!
4//! [PanCJKV IVD Collection](https://github.com/adobe-type-tools/pancjkv-ivd-collection/) is an unregistered IVD collection,
5//! that makes use of Unicode Variation Selectors to distinguish CJK ideograph glyphs on a per-region basis.
6//!
7//! For example, `"\u{6211}"` (`'我'`, `U+6211`) when annotated with `PanCJKVRegion::JP`, will become `"\u{6211}\u{E01E8}"`,
8//! where the variation selector `U+E01E8` means Japan region in PanCJKV IVD Collection.
9//!
//! This crate adds PanCJKV IVD Collection support to `rune`-based iterators,
//! allowing unannotated CJK ideograph abstract characters to be explicitly transformed into their annotated form.
12
13use runestr::rune;
14
15#[allow(dead_code)]
16mod tables;
17
/// PanCJKV Region
///
/// Each region corresponds to one variation selector of the PanCJKV IVD Collection.
/// The discriminants are spelled out because a region is cast to `usize` to index
/// the region table; they must stay dense and start at zero.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PanCJKVRegion {
    /// Kāngxī (variation selector `U+E01EF`)
    XK = 0,
    /// PRC (variation selector `U+E01EE`)
    CN = 1,
    /// Republic of Singapore (variation selector `U+E01ED`)
    SG = 2,
    /// ROC (variation selector `U+E01EC`)
    TW = 3,
    /// Hong Kong SAR (variation selector `U+E01EB`)
    HK = 4,
    /// Macao SAR (variation selector `U+E01EA`)
    MO = 5,
    /// Malaysia (variation selector `U+E01E9`)
    MY = 6,
    /// Japan (variation selector `U+E01E8`)
    JP = 7,
    /// ROK (variation selector `U+E01E7`)
    KR = 8,
    /// DPRK (variation selector `U+E01E6`)
    KP = 9,
    /// Vietnam (variation selector `U+E01E5`)
    VN = 10,
}
44
45const PAN_CJKV_REGION_DATA: &[(PanCJKVRegion, char)] = &[
46    (PanCJKVRegion::XK, '\u{E01EF}'),
47    (PanCJKVRegion::CN, '\u{E01EE}'),
48    (PanCJKVRegion::SG, '\u{E01ED}'),
49    (PanCJKVRegion::TW, '\u{E01EC}'),
50    (PanCJKVRegion::HK, '\u{E01EB}'),
51    (PanCJKVRegion::MO, '\u{E01EA}'),
52    (PanCJKVRegion::MY, '\u{E01E9}'),
53    (PanCJKVRegion::JP, '\u{E01E8}'),
54    (PanCJKVRegion::KR, '\u{E01E7}'),
55    (PanCJKVRegion::KP, '\u{E01E6}'),
56    (PanCJKVRegion::VN, '\u{E01E5}'),
57];
58
59#[allow(dead_code)]
60const PAN_CJKV_REGION_COUNT: usize = PAN_CJKV_REGION_DATA.len();
61
62/// Annotate rune iterator items with PanCJKV region.
63pub trait PanCJKVAnnotate: Sized {
64    /// Retrieves an iterator that transforms all runes representing CJK ideographs to its PanCJKV IVS
65    /// form within a specific region.
66    fn annotate_with_pan_cjkv_region(self, region: PanCJKVRegion) -> PanCJKVAnnotateIter<Self>;
67}
68
69impl<I> PanCJKVAnnotate for I
70where
71    I: Iterator<Item = rune>,
72{
73    fn annotate_with_pan_cjkv_region(self, region: PanCJKVRegion) -> PanCJKVAnnotateIter<Self> {
74        PanCJKVAnnotateIter {
75            runes: self,
76            region_vs: PAN_CJKV_REGION_DATA[region as usize].1,
77        }
78    }
79}
80
/// An iterator that annotates rune items with PanCJKV region,
/// usually created with [`PanCJKVAnnotate::annotate_with_pan_cjkv_region`].
///
/// The adaptor is lazy: nothing is annotated until it is iterated, hence `must_use`.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Debug)]
pub struct PanCJKVAnnotateIter<I> {
    /// Underlying source of items to annotate.
    runes: I,
    /// Variation selector of the chosen region, inserted after unannotated ideographs.
    region_vs: char,
}
88
89impl<I> Iterator for PanCJKVAnnotateIter<I>
90where
91    I: Iterator<Item = rune>,
92{
93    type Item = rune;
94
95    fn next(&mut self) -> Option<Self::Item> {
96        use crate::tables::is_han_script_lo_character;
97        let rune = self.runes.next()?;
98        if let Some(ch) = rune.into_char() {
99            if is_han_script_lo_character(ch) {
100                let mut s = String::new();
101                s.push(ch);
102                s.push(self.region_vs);
103                return Some(rune::from_grapheme_cluster(&s).unwrap());
104            } else {
105                return Some(rune);
106            }
107        } else {
108            let chars = rune.into_chars();
109            #[derive(Clone, Copy)]
110            enum State {
111                None,
112                HanScriptLoCore(usize),
113                HanScriptLoCoreAndVS(usize, usize),
114            }
115
116            let mut state = State::None;
117            for (idx, ch) in chars.clone().enumerate() {
118                match state {
119                    State::None => {
120                        if is_han_script_lo_character(ch) {
121                            state = State::HanScriptLoCore(idx);
122                        }
123                    }
124                    State::HanScriptLoCore(core_idx) => {
125                        if idx == core_idx + 1 && is_vs(ch) {
126                            state = State::HanScriptLoCoreAndVS(core_idx, idx);
127                        }
128                        break;
129                    }
130                    _ => unreachable!(),
131                }
132            }
133            if let State::HanScriptLoCore(idx) = state {
134                let mut str = String::new();
135                str.extend(chars.clone().take(idx + 1));
136                str.push(self.region_vs);
137                str.extend(chars.skip(idx + 1));
138                Some(rune::from_grapheme_cluster(&str).unwrap())
139            } else {
140                Some(rune)
141            }
142        }
143    }
144}
145
/// Reports whether `ch` lies in one of the two variation selector ranges checked here:
/// `U+FE00..=U+FE0F` or `U+E0100..=U+E01EF`.
fn is_vs(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
156
#[cfg(test)]
mod tests {
    use runestr::RuneString;

    use crate::{PanCJKVAnnotate, PanCJKVRegion};

    /// The first ideograph has only a combining mark, so the region selector is inserted
    /// right after it; the second one already carries a variation selector and is kept as is.
    #[test]
    fn test_han_with_ascent() {
        let input = RuneString::from_str_lossy("\u{6211}\u{030C}\u{4EEC}\u{E01EE}\u{0301}");
        assert_eq!(2, input.runes().count());

        let annotated: RuneString = input
            .runes()
            .annotate_with_pan_cjkv_region(PanCJKVRegion::XK)
            .collect();

        let expected = [
            '\u{6211}',
            '\u{E01EF}',
            '\u{030C}',
            '\u{4EEC}',
            '\u{E01EE}',
            '\u{0301}',
        ];
        let actual = annotated.chars().collect::<Vec<_>>();
        assert_eq!(&actual[..], &expected);

        // Annotation must not change how many runes there are.
        assert_eq!(2, annotated.runes().count());
    }
}