unicode_script/
lib.rs

1//! This crate exposes the Unicode `Script` and `Script_Extension`
2//! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
3
4#![cfg_attr(not(test), no_std)]
5#![cfg_attr(feature = "bench", feature(test))]
6
7mod tables;
8
9use core::convert::TryFrom;
10use core::fmt;
11use core::u64;
12pub use tables::script_extensions;
13use tables::{get_script, get_script_extension, NEXT_SCRIPT};
14pub use tables::{Script, UNICODE_VERSION};
15
16impl Script {
17    /// Get the full name of a script.
18    pub fn full_name(self) -> &'static str {
19        self.inner_full_name()
20    }
21
22    /// Attempts to parse script name from the provided string.
23    /// Returns `None` if the provided string does not represent a valid
24    /// script full name.
25    pub fn from_full_name(input: &str) -> Option<Self> {
26        Self::inner_from_full_name(input)
27    }
28
29    /// Get the four-character short name of a script.
30    pub fn short_name(self) -> &'static str {
31        self.inner_short_name()
32    }
33
34    /// Attempts to parse script name from the provided string.
35    /// Returns `None` if the provided string does not represent a valid
36    /// script four-character short name.
37    pub fn from_short_name(input: &str) -> Option<Self> {
38        Self::inner_from_short_name(input)
39    }
40
41    /// Is this script "Recommended" according to
42    /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
43    pub fn is_recommended(self) -> bool {
44        use Script::*;
45        match self {
46            Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
47            | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
48            | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
49            | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
50            _ => false,
51        }
52    }
53}
54
55impl From<Script> for ScriptExtension {
56    fn from(script: Script) -> Self {
57        if script == Script::Common {
58            ScriptExtension::new_common()
59        } else if script == Script::Inherited {
60            ScriptExtension::new_inherited()
61        } else if script == Script::Unknown {
62            ScriptExtension::new_unknown()
63        } else {
64            let mut first = 0;
65            let mut second = 0;
66            let mut third = 0;
67            let bit = script as u8;
68            // Find out which field it's in, and set the appropriate bit there
69            if bit < 64 {
70                first = 1 << bit as u64;
71            } else if bit < 128 {
72                // offset by 64 since `bit` is an absolute number,
73                // not relative to the chunk
74                second = 1 << (bit - 64) as u64;
75            } else {
76                third = 1 << (bit - 128) as u32;
77            }
78            ScriptExtension::new(first, second, third)
79        }
80    }
81}
82
83impl TryFrom<ScriptExtension> for Script {
84    type Error = ();
85    fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
86        if ext.is_common_or_inherited() {
87            if ext.common {
88                Ok(Script::Common)
89            } else {
90                Ok(Script::Inherited)
91            }
92        } else if ext.is_empty() {
93            Ok(Script::Unknown)
94        } else {
95            // filled elements will have set ones
96            let fo = ext.first.count_ones();
97            let so = ext.second.count_ones();
98            let to = ext.third.count_ones();
99            // only one bit set, in the first chunk
100            if fo == 1 && so == 0 && to == 0 {
101                // use trailing_zeroes() to figure out which bit it is
102                Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
103            // only one bit set, in the second chunk
104            } else if fo == 0 && so == 1 && to == 0 {
105                Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
106            // only one bit set, in the third chunk
107            } else if fo == 0 && so == 0 && to == 1 {
108                Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
109            } else {
110                Err(())
111            }
112        }
113    }
114}
115
116impl Default for Script {
117    fn default() -> Self {
118        Script::Common
119    }
120}
121
122impl From<char> for Script {
123    fn from(o: char) -> Self {
124        o.script()
125    }
126}
127
128impl fmt::Display for Script {
129    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130        write!(f, "{}", self.full_name())
131    }
132}
133
/// A value of the `Script_Extension` property.
///
/// A [`ScriptExtension`] holds one or more [`Script`]s.
///
/// It behaves like an optimized `Vec<Script>` backed by bitfields.
#[non_exhaustive]
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct ScriptExtension {
    // Bitset covering the first 64 scripts (bit indices 0..=63).
    first: u64,
    // Bitset covering the next 64 scripts (bit indices 64..=127).
    second: u64,
    // Bitset covering every script from bit index 128 upwards.
    third: u64,
    // Common and Inherited both have every used bit set; this flag is what
    // tells the two apart.
    common: bool,
}
152
153impl ScriptExtension {
154    // We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
155    // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
156    // it in the range of `third`, create a u64 with just that bit set, and subtract 1
157    // to create one with all the lower bits set.
158    const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
159
160    pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
161        ScriptExtension {
162            first,
163            second,
164            third,
165            common: false,
166        }
167    }
168
169    pub(crate) const fn new_common() -> Self {
170        ScriptExtension {
171            first: u64::MAX,
172            second: u64::MAX,
173            third: Self::THIRD_MAX,
174            common: true,
175        }
176    }
177
178    pub(crate) const fn new_inherited() -> Self {
179        ScriptExtension {
180            first: u64::MAX,
181            second: u64::MAX,
182            third: Self::THIRD_MAX,
183            common: false,
184        }
185    }
186
187    pub(crate) const fn new_unknown() -> Self {
188        ScriptExtension {
189            first: 0,
190            second: 0,
191            third: 0,
192            common: false,
193        }
194    }
195
196    const fn is_common_or_inherited(self) -> bool {
197        (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
198    }
199
200    /// Checks if the script extension is Common
201    pub const fn is_common(self) -> bool {
202        self.is_common_or_inherited() & self.common
203    }
204
205    /// Checks if the script extension is Inherited
206    pub const fn is_inherited(self) -> bool {
207        self.is_common_or_inherited() & !self.common
208    }
209
210    /// Checks if the script extension is empty (unknown)
211    pub const fn is_empty(self) -> bool {
212        (self.first == 0) & (self.second == 0) & (self.third == 0)
213    }
214
215    /// Returns the number of scripts in the script extension
216    pub fn len(self) -> usize {
217        if self.is_common_or_inherited() {
218            1
219        } else {
220            (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
221        }
222    }
223
224    /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
225    /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
226    /// in `self`
227    ///
228    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
229    /// everything, the intersection of `Common` and `Inherited` is `Inherited`
230    pub fn intersect_with(&mut self, other: Self) {
231        *self = self.intersection(other)
232    }
233
234    /// Find the intersection between two ScriptExtensions. Returns Unknown if things
235    /// do not intersect.
236    ///
237    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
238    /// everything, the intersection of `Common` and `Inherited` is `Inherited`
239    pub const fn intersection(self, other: Self) -> Self {
240        let first = self.first & other.first;
241        let second = self.second & other.second;
242        let third = self.third & other.third;
243        let common = self.common & other.common;
244        ScriptExtension {
245            first,
246            second,
247            third,
248            common,
249        }
250    }
251
252    /// Find the union between two ScriptExtensions.
253    ///
254    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
255    /// everything, the union of `Common` and `Inherited` is `Common`
256    pub const fn union(self, other: Self) -> Self {
257        let first = self.first | other.first;
258        let second = self.second | other.second;
259        let third = self.third | other.third;
260        let common = self.common | other.common;
261        ScriptExtension {
262            first,
263            second,
264            third,
265            common,
266        }
267    }
268
269    /// Check if this ScriptExtension contains the given script
270    ///
271    /// Should be used with specific scripts only, this will
272    /// return `true` if `self` is not `Unknown` and `script` is
273    /// `Common` or `Inherited`
274    pub fn contains_script(self, script: Script) -> bool {
275        !self.intersection(script.into()).is_empty()
276    }
277
278    /// Get the intersection of script extensions of all characters
279    /// in a string.
280    pub fn for_str(x: &str) -> Self {
281        let mut ext = ScriptExtension::default();
282        for ch in x.chars() {
283            ext.intersect_with(ch.into());
284        }
285        ext
286    }
287
288    /// Iterate over the scripts in this script extension
289    ///
290    /// Will never yield Script::Unknown
291    pub fn iter(self) -> ScriptIterator {
292        ScriptIterator { ext: self }
293    }
294}
295
296impl Default for ScriptExtension {
297    fn default() -> Self {
298        ScriptExtension::new_common()
299    }
300}
301
302impl From<char> for ScriptExtension {
303    fn from(o: char) -> Self {
304        o.script_extension()
305    }
306}
307
308impl From<&'_ str> for ScriptExtension {
309    fn from(o: &'_ str) -> Self {
310        Self::for_str(o)
311    }
312}
313
314impl fmt::Debug for ScriptExtension {
315    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
316        write!(f, "ScriptExtension(")?;
317        fmt::Display::fmt(self, f)?;
318        write!(f, ")")
319    }
320}
321
322impl fmt::Display for ScriptExtension {
323    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
324        if self.is_common() {
325            write!(f, "Common")?;
326        } else if self.is_inherited() {
327            write!(f, "Inherited")?;
328        } else if self.is_empty() {
329            write!(f, "Unknown")?;
330        } else {
331            let mut first = true;
332            for script in self.iter() {
333                if !first {
334                    write!(f, " + ")?;
335                    first = false;
336                }
337                script.full_name().fmt(f)?;
338            }
339        }
340        Ok(())
341    }
342}
343
344/// Extension trait on `char` for calculating script properties
345pub trait UnicodeScript {
346    /// Get the script for a given character
347    fn script(&self) -> Script;
348    /// Get the Script_Extension for a given character
349    fn script_extension(&self) -> ScriptExtension;
350}
351
352impl UnicodeScript for char {
353    fn script(&self) -> Script {
354        get_script(*self).unwrap_or(Script::Unknown)
355    }
356
357    fn script_extension(&self) -> ScriptExtension {
358        get_script_extension(*self).unwrap_or_else(|| self.script().into())
359    }
360}
361
362/// Iterator over scripts in a [ScriptExtension].
363///
364/// Can be obtained ia [ScriptExtension::iter()]
365pub struct ScriptIterator {
366    ext: ScriptExtension,
367}
368
369impl Iterator for ScriptIterator {
370    type Item = Script;
371
372    fn next(&mut self) -> Option<Script> {
373        if self.ext.is_common_or_inherited() {
374            let common = self.ext.common;
375            self.ext = ScriptExtension::new_unknown();
376            if common {
377                Some(Script::Common)
378            } else {
379                Some(Script::Inherited)
380            }
381        // Are there bits left in the first chunk?
382        } else if self.ext.first != 0 {
383            // Find the next bit
384            let bit = self.ext.first.trailing_zeros();
385            // unset just that bit
386            self.ext.first &= !(1 << bit);
387            Some(Script::for_integer(bit as u8))
388        // Are there bits left in the second chunk?
389        } else if self.ext.second != 0 {
390            let bit = self.ext.second.trailing_zeros();
391            self.ext.second &= !(1 << bit);
392            Some(Script::for_integer(64 + bit as u8))
393        // Are there bits left in the third chunk?
394        } else if self.ext.third != 0 {
395            let bit = self.ext.third.trailing_zeros();
396            self.ext.third &= !(1 << bit);
397            Some(Script::for_integer(128 + bit as u8))
398        } else {
399            // Script::Unknown
400            None
401        }
402    }
403}
404
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    extern crate test;
    #[cfg(feature = "bench")]
    use test::bench::Bencher;

    // Sample text shared by `test_specific` and `bench_string_ext`.
    const SAMPLE_TEXT: &str = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";

    // Round-trips every script through `ScriptExtension` and back.
    #[test]
    fn test_conversion() {
        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            let ext: ScriptExtension = script.into();
            // `insert` returns `false` when the value was already present.
            if !seen_scripts.insert(script) {
                panic!("Found script {:?} twice!", script)
            }
            if !seen_exts.insert(ext) {
                panic!("Found extension {:?} twice!", ext)
            }
            assert_eq!(script as u8, bit);

            let with_common = ScriptExtension::new_common().intersection(ext);
            assert!(!with_common.is_empty());
            let with_inherited = ScriptExtension::new_inherited().intersection(ext);
            assert!(!with_inherited.is_empty());
            let with_unknown = ScriptExtension::new_unknown().intersection(ext);
            assert!(with_unknown.is_empty());

            let listed: Vec<Script> = ext.iter().collect();
            assert_eq!(listed, vec![script]);
            let round_trip: Result<Script, ()> = ext.try_into();
            assert_eq!(Ok(script), round_trip);
        }
    }

    // Checks intersection and union behaviour on the sample text.
    #[test]
    fn test_specific() {
        let family = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let ext = ScriptExtension::for_str(SAMPLE_TEXT);
        assert_eq!(ext, script_extensions::DEVA);
        println!("{:?}", family);

        let overlap = ext.intersection(family);
        println!("{:?}", overlap);
        assert!(!overlap.is_empty());

        let u = ext.union(Script::Dogra.into());
        assert_eq!(u.intersection(family), u);
    }

    // `contains_script` must agree with iteration for every script.
    #[test]
    fn test_specific_ext() {
        let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let all: HashSet<_> = ext.iter().collect();

        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            assert_eq!(ext.contains_script(script), all.contains(&script));
        }

        let expected_members = [
            Script::Devanagari,
            Script::Dogra,
            Script::Gujarati,
            Script::Gurmukhi,
            Script::Khojki,
            Script::Kaithi,
            Script::Mahajani,
            Script::Modi,
            Script::Khudawadi,
            Script::Takri,
            Script::Tirhuta,
        ];
        for &member in &expected_members {
            assert!(ext.contains_script(member));
        }

        // More than one script is present, so there is no single-script form.
        let single: Result<Script, ()> = ext.try_into();
        assert!(single.is_err());
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            let result = ext.intersection(script.into());
            test::black_box(result);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let ext: ScriptExtension = Script::Devanagari.into();
        b.iter(|| {
            let ext = test::black_box(ext);
            let script: Result<Script, _> = ext.try_into();
            let _ = test::black_box(script);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext: ScriptExtension = script.into();
            test::black_box(ext);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let e1 = test::black_box(script_extensions::ARAB_GARA_NKOO_ROHG_SYRC_THAA_YEZI);
            let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            let result = e2.intersection(e1);
            test::black_box(result);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            let scripts = ext.iter().collect::<Vec<_>>();
            test::black_box(scripts);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let s = test::black_box(SAMPLE_TEXT);
            test::black_box(ScriptExtension::for_str(s));
        })
    }
}