unicode_script/
lib.rs

1//! This crate exposes the Unicode `Script` and `Script_Extension`
2//! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
3
4#![cfg_attr(not(test), no_std)]
5#![cfg_attr(feature = "bench", feature(test))]
6
7mod tables;
8
9use core::convert::{TryFrom, TryInto};
10use core::fmt;
11use core::u64;
12pub use tables::script_extensions;
13use tables::{get_script, get_script_extension, NEXT_SCRIPT};
14pub use tables::{Script, UNICODE_VERSION};
15
16impl Script {
17    /// Get the full name of a script.
18    pub fn full_name(self) -> &'static str {
19        self.inner_full_name()
20    }
21
22    /// Attempts to parse script name from the provided string.
23    /// Returns `None` if the provided string does not represent a valid
24    /// script full name.
25    pub fn from_full_name(input: &str) -> Option<Self> {
26        Self::inner_from_full_name(input)
27    }
28
29    /// Get the four-character short name of a script.
30    pub fn short_name(self) -> &'static str {
31        self.inner_short_name()
32    }
33
34    /// Attempts to parse script name from the provided string.
35    /// Returns `None` if the provided string does not represent a valid
36    /// script four-character short name.
37    pub fn from_short_name(input: &str) -> Option<Self> {
38        Self::inner_from_short_name(input)
39    }
40
41    /// The 4-byte iso15924 tag as a big-endian `u32`
42    pub fn as_iso15924_tag(self) -> u32 {
43        let arr: [u8; 4] = self.inner_short_name().as_bytes().try_into().unwrap();
44        u32::from_be_bytes(arr)
45    }
46
47    /// Is this script "Recommended" according to
48    /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
49    pub fn is_recommended(self) -> bool {
50        use Script::*;
51        match self {
52            Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
53            | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
54            | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
55            | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
56            _ => false,
57        }
58    }
59}
60
61impl From<Script> for ScriptExtension {
62    fn from(script: Script) -> Self {
63        if script == Script::Common {
64            ScriptExtension::new_common()
65        } else if script == Script::Inherited {
66            ScriptExtension::new_inherited()
67        } else if script == Script::Unknown {
68            ScriptExtension::new_unknown()
69        } else {
70            let mut first = 0;
71            let mut second = 0;
72            let mut third = 0;
73            let bit = script as u8;
74            // Find out which field it's in, and set the appropriate bit there
75            if bit < 64 {
76                first = 1 << bit as u64;
77            } else if bit < 128 {
78                // offset by 64 since `bit` is an absolute number,
79                // not relative to the chunk
80                second = 1 << (bit - 64) as u64;
81            } else {
82                third = 1 << (bit - 128) as u32;
83            }
84            ScriptExtension::new(first, second, third)
85        }
86    }
87}
88
89impl TryFrom<ScriptExtension> for Script {
90    type Error = ();
91    fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
92        if ext.is_common_or_inherited() {
93            if ext.common {
94                Ok(Script::Common)
95            } else {
96                Ok(Script::Inherited)
97            }
98        } else if ext.is_empty() {
99            Ok(Script::Unknown)
100        } else {
101            // filled elements will have set ones
102            let fo = ext.first.count_ones();
103            let so = ext.second.count_ones();
104            let to = ext.third.count_ones();
105            // only one bit set, in the first chunk
106            if fo == 1 && so == 0 && to == 0 {
107                // use trailing_zeroes() to figure out which bit it is
108                Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
109            // only one bit set, in the second chunk
110            } else if fo == 0 && so == 1 && to == 0 {
111                Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
112            // only one bit set, in the third chunk
113            } else if fo == 0 && so == 0 && to == 1 {
114                Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
115            } else {
116                Err(())
117            }
118        }
119    }
120}
121
122impl Default for Script {
123    fn default() -> Self {
124        Script::Common
125    }
126}
127
128impl From<char> for Script {
129    fn from(o: char) -> Self {
130        o.script()
131    }
132}
133
134impl fmt::Display for Script {
135    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
136        write!(f, "{}", self.full_name())
137    }
138}
139
/// A value for the `Script_Extension` property
///
/// A [`ScriptExtension`] holds a set of [`Script`]s.
///
/// Conceptually this is a `Vec<Script>`, stored compactly as bitfields.
#[non_exhaustive]
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
pub struct ScriptExtension {
    /// Bitset for script values 0 through 63.
    first: u64,
    /// Bitset for script values 64 through 127.
    second: u64,
    /// Bitset for script values 128 and above.
    third: u64,
    /// Common and Inherited are both represented by every used bit being
    /// set; this flag is what distinguishes them (`true` means Common).
    common: bool,
}
158
159impl ScriptExtension {
160    // We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX
161    // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
162    // it in the range of `third`, create a u64 with just that bit set, and subtract 1
163    // to create one with all the lower bits set.
164    const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1);
165
166    pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self {
167        ScriptExtension {
168            first,
169            second,
170            third,
171            common: false,
172        }
173    }
174
175    pub(crate) const fn new_common() -> Self {
176        ScriptExtension {
177            first: u64::MAX,
178            second: u64::MAX,
179            third: Self::THIRD_MAX,
180            common: true,
181        }
182    }
183
184    pub(crate) const fn new_inherited() -> Self {
185        ScriptExtension {
186            first: u64::MAX,
187            second: u64::MAX,
188            third: Self::THIRD_MAX,
189            common: false,
190        }
191    }
192
193    pub(crate) const fn new_unknown() -> Self {
194        ScriptExtension {
195            first: 0,
196            second: 0,
197            third: 0,
198            common: false,
199        }
200    }
201
202    const fn is_common_or_inherited(self) -> bool {
203        (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
204    }
205
206    /// Checks if the script extension is Common
207    pub const fn is_common(self) -> bool {
208        self.is_common_or_inherited() & self.common
209    }
210
211    /// Checks if the script extension is Inherited
212    pub const fn is_inherited(self) -> bool {
213        self.is_common_or_inherited() & !self.common
214    }
215
216    /// Checks if the script extension is empty (unknown)
217    pub const fn is_empty(self) -> bool {
218        (self.first == 0) & (self.second == 0) & (self.third == 0)
219    }
220
221    /// Returns the number of scripts in the script extension
222    pub fn len(self) -> usize {
223        if self.is_common_or_inherited() {
224            1
225        } else {
226            (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
227        }
228    }
229
230    /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
231    /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
232    /// in `self`
233    ///
234    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
235    /// everything, the intersection of `Common` and `Inherited` is `Inherited`
236    pub fn intersect_with(&mut self, other: Self) {
237        *self = self.intersection(other)
238    }
239
240    /// Find the intersection between two ScriptExtensions. Returns Unknown if things
241    /// do not intersect.
242    ///
243    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
244    /// everything, the intersection of `Common` and `Inherited` is `Inherited`
245    pub const fn intersection(self, other: Self) -> Self {
246        let first = self.first & other.first;
247        let second = self.second & other.second;
248        let third = self.third & other.third;
249        let common = self.common & other.common;
250        ScriptExtension {
251            first,
252            second,
253            third,
254            common,
255        }
256    }
257
258    /// Find the union between two ScriptExtensions.
259    ///
260    /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
261    /// everything, the union of `Common` and `Inherited` is `Common`
262    pub const fn union(self, other: Self) -> Self {
263        let first = self.first | other.first;
264        let second = self.second | other.second;
265        let third = self.third | other.third;
266        let common = self.common | other.common;
267        ScriptExtension {
268            first,
269            second,
270            third,
271            common,
272        }
273    }
274
275    /// Check if this ScriptExtension contains the given script
276    ///
277    /// Should be used with specific scripts only, this will
278    /// return `true` if `self` is not `Unknown` and `script` is
279    /// `Common` or `Inherited`
280    pub fn contains_script(self, script: Script) -> bool {
281        !self.intersection(script.into()).is_empty()
282    }
283
284    /// Get the intersection of script extensions of all characters
285    /// in a string.
286    pub fn for_str(x: &str) -> Self {
287        let mut ext = ScriptExtension::default();
288        for ch in x.chars() {
289            ext.intersect_with(ch.into());
290        }
291        ext
292    }
293
294    /// Iterate over the scripts in this script extension
295    ///
296    /// Will never yield Script::Unknown
297    pub fn iter(self) -> ScriptIterator {
298        ScriptIterator { ext: self }
299    }
300}
301
302impl Default for ScriptExtension {
303    fn default() -> Self {
304        ScriptExtension::new_common()
305    }
306}
307
308impl From<char> for ScriptExtension {
309    fn from(o: char) -> Self {
310        o.script_extension()
311    }
312}
313
314impl From<&'_ str> for ScriptExtension {
315    fn from(o: &'_ str) -> Self {
316        Self::for_str(o)
317    }
318}
319
320impl fmt::Debug for ScriptExtension {
321    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
322        write!(f, "ScriptExtension(")?;
323        fmt::Display::fmt(self, f)?;
324        write!(f, ")")
325    }
326}
327
328impl fmt::Display for ScriptExtension {
329    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
330        if self.is_common() {
331            write!(f, "Common")?;
332        } else if self.is_inherited() {
333            write!(f, "Inherited")?;
334        } else if self.is_empty() {
335            write!(f, "Unknown")?;
336        } else {
337            let mut first = true;
338            for script in self.iter() {
339                if first {
340                    first = false;
341                } else {
342                    write!(f, " + ")?;
343                }
344                script.full_name().fmt(f)?;
345            }
346        }
347        Ok(())
348    }
349}
350
/// Extension trait on `char` for calculating script properties
///
/// Bring this trait into scope to call `.script()` and
/// `.script_extension()` directly on a value.
pub trait UnicodeScript {
    /// Get the script for a given character
    ///
    /// The `char` implementation in this file returns `Script::Unknown`
    /// when the table lookup has no entry for the character.
    fn script(&self) -> Script;
    /// Get the Script_Extension for a given character
    ///
    /// The `char` implementation in this file falls back to the extension
    /// derived from `script()` when no explicit extension entry exists.
    fn script_extension(&self) -> ScriptExtension;
}
358
359impl UnicodeScript for char {
360    fn script(&self) -> Script {
361        get_script(*self).unwrap_or(Script::Unknown)
362    }
363
364    fn script_extension(&self) -> ScriptExtension {
365        get_script_extension(*self).unwrap_or_else(|| self.script().into())
366    }
367}
368
/// Iterator over scripts in a [ScriptExtension].
///
/// Can be obtained via [ScriptExtension::iter()]
///
/// A Common or Inherited extension yields that one script and then ends;
/// any other extension yields one script per set bit, lowest bit first.
pub struct ScriptIterator {
    // The scripts not yet yielded; bits are cleared as iteration advances.
    ext: ScriptExtension,
}
375
376impl Iterator for ScriptIterator {
377    type Item = Script;
378
379    fn next(&mut self) -> Option<Script> {
380        if self.ext.is_common_or_inherited() {
381            let common = self.ext.common;
382            self.ext = ScriptExtension::new_unknown();
383            if common {
384                Some(Script::Common)
385            } else {
386                Some(Script::Inherited)
387            }
388        // Are there bits left in the first chunk?
389        } else if self.ext.first != 0 {
390            // Find the next bit
391            let bit = self.ext.first.trailing_zeros();
392            // unset just that bit
393            self.ext.first &= !(1 << bit);
394            Some(Script::for_integer(bit as u8))
395        // Are there bits left in the second chunk?
396        } else if self.ext.second != 0 {
397            let bit = self.ext.second.trailing_zeros();
398            self.ext.second &= !(1 << bit);
399            Some(Script::for_integer(64 + bit as u8))
400        // Are there bits left in the third chunk?
401        } else if self.ext.third != 0 {
402            let bit = self.ext.third.trailing_zeros();
403            self.ext.third &= !(1 << bit);
404            Some(Script::for_integer(128 + bit as u8))
405        } else {
406            // Script::Unknown
407            None
408        }
409    }
410}
411
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    use test::bench::Bencher;
    #[cfg(feature = "bench")]
    extern crate test;

    /// Sample text shared by `test_specific` and `bench_string_ext`; its
    /// combined script extension is expected to be `script_extensions::DEVA`.
    const DEVANAGARI_SAMPLE: &str = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";

    /// Every script value must map to a unique script and a unique
    /// extension, and must round-trip through `ScriptExtension`.
    #[test]
    fn test_conversion() {
        let common = ScriptExtension::new_common();
        let inherited = ScriptExtension::new_inherited();
        let unknown = ScriptExtension::new_unknown();

        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            let ext: ScriptExtension = script.into();
            // `insert` returns false when the value was already present.
            if !seen_scripts.insert(script) {
                panic!("Found script {:?} twice!", script)
            }
            if !seen_exts.insert(ext) {
                panic!("Found extension {:?} twice!", ext)
            }
            assert_eq!(script as u8, bit);
            assert!(!common.intersection(ext).is_empty());
            assert!(!inherited.intersection(ext).is_empty());
            assert!(unknown.intersection(ext).is_empty());
            assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
            assert_eq!(Ok(script), ext.try_into());
        }
    }

    /// The sample text resolves to Devanagari and interacts as expected with
    /// a wider extension set that contains Devanagari.
    #[test]
    fn test_specific() {
        let wide = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let ext = ScriptExtension::for_str(DEVANAGARI_SAMPLE);
        assert_eq!(ext, script_extensions::DEVA);
        println!("{:?}", wide);
        let overlap = ext.intersection(wide);
        println!("{:?}", overlap);
        assert!(!overlap.is_empty());

        let u = ext.union(Script::Dogra.into());
        assert_eq!(u.intersection(wide), u);
    }

    /// `contains_script` must agree with iteration for every script value,
    /// and a multi-script extension must not convert to a single `Script`.
    #[test]
    fn test_specific_ext() {
        let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let all: HashSet<_> = ext.iter().collect();

        for bit in 0..NEXT_SCRIPT {
            let script = Script::for_integer(bit);
            let expected = all.contains(&script);
            assert!(ext.contains_script(script) == expected);
        }

        let members = [
            Script::Devanagari,
            Script::Dogra,
            Script::Gujarati,
            Script::Gurmukhi,
            Script::Khojki,
            Script::Kaithi,
            Script::Mahajani,
            Script::Modi,
            Script::Khudawadi,
            Script::Takri,
            Script::Tirhuta,
        ];
        for member in members.iter() {
            assert!(ext.contains_script(*member));
        }

        let scr: Result<Script, _> = ext.try_into();
        assert!(scr.is_err());
    }

    /// Intersecting a large extension with a single script.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let single = test::black_box(Script::Devanagari);
            let many = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(many.intersection(single.into()));
        })
    }

    /// Converting a single-script extension back into a `Script`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let source: ScriptExtension = Script::Devanagari.into();
        b.iter(|| {
            let opaque = test::black_box(source);
            let converted: Result<Script, _> = opaque.try_into();
            let _ = test::black_box(converted);
        })
    }

    /// Converting a `Script` into its extension.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let single = test::black_box(Script::Devanagari);
            let converted: ScriptExtension = single.into();
            test::black_box(converted);
        })
    }

    /// Intersecting two multi-script extensions.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let lhs = test::black_box(script_extensions::ARAB_GARA_NKOO_ROHG_SYRC_THAA_YEZI);
            let rhs = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(rhs.intersection(lhs));
        })
    }

    /// Collecting all scripts of a large extension into a `Vec`.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let many = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(many.iter().collect::<Vec<_>>());
        })
    }

    /// Computing the combined extension of the sample text.
    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let text = test::black_box(DEVANAGARI_SAMPLE);
            test::black_box(ScriptExtension::for_str(text));
        })
    }
}