unicode_security/
mixed_script.rs

1//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
2
3use core::fmt::{self, Debug};
4use unicode_script::{Script, ScriptExtension};
5
/// An Augmented script set, as defined by UTS 39
///
/// Pairs a base [`ScriptExtension`] with one flag for each of the three
/// combined writing systems tracked alongside it: Han with Bopomofo,
/// Japanese and Korean.
///
/// <https://www.unicode.org/reports/tr39/#def-augmented-script-set>
#[derive(Copy, Clone, PartialEq, Hash, Eq)]
pub struct AugmentedScriptSet {
    /// The base ScriptExtension value
    pub base: ScriptExtension,
    /// Han With Bopomofo
    pub hanb: bool,
    /// Japanese
    pub jpan: bool,
    /// Korean
    pub kore: bool,
}
20
21impl From<ScriptExtension> for AugmentedScriptSet {
22    fn from(ext: ScriptExtension) -> Self {
23        let mut hanb = false;
24        let mut jpan = false;
25        let mut kore = false;
26
27        if ext.is_common() || ext.is_inherited() || ext.contains_script(Script::Han) {
28            hanb = true;
29            jpan = true;
30            kore = true;
31        } else {
32            if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
33                jpan = true;
34            }
35
36            if ext.contains_script(Script::Hangul) {
37                kore = true;
38            }
39
40            if ext.contains_script(Script::Bopomofo) {
41                hanb = true;
42            }
43        }
44        Self {
45            base: ext,
46            hanb,
47            jpan,
48            kore,
49        }
50    }
51}
52
53impl From<char> for AugmentedScriptSet {
54    fn from(c: char) -> Self {
55        AugmentedScriptSet::for_char(c)
56    }
57}
58
59impl From<&'_ str> for AugmentedScriptSet {
60    fn from(s: &'_ str) -> Self {
61        AugmentedScriptSet::for_str(s)
62    }
63}
64
65impl Default for AugmentedScriptSet {
66    fn default() -> Self {
67        AugmentedScriptSet {
68            base: Script::Common.into(),
69            hanb: true,
70            jpan: true,
71            kore: true,
72        }
73    }
74}
75
76impl Debug for AugmentedScriptSet {
77    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
78        if self.is_empty() {
79            write!(f, "AugmentedScriptSet {{∅}}")?;
80        } else if self.is_all() {
81            write!(f, "AugmentedScriptSet {{ALL}}")?;
82        } else {
83            write!(f, "AugmentedScriptSet {{")?;
84            let mut first_entry = true;
85            let hanb = if self.hanb { Some("Hanb") } else { None };
86            let jpan = if self.jpan { Some("Jpan") } else { None };
87            let kore = if self.kore { Some("Kore") } else { None };
88            for writing_system in None
89                .into_iter()
90                .chain(hanb)
91                .chain(jpan)
92                .chain(kore)
93                .chain(self.base.iter().map(Script::short_name))
94            {
95                if !first_entry {
96                    write!(f, ", ")?;
97                } else {
98                    first_entry = false;
99                }
100                write!(f, "{}", writing_system)?;
101            }
102            write!(f, "}}")?;
103        }
104        Ok(())
105    }
106}
107
108impl fmt::Display for AugmentedScriptSet {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        if self.is_empty() {
111            write!(f, "Empty")?;
112        } else if self.is_all() {
113            write!(f, "All")?;
114        } else {
115            let mut first_entry = true;
116            let hanb = if self.hanb {
117                Some("Han with Bopomofo")
118            } else {
119                None
120            };
121            let jpan = if self.jpan { Some("Japanese") } else { None };
122            let kore = if self.kore { Some("Korean") } else { None };
123            for writing_system in None
124                .into_iter()
125                .chain(hanb)
126                .chain(jpan)
127                .chain(kore)
128                .chain(self.base.iter().map(Script::full_name))
129            {
130                if !first_entry {
131                    write!(f, ", ")?;
132                } else {
133                    first_entry = false;
134                }
135                write!(f, "{}", writing_system)?;
136            }
137        }
138        Ok(())
139    }
140}
141
142impl AugmentedScriptSet {
143    /// Intersect this set with another
144    pub fn intersect_with(&mut self, other: Self) {
145        self.base.intersect_with(other.base);
146        self.hanb = self.hanb && other.hanb;
147        self.jpan = self.jpan && other.jpan;
148        self.kore = self.kore && other.kore;
149    }
150
151    /// Check if the set is empty
152    pub fn is_empty(&self) -> bool {
153        self.base.is_empty() && !self.hanb && !self.jpan && !self.kore
154    }
155
156    /// Check if the set is "All" (Common or Inherited)
157    pub fn is_all(&self) -> bool {
158        self.base.is_common() || self.base.is_inherited()
159    }
160
161    /// Construct an AugmentedScriptSet for a given character
162    pub fn for_char(c: char) -> Self {
163        ScriptExtension::from(c).into()
164    }
165
166    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
167    pub fn for_str(s: &str) -> Self {
168        let mut set = AugmentedScriptSet::default();
169        for ch in s.chars() {
170            set.intersect_with(ch.into())
171        }
172        set
173    }
174}
175
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
///
/// Both methods take `self` by value.
pub trait MixedScript {
    /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
    ///
    /// Note that a single-script string may still contain multiple Script properties!
    fn is_single_script(self) -> bool;

    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
    ///
    /// The result is returned as an [`AugmentedScriptSet`].
    fn resolve_script_set(self) -> AugmentedScriptSet;
}
186
187impl MixedScript for &'_ str {
188    fn is_single_script(self) -> bool {
189        !AugmentedScriptSet::for_str(self).is_empty()
190    }
191
192    fn resolve_script_set(self) -> AugmentedScriptSet {
193        self.into()
194    }
195}
196
197/// Check if a character is considered potential mixed script confusable.
198///
199/// If the specified character is not restricted from use for identifiers,
200/// this function returns whether it is considered mixed script confusable
201/// with another character that is not restricted from use for identifiers.
202///
203/// If the specified character is restricted from use for identifiers,
204/// the return value is unspecified.
205pub fn is_potential_mixed_script_confusable_char(c: char) -> bool {
206    use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable;
207
208    potential_mixed_script_confusable(c)
209}