Skip to main content

azul_layout/text3/
script.rs

1// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs
2//
3// See: https://github.com/greyblake/whatlang-rs/pull/67
4
5// License:
6//
7// (The MIT License)
8//
9// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
10// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
11// Copyright (c) 2008 Kent S Johnson
12// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
13// Copyright (c) 2004 Maciej Ceglowski
14//
15// Permission is hereby granted, free of charge, to any person obtaining
16// a copy of this software and associated documentation files (the
17// 'Software'), to deal in the Software without restriction, including
18// without limitation the rights to use, copy, modify, merge, publish,
19// distribute, sublicense, and/or sell copies of the Software, and to
20// permit persons to whom the Software is furnished to do so, subject to
21// the following conditions:
22//
23// The above copyright notice and this permission notice shall be
24// included in all copies or substantial portions of the Software.
25//
26// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
27// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
29// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
30// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
31// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
32// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33
34#[cfg(feature = "text_layout_hyphenation")]
35use hyphenation::Language as HyphenationLanguage;
36#[cfg(feature = "text_layout_hyphenation")]
37pub use hyphenation::Language;
38
/// Fallback `Language` type used when the `text_layout_hyphenation` feature
/// is disabled.
///
/// It lists the language variants that the detection functions in this module
/// return, grouped by the script they are written in.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[allow(dead_code)]
pub enum Language {
    // ---- Languages written in Latin script ----
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,

    // ---- Languages written in Cyrillic script ----
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,

    // ---- Languages written in Greek script ----
    GreekMono,
    GreekPoly,
    Coptic,

    // ---- Languages written in Indic scripts ----
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,

    // ---- Remaining scripts ----
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}
97
98use rust_fontconfig::UnicodeRange;
99
/// Writing systems that [`detect_script`] and [`detect_char_script`] can
/// report.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Script {
    // NOTE: variants must stay in alphabetic order (the C bindings rely on it).
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
128
129impl Script {
130    /// Maps a Script to a vector of its representative Unicode character ranges.
131    ///
132    /// The ranges are extracted from the `is_*` functions in the provided source code.
133    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
134        match self {
135            Script::Arabic => vec![
136                UnicodeRange {
137                    start: 0x0600,
138                    end: 0x06FF,
139                },
140                UnicodeRange {
141                    start: 0x0750,
142                    end: 0x07FF,
143                },
144                UnicodeRange {
145                    start: 0x08A0,
146                    end: 0x08FF,
147                },
148                UnicodeRange {
149                    start: 0xFB50,
150                    end: 0xFDFF,
151                },
152                UnicodeRange {
153                    start: 0xFE70,
154                    end: 0xFEFF,
155                },
156                UnicodeRange {
157                    start: 0x10E60,
158                    end: 0x10E7F,
159                },
160                UnicodeRange {
161                    start: 0x1EE00,
162                    end: 0x1EEFF,
163                },
164            ],
165            Script::Bengali => vec![UnicodeRange {
166                start: 0x0980,
167                end: 0x09FF,
168            }],
169            Script::Cyrillic => vec![
170                UnicodeRange {
171                    start: 0x0400,
172                    end: 0x0484,
173                },
174                UnicodeRange {
175                    start: 0x0487,
176                    end: 0x052F,
177                },
178                UnicodeRange {
179                    start: 0x2DE0,
180                    end: 0x2DFF,
181                },
182                UnicodeRange {
183                    start: 0xA640,
184                    end: 0xA69D,
185                },
186                UnicodeRange {
187                    start: 0x1D2B,
188                    end: 0x1D2B,
189                },
190                UnicodeRange {
191                    start: 0x1D78,
192                    end: 0x1D78,
193                },
194                UnicodeRange {
195                    start: 0xA69F,
196                    end: 0xA69F,
197                },
198            ],
199            Script::Devanagari => vec![
200                UnicodeRange {
201                    start: 0x0900,
202                    end: 0x097F,
203                },
204                UnicodeRange {
205                    start: 0xA8E0,
206                    end: 0xA8FF,
207                },
208                UnicodeRange {
209                    start: 0x1CD0,
210                    end: 0x1CFF,
211                },
212            ],
213            Script::Ethiopic => vec![
214                UnicodeRange {
215                    start: 0x1200,
216                    end: 0x139F,
217                },
218                UnicodeRange {
219                    start: 0x2D80,
220                    end: 0x2DDF,
221                },
222                UnicodeRange {
223                    start: 0xAB00,
224                    end: 0xAB2F,
225                },
226            ],
227            Script::Georgian => vec![UnicodeRange {
228                start: 0x10A0,
229                end: 0x10FF,
230            }],
231            Script::Greek => vec![UnicodeRange {
232                start: 0x0370,
233                end: 0x03FF,
234            }],
235            Script::Gujarati => vec![UnicodeRange {
236                start: 0x0A80,
237                end: 0x0AFF,
238            }],
239            Script::Gurmukhi => vec![UnicodeRange {
240                start: 0x0A00,
241                end: 0x0A7F,
242            }],
243            Script::Hangul => vec![
244                UnicodeRange {
245                    start: 0xAC00,
246                    end: 0xD7AF,
247                },
248                UnicodeRange {
249                    start: 0x1100,
250                    end: 0x11FF,
251                },
252                UnicodeRange {
253                    start: 0x3130,
254                    end: 0x318F,
255                },
256                UnicodeRange {
257                    start: 0x3200,
258                    end: 0x32FF,
259                },
260                UnicodeRange {
261                    start: 0xA960,
262                    end: 0xA97F,
263                },
264                UnicodeRange {
265                    start: 0xD7B0,
266                    end: 0xD7FF,
267                },
268                UnicodeRange {
269                    start: 0xFF00,
270                    end: 0xFFEF,
271                },
272            ],
273            Script::Hebrew => vec![UnicodeRange {
274                start: 0x0590,
275                end: 0x05FF,
276            }],
277            Script::Hiragana => vec![UnicodeRange {
278                start: 0x3040,
279                end: 0x309F,
280            }],
281            Script::Kannada => vec![UnicodeRange {
282                start: 0x0C80,
283                end: 0x0CFF,
284            }],
285            Script::Katakana => vec![UnicodeRange {
286                start: 0x30A0,
287                end: 0x30FF,
288            }],
289            Script::Khmer => vec![
290                UnicodeRange {
291                    start: 0x1780,
292                    end: 0x17FF,
293                },
294                UnicodeRange {
295                    start: 0x19E0,
296                    end: 0x19FF,
297                },
298            ],
299            Script::Latin => vec![
300                UnicodeRange {
301                    start: 0x0041,
302                    end: 0x005A,
303                }, // A-Z
304                UnicodeRange {
305                    start: 0x0061,
306                    end: 0x007A,
307                }, // a-z
308                UnicodeRange {
309                    start: 0x0080,
310                    end: 0x00FF,
311                },
312                UnicodeRange {
313                    start: 0x0100,
314                    end: 0x017F,
315                },
316                UnicodeRange {
317                    start: 0x0180,
318                    end: 0x024F,
319                },
320                UnicodeRange {
321                    start: 0x0250,
322                    end: 0x02AF,
323                },
324                UnicodeRange {
325                    start: 0x1D00,
326                    end: 0x1D7F,
327                },
328                UnicodeRange {
329                    start: 0x1D80,
330                    end: 0x1DBF,
331                },
332                UnicodeRange {
333                    start: 0x1E00,
334                    end: 0x1EFF,
335                },
336                UnicodeRange {
337                    start: 0x2100,
338                    end: 0x214F,
339                },
340                UnicodeRange {
341                    start: 0x2C60,
342                    end: 0x2C7F,
343                },
344                UnicodeRange {
345                    start: 0xA720,
346                    end: 0xA7FF,
347                },
348                UnicodeRange {
349                    start: 0xAB30,
350                    end: 0xAB6F,
351                },
352            ],
353            Script::Malayalam => vec![UnicodeRange {
354                start: 0x0D00,
355                end: 0x0D7F,
356            }],
357            Script::Mandarin => vec![
358                UnicodeRange {
359                    start: 0x2E80,
360                    end: 0x2E99,
361                },
362                UnicodeRange {
363                    start: 0x2E9B,
364                    end: 0x2EF3,
365                },
366                UnicodeRange {
367                    start: 0x2F00,
368                    end: 0x2FD5,
369                },
370                UnicodeRange {
371                    start: 0x3005,
372                    end: 0x3005,
373                },
374                UnicodeRange {
375                    start: 0x3007,
376                    end: 0x3007,
377                },
378                UnicodeRange {
379                    start: 0x3021,
380                    end: 0x3029,
381                },
382                UnicodeRange {
383                    start: 0x3038,
384                    end: 0x303B,
385                },
386                UnicodeRange {
387                    start: 0x3400,
388                    end: 0x4DB5,
389                },
390                UnicodeRange {
391                    start: 0x4E00,
392                    end: 0x9FCC,
393                },
394                UnicodeRange {
395                    start: 0xF900,
396                    end: 0xFA6D,
397                },
398                UnicodeRange {
399                    start: 0xFA70,
400                    end: 0xFAD9,
401                },
402            ],
403            Script::Myanmar => vec![UnicodeRange {
404                start: 0x1000,
405                end: 0x109F,
406            }],
407            Script::Oriya => vec![UnicodeRange {
408                start: 0x0B00,
409                end: 0x0B7F,
410            }],
411            Script::Sinhala => vec![UnicodeRange {
412                start: 0x0D80,
413                end: 0x0DFF,
414            }],
415            Script::Tamil => vec![UnicodeRange {
416                start: 0x0B80,
417                end: 0x0BFF,
418            }],
419            Script::Telugu => vec![UnicodeRange {
420                start: 0x0C00,
421                end: 0x0C7F,
422            }],
423            Script::Thai => vec![UnicodeRange {
424                start: 0x0E00,
425                end: 0x0E7F,
426            }],
427        }
428    }
429}
430
/// Returns `true` for "stop" characters: ASCII control characters, space,
/// punctuation and digits.
///
/// A stop character does not give any value for script or language detection.
/// The accepted set is U+0000..=U+0040, U+005B..=U+0060 and U+007B..=U+007E;
/// U+007F and everything above it is never a stop character.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    // Below U+007F the only characters that are NOT stop characters are the
    // ASCII letters A-Z and a-z.
    ch < '\u{007F}' && !ch.is_ascii_alphabetic()
}
438
439type ScriptCounter = (Script, fn(char) -> bool, usize);
440
441/// Detect only a script by a given text
442pub fn detect_script(text: &str) -> Option<Script> {
443    let mut script_counters: [ScriptCounter; 24] = [
444        (Script::Latin, is_latin, 0),
445        (Script::Cyrillic, is_cyrillic, 0),
446        (Script::Arabic, is_arabic, 0),
447        (Script::Mandarin, is_mandarin, 0),
448        (Script::Devanagari, is_devanagari, 0),
449        (Script::Hebrew, is_hebrew, 0),
450        (Script::Ethiopic, is_ethiopic, 0),
451        (Script::Georgian, is_georgian, 0),
452        (Script::Bengali, is_bengali, 0),
453        (Script::Hangul, is_hangul, 0),
454        (Script::Hiragana, is_hiragana, 0),
455        (Script::Katakana, is_katakana, 0),
456        (Script::Greek, is_greek, 0),
457        (Script::Kannada, is_kannada, 0),
458        (Script::Tamil, is_tamil, 0),
459        (Script::Thai, is_thai, 0),
460        (Script::Gujarati, is_gujarati, 0),
461        (Script::Gurmukhi, is_gurmukhi, 0),
462        (Script::Telugu, is_telugu, 0),
463        (Script::Malayalam, is_malayalam, 0),
464        (Script::Oriya, is_oriya, 0),
465        (Script::Myanmar, is_myanmar, 0),
466        (Script::Sinhala, is_sinhala, 0),
467        (Script::Khmer, is_khmer, 0),
468    ];
469
470    let half = text.chars().count() / 2;
471
472    for ch in text.chars() {
473        if is_stop_char(ch) {
474            continue;
475        }
476
477        // For performance reasons, we need to mutate script_counters by calling
478        // `swap` function, it would not be possible to do using normal iterator.
479        for i in 0..script_counters.len() {
480            let found = {
481                let (script, check_fn, ref mut count) = script_counters[i];
482                if check_fn(ch) {
483                    *count += 1;
484                    if *count > half {
485                        return Some(script);
486                    }
487                    true
488                } else {
489                    false
490                }
491            };
492            // Have to let borrow of count fall out of scope before doing swapping, or we could
493            // do this above.
494            if found {
495                // If script was found, move it closer to the front.
496                // If the text contains largely 1 or 2 scripts, this will
497                // cause these scripts to be eventually checked first.
498                if i > 0 {
499                    script_counters.swap(i - 1, i);
500                }
501                break;
502            }
503        }
504    }
505
506    let (script, _, count) = script_counters
507        .iter()
508        .cloned()
509        .max_by_key(|&(_, _, count)| count)
510        .unwrap();
511    if count != 0 {
512        Some(script)
513    } else {
514        None
515    }
516}
517
518pub fn detect_char_script(ch: char) -> Option<Script> {
519    let script_counters: [ScriptCounter; 24] = [
520        (Script::Latin, is_latin, 0),
521        (Script::Cyrillic, is_cyrillic, 0),
522        (Script::Arabic, is_arabic, 0),
523        (Script::Mandarin, is_mandarin, 0),
524        (Script::Devanagari, is_devanagari, 0),
525        (Script::Hebrew, is_hebrew, 0),
526        (Script::Ethiopic, is_ethiopic, 0),
527        (Script::Georgian, is_georgian, 0),
528        (Script::Bengali, is_bengali, 0),
529        (Script::Hangul, is_hangul, 0),
530        (Script::Hiragana, is_hiragana, 0),
531        (Script::Katakana, is_katakana, 0),
532        (Script::Greek, is_greek, 0),
533        (Script::Kannada, is_kannada, 0),
534        (Script::Tamil, is_tamil, 0),
535        (Script::Thai, is_thai, 0),
536        (Script::Gujarati, is_gujarati, 0),
537        (Script::Gurmukhi, is_gurmukhi, 0),
538        (Script::Telugu, is_telugu, 0),
539        (Script::Malayalam, is_malayalam, 0),
540        (Script::Oriya, is_oriya, 0),
541        (Script::Myanmar, is_myanmar, 0),
542        (Script::Sinhala, is_sinhala, 0),
543        (Script::Khmer, is_khmer, 0),
544    ];
545
546    for i in 0..script_counters.len() {
547        let (script, check_fn, _) = script_counters[i];
548        if check_fn(ch) {
549            return Some(script);
550        }
551    }
552    None
553}
554
555/// Iterates through the text once and returns as soon as an Assamese-specific character is found.
556fn detect_bengali_language(text: &str) -> Language {
557    for c in text.chars() {
558        // These characters are specific to Assamese in the Bengali script block.
559        // We can return immediately as this is the highest priority check.
560        if matches!(c, '\u{09F0}' | '\u{09F1}') {
561            // ৰ, ৱ
562            return Language::Assamese;
563        }
564    }
565    // If we finish the loop without finding any Assamese characters, it's Bengali.
566    Language::Bengali
567}
568
569fn detect_cyrillic_language(text: &str) -> Language {
570    for c in text.chars() {
571        match c {
572            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.
573            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
574            // Set flags for other languages. We don't return yet because a higher-priority
575            // character (like the one above) could still appear.
576            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
577            'ў' => return Language::Belarusian,
578            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
579            'ө' | 'ү' | 'һ' => return Language::Mongolian,
580            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
581            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.
582            // The logic implies that if either is present, it might be Bulgarian.
583            'щ' => return Language::Bulgarian,
584            _ => {}
585        }
586    }
587
588    Language::Russian
589}
590
591fn detect_devanagari_language(text: &str) -> Language {
592    for c in text.chars() {
593        match c {
594            // Marathi has higher priority in the original logic. Return immediately.
595            '\u{0933}' => return Language::Marathi, // ळ
596            // Flag for Sanskrit Vedic extensions.
597            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
598            _ => (),
599        }
600    }
601
602    Language::Hindi
603}
604
605fn detect_greek_language(text: &str) -> Language {
606    let mut has_polytonic = false;
607
608    for c in text.chars() {
609        match c {
610            // Coptic has higher priority. Return immediately.
611            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612            // Flag for Greek Extended (Polytonic) characters.
613            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614            _ => {}
615        }
616    }
617
618    Language::GreekMono
619}
620
621fn detect_latin_language(text: &str) -> Language {
622    // Flags for languages checked near the end of the original if-else chain.
623    let mut has_french_c = false;
624    let mut has_portugese_o = false;
625    let mut has_portuguese_a = false;
626
627    for c in text.chars() {
628        match c {
629            // --- Early Return Cases (in order of priority) ---
630            'ß' => return Language::German1996,
631            'ő' | 'ű' => return Language::Hungarian,
632            'ł' => return Language::Polish,
633            'ř' | 'ů' => return Language::Czech,
634            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636                return Language::Latvian
637            }
638            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639            'ă' | 'ș' | 'ț' => return Language::Romanian,
640            'ğ' | 'ı' | 'ş' => return Language::Turkish,
641            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */
642            // original's intent
643            'þ' | 'ð' => return Language::Icelandic,
644            'ŵ' | 'ŷ' => return Language::Welsh,
645            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish
646            'å' => return Language::Swedish,               // And Norwegian, Finnish
647            'ñ' => return Language::Spanish,
648            'ä' | 'ö' | 'ü' => return Language::German1996,
649
650            // NOTE: 'õ' is used by both Estonian and Portuguese
651            // Since Estonian is checked first, it takes precedence.
652            'õ' => has_portugese_o = true,
653            'ã' => has_portuguese_a = true,
654
655            // --- Flag-setting Cases ---
656            'ç' => has_french_c = true, // Also in Portuguese
657            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658
659            _ => (),
660        }
661    }
662
663    // decide between portuguese, estonian and french
664
665    if has_french_c && !has_portugese_o && !has_portuguese_a {
666        return Language::French;
667    }
668
669    if has_portugese_o && !has_french_c && !has_portuguese_a {
670        return Language::Estonian;
671    }
672
673    if has_portugese_o || has_portuguese_a || has_french_c {
674        return Language::Portuguese;
675    }
676
677    Language::EnglishUS
678}
679
680pub fn script_to_language(script: Script, text: &str) -> Language {
681    match script {
682        Script::Ethiopic => Language::Ethiopic,
683        Script::Georgian => Language::Georgian,
684        Script::Gujarati => Language::Gujarati,
685        Script::Gurmukhi => Language::Panjabi,
686        Script::Kannada => Language::Kannada,
687        Script::Malayalam => Language::Malayalam,
688        Script::Mandarin => Language::Chinese,
689        Script::Oriya => Language::Oriya,
690        Script::Tamil => Language::Tamil,
691        Script::Telugu => Language::Telugu,
692        Script::Thai => Language::Thai,
693        Script::Bengali => detect_bengali_language(text),
694        Script::Cyrillic => detect_cyrillic_language(text),
695        Script::Devanagari => detect_devanagari_language(text),
696        Script::Greek => detect_greek_language(text),
697        Script::Latin => detect_latin_language(text),
698
699        // not directly matchable
700        Script::Myanmar => Language::Thai,
701        Script::Khmer => Language::Thai,
702        Script::Sinhala => Language::Hindi,
703
704        // no classical hyphenation behaviour
705        Script::Arabic => Language::Chinese,
706        Script::Hebrew => Language::Chinese,
707        Script::Hangul => Language::Chinese,
708        Script::Hiragana => Language::Chinese,
709        Script::Katakana => Language::Chinese,
710    }
711}
712
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Cyrillic.
///
/// U+0485 and U+0486 are deliberately left out of the main range.
pub fn is_cyrillic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0400..=0x0484
            | 0x0487..=0x052F
            | 0x2DE0..=0x2DFF
            | 0xA640..=0xA69D
            | 0x1D2B
            | 0x1D78
            | 0xA69F
    )
}
724
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Latin (ASCII letters plus the extended Latin ranges below).
///
/// See: https://en.wikipedia.org/wiki/Latin_script_in_Unicode
pub fn is_latin(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0061..=0x007A // a-z
            | 0x0041..=0x005A // A-Z
            | 0x0080..=0x00FF
            | 0x0100..=0x017F
            | 0x0180..=0x024F
            | 0x0250..=0x02AF
            | 0x1D00..=0x1D7F
            | 0x1D80..=0x1DBF
            | 0x1E00..=0x1EFF
            | 0x2100..=0x214F
            | 0x2C60..=0x2C7F
            | 0xA720..=0xA7FF
            | 0xAB30..=0xAB6F
    )
}
743
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Arabic.
///
/// Based on https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
///
/// NOTE(review): the second range runs up to U+07FF, which is past the end of
/// the Arabic Supplement block (U+077F) — confirm that U+0780..=U+07FF is
/// meant to be included.
pub fn is_arabic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0600..=0x06FF
            | 0x0750..=0x07FF
            | 0x08A0..=0x08FF
            | 0xFB50..=0xFDFF
            | 0xFE70..=0xFEFF
            | 0x10E60..=0x10E7F
            | 0x1EE00..=0x1EEFF
    )
}
756
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Devanagari.
///
/// Based on https://en.wikipedia.org/wiki/Devanagari#Unicode
pub fn is_devanagari(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0900..=0x097F | 0xA8E0..=0xA8FF | 0x1CD0..=0x1CFF
    )
}
761
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Ethiopic.
///
/// Based on https://www.key-shortcut.com/en/writing-systems/ethiopian-script/
pub fn is_ethiopic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x1200..=0x139F | 0x2D80..=0x2DDF | 0xAB00..=0xAB2F
    )
}
766
/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module
/// treats as Hebrew.
///
/// Based on https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771
/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module
/// treats as Georgian.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
775
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Mandarin (Chinese ideographs and related characters).
pub fn is_mandarin(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x2E80..=0x2E99
            | 0x2E9B..=0x2EF3
            | 0x2F00..=0x2FD5
            | 0x3005
            | 0x3007
            | 0x3021..=0x3029
            | 0x3038..=0x303B
            | 0x3400..=0x4DB5
            | 0x4E00..=0x9FCC
            | 0xF900..=0xFA6D
            | 0xFA70..=0xFAD9
    )
}
791
/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module
/// treats as Bengali.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
795
/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module
/// treats as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
799
/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module
/// treats as Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803
/// Returns `true` if `ch` falls into one of the code point ranges this module
/// treats as Hangul, the Korean alphabet.
///
/// Unicode ranges are taken from: https://en.wikipedia.org/wiki/Hangul
pub fn is_hangul(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0xAC00..=0xD7AF
            | 0x1100..=0x11FF
            | 0x3130..=0x318F
            | 0x3200..=0x32FF
            | 0xA960..=0xA97F
            | 0xD7B0..=0xD7FF
            | 0xFF00..=0xFFEF
    )
}
816
/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module
/// treats as Greek.
///
/// Taken from: https://en.wikipedia.org/wiki/Greek_and_Coptic
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
821
/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module
/// treats as Kannada.
///
/// Based on: https://en.wikipedia.org/wiki/Kannada_(Unicode_block)
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
826
/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module
/// treats as Tamil.
///
/// Based on: https://en.wikipedia.org/wiki/Tamil_(Unicode_block)
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
831
/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module
/// treats as Thai.
///
/// Based on: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836
/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module
/// treats as Gujarati.
///
/// Based on: https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
841
/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module
/// treats as Gurmukhi, the script for the Punjabi language.
///
/// Based on: https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
847
/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module
/// treats as Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851
/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module
/// treats as Malayalam.
///
/// Based on: https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
856
/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module
/// treats as Oriya.
///
/// Based on: https://en.wikipedia.org/wiki/Oriya_(Unicode_block)
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
861
/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module
/// treats as Myanmar.
///
/// Based on: https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
866
/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module
/// treats as Sinhala.
///
/// Based on: https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871
/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the
/// ranges this module treats as Khmer.
///
/// Based on: https://en.wikipedia.org/wiki/Khmer_alphabet
pub fn is_khmer(ch: char) -> bool {
    matches!(u32::from(ch), 0x1780..=0x17FF | 0x19E0..=0x19FF)
}