Skip to main content

azul_layout/text3/
script.rs

1//! Unicode script detection and language identification for text shaping
2//!
3// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs
4//
5// See: https://github.com/greyblake/whatlang-rs/pull/67
6
7// License:
8//
9// (The MIT License)
10//
11// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
12// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
13// Copyright (c) 2008 Kent S Johnson
14// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
15// Copyright (c) 2004 Maciej Ceglowski
16//
17// Permission is hereby granted, free of charge, to any person obtaining
18// a copy of this software and associated documentation files (the
19// 'Software'), to deal in the Software without restriction, including
20// without limitation the rights to use, copy, modify, merge, publish,
21// distribute, sublicense, and/or sell copies of the Software, and to
22// permit persons to whom the Software is furnished to do so, subject to
23// the following conditions:
24//
25// The above copyright notice and this permission notice shall be
26// included in all copies or substantial portions of the Software.
27//
28// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
29// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
30// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
31// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
32// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
33// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
34// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35
36#[cfg(feature = "text_layout_hyphenation")]
37use hyphenation::Language as HyphenationLanguage;
38#[cfg(feature = "text_layout_hyphenation")]
39pub use hyphenation::Language;
40
/// Fallback `Language` type used when the `text_layout_hyphenation` feature
/// is disabled and `hyphenation::Language` is therefore not available.
///
/// It only declares the variants that the script / language detection
/// functions in this module refer to.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
// Not every variant is produced by the detection functions.
#[allow(dead_code)]
pub enum Language {
    // ---- Latin script ----
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,

    // ---- Cyrillic script ----
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,

    // ---- Greek script ----
    GreekMono,
    GreekPoly,
    Coptic,

    // ---- Indic scripts ----
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,

    // ---- Other scripts ----
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}
99
100use rust_fontconfig::UnicodeRange;
101
/// Writing systems that the detection functions in this module can report.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Script {
    // NOTE: the variants must stay in alphabetic order (for C bindings).
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
130
131impl Script {
132    /// Maps a Script to a vector of its representative Unicode character ranges.
133    ///
134    /// The ranges are extracted from the `is_*` functions in the provided source code.
135    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
136        match self {
137            Script::Arabic => vec![
138                UnicodeRange {
139                    start: 0x0600,
140                    end: 0x06FF,
141                },
142                UnicodeRange {
143                    start: 0x0750,
144                    end: 0x07FF,
145                },
146                UnicodeRange {
147                    start: 0x08A0,
148                    end: 0x08FF,
149                },
150                UnicodeRange {
151                    start: 0xFB50,
152                    end: 0xFDFF,
153                },
154                UnicodeRange {
155                    start: 0xFE70,
156                    end: 0xFEFF,
157                },
158                UnicodeRange {
159                    start: 0x10E60,
160                    end: 0x10E7F,
161                },
162                UnicodeRange {
163                    start: 0x1EE00,
164                    end: 0x1EEFF,
165                },
166            ],
167            Script::Bengali => vec![UnicodeRange {
168                start: 0x0980,
169                end: 0x09FF,
170            }],
171            Script::Cyrillic => vec![
172                UnicodeRange {
173                    start: 0x0400,
174                    end: 0x0484,
175                },
176                UnicodeRange {
177                    start: 0x0487,
178                    end: 0x052F,
179                },
180                UnicodeRange {
181                    start: 0x2DE0,
182                    end: 0x2DFF,
183                },
184                UnicodeRange {
185                    start: 0xA640,
186                    end: 0xA69D,
187                },
188                UnicodeRange {
189                    start: 0x1D2B,
190                    end: 0x1D2B,
191                },
192                UnicodeRange {
193                    start: 0x1D78,
194                    end: 0x1D78,
195                },
196                UnicodeRange {
197                    start: 0xA69F,
198                    end: 0xA69F,
199                },
200            ],
201            Script::Devanagari => vec![
202                UnicodeRange {
203                    start: 0x0900,
204                    end: 0x097F,
205                },
206                UnicodeRange {
207                    start: 0xA8E0,
208                    end: 0xA8FF,
209                },
210                UnicodeRange {
211                    start: 0x1CD0,
212                    end: 0x1CFF,
213                },
214            ],
215            Script::Ethiopic => vec![
216                UnicodeRange {
217                    start: 0x1200,
218                    end: 0x139F,
219                },
220                UnicodeRange {
221                    start: 0x2D80,
222                    end: 0x2DDF,
223                },
224                UnicodeRange {
225                    start: 0xAB00,
226                    end: 0xAB2F,
227                },
228            ],
229            Script::Georgian => vec![UnicodeRange {
230                start: 0x10A0,
231                end: 0x10FF,
232            }],
233            Script::Greek => vec![UnicodeRange {
234                start: 0x0370,
235                end: 0x03FF,
236            }],
237            Script::Gujarati => vec![UnicodeRange {
238                start: 0x0A80,
239                end: 0x0AFF,
240            }],
241            Script::Gurmukhi => vec![UnicodeRange {
242                start: 0x0A00,
243                end: 0x0A7F,
244            }],
245            Script::Hangul => vec![
246                UnicodeRange {
247                    start: 0xAC00,
248                    end: 0xD7AF,
249                },
250                UnicodeRange {
251                    start: 0x1100,
252                    end: 0x11FF,
253                },
254                UnicodeRange {
255                    start: 0x3130,
256                    end: 0x318F,
257                },
258                UnicodeRange {
259                    start: 0x3200,
260                    end: 0x32FF,
261                },
262                UnicodeRange {
263                    start: 0xA960,
264                    end: 0xA97F,
265                },
266                UnicodeRange {
267                    start: 0xD7B0,
268                    end: 0xD7FF,
269                },
270                UnicodeRange {
271                    start: 0xFF00,
272                    end: 0xFFEF,
273                },
274            ],
275            Script::Hebrew => vec![UnicodeRange {
276                start: 0x0590,
277                end: 0x05FF,
278            }],
279            Script::Hiragana => vec![UnicodeRange {
280                start: 0x3040,
281                end: 0x309F,
282            }],
283            Script::Kannada => vec![UnicodeRange {
284                start: 0x0C80,
285                end: 0x0CFF,
286            }],
287            Script::Katakana => vec![UnicodeRange {
288                start: 0x30A0,
289                end: 0x30FF,
290            }],
291            Script::Khmer => vec![
292                UnicodeRange {
293                    start: 0x1780,
294                    end: 0x17FF,
295                },
296                UnicodeRange {
297                    start: 0x19E0,
298                    end: 0x19FF,
299                },
300            ],
301            Script::Latin => vec![
302                UnicodeRange {
303                    start: 0x0041,
304                    end: 0x005A,
305                }, // A-Z
306                UnicodeRange {
307                    start: 0x0061,
308                    end: 0x007A,
309                }, // a-z
310                UnicodeRange {
311                    start: 0x0080,
312                    end: 0x00FF,
313                },
314                UnicodeRange {
315                    start: 0x0100,
316                    end: 0x017F,
317                },
318                UnicodeRange {
319                    start: 0x0180,
320                    end: 0x024F,
321                },
322                UnicodeRange {
323                    start: 0x0250,
324                    end: 0x02AF,
325                },
326                UnicodeRange {
327                    start: 0x1D00,
328                    end: 0x1D7F,
329                },
330                UnicodeRange {
331                    start: 0x1D80,
332                    end: 0x1DBF,
333                },
334                UnicodeRange {
335                    start: 0x1E00,
336                    end: 0x1EFF,
337                },
338                UnicodeRange {
339                    start: 0x2100,
340                    end: 0x214F,
341                },
342                UnicodeRange {
343                    start: 0x2C60,
344                    end: 0x2C7F,
345                },
346                UnicodeRange {
347                    start: 0xA720,
348                    end: 0xA7FF,
349                },
350                UnicodeRange {
351                    start: 0xAB30,
352                    end: 0xAB6F,
353                },
354            ],
355            Script::Malayalam => vec![UnicodeRange {
356                start: 0x0D00,
357                end: 0x0D7F,
358            }],
359            Script::Mandarin => vec![
360                UnicodeRange {
361                    start: 0x2E80,
362                    end: 0x2E99,
363                },
364                UnicodeRange {
365                    start: 0x2E9B,
366                    end: 0x2EF3,
367                },
368                UnicodeRange {
369                    start: 0x2F00,
370                    end: 0x2FD5,
371                },
372                UnicodeRange {
373                    start: 0x3005,
374                    end: 0x3005,
375                },
376                UnicodeRange {
377                    start: 0x3007,
378                    end: 0x3007,
379                },
380                UnicodeRange {
381                    start: 0x3021,
382                    end: 0x3029,
383                },
384                UnicodeRange {
385                    start: 0x3038,
386                    end: 0x303B,
387                },
388                UnicodeRange {
389                    start: 0x3400,
390                    end: 0x4DB5,
391                },
392                UnicodeRange {
393                    start: 0x4E00,
394                    end: 0x9FCC,
395                },
396                UnicodeRange {
397                    start: 0xF900,
398                    end: 0xFA6D,
399                },
400                UnicodeRange {
401                    start: 0xFA70,
402                    end: 0xFAD9,
403                },
404            ],
405            Script::Myanmar => vec![UnicodeRange {
406                start: 0x1000,
407                end: 0x109F,
408            }],
409            Script::Oriya => vec![UnicodeRange {
410                start: 0x0B00,
411                end: 0x0B7F,
412            }],
413            Script::Sinhala => vec![UnicodeRange {
414                start: 0x0D80,
415                end: 0x0DFF,
416            }],
417            Script::Tamil => vec![UnicodeRange {
418                start: 0x0B80,
419                end: 0x0BFF,
420            }],
421            Script::Telugu => vec![UnicodeRange {
422                start: 0x0C00,
423                end: 0x0C7F,
424            }],
425            Script::Thai => vec![UnicodeRange {
426                start: 0x0E00,
427                end: 0x0E7F,
428            }],
429        }
430    }
431}
432
/// Returns `true` for "stop characters": ASCII control characters, space,
/// punctuation and digits.
///
/// A stop character carries no information for script or language detection
/// and is skipped by `detect_script`. Concretely these are the code points
/// U+0000..=U+0040, U+005B..=U+0060 and U+007B..=U+007E.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    let code = u32::from(ch);
    code <= 0x40 || (0x5B..=0x60).contains(&code) || (0x7B..=0x7E).contains(&code)
}
440
441type ScriptCounter = (Script, fn(char) -> bool, usize);
442
443/// Detect only a script by a given text
444pub fn detect_script(text: &str) -> Option<Script> {
445    let mut script_counters: [ScriptCounter; 24] = [
446        (Script::Latin, is_latin, 0),
447        (Script::Cyrillic, is_cyrillic, 0),
448        (Script::Arabic, is_arabic, 0),
449        (Script::Mandarin, is_mandarin, 0),
450        (Script::Devanagari, is_devanagari, 0),
451        (Script::Hebrew, is_hebrew, 0),
452        (Script::Ethiopic, is_ethiopic, 0),
453        (Script::Georgian, is_georgian, 0),
454        (Script::Bengali, is_bengali, 0),
455        (Script::Hangul, is_hangul, 0),
456        (Script::Hiragana, is_hiragana, 0),
457        (Script::Katakana, is_katakana, 0),
458        (Script::Greek, is_greek, 0),
459        (Script::Kannada, is_kannada, 0),
460        (Script::Tamil, is_tamil, 0),
461        (Script::Thai, is_thai, 0),
462        (Script::Gujarati, is_gujarati, 0),
463        (Script::Gurmukhi, is_gurmukhi, 0),
464        (Script::Telugu, is_telugu, 0),
465        (Script::Malayalam, is_malayalam, 0),
466        (Script::Oriya, is_oriya, 0),
467        (Script::Myanmar, is_myanmar, 0),
468        (Script::Sinhala, is_sinhala, 0),
469        (Script::Khmer, is_khmer, 0),
470    ];
471
472    let half = text.chars().count() / 2;
473
474    for ch in text.chars() {
475        if is_stop_char(ch) {
476            continue;
477        }
478
479        // For performance reasons, we need to mutate script_counters by calling
480        // `swap` function, it would not be possible to do using normal iterator.
481        for i in 0..script_counters.len() {
482            let found = {
483                let (script, check_fn, ref mut count) = script_counters[i];
484                if check_fn(ch) {
485                    *count += 1;
486                    if *count > half {
487                        return Some(script);
488                    }
489                    true
490                } else {
491                    false
492                }
493            };
494            // Have to let borrow of count fall out of scope before doing swapping, or we could
495            // do this above.
496            if found {
497                // If script was found, move it closer to the front.
498                // If the text contains largely 1 or 2 scripts, this will
499                // cause these scripts to be eventually checked first.
500                if i > 0 {
501                    script_counters.swap(i - 1, i);
502                }
503                break;
504            }
505        }
506    }
507
508    let (script, _, count) = script_counters
509        .iter()
510        .cloned()
511        .max_by_key(|&(_, _, count)| count)
512        .unwrap();
513    if count != 0 {
514        Some(script)
515    } else {
516        None
517    }
518}
519
520pub fn detect_char_script(ch: char) -> Option<Script> {
521    let script_counters: [ScriptCounter; 24] = [
522        (Script::Latin, is_latin, 0),
523        (Script::Cyrillic, is_cyrillic, 0),
524        (Script::Arabic, is_arabic, 0),
525        (Script::Mandarin, is_mandarin, 0),
526        (Script::Devanagari, is_devanagari, 0),
527        (Script::Hebrew, is_hebrew, 0),
528        (Script::Ethiopic, is_ethiopic, 0),
529        (Script::Georgian, is_georgian, 0),
530        (Script::Bengali, is_bengali, 0),
531        (Script::Hangul, is_hangul, 0),
532        (Script::Hiragana, is_hiragana, 0),
533        (Script::Katakana, is_katakana, 0),
534        (Script::Greek, is_greek, 0),
535        (Script::Kannada, is_kannada, 0),
536        (Script::Tamil, is_tamil, 0),
537        (Script::Thai, is_thai, 0),
538        (Script::Gujarati, is_gujarati, 0),
539        (Script::Gurmukhi, is_gurmukhi, 0),
540        (Script::Telugu, is_telugu, 0),
541        (Script::Malayalam, is_malayalam, 0),
542        (Script::Oriya, is_oriya, 0),
543        (Script::Myanmar, is_myanmar, 0),
544        (Script::Sinhala, is_sinhala, 0),
545        (Script::Khmer, is_khmer, 0),
546    ];
547
548    for i in 0..script_counters.len() {
549        let (script, check_fn, _) = script_counters[i];
550        if check_fn(ch) {
551            return Some(script);
552        }
553    }
554    None
555}
556
557/// Iterates through the text once and returns as soon as an Assamese-specific character is found.
558fn detect_bengali_language(text: &str) -> Language {
559    for c in text.chars() {
560        // These characters are specific to Assamese in the Bengali script block.
561        // We can return immediately as this is the highest priority check.
562        if matches!(c, '\u{09F0}' | '\u{09F1}') {
563            // ৰ, ৱ
564            return Language::Assamese;
565        }
566    }
567    // If we finish the loop without finding any Assamese characters, it's Bengali.
568    Language::Bengali
569}
570
571fn detect_cyrillic_language(text: &str) -> Language {
572    for c in text.chars() {
573        match c {
574            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.
575            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
576            // Set flags for other languages. We don't return yet because a higher-priority
577            // character (like the one above) could still appear.
578            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
579            'ў' => return Language::Belarusian,
580            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
581            'ө' | 'ү' | 'һ' => return Language::Mongolian,
582            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
583            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.
584            // The logic implies that if either is present, it might be Bulgarian.
585            'щ' => return Language::Bulgarian,
586            _ => {}
587        }
588    }
589
590    Language::Russian
591}
592
593fn detect_devanagari_language(text: &str) -> Language {
594    for c in text.chars() {
595        match c {
596            // Marathi has higher priority in the original logic. Return immediately.
597            '\u{0933}' => return Language::Marathi, // ळ
598            // Flag for Sanskrit Vedic extensions.
599            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
600            _ => (),
601        }
602    }
603
604    Language::Hindi
605}
606
607fn detect_greek_language(text: &str) -> Language {
608    for c in text.chars() {
609        match c {
610            // Coptic has higher priority. Return immediately.
611            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612            // Flag for Greek Extended (Polytonic) characters.
613            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614            _ => {}
615        }
616    }
617
618    Language::GreekMono
619}
620
621fn detect_latin_language(text: &str) -> Language {
622    // Flags for languages checked near the end of the original if-else chain.
623    let mut has_french_c = false;
624    let mut has_portuguese_o = false;
625    let mut has_portuguese_a = false;
626
627    for c in text.chars() {
628        match c {
629            // --- Early Return Cases (in order of priority) ---
630            'ß' => return Language::German1996,
631            'ő' | 'ű' => return Language::Hungarian,
632            'ł' => return Language::Polish,
633            'ř' | 'ů' => return Language::Czech,
634            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636                return Language::Latvian
637            }
638            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639            'ă' | 'ș' | 'ț' => return Language::Romanian,
640            'ğ' | 'ı' | 'ş' => return Language::Turkish,
641            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */
642            // original's intent
643            'þ' | 'ð' => return Language::Icelandic,
644            'ŵ' | 'ŷ' => return Language::Welsh,
645            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish
646            'å' => return Language::Swedish,               // And Norwegian, Finnish
647            'ñ' => return Language::Spanish,
648            'ä' | 'ö' | 'ü' => return Language::German1996,
649
650            // NOTE: 'õ' is used by both Estonian and Portuguese
651            // Since Estonian is checked first, it takes precedence.
652            'õ' => has_portuguese_o = true,
653            'ã' => has_portuguese_a = true,
654
655            // --- Flag-setting Cases ---
656            'ç' => has_french_c = true, // Also in Portuguese
657            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658
659            _ => (),
660        }
661    }
662
663    // decide between portuguese, estonian and french
664
665    if has_french_c && !has_portuguese_o && !has_portuguese_a {
666        return Language::French;
667    }
668
669    if has_portuguese_o && !has_french_c && !has_portuguese_a {
670        return Language::Estonian;
671    }
672
673    if has_portuguese_o || has_portuguese_a || has_french_c {
674        return Language::Portuguese;
675    }
676
677    Language::EnglishUS
678}
679
680pub fn script_to_language(script: Script, text: &str) -> Language {
681    match script {
682        Script::Ethiopic => Language::Ethiopic,
683        Script::Georgian => Language::Georgian,
684        Script::Gujarati => Language::Gujarati,
685        Script::Gurmukhi => Language::Panjabi,
686        Script::Kannada => Language::Kannada,
687        Script::Malayalam => Language::Malayalam,
688        Script::Mandarin => Language::Chinese,
689        Script::Oriya => Language::Oriya,
690        Script::Tamil => Language::Tamil,
691        Script::Telugu => Language::Telugu,
692        Script::Thai => Language::Thai,
693        Script::Bengali => detect_bengali_language(text),
694        Script::Cyrillic => detect_cyrillic_language(text),
695        Script::Devanagari => detect_devanagari_language(text),
696        Script::Greek => detect_greek_language(text),
697        Script::Latin => detect_latin_language(text),
698
699        // not directly matchable
700        Script::Myanmar => Language::Thai,
701        Script::Khmer => Language::Thai,
702        Script::Sinhala => Language::Hindi,
703
704        // no classical hyphenation behaviour
705        Script::Arabic => Language::Chinese,
706        Script::Hebrew => Language::Chinese,
707        Script::Hangul => Language::Chinese,
708        Script::Hiragana => Language::Chinese,
709        Script::Katakana => Language::Chinese,
710    }
711}
712
/// Returns `true` if `ch` is counted as a Cyrillic character.
///
/// Note that U+0485 and U+0486 as well as U+A69E are deliberately left out.
pub fn is_cyrillic(ch: char) -> bool {
    // Inclusive ranges; single code points appear as one-element ranges.
    const RANGES: [(char, char); 7] = [
        ('\u{0400}', '\u{0484}'),
        ('\u{0487}', '\u{052F}'),
        ('\u{2DE0}', '\u{2DFF}'),
        ('\u{A640}', '\u{A69D}'),
        ('\u{1D2B}', '\u{1D2B}'),
        ('\u{1D78}', '\u{1D78}'),
        ('\u{A69F}', '\u{A69F}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
724
/// Returns `true` if `ch` is counted as a Latin character: the ASCII letters
/// plus a number of ranges above U+007F.
///
/// See <https://en.wikipedia.org/wiki/Latin_script_in_Unicode>.
pub fn is_latin(ch: char) -> bool {
    // Inclusive ranges above ASCII.
    const EXTENDED: [(char, char); 11] = [
        ('\u{0080}', '\u{00FF}'),
        ('\u{0100}', '\u{017F}'),
        ('\u{0180}', '\u{024F}'),
        ('\u{0250}', '\u{02AF}'),
        ('\u{1D00}', '\u{1D7F}'),
        ('\u{1D80}', '\u{1DBF}'),
        ('\u{1E00}', '\u{1EFF}'),
        ('\u{2100}', '\u{214F}'),
        ('\u{2C60}', '\u{2C7F}'),
        ('\u{A720}', '\u{A7FF}'),
        ('\u{AB30}', '\u{AB6F}'),
    ];

    if ('a'..='z').contains(&ch) || ('A'..='Z').contains(&ch) {
        return true;
    }

    EXTENDED.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
743
/// Returns `true` if `ch` is counted as an Arabic character.
///
/// Based on <https://en.wikipedia.org/wiki/Arabic_script_in_Unicode>.
pub fn is_arabic(ch: char) -> bool {
    // Inclusive ranges.
    const RANGES: [(char, char); 7] = [
        ('\u{0600}', '\u{06FF}'),
        ('\u{0750}', '\u{07FF}'),
        ('\u{08A0}', '\u{08FF}'),
        ('\u{FB50}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFF}'),
        ('\u{10E60}', '\u{10E7F}'),
        ('\u{1EE00}', '\u{1EEFF}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
756
/// Returns `true` if `ch` is counted as a Devanagari character.
///
/// Based on <https://en.wikipedia.org/wiki/Devanagari#Unicode>.
pub fn is_devanagari(ch: char) -> bool {
    // Inclusive ranges.
    const RANGES: [(char, char); 3] = [
        ('\u{0900}', '\u{097F}'),
        ('\u{A8E0}', '\u{A8FF}'),
        ('\u{1CD0}', '\u{1CFF}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
761
/// Returns `true` if `ch` is counted as an Ethiopic character.
///
/// Based on <https://www.key-shortcut.com/en/writing-systems/ethiopian-script/>.
pub fn is_ethiopic(ch: char) -> bool {
    // Inclusive ranges.
    const RANGES: [(char, char); 3] = [
        ('\u{1200}', '\u{139F}'),
        ('\u{2D80}', '\u{2DDF}'),
        ('\u{AB00}', '\u{AB2F}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
766
/// Returns `true` if `ch` lies in U+0590..=U+05FF.
///
/// Based on <https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)>.
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771
/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range counted as
/// Georgian.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
775
/// Returns `true` if `ch` is counted as a Mandarin (Chinese) character.
pub fn is_mandarin(ch: char) -> bool {
    // Inclusive ranges; single code points appear as one-element ranges.
    const RANGES: [(char, char); 11] = [
        ('\u{2E80}', '\u{2E99}'),
        ('\u{2E9B}', '\u{2EF3}'),
        ('\u{2F00}', '\u{2FD5}'),
        ('\u{3005}', '\u{3005}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3021}', '\u{3029}'),
        ('\u{3038}', '\u{303B}'),
        ('\u{3400}', '\u{4DB5}'),
        ('\u{4E00}', '\u{9FCC}'),
        ('\u{F900}', '\u{FA6D}'),
        ('\u{FA70}', '\u{FAD9}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
791
/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range counted as
/// Bengali.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
795
/// Returns `true` if `ch` lies in U+3040..=U+309F, the range counted as
/// Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
799
/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range counted as
/// Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803
/// Returns `true` if `ch` is counted as a Hangul character.
///
/// Hangul is the Korean alphabet. The ranges are taken from
/// <https://en.wikipedia.org/wiki/Hangul>.
pub fn is_hangul(ch: char) -> bool {
    // Inclusive ranges.
    const RANGES: [(char, char); 7] = [
        ('\u{AC00}', '\u{D7AF}'),
        ('\u{1100}', '\u{11FF}'),
        ('\u{3130}', '\u{318F}'),
        ('\u{3200}', '\u{32FF}'),
        ('\u{A960}', '\u{A97F}'),
        ('\u{D7B0}', '\u{D7FF}'),
        ('\u{FF00}', '\u{FFEF}'),
    ];

    RANGES.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
816
/// Returns `true` if `ch` lies in U+0370..=U+03FF.
///
/// Taken from <https://en.wikipedia.org/wiki/Greek_and_Coptic>.
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
821
/// Returns `true` if `ch` lies in U+0C80..=U+0CFF.
///
/// Based on <https://en.wikipedia.org/wiki/Kannada_(Unicode_block)>.
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
826
/// Returns `true` if `ch` lies in U+0B80..=U+0BFF.
///
/// Based on <https://en.wikipedia.org/wiki/Tamil_(Unicode_block)>.
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
831
/// Returns `true` if `ch` lies in U+0E00..=U+0E7F.
///
/// Based on <https://en.wikipedia.org/wiki/Thai_(Unicode_block)>.
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836
/// Returns `true` if `ch` lies in U+0A80..=U+0AFF.
///
/// Based on <https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)>.
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
841
/// Returns `true` if `ch` lies in U+0A00..=U+0A7F.
///
/// Gurmukhi is the script for the Punjabi language.
/// Based on <https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)>.
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
847
/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range counted as
/// Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851
/// Returns `true` if `ch` lies in U+0D00..=U+0D7F.
///
/// Based on <https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)>.
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
856
/// Returns `true` if `ch` lies in U+0B00..=U+0B7F.
///
/// Based on <https://en.wikipedia.org/wiki/Oriya_(Unicode_block)>.
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
861
/// Returns `true` if `ch` lies in U+1000..=U+109F.
///
/// Based on <https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)>.
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
866
/// Returns `true` if `ch` lies in U+0D80..=U+0DFF.
///
/// Based on <https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)>.
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871
/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF.
///
/// Based on <https://en.wikipedia.org/wiki/Khmer_alphabet>.
pub fn is_khmer(ch: char) -> bool {
    ('\u{1780}'..='\u{17FF}').contains(&ch) || ('\u{19E0}'..='\u{19FF}').contains(&ch)
}