1#[cfg(feature = "text_layout_hyphenation")]
35use hyphenation::Language as HyphenationLanguage;
36#[cfg(feature = "text_layout_hyphenation")]
37pub use hyphenation::Language;
38
#[cfg(not(feature = "text_layout_hyphenation"))]
/// Stand-in for `hyphenation::Language`.
///
/// Compiled only when the `text_layout_hyphenation` feature is disabled; with
/// the feature enabled, `hyphenation::Language` is re-exported under the same
/// name instead. The variants are the possible results of
/// `script_to_language`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(dead_code)]
pub enum Language {
    // Produced by `detect_latin_language`.
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,
    // Produced by `detect_cyrillic_language`.
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,
    // Produced by `detect_greek_language`.
    GreekMono,
    GreekPoly,
    Coptic,
    // Produced by `detect_devanagari_language` / `detect_bengali_language`,
    // or mapped directly from a single script in `script_to_language`.
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,
    // Mapped directly from a single script in `script_to_language`.
    Georgian,
    Ethiopic,
    // Also used by `script_to_language` as the result for several scripts
    // that have no variant of their own here.
    Thai,
    Chinese,
}
97
98use rust_fontconfig::UnicodeRange;
99
/// Writing systems distinguished by `detect_script` and `detect_char_script`.
///
/// Every variant has a matching `is_*` predicate in this file and a list of
/// code-point ranges available through `Script::get_unicode_ranges`.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    // Matched by `is_mandarin`; `script_to_language` maps it to `Language::Chinese`.
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
128
129impl Script {
130 pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
134 match self {
135 Script::Arabic => vec![
136 UnicodeRange {
137 start: 0x0600,
138 end: 0x06FF,
139 },
140 UnicodeRange {
141 start: 0x0750,
142 end: 0x07FF,
143 },
144 UnicodeRange {
145 start: 0x08A0,
146 end: 0x08FF,
147 },
148 UnicodeRange {
149 start: 0xFB50,
150 end: 0xFDFF,
151 },
152 UnicodeRange {
153 start: 0xFE70,
154 end: 0xFEFF,
155 },
156 UnicodeRange {
157 start: 0x10E60,
158 end: 0x10E7F,
159 },
160 UnicodeRange {
161 start: 0x1EE00,
162 end: 0x1EEFF,
163 },
164 ],
165 Script::Bengali => vec![UnicodeRange {
166 start: 0x0980,
167 end: 0x09FF,
168 }],
169 Script::Cyrillic => vec![
170 UnicodeRange {
171 start: 0x0400,
172 end: 0x0484,
173 },
174 UnicodeRange {
175 start: 0x0487,
176 end: 0x052F,
177 },
178 UnicodeRange {
179 start: 0x2DE0,
180 end: 0x2DFF,
181 },
182 UnicodeRange {
183 start: 0xA640,
184 end: 0xA69D,
185 },
186 UnicodeRange {
187 start: 0x1D2B,
188 end: 0x1D2B,
189 },
190 UnicodeRange {
191 start: 0x1D78,
192 end: 0x1D78,
193 },
194 UnicodeRange {
195 start: 0xA69F,
196 end: 0xA69F,
197 },
198 ],
199 Script::Devanagari => vec![
200 UnicodeRange {
201 start: 0x0900,
202 end: 0x097F,
203 },
204 UnicodeRange {
205 start: 0xA8E0,
206 end: 0xA8FF,
207 },
208 UnicodeRange {
209 start: 0x1CD0,
210 end: 0x1CFF,
211 },
212 ],
213 Script::Ethiopic => vec![
214 UnicodeRange {
215 start: 0x1200,
216 end: 0x139F,
217 },
218 UnicodeRange {
219 start: 0x2D80,
220 end: 0x2DDF,
221 },
222 UnicodeRange {
223 start: 0xAB00,
224 end: 0xAB2F,
225 },
226 ],
227 Script::Georgian => vec![UnicodeRange {
228 start: 0x10A0,
229 end: 0x10FF,
230 }],
231 Script::Greek => vec![UnicodeRange {
232 start: 0x0370,
233 end: 0x03FF,
234 }],
235 Script::Gujarati => vec![UnicodeRange {
236 start: 0x0A80,
237 end: 0x0AFF,
238 }],
239 Script::Gurmukhi => vec![UnicodeRange {
240 start: 0x0A00,
241 end: 0x0A7F,
242 }],
243 Script::Hangul => vec![
244 UnicodeRange {
245 start: 0xAC00,
246 end: 0xD7AF,
247 },
248 UnicodeRange {
249 start: 0x1100,
250 end: 0x11FF,
251 },
252 UnicodeRange {
253 start: 0x3130,
254 end: 0x318F,
255 },
256 UnicodeRange {
257 start: 0x3200,
258 end: 0x32FF,
259 },
260 UnicodeRange {
261 start: 0xA960,
262 end: 0xA97F,
263 },
264 UnicodeRange {
265 start: 0xD7B0,
266 end: 0xD7FF,
267 },
268 UnicodeRange {
269 start: 0xFF00,
270 end: 0xFFEF,
271 },
272 ],
273 Script::Hebrew => vec![UnicodeRange {
274 start: 0x0590,
275 end: 0x05FF,
276 }],
277 Script::Hiragana => vec![UnicodeRange {
278 start: 0x3040,
279 end: 0x309F,
280 }],
281 Script::Kannada => vec![UnicodeRange {
282 start: 0x0C80,
283 end: 0x0CFF,
284 }],
285 Script::Katakana => vec![UnicodeRange {
286 start: 0x30A0,
287 end: 0x30FF,
288 }],
289 Script::Khmer => vec![
290 UnicodeRange {
291 start: 0x1780,
292 end: 0x17FF,
293 },
294 UnicodeRange {
295 start: 0x19E0,
296 end: 0x19FF,
297 },
298 ],
299 Script::Latin => vec![
300 UnicodeRange {
301 start: 0x0041,
302 end: 0x005A,
303 }, UnicodeRange {
305 start: 0x0061,
306 end: 0x007A,
307 }, UnicodeRange {
309 start: 0x0080,
310 end: 0x00FF,
311 },
312 UnicodeRange {
313 start: 0x0100,
314 end: 0x017F,
315 },
316 UnicodeRange {
317 start: 0x0180,
318 end: 0x024F,
319 },
320 UnicodeRange {
321 start: 0x0250,
322 end: 0x02AF,
323 },
324 UnicodeRange {
325 start: 0x1D00,
326 end: 0x1D7F,
327 },
328 UnicodeRange {
329 start: 0x1D80,
330 end: 0x1DBF,
331 },
332 UnicodeRange {
333 start: 0x1E00,
334 end: 0x1EFF,
335 },
336 UnicodeRange {
337 start: 0x2100,
338 end: 0x214F,
339 },
340 UnicodeRange {
341 start: 0x2C60,
342 end: 0x2C7F,
343 },
344 UnicodeRange {
345 start: 0xA720,
346 end: 0xA7FF,
347 },
348 UnicodeRange {
349 start: 0xAB30,
350 end: 0xAB6F,
351 },
352 ],
353 Script::Malayalam => vec![UnicodeRange {
354 start: 0x0D00,
355 end: 0x0D7F,
356 }],
357 Script::Mandarin => vec![
358 UnicodeRange {
359 start: 0x2E80,
360 end: 0x2E99,
361 },
362 UnicodeRange {
363 start: 0x2E9B,
364 end: 0x2EF3,
365 },
366 UnicodeRange {
367 start: 0x2F00,
368 end: 0x2FD5,
369 },
370 UnicodeRange {
371 start: 0x3005,
372 end: 0x3005,
373 },
374 UnicodeRange {
375 start: 0x3007,
376 end: 0x3007,
377 },
378 UnicodeRange {
379 start: 0x3021,
380 end: 0x3029,
381 },
382 UnicodeRange {
383 start: 0x3038,
384 end: 0x303B,
385 },
386 UnicodeRange {
387 start: 0x3400,
388 end: 0x4DB5,
389 },
390 UnicodeRange {
391 start: 0x4E00,
392 end: 0x9FCC,
393 },
394 UnicodeRange {
395 start: 0xF900,
396 end: 0xFA6D,
397 },
398 UnicodeRange {
399 start: 0xFA70,
400 end: 0xFAD9,
401 },
402 ],
403 Script::Myanmar => vec![UnicodeRange {
404 start: 0x1000,
405 end: 0x109F,
406 }],
407 Script::Oriya => vec![UnicodeRange {
408 start: 0x0B00,
409 end: 0x0B7F,
410 }],
411 Script::Sinhala => vec![UnicodeRange {
412 start: 0x0D80,
413 end: 0x0DFF,
414 }],
415 Script::Tamil => vec![UnicodeRange {
416 start: 0x0B80,
417 end: 0x0BFF,
418 }],
419 Script::Telugu => vec![UnicodeRange {
420 start: 0x0C00,
421 end: 0x0C7F,
422 }],
423 Script::Thai => vec![UnicodeRange {
424 start: 0x0E00,
425 end: 0x0E7F,
426 }],
427 }
428 }
429}
430
/// Returns `true` for characters that `detect_script` skips when counting.
///
/// That is every code point in U+0000..=U+007E that is not an ASCII letter:
/// controls, space, digits and ASCII punctuation. U+007F is not included.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    ch.is_ascii() && !ch.is_ascii_alphabetic() && ch != '\u{007F}'
}
438
/// One row of the detection table: a script, the predicate that recognises
/// its characters, and the number of characters matched so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);
440
441pub fn detect_script(text: &str) -> Option<Script> {
443 let mut script_counters: [ScriptCounter; 24] = [
444 (Script::Latin, is_latin, 0),
445 (Script::Cyrillic, is_cyrillic, 0),
446 (Script::Arabic, is_arabic, 0),
447 (Script::Mandarin, is_mandarin, 0),
448 (Script::Devanagari, is_devanagari, 0),
449 (Script::Hebrew, is_hebrew, 0),
450 (Script::Ethiopic, is_ethiopic, 0),
451 (Script::Georgian, is_georgian, 0),
452 (Script::Bengali, is_bengali, 0),
453 (Script::Hangul, is_hangul, 0),
454 (Script::Hiragana, is_hiragana, 0),
455 (Script::Katakana, is_katakana, 0),
456 (Script::Greek, is_greek, 0),
457 (Script::Kannada, is_kannada, 0),
458 (Script::Tamil, is_tamil, 0),
459 (Script::Thai, is_thai, 0),
460 (Script::Gujarati, is_gujarati, 0),
461 (Script::Gurmukhi, is_gurmukhi, 0),
462 (Script::Telugu, is_telugu, 0),
463 (Script::Malayalam, is_malayalam, 0),
464 (Script::Oriya, is_oriya, 0),
465 (Script::Myanmar, is_myanmar, 0),
466 (Script::Sinhala, is_sinhala, 0),
467 (Script::Khmer, is_khmer, 0),
468 ];
469
470 let half = text.chars().count() / 2;
471
472 for ch in text.chars() {
473 if is_stop_char(ch) {
474 continue;
475 }
476
477 for i in 0..script_counters.len() {
480 let found = {
481 let (script, check_fn, ref mut count) = script_counters[i];
482 if check_fn(ch) {
483 *count += 1;
484 if *count > half {
485 return Some(script);
486 }
487 true
488 } else {
489 false
490 }
491 };
492 if found {
495 if i > 0 {
499 script_counters.swap(i - 1, i);
500 }
501 break;
502 }
503 }
504 }
505
506 let (script, _, count) = script_counters
507 .iter()
508 .cloned()
509 .max_by_key(|&(_, _, count)| count)
510 .unwrap();
511 if count != 0 {
512 Some(script)
513 } else {
514 None
515 }
516}
517
518pub fn detect_char_script(ch: char) -> Option<Script> {
519 let script_counters: [ScriptCounter; 24] = [
520 (Script::Latin, is_latin, 0),
521 (Script::Cyrillic, is_cyrillic, 0),
522 (Script::Arabic, is_arabic, 0),
523 (Script::Mandarin, is_mandarin, 0),
524 (Script::Devanagari, is_devanagari, 0),
525 (Script::Hebrew, is_hebrew, 0),
526 (Script::Ethiopic, is_ethiopic, 0),
527 (Script::Georgian, is_georgian, 0),
528 (Script::Bengali, is_bengali, 0),
529 (Script::Hangul, is_hangul, 0),
530 (Script::Hiragana, is_hiragana, 0),
531 (Script::Katakana, is_katakana, 0),
532 (Script::Greek, is_greek, 0),
533 (Script::Kannada, is_kannada, 0),
534 (Script::Tamil, is_tamil, 0),
535 (Script::Thai, is_thai, 0),
536 (Script::Gujarati, is_gujarati, 0),
537 (Script::Gurmukhi, is_gurmukhi, 0),
538 (Script::Telugu, is_telugu, 0),
539 (Script::Malayalam, is_malayalam, 0),
540 (Script::Oriya, is_oriya, 0),
541 (Script::Myanmar, is_myanmar, 0),
542 (Script::Sinhala, is_sinhala, 0),
543 (Script::Khmer, is_khmer, 0),
544 ];
545
546 for i in 0..script_counters.len() {
547 let (script, check_fn, _) = script_counters[i];
548 if check_fn(ch) {
549 return Some(script);
550 }
551 }
552 None
553}
554
555fn detect_bengali_language(text: &str) -> Language {
557 for c in text.chars() {
558 if matches!(c, '\u{09F0}' | '\u{09F1}') {
561 return Language::Assamese;
563 }
564 }
565 Language::Bengali
567}
568
569fn detect_cyrillic_language(text: &str) -> Language {
570 for c in text.chars() {
571 match c {
572 '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
574 'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
577 'ў' => return Language::Belarusian,
578 'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
579 'ө' | 'ү' | 'һ' => return Language::Mongolian,
580 'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
581 'щ' => return Language::Bulgarian,
584 _ => {}
585 }
586 }
587
588 Language::Russian
589}
590
591fn detect_devanagari_language(text: &str) -> Language {
592 for c in text.chars() {
593 match c {
594 '\u{0933}' => return Language::Marathi, '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
598 _ => (),
599 }
600 }
601
602 Language::Hindi
603}
604
605fn detect_greek_language(text: &str) -> Language {
606 let mut has_polytonic = false;
607
608 for c in text.chars() {
609 match c {
610 '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612 '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614 _ => {}
615 }
616 }
617
618 Language::GreekMono
619}
620
621fn detect_latin_language(text: &str) -> Language {
622 let mut has_french_c = false;
624 let mut has_portugese_o = false;
625 let mut has_portuguese_a = false;
626
627 for c in text.chars() {
628 match c {
629 'ß' => return Language::German1996,
631 'ő' | 'ű' => return Language::Hungarian,
632 'ł' => return Language::Polish,
633 'ř' | 'ů' => return Language::Czech,
634 'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635 'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636 return Language::Latvian
637 }
638 'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639 'ă' | 'ș' | 'ț' => return Language::Romanian,
640 'ğ' | 'ı' | 'ş' => return Language::Turkish,
641 'đ' => return Language::Croatian, 'þ' | 'ð' => return Language::Icelandic,
644 'ŵ' | 'ŷ' => return Language::Welsh,
645 'æ' | 'ø' => return Language::NorwegianBokmal, 'å' => return Language::Swedish, 'ñ' => return Language::Spanish,
648 'ä' | 'ö' | 'ü' => return Language::German1996,
649
650 'õ' => has_portugese_o = true,
653 'ã' => has_portuguese_a = true,
654
655 'ç' => has_french_c = true, 'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658
659 _ => (),
660 }
661 }
662
663 if has_french_c && !has_portugese_o && !has_portuguese_a {
666 return Language::French;
667 }
668
669 if has_portugese_o && !has_french_c && !has_portuguese_a {
670 return Language::Estonian;
671 }
672
673 if has_portugese_o || has_portuguese_a || has_french_c {
674 return Language::Portuguese;
675 }
676
677 Language::EnglishUS
678}
679
680pub fn script_to_language(script: Script, text: &str) -> Language {
681 match script {
682 Script::Ethiopic => Language::Ethiopic,
683 Script::Georgian => Language::Georgian,
684 Script::Gujarati => Language::Gujarati,
685 Script::Gurmukhi => Language::Panjabi,
686 Script::Kannada => Language::Kannada,
687 Script::Malayalam => Language::Malayalam,
688 Script::Mandarin => Language::Chinese,
689 Script::Oriya => Language::Oriya,
690 Script::Tamil => Language::Tamil,
691 Script::Telugu => Language::Telugu,
692 Script::Thai => Language::Thai,
693 Script::Bengali => detect_bengali_language(text),
694 Script::Cyrillic => detect_cyrillic_language(text),
695 Script::Devanagari => detect_devanagari_language(text),
696 Script::Greek => detect_greek_language(text),
697 Script::Latin => detect_latin_language(text),
698
699 Script::Myanmar => Language::Thai,
701 Script::Khmer => Language::Thai,
702 Script::Sinhala => Language::Hindi,
703
704 Script::Arabic => Language::Chinese,
706 Script::Hebrew => Language::Chinese,
707 Script::Hangul => Language::Chinese,
708 Script::Hiragana => Language::Chinese,
709 Script::Katakana => Language::Chinese,
710 }
711}
712
/// Returns `true` if `ch` falls in one of the code-point ranges (or is one
/// of the three single code points) this module treats as Cyrillic.
pub fn is_cyrillic(ch: char) -> bool {
    match ch {
        // Isolated code points.
        '\u{1D2B}' | '\u{1D78}' | '\u{A69F}' => true,
        // U+0485 and U+0486 are deliberately left out between these two.
        '\u{0400}'..='\u{0484}' | '\u{0487}'..='\u{052F}' => true,
        '\u{2DE0}'..='\u{2DFF}' | '\u{A640}'..='\u{A69D}' => true,
        _ => false,
    }
}
724
/// Returns `true` if `ch` is an ASCII letter or lies in one of the extended
/// ranges this module treats as Latin.
pub fn is_latin(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const LATIN_RANGES: [(char, char); 13] = [
        ('a', 'z'),
        ('A', 'Z'),
        ('\u{0080}', '\u{00FF}'),
        ('\u{0100}', '\u{017F}'),
        ('\u{0180}', '\u{024F}'),
        ('\u{0250}', '\u{02AF}'),
        ('\u{1D00}', '\u{1D7F}'),
        ('\u{1D80}', '\u{1DBF}'),
        ('\u{1E00}', '\u{1EFF}'),
        ('\u{2100}', '\u{214F}'),
        ('\u{2C60}', '\u{2C7F}'),
        ('\u{A720}', '\u{A7FF}'),
        ('\u{AB30}', '\u{AB6F}'),
    ];

    LATIN_RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
743
/// Returns `true` if `ch` lies in one of the code-point ranges this module
/// treats as Arabic.
pub fn is_arabic(ch: char) -> bool {
    let cp = u32::from(ch);

    (0x0600..=0x06FF).contains(&cp)
        || (0x0750..=0x07FF).contains(&cp)
        || (0x08A0..=0x08FF).contains(&cp)
        || (0xFB50..=0xFDFF).contains(&cp)
        || (0xFE70..=0xFEFF).contains(&cp)
        || (0x10E60..=0x10E7F).contains(&cp)
        || (0x1EE00..=0x1EEFF).contains(&cp)
}
756
/// Returns `true` if `ch` lies in U+0900..=U+097F, U+A8E0..=U+A8FF or
/// U+1CD0..=U+1CFF, the ranges this module treats as Devanagari.
pub fn is_devanagari(ch: char) -> bool {
    match ch {
        '\u{0900}'..='\u{097F}' => true,
        '\u{A8E0}'..='\u{A8FF}' => true,
        '\u{1CD0}'..='\u{1CFF}' => true,
        _ => false,
    }
}
761
/// Returns `true` if `ch` lies in U+1200..=U+139F, U+2D80..=U+2DDF or
/// U+AB00..=U+AB2F, the ranges this module treats as Ethiopic.
pub fn is_ethiopic(ch: char) -> bool {
    ('\u{1200}'..='\u{139F}').contains(&ch)
        || ('\u{2D80}'..='\u{2DDF}').contains(&ch)
        || ('\u{AB00}'..='\u{AB2F}').contains(&ch)
}
766
/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module
/// treats as Hebrew.
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771
/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module
/// treats as Georgian.
pub fn is_georgian(ch: char) -> bool {
    let cp = u32::from(ch);
    cp >= 0x10A0 && cp <= 0x10FF
}
775
/// Returns `true` if `ch` lies in one of the code-point ranges (or is one of
/// the two single code points) this module treats as `Script::Mandarin`.
pub fn is_mandarin(ch: char) -> bool {
    // Inclusive (first, last) bounds; single code points repeat the bound.
    const MANDARIN_RANGES: [(u32, u32); 11] = [
        (0x2E80, 0x2E99),
        (0x2E9B, 0x2EF3),
        (0x2F00, 0x2FD5),
        (0x3005, 0x3005),
        (0x3007, 0x3007),
        (0x3021, 0x3029),
        (0x3038, 0x303B),
        (0x3400, 0x4DB5),
        (0x4E00, 0x9FCC),
        (0xF900, 0xFA6D),
        (0xFA70, 0xFAD9),
    ];

    let cp = u32::from(ch);
    MANDARIN_RANGES
        .iter()
        .any(|&(first, last)| first <= cp && cp <= last)
}
791
/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module
/// treats as Bengali.
pub fn is_bengali(ch: char) -> bool {
    match ch {
        '\u{0980}'..='\u{09FF}' => true,
        _ => false,
    }
}
795
/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module
/// treats as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ch >= '\u{3040}' && ch <= '\u{309F}'
}
799
/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module
/// treats as Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803
/// Returns `true` if `ch` lies in one of the code-point ranges this module
/// treats as Hangul.
pub fn is_hangul(ch: char) -> bool {
    // Inclusive (first, last) bounds, in the order they are tested.
    const HANGUL_RANGES: [(u32, u32); 7] = [
        (0xAC00, 0xD7AF),
        (0x1100, 0x11FF),
        (0x3130, 0x318F),
        (0x3200, 0x32FF),
        (0xA960, 0xA97F),
        (0xD7B0, 0xD7FF),
        (0xFF00, 0xFFEF),
    ];

    let cp = u32::from(ch);
    HANGUL_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&cp))
}
816
/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module
/// treats as Greek.
pub fn is_greek(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0370..=0x03FF).contains(&cp)
}
821
/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module
/// treats as Kannada.
pub fn is_kannada(ch: char) -> bool {
    match ch {
        '\u{0C80}'..='\u{0CFF}' => true,
        _ => false,
    }
}
826
/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module
/// treats as Tamil.
pub fn is_tamil(ch: char) -> bool {
    ch >= '\u{0B80}' && ch <= '\u{0BFF}'
}
831
/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module
/// treats as Thai.
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836
/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module
/// treats as Gujarati.
pub fn is_gujarati(ch: char) -> bool {
    let cp = u32::from(ch);
    cp >= 0x0A80 && cp <= 0x0AFF
}
841
/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module
/// treats as Gurmukhi.
pub fn is_gurmukhi(ch: char) -> bool {
    match ch {
        '\u{0A00}'..='\u{0A7F}' => true,
        _ => false,
    }
}
847
/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module
/// treats as Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851
/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module
/// treats as Malayalam.
pub fn is_malayalam(ch: char) -> bool {
    ch >= '\u{0D00}' && ch <= '\u{0D7F}'
}
856
/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module
/// treats as Oriya.
pub fn is_oriya(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0B00..=0x0B7F).contains(&cp)
}
861
/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module
/// treats as Myanmar.
pub fn is_myanmar(ch: char) -> bool {
    match ch {
        '\u{1000}'..='\u{109F}' => true,
        _ => false,
    }
}
866
/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module
/// treats as Sinhala.
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871
/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the
/// ranges this module treats as Khmer.
pub fn is_khmer(ch: char) -> bool {
    let in_main_range = ('\u{1780}'..='\u{17FF}').contains(&ch);
    let in_second_range = ('\u{19E0}'..='\u{19FF}').contains(&ch);
    in_main_range || in_second_range
}