1#[cfg(feature = "text_layout_hyphenation")]
37use hyphenation::Language as HyphenationLanguage;
38#[cfg(feature = "text_layout_hyphenation")]
39pub use hyphenation::Language;
40
/// Language identifiers returned by the language heuristics in this file.
///
/// This definition is compiled only when the `text_layout_hyphenation`
/// feature is disabled; when the feature is enabled, `hyphenation::Language`
/// is re-exported under the same name instead.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[allow(dead_code)]
pub enum Language {
    // Returned for Latin-script text.
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,

    // Returned for Cyrillic-script text.
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,

    // Returned for Greek-script text.
    GreekMono,
    GreekPoly,
    Coptic,

    // Returned for Indic scripts.
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,

    // Returned for the remaining scripts.
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}
99
100use rust_fontconfig::UnicodeRange;
101
/// Writing systems distinguished by the script detection in this file.
///
/// Every variant has a matching `is_*` predicate and an arm in
/// `Script::get_unicode_ranges`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    // Matched by `is_mandarin`.
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
130
131impl Script {
132 pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
136 match self {
137 Script::Arabic => vec![
138 UnicodeRange {
139 start: 0x0600,
140 end: 0x06FF,
141 },
142 UnicodeRange {
143 start: 0x0750,
144 end: 0x07FF,
145 },
146 UnicodeRange {
147 start: 0x08A0,
148 end: 0x08FF,
149 },
150 UnicodeRange {
151 start: 0xFB50,
152 end: 0xFDFF,
153 },
154 UnicodeRange {
155 start: 0xFE70,
156 end: 0xFEFF,
157 },
158 UnicodeRange {
159 start: 0x10E60,
160 end: 0x10E7F,
161 },
162 UnicodeRange {
163 start: 0x1EE00,
164 end: 0x1EEFF,
165 },
166 ],
167 Script::Bengali => vec![UnicodeRange {
168 start: 0x0980,
169 end: 0x09FF,
170 }],
171 Script::Cyrillic => vec![
172 UnicodeRange {
173 start: 0x0400,
174 end: 0x0484,
175 },
176 UnicodeRange {
177 start: 0x0487,
178 end: 0x052F,
179 },
180 UnicodeRange {
181 start: 0x2DE0,
182 end: 0x2DFF,
183 },
184 UnicodeRange {
185 start: 0xA640,
186 end: 0xA69D,
187 },
188 UnicodeRange {
189 start: 0x1D2B,
190 end: 0x1D2B,
191 },
192 UnicodeRange {
193 start: 0x1D78,
194 end: 0x1D78,
195 },
196 UnicodeRange {
197 start: 0xA69F,
198 end: 0xA69F,
199 },
200 ],
201 Script::Devanagari => vec![
202 UnicodeRange {
203 start: 0x0900,
204 end: 0x097F,
205 },
206 UnicodeRange {
207 start: 0xA8E0,
208 end: 0xA8FF,
209 },
210 UnicodeRange {
211 start: 0x1CD0,
212 end: 0x1CFF,
213 },
214 ],
215 Script::Ethiopic => vec![
216 UnicodeRange {
217 start: 0x1200,
218 end: 0x139F,
219 },
220 UnicodeRange {
221 start: 0x2D80,
222 end: 0x2DDF,
223 },
224 UnicodeRange {
225 start: 0xAB00,
226 end: 0xAB2F,
227 },
228 ],
229 Script::Georgian => vec![UnicodeRange {
230 start: 0x10A0,
231 end: 0x10FF,
232 }],
233 Script::Greek => vec![UnicodeRange {
234 start: 0x0370,
235 end: 0x03FF,
236 }],
237 Script::Gujarati => vec![UnicodeRange {
238 start: 0x0A80,
239 end: 0x0AFF,
240 }],
241 Script::Gurmukhi => vec![UnicodeRange {
242 start: 0x0A00,
243 end: 0x0A7F,
244 }],
245 Script::Hangul => vec![
246 UnicodeRange {
247 start: 0xAC00,
248 end: 0xD7AF,
249 },
250 UnicodeRange {
251 start: 0x1100,
252 end: 0x11FF,
253 },
254 UnicodeRange {
255 start: 0x3130,
256 end: 0x318F,
257 },
258 UnicodeRange {
259 start: 0x3200,
260 end: 0x32FF,
261 },
262 UnicodeRange {
263 start: 0xA960,
264 end: 0xA97F,
265 },
266 UnicodeRange {
267 start: 0xD7B0,
268 end: 0xD7FF,
269 },
270 UnicodeRange {
271 start: 0xFF00,
272 end: 0xFFEF,
273 },
274 ],
275 Script::Hebrew => vec![UnicodeRange {
276 start: 0x0590,
277 end: 0x05FF,
278 }],
279 Script::Hiragana => vec![UnicodeRange {
280 start: 0x3040,
281 end: 0x309F,
282 }],
283 Script::Kannada => vec![UnicodeRange {
284 start: 0x0C80,
285 end: 0x0CFF,
286 }],
287 Script::Katakana => vec![UnicodeRange {
288 start: 0x30A0,
289 end: 0x30FF,
290 }],
291 Script::Khmer => vec![
292 UnicodeRange {
293 start: 0x1780,
294 end: 0x17FF,
295 },
296 UnicodeRange {
297 start: 0x19E0,
298 end: 0x19FF,
299 },
300 ],
301 Script::Latin => vec![
302 UnicodeRange {
303 start: 0x0041,
304 end: 0x005A,
305 }, UnicodeRange {
307 start: 0x0061,
308 end: 0x007A,
309 }, UnicodeRange {
311 start: 0x0080,
312 end: 0x00FF,
313 },
314 UnicodeRange {
315 start: 0x0100,
316 end: 0x017F,
317 },
318 UnicodeRange {
319 start: 0x0180,
320 end: 0x024F,
321 },
322 UnicodeRange {
323 start: 0x0250,
324 end: 0x02AF,
325 },
326 UnicodeRange {
327 start: 0x1D00,
328 end: 0x1D7F,
329 },
330 UnicodeRange {
331 start: 0x1D80,
332 end: 0x1DBF,
333 },
334 UnicodeRange {
335 start: 0x1E00,
336 end: 0x1EFF,
337 },
338 UnicodeRange {
339 start: 0x2100,
340 end: 0x214F,
341 },
342 UnicodeRange {
343 start: 0x2C60,
344 end: 0x2C7F,
345 },
346 UnicodeRange {
347 start: 0xA720,
348 end: 0xA7FF,
349 },
350 UnicodeRange {
351 start: 0xAB30,
352 end: 0xAB6F,
353 },
354 ],
355 Script::Malayalam => vec![UnicodeRange {
356 start: 0x0D00,
357 end: 0x0D7F,
358 }],
359 Script::Mandarin => vec![
360 UnicodeRange {
361 start: 0x2E80,
362 end: 0x2E99,
363 },
364 UnicodeRange {
365 start: 0x2E9B,
366 end: 0x2EF3,
367 },
368 UnicodeRange {
369 start: 0x2F00,
370 end: 0x2FD5,
371 },
372 UnicodeRange {
373 start: 0x3005,
374 end: 0x3005,
375 },
376 UnicodeRange {
377 start: 0x3007,
378 end: 0x3007,
379 },
380 UnicodeRange {
381 start: 0x3021,
382 end: 0x3029,
383 },
384 UnicodeRange {
385 start: 0x3038,
386 end: 0x303B,
387 },
388 UnicodeRange {
389 start: 0x3400,
390 end: 0x4DB5,
391 },
392 UnicodeRange {
393 start: 0x4E00,
394 end: 0x9FCC,
395 },
396 UnicodeRange {
397 start: 0xF900,
398 end: 0xFA6D,
399 },
400 UnicodeRange {
401 start: 0xFA70,
402 end: 0xFAD9,
403 },
404 ],
405 Script::Myanmar => vec![UnicodeRange {
406 start: 0x1000,
407 end: 0x109F,
408 }],
409 Script::Oriya => vec![UnicodeRange {
410 start: 0x0B00,
411 end: 0x0B7F,
412 }],
413 Script::Sinhala => vec![UnicodeRange {
414 start: 0x0D80,
415 end: 0x0DFF,
416 }],
417 Script::Tamil => vec![UnicodeRange {
418 start: 0x0B80,
419 end: 0x0BFF,
420 }],
421 Script::Telugu => vec![UnicodeRange {
422 start: 0x0C00,
423 end: 0x0C7F,
424 }],
425 Script::Thai => vec![UnicodeRange {
426 start: 0x0E00,
427 end: 0x0E7F,
428 }],
429 }
430 }
431}
432
/// Returns `true` for characters that script detection skips.
///
/// These are the code points U+0000..=U+0040 (everything up to and
/// including `@`), U+005B..=U+0060 (`[` through the backtick) and
/// U+007B..=U+007E (`{` through `~`). ASCII letters and U+007F are not
/// stop characters.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    let code = u32::from(ch);
    code <= 0x0040 || (0x005B..=0x0060).contains(&code) || (0x007B..=0x007E).contains(&code)
}
440
441type ScriptCounter = (Script, fn(char) -> bool, usize);
442
443pub fn detect_script(text: &str) -> Option<Script> {
445 let mut script_counters: [ScriptCounter; 24] = [
446 (Script::Latin, is_latin, 0),
447 (Script::Cyrillic, is_cyrillic, 0),
448 (Script::Arabic, is_arabic, 0),
449 (Script::Mandarin, is_mandarin, 0),
450 (Script::Devanagari, is_devanagari, 0),
451 (Script::Hebrew, is_hebrew, 0),
452 (Script::Ethiopic, is_ethiopic, 0),
453 (Script::Georgian, is_georgian, 0),
454 (Script::Bengali, is_bengali, 0),
455 (Script::Hangul, is_hangul, 0),
456 (Script::Hiragana, is_hiragana, 0),
457 (Script::Katakana, is_katakana, 0),
458 (Script::Greek, is_greek, 0),
459 (Script::Kannada, is_kannada, 0),
460 (Script::Tamil, is_tamil, 0),
461 (Script::Thai, is_thai, 0),
462 (Script::Gujarati, is_gujarati, 0),
463 (Script::Gurmukhi, is_gurmukhi, 0),
464 (Script::Telugu, is_telugu, 0),
465 (Script::Malayalam, is_malayalam, 0),
466 (Script::Oriya, is_oriya, 0),
467 (Script::Myanmar, is_myanmar, 0),
468 (Script::Sinhala, is_sinhala, 0),
469 (Script::Khmer, is_khmer, 0),
470 ];
471
472 let half = text.chars().count() / 2;
473
474 for ch in text.chars() {
475 if is_stop_char(ch) {
476 continue;
477 }
478
479 for i in 0..script_counters.len() {
482 let found = {
483 let (script, check_fn, ref mut count) = script_counters[i];
484 if check_fn(ch) {
485 *count += 1;
486 if *count > half {
487 return Some(script);
488 }
489 true
490 } else {
491 false
492 }
493 };
494 if found {
497 if i > 0 {
501 script_counters.swap(i - 1, i);
502 }
503 break;
504 }
505 }
506 }
507
508 let (script, _, count) = script_counters
509 .iter()
510 .cloned()
511 .max_by_key(|&(_, _, count)| count)
512 .unwrap();
513 if count != 0 {
514 Some(script)
515 } else {
516 None
517 }
518}
519
520pub fn detect_char_script(ch: char) -> Option<Script> {
521 let script_counters: [ScriptCounter; 24] = [
522 (Script::Latin, is_latin, 0),
523 (Script::Cyrillic, is_cyrillic, 0),
524 (Script::Arabic, is_arabic, 0),
525 (Script::Mandarin, is_mandarin, 0),
526 (Script::Devanagari, is_devanagari, 0),
527 (Script::Hebrew, is_hebrew, 0),
528 (Script::Ethiopic, is_ethiopic, 0),
529 (Script::Georgian, is_georgian, 0),
530 (Script::Bengali, is_bengali, 0),
531 (Script::Hangul, is_hangul, 0),
532 (Script::Hiragana, is_hiragana, 0),
533 (Script::Katakana, is_katakana, 0),
534 (Script::Greek, is_greek, 0),
535 (Script::Kannada, is_kannada, 0),
536 (Script::Tamil, is_tamil, 0),
537 (Script::Thai, is_thai, 0),
538 (Script::Gujarati, is_gujarati, 0),
539 (Script::Gurmukhi, is_gurmukhi, 0),
540 (Script::Telugu, is_telugu, 0),
541 (Script::Malayalam, is_malayalam, 0),
542 (Script::Oriya, is_oriya, 0),
543 (Script::Myanmar, is_myanmar, 0),
544 (Script::Sinhala, is_sinhala, 0),
545 (Script::Khmer, is_khmer, 0),
546 ];
547
548 for i in 0..script_counters.len() {
549 let (script, check_fn, _) = script_counters[i];
550 if check_fn(ch) {
551 return Some(script);
552 }
553 }
554 None
555}
556
557fn detect_bengali_language(text: &str) -> Language {
559 for c in text.chars() {
560 if matches!(c, '\u{09F0}' | '\u{09F1}') {
563 return Language::Assamese;
565 }
566 }
567 Language::Bengali
569}
570
571fn detect_cyrillic_language(text: &str) -> Language {
572 for c in text.chars() {
573 match c {
574 '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
576 'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
579 'ў' => return Language::Belarusian,
580 'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
581 'ө' | 'ү' | 'һ' => return Language::Mongolian,
582 'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
583 'щ' => return Language::Bulgarian,
586 _ => {}
587 }
588 }
589
590 Language::Russian
591}
592
593fn detect_devanagari_language(text: &str) -> Language {
594 for c in text.chars() {
595 match c {
596 '\u{0933}' => return Language::Marathi, '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
600 _ => (),
601 }
602 }
603
604 Language::Hindi
605}
606
607fn detect_greek_language(text: &str) -> Language {
608 for c in text.chars() {
609 match c {
610 '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612 '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614 _ => {}
615 }
616 }
617
618 Language::GreekMono
619}
620
621fn detect_latin_language(text: &str) -> Language {
622 let mut has_french_c = false;
624 let mut has_portuguese_o = false;
625 let mut has_portuguese_a = false;
626
627 for c in text.chars() {
628 match c {
629 'ß' => return Language::German1996,
631 'ő' | 'ű' => return Language::Hungarian,
632 'ł' => return Language::Polish,
633 'ř' | 'ů' => return Language::Czech,
634 'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635 'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636 return Language::Latvian
637 }
638 'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639 'ă' | 'ș' | 'ț' => return Language::Romanian,
640 'ğ' | 'ı' | 'ş' => return Language::Turkish,
641 'đ' => return Language::Croatian, 'þ' | 'ð' => return Language::Icelandic,
644 'ŵ' | 'ŷ' => return Language::Welsh,
645 'æ' | 'ø' => return Language::NorwegianBokmal, 'å' => return Language::Swedish, 'ñ' => return Language::Spanish,
648 'ä' | 'ö' | 'ü' => return Language::German1996,
649
650 'õ' => has_portuguese_o = true,
653 'ã' => has_portuguese_a = true,
654
655 'ç' => has_french_c = true, 'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658
659 _ => (),
660 }
661 }
662
663 if has_french_c && !has_portuguese_o && !has_portuguese_a {
666 return Language::French;
667 }
668
669 if has_portuguese_o && !has_french_c && !has_portuguese_a {
670 return Language::Estonian;
671 }
672
673 if has_portuguese_o || has_portuguese_a || has_french_c {
674 return Language::Portuguese;
675 }
676
677 Language::EnglishUS
678}
679
680pub fn script_to_language(script: Script, text: &str) -> Language {
681 match script {
682 Script::Ethiopic => Language::Ethiopic,
683 Script::Georgian => Language::Georgian,
684 Script::Gujarati => Language::Gujarati,
685 Script::Gurmukhi => Language::Panjabi,
686 Script::Kannada => Language::Kannada,
687 Script::Malayalam => Language::Malayalam,
688 Script::Mandarin => Language::Chinese,
689 Script::Oriya => Language::Oriya,
690 Script::Tamil => Language::Tamil,
691 Script::Telugu => Language::Telugu,
692 Script::Thai => Language::Thai,
693 Script::Bengali => detect_bengali_language(text),
694 Script::Cyrillic => detect_cyrillic_language(text),
695 Script::Devanagari => detect_devanagari_language(text),
696 Script::Greek => detect_greek_language(text),
697 Script::Latin => detect_latin_language(text),
698
699 Script::Myanmar => Language::Thai,
701 Script::Khmer => Language::Thai,
702 Script::Sinhala => Language::Hindi,
703
704 Script::Arabic => Language::Chinese,
706 Script::Hebrew => Language::Chinese,
707 Script::Hangul => Language::Chinese,
708 Script::Hiragana => Language::Chinese,
709 Script::Katakana => Language::Chinese,
710 }
711}
712
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Cyrillic.
pub fn is_cyrillic(ch: char) -> bool {
    // Inclusive (first, last) bounds; single code points repeat the value.
    const BOUNDS: &[(char, char)] = &[
        ('\u{0400}', '\u{0484}'),
        ('\u{0487}', '\u{052F}'),
        ('\u{2DE0}', '\u{2DFF}'),
        ('\u{A640}', '\u{A69D}'),
        ('\u{1D2B}', '\u{1D2B}'),
        ('\u{1D78}', '\u{1D78}'),
        ('\u{A69F}', '\u{A69F}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
724
/// Returns `true` if `ch` is an ASCII letter or lies in one of the further
/// code point ranges this module treats as Latin.
pub fn is_latin(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('a', 'z'),
        ('A', 'Z'),
        ('\u{0080}', '\u{00FF}'),
        ('\u{0100}', '\u{017F}'),
        ('\u{0180}', '\u{024F}'),
        ('\u{0250}', '\u{02AF}'),
        ('\u{1D00}', '\u{1D7F}'),
        ('\u{1D80}', '\u{1DBF}'),
        ('\u{1E00}', '\u{1EFF}'),
        ('\u{2100}', '\u{214F}'),
        ('\u{2C60}', '\u{2C7F}'),
        ('\u{A720}', '\u{A7FF}'),
        ('\u{AB30}', '\u{AB6F}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
743
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Arabic.
pub fn is_arabic(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('\u{0600}', '\u{06FF}'),
        ('\u{0750}', '\u{07FF}'),
        ('\u{08A0}', '\u{08FF}'),
        ('\u{FB50}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFF}'),
        ('\u{10E60}', '\u{10E7F}'),
        ('\u{1EE00}', '\u{1EEFF}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
756
/// Returns `true` if `ch` lies in U+0900..=U+097F, U+A8E0..=U+A8FF or
/// U+1CD0..=U+1CFF, the ranges this module treats as Devanagari.
pub fn is_devanagari(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('\u{0900}', '\u{097F}'),
        ('\u{A8E0}', '\u{A8FF}'),
        ('\u{1CD0}', '\u{1CFF}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
761
/// Returns `true` if `ch` lies in U+1200..=U+139F, U+2D80..=U+2DDF or
/// U+AB00..=U+AB2F, the ranges this module treats as Ethiopic.
pub fn is_ethiopic(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('\u{1200}', '\u{139F}'),
        ('\u{2D80}', '\u{2DDF}'),
        ('\u{AB00}', '\u{AB2F}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
766
/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module
/// treats as Hebrew.
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771
/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module
/// treats as Georgian.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
775
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// assigns to `Script::Mandarin`.
pub fn is_mandarin(ch: char) -> bool {
    // Inclusive (first, last) bounds; single code points repeat the value.
    const BOUNDS: &[(char, char)] = &[
        ('\u{2E80}', '\u{2E99}'),
        ('\u{2E9B}', '\u{2EF3}'),
        ('\u{2F00}', '\u{2FD5}'),
        ('\u{3005}', '\u{3005}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3021}', '\u{3029}'),
        ('\u{3038}', '\u{303B}'),
        ('\u{3400}', '\u{4DB5}'),
        ('\u{4E00}', '\u{9FCC}'),
        ('\u{F900}', '\u{FA6D}'),
        ('\u{FA70}', '\u{FAD9}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
791
/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module
/// treats as Bengali.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
795
/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module
/// treats as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
799
/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module
/// treats as Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Hangul.
pub fn is_hangul(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('\u{AC00}', '\u{D7AF}'),
        ('\u{1100}', '\u{11FF}'),
        ('\u{3130}', '\u{318F}'),
        ('\u{3200}', '\u{32FF}'),
        ('\u{A960}', '\u{A97F}'),
        ('\u{D7B0}', '\u{D7FF}'),
        ('\u{FF00}', '\u{FFEF}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}
816
/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module
/// treats as Greek.
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
821
/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module
/// treats as Kannada.
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
826
/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module
/// treats as Tamil.
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
831
/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module
/// treats as Thai.
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836
/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module
/// treats as Gujarati.
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
841
/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module
/// treats as Gurmukhi.
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
847
/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module
/// treats as Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851
/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module
/// treats as Malayalam.
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
856
/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module
/// treats as Oriya.
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
861
/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module
/// treats as Myanmar.
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
866
/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module
/// treats as Sinhala.
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871
/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the
/// ranges this module treats as Khmer.
pub fn is_khmer(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const BOUNDS: &[(char, char)] = &[
        ('\u{1780}', '\u{17FF}'),
        ('\u{19E0}', '\u{19FF}'),
    ];

    BOUNDS.iter().any(|&(first, last)| (first..=last).contains(&ch))
}