tamil/
tamil.rs

1/**
2 * (C) 2021, Ezhil Language Foundation.
3 *  This file is part of Open-Tamil for Rust language.
4 */
/// Vowel `அ`.
pub const VOWEL_A: char = 'அ';
/// Vowel `ஆ`.
pub const VOWEL_AA: char = 'ஆ';
/// Vowel `இ`.
pub const VOWEL_I: char = 'இ';
/// Vowel `ஈ`.
pub const VOWEL_II: char = 'ஈ';
/// Vowel `உ`.
pub const VOWEL_U: char = 'உ';
/// Vowel `ஊ`.
pub const VOWEL_UU: char = 'ஊ';
/// Vowel `எ`.
pub const VOWEL_E: char = 'எ';
/// Vowel `ஏ`.
pub const VOWEL_EE: char = 'ஏ';
/// Vowel `ஐ`.
pub const VOWEL_AI: char = 'ஐ';
/// Vowel `ஒ`.
pub const VOWEL_O: char = 'ஒ';
/// Vowel `ஓ`.
pub const VOWEL_OO: char = 'ஓ';
/// Vowel `ஔ`.
pub const VOWEL_AU: char = 'ஔ';

/// The twelve uyir (vowel) letters, in alphabetical order.
pub const UYIR: [char; 12] = [
    VOWEL_A, VOWEL_AA, VOWEL_I, VOWEL_II, VOWEL_U, VOWEL_UU, VOWEL_E, VOWEL_EE, VOWEL_AI, VOWEL_O,
    VOWEL_OO, VOWEL_AU,
];
/// Borrowed view of [`UYIR`].
pub const UYIR_LETTERS: &[char; 12] = &UYIR;

/// The aytham letter `ஃ`.
pub const AYTHAM_LETTER: char = 'ஃ';
/// Alternate spelling of [`AYTHAM_LETTER`]; same value.
pub const AYUDHA_LETTER: char = AYTHAM_LETTER;
21
/// Kuril (short) vowels.
pub const KURIL_LETTERS: [char; 5] = ['அ', 'இ', 'உ', 'எ', 'ஒ'];
/// Nedil (long) vowels; the two diphthongs are listed last.
pub const NEDIL_LETTERS: [char; 7] = ['ஆ', 'ஈ', 'ஊ', 'ஏ', 'ஓ', 'ஐ', 'ஔ'];
/// Diphthong vowels. The identifier keeps its historical spelling for compatibility.
pub const DIPTHONG_LETTERS: [char; 2] = ['ஐ', 'ஔ'];

/// Vowels listed as pronoun (demonstrative) letters.
pub const PRONOUN_LETTERS: [char; 3] = ['அ', 'இ', 'உ'];
/// Tamil-named alias for [`PRONOUN_LETTERS`].
pub const SUTTEZHUTHTHU: &[char; 3] = &PRONOUN_LETTERS;

/// Vowels listed as question-suffix letters.
pub const QUESTIONSUFFIX_LETTERS: [char; 3] = ['ஆ', 'ஏ', 'ஓ'];
/// Tamil-named alias for [`QUESTIONSUFFIX_LETTERS`].
pub const VINAAEZHUTHTHU: &[char; 3] = &QUESTIONSUFFIX_LETTERS;
31
/// Vallinam consonants, each written with the pulli mark.
pub const VALLINAM_LETTERS: [&str; 6] = ["க்", "ச்", "ட்", "த்", "ப்", "ற்"];
/// Mellinam consonants, each written with the pulli mark.
pub const MELLINAM_LETTERS: [&str; 6] = ["ங்", "ஞ்", "ண்", "ந்", "ம்", "ன்"];
/// Idayinam consonants, each written with the pulli mark.
pub const IDAYINAM_LETTERS: [&str; 6] = ["ய்", "ர்", "ல்", "வ்", "ழ்", "ள்"];

/// All eighteen mei (pure consonant) letters.
///
/// The order here is the index order the other tables in this file follow
/// (note `ஞ்` precedes `ங்`, unlike in [`MELLINAM_LETTERS`]).
pub const MEI_LETTERS: [&str; 18] = [
    "க்", "ச்", "ட்", "த்", "ப்", "ற்", // vallinam
    "ஞ்", "ங்", "ண்", "ந்", "ம்", "ன்", // mellinam
    "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", // idayinam
];
39
/// Dependent vowel signs, indexed in step with the twelve vowels.
///
/// Index 0 is a NUL placeholder for the vowel `அ`, which has no visible sign;
/// indices 1..=11 are the signs for `ஆ` through `ஔ`; index 12 holds the
/// aytham `ஃ`.
pub const ACCENT_SYMBOLS: [char; 13] = [
    '\u{0}', 'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ', 'ஃ',
];

// Each named sign must use the same index as the vowel of the same name.
// Previously `ACCENT_II` was missing and every constant from `ACCENT_U`
// onwards pointed one slot too early (e.g. `ACCENT_U` was `ீ`).
/// Vowel sign `ா` (AA).
pub const ACCENT_AA: char = ACCENT_SYMBOLS[1];
/// Vowel sign `ி` (I).
pub const ACCENT_I: char = ACCENT_SYMBOLS[2];
/// Vowel sign `ீ` (II).
pub const ACCENT_II: char = ACCENT_SYMBOLS[3];
/// Vowel sign `ு` (U).
pub const ACCENT_U: char = ACCENT_SYMBOLS[4];
/// Vowel sign `ூ` (UU).
pub const ACCENT_UU: char = ACCENT_SYMBOLS[5];
/// Vowel sign `ெ` (E).
pub const ACCENT_E: char = ACCENT_SYMBOLS[6];
/// Vowel sign `ே` (EE).
pub const ACCENT_EE: char = ACCENT_SYMBOLS[7];
/// Vowel sign `ை` (AI).
pub const ACCENT_AI: char = ACCENT_SYMBOLS[8];
/// Vowel sign `ொ` (O).
pub const ACCENT_O: char = ACCENT_SYMBOLS[9];
/// Vowel sign `ோ` (OO).
pub const ACCENT_OO: char = ACCENT_SYMBOLS[10];
/// Vowel sign `ௌ` (AU).
pub const ACCENT_AU: char = ACCENT_SYMBOLS[11];
54
/// The pulli mark `்`, kept in a one-element table.
pub const PULLI_SYMBOLS: [char; 1] = ['்'];

/// The eighteen base consonants carrying the inherent `அ` vowel, in the same
/// index order as the mei table.
pub const AGARAM_LETTERS: [char; 18] = [
    'க', 'ச', 'ட', 'த', 'ப', 'ற', // indices 0..=5
    'ஞ', 'ங', 'ண', 'ந', 'ம', 'ன', // indices 6..=11
    'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', // indices 12..=17
];

/// Consonants grouped as mayangoli letters.
pub const MAYANGOLI_LETTERS: [char; 8] = ['ண', 'ன', 'ந', 'ல', 'ழ', 'ள', 'ர', 'ற'];
61
/// Consonant `க`.
pub const CONSONANT_KA: char = 'க';
/// Consonant `ங`.
pub const CONSONANT_NGA: char = 'ங';
/// Consonant `ச`.
pub const CONSONANT_CA: char = 'ச';
/// Consonant `ஜ`.
pub const CONSONANT_JA: char = 'ஜ';
/// Consonant `ஞ`.
pub const CONSONANT_NYA: char = 'ஞ';
/// Consonant `ட`.
pub const CONSONANT_TTA: char = 'ட';
/// Consonant `ண`.
pub const CONSONANT_NNA: char = 'ண';
/// Consonant `ன`.
pub const CONSONANT_NNNA: char = 'ன';
/// Consonant `த`.
pub const CONSONANT_TA: char = 'த';
/// Alternate name for [`CONSONANT_TA`]; same value.
pub const CONSONANT_THA: char = CONSONANT_TA;
/// Consonant `ந`.
pub const CONSONANT_NA: char = 'ந';
/// Consonant `ப`.
pub const CONSONANT_PA: char = 'ப';
/// Consonant `ம`.
pub const CONSONANT_MA: char = 'ம';
/// Consonant `ய`.
pub const CONSONANT_YA: char = 'ய';
/// Consonant `ர`.
pub const CONSONANT_RA: char = 'ர';
/// Consonant `ற`.
pub const CONSONANT_RRA: char = 'ற';
/// Consonant `ல`.
pub const CONSONANT_LA: char = 'ல';
/// Consonant `ள`.
pub const CONSONANT_LLA: char = 'ள';
/// Consonant `ழ`.
pub const CONSONANT_LLLA: char = 'ழ';
/// Alternate name for [`CONSONANT_LLLA`]; same value.
pub const CONSONANT_ZHA: char = CONSONANT_LLLA;
/// Consonant `வ`.
pub const CONSONANT_VA: char = 'வ';
83
/// Sanskrit-derived consonants in their inherent-`அ` form.
pub const SANSKRIT_LETTERS: [&str; 6] = [
    "ஶ", "ஜ", "ஷ", "ஸ", "ஹ", "க்ஷ",
];
/// The same consonants as [`SANSKRIT_LETTERS`], each followed by the pulli mark.
pub const SANSKRIT_MEI_LETTERS: [&str; 6] = [
    "ஶ்", "ஜ்", "ஷ்", "ஸ்", "ஹ்", "க்ஷ்",
];
86
/// The eighteen mei letters followed by the six Sanskrit-derived mei letters.
///
/// Index-aligned with [`GRANTHA_AGARAM_LETTERS`].
pub const GRANTHA_MEI_LETTERS: [&str; 24] = [
    "க்", "ச்", "ட்", "த்", "ப்", "ற்", // 0..=5
    "ஞ்", "ங்", "ண்", "ந்", "ம்", "ன்", // 6..=11
    "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", // 12..=17
    "ஶ்", "ஜ்", "ஷ்", "ஸ்", "ஹ்", "க்ஷ்", // 18..=23
];

/// Inherent-`அ` forms of the consonants in [`GRANTHA_MEI_LETTERS`], in the
/// same index order.
pub const GRANTHA_AGARAM_LETTERS: [&str; 24] = [
    "க", "ச", "ட", "த", "ப", "ற", // 0..=5
    "ஞ", "ங", "ண", "ந", "ம", "ன", // 6..=11
    "ய", "ர", "ல", "வ", "ழ", "ள", // 12..=17
    "ஶ", "ஜ", "ஷ", "ஸ", "ஹ", "க்ஷ", // 18..=23
];
139
/// All 216 uyirmei (consonant + vowel) letters.
///
/// Laid out one consonant per row, twelve vowel forms per row, so the entry
/// for consonant index `c` and vowel index `v` is at `c * 12 + v`.
#[rustfmt::skip]
pub const UYIRMEI_LETTERS: [&str; 216] = [
    "க", "கா", "கி", "கீ", "கு", "கூ", "கெ", "கே", "கை", "கொ", "கோ", "கௌ",
    "ச", "சா", "சி", "சீ", "சு", "சூ", "செ", "சே", "சை", "சொ", "சோ", "சௌ",
    "ட", "டா", "டி", "டீ", "டு", "டூ", "டெ", "டே", "டை", "டொ", "டோ", "டௌ",
    "த", "தா", "தி", "தீ", "து", "தூ", "தெ", "தே", "தை", "தொ", "தோ", "தௌ",
    "ப", "பா", "பி", "பீ", "பு", "பூ", "பெ", "பே", "பை", "பொ", "போ", "பௌ",
    "ற", "றா", "றி", "றீ", "று", "றூ", "றெ", "றே", "றை", "றொ", "றோ", "றௌ",
    "ஞ", "ஞா", "ஞி", "ஞீ", "ஞு", "ஞூ", "ஞெ", "ஞே", "ஞை", "ஞொ", "ஞோ", "ஞௌ",
    "ங", "ஙா", "ஙி", "ஙீ", "ஙு", "ஙூ", "ஙெ", "ஙே", "ஙை", "ஙொ", "ஙோ", "ஙௌ",
    "ண", "ணா", "ணி", "ணீ", "ணு", "ணூ", "ணெ", "ணே", "ணை", "ணொ", "ணோ", "ணௌ",
    "ந", "நா", "நி", "நீ", "நு", "நூ", "நெ", "நே", "நை", "நொ", "நோ", "நௌ",
    "ம", "மா", "மி", "மீ", "மு", "மூ", "மெ", "மே", "மை", "மொ", "மோ", "மௌ",
    "ன", "னா", "னி", "னீ", "னு", "னூ", "னெ", "னே", "னை", "னொ", "னோ", "னௌ",
    "ய", "யா", "யி", "யீ", "யு", "யூ", "யெ", "யே", "யை", "யொ", "யோ", "யௌ",
    "ர", "ரா", "ரி", "ரீ", "ரு", "ரூ", "ரெ", "ரே", "ரை", "ரொ", "ரோ", "ரௌ",
    "ல", "லா", "லி", "லீ", "லு", "லூ", "லெ", "லே", "லை", "லொ", "லோ", "லௌ",
    "வ", "வா", "வி", "வீ", "வு", "வூ", "வெ", "வே", "வை", "வொ", "வோ", "வௌ",
    "ழ", "ழா", "ழி", "ழீ", "ழு", "ழூ", "ழெ", "ழே", "ழை", "ழொ", "ழோ", "ழௌ",
    "ள", "ளா", "ளி", "ளீ", "ளு", "ளூ", "ளெ", "ளே", "ளை", "ளொ", "ளோ", "ளௌ",
];
156
/// Every Tamil letter in use, including the Sanskrit-derived letters.
///
/// Section layout (index ranges are inclusive):
/// * `0..=11`    uyir (vowels)
/// * `12`        aytham
/// * `13..=30`   mei
/// * `31..=48`   agaram
/// * `49..=52`   Sanskrit (vada mozhi) consonants
/// * `53..=56`   Sanskrit mei
/// * `57..=272`  uyirmei, twelve forms per consonant
/// * `273..=344` Sanskrit uyirmei, twelve forms per consonant
///
/// The agaram forms appear again as the first entry of each uyirmei row, so
/// those strings occur twice in this table.
#[rustfmt::skip]
pub const TAMIL_LETTERS: [&str; 345] = [
    /* Uyir */
    "அ", "ஆ", "இ", "ஈ", "உ", "ஊ", "எ", "ஏ", "ஐ", "ஒ", "ஓ", "ஔ",
    /* Ayuda Ezhuthu */
    "ஃ",
    /* Mei */
    "க்", "ச்", "ட்", "த்", "ப்", "ற்",
    "ஞ்", "ங்", "ண்", "ந்", "ம்", "ன்",
    "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்",
    /* Agaram */
    "க", "ச", "ட", "த", "ப", "ற",
    "ஞ", "ங", "ண", "ந", "ம", "ன",
    "ய", "ர", "ல", "வ", "ழ", "ள",
    /* Sanskrit (Vada Mozhi) */
    "ஜ", "ஷ", "ஸ", "ஹ",
    /* Sanskrit (Mei) */
    "ஜ்", "ஷ்", "ஸ்", "ஹ்",
    /* Uyir Mei */
    "க", "கா", "கி", "கீ", "கு", "கூ", "கெ", "கே", "கை", "கொ", "கோ", "கௌ",
    "ச", "சா", "சி", "சீ", "சு", "சூ", "செ", "சே", "சை", "சொ", "சோ", "சௌ",
    "ட", "டா", "டி", "டீ", "டு", "டூ", "டெ", "டே", "டை", "டொ", "டோ", "டௌ",
    "த", "தா", "தி", "தீ", "து", "தூ", "தெ", "தே", "தை", "தொ", "தோ", "தௌ",
    "ப", "பா", "பி", "பீ", "பு", "பூ", "பெ", "பே", "பை", "பொ", "போ", "பௌ",
    "ற", "றா", "றி", "றீ", "று", "றூ", "றெ", "றே", "றை", "றொ", "றோ", "றௌ",
    "ஞ", "ஞா", "ஞி", "ஞீ", "ஞு", "ஞூ", "ஞெ", "ஞே", "ஞை", "ஞொ", "ஞோ", "ஞௌ",
    "ங", "ஙா", "ஙி", "ஙீ", "ஙு", "ஙூ", "ஙெ", "ஙே", "ஙை", "ஙொ", "ஙோ", "ஙௌ",
    "ண", "ணா", "ணி", "ணீ", "ணு", "ணூ", "ணெ", "ணே", "ணை", "ணொ", "ணோ", "ணௌ",
    "ந", "நா", "நி", "நீ", "நு", "நூ", "நெ", "நே", "நை", "நொ", "நோ", "நௌ",
    "ம", "மா", "மி", "மீ", "மு", "மூ", "மெ", "மே", "மை", "மொ", "மோ", "மௌ",
    "ன", "னா", "னி", "னீ", "னு", "னூ", "னெ", "னே", "னை", "னொ", "னோ", "னௌ",
    "ய", "யா", "யி", "யீ", "யு", "யூ", "யெ", "யே", "யை", "யொ", "யோ", "யௌ",
    "ர", "ரா", "ரி", "ரீ", "ரு", "ரூ", "ரெ", "ரே", "ரை", "ரொ", "ரோ", "ரௌ",
    "ல", "லா", "லி", "லீ", "லு", "லூ", "லெ", "லே", "லை", "லொ", "லோ", "லௌ",
    "வ", "வா", "வி", "வீ", "வு", "வூ", "வெ", "வே", "வை", "வொ", "வோ", "வௌ",
    "ழ", "ழா", "ழி", "ழீ", "ழு", "ழூ", "ழெ", "ழே", "ழை", "ழொ", "ழோ", "ழௌ",
    "ள", "ளா", "ளி", "ளீ", "ளு", "ளூ", "ளெ", "ளே", "ளை", "ளொ", "ளோ", "ளௌ",
    /* Sanskrit Uyir-Mei */
    "ஶ", "ஶா", "ஶி", "ஶீ", "ஶு", "ஶூ", "ஶெ", "ஶே", "ஶை", "ஶொ", "ஶோ", "ஶௌ",
    "ஜ", "ஜா", "ஜி", "ஜீ", "ஜு", "ஜூ", "ஜெ", "ஜே", "ஜை", "ஜொ", "ஜோ", "ஜௌ",
    "ஷ", "ஷா", "ஷி", "ஷீ", "ஷு", "ஷூ", "ஷெ", "ஷே", "ஷை", "ஷொ", "ஷோ", "ஷௌ",
    "ஸ", "ஸா", "ஸி", "ஸீ", "ஸு", "ஸூ", "ஸெ", "ஸே", "ஸை", "ஸொ", "ஸோ", "ஸௌ",
    "ஹ", "ஹா", "ஹி", "ஹீ", "ஹு", "ஹூ", "ஹெ", "ஹே", "ஹை", "ஹொ", "ஹோ", "ஹௌ",
    "க்ஷ", "க்ஷா", "க்ஷி", "க்ஷீ", "க்ஷு", "க்ஷூ", "க்ஷெ", "க்ஷே", "க்ஷை", "க்ஷொ", "க்ஷோ", "க்ஷௌ",
];
512
/// Tamil day sign `௳`.
pub const DAY: &str = "௳";
/// Tamil month sign `௴`.
pub const MONTH: &str = "௴";
/// Tamil year sign `௵`.
pub const YEAR: &str = "௵";
/// Tamil debit sign `௶`.
pub const DEBIT: &str = "௶";
/// Tamil credit sign `௷`.
pub const CREDIT: &str = "௷";
/// Tamil rupee sign `௹`.
pub const RUPEE: &str = "௹";
/// Tamil number sign `௺`.
pub const NUMERAL: &str = "௺";
/// The four-code-point sequence ஶ்ரீ (SRI).
pub const SRI: &str = "\u{0bb6}\u{0bcd}\u{0bb0}\u{0bc0}";
/// The three-code-point conjunct க்ஷ (KSHA).
pub const KSHA: &str = "\u{0b95}\u{0bcd}\u{0bb7}";
/// [`KSHA`] followed by the pulli mark: க்ஷ் (KSH).
pub const KSH: &str = "\u{0b95}\u{0bcd}\u{0bb7}\u{0bcd}";
/// Indian rupee sign `₹`.
pub const INDIAN_RUPEE: &str = "₹";

/// All symbol constants above, in declaration order.
pub const TAMIL_SYMBOLS: [&str; 11] = [
    DAY, MONTH, YEAR, DEBIT, CREDIT, RUPEE, NUMERAL, SRI, KSHA, KSH, INDIAN_RUPEE,
];
537
538/* length of the definitions */
539pub fn accent_len() -> usize {
540    ACCENT_SYMBOLS.len()
541}
/// Number of aytham letters; always one.
pub fn ayudha_len() -> usize {
    1_usize
}
545pub fn uyir_len() -> usize {
546    UYIR_LETTERS.len()
547}
548pub fn mei_len() -> usize {
549    MEI_LETTERS.len()
550}
551pub fn agaram_len() -> usize {
552    AGARAM_LETTERS.len()
553}
554pub fn uyirmei_len() -> usize {
555    UYIRMEI_LETTERS.len()
556}
557pub fn tamil_len() -> usize {
558    TAMIL_LETTERS.len()
559}
560
561pub fn uyir(idx: usize) -> char {
562    UYIR_LETTERS[idx]
563}
564pub fn agaram(idx: usize) -> char {
565    AGARAM_LETTERS[idx]
566}
567pub fn mei(idx: usize) -> String {
568    MEI_LETTERS[idx].to_string()
569}
570pub fn uyirmei(idx: usize) -> String {
571    UYIRMEI_LETTERS[idx].to_string()
572}
573
574pub fn tamil247() -> Vec<String> {
575    let mut _tamil247: Vec<String> = vec![];
576    _tamil247.push(AYUDHA_LETTER.to_string());
577    for letter in UYIR_LETTERS.iter() {
578        _tamil247.push(letter.to_string());
579    }
580    for letter in MEI_LETTERS.iter() {
581        _tamil247.push(letter.to_string());
582    }
583    for letter in UYIRMEI_LETTERS.iter() {
584        _tamil247.push(letter.to_string());
585    }
586    _tamil247
587}
588
589/**
590mei_to_agaram("ழ்") => ழ
591*/
592pub fn mei_to_agaram(in_syllable: String) -> String {
593    match GRANTHA_MEI_LETTERS.iter().position(|&x| x == in_syllable) {
594        Some(mei_pos) => {
595            let agaram_a_pos: usize = 0;
596            uyirmei_constructed(mei_pos, agaram_a_pos)
597        }
598
599        None => in_syllable,
600    }
601}
602
603/**
604def uyirmei_constructed( mei_idx, uyir_idx):
605    """ construct uyirmei letter give mei index and uyir index """
606    idx,idy = mei_idx,uyir_idx
607    assert ( idy >= 0 and idy < uyir_len() )
608    assert ( idx >= 0 and idx < 6+mei_len() )
609    return grantha_agaram_letters[mei_idx]+accent_symbols[uyir_idx]
610*/
611pub fn uyirmei_constructed(mei_idx: usize, uyir_idx: usize) -> String {
612    match uyir_idx {
613        0 => {
614            format!("{}", GRANTHA_AGARAM_LETTERS[mei_idx])
615        }
616        _ => {
617            format!(
618                "{}{}",
619                GRANTHA_AGARAM_LETTERS[mei_idx], ACCENT_SYMBOLS[uyir_idx]
620            )
621        }
622    }
623}
624
/// Returns `true` when `_x` lies in the code-point range U+0B82..=U+0BFA
/// (decimal 2946..=3066), both ends inclusive.
pub fn is_tamil_unicode_predicate(_x: char) -> bool {
    matches!(_x, '\u{0B82}'..='\u{0BFA}')
}
630
631pub fn getidx(letter: String) -> usize {
632    let mut itr: usize = 0;
633    loop {
634        if itr == TAMIL_LETTERS.len() {
635            panic!("Cannot find letter in Tamil arichuvadi");
636        } else if letter == TAMIL_LETTERS[itr] {
637            break;
638        } else {
639            itr = itr + 1
640        }
641    }
642    itr
643}
644
645/*
646    """check if the word has any occurance of any tamil letter """
647    # list comprehension is not necessary - we bail at earliest
648*/
649pub fn has_tamil(word: &str) -> bool {
650    for c in word.chars() {
651        let cstr = c.to_string();
652        if TAMIL_LETTERS.iter().any(|x| *x == cstr) {
653            return true;
654        }
655    }
656    false
657}
658
659pub fn get_letters_length(word: &str) -> usize {
660    get_letters(word).len()
661}
662
/// Splits a Unicode string into Tamil letters.
///
/// Every `char` starts a new letter, except the pulli and the eleven vowel
/// signs: those are appended to the letter before them. A sign with nothing
/// before it becomes a letter on its own. Non-Tamil characters are passed
/// through one `char` per element.
pub fn get_letters(x: &str) -> Vec<String> {
    /// Marks that attach to the preceding letter.
    const SPL_SYMBOLS: [char; 12] = ['்', 'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', 'ௌ'];

    let mut letters: Vec<String> = Vec::new();
    for ch in x.chars() {
        if SPL_SYMBOLS.contains(&ch) {
            if let Some(previous) = letters.last_mut() {
                // Attach the mark to the letter already collected.
                previous.push(ch);
                continue;
            }
        }
        letters.push(ch.to_string());
    }
    letters
}
694
695pub fn all_tamil(word: &str) -> bool {
696    get_letters(&word)
697        .iter()
698        .all(|x| is_tamil_unicode_predicate(x.chars().next().unwrap()))
699}
700
701pub fn istamil_prefix(word: &str) -> bool {
702    /* check if the given word has a tamil prefix. Returns
703     * either a True/False flag
704     */
705    match word.len() {
706        0 => false,
707        _ => {
708            let letters = get_letters(&word);
709            TAMIL_LETTERS.iter().any(|x| x == &letters[0])
710        }
711    }
712}
713
714pub fn reverse_word(word: &str) -> String {
715    let letters = get_letters(word);
716    let mut word_out = String::from("");
717    for letter in letters.iter().rev() {
718        word_out.push_str(letter);
719    }
720    word_out
721}