ar_reshaper/
reshaper.rs

1use core::ops::RangeInclusive;
2
3use alloc::{collections::BTreeMap, string::String, vec::Vec};
4
5use crate::{
6    config::ReshaperConfig,
7    form::LetterForm,
8    letters::{
9        letters_db::{TATWEEL, ZWJ},
10        *,
11    },
12    ligatures::*,
13};
14
/// Placeholder entry written over the trailing letters of a matched ligature.
///
/// Entries whose character is `'\0'` are skipped when the final output is assembled.
const EMPTY: (char, LetterForm) = ('\0', LetterForm::Unsupported);
16
/// Code point ranges treated as harakat (diacritic marks) by the reshaper.
///
/// The ranges are disjoint; a character is a haraka if any range contains it.
/// The last range is the union of the previously listed, mutually overlapping
/// `08d4..=08e1`, `08d4..=08ed` and `08e3..=08ff` ranges.
static HARAKAT_RE: [RangeInclusive<char>; 7] = [
    '\u{0610}'..='\u{061a}',
    '\u{064b}'..='\u{065f}',
    '\u{0670}'..='\u{0670}',
    '\u{06d6}'..='\u{06dc}',
    '\u{06df}'..='\u{06e8}',
    '\u{06ea}'..='\u{06ed}',
    '\u{08d4}'..='\u{08ff}',
];
28
/// The main type for reconstructing sentences so they can be used in
/// applications that don't support Arabic script.
///
/// Build one with [`ArabicReshaper::new`] (or `Default`), then call
/// [`ArabicReshaper::reshape`] on the text to convert.
#[derive(Default, Clone)]
pub struct ArabicReshaper {
    /// Options read by `reshape` on every call.
    config: ReshaperConfig,
    /// Letters table built from `config.language`; refreshed by
    /// `modify_config` when the language changes.
    letters: Letters,
}
36
37impl ArabicReshaper {
38    /// Create a new [ArabicReshaper] using the given config
39    pub const fn new(config: ReshaperConfig) -> Self {
40        Self {
41            letters: Letters::new(config.language),
42            config,
43        }
44    }
45
46    /// Check whatever the text need reshaping or not.
47    pub fn need_reshape<S>(&self, text: S) -> bool
48    where
49        S: AsRef<str>,
50    {
51        text.as_ref().chars().any(|c| self.letters.contains_key(&c))
52    }
53
54    /// Reshape the given line and return the reshaped string
55    pub fn reshape<S>(&self, text: S) -> String
56    where
57        S: AsRef<str>,
58    {
59        let text = text.as_ref();
60
61        if text.is_empty() {
62            return String::new();
63        }
64
65        let ReshaperConfig {
66            delete_harakat,
67            shift_harakat_position,
68            delete_tatweel,
69            support_zwj,
70            use_unshaped_instead_of_isolated,
71            support_ligatures,
72            ..
73        } = self.config;
74
75        let isolated_form = match use_unshaped_instead_of_isolated {
76            true => LetterForm::Unshaped,
77            false => LetterForm::Isolated,
78        };
79
80        let mut output = Vec::new();
81        let mut position_harakat: BTreeMap<isize, Vec<char>> = BTreeMap::new();
82
83        for letter in text.chars() {
84            if HARAKAT_RE.iter().any(|h| h.contains(&letter)) {
85                if !delete_harakat {
86                    let mut position = (output.len() - 1) as isize;
87                    if shift_harakat_position {
88                        position -= 1
89                    }
90
91                    let entry = position_harakat.entry(position).or_default();
92
93                    if shift_harakat_position {
94                        entry.insert(0, letter);
95                    } else {
96                        entry.push(letter);
97                    }
98                }
99            } else if letter == TATWEEL && delete_tatweel || letter == ZWJ && !support_zwj {
100            } else if !self.letters.contains_key(&letter) {
101                output.push((letter, LetterForm::Unsupported))
102            } else if output.is_empty() {
103                output.push((letter, isolated_form)) // first letter
104            } else {
105                let previous_letter = output.last_mut().unwrap();
106                if (previous_letter.1 == LetterForm::Unsupported)
107                    || (!self.letters.connects_with_letter_before(letter))
108                    || (!self.letters.connects_with_letter_after(previous_letter.0))
109                    || (previous_letter.1 == LetterForm::Final
110                        && !self
111                            .letters
112                            .connects_with_letters_before_and_after(previous_letter.0))
113                {
114                    output.push((letter, isolated_form));
115                } else if previous_letter.1 == isolated_form {
116                    *previous_letter = (previous_letter.0, LetterForm::Initial);
117                    output.push((letter, LetterForm::Final));
118                } else {
119                    // Otherwise, we will change the previous letter to connect
120                    // to the current letter
121                    *previous_letter = (previous_letter.0, LetterForm::Medial);
122                    output.push((letter, LetterForm::Final));
123                }
124            }
125
126            // Remove ZWJ if it's the second to last item as it won't be useful
127            let len = output.len();
128            if support_zwj && len > 1 && output[len - 2].0 == ZWJ {
129                output.remove(len - 2);
130            }
131        }
132
133        if support_zwj && !output.is_empty() && output.last().unwrap().0 == ZWJ {
134            output.pop();
135        }
136
137        if support_ligatures {
138            // Clean text from Harakat to be able to find ligatures
139            let mut text: String = text
140                .chars()
141                .filter(|c| !HARAKAT_RE.iter().any(|r| r.contains(c)))
142                .collect();
143
144            // Clean text from Tatweel to find ligatures if delete_tatweel
145            if delete_tatweel {
146                text = text.replace(TATWEEL, "")
147            }
148
149            for ((tmatchs, forms), enabled) in
150                LIGATURES.iter().zip(self.config.ligatures.list.iter())
151            {
152                if !enabled {
153                    continue;
154                }
155                for tmatch in *tmatchs {
156                    for (idx, m) in text.match_indices(tmatch) {
157                        // match_indices returns bytes offset
158                        // we want character position
159                        let a = text[..idx].chars().count();
160                        let b = text[..idx + m.len()].chars().count();
161
162                        let a_form = output[a].1;
163                        let b_form = output[b - 1].1;
164                        let ligature_form: LetterForm;
165
166                        // +-----------+----------+---------+---------+----------+
167                        // | a   \   b | ISOLATED | INITIAL | MEDIAL  | FINAL    |
168                        // +-----------+----------+---------+---------+----------+
169                        // | ISOLATED  | ISOLATED | INITIAL | INITIAL | ISOLATED |
170                        // | INITIAL   | ISOLATED | INITIAL | INITIAL | ISOLATED |
171                        // | MEDIAL    | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
172                        // | FINAL     | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
173                        // +-----------+----------+---------+---------+----------+
174
175                        if a_form == isolated_form || a_form == LetterForm::Initial {
176                            if b_form == isolated_form || b_form == LetterForm::Final {
177                                ligature_form = LetterForm::Isolated;
178                            } else {
179                                ligature_form = LetterForm::Initial;
180                            }
181                        } else if b_form == isolated_form || b_form == LetterForm::Final {
182                            ligature_form = LetterForm::Final;
183                        } else {
184                            ligature_form = LetterForm::Medial;
185                        }
186
187                        if forms.get(ligature_form) == '\0' {
188                            continue;
189                        }
190
191                        output[a] = (forms.get(ligature_form), LetterForm::Unsupported);
192
193                        for e in output[a + 1..b].iter_mut() {
194                            *e = EMPTY;
195                        }
196                    }
197                }
198            }
199        }
200
201        let mut result = Vec::with_capacity(text.len());
202
203        if !delete_harakat {
204            if let Some(ph) = position_harakat.get(&-1) {
205                result.extend(ph);
206            }
207        }
208
209        for (i, (letter, form)) in output.into_iter().enumerate() {
210            if letter != '\0' {
211                result.push(self.letters.get_form(letter, form))
212            }
213
214            if !delete_harakat {
215                if let Some(ph) = position_harakat.get(&(i as isize)) {
216                    result.extend(ph);
217                }
218            }
219        }
220
221        result.into_iter().collect()
222    }
223
224    /// Reshape all lines in the given slice and return a new [Vec<String>] of strings
225    pub fn reshape_lines<S, L>(&self, lines: L) -> Vec<String>
226    where
227        S: AsRef<str>,
228        L: AsRef<[S]>,
229    {
230        let lines = lines.as_ref();
231        let mut result = Vec::with_capacity(lines.len());
232        for line in lines {
233            result.push(self.reshape(line.as_ref()));
234        }
235        result
236    }
237
238    /// A safe way to modify the config ([ReshaperConfig]) after creating
239    /// the [ArabicReshaper].
240    pub fn modify_config<F>(&mut self, func: F)
241    where
242        F: FnOnce(&mut ReshaperConfig),
243    {
244        let language_before = self.config.language;
245
246        func(&mut self.config);
247
248        if language_before != self.config.language {
249            // language changed, update letters
250            self.letters.change_language(self.config.language);
251        }
252    }
253}
254
255impl From<ReshaperConfig> for ArabicReshaper {
256    fn from(value: ReshaperConfig) -> Self {
257        ArabicReshaper::new(value)
258    }
259}