// quranize/src/lib.rs

1//! [Quranize] encodes alphabetic text into quran text, a.k.a. transliteration.
2//!
3//! # Examples
4//!
5//! ## Adding crate quranize to a project's dependencies
6//!
7//! Run `cargo add quranize`, or add the following lines to `Cargo.toml` file.
8//! ```toml
9//! [dependencies]
10//! quranize = "1.0"
11//! ```
12//!
13//! ## Encoding alphabetic text to quran text
14//!
15//! ```
16//! let q = quranize::Quranize::new();
17//!
18//! assert_eq!(q.encode("bismillahirrohmanirrohim")[0].0, "بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيم");
19//! assert_eq!(q.encode("amma yatasa alun")[0].0, "عَمَّ يَتَساءَلون");
20//!
21//! let (i, _) = q.find("عَمَّ يَتَساءَلون")[0];
22//! let &(page, sura, aya, _) = q.get_data(i).unwrap();
23//! assert_eq!((page, sura, aya), (582, 78, 1));
24//! ```
25
26mod normalization;
27mod quran_metadata;
28mod suffix_tree;
29mod transliteration;
30
31use suffix_tree::{Edge, Index, SuffixTree};
32use transliteration::{contextual_map, harf_muqottoah_map, map};
33
34type EncodeResults = Vec<(String, usize, Vec<&'static str>)>;
35type PrevMap = (char, &'static str);
36
37use quran_metadata::*;
38const QURAN_TXT: &str = include_str!("quran-simple-min.txt");
39
/// Quranize model, for doing transliteration, finding string, and getting aya.
pub struct Quranize {
    // One record per aya, in mushaf order: (page number, sura number, aya number, aya text).
    data: Vec<(u16, u8, u16, &'static str)>,
    // Suffix tree built over the whole bundled quran text; powers `encode` and `find`.
    tree: suffix_tree::SuffixTree<'static>,
}
45
impl Quranize {
    // Vertex count the suffix tree reaches for the bundled quran text;
    // used to preallocate, and asserted in `test_tree_props`.
    const EXPECTED_VERTEX_COUNT: usize = 126_307;

    /// Create a new [`Quranize`] instance.
    ///
    /// Scans the bundled quran text once (one aya per line), deriving each
    /// aya's page/sura/aya numbers from the metadata tables while pushing the
    /// aya into `data` and into the suffix tree.
    pub fn new() -> Self {
        let mut data = Vec::with_capacity(AYA_COUNT);
        let mut tree = SuffixTree::with_capacity(Self::EXPECTED_VERTEX_COUNT);

        // Running counters advanced as the scan crosses sura/page boundaries.
        let mut sura_num = 0;
        let mut page = 0;
        (0..AYA_COUNT)
            .zip(QURAN_TXT.split_inclusive('\n'))
            .map(|(i, q)| {
                // Move to the next sura when row `i` is that sura's first aya.
                sura_num += (SURA_STARTS.get(sura_num) == Some(&i)) as usize;
                let aya_num = i - SURA_STARTS[sura_num - 1] + 1;
                // Move to the next page when (sura, aya) is that page's first aya.
                page += (PAGE_OFFSETS.get(page) == Some(&(sura_num, aya_num))) as usize;
                (i, page as u16, sura_num as u8, aya_num as u16, q)
            })
            .map(|(i, p, s, a, q)| (i, p, s, a, Self::trim_basmalah(s, a, q)))
            .for_each(|(i, p, s, a, q)| {
                data.push((p, s, a, q.trim_end()));
                tree.construct(i, q);
            });
        Self { data, tree }
    }

    // Strip the leading basmalah (the first 4 words) from the first aya of a
    // sura — except sura 1, where the basmalah *is* aya 1, and sura 9, which
    // has no basmalah.
    fn trim_basmalah(s: u8, a: u16, q: &str) -> &str {
        match (s, a) {
            (1, _) | (9, _) => q,
            // splitn(5, ' ') yields the 4 basmalah words plus the remainder;
            // `last()` is the remainder. `unwrap` is safe: splitn yields at
            // least one item for any input.
            (_, 1) => q.splitn(5, ' ').last().unwrap(),
            _ => q,
        }
    }

    /// Do transliteration on `s`, returning a list of tuple:
    /// - `String`: transliteration result / quran form
    /// - `usize`: location count where the quran form above is found in Alquran
    /// - `Vec<&'static str>`: explanation for each chars in the quran form above
    ///
    /// # Examples
    ///
    /// ```
    /// let q = quranize::Quranize::new();
    /// assert_eq!(q.encode("alif lam mim"), [("الم".to_string(), 912, vec!["alif", "lam", "mim"])]);
    /// assert_eq!(q.encode("minal jinnati wannas")[0].0, "مِنَ الجِنَّةِ وَالنّاس");
    /// ```
    pub fn encode(&self, s: &str) -> EncodeResults {
        // Two passes from the tree root: regular normalization first, then the
        // harf-muqottoah normalization; both result sets are chained together.
        let mut results: EncodeResults = match normalization::normalize(s).as_str() {
            "" => vec![],
            s => { self.tree.edges_from(0) }
                .flat_map(|&e| self.rev_encode(s, e, None))
                .collect(),
        }
        .into_iter()
        .chain(match normalization::normalize_muqottoah(s).as_str() {
            "" => vec![],
            s => { self.tree.edges_from(0) }
                .flat_map(|&e| self.rev_encode_muqottoah(s, e))
                .collect(),
        })
        // rev_encode* build the quran form and its explanations in reverse
        // order (pushed on the way back up the recursion), so flip both here.
        .map(|(q, n, e)| (q.chars().rev().collect(), n, e.into_iter().rev().collect()))
        .collect();
        // Sort by quran form and drop duplicates produced by different paths.
        results.sort_unstable_by(|x, y| x.0.cmp(&y.0));
        results.dedup_by(|x, y| x.0 == y.0);
        results
    }

    // Recursively match the normalized input `s` against suffix-tree edge
    // `(v, w, l)` (label `l`, target vertex `w`). `pm` is the previously
    // matched (quran char, transliteration) pair, feeding context-sensitive
    // mappings. Results carry the quran form in *reverse* character order;
    // `encode` reverses them afterwards.
    fn rev_encode(&self, s: &str, (v, w, l): Edge, pm: Option<PrevMap>) -> EncodeResults {
        let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
            // Candidate transliterations for `c`: the base map plus any
            // contextual ones keyed on the previous quran char.
            let tsls = map(c).iter().chain(contextual_map(pm.unzip().0, c));
            let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
                s.strip_prefix(tsl).map(|s| match s {
                    // Input fully consumed: emit `c` with its occurrence count.
                    "" => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
                    s => match &l[c.len_utf8()..] {
                        // Edge label exhausted: descend into `w`'s child edges.
                        "" => { self.tree.edges_from(w) }
                            .flat_map(|&e| self.rev_encode(s, e, Some((c, tsl))))
                            .collect(),
                        // Otherwise keep consuming the rest of this edge label.
                        l => self.rev_encode(s, (v, w, l), Some((c, tsl))),
                    }
                    // Append the current char/transliteration on the way back
                    // up (hence the reversed result order).
                    .into_iter()
                    .map(|(mut q, n, mut e)| {
                        q.push(c);
                        e.push(tsl);
                        (q, n, e)
                    })
                    .collect(),
                })
            });
            tsl_results_iter.flatten().collect()
        });
        results_iter.collect()
    }

    // Same traversal as `rev_encode`, but using the harf-muqottoah mapping and
    // no contextual lookups.
    fn rev_encode_muqottoah(&self, s: &str, (v, w, l): Edge) -> EncodeResults {
        let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
            let tsls = harf_muqottoah_map(c).iter();
            let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
                s.strip_prefix(tsl).map(|s| match s {
                    // Input fully consumed: only accept the match when vertex
                    // flag `vertices[w].2` is set (presumably marking a valid
                    // stopping point in the tree — TODO confirm against
                    // suffix_tree module).
                    "" => match self.tree.vertices[w].2 {
                        true => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
                        false => vec![],
                    },
                    s => match &l[c.len_utf8()..] {
                        // Edge label exhausted: descend into `w`'s child edges.
                        "" => { self.tree.edges_from(w) }
                            .flat_map(|&e| self.rev_encode_muqottoah(s, e))
                            .collect(),
                        // Otherwise keep consuming the rest of this edge label.
                        l => self.rev_encode_muqottoah(s, (v, w, l)),
                    }
                    .into_iter()
                    .map(|(mut q, n, mut e)| {
                        q.push(c);
                        e.push(tsl);
                        (q, n, e)
                    })
                    .collect(),
                })
            });
            tsl_results_iter.flatten().collect()
        });
        results_iter.collect()
    }

    /// Find `s` in Alquran, returning a list of `Index`, where
    /// `Index` is a tuple, containing:
    /// - `usize`: aya row / aya offset (`0..6236`)
    /// - `usize`: string offset in a specific aya (`0..length of aya`)
    ///
    /// # Examples
    /// ```
    /// let q = quranize::Quranize::new();
    /// let index = q.find("عَمَّ يَتَساءَلون")[0];
    /// assert_eq!(index, (5672, 0));
    /// ```
    pub fn find(&self, s: &str) -> Vec<Index> {
        self.tree.find(s, 0)
    }

    /// Get the data for a specific aya row / aya offset (`i`: 0..6236).
    /// Data is a tuple of:
    /// - `u16`: page number
    /// - `u8`: sura number
    /// - `u16`: aya number
    /// - `&str`: aya text
    ///
    /// # Examples
    /// ```
    /// let q = quranize::Quranize::new();
    /// assert_eq!(q.get_data(5672), Some(&(582, 78, 1, "عَمَّ يَتَساءَلونَ")));
    /// ```
    pub fn get_data(&self, i: usize) -> Option<&(u16, u8, u16, &str)> {
        self.data.get(i)
    }

    /// Get the data for a specific page number (`page`: 1..=604).
    /// Returns a vector of tuples, each tuple contains:
    /// - `u16`: page number
    /// - `u8`: sura number
    /// - `u16`: aya number
    /// - `&str`: aya text
    /// # Examples
    /// ```
    /// let q = quranize::Quranize::new();
    /// let page_data = q.get_data_from_page(582).unwrap();
    /// assert_eq!(page_data.len(), 30);
    /// assert_eq!(page_data[0], &(582, 78, 1, "عَمَّ يَتَساءَلونَ"));
    /// ```
    pub fn get_data_from_page(&self, page: u16) -> Option<Vec<&(u16, u8, u16, &str)>> {
        let same_page = |&&(p, _, _, _): &&(u16, u8, u16, &str)| p == page;
        // `data` is ordered by page, so the binary search lands somewhere
        // inside the page's run; back up to the run's start, then take the
        // whole contiguous run.
        let pos = { self.data.binary_search_by_key(&page, |&(p, _, _, _)| p) }.ok()?;
        let left_item_count = self.data[..pos].iter().rev().take_while(same_page).count();
        let left_pos = pos - left_item_count;
        let page_data = self.data[left_pos..].iter().take_while(same_page).collect();
        Some(page_data)
    }
}
221
222impl Default for Quranize {
223    fn default() -> Self {
224        Self::new()
225    }
226}
227
// Unit tests. Each test builds a full model, so they exercise the real
// bundled quran text rather than fixtures.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    impl Quranize {
        // Test helper: keep only the quran-form strings from `encode`.
        fn e(&self, text: &str) -> Vec<String> {
            self.encode(text).into_iter().map(|r| r.0).collect()
        }
    }

    // Broad smoke test over many common phrases, via the Default impl.
    #[test]
    fn test_quranize_default() {
        let q: Quranize = Default::default();
        assert_eq!(q.e("illa billah"), ["إِلّا بِاللَّه"]);
        assert_eq!(q.e("alqur'an"), ["القُرآن"]);
        assert_eq!(q.e("bismillah"), ["بِسمِ اللَّه"]);
        assert_eq!(q.e("birobbinnas"), ["بِرَبِّ النّاس"]);
        assert_eq!(q.e("inna anzalnahu"), ["إِنّا أَنزَلناهُ"]);
        assert_eq!(q.e("wa'tasimu"), ["وَاعتَصِمو"]);
        assert_eq!(q.e("wa'tasimu bihablillah"), ["وَاعتَصِموا بِحَبلِ اللَّه"]);
        assert_eq!(q.e("idza qodho"), ["إِذا قَضَ"]);
        assert_eq!(q.e("masyaallah"), ["ما شاءَ اللَّه"]);
        assert_eq!(q.e("illa man taba"), ["إِلّا مَن تابَ"]);
        assert_eq!(q.e("alla tahzani"), ["أَلّا تَحزَني"]);
        assert_eq!(q.e("innasya niaka"), ["إِنَّ شانِئَكَ"]);
        assert_eq!(q.e("innasya ni'aka"), ["إِنَّ شانِئَكَ"]);
        assert_eq!(q.e("wasalamun alaihi"), ["وَسَلامٌ عَلَيهِ"]);
        assert_eq!(q.e("ulaika hum"), ["أُولـٰئِكَ هُم"]);
        assert_eq!(q.e("waladdoollin"), ["وَلَا الضّالّين"]);
        assert_eq!(q.e("undur kaifa"), ["انظُر كَيفَ"]);
        assert_eq!(q.e("lirrohman"), ["لِلرَّحمـٰن"]);
        assert_eq!(q.e("waantum muslimun"), ["وَأَنتُم مُسلِمون"]);
        assert_eq!(q.e("laa yukallifullah"), ["لا يُكَلِّفُ اللَّه"]);
        assert_eq!(q.e("robbil alamin"), ["رَبِّ العالَمين"]);
        assert_eq!(q.e("husnul maab"), ["حُسنُ المَآب"]);
        assert_eq!(q.e("khusnul ma'ab"), ["حُسنُ المَآب"]);
        assert_eq!(q.e("kufuwan"), ["كُفُوً"]);
        assert_eq!(q.e("yukhodiun"), ["يُخادِعون"]);
        assert_eq!(q.e("indallah"), ["عِندَ اللَّه"]);
        assert_eq!(q.e("alimul ghoibi"), ["عالِمُ الغَيبِ"]);
        assert_eq!(q.e("kaana dhoifa"), ["كانَ ضَعيفًا"]);
        assert_eq!(q.e("waantum muslimuna"), ["وَأَنتُم مُسلِمونَ"]);
        assert_eq!(q.e("kitabi la roiba"), ["الكِتابِ لا رَيبَ"]);
        assert_eq!(q.e("takwili"), ["تَأويلِ"]);
        assert_eq!(q.e("yu'minun"), ["يُؤمِنون"]);
        assert_eq!(q.e("hudan lil muttaqin"), ["هُدًى لِلمُتَّقين"]);
        assert_eq!(q.e("majreeha wamursaha"), ["مَجراها وَمُرساها"]);
        assert_eq!(q.e("fabiayyi alai"), ["فَبِأَيِّ آلاءِ"]);
        assert_eq!(q.e("wayuallimukumma"), ["وَيُعَلِّمُكُم ما"]);
        assert_eq!(q.e("wassolat"), ["وَالصَّلاة"]);
    }

    // Encodes each aya of Al-Fatihah from its common transliteration.
    #[test]
    fn test_alfatihah() {
        let q = Quranize::new();
        assert_eq!(
            q.e("bismillahirrohmanirrohiim"),
            ["بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيم"]
        );
        assert_eq!(
            q.e("alhamdulilla hirobbil 'alamiin"),
            ["الحَمدُ لِلَّهِ رَبِّ العالَمين"]
        );
        assert_eq!(q.e("arrohma nirrohim"), ["الرَّحمـٰنِ الرَّحيم"]);
        assert_eq!(q.e("maliki yau middin"), ["مالِكِ يَومِ الدّين"]);
        assert_eq!(
            q.e("iyyakanakbudu waiyyakanastain"),
            ["إِيّاكَ نَعبُدُ وَإِيّاكَ نَستَعين"]
        );
        assert_eq!(q.e("ihdinassirotol mustaqim"), ["اهدِنَا الصِّراطَ المُستَقيم"]);
        assert_eq!(
            q.e("shirotolladzina an'amta 'alaihim ghoiril maghdzubi 'alaihim waladdoolliin"),
            ["صِراطَ الَّذينَ أَنعَمتَ عَلَيهِم غَيرِ المَغضوبِ عَلَيهِم وَلَا الضّالّين"]
        );
    }

    // Encodes each aya of Al-Ikhlas.
    #[test]
    fn test_al_ikhlas() {
        let q = Quranize::new();
        assert_eq!(q.e("qulhuwallahuahad"), ["قُل هُوَ اللَّهُ أَحَد"]);
        assert_eq!(q.e("allahussomad"), ["اللَّهُ الصَّمَد"]);
        assert_eq!(q.e("lam yalid walam yulad"), ["لَم يَلِد وَلَم يولَد"]);
        assert_eq!(
            q.e("walam yakun lahu kufuwan ahad"),
            ["وَلَم يَكُن لَهُ كُفُوًا أَحَد"]
        );
    }

    // Covers the muqottoah (disjointed-letter) encoding path, including
    // elongated spellings like "laaam"/"miiim".
    #[test]
    fn test_harf_muqottoah() {
        let q = Quranize::new();
        assert_eq!(q.e("alif lam mim"), ["الم"]);
        assert_eq!(q.e("alif laaam miiim"), &["الم"]);
        assert_eq!(q.e("nuun"), &["ن"]);
        assert_eq!(q.e("kaaaf haa yaa aiiin shoood"), &["كهيعص"]);
        assert_eq!(q.e("kaf ha ya 'ain shod"), &["كهيعص"]);
        assert_eq!(q.e("alif lam ro"), &["الر"]);
    }

    // Inputs that normalize to nothing (or match nothing) yield no results.
    #[test]
    fn test_quranize_empty_result() {
        let q = Quranize::new();
        let empty: [String; 0] = [];
        assert_eq!(q.e(""), empty);
        assert_eq!(q.e(" "), empty);
        assert_eq!(q.e(" -"), empty);
        assert_eq!(q.e("abcd"), empty);
        assert_eq!(q.e("1+2=3"), empty);
    }

    // `encode` must deduplicate: no quran form may appear twice.
    #[test]
    fn test_unique() {
        let q = Quranize::new();
        let results = q.e("masyaallah");
        let uresults = std::collections::HashSet::<&String>::from_iter(results.iter());
        let is_unique = results.len() == uresults.len();
        assert!(is_unique, "results are not unique. results: {results:#?}");
    }

    // Exercises `find` on exact quran text, including first/last aya and
    // non-matching / empty inputs.
    #[test]
    fn test_tree_find() {
        let q = Quranize::new();
        assert!(q.find("بِسمِ").contains(&(0, 0)));
        assert_eq!(q.find("وَالنّاسِ").last(), Some(&(6235, 28)));
        assert!(q.find("الم").contains(&(7, 0)));
        assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").len(), 2);
        assert!(q.find("").is_empty());
        assert!(q.find("نن").is_empty());
        assert!(q.find("ننن").is_empty());
        assert!(q.find("نننن").is_empty());
        assert!(q.find("2+3+4=9").is_empty());
        assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 0)));
        assert_eq!(q.find("الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 26)));
        assert_eq!(q.find("").first(), None);
        assert_eq!(q.find("abc").first(), None);
    }

    // Structural invariants of the suffix tree, including the preallocated
    // EXPECTED_VERTEX_COUNT matching the actual vertex count.
    #[test]
    fn test_tree_props() {
        let t = Quranize::new().tree;
        assert_eq!(t.vertices.len(), t.edges.len() + 1);
        assert_eq!(t.count_data(0), t.collect_data(0).len());
        assert_eq!(t.vertices.len(), Quranize::EXPECTED_VERTEX_COUNT);
        assert!(t.vertices[0].2);
        assert!(!t.vertices[Quranize::EXPECTED_VERTEX_COUNT - 1].2);
    }

    // Page lookup on first pages and the last page (604).
    #[test]
    fn test_get_data_from_page() {
        let q = Quranize::new();
        let page1_data = q.get_data_from_page(1).unwrap();
        assert_eq!(page1_data.len(), 7);
        assert_eq!(page1_data[0].1, 1);
        assert_eq!(page1_data[0].2, 1);
        assert_eq!(page1_data[0].3, "بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ");
        let page2_data = q.get_data_from_page(2).unwrap();
        assert_eq!(page2_data.len(), 5);
        let page3_data = q.get_data_from_page(3).unwrap();
        assert_eq!(page3_data.len(), 11);
        let page604_data = q.get_data_from_page(604).unwrap();
        assert_eq!(page604_data.len(), 15);
        assert_eq!(page604_data[0].3, "قُل هُوَ اللَّهُ أَحَدٌ");
    }
}