1mod normalization;
27mod quran_metadata;
28mod suffix_tree;
29mod transliteration;
30
31use suffix_tree::{Edge, Index, SuffixTree};
32use transliteration::{contextual_map, harf_muqottoah_map, map};
33
/// Encoding results: one `(quran_text, count, explanation)` tuple per match, where
/// `count` is whatever `SuffixTree::count_data` reports for the matched edge's target
/// vertex and `explanation` lists the transliteration piece matched for each character.
type EncodeResults = Vec<(String, usize, Vec<&'static str>)>;
/// The previously matched Quran character paired with the transliteration piece it consumed.
type PrevMap = (char, &'static str);
36
37use quran_metadata::*;
/// Quran text embedded at compile time from `quran-simple-min.txt`.
/// `Quranize::new` consumes it one `\n`-terminated line per aya.
const QURAN_TXT: &str = include_str!("quran-simple-min.txt");
39
/// Encoder from transliterated text to Quran text, backed by a suffix tree built over
/// the embedded Quran text.
pub struct Quranize {
    /// One `(page, sura number, aya number, aya text)` entry per aya, in file order.
    data: Vec<(u16, u8, u16, &'static str)>,
    /// Suffix tree that `new` populates with every aya line.
    tree: suffix_tree::SuffixTree<'static>,
}
45
46impl Quranize {
/// Vertex count of the fully built suffix tree; `new` uses it to pre-size the tree and
/// the unit tests assert that the built tree has exactly this many vertices.
const EXPECTED_VERTEX_COUNT: usize = 126_307;
48
49 pub fn new() -> Self {
51 let mut data = Vec::with_capacity(AYA_COUNT);
52 let mut tree = SuffixTree::with_capacity(Self::EXPECTED_VERTEX_COUNT);
53
54 let mut sura_num = 0;
55 let mut page = 0;
56 (0..AYA_COUNT)
57 .zip(QURAN_TXT.split_inclusive('\n'))
58 .map(|(i, q)| {
59 sura_num += (SURA_STARTS.get(sura_num) == Some(&i)) as usize;
60 let aya_num = i - SURA_STARTS[sura_num - 1] + 1;
61 page += (PAGE_OFFSETS.get(page) == Some(&(sura_num, aya_num))) as usize;
62 (i, page as u16, sura_num as u8, aya_num as u16, q)
63 })
64 .map(|(i, p, s, a, q)| (i, p, s, a, Self::trim_basmalah(s, a, q)))
65 .for_each(|(i, p, s, a, q)| {
66 data.push((p, s, a, q.trim_end()));
67 tree.construct(i, q);
68 });
69 Self { data, tree }
70 }
71
/// Strips the first four space-separated words from `q` when it is aya 1 of any sura
/// other than sura 1 or sura 9; every other input is returned untouched.
///
/// When `q` holds fewer than four spaces, the text after its last space is returned
/// (the whole string if it has no space at all).
// NOTE(review): presumably the four dropped words are the basmalah prefix — confirm
// against the bundled text file.
fn trim_basmalah(s: u8, a: u16, q: &str) -> &str {
    let is_prefixed_opening = a == 1 && s != 1 && s != 9;
    if !is_prefixed_opening {
        return q;
    }
    match q.match_indices(' ').nth(3) {
        // Keep everything that follows the fourth space.
        Some((fourth_space, _)) => &q[fourth_space + 1..],
        // Fewer than four spaces: fall back to the final space-separated piece.
        None => q.rsplit(' ').next().unwrap_or(q),
    }
}
79
80 pub fn encode(&self, s: &str) -> EncodeResults {
93 let mut results: EncodeResults = match normalization::normalize(s).as_str() {
94 "" => vec![],
95 s => { self.tree.edges_from(0) }
96 .flat_map(|&e| self.rev_encode(s, e, None))
97 .collect(),
98 }
99 .into_iter()
100 .chain(match normalization::normalize_muqottoah(s).as_str() {
101 "" => vec![],
102 s => { self.tree.edges_from(0) }
103 .flat_map(|&e| self.rev_encode_muqottoah(s, e))
104 .collect(),
105 })
106 .map(|(q, n, e)| (q.chars().rev().collect(), n, e.into_iter().rev().collect()))
107 .collect();
108 results.sort_unstable_by(|x, y| x.0.cmp(&y.0));
109 results.dedup_by(|x, y| x.0 == y.0);
110 results
111 }
112
113 fn rev_encode(&self, s: &str, (v, w, l): Edge, pm: Option<PrevMap>) -> EncodeResults {
114 let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
115 let tsls = map(c).iter().chain(contextual_map(pm.unzip().0, c));
116 let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
117 s.strip_prefix(tsl).map(|s| match s {
118 "" => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
119 s => match &l[c.len_utf8()..] {
120 "" => { self.tree.edges_from(w) }
121 .flat_map(|&e| self.rev_encode(s, e, Some((c, tsl))))
122 .collect(),
123 l => self.rev_encode(s, (v, w, l), Some((c, tsl))),
124 }
125 .into_iter()
126 .map(|(mut q, n, mut e)| {
127 q.push(c);
128 e.push(tsl);
129 (q, n, e)
130 })
131 .collect(),
132 })
133 });
134 tsl_results_iter.flatten().collect()
135 });
136 results_iter.collect()
137 }
138
139 fn rev_encode_muqottoah(&self, s: &str, (v, w, l): Edge) -> EncodeResults {
140 let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
141 let tsls = harf_muqottoah_map(c).iter();
142 let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
143 s.strip_prefix(tsl).map(|s| match s {
144 "" => match self.tree.vertices[w].2 {
145 true => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
146 false => vec![],
147 },
148 s => match &l[c.len_utf8()..] {
149 "" => { self.tree.edges_from(w) }
150 .flat_map(|&e| self.rev_encode_muqottoah(s, e))
151 .collect(),
152 l => self.rev_encode_muqottoah(s, (v, w, l)),
153 }
154 .into_iter()
155 .map(|(mut q, n, mut e)| {
156 q.push(c);
157 e.push(tsl);
158 (q, n, e)
159 })
160 .collect(),
161 })
162 });
163 tsl_results_iter.flatten().collect()
164 });
165 results_iter.collect()
166 }
167
/// Looks up `s` in the suffix tree, searching from vertex 0, and returns every
/// `Index` the tree reports.
// NOTE(review): the tests in this file use indices such as `(0, 26)`, which looks
// like (aya index, byte offset) — confirm against `suffix_tree::Index`.
pub fn find(&self, s: &str) -> Vec<Index> {
    self.tree.find(s, 0)
}
182
/// Returns the `(page, sura number, aya number, aya text)` entry stored at position
/// `i` of `data`, or `None` when `i` is out of range.
pub fn get_data(&self, i: usize) -> Option<&(u16, u8, u16, &str)> {
    self.data.get(i)
}
198
199 pub fn get_data_from_page(&self, page: u16) -> Option<Vec<&(u16, u8, u16, &str)>> {
213 let same_page = |&&(p, _, _, _): &&(u16, u8, u16, &str)| p == page;
214 let pos = { self.data.binary_search_by_key(&page, |&(p, _, _, _)| p) }.ok()?;
215 let left_item_count = self.data[..pos].iter().rev().take_while(same_page).count();
216 let left_pos = pos - left_item_count;
217 let page_data = self.data[left_pos..].iter().take_while(same_page).collect();
218 Some(page_data)
219 }
220}
221
/// `Default` simply forwards to [`Quranize::new`].
impl Default for Quranize {
    fn default() -> Self {
        Self::new()
    }
}
227
228#[cfg(test)]
229mod tests {
230 use super::*;
231 use pretty_assertions::assert_eq;
232
impl Quranize {
    /// Test helper: encodes `text` and keeps only the Quran text of each result.
    fn e(&self, text: &str) -> Vec<String> {
        self.encode(text).into_iter().map(|r| r.0).collect()
    }
}
238
// Checks a broad set of transliterations, each expected to encode to exactly one
// Quran text, using an instance obtained through `Default`.
#[test]
fn test_quranize_default() {
    let q: Quranize = Default::default();
    assert_eq!(q.e("illa billah"), ["إِلّا بِاللَّه"]);
    assert_eq!(q.e("alqur'an"), ["القُرآن"]);
    assert_eq!(q.e("bismillah"), ["بِسمِ اللَّه"]);
    assert_eq!(q.e("birobbinnas"), ["بِرَبِّ النّاس"]);
    assert_eq!(q.e("inna anzalnahu"), ["إِنّا أَنزَلناهُ"]);
    assert_eq!(q.e("wa'tasimu"), ["وَاعتَصِمو"]);
    assert_eq!(q.e("wa'tasimu bihablillah"), ["وَاعتَصِموا بِحَبلِ اللَّه"]);
    assert_eq!(q.e("idza qodho"), ["إِذا قَضَ"]);
    assert_eq!(q.e("masyaallah"), ["ما شاءَ اللَّه"]);
    assert_eq!(q.e("illa man taba"), ["إِلّا مَن تابَ"]);
    assert_eq!(q.e("alla tahzani"), ["أَلّا تَحزَني"]);
    assert_eq!(q.e("innasya niaka"), ["إِنَّ شانِئَكَ"]);
    assert_eq!(q.e("innasya ni'aka"), ["إِنَّ شانِئَكَ"]);
    assert_eq!(q.e("wasalamun alaihi"), ["وَسَلامٌ عَلَيهِ"]);
    assert_eq!(q.e("ulaika hum"), ["أُولـٰئِكَ هُم"]);
    assert_eq!(q.e("waladdoollin"), ["وَلَا الضّالّين"]);
    assert_eq!(q.e("undur kaifa"), ["انظُر كَيفَ"]);
    assert_eq!(q.e("lirrohman"), ["لِلرَّحمـٰن"]);
    assert_eq!(q.e("waantum muslimun"), ["وَأَنتُم مُسلِمون"]);
    assert_eq!(q.e("laa yukallifullah"), ["لا يُكَلِّفُ اللَّه"]);
    assert_eq!(q.e("robbil alamin"), ["رَبِّ العالَمين"]);
    assert_eq!(q.e("husnul maab"), ["حُسنُ المَآب"]);
    assert_eq!(q.e("khusnul ma'ab"), ["حُسنُ المَآب"]);
    assert_eq!(q.e("kufuwan"), ["كُفُوً"]);
    assert_eq!(q.e("yukhodiun"), ["يُخادِعون"]);
    assert_eq!(q.e("indallah"), ["عِندَ اللَّه"]);
    assert_eq!(q.e("alimul ghoibi"), ["عالِمُ الغَيبِ"]);
    assert_eq!(q.e("kaana dhoifa"), ["كانَ ضَعيفًا"]);
    assert_eq!(q.e("waantum muslimuna"), ["وَأَنتُم مُسلِمونَ"]);
    assert_eq!(q.e("kitabi la roiba"), ["الكِتابِ لا رَيبَ"]);
    assert_eq!(q.e("takwili"), ["تَأويلِ"]);
    assert_eq!(q.e("yu'minun"), ["يُؤمِنون"]);
    assert_eq!(q.e("hudan lil muttaqin"), ["هُدًى لِلمُتَّقين"]);
    assert_eq!(q.e("majreeha wamursaha"), ["مَجراها وَمُرساها"]);
    assert_eq!(q.e("fabiayyi alai"), ["فَبِأَيِّ آلاءِ"]);
    assert_eq!(q.e("wayuallimukumma"), ["وَيُعَلِّمُكُم ما"]);
    assert_eq!(q.e("wassolat"), ["وَالصَّلاة"]);
}
280
// Encodes a transliteration of each of the seven expected texts below and expects
// exactly one result per input.
#[test]
fn test_alfatihah() {
    let q = Quranize::new();
    assert_eq!(
        q.e("bismillahirrohmanirrohiim"),
        ["بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيم"]
    );
    assert_eq!(
        q.e("alhamdulilla hirobbil 'alamiin"),
        ["الحَمدُ لِلَّهِ رَبِّ العالَمين"]
    );
    assert_eq!(q.e("arrohma nirrohim"), ["الرَّحمـٰنِ الرَّحيم"]);
    assert_eq!(q.e("maliki yau middin"), ["مالِكِ يَومِ الدّين"]);
    assert_eq!(
        q.e("iyyakanakbudu waiyyakanastain"),
        ["إِيّاكَ نَعبُدُ وَإِيّاكَ نَستَعين"]
    );
    assert_eq!(q.e("ihdinassirotol mustaqim"), ["اهدِنَا الصِّراطَ المُستَقيم"]);
    assert_eq!(
        q.e("shirotolladzina an'amta 'alaihim ghoiril maghdzubi 'alaihim waladdoolliin"),
        ["صِراطَ الَّذينَ أَنعَمتَ عَلَيهِم غَيرِ المَغضوبِ عَلَيهِم وَلَا الضّالّين"]
    );
}
304
// Encodes four transliterations, some written without spaces, and expects exactly
// one result for each.
#[test]
fn test_al_ikhlas() {
    let q = Quranize::new();
    assert_eq!(q.e("qulhuwallahuahad"), ["قُل هُوَ اللَّهُ أَحَد"]);
    assert_eq!(q.e("allahussomad"), ["اللَّهُ الصَّمَد"]);
    assert_eq!(q.e("lam yalid walam yulad"), ["لَم يَلِد وَلَم يولَد"]);
    assert_eq!(
        q.e("walam yakun lahu kufuwan ahad"),
        ["وَلَم يَكُن لَهُ كُفُوًا أَحَد"]
    );
}
316
// Spelled-out letter names, including variants with repeated vowels, must encode
// to the expected letter sequences.
#[test]
fn test_harf_muqottoah() {
    let q = Quranize::new();
    assert_eq!(q.e("alif lam mim"), ["الم"]);
    assert_eq!(q.e("alif laaam miiim"), &["الم"]);
    assert_eq!(q.e("nuun"), &["ن"]);
    assert_eq!(q.e("kaaaf haa yaa aiiin shoood"), &["كهيعص"]);
    assert_eq!(q.e("kaf ha ya 'ain shod"), &["كهيعص"]);
    assert_eq!(q.e("alif lam ro"), &["الر"]);
}
327
// Empty input, whitespace/punctuation-only input and the unmatched inputs below
// must all produce no results.
#[test]
fn test_quranize_empty_result() {
    let q = Quranize::new();
    let empty: [String; 0] = [];
    assert_eq!(q.e(""), empty);
    assert_eq!(q.e(" "), empty);
    assert_eq!(q.e(" -"), empty);
    assert_eq!(q.e("abcd"), empty);
    assert_eq!(q.e("1+2=3"), empty);
}
338
// The Quran texts returned for one input must not contain duplicates: collecting
// them into a set must not shrink the count.
#[test]
fn test_unique() {
    let q = Quranize::new();
    let results = q.e("masyaallah");
    let uresults = std::collections::HashSet::<&String>::from_iter(results.iter());
    let is_unique = results.len() == uresults.len();
    assert!(is_unique, "results are not unique. results: {results:#?}");
}
347
// Exercises `Quranize::find`: known substrings must report the expected indices,
// while empty, absent and non-Arabic inputs must report nothing.
#[test]
fn test_tree_find() {
    let q = Quranize::new();
    assert!(q.find("بِسمِ").contains(&(0, 0)));
    assert_eq!(q.find("وَالنّاسِ").last(), Some(&(6235, 28)));
    assert!(q.find("الم").contains(&(7, 0)));
    assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").len(), 2);
    assert!(q.find("").is_empty());
    assert!(q.find("نن").is_empty());
    assert!(q.find("ننن").is_empty());
    assert!(q.find("نننن").is_empty());
    assert!(q.find("2+3+4=9").is_empty());
    assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 0)));
    assert_eq!(q.find("الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 26)));
    assert_eq!(q.find("").first(), None);
    assert_eq!(q.find("abc").first(), None);
}
365
// Structural checks on the built tree: one more vertex than edges, `count_data`
// agreeing with `collect_data` at the root, the expected vertex total, and the
// third vertex field being set on the root but not on the last vertex.
#[test]
fn test_tree_props() {
    let t = Quranize::new().tree;
    assert_eq!(t.vertices.len(), t.edges.len() + 1);
    assert_eq!(t.count_data(0), t.collect_data(0).len());
    assert_eq!(t.vertices.len(), Quranize::EXPECTED_VERTEX_COUNT);
    assert!(t.vertices[0].2);
    assert!(!t.vertices[Quranize::EXPECTED_VERTEX_COUNT - 1].2);
}
375
// Checks the number of entries returned for pages 1, 2, 3 and 604, plus the sura,
// aya and text of the first entry where the test pins them.
#[test]
fn test_get_data_from_page() {
    let q = Quranize::new();
    let page1_data = q.get_data_from_page(1).unwrap();
    assert_eq!(page1_data.len(), 7);
    assert_eq!(page1_data[0].1, 1);
    assert_eq!(page1_data[0].2, 1);
    assert_eq!(page1_data[0].3, "بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ");
    let page2_data = q.get_data_from_page(2).unwrap();
    assert_eq!(page2_data.len(), 5);
    let page3_data = q.get_data_from_page(3).unwrap();
    assert_eq!(page3_data.len(), 11);
    let page604_data = q.get_data_from_page(604).unwrap();
    assert_eq!(page604_data.len(), 15);
    assert_eq!(page604_data[0].3, "قُل هُوَ اللَّهُ أَحَدٌ");
}
392}