1mod normalization;
28mod suffix_tree;
29mod transliteration;
30
31use suffix_tree::{Edge, Index};
32use transliteration::{contextual_map, harf_muqottoah_map, map};
33
/// Encoding output: one entry per candidate, holding the Quran text, the value
/// reported by the tree's `count_data` for the vertex where the match ended,
/// and the transliteration fragments that were consumed to produce the text.
type EncodeResults = Vec<(String, usize, Vec<&'static str>)>;
/// The previously matched Quran character together with the transliteration
/// fragment that was chosen for it; passed down while walking the tree.
type PrevMap = (char, &'static str);
36
/// Number of lines (ayas) read from [`QURAN_TXT`] when building the index.
const AYA_COUNT: usize = 6236;
/// Zero-based global aya index at which each of the 114 suras starts.
/// Used to derive the sura number and the aya number within the sura.
const SURA_STARTS: [usize; 114] = [
    0, 7, 293, 493, 669, 789, 954, 1160, 1235, 1364, 1473, 1596, 1707, 1750, 1802, 1901, 2029,
    2140, 2250, 2348, 2483, 2595, 2673, 2791, 2855, 2932, 3159, 3252, 3340, 3409, 3469, 3503, 3533,
    3606, 3660, 3705, 3788, 3970, 4058, 4133, 4218, 4272, 4325, 4414, 4473, 4510, 4545, 4583, 4612,
    4630, 4675, 4735, 4784, 4846, 4901, 4979, 5075, 5104, 5126, 5150, 5163, 5177, 5188, 5199, 5217,
    5229, 5241, 5271, 5323, 5375, 5419, 5447, 5475, 5495, 5551, 5591, 5622, 5672, 5712, 5758, 5800,
    5829, 5848, 5884, 5909, 5931, 5948, 5967, 5993, 6023, 6043, 6058, 6079, 6090, 6098, 6106, 6125,
    6130, 6138, 6146, 6157, 6168, 6176, 6179, 6188, 6193, 6197, 6204, 6207, 6213, 6216, 6221, 6225,
    6230,
];
/// Quran text embedded at compile time; it is consumed line by line, one aya per line.
const QURAN_TXT: &str = include_str!("quran-simple-min.txt");
49
/// Transliteration-to-Quran encoder backed by a suffix tree over the embedded Quran text.
pub struct Quranize {
    /// Suffix tree populated with every aya line (basmalah prefix removed where applicable).
    tree: suffix_tree::SuffixTree<'static>,
    /// Per-aya `(sura number, aya number, trimmed aya text)`, indexed by global aya index.
    saqs: Vec<(u8, u16, &'static str)>,
}
55
56impl Quranize {
    /// Capacity hint handed to the suffix tree on construction; the tests
    /// assert that the built tree has exactly this many vertices.
    const EXPECTED_VERTEX_COUNT: usize = 126_307;
58
59 pub fn new() -> Self {
61 let mut tree = suffix_tree::SuffixTree::with_capacity(Self::EXPECTED_VERTEX_COUNT);
62 let mut saqs = Vec::with_capacity(AYA_COUNT);
63 let mut sura_num = 1;
64 (0..AYA_COUNT)
65 .zip(QURAN_TXT.split_inclusive('\n'))
66 .map(|(i, q)| {
67 sura_num += (i == SURA_STARTS.get(sura_num).copied().unwrap_or(AYA_COUNT)) as usize;
68 let aya_num = i - SURA_STARTS[sura_num - 1] + 1;
69 ((i, sura_num as u8, aya_num as u16), q)
70 })
71 .map(|((i, s, a), q)| ((i, s, a), Self::trim_basmalah(s, a, q)))
72 .for_each(|((i, s, a), q)| {
73 tree.construct(i, q);
74 saqs.push((s, a, q.trim()));
75 });
76 Self { tree, saqs }
77 }
78
79 fn trim_basmalah(s: u8, a: u16, q: &str) -> &str {
80 match (s, a) {
81 (1, _) | (9, _) => q,
82 (_, 1) => q.splitn(5, ' ').last().unwrap(),
83 _ => q,
84 }
85 }
86
87 pub fn encode(&self, s: &str) -> EncodeResults {
100 let mut results: EncodeResults = match normalization::normalize(s).as_str() {
101 "" => vec![],
102 s => { self.tree.edges_from(0) }
103 .flat_map(|&e| self.rev_encode(s, e, None))
104 .collect(),
105 }
106 .into_iter()
107 .chain(match normalization::normalize_muqottoah(s).as_str() {
108 "" => vec![],
109 s => { self.tree.edges_from(0) }
110 .flat_map(|&e| self.rev_encode_muqottoah(s, e))
111 .collect(),
112 })
113 .map(|(q, n, e)| (q.chars().rev().collect(), n, e.into_iter().rev().collect()))
114 .collect();
115 results.dedup_by(|x, y| x.0 == y.0);
116 results
117 }
118
119 fn rev_encode(&self, s: &str, (v, w, l): Edge, pm: Option<PrevMap>) -> EncodeResults {
120 let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
121 let tsls = map(c).iter().chain(contextual_map(pm.unzip().0, c));
122 let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
123 s.strip_prefix(tsl).map(|s| match s {
124 "" => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
125 s => match &l[c.len_utf8()..] {
126 "" => { self.tree.edges_from(w) }
127 .flat_map(|&e| self.rev_encode(s, e, Some((c, tsl))))
128 .collect(),
129 l => self.rev_encode(s, (v, w, l), Some((c, tsl))),
130 }
131 .into_iter()
132 .map(|(mut q, n, mut e)| {
133 q.push(c);
134 e.push(tsl);
135 (q, n, e)
136 })
137 .collect(),
138 })
139 });
140 tsl_results_iter.flatten().collect()
141 });
142 results_iter.collect()
143 }
144
145 fn rev_encode_muqottoah(&self, s: &str, (v, w, l): Edge) -> EncodeResults {
146 let results_iter = l.chars().next().into_iter().flat_map(|c| -> EncodeResults {
147 let tsls = harf_muqottoah_map(c).iter();
148 let tsl_results_iter = tsls.filter_map(|&tsl| -> Option<EncodeResults> {
149 s.strip_prefix(tsl).map(|s| match s {
150 "" => match self.tree.vertices[w].2 {
151 true => vec![(c.to_string(), self.tree.count_data(w), vec![tsl])],
152 false => vec![],
153 },
154 s => match &l[c.len_utf8()..] {
155 "" => { self.tree.edges_from(w) }
156 .flat_map(|&e| self.rev_encode_muqottoah(s, e))
157 .collect(),
158 l => self.rev_encode_muqottoah(s, (v, w, l)),
159 }
160 .into_iter()
161 .map(|(mut q, n, mut e)| {
162 q.push(c);
163 e.push(tsl);
164 (q, n, e)
165 })
166 .collect(),
167 })
168 });
169 tsl_results_iter.flatten().collect()
170 });
171 results_iter.collect()
172 }
173
174 pub fn find(&self, s: &str) -> Vec<Index> {
186 self.tree.find(s, 0)
187 }
188
189 pub fn get_sura(&self, i: usize) -> Option<u8> {
197 Some(self.saqs.get(i)?.0)
198 }
199
200 pub fn get_aya(&self, i: usize) -> Option<u16> {
208 Some(self.saqs.get(i)?.1)
209 }
210
211 pub fn get_quran(&self, i: usize) -> Option<&str> {
219 Some(self.saqs.get(i)?.2)
220 }
221}
222
223impl Default for Quranize {
224 fn default() -> Self {
225 Self::new()
226 }
227}
228
229#[cfg(test)]
230mod tests {
231 use super::*;
232 use pretty_assertions::assert_eq;
233
234 impl Quranize {
235 fn e(&self, text: &str) -> Vec<String> {
236 self.encode(text).into_iter().map(|r| r.0).collect()
237 }
238 }
239
240 #[test]
241 fn test_quranize_default() {
242 let q: Quranize = Default::default();
243 assert_eq!(q.e("illa billah"), ["إِلّا بِاللَّه"]);
244 assert_eq!(q.e("alqur'an"), ["القُرآن"]);
245 assert_eq!(q.e("bismillah"), ["بِسمِ اللَّه"]);
246 assert_eq!(q.e("birobbinnas"), ["بِرَبِّ النّاس"]);
247 assert_eq!(q.e("inna anzalnahu"), ["إِنّا أَنزَلناهُ"]);
248 assert_eq!(q.e("wa'tasimu"), ["وَاعتَصِمو"]);
249 assert_eq!(q.e("wa'tasimu bihablillah"), ["وَاعتَصِموا بِحَبلِ اللَّه"]);
250 assert_eq!(q.e("idza qodho"), ["إِذا قَضَ"]);
251 assert_eq!(q.e("masyaallah"), ["ما شاءَ اللَّه"]);
252 assert_eq!(q.e("illa man taba"), ["إِلّا مَن تابَ"]);
253 assert_eq!(q.e("alla tahzani"), ["أَلّا تَحزَني"]);
254 assert_eq!(q.e("innasya niaka"), ["إِنَّ شانِئَكَ"]);
255 assert_eq!(q.e("innasya ni'aka"), ["إِنَّ شانِئَكَ"]);
256 assert_eq!(q.e("wasalamun alaihi"), ["وَسَلامٌ عَلَيهِ"]);
257 assert_eq!(q.e("ulaika hum"), ["أُولـٰئِكَ هُم"]);
258 assert_eq!(q.e("waladdoollin"), ["وَلَا الضّالّين"]);
259 assert_eq!(q.e("undur kaifa"), ["انظُر كَيفَ"]);
260 assert_eq!(q.e("lirrohman"), ["لِلرَّحمـٰن"]);
261 assert_eq!(q.e("waantum muslimun"), ["وَأَنتُم مُسلِمون"]);
262 assert_eq!(q.e("laa yukallifullah"), ["لا يُكَلِّفُ اللَّه"]);
263 assert_eq!(q.e("robbil alamin"), ["رَبِّ العالَمين"]);
264 assert_eq!(q.e("husnul maab"), ["حُسنُ المَآب"]);
265 assert_eq!(q.e("khusnul ma'ab"), ["حُسنُ المَآب"]);
266 assert_eq!(q.e("kufuwan"), ["كُفُوً"]);
267 assert_eq!(q.e("yukhodiun"), ["يُخادِعون"]);
268 assert_eq!(q.e("indallah"), ["عِندَ اللَّه"]);
269 assert_eq!(q.e("alimul ghoibi"), ["عالِمُ الغَيبِ"]);
270 assert_eq!(q.e("kaana dhoifa"), ["كانَ ضَعيفًا"]);
271 assert_eq!(q.e("waantum muslimuna"), ["وَأَنتُم مُسلِمونَ"]);
272 assert_eq!(q.e("kitabi la roiba"), ["الكِتابِ لا رَيبَ"]);
273 assert_eq!(q.e("takwili"), ["تَأويلِ"]);
274 assert_eq!(q.e("yu'minun"), ["يُؤمِنون"]);
275 assert_eq!(q.e("hudan lil muttaqin"), ["هُدًى لِلمُتَّقين"]);
276 assert_eq!(q.e("majreeha wamursaha"), ["مَجراها وَمُرساها"]);
277 assert_eq!(q.e("fabiayyi alai"), ["فَبِأَيِّ آلاءِ"]);
278 assert_eq!(q.e("wayuallimukumma"), ["وَيُعَلِّمُكُم ما"]);
279 assert_eq!(q.e("wassolat"), ["وَالصَّلاة"]);
280 }
281
282 #[test]
283 fn test_alfatihah() {
284 let q = Quranize::new();
285 assert_eq!(
286 q.e("bismillahirrohmanirrohiim"),
287 ["بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيم"]
288 );
289 assert_eq!(
290 q.e("alhamdulilla hirobbil 'alamiin"),
291 ["الحَمدُ لِلَّهِ رَبِّ العالَمين"]
292 );
293 assert_eq!(q.e("arrohma nirrohim"), ["الرَّحمـٰنِ الرَّحيم"]);
294 assert_eq!(q.e("maliki yau middin"), ["مالِكِ يَومِ الدّين"]);
295 assert_eq!(
296 q.e("iyyakanakbudu waiyyakanastain"),
297 ["إِيّاكَ نَعبُدُ وَإِيّاكَ نَستَعين"]
298 );
299 assert_eq!(q.e("ihdinassirotol mustaqim"), ["اهدِنَا الصِّراطَ المُستَقيم"]);
300 assert_eq!(
301 q.e("shirotolladzina an'amta 'alaihim ghoiril maghdzubi 'alaihim waladdoolliin"),
302 ["صِراطَ الَّذينَ أَنعَمتَ عَلَيهِم غَيرِ المَغضوبِ عَلَيهِم وَلَا الضّالّين"]
303 );
304 }
305
306 #[test]
307 fn test_al_ikhlas() {
308 let q = Quranize::new();
309 assert_eq!(q.e("qulhuwallahuahad"), ["قُل هُوَ اللَّهُ أَحَد"]);
310 assert_eq!(q.e("allahussomad"), ["اللَّهُ الصَّمَد"]);
311 assert_eq!(q.e("lam yalid walam yulad"), ["لَم يَلِد وَلَم يولَد"]);
312 assert_eq!(
313 q.e("walam yakun lahu kufuwan ahad"),
314 ["وَلَم يَكُن لَهُ كُفُوًا أَحَد"]
315 );
316 }
317
318 #[test]
319 fn test_harf_muqottoah() {
320 let q = Quranize::new();
321 assert_eq!(q.e("alif lam mim"), ["الم"]);
322 assert_eq!(q.e("alif laaam miiim"), &["الم"]);
323 assert_eq!(q.e("nuun"), &["ن"]);
324 assert_eq!(q.e("kaaaf haa yaa aiiin shoood"), &["كهيعص"]);
325 assert_eq!(q.e("kaf ha ya 'ain shod"), &["كهيعص"]);
326 assert_eq!(q.e("alif lam ro"), &["الر"]);
327 }
328
329 #[test]
330 fn test_quranize_empty_result() {
331 let q = Quranize::new();
332 let empty: [String; 0] = [];
333 assert_eq!(q.e(""), empty);
334 assert_eq!(q.e(" "), empty);
335 assert_eq!(q.e(" -"), empty);
336 assert_eq!(q.e("abcd"), empty);
337 assert_eq!(q.e("1+2=3"), empty);
338 }
339
340 #[test]
341 fn test_unique() {
342 let q = Quranize::new();
343 let results = q.e("masyaallah");
344 let uresults = std::collections::HashSet::<&String>::from_iter(results.iter());
345 let is_unique = results.len() == uresults.len();
346 assert!(is_unique, "results are not unique. results: {:#?}", results);
347 }
348
349 #[test]
350 fn test_tree_find() {
351 let q = Quranize::new();
352 assert!(q.find("بِسمِ").contains(&(0, 0)));
353 assert_eq!(q.find("وَالنّاسِ").last(), Some(&(6235, 28)));
354 assert!(q.find("الم").contains(&(7, 0)));
355 assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").len(), 2);
356 assert!(q.find("").is_empty());
357 assert!(q.find("نن").is_empty());
358 assert!(q.find("ننن").is_empty());
359 assert!(q.find("نننن").is_empty());
360 assert!(q.find("2+3+4=9").is_empty());
361 assert_eq!(q.find("بِسمِ اللَّهِ الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 0)));
362 assert_eq!(q.find("الرَّحمـٰنِ الرَّحيمِ").first(), Some(&(0, 26)));
363 assert_eq!(q.find("").first(), None);
364 assert_eq!(q.find("abc").first(), None);
365 }
366
367 #[test]
368 fn test_tree_props() {
369 let t = Quranize::new().tree;
370 assert_eq!(t.vertices.len(), t.edges.len() + 1);
371 assert_eq!(t.count_data(0), t.collect_data(0).len());
372 assert_eq!(t.vertices.len(), Quranize::EXPECTED_VERTEX_COUNT);
373 assert!(t.vertices[0].2);
374 assert!(!t.vertices[Quranize::EXPECTED_VERTEX_COUNT - 1].2);
375 }
376}