#![no_std]
extern crate alloc;
mod codepoints;
mod ffi;
use alloc::{boxed::Box, vec::Vec};
use codepoints::*;
use core::iter;
use hashbrown::{hash_map::Entry, HashMap};
use itertools::Itertools;
#[derive(PartialEq, Eq, Hash, Debug, Clone, Copy)]
struct KashidaCandidate {
breakpoint: usize,
bp_priority: usize,
}
impl KashidaCandidate {
fn new(breakpoint: usize, bp_priority: usize) -> Self {
Self { breakpoint, bp_priority }
}
}
#[non_exhaustive]
#[derive(Clone, Copy)]
pub enum Script {
Arabic,
}
#[must_use]
pub fn find_kashidas(input: &str, script: Script) -> Box<[usize]> {
match script {
Script::Arabic => find_kashidas_arabic(input),
}
}
#[must_use]
fn find_kashidas_arabic(input: &str) -> Box<[usize]> {
let mut candidates: HashMap<_, KashidaCandidate> = HashMap::with_capacity(input.len() / 2);
let word_segmenter = icu_segmenter::WordSegmenter::new_auto();
let grapheme_segmenter = icu_segmenter::GraphemeClusterSegmenter::new();
let words = word_segmenter
.segment_str(input)
.tuple_windows()
.filter_map(|(wb1, wb2)| Some(&input[wb1..wb2]).filter(|s| !s.trim().is_empty()));
for (word_idx, word) in words.enumerate() {
let graphemes = grapheme_segmenter
.segment_str(word)
.tuple_windows()
.map(|(gb1, gb2)| Some(&word[gb1..gb2]))
.pad_using(3, |_| None)
.chain(iter::once(None))
.tuple_windows();
for glyph_window in graphemes {
find_kashidas_in_glyph_run(glyph_window, input, |kc| {
match candidates.entry(word_idx) {
Entry::Occupied(mut e) if kc.bp_priority <= e.get().bp_priority => e.insert(kc),
Entry::Occupied(_) => kc,
Entry::Vacant(e) => *e.insert(kc),
};
});
}
}
let mut ret = candidates.into_values().collect::<Vec<_>>();
ret.sort_unstable_by_key(|a| a.bp_priority);
ret.into_iter().map(|kc| kc.breakpoint).collect()
}
fn find_kashidas_in_glyph_run(
(g1, g2, g3, g4): (Option<&str>, Option<&str>, Option<&str>, Option<&str>),
input: &str,
mut insert_candidate: impl FnMut(KashidaCandidate),
) {
let breakpoint = |g: &str| g.as_ptr() as usize - input.as_ptr() as usize;
match (g1, g2, g3, g4) {
(Some(g1), Some(g2), Some(g3), None)
if g1.contains(|c| LAMS.contains(&c))
&& g2.contains(|c| LAMS.contains(&c))
&& g3.contains(|c| HEHS.contains(&c)) => {}
(_, Some(g), _, _) if g.chars().all(|c| c == KASHIDA) => {
insert_candidate(KashidaCandidate::new(breakpoint(g), 0));
}
(Some(preceding), Some(g), None, None)
if preceding.contains(|c| CONNECTORS_EXCEPT_LAMS.contains(&c))
|| (preceding.contains(|c| LAMS.contains(&c))
&& g.chars().all(|c| !ALEFS.contains(&c))) =>
{
insert_candidate(KashidaCandidate::new(breakpoint(g), 9));
}
(Some(g1), Some(g2), ..) | (_, Some(g1), Some(g2), _)
if g1.contains(|c| SADS.contains(&c) || SEENS.contains(&c))
&& g2.contains(char::is_alphabetic) =>
{
insert_candidate(KashidaCandidate::new(breakpoint(g2), 1));
}
(Some(preceding), Some(fst), Some(snd), None)
if preceding.contains(|c| ALL_CONNECTORS.contains(&c))
&& fst.contains(|c| TEETH.contains(&c))
&& snd.contains(|c| YEHS.contains(&c) || MEEMS.contains(&c)) =>
{
insert_candidate(KashidaCandidate::new(breakpoint(fst), 4));
}
(_, Some(preceding), Some(g), None)
if preceding.contains(|c| CONNECTORS_EXCEPT_LAMS.contains(&c)) =>
{
if g.contains(|c| HEHS.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint(g), 2));
}
else if g.contains(|c| TAHS.contains(&c) || LAMS.contains(&c) || KAFS.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint(g), 3));
}
else if g.contains(|c| AINS.contains(&c) || FEHS.contains(&c) || QAFS.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint(g), 5));
}
else {
insert_candidate(KashidaCandidate::new(breakpoint(g), 6));
};
}
(Some(preceding), Some(fst), Some(snd), _)
if preceding.contains(|c| ALL_CONNECTORS.contains(&c))
&& fst.contains(|c| LAMS.contains(&c))
&& snd.contains(|c| ALEFS.contains(&c)) =>
{
insert_candidate(KashidaCandidate::new(breakpoint(fst), 3));
}
(Some(preceding), Some(fst), Some(snd), _)
if preceding.contains(|c| ALL_CONNECTORS.contains(&c))
&& fst.contains(|c| TEETH.contains(&c))
&& snd.contains(|c| REHS.contains(&c)) =>
{
insert_candidate(KashidaCandidate::new(breakpoint(fst), 4));
}
(Some(preceding), Some(g), ..) | (_, Some(preceding), Some(g), _)
if preceding.contains(|c| ALL_CONNECTORS.contains(&c)) =>
{
let breakpoint = breakpoint(g);
if g.contains(|c| DALS.contains(&c) || TEH_MARBOUTA.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint, 2));
}
else if preceding.chars().all(|c| !LAMS.contains(&c))
&& g.contains(|c| ALEFS.contains(&c))
{
insert_candidate(KashidaCandidate::new(breakpoint, 4));
}
else if g.contains(|c| WAWS.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint, 5));
}
else if preceding.chars().all(|c| !LAMS.contains(&c)) {
insert_candidate(KashidaCandidate::new(breakpoint, 7));
}
}
_ => {} }
}
#[cfg(test)]
mod tests {
extern crate std;
use std::{println, vec};
use super::*;
#[test]
fn jalala_no_candidates() {
let input = "الله";
let candidates = find_kashidas_arabic(input);
assert_eq!(candidates.len(), 0);
}
#[test]
fn basmala_candidates() {
let input = "بسم الله الرحمن الرحيم";
let candidates = find_kashidas_arabic(input);
println!("{:?}", candidates);
assert_eq!(candidates, vec![4, 37, 26].into_boxed_slice());
}
}