#![forbid(unsafe_code)]
use unicode_bidi::{BidiInfo, Level};
/// Reports whether `text` contains at least one character that the crate's
/// RTL detector classifies as right-to-left text.
///
/// The scan stops at the first hit; an empty string yields `false`.
pub fn looks_rtl(text: &str) -> bool {
    for ch in text.chars() {
        if crate::text::rtl_detector::is_rtl_text(u32::from(ch)) {
            return true;
        }
    }
    false
}
pub fn reorder_visual_to_logical(text: &str) -> String {
if !looks_rtl(text) {
return text.to_string();
}
let info = BidiInfo::new(text, None);
if info.paragraphs.is_empty() {
return text.to_string();
}
let mut out = String::with_capacity(text.len());
for para in &info.paragraphs {
let line_range = para.range.clone();
let line = info.reorder_line(para, line_range);
out.push_str(&line);
}
out
}
/// Reports whether the first paragraph of `text` resolves to an RTL base level.
///
/// Returns `false` straight away when `text` holds no RTL character at all
/// (see [`looks_rtl`]). Otherwise the paragraph level is auto-detected by
/// `unicode_bidi` (no default level is supplied) and only the first paragraph
/// is consulted; text with no paragraph yields `false`.
pub fn paragraph_is_rtl(text: &str) -> bool {
    looks_rtl(text)
        && matches!(
            BidiInfo::new(text, None).paragraphs.first(),
            Some(first) if first.level.is_rtl()
        )
}
/// Returns `true` for ASCII digits and for the code points in
/// U+0660..=U+0669 and U+06F0..=U+06F9.
fn is_bidi_digit(c: char) -> bool {
    matches!(
        c,
        '0'..='9' | '\u{0660}'..='\u{0669}' | '\u{06F0}'..='\u{06F9}'
    )
}
/// Returns `true` for ASCII letters and for alphabetic code points in
/// U+00C0..=U+024F or U+1E00..=U+1EFF.
///
/// The alphabetic check filters out the non-letter symbols that sit inside
/// the first range (for example U+00D7 and U+00F7).
fn is_latin_letter(c: char) -> bool {
    match c {
        'A'..='Z' | 'a'..='z' => true,
        '\u{00C0}'..='\u{024F}' | '\u{1E00}'..='\u{1EFF}' => c.is_alphabetic(),
        _ => false,
    }
}
/// Re-emits an RTL line that carries embedded digits or Latin letters.
///
/// The line is returned as an unchanged copy when
/// * its paragraph direction is not RTL (see [`paragraph_is_rtl`]),
/// * it contains neither a digit (`is_bidi_digit`) nor a Latin letter
///   (`is_latin_letter`), or
/// * the resolved level vector does not have one entry per byte of `line`.
///
/// Otherwise bidi levels are resolved with a forced RTL base level and the
/// characters are walked in source order: RTL-level characters are appended
/// one by one, and each maximal stretch of LTR-level characters is appended
/// as a unit.
///
/// NOTE(review): both branches append characters in their original order, so
/// the returned string currently has the same character sequence as `line`.
/// Confirm whether a reordering step for the LTR stretches was intended.
pub(crate) fn reorder_mixed_rtl_line(line: &str) -> String {
    if !paragraph_is_rtl(line) {
        return line.to_string();
    }
    if !line.chars().any(|c| is_bidi_digit(c) || is_latin_letter(c)) {
        return line.to_string();
    }

    let info = BidiInfo::new(line, Some(Level::rtl()));
    // Levels are indexed by byte offset below, so the vector must cover every byte.
    if info.levels.len() != line.len() {
        return line.to_string();
    }

    // Pair every character with the level stored at its first byte.
    let tagged: Vec<(char, Level)> = line
        .char_indices()
        .map(|(offset, ch)| (ch, info.levels[offset]))
        .collect();

    let mut out = String::with_capacity(line.len());
    let mut rest = tagged.as_slice();
    while let Some(&(ch, level)) = rest.first() {
        if level.is_rtl() {
            out.push(ch);
            rest = &rest[1..];
        } else {
            // Consume the whole stretch of LTR-level characters at once.
            let run_len = rest.iter().take_while(|pair| pair.1.is_ltr()).count();
            out.extend(rest[..run_len].iter().map(|pair| pair.0));
            rest = &rest[run_len..];
        }
    }
    out
}
/// Verdict produced by `detect_visual_order_run` about how the x positions of
/// a run's RTL letters progress along the sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum RunOrder {
/// More than 90% of the decisive steps between neighbouring RTL letters move
/// towards larger x.
Visual,
/// More than 90% of the decisive steps between neighbouring RTL letters move
/// towards smaller x.
Logical,
/// No verdict: presentation forms present, fewer than four RTL letters, no
/// decisive step, or neither direction reaches the 90% majority.
Ambiguous,
}
/// Guesses from x positions whether a run of characters is stored in visual
/// or logical order.
///
/// `chars_with_x` pairs each character with an x coordinate, in stored order.
///
/// * Any code point in U+FB50..=U+FDFF or U+FE70..=U+FEFF makes the result
///   [`RunOrder::Ambiguous`] immediately.
/// * Only characters accepted by `is_arabic_letter` or `is_hebrew_letter`
///   take part; fewer than four of them is [`RunOrder::Ambiguous`].
/// * Steps between neighbouring letters whose x difference is within
///   `KERN_TOL` (0.5) in either direction are ignored.
/// * If more than 90% of the remaining steps go to larger x the run is
///   [`RunOrder::Visual`]; if more than 90% go to smaller x it is
///   [`RunOrder::Logical`]; anything else is [`RunOrder::Ambiguous`].
pub(crate) fn detect_visual_order_run(chars_with_x: &[(char, f32)]) -> RunOrder {
    use crate::text::rtl_detector::{is_arabic_letter, is_hebrew_letter};

    // Steps no larger than this (in either direction) carry no direction signal.
    const KERN_TOL: f32 = 0.5;

    let has_presentation_form = chars_with_x
        .iter()
        .any(|entry| matches!(entry.0 as u32, 0xFB50..=0xFDFF | 0xFE70..=0xFEFF));
    if has_presentation_form {
        return RunOrder::Ambiguous;
    }

    // Keep only the x positions of Arabic / Hebrew letters, in stored order.
    let xs: Vec<f32> = chars_with_x
        .iter()
        .filter(|entry| {
            let cp = entry.0 as u32;
            is_arabic_letter(cp) || is_hebrew_letter(cp)
        })
        .map(|entry| entry.1)
        .collect();
    if xs.len() < 4 {
        return RunOrder::Ambiguous;
    }

    let (rising, falling) = xs
        .windows(2)
        .fold((0usize, 0usize), |(up, down), pair| {
            let step = pair[1] - pair[0];
            if step > KERN_TOL {
                (up + 1, down)
            } else if step < -KERN_TOL {
                (up, down + 1)
            } else {
                (up, down)
            }
        });

    let decisive = rising + falling;
    if decisive == 0 {
        return RunOrder::Ambiguous;
    }

    // Integer form of "share > 0.9" to stay clear of float rounding.
    if 10 * rising > 9 * decisive {
        RunOrder::Visual
    } else if 10 * falling > 9 * decisive {
        RunOrder::Logical
    } else {
        RunOrder::Ambiguous
    }
}
/// Unicode directional isolate control characters used when wrapping runs.
pub(crate) mod isolation {
/// U+2066 LEFT-TO-RIGHT ISOLATE.
pub(crate) const LRI: char = '\u{2066}';
/// U+2067 RIGHT-TO-LEFT ISOLATE.
pub(crate) const RLI: char = '\u{2067}';
/// U+2068 FIRST STRONG ISOLATE. Not referenced by the wrapping code in this
/// file, which is presumably why `dead_code` is allowed here.
#[allow(dead_code)]
pub(crate) const FSI: char = '\u{2068}';
/// U+2069 POP DIRECTIONAL ISOLATE; closes any of the isolates above.
pub(crate) const PDI: char = '\u{2069}';
}
/// Coarse per-character direction class produced by `classify`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharDir {
/// The crate's RTL detector accepts the character.
Rtl,
/// Alphabetic character that the RTL detector does not accept.
Ltr,
/// Everything else (digits, whitespace, punctuation, ...).
Neutral,
}
/// Sorts a character into [`CharDir::Rtl`], [`CharDir::Ltr`] or
/// [`CharDir::Neutral`].
///
/// The RTL detector is asked first; any remaining alphabetic character counts
/// as LTR, and everything else is neutral.
fn classify(c: char) -> CharDir {
    if crate::text::rtl_detector::is_rtl_text(u32::from(c)) {
        CharDir::Rtl
    } else if c.is_alphabetic() {
        CharDir::Ltr
    } else {
        CharDir::Neutral
    }
}
pub fn wrap_rtl_isolates(text: &str, block_is_rtl: bool) -> String {
if text.is_empty() {
return String::new();
}
let has_rtl = looks_rtl(text);
if !block_is_rtl && !has_rtl {
return text.to_string();
}
let has_ltr = text.chars().any(|c| classify(c) == CharDir::Ltr);
if block_is_rtl && !has_ltr {
return text.to_string();
}
let chars: Vec<char> = text.chars().collect();
let mut runs: Vec<(CharDir, Vec<char>)> = Vec::new();
let mut pending_neutrals: Vec<char> = Vec::new();
for c in chars {
let dir = classify(c);
match dir {
CharDir::Neutral => {
if let Some(last) = runs.last_mut() {
last.1.push(c);
} else {
pending_neutrals.push(c);
}
},
CharDir::Rtl | CharDir::Ltr => {
if let Some(last) = runs.last_mut() {
if last.0 == dir {
last.1.push(c);
continue;
}
}
let mut buf = std::mem::take(&mut pending_neutrals);
buf.push(c);
runs.push((dir, buf));
},
}
}
if runs.is_empty() {
return text.to_string();
}
if !pending_neutrals.is_empty() {
let mut tail = pending_neutrals;
runs[0].1.append(&mut tail);
}
let mut out = String::with_capacity(text.len() + runs.len() * 6);
for (dir, run_chars) in runs {
let run_text: String = run_chars.into_iter().collect();
match (block_is_rtl, dir) {
(false, CharDir::Rtl) => {
out.push(isolation::RLI);
out.push_str(&run_text);
out.push(isolation::PDI);
},
(true, CharDir::Ltr) => {
out.push(isolation::LRI);
out.push_str(&run_text);
out.push(isolation::PDI);
},
_ => {
out.push_str(&run_text);
},
}
}
out
}
#[cfg(test)]
// Unit tests for the bidi helpers in this file. Most of them pin observable
// properties (identity on LTR text, preserved character counts, presence of
// isolate marks) rather than exact reordered strings.
mod tests {
use super::*;
// ASCII-only and empty input contain no RTL character.
#[test]
fn looks_rtl_pure_ascii_is_false() {
assert!(!looks_rtl("hello world"));
assert!(!looks_rtl(""));
}
// A single Arabic word anywhere in the string is enough.
#[test]
fn looks_rtl_arabic_is_true() {
assert!(looks_rtl("مرحبا"));
assert!(looks_rtl("year 2024 عام"));
}
// Hebrew text is detected as well.
#[test]
fn looks_rtl_hebrew_is_true() {
assert!(looks_rtl("שלום"));
}
// Text without RTL characters is returned unchanged.
#[test]
fn reorder_pure_ltr_is_identity() {
let s = "Hello, world!";
assert_eq!(reorder_visual_to_logical(s), s);
}
// One pass changes the string; a second pass restores the original.
#[test]
fn reorder_is_a_visual_to_logical_converter_not_idempotent() {
let logical_hebrew = "בנימין";
let after_first = reorder_visual_to_logical(logical_hebrew);
assert_ne!(after_first, logical_hebrew);
let after_second = reorder_visual_to_logical(&after_first);
assert_eq!(after_second, logical_hebrew);
}
// The digit run must survive as "2024" and no character may be added or lost.
#[test]
fn reorder_arabic_with_numerals_keeps_digits_logical() {
let logical = "عام 2024 كان جيدا";
let result = reorder_visual_to_logical(logical);
assert!(result.contains("2024"), "expected `2024` in reordered line, got {:?}", result);
assert_eq!(result.chars().count(), logical.chars().count());
}
// A purely Arabic sentence resolves to an RTL paragraph.
#[test]
fn paragraph_is_rtl_for_arabic() {
assert!(paragraph_is_rtl("هذا نص عربي"));
}
// English text never reaches the bidi analysis and is reported as not RTL.
#[test]
fn paragraph_is_not_rtl_for_pure_english() {
assert!(!paragraph_is_rtl("This is English"));
}
// For single-character strings `looks_rtl` must agree with the detector,
// probed at code points on and next to block boundaries.
#[test]
fn looks_rtl_delegates_to_rtl_detector() {
for cp in [
0x058F, 0x0590, 0x05FF, 0x0600, 0x0633, 0x06FF, 0x0700, 0x074F, 0x0750, 0x077F, 0x0780,
0x08A0, 0x08FF, 0x0900, 0xFB4F, 0xFB50, 0xFDFF, 0xFE00, 0xFE70, 0xFEFE, 0xFEFF, 0xFF00,
] {
if let Some(c) = char::from_u32(cp) {
let s = c.to_string();
let bidi_says = looks_rtl(&s);
let detector_says = crate::text::rtl_detector::is_rtl_text(cp);
assert_eq!(
bidi_says, detector_says,
"U+{:04X}: looks_rtl={} but rtl_detector::is_rtl_text={}",
cp, bidi_says, detector_says
);
}
}
}
// The same words in a different order flip the detected paragraph direction.
#[test]
fn paragraph_is_rtl_respects_dominant_direction() {
assert!(!paragraph_is_rtl("Foo بار 1"));
assert!(paragraph_is_rtl("بار Foo 1"));
}
// First and last code points of every block the detector is expected to cover.
#[test]
fn looks_rtl_covers_all_supported_blocks() {
let cases: &[(u32, &str)] = &[
(0x0590, "Hebrew start"),
(0x05F4, "Hebrew end-ish"),
(0x0600, "Arabic start"),
(0x06FF, "Arabic end"),
(0x0750, "Arabic Supplement start"),
(0x077F, "Arabic Supplement end"),
(0x08A0, "Arabic Extended-A start"),
(0x08FF, "Arabic Extended-A end"),
(0xFB50, "Arabic Presentation Forms-A start"),
(0xFDFF, "Arabic Presentation Forms-A end"),
(0xFE70, "Arabic Presentation Forms-B start"),
(0xFEFF, "Arabic Presentation Forms-B end"),
];
for (cp, name) in cases {
if let Some(c) = char::from_u32(*cp) {
let s = c.to_string();
assert!(looks_rtl(&s), "looks_rtl({:?} {}) should be true", s, name);
}
}
}
// CJK, Greek, digits, punctuation and accented Latin must not count as RTL.
#[test]
fn looks_rtl_rejects_neutral_and_cjk() {
for s in [
"中文", "日本語", "α β γ", "1234567890",
"!@#$%^&*()",
"café",
"naïve",
] {
assert!(!looks_rtl(s), "looks_rtl({:?}) should be false", s);
}
}
// More identity cases for input without RTL characters, including the empty string.
#[test]
fn reorder_pure_ltr_identity_extras() {
for s in [
"",
"a",
"Hello, world!",
"Multi-line\nstays unchanged",
"Numbers: 1234 5678",
"Symbols: !@#$%^&*",
"Whitespace between words",
] {
assert_eq!(reorder_visual_to_logical(s), s, "identity broken on {:?}", s);
}
}
// Reordering may move characters but must neither drop nor add any.
#[test]
fn reorder_preserves_character_count() {
for s in [
"عربي",
"هذا نص عربي للاختبار",
"year 2024 عام جيد",
"שלום world",
"Mixed: عربي + 123 + Latin",
] {
let out = reorder_visual_to_logical(s);
assert_eq!(
out.chars().count(),
s.chars().count(),
"char count changed: {:?} -> {:?}",
s,
out
);
}
}
// An embedded Latin word must come out unreversed.
#[test]
fn reorder_keeps_embedded_ltr_token_contiguous() {
let line = "هذا منتج Microsoft الجديد";
let result = reorder_visual_to_logical(line);
assert!(
result.contains("Microsoft"),
"embedded LTR token reversed: {:?} -> {:?}",
line,
result
);
}
// Empty, whitespace-only and digit-only input is not RTL; an Arabic-first line is.
#[test]
fn paragraph_is_rtl_edges() {
assert!(!paragraph_is_rtl(""));
assert!(!paragraph_is_rtl(" "));
assert!(!paragraph_is_rtl("123 456"));
assert!(paragraph_is_rtl("نص with English"));
}
// LTR sub-runs must stay readable, keep their relative order, and the
// character count must be preserved.
#[test]
fn reorder_mixed_rtl_line_date_keeps_ltr_subruns_left_to_right() {
let line = "14 april 1434 ٤٣٤١";
let out = reorder_mixed_rtl_line(line);
assert!(out.contains("1434"), "`1434` reversed/lost: {:?} -> {:?}", line, out);
assert!(out.contains("april"), "`april` reversed/lost: {:?} -> {:?}", line, out);
assert!(out.contains("14 "), "leading `14` reversed/lost: {:?} -> {:?}", line, out);
let p14 = out.find("14").expect("14 present");
let papril = out.find("april").expect("april present");
let p1434 = out.find("1434").expect("1434 present");
assert!(p14 < papril && papril < p1434, "LTR sub-run order changed: {:?}", out);
assert_eq!(
out.chars().count(),
line.chars().count(),
"char count changed: {:?} -> {:?}",
line,
out
);
}
// Without embedded digits or Latin letters the line is returned as is.
#[test]
fn reorder_mixed_rtl_line_pure_arabic_is_byte_identical() {
let line = "هذا نص عربي خالص";
assert_eq!(reorder_mixed_rtl_line(line), line);
}
// A non-RTL paragraph is returned as is.
#[test]
fn reorder_mixed_rtl_line_pure_english_is_byte_identical() {
let line = "This is plain English 2024";
assert_eq!(reorder_mixed_rtl_line(line), line);
}
// A line that starts with Latin text is left untouched.
#[test]
fn reorder_mixed_rtl_line_ltr_first_is_unchanged() {
let line = "Invoice رقم 123";
assert_eq!(reorder_mixed_rtl_line(line), line);
}
// No character may be dropped or duplicated on mixed lines.
#[test]
fn reorder_mixed_rtl_line_preserves_char_count() {
for s in [
"14 april 1434 ٤٣٤١",
"هذا منتج Microsoft الجديد",
"عام 2024 كان جيدا",
"السعر 99 دولار",
] {
let out = reorder_mixed_rtl_line(s);
assert_eq!(
out.chars().count(),
s.chars().count(),
"char count changed: {:?} -> {:?}",
s,
out
);
}
}
// Fewer than four RTL letters give no verdict.
#[test]
fn detect_visual_run_short_run_is_ambiguous() {
let three_chars = [('ק', 0.0), ('ר', 6.0), ('ח', 12.0)];
assert_eq!(detect_visual_order_run(&three_chars), RunOrder::Ambiguous);
}
// Steadily increasing x positions are classified as visual order.
#[test]
fn detect_visual_run_hebrew_visual_order() {
let visual = [
('מ', 0.0),
('ק', 6.0),
('ל', 12.0),
('ד', 18.0),
('ת', 24.0),
];
assert_eq!(detect_visual_order_run(&visual), RunOrder::Visual);
}
// Steadily decreasing x positions are classified as logical order.
#[test]
fn detect_visual_run_hebrew_logical_order() {
let logical = [
('מ', 24.0),
('ק', 18.0),
('ל', 12.0),
('ד', 6.0),
('ת', 0.0),
];
assert_eq!(detect_visual_order_run(&logical), RunOrder::Logical);
}
// Letters from the main Arabic block take part in the vote, too.
#[test]
fn detect_visual_run_arabic_main_block_visual() {
let visual = [('ع', 0.0), ('ر', 7.0), ('ب', 14.0), ('ي', 21.0)];
assert_eq!(detect_visual_order_run(&visual), RunOrder::Visual);
}
// Any presentation-form code point short-circuits to Ambiguous.
#[test]
fn detect_visual_run_presentation_forms_bails_out() {
let with_pfs = [
('\u{FE80}', 0.0), ('\u{FE91}', 7.0), ('\u{FE9A}', 14.0),
('\u{FEAB}', 21.0),
];
assert_eq!(detect_visual_order_run(&with_pfs), RunOrder::Ambiguous);
}
// Identical x positions yield no decisive step.
#[test]
fn detect_visual_run_ties_are_ambiguous() {
let ties = [('ק', 5.0), ('ר', 5.0), ('ח', 5.0), ('ל', 5.0)];
assert_eq!(detect_visual_order_run(&ties), RunOrder::Ambiguous);
}
// One rising and two falling steps: neither side reaches the 90% majority.
#[test]
fn detect_visual_run_mixed_signal_is_ambiguous() {
let mixed = [('ק', 0.0), ('ר', 6.0), ('ח', 3.0), ('ל', 1.0)];
assert_eq!(detect_visual_order_run(&mixed), RunOrder::Ambiguous);
}
// The digit is filtered out; the four Hebrew letters still decide.
#[test]
fn detect_visual_run_ignores_non_rtl_chars() {
let with_digit = [
('ק', 0.0),
('ר', 6.0),
('2', 12.0), ('ח', 18.0),
('ל', 24.0),
];
assert_eq!(detect_visual_order_run(&with_digit), RunOrder::Visual);
}
// Steps of 0.3 stay below the 0.5 tolerance and therefore do not count.
#[test]
fn detect_visual_run_kerning_tolerance() {
let kerning_noise = [('ק', 0.0), ('ר', 0.3), ('ח', 0.6), ('ל', 0.9), ('מ', 1.2)];
assert_eq!(detect_visual_order_run(&kerning_noise), RunOrder::Ambiguous);
}
// LTR text in an LTR block must come back without any isolate mark.
#[test]
fn wrap_rtl_isolates_pure_ltr_is_identity() {
for s in [
"",
"Hello, world!",
"The article is about greetings, page 42.",
"Multiple\nlines\nstay clean",
"Numbers 123 and punctuation: !?.,;",
] {
assert_eq!(wrap_rtl_isolates(s, false), s, "pure-LTR identity broken on {:?}", s);
}
}
// A Hebrew run inside an LTR block is wrapped in RLI ... PDI, never LRI.
#[test]
fn wrap_rtl_isolates_rtl_run_in_ltr_block_gets_rli_pdi() {
let line = "The article שלום עולם is greetings.";
let out = wrap_rtl_isolates(line, false);
assert!(out.contains('\u{2067}'), "RLI missing in {:?}", out);
assert!(out.contains('\u{2069}'), "PDI missing in {:?}", out);
assert!(!out.contains('\u{2066}'), "unexpected LRI in {:?}", out);
let rli_idx = out.find('\u{2067}').expect("RLI present");
let pdi_idx = out.find('\u{2069}').expect("PDI present");
assert!(rli_idx < pdi_idx, "RLI must precede PDI in {:?}", out);
}
// A Latin run inside an RTL block is wrapped in LRI ... PDI, never RLI.
#[test]
fn wrap_rtl_isolates_ltr_run_in_rtl_block_gets_lri_pdi() {
let line = "הספר Microsoft חדש";
let out = wrap_rtl_isolates(line, true);
assert!(out.contains('\u{2066}'), "LRI missing in {:?}", out);
assert!(out.contains('\u{2069}'), "PDI missing in {:?}", out);
assert!(!out.contains('\u{2067}'), "unexpected RLI in {:?}", out);
let lri_idx = out.find('\u{2066}').expect("LRI present");
let pdi_idx = out.find('\u{2069}').expect("PDI present");
assert!(lri_idx < pdi_idx, "LRI must precede PDI in {:?}", out);
}
// RTL text in an RTL block needs no marks.
#[test]
fn wrap_rtl_isolates_pure_rtl_in_rtl_block_is_identity() {
let line = "שלום עולם";
assert_eq!(wrap_rtl_isolates(line, true), line);
}
// Two separate Hebrew runs produce exactly two RLI/PDI pairs.
#[test]
fn wrap_rtl_isolates_no_double_wrap_on_repeated_runs() {
let line = "First שלום middle עולם last";
let out = wrap_rtl_isolates(line, false);
let rli_count = out.chars().filter(|&c| c == '\u{2067}').count();
let pdi_count = out.chars().filter(|&c| c == '\u{2069}').count();
assert_eq!(rli_count, 2, "expected 2 RLIs in {:?}", out);
assert_eq!(pdi_count, 2, "expected 2 PDIs in {:?}", out);
}
// Stripping the isolate marks must give back the input exactly.
#[test]
fn wrap_rtl_isolates_preserves_char_count_modulo_markers() {
let line = "abc שלום def";
let out = wrap_rtl_isolates(line, false);
let stripped: String = out
.chars()
.filter(|c| !matches!(*c, '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{2069}'))
.collect();
assert_eq!(stripped, line);
}
}