use super::Handle;
use std::ffi::{CStr, c_char};
/// Unicode normalization form selector shared with C callers.
///
/// The discriminants are the integer codes that `fz_normalize_string` and
/// `fz_string_is_normalized` map their `form` argument onto; any code other
/// than 1, 2 or 3 is treated as [`NormalizationForm::NFD`] there.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NormalizationForm {
    /// Canonical decomposition.
    NFD = 0,
    /// Canonical decomposition followed by canonical composition.
    NFC = 1,
    /// Compatibility decomposition.
    NFKD = 2,
    /// Compatibility decomposition followed by canonical composition.
    NFKC = 3,
}
/// Overall text direction reported by `fz_get_bidi_direction` and accepted as
/// the `base_dir` argument of `fz_bidi_reorder`.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BiDiDirection {
    /// Left-to-right; also the fallback for null, invalid or direction-less text.
    LTR = 0,
    /// Right-to-left characters were found and no strong left-to-right ones.
    RTL = 1,
    /// Both right-to-left and strong left-to-right characters were found.
    Mixed = 2,
}
/// Coarse writing-system classification returned (as `i32`) by
/// `fz_detect_script`.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ScriptCategory {
    /// No specific script; the result for characters `char_script` does not
    /// recognise and for null or invalid input.
    Common = 0,
    /// Latin letters.
    Latin = 1,
    /// Greek block.
    Greek = 2,
    /// Cyrillic block.
    Cyrillic = 3,
    /// Arabic blocks.
    Arabic = 4,
    /// Hebrew block.
    Hebrew = 5,
    /// Han ideographs, kana and Hangul syllables.
    CJK = 6,
    /// Devanagari block.
    Devanagari = 7,
    /// Thai block.
    Thai = 8,
    /// Reserved catch-all code; nothing visible in this file produces it.
    Other = 255,
}
/// Kinds of break opportunity, laid out for C callers.
///
/// No function visible in this part of the file reads or returns this type,
/// so the variant notes below only restate the names.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum WordBreakType {
    /// No break.
    NoBreak = 0,
    /// Soft break.
    Soft = 1,
    /// Hard break.
    Hard = 2,
    /// Line break.
    Line = 3,
}
/// Normalizes a NUL-terminated UTF-8 string into a caller-supplied buffer.
///
/// * `input` - NUL-terminated source string; must be valid UTF-8.
/// * `output` - destination buffer of at least `output_size` bytes.
/// * `output_size` - capacity of `output`, including room for the terminator.
/// * `form` - a [`NormalizationForm`] code. It is accepted for interface
///   compatibility but currently ignored: every form runs the same ligature
///   expansion in `normalize_simple`.
///
/// Returns the number of bytes written, not counting the NUL terminator.
/// Returns 0 without touching `output` when a pointer is null, `output_size`
/// is 0, or `input` is not valid UTF-8.
///
/// When the result does not fit it is truncated at a character boundary, so
/// the buffer always holds valid, NUL-terminated UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_normalize_string(
    _ctx: Handle,
    input: *const c_char,
    output: *mut c_char,
    output_size: usize,
    form: i32,
) -> usize {
    if input.is_null() || output.is_null() || output_size == 0 {
        return 0;
    }
    // SAFETY: `input` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(input_str) = unsafe { CStr::from_ptr(input) }.to_str() else {
        return 0;
    };
    // The requested form does not influence the result yet.
    let _ = form;

    let normalized = normalize_simple(input_str);

    // Keep one byte for the terminator, then back up so that a multi-byte
    // character is never cut in half (index 0 is always a boundary).
    let mut copy_len = normalized.len().min(output_size - 1);
    while !normalized.is_char_boundary(copy_len) {
        copy_len -= 1;
    }

    // SAFETY: `output` is non-null and the caller guarantees `output_size`
    // writable bytes; `copy_len + 1 <= output_size`.
    let output_slice =
        unsafe { std::slice::from_raw_parts_mut(output.cast::<u8>(), copy_len + 1) };
    output_slice[..copy_len].copy_from_slice(&normalized.as_bytes()[..copy_len]);
    output_slice[copy_len] = 0;
    copy_len
}
/// Expands the Latin ligature code points handled by this module into their
/// plain-letter spellings and copies every other character unchanged.
///
/// The ligatures are written as `\u{..}` escapes so the patterns stay valid
/// single-character literals even if the source text itself is ever run
/// through a compatibility normalization.
fn normalize_simple(s: &str) -> String {
    // Expansions only ever lengthen the text, so this is a lower bound.
    let mut result = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB01}' => result.push_str("fi"),  // LATIN SMALL LIGATURE FI
            '\u{FB02}' => result.push_str("fl"),  // LATIN SMALL LIGATURE FL
            '\u{FB00}' => result.push_str("ff"),  // LATIN SMALL LIGATURE FF
            '\u{FB03}' => result.push_str("ffi"), // LATIN SMALL LIGATURE FFI
            '\u{FB04}' => result.push_str("ffl"), // LATIN SMALL LIGATURE FFL
            '\u{0132}' => result.push_str("IJ"),  // LATIN CAPITAL LIGATURE IJ
            '\u{0133}' => result.push_str("ij"),  // LATIN SMALL LIGATURE IJ
            _ => result.push(c),
        }
    }
    result
}
#[unsafe(no_mangle)]
pub extern "C" fn fz_string_is_normalized(_ctx: Handle, input: *const c_char, form: i32) -> i32 {
if input.is_null() {
return 1;
}
let input_str = match unsafe { CStr::from_ptr(input) }.to_str() {
Ok(s) => s,
Err(_) => return 0,
};
let _form = match form {
1 => NormalizationForm::NFC,
2 => NormalizationForm::NFKD,
3 => NormalizationForm::NFKC,
_ => NormalizationForm::NFD,
};
let has_ligatures = input_str
.chars()
.any(|c| matches!(c, 'fi' | 'fl' | 'ff' | 'ffi' | 'ffl'));
if has_ligatures { 0 } else { 1 }
}
/// Classifies the overall direction of a NUL-terminated UTF-8 string.
///
/// Each character is tested with `is_rtl_char` first and, failing that, with
/// `is_strong_ltr_char`; characters matching neither do not affect the result.
///
/// Returns a [`BiDiDirection`] code: `Mixed` when both kinds were seen, `RTL`
/// when only right-to-left characters were seen, and `LTR` in every other
/// case, including null or non-UTF-8 input.
#[unsafe(no_mangle)]
pub extern "C" fn fz_get_bidi_direction(_ctx: Handle, text: *const c_char) -> i32 {
    if text.is_null() {
        return BiDiDirection::LTR as i32;
    }
    // SAFETY: `text` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text_str) = unsafe { CStr::from_ptr(text) }.to_str() else {
        return BiDiDirection::LTR as i32;
    };

    let (saw_rtl, saw_ltr) = text_str.chars().fold((false, false), |(rtl, ltr), ch| {
        if is_rtl_char(ch) {
            (true, ltr)
        } else if is_strong_ltr_char(ch) {
            (rtl, true)
        } else {
            (rtl, ltr)
        }
    });

    let direction = match (saw_rtl, saw_ltr) {
        (true, true) => BiDiDirection::Mixed,
        (true, false) => BiDiDirection::RTL,
        (false, _) => BiDiDirection::LTR,
    };
    direction as i32
}
/// Returns `true` when `c` lies in a Unicode block used by a right-to-left
/// script.
///
/// This is a block-level approximation, not a per-character Bidi_Class
/// lookup: digits and punctuation inside these blocks also count as RTL.
/// The presentation-form blocks are included because extracted text often
/// carries Hebrew and Arabic letters in those forms; without them such
/// letters would be mistaken for strong left-to-right characters.
fn is_rtl_char(c: char) -> bool {
    matches!(
        c,
        '\u{0590}'..='\u{05FF}'     // Hebrew
        | '\u{0600}'..='\u{06FF}'   // Arabic
        | '\u{0700}'..='\u{074F}'   // Syriac
        | '\u{0750}'..='\u{077F}'   // Arabic Supplement
        | '\u{0780}'..='\u{07BF}'   // Thaana
        | '\u{07C0}'..='\u{07FF}'   // NKo
        | '\u{08A0}'..='\u{08FF}'   // Arabic Extended-A
        | '\u{FB1D}'..='\u{FB4F}'   // Hebrew presentation forms
        | '\u{FB50}'..='\u{FDFF}'   // Arabic Presentation Forms-A
        | '\u{FE70}'..='\u{FEFC}'   // Arabic Presentation Forms-B (excludes U+FEFF BOM)
    )
}
/// Returns `true` for alphabetic characters that `is_rtl_char` does not claim;
/// these are treated as strongly left-to-right.
fn is_strong_ltr_char(c: char) -> bool {
    if !c.is_alphabetic() {
        return false;
    }
    !is_rtl_char(c)
}
/// Reorders a NUL-terminated UTF-8 string for display and writes the result
/// into a caller-supplied buffer.
///
/// * `input` - NUL-terminated source string; must be valid UTF-8.
/// * `output` - destination buffer of at least `output_size` bytes.
/// * `output_size` - capacity of `output`, including room for the terminator.
/// * `base_dir` - a [`BiDiDirection`] code; it is compared against `RTL` and
///   handed to `bidi_reorder_simple`, which currently does not use it.
///
/// Returns the number of bytes written, not counting the NUL terminator.
/// Returns 0 without touching `output` when a pointer is null, `output_size`
/// is 0, or `input` is not valid UTF-8.
///
/// When the result does not fit it is truncated at a character boundary, so
/// the buffer always holds valid, NUL-terminated UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_bidi_reorder(
    _ctx: Handle,
    input: *const c_char,
    output: *mut c_char,
    output_size: usize,
    base_dir: i32,
) -> usize {
    if input.is_null() || output.is_null() || output_size == 0 {
        return 0;
    }
    // SAFETY: `input` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(input_str) = unsafe { CStr::from_ptr(input) }.to_str() else {
        return 0;
    };

    let base_rtl = base_dir == BiDiDirection::RTL as i32;
    let reordered = bidi_reorder_simple(input_str, base_rtl);

    // Keep one byte for the terminator, then back up so that a multi-byte
    // character is never cut in half (index 0 is always a boundary).
    let mut copy_len = reordered.len().min(output_size - 1);
    while !reordered.is_char_boundary(copy_len) {
        copy_len -= 1;
    }

    // SAFETY: `output` is non-null and the caller guarantees `output_size`
    // writable bytes; `copy_len + 1 <= output_size`.
    let output_slice =
        unsafe { std::slice::from_raw_parts_mut(output.cast::<u8>(), copy_len + 1) };
    output_slice[..copy_len].copy_from_slice(&reordered.as_bytes()[..copy_len]);
    output_slice[copy_len] = 0;
    copy_len
}
fn bidi_reorder_simple(s: &str, _base_rtl: bool) -> String {
let mut result = String::new();
let mut current_run = String::new();
let mut in_rtl = false;
for c in s.chars() {
let char_rtl = is_rtl_char(c);
if char_rtl != in_rtl && !current_run.is_empty() {
if in_rtl {
result.extend(current_run.chars().rev());
} else {
result.push_str(¤t_run);
}
current_run.clear();
}
in_rtl = char_rtl;
current_run.push(c);
}
if !current_run.is_empty() {
if in_rtl {
result.extend(current_run.chars().rev());
} else {
result.push_str(¤t_run);
}
}
result
}
/// Records the byte offset at which each word of `text` starts.
///
/// A word starts at any non-whitespace character that is the first character
/// of the string or follows a whitespace character (Unicode definition,
/// `char::is_whitespace`).
///
/// * `text` - NUL-terminated UTF-8 string.
/// * `breaks` - array with room for at least `max_breaks` entries.
/// * `max_breaks` - capacity of `breaks`; further word starts are dropped.
///
/// Returns how many entries were written, or 0 when a pointer is null,
/// `max_breaks` is 0, or `text` is not valid UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_find_word_breaks(
    _ctx: Handle,
    text: *const c_char,
    breaks: *mut i32,
    max_breaks: usize,
) -> usize {
    if text.is_null() || breaks.is_null() || max_breaks == 0 {
        return 0;
    }
    // SAFETY: `text` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text_str) = unsafe { CStr::from_ptr(text) }.to_str() else {
        return 0;
    };
    // SAFETY: `breaks` is non-null and the caller guarantees `max_breaks`
    // writable entries.
    let slots = unsafe { std::slice::from_raw_parts_mut(breaks, max_breaks) };

    // Carry "was the previous character whitespace?" through the scan; the
    // initial `true` makes a leading non-space character count as a start.
    let word_starts = text_str
        .char_indices()
        .scan(true, |after_space, (offset, ch)| {
            let space = ch.is_whitespace();
            let begins_word = *after_space && !space;
            *after_space = space;
            Some((offset, begins_word))
        })
        .filter(|&(_, begins_word)| begins_word)
        .map(|(offset, _)| offset as i32);

    let mut written = 0;
    for (slot, offset) in slots.iter_mut().zip(word_starts) {
        *slot = offset;
        written += 1;
    }
    written
}
/// Finds the whitespace-delimited word that covers a byte position.
///
/// * `text` - NUL-terminated UTF-8 string.
/// * `position` - byte offset into `text`; it may point inside a multi-byte
///   character, in which case that character's first byte is used.
/// * `word_start` - optional out-pointer receiving the byte offset where the
///   word begins.
/// * `word_end` - optional out-pointer receiving the byte offset one past the
///   word's last byte.
///
/// Words are separated by Unicode whitespace (`char::is_whitespace`), the same
/// definition `fz_find_word_breaks` uses, so separators such as U+00A0 or
/// U+3000 end a word just like an ASCII space does.
///
/// When `position` lands on a whitespace character the reported range is the
/// word ending right before it (possibly empty).
///
/// Returns 1 on success and 0 when `text` is null, is not valid UTF-8, or
/// `position` is at or beyond the end of the string.
#[unsafe(no_mangle)]
pub extern "C" fn fz_get_word_at(
    _ctx: Handle,
    text: *const c_char,
    position: usize,
    word_start: *mut usize,
    word_end: *mut usize,
) -> i32 {
    if text.is_null() {
        return 0;
    }
    // SAFETY: `text` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text_str) = unsafe { CStr::from_ptr(text) }.to_str() else {
        return 0;
    };
    if position >= text_str.len() {
        return 0;
    }

    // Snap to the start of the character containing `position` so the string
    // can be split there (index 0 is always a boundary).
    let mut anchor = position;
    while !text_str.is_char_boundary(anchor) {
        anchor -= 1;
    }
    let (before, after) = text_str.split_at(anchor);

    // The word starts right after the last separator preceding the anchor ...
    let start = before
        .char_indices()
        .rev()
        .find(|&(_, ch)| ch.is_whitespace())
        .map_or(0, |(idx, ch)| idx + ch.len_utf8());
    // ... and ends at the first separator at or after it.
    let end = after
        .char_indices()
        .find(|&(_, ch)| ch.is_whitespace())
        .map_or(text_str.len(), |(idx, _)| anchor + idx);

    if !word_start.is_null() {
        // SAFETY: non-null; the caller guarantees it points to a writable usize.
        unsafe { *word_start = start };
    }
    if !word_end.is_null() {
        // SAFETY: non-null; the caller guarantees it points to a writable usize.
        unsafe { *word_end = end };
    }
    1
}
/// Records the byte offsets at which a line of `text` may be broken.
///
/// A break is allowed immediately after every whitespace character, ASCII
/// hyphen, soft hyphen (U+00AD) and every character `is_cjk_char` accepts;
/// the stored offset is the position just past that character.
///
/// * `text` - NUL-terminated UTF-8 string.
/// * `breaks` - array with room for at least `max_breaks` entries.
/// * `max_breaks` - capacity of `breaks`; further opportunities are dropped.
///
/// Returns how many entries were written, or 0 when a pointer is null,
/// `max_breaks` is 0, or `text` is not valid UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_find_line_breaks(
    _ctx: Handle,
    text: *const c_char,
    breaks: *mut i32,
    max_breaks: usize,
) -> usize {
    if text.is_null() || breaks.is_null() || max_breaks == 0 {
        return 0;
    }
    // SAFETY: `text` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text_str) = unsafe { CStr::from_ptr(text) }.to_str() else {
        return 0;
    };
    // SAFETY: `breaks` is non-null and the caller guarantees `max_breaks`
    // writable entries.
    let slots = unsafe { std::slice::from_raw_parts_mut(breaks, max_breaks) };

    let opportunities = text_str
        .char_indices()
        .filter(|&(_, ch)| {
            ch.is_whitespace() || ch == '-' || ch == '\u{00AD}' || is_cjk_char(ch)
        })
        .map(|(offset, ch)| (offset + ch.len_utf8()) as i32);

    let mut written = 0;
    for (slot, offset) in slots.iter_mut().zip(opportunities) {
        *slot = offset;
        written += 1;
    }
    written
}
/// Returns `true` when `c` falls in one of the CJK-related blocks after which
/// this module allows a line break.
fn is_cjk_char(c: char) -> bool {
    /// Inclusive `(first, last)` code-point bounds.
    const CJK_BLOCKS: [(char, char); 6] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3000}', '\u{303F}'), // CJK Symbols and Punctuation
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];
    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
#[unsafe(no_mangle)]
pub extern "C" fn fz_detect_script(_ctx: Handle, text: *const c_char) -> i32 {
if text.is_null() {
return ScriptCategory::Common as i32;
}
let text_str = match unsafe { CStr::from_ptr(text) }.to_str() {
Ok(s) => s,
Err(_) => return ScriptCategory::Common as i32,
};
for c in text_str.chars() {
let script = char_script(c);
if script != ScriptCategory::Common {
return script as i32;
}
}
ScriptCategory::Common as i32
}
/// Maps a character to a coarse [`ScriptCategory`] using Unicode block ranges.
///
/// Anything outside the listed ranges (digits, punctuation, symbols and
/// scripts this module does not track) is reported as `Common`.
fn char_script(c: char) -> ScriptCategory {
    match c {
        // Basic Latin letters, the Latin-1 letters and Latin Extended-A/B.
        // U+00D7 (multiplication sign) and U+00F7 (division sign) sit among
        // the Latin-1 letters but are symbols, so they are left out.
        'A'..='Z'
        | 'a'..='z'
        | '\u{00C0}'..='\u{00D6}'
        | '\u{00D8}'..='\u{00F6}'
        | '\u{00F8}'..='\u{024F}' => ScriptCategory::Latin,
        '\u{0370}'..='\u{03FF}' => ScriptCategory::Greek,
        '\u{0400}'..='\u{04FF}' => ScriptCategory::Cyrillic,
        // Arabic, Arabic Supplement and Arabic Extended-A; the last one keeps
        // this in step with the blocks `is_rtl_char` treats as Arabic.
        '\u{0600}'..='\u{06FF}' | '\u{0750}'..='\u{077F}' | '\u{08A0}'..='\u{08FF}' => {
            ScriptCategory::Arabic
        }
        '\u{0590}'..='\u{05FF}' => ScriptCategory::Hebrew,
        // Han ideographs (including Extension A, as in `is_cjk_char`), kana
        // and Hangul syllables. CJK punctuation stays `Common`.
        '\u{4E00}'..='\u{9FFF}'
        | '\u{3400}'..='\u{4DBF}'
        | '\u{3040}'..='\u{30FF}'
        | '\u{AC00}'..='\u{D7AF}' => ScriptCategory::CJK,
        '\u{0900}'..='\u{097F}' => ScriptCategory::Devanagari,
        '\u{0E00}'..='\u{0E7F}' => ScriptCategory::Thai,
        _ => ScriptCategory::Common,
    }
}
/// Writes a lower-cased copy of a NUL-terminated UTF-8 string into a
/// caller-supplied buffer.
///
/// Folding is done by mapping each character through `char::to_lowercase`;
/// no locale-specific or context-sensitive rules are applied.
///
/// * `input` - NUL-terminated source string; must be valid UTF-8.
/// * `output` - destination buffer of at least `output_size` bytes.
/// * `output_size` - capacity of `output`, including room for the terminator.
///
/// Returns the number of bytes written, not counting the NUL terminator.
/// Returns 0 without touching `output` when a pointer is null, `output_size`
/// is 0, or `input` is not valid UTF-8.
///
/// When the result does not fit it is truncated at a character boundary, so
/// the buffer always holds valid, NUL-terminated UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_casefold(
    _ctx: Handle,
    input: *const c_char,
    output: *mut c_char,
    output_size: usize,
) -> usize {
    if input.is_null() || output.is_null() || output_size == 0 {
        return 0;
    }
    // SAFETY: `input` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(input_str) = unsafe { CStr::from_ptr(input) }.to_str() else {
        return 0;
    };

    let folded: String = input_str.chars().flat_map(char::to_lowercase).collect();

    // Keep one byte for the terminator, then back up so that a multi-byte
    // character is never cut in half (index 0 is always a boundary).
    let mut copy_len = folded.len().min(output_size - 1);
    while !folded.is_char_boundary(copy_len) {
        copy_len -= 1;
    }

    // SAFETY: `output` is non-null and the caller guarantees `output_size`
    // writable bytes; `copy_len + 1 <= output_size`.
    let output_slice =
        unsafe { std::slice::from_raw_parts_mut(output.cast::<u8>(), copy_len + 1) };
    output_slice[..copy_len].copy_from_slice(&folded.as_bytes()[..copy_len]);
    output_slice[copy_len] = 0;
    copy_len
}
/// Case-insensitive comparison of two NUL-terminated strings.
///
/// Both strings are lower-cased character by character and then compared in
/// code-point order. `_locale` is not consulted. A string that is not valid
/// UTF-8 is compared as if it were empty.
///
/// Returns -1, 0 or 1 when `s1` sorts before, equal to or after `s2`. A null
/// pointer sorts before any non-null string; two null pointers are equal.
#[unsafe(no_mangle)]
pub extern "C" fn fz_strcoll(
    _ctx: Handle,
    s1: *const c_char,
    s2: *const c_char,
    _locale: *const c_char,
) -> i32 {
    match (s1.is_null(), s2.is_null()) {
        (true, true) => return 0,
        (true, false) => return -1,
        (false, true) => return 1,
        (false, false) => {}
    }

    // SAFETY: both pointers are non-null; the caller guarantees each is
    // NUL-terminated.
    let lhs = unsafe { CStr::from_ptr(s1) }.to_str().unwrap_or("");
    let rhs = unsafe { CStr::from_ptr(s2) }.to_str().unwrap_or("");

    let lowered = |s: &str| -> String { s.chars().flat_map(char::to_lowercase).collect() };

    // `Ordering` casts to exactly -1 / 0 / 1.
    lowered(lhs).cmp(&lowered(rhs)) as i32
}
/// Counts the Unicode scalar values (not bytes) in a NUL-terminated string.
///
/// Returns 0 when `s` is null or not valid UTF-8.
#[unsafe(no_mangle)]
pub extern "C" fn fz_string_char_count(_ctx: Handle, s: *const c_char) -> usize {
    if s.is_null() {
        return 0;
    }
    // SAFETY: `s` is non-null; the caller guarantees it is NUL-terminated.
    unsafe { CStr::from_ptr(s) }
        .to_str()
        .map_or(0, |text| text.chars().count())
}
/// Converts a character index into the byte offset at which that character
/// starts.
///
/// Returns -1 when `s` is null, is not valid UTF-8, or holds `char_index` or
/// fewer characters (so the end-of-string position is also out of range).
#[unsafe(no_mangle)]
pub extern "C" fn fz_char_to_byte_offset(_ctx: Handle, s: *const c_char, char_index: usize) -> i32 {
    if s.is_null() {
        return -1;
    }
    // SAFETY: `s` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text) = unsafe { CStr::from_ptr(s) }.to_str() else {
        return -1;
    };
    text.char_indices()
        .nth(char_index)
        .map_or(-1, |(byte_idx, _)| byte_idx as i32)
}
/// Converts a byte offset into the index of the character that starts there.
///
/// Returns -1 when `s` is null, is not valid UTF-8, or no character begins
/// exactly at `byte_offset` (the offset is inside a multi-byte character or
/// at/after the end of the string).
#[unsafe(no_mangle)]
pub extern "C" fn fz_byte_to_char_offset(
    _ctx: Handle,
    s: *const c_char,
    byte_offset: usize,
) -> i32 {
    if s.is_null() {
        return -1;
    }
    // SAFETY: `s` is non-null; the caller guarantees it is NUL-terminated.
    let Ok(text) = unsafe { CStr::from_ptr(s) }.to_str() else {
        return -1;
    };
    text.char_indices()
        .position(|(byte_idx, _)| byte_idx == byte_offset)
        .map_or(-1, |char_idx| char_idx as i32)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Ligature code points in the input must come out as plain letters.
    #[test]
    fn test_normalize_ligatures() {
        // Written with escapes so the input really contains U+FB01 / U+FB02
        // rather than already-expanded ASCII, which would make the test vacuous.
        let input = c"\u{FB01}nding \u{FB02}owers";
        let mut output = [0u8; 64];
        let len = fz_normalize_string(
            0,
            input.as_ptr(),
            output.as_mut_ptr().cast(),
            64,
            NormalizationForm::NFC as i32,
        );
        let result = std::str::from_utf8(&output[..len]).unwrap();
        assert!(result.contains("fi"));
        assert!(result.contains("fl"));
        assert_eq!(result, "finding flowers");
    }

    /// Pure Latin text is LTR, pure Hebrew text is RTL.
    #[test]
    fn test_bidi_detection() {
        let ltr = c"Hello World";
        let rtl = c"\u{05E9}\u{05DC}\u{05D5}\u{05DD}";
        assert_eq!(
            fz_get_bidi_direction(0, ltr.as_ptr()),
            BiDiDirection::LTR as i32
        );
        assert_eq!(
            fz_get_bidi_direction(0, rtl.as_ptr()),
            BiDiDirection::RTL as i32
        );
    }

    /// Word starts are reported as byte offsets.
    #[test]
    fn test_word_breaks() {
        let text = c"Hello world test string";
        let mut breaks = [0i32; 10];
        let count = fz_find_word_breaks(0, text.as_ptr(), breaks.as_mut_ptr(), 10);
        assert_eq!(count, 4);
        assert_eq!(breaks[0], 0);
        assert_eq!(breaks[1], 6);
    }

    /// The first non-common character decides the detected script.
    #[test]
    fn test_script_detection() {
        let latin = c"Hello";
        let arabic = c"\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
        let cjk = c"\u{4E2D}\u{6587}";
        assert_eq!(
            fz_detect_script(0, latin.as_ptr()),
            ScriptCategory::Latin as i32
        );
        assert_eq!(
            fz_detect_script(0, arabic.as_ptr()),
            ScriptCategory::Arabic as i32
        );
        assert_eq!(
            fz_detect_script(0, cjk.as_ptr()),
            ScriptCategory::CJK as i32
        );
    }

    #[test]
    fn test_casefold() {
        let input = c"Hello WORLD";
        let mut output = [0u8; 32];
        let len = fz_casefold(0, input.as_ptr(), output.as_mut_ptr().cast(), 32);
        let result = std::str::from_utf8(&output[..len]).unwrap();
        assert_eq!(result, "hello world");
    }

    /// Characters are counted, not bytes.
    #[test]
    fn test_char_count() {
        let ascii = c"Hello";
        let unicode = c"\u{1F600}Hello";
        assert_eq!(fz_string_char_count(0, ascii.as_ptr()), 5);
        assert_eq!(fz_string_char_count(0, unicode.as_ptr()), 6);
    }

    /// U+00E9 occupies two bytes, so the character after it starts at byte 3.
    #[test]
    fn test_char_byte_offset() {
        let s = c"H\u{00E9}llo";
        assert_eq!(fz_char_to_byte_offset(0, s.as_ptr(), 0), 0);
        assert_eq!(fz_char_to_byte_offset(0, s.as_ptr(), 1), 1);
        assert_eq!(fz_char_to_byte_offset(0, s.as_ptr(), 2), 3);
    }

    /// Comparison ignores case.
    #[test]
    fn test_strcoll() {
        let a = c"apple";
        let b = c"Banana";
        let c = c"Apple";
        assert!(fz_strcoll(0, a.as_ptr(), b.as_ptr(), std::ptr::null()) < 0);
        assert_eq!(fz_strcoll(0, a.as_ptr(), c.as_ptr(), std::ptr::null()), 0);
    }
}