use crate::error::{KreuzbergError, Result};
use crate::types::PageBoundary;
use bitvec::prelude::*;
/// Largest number of page boundaries that [`validate_utf8_boundaries`] checks
/// one offset at a time. Boundary sets longer than this are validated against
/// a boundary bitmap that is computed once for the whole text.
pub const ADAPTIVE_VALIDATION_THRESHOLD: usize = 10;
/// Builds a bitmap marking every byte offset of `text` that is a valid UTF-8
/// character boundary.
///
/// The returned vector holds `text.len() + 1` bits. Bit `i` is set when `i` is
/// the offset of the first byte of a character, or when `i == text.len()`
/// (one past the last byte). All other bits are clear, meaning the offset
/// falls inside a multi-byte character.
///
/// For an empty string the result is a single set bit at index 0.
pub fn precompute_utf8_boundaries(text: &str) -> BitVec {
    let text_len = text.len();
    let mut boundaries = bitvec![0; text_len + 1];

    // `char_indices` only yields offsets in `0..text_len`, so every offset is
    // in range and no bounds guard is needed. For non-empty text the first
    // offset yielded is 0, which covers the start of the text.
    for (offset, _) in text.char_indices() {
        boundaries.set(offset, true);
    }

    // The end of the text is always a boundary. For empty text this is
    // index 0, which the loop above never visits.
    boundaries.set(text_len, true);

    boundaries
}
/// Checks that every page boundary lies within `text` and on a UTF-8
/// character boundary.
///
/// The strategy is chosen from the number of boundaries:
///
/// * none: nothing to check, returns `Ok(())`;
/// * up to [`ADAPTIVE_VALIDATION_THRESHOLD`]: each offset is probed directly;
/// * more than that: offsets are looked up in a bitmap computed once for the
///   whole text.
///
/// # Errors
///
/// Propagates the error returned by the selected strategy for the first
/// boundary that fails validation.
pub fn validate_utf8_boundaries(text: &str, boundaries: &[PageBoundary]) -> Result<()> {
    let text_len = text.len();

    match boundaries.len() {
        0 => Ok(()),
        count if count <= ADAPTIVE_VALIDATION_THRESHOLD => {
            validate_utf8_boundaries_fast_path(text, boundaries, text_len)
        }
        _ => validate_utf8_boundaries_batch_path(text, boundaries, text_len),
    }
}
/// Validates page boundaries by probing each offset with
/// `str::is_char_boundary`, without building any lookup structure.
///
/// For each boundary, in slice order, the checks run in this sequence and the
/// first failure is returned:
///
/// 1. `byte_start` must not exceed `text_len`;
/// 2. `byte_end` must not exceed `text_len`;
/// 3. `byte_start` must not fall inside a multi-byte character;
/// 4. `byte_end` must not fall inside a multi-byte character.
///
/// `text_len` is supplied by the caller; the only caller visible in this file
/// passes `text.len()`.
///
/// NOTE(review): nothing here checks that `byte_start <= byte_end`; confirm
/// whether callers guarantee that ordering.
///
/// # Errors
///
/// Returns a validation error naming the boundary index, the offending field
/// and value, and the text length.
fn validate_utf8_boundaries_fast_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
    // True when `offset` lies strictly inside the text but is not the first
    // byte of a character.
    let splits_char = |offset: usize| offset > 0 && offset < text_len && !text.is_char_boundary(offset);

    boundaries.iter().enumerate().try_for_each(|(idx, page)| -> Result<()> {
        let (start, end) = (page.byte_start, page.byte_end);

        // Both range checks run before either character check, so an
        // out-of-range end is reported ahead of a mid-character start.
        if start > text_len {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_start={} which exceeds text length {}",
                idx, start, text_len
            )));
        }
        if end > text_len {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_end={} which exceeds text length {}",
                idx, end, text_len
            )));
        }

        if splits_char(start) {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
                idx, start, text_len
            )));
        }
        if splits_char(end) {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
                idx, end, text_len
            )));
        }

        Ok(())
    })
}
/// Validates page boundaries against a bitmap of character boundaries that is
/// computed once, up front, by `precompute_utf8_boundaries`.
///
/// For each boundary, in slice order, the checks run in this sequence and the
/// first failure is returned:
///
/// 1. `byte_start` must not exceed `text_len`;
/// 2. `byte_end` must not exceed `text_len`;
/// 3. `byte_start` must be marked as a character boundary in the bitmap;
/// 4. `byte_end` must be marked as a character boundary in the bitmap.
///
/// Offset 0 is never looked up in the bitmap; it is always accepted.
///
/// NOTE(review): nothing here checks that `byte_start <= byte_end`; confirm
/// whether callers guarantee that ordering.
///
/// # Errors
///
/// Returns a validation error naming the boundary index, the offending field
/// and value, and the text length.
fn validate_utf8_boundaries_batch_path(text: &str, boundaries: &[PageBoundary], text_len: usize) -> Result<()> {
    let char_starts = precompute_utf8_boundaries(text);

    // True when `offset` is a non-zero, in-range offset whose bit is clear.
    // The range test runs first, so the bitmap is only indexed for offsets
    // up to `text_len`.
    let splits_char = |offset: usize| offset > 0 && offset <= text_len && !char_starts[offset];

    for (idx, page) in boundaries.iter().enumerate() {
        let (start, end) = (page.byte_start, page.byte_end);

        // Both range checks run before either bitmap lookup.
        if start > text_len {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_start={} which exceeds text length {}",
                idx, start, text_len
            )));
        }
        if end > text_len {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_end={} which exceeds text length {}",
                idx, end, text_len
            )));
        }

        if splits_char(start) {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_start={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
                idx, start, text_len
            )));
        }
        if splits_char(end) {
            return Err(KreuzbergError::validation(format!(
                "Page boundary {} has byte_end={} which is not a valid UTF-8 character boundary (text length={}). This may indicate corrupted multibyte characters (emoji, CJK, etc.)",
                idx, end, text_len
            )));
        }
    }

    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Splits a `text_len`-byte text into `count` contiguous pages of equal
    /// size, with the last page absorbing any remainder.
    ///
    /// The offsets are plain multiples of `text_len / count`, so this is only
    /// suitable for ASCII text, where every offset is a character boundary.
    fn contiguous_ascii_pages(text_len: usize, count: usize) -> Vec<PageBoundary> {
        let step = text_len / count;
        (0..count)
            .map(|i| PageBoundary {
                byte_start: i * step,
                byte_end: if i + 1 == count { text_len } else { (i + 1) * step },
                page_number: i + 1,
            })
            .collect()
    }

    /// Two adjacent pages over pure ASCII text are accepted.
    #[test]
    fn test_validate_utf8_boundaries_valid_ascii() {
        let text = "This is ASCII text.";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 10,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 10,
                byte_end: 19,
                page_number: 2,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// Pages that split the text after a complete emoji are accepted.
    #[test]
    fn test_validate_utf8_boundaries_valid_emoji() {
        let text = "Hello 👋 World 🌍 End";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 11,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 11,
                byte_end: 25,
                page_number: 2,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// Pages that split CJK text between complete characters are accepted.
    #[test]
    fn test_validate_utf8_boundaries_valid_cjk() {
        let text = "你好世界 こんにちは 안녕하세요";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 13,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 13,
                byte_end: 44,
                page_number: 2,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// A page ending inside a four-byte emoji is rejected, and the error
    /// names the offending offset.
    #[test]
    fn test_validate_utf8_boundaries_invalid_mid_emoji() {
        let text = "Hello 👋 World";
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: 7,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("UTF-8 character boundary"));
        assert!(err.to_string().contains("byte_end=7"));
    }

    /// A page ending inside a three-byte CJK character is rejected.
    #[test]
    fn test_validate_utf8_boundaries_invalid_mid_multibyte_cjk() {
        let text = "中文文本";
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: 1,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("UTF-8 character boundary"));
    }

    /// A `byte_start` past the end of the text is rejected.
    #[test]
    fn test_validate_utf8_boundaries_byte_start_exceeds_length() {
        let text = "Short";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 3,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 10,
                byte_end: 15,
                page_number: 2,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("exceeds text length"));
    }

    /// A `byte_end` past the end of the text is rejected.
    #[test]
    fn test_validate_utf8_boundaries_byte_end_exceeds_length() {
        let text = "Short";
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: 100,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("exceeds text length"));
    }

    /// An empty boundary list is trivially valid.
    #[test]
    fn test_validate_utf8_boundaries_empty_boundaries() {
        let text = "Some text";
        let boundaries: Vec<PageBoundary> = vec![];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// A single page spanning the whole text, ending at `text.len()`, is
    /// accepted.
    #[test]
    fn test_validate_utf8_boundaries_at_text_boundaries() {
        let text = "Exact boundary test";
        let text_len = text.len();
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: text_len,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// A split inside the ASCII prefix of mixed-script text is accepted.
    #[test]
    fn test_validate_utf8_boundaries_mixed_languages() {
        let text = "English text mixed with 中文 and français";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 24,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 24,
                byte_end: text.len(),
                page_number: 2,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// The error for a mid-character offset mentions UTF-8, the word
    /// "boundary" and the offending offset.
    #[test]
    fn test_validate_utf8_boundaries_error_messages_are_clear() {
        let text = "Test 👋 text";
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: 6,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_err());
        let err = result.unwrap_err();
        let err_msg = err.to_string();
        assert!(err_msg.contains("UTF-8"));
        assert!(err_msg.contains("boundary"));
        assert!(err_msg.contains("6"));
    }

    /// Five contiguous pages that isolate each emoji are all accepted.
    #[test]
    fn test_validate_utf8_boundaries_multiple_valid_boundaries() {
        let text = "First👋Second🌍Third";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 5,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 5,
                byte_end: 9,
                page_number: 2,
            },
            PageBoundary {
                byte_start: 9,
                byte_end: 15,
                page_number: 3,
            },
            PageBoundary {
                byte_start: 15,
                byte_end: 19,
                page_number: 4,
            },
            PageBoundary {
                byte_start: 19,
                byte_end: text.len(),
                page_number: 5,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// An empty page at offset 0 is accepted.
    #[test]
    fn test_validate_utf8_boundaries_zero_start_and_end() {
        let text = "Text";
        let boundaries = vec![PageBoundary {
            byte_start: 0,
            byte_end: 0,
            page_number: 1,
        }];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// Emoji-heavy text with more pages than the adaptive threshold is
    /// accepted, exercising the precomputed-bitmap strategy.
    ///
    /// Page offsets are taken from the text's own character offsets, so they
    /// are valid regardless of how many bytes each character occupies.
    #[test]
    fn test_utf8_boundaries_caching_with_many_boundaries() {
        let text = "🌍 Hello World ".repeat(200);
        let text_len = text.len();

        // Every offset at which a page may legally start or end.
        let mut offsets: Vec<usize> = text.char_indices().map(|(idx, _)| idx).collect();
        offsets.push(text_len);

        let boundary_count = ADAPTIVE_VALIDATION_THRESHOLD * 2;
        let step = (offsets.len() - 1) / boundary_count;
        assert!(step > 0, "text must contain at least one character per page");

        let boundaries: Vec<PageBoundary> = (0..boundary_count)
            .map(|i| PageBoundary {
                byte_start: offsets[i * step],
                byte_end: if i + 1 == boundary_count {
                    text_len
                } else {
                    offsets[(i + 1) * step]
                },
                page_number: i + 1,
            })
            .collect();

        // Guard against the test silently falling back to the per-offset
        // strategy.
        assert!(boundaries.len() > ADAPTIVE_VALIDATION_THRESHOLD);

        let result = validate_utf8_boundaries(&text, &boundaries);
        assert!(result.is_ok());
    }

    /// More pages than the adaptive threshold, with the last one ending
    /// inside an emoji, are rejected by the precomputed-bitmap strategy.
    #[test]
    fn test_validate_utf8_boundaries_many_boundaries_invalid_mid_emoji() {
        let emoji_len = "👋".len();
        let text = "👋".repeat(ADAPTIVE_VALIDATION_THRESHOLD + 2);

        // One valid page per emoji for the first THRESHOLD + 1 emoji.
        let mut boundaries: Vec<PageBoundary> = (0..=ADAPTIVE_VALIDATION_THRESHOLD)
            .map(|i| PageBoundary {
                byte_start: i * emoji_len,
                byte_end: (i + 1) * emoji_len,
                page_number: i + 1,
            })
            .collect();

        // The final page starts on a boundary but ends two bytes into the
        // last emoji.
        let last_start = (ADAPTIVE_VALIDATION_THRESHOLD + 1) * emoji_len;
        let bad_end = last_start + 2;
        boundaries.push(PageBoundary {
            byte_start: last_start,
            byte_end: bad_end,
            page_number: ADAPTIVE_VALIDATION_THRESHOLD + 2,
        });
        assert!(boundaries.len() > ADAPTIVE_VALIDATION_THRESHOLD);

        let result = validate_utf8_boundaries(&text, &boundaries);
        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("UTF-8 character boundary"));
        assert!(err_msg.contains(&format!("byte_end={}", bad_end)));
    }

    /// Three pages over a large emoji-heavy document are accepted, both via
    /// the public entry point and when checked against the precomputed bitmap
    /// directly.
    #[test]
    fn test_utf8_boundaries_caching_large_document_with_emojis() {
        let large_text = "This is a large document with lots of emoji: 🌍 🚀 💻 🎉 🔥 ✨ 🎨 🌟 ".repeat(100);
        let all_indices: Vec<usize> = large_text.char_indices().map(|(idx, _)| idx).collect();
        let third_idx = all_indices.len() / 3;
        let two_thirds_idx = (2 * all_indices.len()) / 3;
        let boundary_start_1 = if third_idx < all_indices.len() {
            all_indices[third_idx]
        } else {
            large_text.len()
        };
        let boundary_start_2 = if two_thirds_idx < all_indices.len() {
            all_indices[two_thirds_idx]
        } else {
            large_text.len()
        };
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: boundary_start_1,
                page_number: 1,
            },
            PageBoundary {
                byte_start: boundary_start_1,
                byte_end: boundary_start_2,
                page_number: 2,
            },
            PageBoundary {
                byte_start: boundary_start_2,
                byte_end: large_text.len(),
                page_number: 3,
            },
        ];
        let result = validate_utf8_boundaries(&large_text, &boundaries);
        assert!(result.is_ok());

        // Three pages are below the adaptive threshold, so also run the
        // bitmap strategy explicitly.
        let batch = validate_utf8_boundaries_batch_path(&large_text, &boundaries, large_text.len());
        assert!(batch.is_ok());
    }

    /// A boundary set below the adaptive threshold is accepted.
    #[test]
    fn test_adaptive_validation_small_boundary_set() {
        let text = "Hello 👋 World 🌍 End";
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 6,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 6,
                byte_end: 15,
                page_number: 2,
            },
            PageBoundary {
                byte_start: 15,
                byte_end: text.len(),
                page_number: 3,
            },
        ];
        let result = validate_utf8_boundaries(text, &boundaries);
        assert!(result.is_ok());
    }

    /// Exactly `ADAPTIVE_VALIDATION_THRESHOLD` pages, the largest set still
    /// handled by the per-offset strategy, are accepted.
    #[test]
    fn test_adaptive_validation_threshold_boundary() {
        let text = "Test text ".repeat(50);
        let boundaries = contiguous_ascii_pages(text.len(), ADAPTIVE_VALIDATION_THRESHOLD);
        assert_eq!(boundaries.len(), ADAPTIVE_VALIDATION_THRESHOLD);

        let result = validate_utf8_boundaries(&text, &boundaries);
        assert!(result.is_ok());
    }

    /// A boundary set well above the adaptive threshold is accepted.
    #[test]
    fn test_adaptive_validation_large_boundary_set() {
        let text = "Lorem ipsum dolor sit amet ".repeat(100);
        let boundaries = contiguous_ascii_pages(text.len(), 50);
        assert!(boundaries.len() > ADAPTIVE_VALIDATION_THRESHOLD);

        let result = validate_utf8_boundaries(&text, &boundaries);
        assert!(result.is_ok());
    }

    /// Both validation strategies reach the same verdict, with the same
    /// message, on multi-script text, and the public entry point agrees.
    ///
    /// The fixed 50-byte stride is not chosen to land on character
    /// boundaries, so the test does not assume success or failure; it only
    /// requires the strategies to agree.
    #[test]
    fn test_adaptive_validation_consistency() {
        let text = "Mixed language: 你好 مرحبا Здравствуй ".repeat(50);
        let boundaries = vec![
            PageBoundary {
                byte_start: 0,
                byte_end: 50,
                page_number: 1,
            },
            PageBoundary {
                byte_start: 50,
                byte_end: 100,
                page_number: 2,
            },
            PageBoundary {
                byte_start: 100,
                byte_end: 150,
                page_number: 3,
            },
            PageBoundary {
                byte_start: 150,
                byte_end: 200,
                page_number: 4,
            },
            PageBoundary {
                byte_start: 200,
                byte_end: text.len(),
                page_number: 5,
            },
        ];
        let text_len = text.len();
        let fast = validate_utf8_boundaries_fast_path(&text, &boundaries, text_len);
        let batch = validate_utf8_boundaries_batch_path(&text, &boundaries, text_len);

        assert_eq!(
            fast.is_ok(),
            batch.is_ok(),
            "per-offset and bitmap strategies disagree"
        );
        if let (Err(fast_err), Err(batch_err)) = (&fast, &batch) {
            assert_eq!(fast_err.to_string(), batch_err.to_string());
        }

        let result = validate_utf8_boundaries(&text, &boundaries);
        assert_eq!(result.is_ok(), fast.is_ok());
    }
}