use crate::utils::CoreError;
use alloc::{
format,
string::{String, ToString},
};
use core::str;
/// Checks that `bytes` is well-formed UTF-8.
///
/// # Errors
///
/// Returns the error built by `CoreError::utf8_error` when decoding fails.
/// The error is given the byte offset up to which the input was valid and a
/// message that distinguishes an invalid sequence (with its length in bytes)
/// from an incomplete sequence (no error length reported by the decoder).
pub fn validate_utf8(bytes: &[u8]) -> Result<(), CoreError> {
    str::from_utf8(bytes).map(|_| ()).map_err(|err| {
        let position = err.valid_up_to();
        // `error_len()` is `None` when the decoder reports no length for the
        // failing sequence, which is worded as "incomplete" below.
        let message = match err.error_len() {
            Some(len) => {
                format!("Invalid UTF-8 sequence of {len} bytes at position {position}")
            }
            None => format!("Incomplete UTF-8 sequence at position {position}"),
        };
        CoreError::utf8_error(position, message)
    })
}
/// Decodes `bytes` as UTF-8, substituting U+FFFD for every invalid sequence.
///
/// Returns the decoded text together with the number of substitutions that
/// were made. The text is the same as what `String::from_utf8_lossy` yields.
///
/// Only substitutions performed by this function are counted: a U+FFFD that
/// is already present in the input as valid UTF-8 (`EF BF BD`) is copied
/// through and does not contribute to the count.
#[must_use]
pub fn recover_utf8(bytes: &[u8]) -> (String, usize) {
    // Fast path: fully valid input needs no recovery.
    let mut error = match str::from_utf8(bytes) {
        Ok(text) => return (text.to_string(), 0),
        Err(err) => err,
    };

    let mut recovered = String::with_capacity(bytes.len());
    let mut replacements = 0;
    let mut rest = bytes;

    loop {
        let (valid, invalid) = rest.split_at(error.valid_up_to());
        // `valid` is known to be well-formed, so this borrows without
        // allocating or substituting anything.
        recovered.push_str(&String::from_utf8_lossy(valid));
        recovered.push('\u{FFFD}');
        replacements += 1;

        rest = match error.error_len() {
            // Resume decoding right after the invalid sequence.
            Some(len) => &invalid[len..],
            // Input ended in the middle of a sequence: nothing left to decode.
            None => break,
        };

        match str::from_utf8(rest) {
            Ok(tail) => {
                recovered.push_str(tail);
                break;
            }
            Err(next) => error = next,
        }
    }

    (recovered, replacements)
}
/// Reports whether every character of `text` is acceptable.
///
/// A character is accepted when it is a space, tab, line feed or carriage
/// return, a printable (graphic) ASCII character, or a non-ASCII character
/// that is not a control character. An empty string is accepted.
#[must_use]
pub fn is_valid_ass_text(text: &str) -> bool {
    text.chars().all(|c| match c {
        // Explicitly permitted whitespace.
        ' ' | '\t' | '\n' | '\r' => true,
        // Any other ASCII character must be printable.
        _ if c.is_ascii() => c.is_ascii_graphic(),
        // Non-ASCII: allowed unless it is a control character.
        _ => !c.is_control(),
    })
}
/// Shortens `text` to at most `max_bytes` bytes without splitting a character.
///
/// Returns the (possibly shortened) slice and a flag that is `true` when the
/// input was longer than `max_bytes` and therefore cut. When `max_bytes`
/// falls inside a multi-byte character, the cut moves back to the start of
/// that character, so the result may be shorter than `max_bytes`.
#[must_use]
pub fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> (&str, bool) {
    if max_bytes >= text.len() {
        return (text, false);
    }

    // Search backwards from the limit for the nearest character boundary.
    // Offset 0 is always a boundary, so the fallback is never actually used.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&offset| text.is_char_boundary(offset))
        .unwrap_or(0);

    (&text[..cut], true)
}
/// Counts how many U+FFFD REPLACEMENT CHARACTER code points `text` contains.
#[must_use]
pub fn count_replacement_chars(text: &str) -> usize {
    text.chars()
        .filter(|&c| c == char::REPLACEMENT_CHARACTER)
        .count()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn validate_valid_utf8() {
        // Mixed ASCII, CJK and emoji is still well-formed UTF-8.
        let sample = "Hello, 世界! 🎵";
        assert!(validate_utf8(sample.as_bytes()).is_ok());
    }

    #[test]
    fn validate_invalid_utf8() {
        // None of these bytes can start a valid sequence.
        let bad = &[0xFF, 0xFE, 0x80];
        assert!(validate_utf8(bad).is_err());
    }

    #[test]
    fn validate_incomplete_utf8() {
        // A two-byte lead byte with its continuation byte missing.
        let cut_short = &[0xC2];
        let outcome = validate_utf8(cut_short);
        assert!(outcome.is_err());
    }

    #[test]
    fn recover_valid_utf8() {
        let sample = "Hello, World!";
        let (text, substituted) = recover_utf8(sample.as_bytes());
        assert_eq!(text, "Hello, World!");
        assert_eq!(substituted, 0);
    }

    #[test]
    fn recover_invalid_utf8() {
        let bad = &[b'H', b'i', 0xFF, b'!'];
        let (text, substituted) = recover_utf8(bad);
        assert_eq!(text, "Hi�!");
        assert_eq!(substituted, 1);
    }

    #[test]
    fn recover_multiple_invalid_sequences() {
        let bad = &[b'A', 0xFF, b'B', 0xFE, b'C'];
        let (text, substituted) = recover_utf8(bad);
        assert_eq!(text, "A�B�C");
        assert_eq!(substituted, 2);
    }

    #[test]
    fn valid_ass_text() {
        // Accepted: plain text, permitted whitespace, non-ASCII text.
        assert!(is_valid_ass_text("Hello World"));
        assert!(is_valid_ass_text("Hello\tWorld\n"));
        assert!(is_valid_ass_text("Hello 世界"));

        // Rejected: ASCII control characters.
        assert!(!is_valid_ass_text("Hello\x00World"));
        assert!(!is_valid_ass_text("Hello\x1FWorld"));
    }

    #[test]
    fn truncate_ascii() {
        let sample = "Hello World";
        let (head, cut) = truncate_at_char_boundary(sample, 5);
        assert_eq!(head, "Hello");
        assert!(cut);
    }

    #[test]
    fn truncate_unicode() {
        // Byte 8 lies inside the first CJK character, so the cut backs up.
        let sample = "Hello 世界";
        let (head, cut) = truncate_at_char_boundary(sample, 8);
        assert_eq!(head, "Hello ");
        assert!(cut);
    }

    #[test]
    fn truncate_no_change() {
        let sample = "Hello";
        let (head, cut) = truncate_at_char_boundary(sample, 10);
        assert_eq!(head, "Hello");
        assert!(!cut);
    }

    #[test]
    fn truncate_at_unicode_boundary() {
        // The limit lands exactly at the end of the first three-byte character.
        let sample = "世界";
        let (head, cut) = truncate_at_char_boundary(sample, 3);
        assert_eq!(head, "世");
        assert!(cut);
    }

    #[test]
    fn count_replacement_characters() {
        assert_eq!(count_replacement_chars("Hello World"), 0);
        assert_eq!(count_replacement_chars("Hello � World"), 1);
        assert_eq!(count_replacement_chars("� Test � Again �"), 3);
    }
}