#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Utf8Sequence {
pub ranges: Vec<(u8, u8)>,
}
impl Utf8Sequence {
pub fn new(ranges: Vec<(u8, u8)>) -> Self {
Self { ranges }
}
pub fn len(&self) -> usize {
self.ranges.len()
}
pub fn is_empty(&self) -> bool {
self.ranges.is_empty()
}
}
pub fn compile_utf8_range(start: u32, end: u32) -> Vec<Utf8Sequence> {
if start > end {
return vec![];
}
let start = start.min(0x10FFFF);
let end = end.min(0x10FFFF);
let mut sequences = Vec::new();
let boundaries = [
0x00, 0x80, 0x800, 0x10000, 0x110000, ];
let mut current = start;
for i in 0..4 {
let boundary_start = boundaries[i];
let boundary_end = boundaries[i + 1] - 1;
if current > boundary_end {
continue;
}
if current > end {
break;
}
let range_start = current.max(boundary_start);
let range_end = end.min(boundary_end);
if range_start <= range_end {
let class_sequences = compile_utf8_class(range_start, range_end, i + 1);
sequences.extend(class_sequences);
current = range_end + 1;
}
}
sequences
}
fn compile_utf8_class(start: u32, end: u32, encoding_len: usize) -> Vec<Utf8Sequence> {
match encoding_len {
1 => compile_1byte(start, end),
2 => compile_2byte(start, end),
3 => compile_3byte(start, end),
4 => compile_4byte(start, end),
_ => vec![],
}
}
fn compile_1byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
debug_assert!(start <= 0x7F && end <= 0x7F);
vec![Utf8Sequence::new(vec![(start as u8, end as u8)])]
}
fn compile_2byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
debug_assert!(start >= 0x80 && end <= 0x7FF);
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
let byte1 = (0xC0 | (current >> 6)) as u8;
let max_with_same_byte1 = ((current >> 6) << 6) | 0x3F;
let range_end = end.min(max_with_same_byte1);
let byte2_start = (0x80 | (current & 0x3F)) as u8;
let byte2_end = (0x80 | (range_end & 0x3F)) as u8;
sequences.push(Utf8Sequence::new(vec![
(byte1, byte1),
(byte2_start, byte2_end),
]));
current = range_end + 1;
}
sequences
}
fn compile_3byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
debug_assert!(start >= 0x800 && end <= 0xFFFF);
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
if (0xD800..=0xDFFF).contains(¤t) {
current = 0xE000;
if current > end {
break;
}
}
let byte1 = (0xE0 | (current >> 12)) as u8;
let max_with_same_byte1 = ((current >> 12) << 12) | 0xFFF;
let range_end = end.min(max_with_same_byte1);
let range_end = if range_end >= 0xD800 && current < 0xD800 {
0xD7FF
} else {
range_end
};
if current <= range_end {
let sub_sequences = compile_3byte_with_fixed_byte1(current, range_end, byte1);
sequences.extend(sub_sequences);
}
current = range_end + 1;
if (0xD800..=0xDFFF).contains(¤t) {
current = 0xE000;
}
}
sequences
}
fn compile_3byte_with_fixed_byte1(start: u32, end: u32, byte1: u8) -> Vec<Utf8Sequence> {
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
let byte2 = (0x80 | ((current >> 6) & 0x3F)) as u8;
let max_with_same_byte2 = ((current >> 6) << 6) | 0x3F;
let range_end = end.min(max_with_same_byte2);
let byte3_start = (0x80 | (current & 0x3F)) as u8;
let byte3_end = (0x80 | (range_end & 0x3F)) as u8;
sequences.push(Utf8Sequence::new(vec![
(byte1, byte1),
(byte2, byte2),
(byte3_start, byte3_end),
]));
current = range_end + 1;
}
sequences
}
fn compile_4byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
debug_assert!(start >= 0x10000 && end <= 0x10FFFF);
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
let byte1 = (0xF0 | (current >> 18)) as u8;
let max_with_same_byte1 = ((current >> 18) << 18) | 0x3FFFF;
let range_end = end.min(max_with_same_byte1);
let sub_sequences = compile_4byte_with_fixed_byte1(current, range_end, byte1);
sequences.extend(sub_sequences);
current = range_end + 1;
}
sequences
}
fn compile_4byte_with_fixed_byte1(start: u32, end: u32, byte1: u8) -> Vec<Utf8Sequence> {
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
let byte2 = (0x80 | ((current >> 12) & 0x3F)) as u8;
let max_with_same_byte2 = ((current >> 12) << 12) | 0xFFF;
let range_end = end.min(max_with_same_byte2);
let sub_sequences = compile_4byte_with_fixed_byte12(current, range_end, byte1, byte2);
sequences.extend(sub_sequences);
current = range_end + 1;
}
sequences
}
fn compile_4byte_with_fixed_byte12(
start: u32,
end: u32,
byte1: u8,
byte2: u8,
) -> Vec<Utf8Sequence> {
let mut sequences = Vec::new();
let mut current = start;
while current <= end {
let byte3 = (0x80 | ((current >> 6) & 0x3F)) as u8;
let max_with_same_byte3 = ((current >> 6) << 6) | 0x3F;
let range_end = end.min(max_with_same_byte3);
let byte4_start = (0x80 | (current & 0x3F)) as u8;
let byte4_end = (0x80 | (range_end & 0x3F)) as u8;
sequences.push(Utf8Sequence::new(vec![
(byte1, byte1),
(byte2, byte2),
(byte3, byte3),
(byte4_start, byte4_end),
]));
current = range_end + 1;
}
sequences
}
pub fn encode_code_point(cp: u32) -> Option<Vec<u8>> {
if cp > 0x10FFFF || (0xD800..=0xDFFF).contains(&cp) {
return None;
}
let c = char::from_u32(cp)?;
let mut buf = [0u8; 4];
let s = c.encode_utf8(&mut buf);
Some(s.as_bytes().to_vec())
}
pub fn optimize_sequences(sequences: Vec<Utf8Sequence>) -> Vec<Utf8Sequence> {
if sequences.len() <= 1 {
return sequences;
}
let mut optimized = Vec::new();
let mut i = 0;
while i < sequences.len() {
let mut current = sequences[i].clone();
while i + 1 < sequences.len() {
let next = &sequences[i + 1];
if let Some(merged) = try_merge_sequences(¤t, next) {
current = merged;
i += 1;
} else {
break;
}
}
optimized.push(current);
i += 1;
}
optimized
}
fn complement_code_point_ranges(ranges: &[(u32, u32)]) -> Vec<(u32, u32)> {
if ranges.is_empty() {
return vec![(0x0000, 0xD7FF), (0xE000, 0x10FFFF)];
}
let mut complement = Vec::new();
let mut current = 0u32;
for &(start, end) in ranges {
let start = start.min(0x10FFFF);
let end = end.min(0x10FFFF);
if start > end || start > 0x10FFFF {
continue;
}
if current < start {
if current <= 0xD7FF && start > 0xD7FF {
if current <= 0xD7FF {
complement.push((current, 0xD7FF.min(start.saturating_sub(1))));
}
if start > 0xDFFF {
complement.push((0xE000.max(current), start.saturating_sub(1)));
} else if (0xD800..=0xDFFF).contains(&start) {
current = 0xE000;
if current < start {
complement.push((current, start.saturating_sub(1)));
}
}
} else if (0xD800..=0xDFFF).contains(¤t) {
current = 0xE000;
if current < start {
complement.push((current, start.saturating_sub(1)));
}
} else if (0xD800..=0xDFFF).contains(&start) {
if current < 0xD800 {
complement.push((current, 0xD7FF));
}
} else {
complement.push((current, start.saturating_sub(1)));
}
}
current = end.saturating_add(1);
if (0xD800..=0xDFFF).contains(¤t) {
current = 0xE000;
}
}
if current <= 0x10FFFF {
if current <= 0xD7FF {
complement.push((current, 0xD7FF));
if 0xE000 <= 0x10FFFF {
complement.push((0xE000, 0x10FFFF));
}
} else if current >= 0xE000 {
complement.push((current, 0x10FFFF));
}
}
complement
}
pub fn compile_utf8_complement(ranges: &[(u32, u32)]) -> Vec<Utf8Sequence> {
let mut sorted_ranges: Vec<(u32, u32)> = ranges.to_vec();
sorted_ranges.sort_by_key(|r| r.0);
let mut merged = Vec::new();
for range in sorted_ranges {
if merged.is_empty() {
merged.push(range);
} else {
let last = merged.last_mut().unwrap();
if range.0 <= last.1.saturating_add(1) {
last.1 = last.1.max(range.1);
} else {
merged.push(range);
}
}
}
let complement_ranges = complement_code_point_ranges(&merged);
let mut sequences = Vec::new();
for (start, end) in complement_ranges {
sequences.extend(compile_utf8_range(start, end));
}
optimize_sequences(sequences)
}
fn try_merge_sequences(a: &Utf8Sequence, b: &Utf8Sequence) -> Option<Utf8Sequence> {
if a.len() != b.len() || a.is_empty() {
return None;
}
let n = a.len();
for i in 0..n - 1 {
if a.ranges[i] != b.ranges[i] {
return None;
}
}
let (a_start, a_end) = a.ranges[n - 1];
let (b_start, b_end) = b.ranges[n - 1];
if a_end.checked_add(1) == Some(b_start) {
let mut merged_ranges = a.ranges[..n - 1].to_vec();
merged_ranges.push((a_start, b_end));
Some(Utf8Sequence::new(merged_ranges))
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_encode_code_point_ascii() {
assert_eq!(encode_code_point(0x41), Some(vec![0x41])); assert_eq!(encode_code_point(0x7F), Some(vec![0x7F]));
}
#[test]
fn test_encode_code_point_2byte() {
assert_eq!(encode_code_point(0xE9), Some(vec![0xC3, 0xA9]));
assert_eq!(encode_code_point(0xFF), Some(vec![0xC3, 0xBF]));
}
#[test]
fn test_encode_code_point_3byte() {
assert_eq!(encode_code_point(0x3042), Some(vec![0xE3, 0x81, 0x82]));
}
#[test]
fn test_encode_code_point_4byte() {
assert_eq!(
encode_code_point(0x1F600),
Some(vec![0xF0, 0x9F, 0x98, 0x80])
);
}
#[test]
fn test_encode_code_point_invalid() {
assert_eq!(encode_code_point(0xD800), None);
assert_eq!(encode_code_point(0xDFFF), None);
assert_eq!(encode_code_point(0x110000), None);
}
#[test]
fn test_compile_1byte_range() {
let seqs = compile_utf8_range(0x41, 0x5A); assert_eq!(seqs.len(), 1);
assert_eq!(seqs[0].ranges, vec![(0x41, 0x5A)]);
}
#[test]
fn test_compile_2byte_range() {
let seqs = compile_utf8_range(0x80, 0xBF);
assert_eq!(seqs.len(), 1);
assert_eq!(seqs[0].ranges[0], (0xC2, 0xC2));
assert_eq!(seqs[0].ranges[1], (0x80, 0xBF));
}
#[test]
fn test_compile_cross_boundary_1_2() {
let seqs = compile_utf8_range(0x61, 0xFF);
assert!(seqs.len() >= 2);
assert_eq!(seqs[0].len(), 1);
assert_eq!(seqs[0].ranges[0], (0x61, 0x7F));
for seq in &seqs[1..] {
assert_eq!(seq.len(), 2);
}
}
#[test]
fn test_compile_greek_letters() {
let seqs = compile_utf8_range(0x03B1, 0x03C9);
for seq in &seqs {
assert_eq!(seq.len(), 2);
assert!(seq.ranges[0].0 >= 0xCE && seq.ranges[0].1 <= 0xCF);
}
}
#[test]
fn test_compile_cjk() {
let seqs = compile_utf8_range(0x4E00, 0x4E0F);
for seq in &seqs {
assert_eq!(seq.len(), 3);
}
}
#[test]
fn test_compile_emoji() {
let seqs = compile_utf8_range(0x1F600, 0x1F60F);
for seq in &seqs {
assert_eq!(seq.len(), 4);
}
}
#[test]
fn test_surrogate_skip() {
let seqs = compile_utf8_range(0xD700, 0xE000);
for seq in &seqs {
assert!(seq.len() >= 2);
}
}
#[test]
fn test_optimize_sequences() {
let seqs = vec![
Utf8Sequence::new(vec![(0xC2, 0xC2), (0x80, 0x8F)]),
Utf8Sequence::new(vec![(0xC2, 0xC2), (0x90, 0x9F)]),
Utf8Sequence::new(vec![(0xC2, 0xC2), (0xA0, 0xAF)]),
];
let optimized = optimize_sequences(seqs);
assert_eq!(optimized.len(), 1);
assert_eq!(optimized[0].ranges, vec![(0xC2, 0xC2), (0x80, 0xAF)]);
}
#[test]
fn test_full_unicode_range() {
let seqs = compile_utf8_range(0, 0x10FFFF);
let has_1byte = seqs.iter().any(|s| s.len() == 1);
let has_2byte = seqs.iter().any(|s| s.len() == 2);
let has_3byte = seqs.iter().any(|s| s.len() == 3);
let has_4byte = seqs.iter().any(|s| s.len() == 4);
assert!(has_1byte);
assert!(has_2byte);
assert!(has_3byte);
assert!(has_4byte);
}
#[test]
fn test_single_code_point() {
let seqs = compile_utf8_range(0x03B1, 0x03B1); assert_eq!(seqs.len(), 1);
assert_eq!(seqs[0].len(), 2);
}
#[test]
fn test_complement_ascii() {
let complement = compile_utf8_complement(&[(0x61, 0x7A)]);
let has_1byte = complement.iter().any(|s| s.len() == 1);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_1byte, "Should have 1-byte sequences");
assert!(has_2byte, "Should have 2-byte sequences");
assert!(has_3byte, "Should have 3-byte sequences");
assert!(has_4byte, "Should have 4-byte sequences");
}
#[test]
fn test_complement_greek() {
let complement = compile_utf8_complement(&[(0x03B1, 0x03C9)]);
let has_1byte = complement.iter().any(|s| s.len() == 1);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_1byte, "Should have ASCII");
assert!(has_2byte, "Should have other 2-byte sequences");
assert!(has_3byte, "Should have 3-byte sequences");
assert!(has_4byte, "Should have 4-byte sequences");
}
#[test]
fn test_complement_excludes_surrogates() {
let complement = compile_utf8_complement(&[]);
for seq in &complement {
if seq.len() == 3 {
let (b1_start, b1_end) = seq.ranges[0];
if b1_start == 0xED && b1_end == 0xED {
let (b2_start, _) = seq.ranges[1];
assert!(b2_start < 0xA0, "Should not generate surrogate sequences");
}
}
}
}
#[test]
fn test_complement_single_char() {
let complement = compile_utf8_complement(&[(0x03B1, 0x03B1)]);
assert!(!complement.is_empty());
let has_1byte = complement.iter().any(|s| s.len() == 1);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_1byte);
assert!(has_2byte);
assert!(has_3byte);
assert!(has_4byte);
}
#[test]
fn test_complement_multiple_ranges() {
let complement = compile_utf8_complement(&[(0x41, 0x5A), (0x61, 0x7A)]);
let has_1byte = complement.iter().any(|s| s.len() == 1);
assert!(has_1byte);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_2byte);
assert!(has_3byte);
assert!(has_4byte);
}
#[test]
fn test_complement_cjk() {
let complement = compile_utf8_complement(&[(0x4E00, 0x9FFF)]);
let has_1byte = complement.iter().any(|s| s.len() == 1);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_1byte);
assert!(has_2byte);
assert!(has_3byte);
assert!(has_4byte);
}
#[test]
fn test_complement_emoji() {
let complement = compile_utf8_complement(&[(0x1F600, 0x1F602)]);
let has_1byte = complement.iter().any(|s| s.len() == 1);
let has_2byte = complement.iter().any(|s| s.len() == 2);
let has_3byte = complement.iter().any(|s| s.len() == 3);
let has_4byte = complement.iter().any(|s| s.len() == 4);
assert!(has_1byte);
assert!(has_2byte);
assert!(has_3byte);
assert!(has_4byte);
}
#[test]
fn test_complement_boundary_cases() {
let complement = compile_utf8_complement(&[(0x7F, 0x80)]);
assert!(!complement.is_empty());
let complement = compile_utf8_complement(&[(0x7FF, 0x800)]);
assert!(!complement.is_empty());
let complement = compile_utf8_complement(&[(0xFFFF, 0x10000)]);
assert!(!complement.is_empty());
}
}