#![no_std]
#![cfg_attr(feature = "verus", feature(proc_macro_hygiene))]
#![deny(unsafe_op_in_unsafe_fn)]
#![warn(missing_docs)]
#![allow(clippy::inline_always)]
#![cfg_attr(rr, feature(register_tool))]
#![cfg_attr(rr, feature(custom_inner_attributes))]
#![cfg_attr(rr, register_tool(rr))]
#![cfg_attr(rr, rr::package("smooth_utf8"))]
#![cfg_attr(rr, rr::coq_prefix("smooth_utf8"))]
#![cfg_attr(rr, rr::include("ptr"))]
use core::ops::Range;
macro_rules! da {
($($t:tt)*) => { #[cfg(not(feature = "verus"))] { core::debug_assert!($($t)*); } };
}
#[cfg(feature = "verus")]
use spec::*;
#[cfg(feature = "verus")]
use verus_builtin_macros::{proof, verus_spec, verus_verify};
#[cfg(feature = "verus")]
#[allow(unused_imports)]
use vstd::prelude::*;
mod ascii_skip;
mod raw;
#[cfg(feature = "verus")]
pub mod spec;
#[cfg(feature = "simdutf8")]
const LONG_THRESHOLD: usize = 128;
#[cfg_attr(feature = "verus", verus_verify)]
#[allow(clippy::unreadable_literal)] const SIGN_BITS: u64 = 0x8080_8080_8080_8080;
#[cfg_attr(feature = "verus", verus_verify)]
pub const SLACK: usize = 8;
#[cfg_attr(feature = "verus", verus_spec(ret =>
ensures ret == is_valid_utf8(b@),
))]
#[inline]
#[must_use]
pub fn verify(b: &[u8]) -> bool {
#[cfg(feature = "simdutf8")]
if b.len() >= LONG_THRESHOLD {
return simdutf8::basic::from_utf8(b).is_ok();
}
#[cfg(feature = "verus")]
proof! { assert(b@.subrange(0, b@.len() as int) =~= b@); }
verify_impl::<0, SafeTail>(b, 0..b.len())
}
#[inline]
#[must_use]
pub fn to_str(b: &[u8]) -> Option<&str> {
if verify(b) {
Some(unsafe { core::str::from_utf8_unchecked(b) })
} else {
None
}
}
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires
range.start <= range.end,
range.end + SLACK <= buf@.len(),
ensures
ret == is_valid_utf8(buf@.subrange(range.start as int, range.end as int)),
))]
#[inline]
#[must_use]
pub unsafe fn verify_with_slack(buf: &[u8], range: Range<usize>) -> bool {
da!(range.start <= range.end);
da!(range.end.saturating_add(SLACK) <= buf.len());
#[cfg(feature = "simdutf8")]
if range.end - range.start >= LONG_THRESHOLD {
return simdutf8::basic::from_utf8(&buf[range]).is_ok();
}
verify_impl::<SLACK, SlackTail>(buf, range)
}
#[cfg_attr(feature = "verus", verus_verify(external_body))]
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires at + 8 <= buf@.len(),
ensures ret == pack64(buf@, at as int),
))]
#[inline(always)]
unsafe fn load64(buf: &[u8], at: usize) -> u64 {
da!(at + 8 <= buf.len());
unsafe { raw::load64_raw(buf.as_ptr(), at) }
}
#[cfg_attr(feature = "verus", verus_verify(external_body))]
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires at < buf@.len(),
ensures ret == buf@[at as int],
))]
#[inline(always)]
#[cfg(target_pointer_width = "64")]
unsafe fn load8(buf: &[u8], at: usize) -> u8 {
da!(at < buf.len());
unsafe { raw::load8_raw(buf.as_ptr(), at) }
}
#[cfg_attr(feature = "verus", verus_verify)]
trait Tail<const PAD: usize> {
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires at < end, end + PAD <= buf@.len(),
ensures forall |j: int| 0 <= j < 8 && at + j < end
==> #[trigger] byte64(ret, j) == buf@[at + j],
))]
fn load64(buf: &[u8], at: usize, end: usize) -> u64;
}
#[cfg_attr(feature = "verus", verus_verify)]
struct SafeTail;
#[cfg_attr(feature = "verus", verus_verify)]
impl Tail<0> for SafeTail {
#[cfg_attr(feature = "verus", verus_verify(external_body))]
#[inline(always)]
fn load64(buf: &[u8], at: usize, end: usize) -> u64 {
da!(at < end && end <= buf.len());
if at + 8 <= end {
unsafe { load64(buf, at) }
} else {
let mut tmp = [0u8; 8];
tmp[..end - at].copy_from_slice(&buf[at..end]);
u64::from_le_bytes(tmp)
}
}
}
#[cfg_attr(feature = "verus", verus_verify)]
struct SlackTail;
#[cfg_attr(feature = "verus", verus_verify)]
impl Tail<SLACK> for SlackTail {
#[inline(always)]
fn load64(buf: &[u8], at: usize, end: usize) -> u64 {
da!(at < end);
let _ = end;
let ret = unsafe { load64(buf, at) };
#[cfg(feature = "verus")]
proof! {
assert forall |j: int| 0 <= j < 8 && at + j < end
implies #[trigger] byte64(ret, j) == buf@[at + j] by {
lemma_pack64_byte(buf@, at as int, j);
};
}
ret
}
}
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires
range.start <= range.end,
range.end + PAD <= buf@.len(),
ensures
ret == is_valid_utf8(buf@.subrange(range.start as int, range.end as int)),
))]
#[inline]
fn verify_impl<const PAD: usize, T: Tail<PAD>>(buf: &[u8], range: Range<usize>) -> bool {
let start = range.start;
let end = range.end;
let mut p = start;
if p == end {
#[cfg(feature = "verus")]
proof! { assert(buf@.subrange(start as int, end as int).len() == 0); }
return true;
}
p = ascii_skip::skip(buf, p, end);
if end - p >= ascii_skip::STEP {
#[cfg(feature = "verus")]
proof! { lemma_ascii_prefix_iff(buf@, start as int, p as int, end as int); }
return verify_multibyte(buf, p, end);
}
#[cfg_attr(feature = "verus", verus_spec(
invariant
start == range.start, end == range.end,
start <= p, p <= end, end + PAD <= buf@.len(),
all_ascii(buf@, start as int, p as int),
decreases end - p
))]
while end - p >= 8 {
let bytes = unsafe { load64(buf, p) };
if bytes & SIGN_BITS != 0 {
#[cfg(feature = "verus")]
proof! { lemma_ascii_prefix_iff(buf@, start as int, p as int, end as int); }
return verify_multibyte(buf, p, end);
}
#[cfg(feature = "verus")]
proof! {
lemma_signbits8(buf@, p as int);
lemma_ascii_extend(buf@, start as int, p as int, p as int + 8);
}
p += 8;
}
if p < end {
let left = end - p; let bytes = T::load64(buf, p, end);
let mask = SIGN_BITS >> ((8 - left) * 8);
if bytes & mask != 0 {
#[cfg(feature = "verus")]
proof! { lemma_ascii_prefix_iff(buf@, start as int, p as int, end as int); }
return verify_multibyte(buf, p, end);
}
#[cfg(feature = "verus")]
proof! {
assert(mask == sign_mask(left as int));
lemma_mask_zero_ascii(buf@, p as int, left as int, bytes);
lemma_ascii_extend(buf@, start as int, p as int, end as int);
lemma_ascii_valid(buf@, start as int, end as int);
}
} else {
#[cfg(feature = "verus")]
proof! { lemma_ascii_valid(buf@, start as int, end as int); }
}
true
}
#[cfg(not(target_pointer_width = "64"))]
#[inline]
fn verify_multibyte(buf: &[u8], start: usize, end: usize) -> bool {
core::str::from_utf8(&buf[start..end]).is_ok()
}
#[allow(dead_code)] const ACCEPT: u8 = 12;
#[allow(dead_code)]
const REJECT: u8 = 0;
#[rustfmt::skip]
const CLASS: [u8; 256] = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
#[rustfmt::skip]
const TRANS: [u8; 256] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12, 0,24,36,60,96,84, 0, 0, 0,48,72,
0,12, 0, 0, 0, 0, 0,12, 0,12, 0, 0,
0,24, 0, 0, 0, 0, 0,24, 0,24, 0, 0,
0, 0, 0, 0, 0, 0, 0,24, 0, 0, 0, 0,
0,24, 0, 0, 0, 0, 0, 0, 0,24, 0, 0,
0, 0, 0, 0, 0, 0, 0,36, 0,36, 0, 0,
0,36, 0, 0, 0, 0, 0,36, 0,36, 0, 0,
0,36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
];
const fn build_rows() -> [u64; 256] {
let mut row = [0u64; 256];
let mut b = 0usize;
while b < 256 {
let class = CLASS[b] as usize;
let mut s = 0usize;
while s < 9 {
let next = (TRANS[s * 12 + class] / 12) as u64 * 6;
row[b] |= next << (s * 6);
s += 1;
}
b += 1;
}
row
}
const ROW: [u64; 256] = build_rows();
mod state {
pub const REJECT: u64 = 0;
pub const ACCEPT: u64 = 6;
pub const C1: u64 = 12;
pub const C2: u64 = 18;
pub const E0: u64 = 24;
pub const ED: u64 = 30;
pub const F0: u64 = 36;
pub const C3: u64 = 42;
pub const F4: u64 = 48;
}
#[cfg_attr(feature = "verus", verus_verify)]
const DFA_ACCEPT: u64 = 6;
#[cfg_attr(feature = "verus", verus_verify)]
const DFA_REJECT: u64 = 0;
const _: [(); 0] = [(); (DFA_ACCEPT != state::ACCEPT) as usize];
const _: [(); 0] = [(); (DFA_REJECT != state::REJECT) as usize];
const _: [(); 0] = [(); (state::F4 + 6 > 64) as usize];
macro_rules! row_check {
($($byte:literal -> $state:path,)*) => {
$(const _: [(); 0] = [(); ((ROW[$byte] >> state::ACCEPT) & 63 != $state) as usize];)*
};
}
row_check! {
0x41 -> state::ACCEPT, 0x80 -> state::REJECT, 0xC1 -> state::REJECT,
0xC2 -> state::C1, 0xE0 -> state::E0, 0xE1 -> state::C2,
0xED -> state::ED, 0xEE -> state::C2, 0xF0 -> state::F0,
0xF1 -> state::C3, 0xF4 -> state::F4, 0xF5 -> state::REJECT,
}
#[cfg_attr(feature = "verus", verus_verify(external_body))]
#[cfg_attr(feature = "verus", verus_spec(ret =>
ensures ret == spec_row(byte),
))]
#[inline(always)]
#[cfg(target_pointer_width = "64")]
const fn row(byte: u8) -> u64 {
ROW[byte as usize]
}
#[allow(clippy::unreadable_literal, clippy::cast_possible_truncation)]
const _CHECK_SPEC_ROW: () = {
let mut b = 0usize;
while b < 256 {
let bb = b as u8;
let spec: u64 = if bb <= 0x7F {
0x0000000000000180
} else if bb <= 0x8F {
0x0012480300306000
} else if bb <= 0x9F {
0x0000492300306000
} else if bb <= 0xBF {
0x000049200C306000
} else if bb <= 0xC1 {
0
} else if bb <= 0xDF {
0x0000000000000300
} else if bb == 0xE0 {
0x0000000000000600
} else if bb <= 0xEC {
0x0000000000000480
} else if bb == 0xED {
0x0000000000000780
} else if bb <= 0xEF {
0x0000000000000480
} else if bb == 0xF0 {
0x0000000000000900
} else if bb <= 0xF3 {
0x0000000000000A80
} else if bb == 0xF4 {
0x0000000000000C00
} else {
0
};
assert!(ROW[b] == spec);
b += 1;
}
};
#[cfg_attr(feature = "verus", verus_verify)]
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires is_state(state),
ensures ret == spec_step(state, byte), is_state(ret),
))]
#[inline(always)]
#[cfg(target_pointer_width = "64")]
const fn step(state: u64, byte: u8) -> u64 {
#[cfg(feature = "verus")]
proof! { lemma_row_step(state, byte); }
(row(byte) >> (state & 63)) & 63
}
#[cfg_attr(feature = "verus", verus_verify)]
#[cfg_attr(feature = "verus", verus_spec(ret =>
requires
start < end,
end <= buf@.len(),
ensures
ret == is_valid_utf8(buf@.subrange(start as int, end as int)),
))]
#[cfg(target_pointer_width = "64")]
#[inline]
#[allow(clippy::cast_possible_truncation)] #[allow(clippy::too_many_lines)] fn verify_multibyte(buf: &[u8], start: usize, end: usize) -> bool {
let mut state = DFA_ACCEPT;
let mut p = start;
#[cfg(feature = "verus")]
proof! { assert(buf@.subrange(start as int, start as int).len() == 0); }
#[cfg_attr(feature = "verus", verus_spec(
invariant
start <= p, p <= end, end <= buf@.len(),
is_state(state),
state == run(ST_ACCEPT, buf@.subrange(start as int, p as int)),
decreases end - p
))]
while end - p >= 8 {
let w = unsafe { load64(buf, p) };
if state == DFA_ACCEPT && w & SIGN_BITS == 0 {
#[cfg(feature = "verus")]
proof! {
lemma_signbits8(buf@, p as int);
lemma_ascii_valid(buf@, p as int, p as int + 8);
lemma_run_valid(buf@.subrange(p as int, p as int + 8));
lemma_run_join(ST_ACCEPT, buf@, start as int, p as int, p as int + 8);
}
p += 8;
continue;
}
#[cfg(feature = "verus")]
proof! {
assert(w >> 0u64 == w) by (bit_vector);
lemma_chunk_snoc(buf@, start as int, p as int, w, 0, state);
}
state = step(state, w as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 1, state); }
state = step(state, (w >> 8) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 2, state); }
state = step(state, (w >> 16) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 3, state); }
state = step(state, (w >> 24) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 4, state); }
state = step(state, (w >> 32) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 5, state); }
state = step(state, (w >> 40) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 6, state); }
state = step(state, (w >> 48) as u8);
#[cfg(feature = "verus")]
proof! { lemma_chunk_snoc(buf@, start as int, p as int, w, 7, state); }
state = step(state, (w >> 56) as u8);
p += 8;
if state == DFA_REJECT {
#[cfg(feature = "verus")]
proof! {
lemma_run_join(ST_ACCEPT, buf@, start as int, p as int, end as int);
lemma_run_reject(buf@.subrange(p as int, end as int));
lemma_run_valid(buf@.subrange(start as int, end as int));
}
return false;
}
}
#[cfg_attr(feature = "verus", verus_spec(
invariant
start <= p, p <= end, end <= buf@.len(),
is_state(state),
state == run(ST_ACCEPT, buf@.subrange(start as int, p as int)),
decreases end - p
))]
while p < end {
state = step(state, unsafe { load8(buf, p) });
#[cfg(feature = "verus")]
proof! { lemma_run_snoc(ST_ACCEPT, buf@, start as int, p as int); }
p += 1;
}
#[cfg(feature = "verus")]
proof! { lemma_run_valid(buf@.subrange(start as int, end as int)); }
state == DFA_ACCEPT
}
#[cfg(test)]
mod tests;