use crate::atomic::{PyAtomic, Radium};
use crate::format::CharLen;
use crate::wtf8::{CodePoint, Wtf8, Wtf8Buf};
use ascii::{AsciiChar, AsciiStr, AsciiString};
use core::fmt;
use core::ops::{Bound, RangeBounds};
use core::sync::atomic::Ordering::Relaxed;
/// Wide-character type on every target except `wasm32`: an alias for `libc::wchar_t`.
#[cfg(not(target_arch = "wasm32"))]
#[allow(non_camel_case_types)]
pub type wchar_t = libc::wchar_t;
/// Wide-character type on `wasm32`, where it is defined directly as `u32`.
#[cfg(target_arch = "wasm32")]
#[allow(non_camel_case_types)]
pub type wchar_t = u32;
/// Classification of what a string's contents are guaranteed to be.
///
/// The derived ordering follows declaration order: `Ascii < Utf8 < Wtf8`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum StrKind {
/// Only ASCII code points.
Ascii,
/// Only code points that are valid `char`s.
Utf8,
/// Any code point, including ones that have no `char` representation.
Wtf8,
}
impl core::ops::BitOr for StrKind {
type Output = Self;
fn bitor(self, other: Self) -> Self {
use StrKind::*;
match (self, other) {
(Wtf8, _) | (_, Wtf8) => Wtf8,
(Utf8, _) | (_, Utf8) => Utf8,
(Ascii, Ascii) => Ascii,
}
}
}
impl StrKind {
    /// Returns `true` only for [`StrKind::Ascii`].
    pub const fn is_ascii(&self) -> bool {
        match self {
            Self::Ascii => true,
            Self::Utf8 | Self::Wtf8 => false,
        }
    }

    /// Returns `true` for kinds that are valid UTF-8, i.e. `Ascii` and `Utf8`.
    pub const fn is_utf8(&self) -> bool {
        match self {
            Self::Ascii | Self::Utf8 => true,
            Self::Wtf8 => false,
        }
    }

    /// Reports whether a string of this kind may contain `code` without
    /// having to be widened to a less restrictive kind.
    #[inline(always)]
    pub fn can_encode(&self, code: CodePoint) -> bool {
        match self {
            Self::Wtf8 => true,
            Self::Utf8 => code.to_char().is_some(),
            Self::Ascii => code.is_ascii(),
        }
    }
}
/// Implemented by string-like types that can report the [`StrKind`] of their contents.
pub trait DeduceStrKind {
/// Returns the kind describing the current contents of `self`.
fn str_kind(&self) -> StrKind;
}
impl DeduceStrKind for str {
    /// A `str` is always valid UTF-8, so the only question is whether it is
    /// pure ASCII.
    fn str_kind(&self) -> StrKind {
        if !self.is_ascii() {
            return StrKind::Utf8;
        }
        StrKind::Ascii
    }
}
impl DeduceStrKind for Wtf8 {
    /// Picks the most restrictive kind that still describes the data:
    /// ASCII first, then UTF-8, falling back to WTF-8.
    fn str_kind(&self) -> StrKind {
        if self.is_ascii() {
            return StrKind::Ascii;
        }
        if self.is_utf8() {
            return StrKind::Utf8;
        }
        StrKind::Wtf8
    }
}
impl DeduceStrKind for String {
    /// Forwards to the `str` implementation.
    fn str_kind(&self) -> StrKind {
        <str as DeduceStrKind>::str_kind(self.as_str())
    }
}
impl DeduceStrKind for Wtf8Buf {
fn str_kind(&self) -> StrKind {
(**self).str_kind()
}
}
impl<T: DeduceStrKind + ?Sized> DeduceStrKind for &T {
    /// Forwards to the implementation of the referenced value.
    fn str_kind(&self) -> StrKind {
        T::str_kind(*self)
    }
}
impl<T: DeduceStrKind + ?Sized> DeduceStrKind for Box<T> {
    /// Forwards to the implementation of the boxed value.
    fn str_kind(&self) -> StrKind {
        T::str_kind(&**self)
    }
}
/// A borrowed view of string data, typed according to its [`StrKind`].
#[derive(Debug)]
pub enum PyKindStr<'a> {
/// The data is pure ASCII.
Ascii(&'a AsciiStr),
/// The data is valid UTF-8.
Utf8(&'a str),
/// The data is only guaranteed to be WTF-8.
Wtf8(&'a Wtf8),
}
/// Owned string storage that remembers its [`StrKind`] and caches its length
/// in code points.
#[derive(Debug, Clone)]
pub struct StrData {
// The raw string contents.
data: Box<Wtf8>,
// What `data` is known to contain; the unsafe accessors rely on this being accurate.
kind: StrKind,
// Cached code-point count; holds `usize::MAX` until it is first computed.
len: StrLen,
}
/// Atomically updatable code-point count; `usize::MAX` is the "not yet computed" sentinel.
struct StrLen(PyAtomic<usize>);
impl From<usize> for StrLen {
    /// Wraps `value` in a fresh atomic cell.
    #[inline(always)]
    fn from(value: usize) -> Self {
        let cell = Radium::new(value);
        Self(cell)
    }
}
impl fmt::Debug for StrLen {
    /// Prints the cached length, or `<uncomputed>` while the sentinel value
    /// is still stored.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.0.load(Relaxed) {
            usize::MAX => f.write_str("<uncomputed>"),
            len => len.fmt(f),
        }
    }
}
impl StrLen {
    /// Length of an empty string.
    #[inline(always)]
    fn zero() -> Self {
        Self::from(0usize)
    }

    /// Sentinel meaning "the length has not been counted yet".
    #[inline(always)]
    fn uncomputed() -> Self {
        Self::from(usize::MAX)
    }
}
impl Clone for StrLen {
fn clone(&self) -> Self {
Self(self.0.load(Relaxed).into())
}
}
impl Default for StrData {
    /// The empty string: default (empty) data, ASCII kind, zero length.
    fn default() -> Self {
        let data: Box<Wtf8> = Default::default();
        Self {
            data,
            kind: StrKind::Ascii,
            len: StrLen::zero(),
        }
    }
}
impl From<Box<Wtf8>> for StrData {
fn from(value: Box<Wtf8>) -> Self {
let kind = value.str_kind();
unsafe { Self::new_str_unchecked(value, kind) }
}
}
impl From<Box<str>> for StrData {
#[inline]
fn from(value: Box<str>) -> Self {
let kind = value.str_kind();
unsafe { Self::new_str_unchecked(value.into(), kind) }
}
}
impl From<Box<AsciiStr>> for StrData {
#[inline]
fn from(value: Box<AsciiStr>) -> Self {
Self {
len: value.len().into(),
data: value.into(),
kind: StrKind::Ascii,
}
}
}
impl From<AsciiChar> for StrData {
fn from(ch: AsciiChar) -> Self {
AsciiString::from(ch).into_boxed_ascii_str().into()
}
}
impl From<char> for StrData {
fn from(ch: char) -> Self {
if let Ok(ch) = ascii::AsciiChar::from_ascii(ch) {
ch.into()
} else {
Self {
data: ch.to_string().into(),
kind: StrKind::Utf8,
len: 1.into(),
}
}
}
}
impl From<CodePoint> for StrData {
fn from(ch: CodePoint) -> Self {
if let Some(ch) = ch.to_char() {
ch.into()
} else {
Self {
data: Wtf8Buf::from(ch).into(),
kind: StrKind::Wtf8,
len: 1.into(),
}
}
}
}
impl StrData {
    /// Creates string data without verifying that `kind` matches `data`.
    ///
    /// The character length is recorded immediately for ASCII data (it equals
    /// the byte length) and left uncomputed otherwise.
    ///
    /// # Safety
    ///
    /// `kind` must accurately describe `data`; `as_str`, `as_ascii` and
    /// `as_str_kind` perform unchecked conversions based on it.
    pub unsafe fn new_str_unchecked(data: Box<Wtf8>, kind: StrKind) -> Self {
        let len = if kind.is_ascii() {
            StrLen::from(data.len())
        } else {
            StrLen::uncomputed()
        };
        Self { data, kind, len }
    }

    /// Creates string data with a caller-supplied kind and character length.
    ///
    /// # Safety
    ///
    /// `kind` must accurately describe `data`. `char_len` is stored as the
    /// cached length and returned by `char_len()` as-is (a value of
    /// `usize::MAX` is treated as "uncomputed").
    pub unsafe fn new_with_char_len(data: Box<Wtf8>, kind: StrKind, char_len: usize) -> Self {
        let len = StrLen::from(char_len);
        Self { data, kind, len }
    }

    /// Borrows the contents as WTF-8; always available.
    #[inline]
    pub const fn as_wtf8(&self) -> &Wtf8 {
        &self.data
    }

    /// Borrows the contents as `str` when the kind is `Ascii` or `Utf8`.
    #[inline]
    pub fn as_str(&self) -> Option<&str> {
        if !self.kind.is_utf8() {
            return None;
        }
        // SAFETY: relies on the invariant that `kind` accurately describes
        // `data`, so a UTF-8 kind means the bytes are valid UTF-8.
        Some(unsafe { core::str::from_utf8_unchecked(self.data.as_bytes()) })
    }

    /// Borrows the contents as `AsciiStr` when the kind is `Ascii`.
    pub fn as_ascii(&self) -> Option<&AsciiStr> {
        if !self.kind.is_ascii() {
            return None;
        }
        // SAFETY: relies on the invariant that `kind` accurately describes
        // `data`, so an ASCII kind means every byte is ASCII.
        Some(unsafe { AsciiStr::from_ascii_unchecked(self.data.as_bytes()) })
    }

    /// The kind recorded for this string.
    pub const fn kind(&self) -> StrKind {
        self.kind
    }

    /// Borrows the contents as the most specific view permitted by the kind.
    #[inline]
    pub fn as_str_kind(&self) -> PyKindStr<'_> {
        let bytes = self.data.as_bytes();
        match self.kind {
            StrKind::Wtf8 => PyKindStr::Wtf8(&self.data),
            // SAFETY (both arms): `kind` is required to describe `data`.
            StrKind::Utf8 => PyKindStr::Utf8(unsafe { core::str::from_utf8_unchecked(bytes) }),
            StrKind::Ascii => PyKindStr::Ascii(unsafe { AsciiStr::from_ascii_unchecked(bytes) }),
        }
    }

    /// Length of the underlying data as reported by `Wtf8::len` (not the
    /// cached character count; see `char_len`).
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Whether the underlying data is empty.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Number of code points, computed on first use and cached afterwards.
    #[inline]
    pub fn char_len(&self) -> usize {
        let cached = self.len.0.load(Relaxed);
        if cached == usize::MAX {
            self._compute_char_len()
        } else {
            cached
        }
    }

    /// Counts the code points and stores the result in the cache.
    #[cold]
    fn _compute_char_len(&self) -> usize {
        let counted = match self.as_str() {
            Some(text) => text.chars().count(),
            None => self.data.code_points().count(),
        };
        self.len.0.store(counted, Relaxed);
        counted
    }

    /// Returns the code point at character position `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not a valid character position.
    pub fn nth_char(&self, index: usize) -> CodePoint {
        match self.as_str_kind() {
            PyKindStr::Ascii(ascii) => ascii[index].into(),
            PyKindStr::Utf8(text) => text.chars().nth(index).unwrap().into(),
            PyKindStr::Wtf8(wtf8) => wtf8.code_points().nth(index).unwrap(),
        }
    }
}
/// Formats the string by delegating to the formatting of the underlying data.
impl core::fmt::Display for StrData {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
// NOTE(review): which `fmt` trait method this call resolves to depends on the
// traits in scope in this module — presumably `Display`; confirm.
self.data.fmt(f)
}
}
/// Exposes the cached code-point count through the `CharLen` trait.
impl CharLen for StrData {
fn char_len(&self) -> usize {
// Method-call syntax prefers the inherent `StrData::char_len` over this
// trait method, so this delegates rather than recursing.
self.char_len()
}
}
/// Returns the substring of `s` selected by `range`, where the bounds are
/// measured in characters rather than bytes.
///
/// Returns `None` when the range does not fit inside `s`. This includes
/// ranges whose end precedes their start and bounds whose arithmetic would
/// overflow `usize`; those cases used to trip an integer-overflow panic in
/// debug builds.
pub fn try_get_chars(s: &str, range: impl RangeBounds<usize>) -> Option<&str> {
    let start = match range.start_bound() {
        Bound::Included(&i) => i,
        Bound::Excluded(&i) => i.checked_add(1)?,
        Bound::Unbounded => 0,
    };
    // Skip the first `start` characters; bail out if `s` is too short.
    let mut chars = s.chars();
    for _ in 0..start {
        chars.next()?;
    }
    let tail = chars.as_str();
    // Number of characters to keep, computed without overflow or underflow.
    let range_len = match range.end_bound() {
        Bound::Included(&i) => i.checked_add(1)?.checked_sub(start)?,
        Bound::Excluded(&i) => i.checked_sub(start)?,
        Bound::Unbounded => return Some(tail),
    };
    char_range_end(tail, range_len).map(|end| &tail[..end])
}
/// Panicking counterpart of [`try_get_chars`].
///
/// # Panics
///
/// Panics if `range` does not select a valid character range of `s`.
pub fn get_chars(s: &str, range: impl RangeBounds<usize>) -> &str {
    let selected = try_get_chars(s, range);
    selected.unwrap()
}
/// Returns the byte offset just past the first `n_chars` characters of `s`.
///
/// Yields `Some(0)` for `n_chars == 0` and `None` when `s` contains fewer
/// than `n_chars` characters.
#[inline]
pub fn char_range_end(s: &str, n_chars: usize) -> Option<usize> {
    if n_chars == 0 {
        return Some(0);
    }
    // Locate the last character of the range and step past its encoding.
    s.char_indices()
        .nth(n_chars - 1)
        .map(|(offset, ch)| offset + ch.len_utf8())
}
/// Returns the part of `w` selected by `range`, where the bounds are measured
/// in code points rather than bytes.
///
/// Returns `None` when the range does not fit inside `w`. This includes
/// ranges whose end precedes their start and bounds whose arithmetic would
/// overflow `usize`; those cases used to trip an integer-overflow panic in
/// debug builds.
pub fn try_get_codepoints(w: &Wtf8, range: impl RangeBounds<usize>) -> Option<&Wtf8> {
    let start = match range.start_bound() {
        Bound::Included(&i) => i,
        Bound::Excluded(&i) => i.checked_add(1)?,
        Bound::Unbounded => 0,
    };
    // Skip the first `start` code points; bail out if `w` is too short.
    let mut code_points = w.code_points();
    for _ in 0..start {
        code_points.next()?;
    }
    let tail = code_points.as_wtf8();
    // Number of code points to keep, computed without overflow or underflow.
    let range_len = match range.end_bound() {
        Bound::Included(&i) => i.checked_add(1)?.checked_sub(start)?,
        Bound::Excluded(&i) => i.checked_sub(start)?,
        Bound::Unbounded => return Some(tail),
    };
    codepoint_range_end(tail, range_len).map(|end| &tail[..end])
}
/// Panicking counterpart of [`try_get_codepoints`].
///
/// # Panics
///
/// Panics if `range` does not select a valid code-point range of `w`.
pub fn get_codepoints(w: &Wtf8, range: impl RangeBounds<usize>) -> &Wtf8 {
    let selected = try_get_codepoints(w, range);
    selected.unwrap()
}
/// Returns the byte offset just past the first `n_chars` code points of `s`.
///
/// Yields `Some(0)` for `n_chars == 0` and `None` when `s` contains fewer
/// than `n_chars` code points.
#[inline]
pub fn codepoint_range_end(s: &Wtf8, n_chars: usize) -> Option<usize> {
    if n_chars == 0 {
        return Some(0);
    }
    // Locate the last code point of the range and step past its encoding.
    s.code_point_indices()
        .nth(n_chars - 1)
        .map(|(offset, cp)| offset + cp.len_wtf8())
}
/// Left-pads `bytes` with ASCII `'0'` up to `width` bytes.
///
/// A leading `+` or `-` stays in front of the inserted zeros. When `bytes` is
/// already at least `width` long it is returned as an unchanged copy.
pub fn zfill(bytes: &[u8], width: usize) -> Vec<u8> {
    if width <= bytes.len() {
        return bytes.to_vec();
    }
    // Split off an optional one-byte sign prefix without any `unsafe`.
    let sign_len = usize::from(matches!(bytes.first(), Some(b'+' | b'-')));
    let (sign, digits) = bytes.split_at(sign_len);

    // The result is exactly `width` bytes long, so allocate once.
    let mut filled = Vec::with_capacity(width);
    filled.extend_from_slice(sign);
    // Grow to "sign + zeros"; whatever remains of `width` is filled by `digits`.
    filled.resize(width - digits.len(), b'0');
    filled.extend_from_slice(digits);
    filled
}
/// Renders `value` as an ASCII string, replacing every non-ASCII code point
/// with a backslash escape: `\xNN` below 0x100, `\uNNNN` below 0x10000 and
/// `\UNNNNNNNN` otherwise.
pub fn to_ascii(value: &Wtf8) -> AsciiString {
    let mut out: Vec<u8> = Vec::new();
    for cp in value.code_points() {
        let c = cp.to_u32();
        if cp.is_ascii() {
            out.push(c as u8);
            continue;
        }
        let escaped = match c {
            0..=0xff => format!("\\x{c:02x}"),
            0x100..=0xffff => format!("\\u{c:04x}"),
            _ => format!("\\U{c:08x}"),
        };
        out.extend_from_slice(escaped.as_bytes());
    }
    // SAFETY: only ASCII code points and ASCII escape sequences were pushed.
    unsafe { AsciiString::from_ascii_unchecked(out) }
}
/// Display adapter that prints a code point as a backslash escape
/// (`\xNN`, `\uNNNN` or `\UNNNNNNNN`, chosen by magnitude).
#[derive(Clone, Copy)]
pub struct UnicodeEscapeCodepoint(pub CodePoint);
impl fmt::Display for UnicodeEscapeCodepoint {
    /// Writes the shortest escape form that fits the code point's value.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.0.to_u32() {
            c @ 0..=0xff => write!(f, "\\x{c:02x}"),
            c @ 0x100..=0xffff => write!(f, "\\u{c:04x}"),
            c => write!(f, "\\U{c:08x}"),
        }
    }
}
/// Bounded, case-aware edit distance between byte strings.
pub mod levenshtein {
    /// Cost of an insertion, a deletion, or a substitution between unrelated bytes.
    pub const MOVE_COST: usize = 2;
    /// Cost of substituting a letter by the same letter in the other case.
    const CASE_COST: usize = 1;
    /// Longest trimmed input the fixed-size work buffer can hold.
    const MAX_STRING_SIZE: usize = 40;

    /// Cost of replacing byte `a` with byte `b`: 0 when equal, `CASE_COST`
    /// when they differ only by ASCII case, `MOVE_COST` otherwise.
    const fn substitution_cost(a: u8, b: u8) -> usize {
        if a == b {
            return 0;
        }
        // Bytes that differ in their low five bits cannot be a case flip.
        if (a & 31) != (b & 31) {
            return MOVE_COST;
        }
        if a.to_ascii_lowercase() == b.to_ascii_lowercase() {
            CASE_COST
        } else {
            MOVE_COST
        }
    }

    /// Computes the weighted edit distance between `a` and `b`.
    ///
    /// Returns `max_cost + 1` as soon as the distance is known to exceed
    /// `max_cost`, or when either input (after stripping the common prefix
    /// and suffix) is longer than `MAX_STRING_SIZE`.
    pub fn levenshtein_distance(a: &[u8], b: &[u8], max_cost: usize) -> usize {
        if a == b {
            return 0;
        }

        // Drop the shared prefix, then the shared suffix of what is left.
        let prefix = a.iter().zip(b).take_while(|(x, y)| x == y).count();
        let (a, b) = (&a[prefix..], &b[prefix..]);
        let suffix = a
            .iter()
            .rev()
            .zip(b.iter().rev())
            .take_while(|(x, y)| x == y)
            .count();
        let (a, b) = (&a[..a.len() - suffix], &b[..b.len() - suffix]);

        if a.is_empty() || b.is_empty() {
            return (a.len() + b.len()) * MOVE_COST;
        }
        if a.len() > MAX_STRING_SIZE || b.len() > MAX_STRING_SIZE {
            return max_cost + 1;
        }

        // Keep the shorter string along the buffer axis.
        let (short, long) = if b.len() < a.len() { (b, a) } else { (a, b) };
        if (long.len() - short.len()) * MOVE_COST > max_cost {
            return max_cost + 1;
        }

        // Row zero: cost of building each prefix of `short` from nothing.
        let mut row = [0usize; MAX_STRING_SIZE];
        for (i, cell) in row[..short.len()].iter_mut().enumerate() {
            *cell = (i + 1) * MOVE_COST;
        }

        let mut result = 0usize;
        for (long_index, &long_byte) in long.iter().enumerate() {
            result = long_index * MOVE_COST;
            // `diagonal` carries the previous row's value one column to the left.
            let mut diagonal = result;
            let mut row_minimum = usize::MAX;
            for (cell, &short_byte) in row.iter_mut().zip(short) {
                let substitute = diagonal + substitution_cost(long_byte, short_byte);
                diagonal = *cell;
                let insert_delete = result.min(diagonal) + MOVE_COST;
                result = insert_delete.min(substitute);
                *cell = result;
                row_minimum = row_minimum.min(result);
            }
            // Every entry in this row is already too expensive: give up early.
            if row_minimum > max_cost {
                return max_cost + 1;
            }
        }
        result
    }
}
/// Replaces every tab in `input` with spaces up to the next multiple of
/// `tab_size` columns.
///
/// The column counter restarts after each `'\r'` or `'\n'`. With
/// `tab_size == 0` tabs are removed entirely; previously that case underflowed
/// `tab_size - col_count` as soon as a tab followed any other character.
pub fn expandtabs(input: &str, tab_size: usize) -> String {
    let mut expanded = String::with_capacity(input.len());
    let mut column = 0usize;
    for ch in input.chars() {
        match ch {
            '\t' => {
                // A zero tab size means tabs expand to nothing.
                if tab_size > 0 {
                    let padding = tab_size - column % tab_size;
                    expanded.extend(core::iter::repeat(' ').take(padding));
                    column += padding;
                }
            }
            '\r' | '\n' => {
                expanded.push(ch);
                column = 0;
            }
            _ => {
                expanded.push(ch);
                column += 1;
            }
        }
    }
    expanded
}
/// Converts a string expression into an `AsciiStr` reference, checking at
/// compile time that it is ASCII.
///
/// The argument is evaluated inside a `const` block whose assertion fails the
/// build with "ascii!() argument is not an ascii string" for non-ASCII input.
#[macro_export]
macro_rules! ascii {
($x:expr $(,)?) => {{
// Evaluate and validate the argument in const context.
let s = const {
let s: &str = $x;
assert!(s.is_ascii(), "ascii!() argument is not an ascii string");
s
};
// SAFETY: the const assertion above guarantees that `s` is ASCII.
unsafe { $crate::vendored::ascii::AsciiStr::from_ascii_unchecked(s.as_bytes()) }
}};
}
pub use ascii;
const UNICODE_DECIMAL_VALUES: &[char] = &[
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '٠', '٥', '٢', '٣', '٤', 'ټ', 'ٌ', '٧', '٨',
'ŮŠ', '۰', 'Űą', '۲', 'Űł', 'Ű´', 'Űľ', 'Űś', 'ۡ', '۸', 'Űš', 'ß', 'ß', 'ß', 'ß', 'ß', 'ß
', 'ß', 'ß',
'ß', 'ß', '༌', '༧', '༨', '༊', '༪', '།', '༏', 'ŕĽ', '༎', '༯', 'ŕ§Ś', 'ŕ§§', '২', 'ŕ§Š', 'ŕ§Ş', 'ŕ§Ť', 'ŕ§Ź',
'ŕ§', 'ŕ§Ž', 'ŕ§Ż', '੦', '੧', '੨', '੩', '੪', '੫', '੬', 'ŕŠ', '੮', '੯', '૦', '૧', '૨', '૩', '૪', '૫',
'૬', 'ŕŤ', '૮', '૯', 'ŕŚ', 'ŕ§', 'ŕ¨', 'ŕŠ', 'ŕŞ', 'ŕŤ', 'ŕŹ', 'ŕ', 'ŕŽ', 'ŕŻ', 'ௌ', '௧', '௨', 'ொ', '௪',
'்', '௏', 'ŕŻ', '௎', '௯', '์', 'ŕą§', '๨', '๊', '๪', 'ํ', '๏', 'ŕą', '๎', '๯', 'ೌ', 'ŕł§', '೨', 'ೊ',
'೪', '್', '೏', 'ŕł', '೎', '೯', 'ྌ', 'ŕľ§', 'ྨ', 'ྊ', 'ྪ', 'ྍ', 'ྏ', 'ŕľ', 'ྎ', 'ྯ', 'ࡌ', 'ࡧ', 'ࡨ',
'ࡊ', 'ࡪ', 'ࡍ', 'ࡏ', 'ŕˇ', 'ࡎ', '࡯', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕš', 'ŕť', 'ŕť',
'ŕť', 'ŕť', 'ŕť', 'ŕť', 'ŕť', 'ŕť', 'ŕť', 'ŕť', 'ŕź ', '༡', '༢', '༣', '༤', '༥', '༦', 'ŕź§', '༨', '༩', 'á',
'á', 'á', 'á', 'á', 'á
', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á',
'á ', 'áĄ', 'á˘', 'áŁ', 'á¤', 'áĽ', 'áŚ', 'á§', 'á¨', 'áŠ', 'á ', 'á ', 'á ', 'á ', 'á ', 'á ', 'á ', 'á ', 'á ',
'á ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'áĽ', 'á§', 'á§', 'á§', 'á§', 'á§', 'á§', 'á§', 'á§',
'á§', 'á§', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ
', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ', 'áŞ',
'áŞ', 'áŞ', 'áŞ', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'á', 'Ꮀ', 'Ꮉ', 'Ꮂ', 'Ꮃ', 'Ꮄ', 'Ꮎ',
'᎜', 'Ꭱ', 'Ꮈ', '᎚', 'áą', 'áą', 'áą', 'áą', 'áą', 'áą
', 'áą', 'áą', 'áą', 'áą', 'áą', 'áą', 'áą', 'áą', 'áą',
'áą', 'áą', 'áą', 'áą', 'áą', 'ę ', 'ęĄ', 'ę˘', 'ęŁ', 'ę¤', 'ęĽ', 'ęŚ', 'ę§', 'ę¨', 'ęŠ', 'ęŁ', 'ęŁ', 'ęŁ', 'ęŁ',
'ęŁ', 'ęŁ', 'ęŁ', 'ęŁ', 'ęŁ', 'ęŁ', 'ę¤', 'ę¤', 'ę¤', 'ę¤', 'ę¤', 'ę¤
', 'ę¤', 'ę¤', 'ę¤', 'ę¤', 'ę§', 'ę§', 'ę§',
'ę§', 'ę§', 'ę§', 'ę§', 'ę§', 'ę§', 'ę§', 'ę§°', 'ę§ą', '꧲', 'ę§ł', 'ę§´', 'ę§ľ', 'ę§ś', 'ꧡ', '꧸', 'ę§š', 'ęŠ', 'ęŠ',
'ęŠ', 'ęŠ', 'ęŠ', 'ęŠ', 'ęŠ', 'ęŠ', 'ęŠ', 'ęŠ', '꯰', '꯹', '꯲', '꯳', '꯴', '꯾', 'ꯜ', 'ꯡ', '꯸', 'ꯚ', 'ďź',
'ďź', 'ďź', 'ďź', 'ďź', 'ďź', 'ďź', 'ďź', 'ďź', 'ďź', 'đ ', 'đĄ', 'đ˘', 'đŁ', 'đ¤', 'đĽ', 'đŚ', 'đ§',
'đ¨', 'đŠ', 'đŚ', 'đ§', 'đ¨', 'đŠ', 'đŞ', 'đŤ', 'đŹ', 'đ', 'đŽ', 'đŻ', 'đ°', 'đą', 'đ˛', 'đł', 'đ´', 'đľ', 'đś',
'đˇ', 'đ¸', 'đš', 'đś', 'đˇ', 'đ¸', 'đš', 'đş', 'đť', 'đź', 'đ˝', 'đž', 'đż', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ',
'đ', 'đ', 'đ', 'đ', 'đ°', 'đą', 'đ˛', 'đł', 'đ´', 'đľ', 'đś', 'đˇ', 'đ¸', 'đš', 'đ', 'đ', 'đ', 'đ', 'đ',
'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ',
'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ
', 'đ', 'đ', 'đ', 'đ', 'đ°', 'đą', 'đ˛',
'đł', 'đ´', 'đľ', 'đś', 'đˇ', 'đ¸', 'đš', 'đŁ ', 'đŁĄ', 'đŁ˘', 'đŁŁ', 'đ٤', 'đŁĽ', 'đŁŚ', 'đ٧', 'đٍ', 'đŁŠ', 'đą', 'đą',
'đą', 'đą', 'đą', 'đą', 'đą', 'đą', 'đą', 'đą', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đľ', 'đŠ ',
'đŠĄ', 'đŠ˘', 'đŠŁ', 'đФ', 'đŠĽ', 'đŠŚ', 'đЧ', 'đЍ', 'đŠŠ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ',
'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ', 'đ ',
'đĄ', 'đ˘', 'đŁ', 'đ¤', 'đĽ', 'đŚ', 'đ§', 'đ¨', 'đŠ', 'đŞ', 'đŤ', 'đŹ', 'đ', 'đŽ', 'đŻ', 'đ°', 'đą', 'đ˛', 'đł',
'đ´', 'đľ', 'đś', 'đˇ', 'đ¸', 'đš', 'đş', 'đť', 'đź', 'đ˝', 'đž', 'đż', 'đĽ', 'đĽ', 'đĽ', 'đĽ', 'đĽ', 'đĽ', 'đĽ',
'đĽ', 'đĽ', 'đĽ',
];
/// Looks `ch` up in `UNICODE_DECIMAL_VALUES` and, when present, returns its
/// position in the table modulo 10.
///
/// NOTE(review): this assumes the table is sorted and laid out in runs of ten
/// digits (0 through 9) — confirm against the table's generator.
pub fn char_to_decimal(ch: char) -> Option<u8> {
    match UNICODE_DECIMAL_VALUES.binary_search(&ch) {
        Ok(position) => Some((position % 10) as u8),
        Err(_) => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `get_chars` must index by character, not by byte: the same range has
    /// to select four characters regardless of how wide each one is in UTF-8.
    #[test]
    fn test_get_chars() {
        // One byte per character: character and byte ranges coincide.
        let s = "0123456789";
        assert_eq!(get_chars(s, 3..7), "3456");
        assert_eq!(get_chars(s, 3..7), &s[3..7]);

        // Three-byte characters (literals restored from mis-decoded UTF-8).
        let s = "0유니코드 문자열9";
        assert_eq!(get_chars(s, 3..7), "코드 문");

        // Four-byte characters (literals restored from mis-decoded UTF-8).
        let s = "0😀😃😄😁😆😅😂🤣9";
        assert_eq!(get_chars(s, 3..7), "😄😁😆😅");
    }
}