use polars_core::prelude::StringChunked;
/// Converts the longest leading run of whole, all-ASCII chunks of `b` with
/// `convert`, storing the converted bytes in `out`.
///
/// `out` is cleared first and its capacity is grown to hold at least
/// `b.len()` bytes. Input is consumed in chunks of two machine words; the
/// scan stops at the first chunk containing a byte with the high bit set, or
/// when fewer bytes than a full chunk remain. On return `out.len()` is the
/// number of input bytes that were converted, which is always a multiple of
/// the chunk size.
fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8, out: &mut Vec<u8>) {
    const WORD_BYTES: usize = size_of::<usize>();
    const WORDS_PER_CHUNK: usize = 2;
    const CHUNK_BYTES: usize = WORD_BYTES * WORDS_PER_CHUNK;
    // A word with the top bit of every byte set; any overlap means non-ASCII.
    const HIGH_BITS: usize = usize::from_ne_bytes([0x80; WORD_BYTES]);

    out.clear();
    out.reserve(b.len());

    let mut written = 0;
    for src in b.chunks_exact(CHUNK_BYTES) {
        // OR all words of the chunk together so one mask test covers them.
        let mut seen = 0;
        for word in 0..WORDS_PER_CHUNK {
            // SAFETY: `src` is exactly `WORD_BYTES * WORDS_PER_CHUNK` bytes
            // long, so word `word` lies inside it; `read_unaligned` places no
            // alignment requirement on the pointer.
            seen |= unsafe { src.as_ptr().cast::<usize>().add(word).read_unaligned() };
        }
        if seen & HIGH_BITS != 0 {
            // Non-ASCII byte in this chunk: keep only what was written so far.
            break;
        }

        // SAFETY: `out` has length 0 and capacity for at least `b.len()`
        // bytes, and `written + CHUNK_BYTES <= b.len()` because `src` is a
        // full chunk of `b` starting at offset `written`.
        let dst = unsafe {
            out.spare_capacity_mut()
                .get_unchecked_mut(written..written + CHUNK_BYTES)
        };
        for (slot, byte) in dst.iter_mut().zip(src) {
            slot.write(convert(byte));
        }
        written += CHUNK_BYTES;
    }

    // SAFETY: the first `written` bytes of the spare capacity were
    // initialised by the loop above, and `written <= b.len() <= capacity`.
    unsafe { out.set_len(written) };
}
/// Lowercases `s` and leaves the UTF-8 bytes of the result in `buf`.
///
/// `buf` is used as a reusable scratch allocation: its previous contents are
/// discarded and, on return, it holds the lowercased string.
///
/// A leading run of ASCII chunks is handled by `convert_while_ascii`; the
/// remainder is lowercased one `char` at a time. `'Σ'` is special-cased: it
/// becomes `"ς"` when it is preceded by a cased character (skipping
/// case-ignorable characters) and not followed by one, and `"σ"` otherwise.
fn to_lowercase_helper(s: &str, buf: &mut Vec<u8>) {
    convert_while_ascii(s.as_bytes(), u8::to_ascii_lowercase, buf);

    // Number of leading bytes of `s` already lowercased by the ASCII fast path.
    let prefix_len = buf.len();

    // SAFETY: `convert_while_ascii` only advances past ASCII bytes, so
    // `prefix_len` is a char boundary within `s`.
    let rest = unsafe { s.get_unchecked(prefix_len..) };

    // SAFETY: `buf` contains only the ASCII bytes written by
    // `convert_while_ascii`, which are valid UTF-8.
    let mut lowered = unsafe { String::from_utf8_unchecked(std::mem::take(buf)) };

    for (i, c) in rest.char_indices() {
        if c == 'Σ' {
            // The sigma context must be examined on the whole input. Looking
            // only at `rest` would hide cased letters that sit in the ASCII
            // prefix, so a word-final sigma directly after a long ASCII run
            // would wrongly be mapped to "σ".
            map_uppercase_sigma(s, prefix_len + i, &mut lowered)
        } else {
            lowered.extend(c.to_lowercase());
        }
    }

    // Hand the allocation back to the caller for reuse.
    *buf = lowered.into_bytes();

    /// Appends the lowercase form of the `'Σ'` located at byte offset `i` of
    /// `from` to `to`, choosing the final form `"ς"` based on its context.
    fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
        // The `i + 2` below skips over the sigma itself.
        debug_assert!('Σ'.len_utf8() == 2);
        let is_word_final = case_ignoreable_then_cased(from[..i].chars().rev())
            && !case_ignoreable_then_cased(from[i + 2..].chars());
        to.push_str(if is_word_final { "ς" } else { "σ" });
    }

    /// Returns `true` when the first character of `iter` that is not
    /// case-ignorable is a cased character; `false` if there is none.
    fn case_ignoreable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
        #[cfg(feature = "nightly")]
        use core::unicode::{Case_Ignorable, Cased};

        #[cfg(not(feature = "nightly"))]
        use super::unicode_internals::{Case_Ignorable, Cased};
        // `skip_while(..).next()` is kept on purpose; the lint would suggest `find`.
        #[allow(clippy::skip_while_next)]
        match iter.skip_while(|&c| Case_Ignorable(c)).next() {
            Some(c) => Cased(c),
            None => false,
        }
    }
}
/// Returns a new `StringChunked` built by passing every string of `ca`
/// through `to_lowercase_helper`.
///
/// A single scratch buffer is reused for all values.
pub(super) fn to_lowercase<'a>(ca: &'a StringChunked) -> StringChunked {
    let mut scratch = Vec::new();
    ca.apply_mut(|value: &'a str| -> &'a str {
        to_lowercase_helper(value, &mut scratch);
        // SAFETY: `to_lowercase_helper` fills `scratch` with the bytes of a
        // `String`, so they are valid UTF-8. The lifetime is stretched to
        // `'a` only to satisfy the closure signature; the slice points into
        // `scratch`, which is overwritten by the next call.
        // NOTE(review): this relies on `apply_mut` consuming the returned
        // slice before invoking the closure again — confirm.
        unsafe {
            let lowered = std::str::from_utf8_unchecked(&scratch);
            std::mem::transmute::<&str, &'a str>(lowered)
        }
    })
}
/// Returns a new `StringChunked` holding the uppercased form of every string
/// of `ca`.
///
/// A leading run of ASCII chunks is converted by `convert_while_ascii`; the
/// rest of each value is uppercased one `char` at a time. A single scratch
/// buffer is reused for all values.
pub(super) fn to_uppercase<'a>(ca: &'a StringChunked) -> StringChunked {
    let mut scratch = Vec::new();
    ca.apply_mut(|value: &'a str| -> &'a str {
        convert_while_ascii(value.as_bytes(), u8::to_ascii_uppercase, &mut scratch);

        // SAFETY: `convert_while_ascii` only advances past ASCII bytes, so
        // `scratch.len()` is a char boundary within `value`.
        let tail = unsafe { value.get_unchecked(scratch.len()..) };

        // SAFETY: `scratch` contains only the ASCII bytes written by
        // `convert_while_ascii`, which are valid UTF-8.
        let mut upper = unsafe { String::from_utf8_unchecked(std::mem::take(&mut scratch)) };
        upper.extend(tail.chars().flat_map(char::to_uppercase));

        // Return the allocation to the scratch buffer for the next value.
        scratch = upper.into_bytes();

        // SAFETY: `scratch` now holds the bytes of a `String`, so they are
        // valid UTF-8. The lifetime is stretched to `'a` only to satisfy the
        // closure signature; the slice points into `scratch`, which is
        // overwritten by the next call.
        // NOTE(review): this relies on `apply_mut` consuming the returned
        // slice before invoking the closure again — confirm.
        unsafe {
            let result = std::str::from_utf8_unchecked(&scratch);
            std::mem::transmute::<&str, &'a str>(result)
        }
    })
}
/// Returns a new `StringChunked` holding the title-cased form of every
/// string of `ca`.
///
/// Each value is first lowercased with `to_lowercase_helper`. Then the first
/// character, and every character that follows a non-alphanumeric character,
/// is replaced by its uppercase mapping; all other characters are kept as
/// lowercased. Two scratch buffers are reused for all values.
#[cfg(feature = "nightly")]
pub(super) fn to_titlecase<'a>(ca: &'a StringChunked) -> StringChunked {
    let mut out_bytes = Vec::new();
    let mut lower_bytes = Vec::new();
    ca.apply_mut(|value: &'a str| -> &'a str {
        to_lowercase_helper(value, &mut lower_bytes);
        // SAFETY: `to_lowercase_helper` fills the buffer with the bytes of a
        // `String`, so they are valid UTF-8.
        let lowered = unsafe { std::str::from_utf8_unchecked(&lower_bytes) };

        // Reuse the output allocation from the previous value.
        out_bytes.clear();
        // SAFETY: the buffer was just cleared, and an empty byte vector is
        // valid UTF-8.
        let mut titled = unsafe { String::from_utf8_unchecked(std::mem::take(&mut out_bytes)) };

        // True at the start of the value and after any non-alphanumeric char.
        let mut at_word_start = true;
        for ch in lowered.chars() {
            if at_word_start {
                titled.extend(ch.to_uppercase());
            } else {
                titled.push(ch);
            }
            at_word_start = !ch.is_alphanumeric();
        }
        out_bytes = titled.into_bytes();

        // SAFETY: `out_bytes` holds the bytes of a `String`, so they are
        // valid UTF-8. The lifetime is stretched to `'a` only to satisfy the
        // closure signature; the slice points into `out_bytes`, which is
        // overwritten by the next call.
        // NOTE(review): this relies on `apply_mut` consuming the returned
        // slice before invoking the closure again — confirm.
        unsafe {
            let result = std::str::from_utf8_unchecked(&out_bytes);
            std::mem::transmute::<&str, &'a str>(result)
        }
    })
}