use crate::core::{is_unique, RUMResult, RUMVec};
use crate::types::RUMBuffer;
use chardetng::EncodingDetector;
pub use compact_str::{
format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
};
use encoding_rs::Encoding;
use std::cmp::min;
use unicode_segmentation::UnicodeSegmentation;
/// NOTE(review): presumably the length of a fixed-width escape sequence such as `\uXXXX`;
/// it is not referenced anywhere in the visible code — confirm before relying on it.
const ESCAPED_STRING_WINDOW: usize = 6;
/// Backslash: the character that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character treated as printable ASCII (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character treated as printable ASCII (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string slice.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some("")`, i.e. an option that holds the empty string (not `None`).
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// Every ASCII character from space through tilde, in code-point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
/// Owned string type used throughout this module (alias of `CompactString`).
pub type RUMString = CompactString;
/// A `(from, to)` replacement pair applied after default escaping (see `basic_escape`).
pub type EscapeException<'a> = (&'a str, &'a str);
/// Borrowed list of `EscapeException` pairs.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
/// A single grapheme cluster, borrowed from the source string.
pub type Grapheme<'a> = &'a str;
/// Owned vector of graphemes that all borrow from the same source string.
pub type GraphemeStringView<'a> = RUMVec<Grapheme<'a>>;
/// Borrowed sequence of graphemes used as a search pattern.
pub type GraphemePattern<'a> = &'a [Grapheme<'a>];
/// Borrowed slice of graphemes; `'b` is the borrow of the container, `'a` the source string.
pub type GraphemeSlice<'b, 'a> = &'b [Grapheme<'a>];
/// A `(left, right)` pair of grapheme patterns.
pub type GraphemePatternPair<'a> = (GraphemePattern<'a>, GraphemePattern<'a>);
/// A window (`start..end`) over a vector of grapheme clusters borrowed from a string.
///
/// Operations such as trimming and truncation produce a new value that clones the
/// grapheme vector and adjusts the window bounds.
#[derive(Default, Debug, PartialEq, Clone)]
pub struct GraphemeStr<'a> {
// All graphemes of the source string (or of the view passed to `from_view`).
view: GraphemeStringView<'a>,
// Index into `view` of the first grapheme inside the window.
start: usize,
// Index into `view` one past the last grapheme inside the window.
end: usize,
}
impl<'a> GraphemeStr<'a> {
    /// Splits `string` into extended grapheme clusters and returns a window covering all of them.
    pub fn from(string: &'a str) -> Self {
        let view = string.graphemes(true).collect::<GraphemeStringView>();
        Self::from_view(view)
    }

    /// Wraps an existing grapheme vector in a window spanning the whole vector.
    pub fn from_view(view: GraphemeStringView<'a>) -> Self {
        let start = 0;
        let end = view.len();
        Self { view, start, end }
    }

    /// Returns the grapheme at `index` of the underlying vector.
    ///
    /// The index is NOT relative to the window start.
    ///
    /// # Panics
    /// Panics if `index` is out of bounds of the underlying vector.
    pub fn at(&self, index: usize) -> Grapheme<'a> {
        self.view[index]
    }

    /// Applies `trim_left` with the first pattern of the pair, then `trim_right` with the second.
    pub fn trim(&self, pattern: &GraphemePatternPair<'a>) -> Self {
        let (left_pattern, right_pattern) = pattern;
        self.trim_left(left_pattern).trim_right(right_pattern)
    }

    /// Returns a copy whose window starts at the position reported by
    /// `find(pattern, start)`; the matched pattern itself stays inside the window.
    pub fn trim_left(&self, pattern: &GraphemePattern<'a>) -> Self {
        let new_offset = self.find(pattern, self.start);
        Self {
            view: self.view.clone(),
            start: new_offset,
            end: self.end,
        }
    }

    /// Returns a copy whose window ends at the position reported by
    /// `rfind(pattern, end)`; the matched pattern itself stays inside the window.
    pub fn trim_right(&self, pattern: &GraphemePattern<'a>) -> Self {
        let new_offset = self.rfind(pattern, self.end);
        Self {
            view: self.view.clone(),
            start: self.start,
            end: new_offset,
        }
    }

    /// Builds a new string that keeps everything up to and including each occurrence of the
    /// left pattern, then resumes copying at the position `find` reports for the right pattern.
    ///
    /// The result is a fresh view whose window covers all copied graphemes.
    pub fn splice(&self, skip_pattern: &GraphemePatternPair<'a>) -> Self {
        let (left_pattern, right_pattern) = skip_pattern;
        let mut new_view = GraphemeStringView::with_capacity(self.end - self.start);
        let mut offset = self.start;
        let l_pattern_s = left_pattern.len();
        while offset < self.end {
            // Clamp to the window: when fewer graphemes than the left pattern remain, `find`
            // returns `offset` unchanged and `offset + l_pattern_s` would overshoot `end`.
            let target_s = min(self.find(left_pattern, offset) + l_pattern_s, self.end);
            for i in offset..target_s {
                new_view.push(self.view[i]);
            }
            let next_offset = self.find(right_pattern, target_s);
            // An empty left pattern can leave the cursor in place; stop instead of spinning.
            if next_offset <= offset {
                break;
            }
            offset = next_offset;
        }
        GraphemeStr::from_view(new_view)
    }

    /// Searches forward from `offset` for `pattern` and returns the index where it starts.
    ///
    /// The last candidate position (`end - pattern.len()`) is never compared: the scan simply
    /// stops there. A missing pattern and a match at the very end therefore return the same
    /// index. If fewer than `pattern.len()` graphemes remain after `offset`, `offset` is
    /// returned unchanged.
    pub fn find(&self, pattern: &GraphemePattern<'a>, offset: usize) -> usize {
        let pattern_s = pattern.len();
        let mut new_offset = offset;
        let mut pattern_end = new_offset + pattern_s;
        while new_offset < self.end && pattern_end < self.end {
            if self.view[new_offset..pattern_end] == **pattern {
                break;
            }
            new_offset += 1;
            pattern_end = new_offset + pattern_s;
        }
        new_offset
    }

    /// Searches backward from `offset` for `pattern` and returns the index one past its end.
    ///
    /// Returns the window start when the pattern does not occur inside the window.
    pub fn rfind(&self, pattern: &GraphemePattern<'a>, offset: usize) -> usize {
        let pattern_s = pattern.len();
        let mut new_offset = offset;
        while new_offset > self.start {
            // Compare only when a full pattern-sized span fits between the window start and
            // `new_offset`; otherwise `new_offset - pattern_s` would underflow or reach
            // outside the window.
            let fits = new_offset - self.start >= pattern_s;
            if fits && self.view[new_offset - pattern_s..new_offset] == **pattern {
                break;
            }
            new_offset -= 1;
        }
        new_offset
    }

    /// Number of graphemes inside the window.
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Borrows the graphemes inside the window.
    pub fn get_graphemes(&self) -> GraphemeSlice<'_, 'a> {
        &self.view[self.start..self.end]
    }

    /// Returns a copy whose window holds at most `size` graphemes, counted from the window start.
    pub fn truncate(&self, size: usize) -> Self {
        // Measure from `start`, not from index 0, so a left-trimmed window keeps `size`
        // graphemes and `end` can never fall below `start`.
        let end = min(self.start.saturating_add(size), self.end);
        Self {
            view: self.view.clone(),
            start: self.start,
            end,
        }
    }

    /// Forwards to `is_unique` over the whole underlying vector.
    ///
    /// NOTE(review): this looks at every grapheme of the vector, not only those inside the
    /// window — confirm that is intended.
    pub fn is_unique(&self) -> bool {
        is_unique(&self.view)
    }
}
impl ToString for GraphemeStr<'_> {
    /// Concatenates the graphemes inside the window into an owned `String`.
    fn to_string(&self) -> String {
        let window = &self.view[self.start..self.end];
        let mut joined = String::with_capacity(self.len());
        window.iter().for_each(|cluster| joined.push_str(cluster));
        joined
    }
}
// `GraphemeStr` uses the default conversion methods, all of which go through its `to_string`.
impl RUMStringConversions for GraphemeStr<'_> {}
/// Minimal interface of a growable string buffer.
pub trait StringLike {
/// Creates an empty value; `capacity` is presumably a pre-allocation hint — implementor-defined.
fn with_capacity(capacity: usize) -> Self;
/// Appends `string` to the end of this value.
fn push_str(&mut self, string: &str);
}
/// Types that can expose their contents as a borrowed `&str`.
pub trait AsStr {
    /// Borrows the contents as a string slice.
    fn as_str(&self) -> &str;

    /// Splits the contents into grapheme clusters and returns a `GraphemeStr` over them.
    fn as_grapheme_str(&self) -> GraphemeStr {
        let text = self.as_str();
        GraphemeStr::from(text)
    }
}
/// Conversions from anything implementing `ToString` into this crate's string and byte types.
///
/// Every method first materialises `self.to_string()` and converts the result.
pub trait RUMStringConversions: ToString {
    /// Converts to a `RUMString`.
    #[inline(always)]
    fn to_rumstring(&self) -> RUMString {
        let text = self.to_string();
        RUMString::from(text)
    }

    /// Converts to a vector holding the UTF-8 bytes of the string form.
    #[inline(always)]
    fn to_raw(&self) -> RUMVec<u8> {
        let text = self.to_string();
        text.as_bytes().to_vec()
    }

    /// Converts to a `RUMBuffer` built from the string form.
    #[inline(always)]
    fn to_buffer(&self) -> RUMBuffer {
        let text = self.to_string();
        RUMBuffer::from(text)
    }
}
/// Convenience string operations for types that are both `AsStr` and `RUMStringConversions`.
pub trait StringUtils: AsStr + RUMStringConversions {
    /// Returns the contents repeated `count` times (empty when `count` is 0).
    #[inline(always)]
    fn duplicate(&self, count: usize) -> RUMString {
        let source = self.as_str();
        // Reserve room for the whole result up front rather than for `count` bytes only.
        let mut duplicated = RUMString::with_capacity(source.len().saturating_mul(count));
        for _ in 0..count {
            duplicated.push_str(source);
        }
        duplicated
    }

    /// Returns at most the first `count` grapheme clusters of the contents.
    fn truncate(&self, count: usize) -> RUMString {
        self.as_grapheme_str().truncate(count).to_rumstring()
    }
}
impl AsStr for String {
    /// Borrows the `String` as `&str` through its inherent `as_str`.
    fn as_str(&self) -> &str {
        String::as_str(self)
    }
}
// `RUMString` uses the default conversion methods of `RUMStringConversions`.
impl RUMStringConversions for RUMString {}
impl AsStr for RUMString {
fn as_str(&self) -> &str {
self.as_str()
}
}
// `RUMString` uses the default `duplicate` / `truncate` implementations.
impl StringUtils for RUMString {}
// `str` uses the default conversion methods of `RUMStringConversions`.
impl RUMStringConversions for str {}
impl AsStr for str {
/// A `str` is already a string slice, so it is returned as-is.
fn as_str(&self) -> &str {
self
}
}
// `str` uses the default `duplicate` / `truncate` implementations.
impl StringUtils for str {}
// `char` uses the default conversion methods of `RUMStringConversions`.
impl RUMStringConversions for char {}
/// Conversion of byte containers into a `RUMString`.
pub trait RUMArrayConversions {
/// Builds a `RUMString` from the bytes of `self`.
fn to_rumstring(&self) -> RUMString;
}
impl RUMArrayConversions for Vec<u8> {
    /// Delegates to the `&[u8]` implementation on a slice of this vector.
    fn to_rumstring(&self) -> RUMString {
        let bytes: &[u8] = self.as_slice();
        bytes.to_rumstring()
    }
}
impl RUMArrayConversions for &[u8] {
    /// Interprets the bytes as UTF-8.
    ///
    /// # Panics
    /// Panics if the bytes are not valid UTF-8.
    fn to_rumstring(&self) -> RUMString {
        let bytes: &[u8] = self;
        RUMString::from_utf8(bytes).unwrap()
    }
}
/// Counts the entries of `vector` that are not equal to `string_token`.
pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
    vector
        .iter()
        .filter(|candidate| string_token != *candidate)
        .count()
}
/// Guesses the text encoding of `src` with `chardetng` and decodes the bytes accordingly.
pub fn try_decode(src: &[u8]) -> RUMString {
    let mut sniffer = EncodingDetector::new();
    // `true`: `src` is the complete input, no further chunks will follow.
    sniffer.feed(src, true);
    let guessed = sniffer.guess(None, true);
    decode(src, guessed)
}
/// Decodes `src` using the encoding registered under the label `encoding_name`.
///
/// Returns an empty string when the label is not recognised.
pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
    if let Some(encoding) = Encoding::for_label(encoding_name.as_bytes()) {
        decode(src, encoding)
    } else {
        RUMString::from("")
    }
}
/// Decodes `src` with `encoding`, without BOM sniffing.
///
/// When the bytes are malformed for `encoding`, falls back to UTF-8; byte sequences that are
/// not valid UTF-8 either are replaced with U+FFFD rather than aborting the program.
fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
    match encoding.decode_without_bom_handling_and_without_replacement(src) {
        Some(res) => RUMString::from(res),
        // `from_utf8_lossy` yields the input unchanged when it is valid UTF-8, so well-formed
        // input gives the same result as a strict conversion.
        None => RUMString::from(String::from_utf8_lossy(src)),
    }
}
pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
let str_size = graphemes.len();
let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
let mut i = 0;
while i < str_size {
let seq_start = graphemes[i];
match seq_start {
"\\" => {
let escape_seq = get_grapheme_string(&graphemes, " ", i);
let mut c = match unescape(&escape_seq) {
Ok(c) => c,
Err(_why) => Vec::from(escape_seq.as_bytes()),
};
result.append(&mut c);
i += &escape_seq.as_grapheme_str().len();
}
_ => {
result.append(&mut Vec::from(seq_start.as_bytes()));
i += 1;
}
}
}
Ok(try_decode(result.as_slice()))
}
/// Joins the graphemes from `start_index` up to (not including) the first `end_grapheme`
/// into a single `RUMString`.
pub fn get_grapheme_string<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> RUMString {
    let selected = get_grapheme_collection(graphemes, end_grapheme, start_index);
    selected.join_compact("")
}
/// Collects the graphemes from `start_index` up to (not including) the first grapheme equal
/// to `end_grapheme`, or up to the end of `graphemes` when no such grapheme follows.
///
/// Returns an empty vector when `start_index` is past the end.
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|cluster| *cluster != end_grapheme)
        .collect()
}
pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
let lower_case = escaped_str.to_lowercase();
let mut bytes: Vec<u8> = Vec::with_capacity(3);
match &lower_case[0..2] {
"\\x" => {
let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
bytes.append(&mut byte_str.as_bytes().to_vec());
}
"\\u" => {
let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
bytes.append(&mut byte_str.as_bytes().to_vec());
}
"\\c" => {
let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
bytes.append(&mut byte_str.as_bytes().to_vec());
}
"\\o" => {
let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
bytes.append(&mut byte_str.as_bytes().to_vec());
}
"\\m" => match lower_case.as_grapheme_str().len() {
8 => {
bytes.push(hex_to_byte(&lower_case[2..4])?);
bytes.push(hex_to_byte(&lower_case[4..6])?);
bytes.push(hex_to_byte(&lower_case[6..8])?);
}
6 => {
bytes.push(hex_to_byte(&lower_case[2..4])?);
bytes.push(hex_to_byte(&lower_case[4..6])?);
}
_ => {
return Err(rumtk_format!(
"Unknown multibyte sequence. Cannot decode {}",
lower_case
))
}
},
"\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
_ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
}
Ok(bytes)
}
/// Maps a two-character control escape (such as `\n`) to the character it denotes.
///
/// # Errors
/// Returns a message when `escaped_str` is not one of the recognised escapes.
fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
    match escaped_str {
        "\\t" => Ok('\t'),
        "\\b" => Ok('\x08'),
        "\\n" => Ok('\n'),
        "\\r" => Ok('\r'),
        // Form feed is 0x0C (decimal 12), matching `unescape_control_byte`.
        "\\f" => Ok('\x0C'),
        "\\s" => Ok('\x20'),
        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
        "\\'" => Ok('\''),
        "\\\"" => Ok('"'),
        "\\0" => Ok('\0'),
        "\\v" => Ok('\x0B'),
        "\\a" => Ok('\x07'),
        _ => Err(rumtk_format!(
            "Unknown escape sequence? Sequence: {}!",
            escaped_str
        )),
    }
}
/// Maps a two-character control escape (such as `\n`) to the byte it denotes.
///
/// Anything that is not a recognised escape is handed to `hex_to_byte` as a last resort.
///
/// # Errors
/// Returns the `hex_to_byte` error when `escaped_str` is neither a recognised escape nor a
/// hex byte.
fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
    match escaped_str {
        "\\t" => Ok(b'\t'),
        "\\b" => Ok(0x08),
        "\\n" => Ok(b'\n'),
        "\\r" => Ok(b'\r'),
        "\\f" => Ok(0x0C),
        "\\s" => Ok(b' '),
        // An escaped backslash is the backslash byte (92), as in `unescape_control`;
        // 27 would be the ESC control character.
        "\\\\" => Ok(b'\\'),
        "\\'" => Ok(b'\''),
        "\\\"" => Ok(b'"'),
        "\\0" => Ok(0),
        "\\v" => Ok(0x0B),
        "\\a" => Ok(0x07),
        _ => hex_to_byte(escaped_str),
    }
}
/// Parses `hex_str` as a base-16 `u32`.
///
/// # Errors
/// Returns a message containing the parse error and the offending input.
fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
    u32::from_str_radix(hex_str, 16).map_err(|val| {
        rumtk_format!(
            "Failed to parse string with error {}! Input string {} is not hex string!",
            val,
            hex_str
        )
    })
}
/// Parses `hex_str` as a base-16 `u8`.
///
/// # Errors
/// Returns a message containing the parse error and the offending input.
fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
    u8::from_str_radix(hex_str, 16).map_err(|val| {
        rumtk_format!(
            "Failed to parse string with error {}! Input string {} is not hex string!",
            val,
            hex_str
        )
    })
}
/// Parses `hoctal_str` as a base-8 `u32`.
///
/// # Errors
/// Returns a message containing the parse error and the offending input.
fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
    u32::from_str_radix(hoctal_str, 8).map_err(|val| {
        rumtk_format!(
            "Failed to parse string with error {}! Input string {} is not an octal string!",
            val,
            hoctal_str
        )
    })
}
/// Parses `hoctal_str` as a base-8 `u8`.
///
/// # Errors
/// Returns a message containing the parse error and the offending input.
fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
    u8::from_str_radix(hoctal_str, 8).map_err(|val| {
        rumtk_format!(
            "Failed to parse string with error {}! Input string {} is not an octal string!",
            val,
            hoctal_str
        )
    })
}
/// Converts a Unicode scalar value into a one-character `RUMString`.
///
/// # Errors
/// Returns a message when `num` is not a valid `char`.
fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
    char::from_u32(*num)
        .map(|character| character.to_rumstring())
        .ok_or_else(|| rumtk_format!("Failed to cast number to character! Number {}", num))
}
/// Converts a Unicode scalar value into a one-character `RUMString` without reporting errors.
///
/// Values that are not valid characters (surrogates, or anything above U+10FFFF) become
/// U+FFFD REPLACEMENT CHARACTER. The previous `char::from_u32_unchecked` call was undefined
/// behaviour for such values, which callers could reach from safe code.
fn number_to_char_unchecked(num: &u32) -> RUMString {
    char::from_u32(*num)
        .unwrap_or(char::REPLACEMENT_CHARACTER)
        .to_rumstring()
}
pub fn escape(unescaped_str: &str) -> RUMString {
basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
}
/// Escapes `unescaped_str` with `str::escape_default`, then applies each `(from, to)`
/// replacement in `except`, in order.
///
/// Input that `is_escaped_str` accepts (printable ASCII only) is returned unchanged, and the
/// replacements are not applied to it.
pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
    if is_escaped_str(unescaped_str) {
        return unescaped_str.to_rumstring();
    }
    let default_escaped = unescaped_str.escape_default().to_string();
    except
        .iter()
        .fold(default_escaped, |text, (from, to)| text.replace(from, to))
        .to_rumstring()
}
/// Returns `true` when every byte of `unescaped_str` is ASCII (vacuously true for `""`).
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
/// Returns `true` when `unescaped_str` is ASCII and consists only of printable characters
/// (space through tilde), i.e. when it contains nothing that would need escaping.
pub fn is_escaped_str(unescaped_str: &str) -> bool {
    is_ascii_str(unescaped_str)
        && unescaped_str
            .chars()
            .all(|character| is_printable_char(&character))
}
/// Returns `true` when `c` lies between `MIN_ASCII_READABLE` and `MAX_ASCII_READABLE`,
/// both inclusive.
pub fn is_printable_char(c: &char) -> bool {
    (MIN_ASCII_READABLE..=MAX_ASCII_READABLE).contains(c)
}
pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
let mut filtered = unescaped_str.to_rumstring();
filtered.retain(closure);
filtered
}
/// Returns a copy of `unescaped_str` with every character outside the printable ASCII range
/// removed.
pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
    fn keep_printable(character: char) -> bool {
        is_printable_char(&character)
    }
    filter_ascii(unescaped_str, keep_printable)
}