1use crate::core::{is_unique, RUMResult};
21use chardetng::EncodingDetector;
22pub use compact_str::{format_compact, CompactString, CompactStringExt, ToCompactString};
23use encoding_rs::Encoding;
24use std::fmt::Display;
25use unicode_segmentation::UnicodeSegmentation;
/// Initial capacity reserved while accumulating an escape sequence
/// (see `UTFStringExtensions::get_grapheme_string`).
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character accepted by `is_printable_char` (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character accepted by `is_printable_char` (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string, used as the "nothing found" value by the grapheme helpers.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some(EMPTY_STRING)`, handy as an `Option::or` fallback.
pub const EMPTY_STRING_OPTION: Option<&str> = Some(EMPTY_STRING);
/// Every character from `MIN_ASCII_READABLE` through `MAX_ASCII_READABLE`, in ascending order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
35
36pub type RUMString = CompactString;
38
39pub trait UTFStringExtensions {
49 fn count_graphemes(&self) -> usize;
50
51 fn get_grapheme(&self, index: usize) -> &str;
70
71 fn get_graphemes(&self) -> Vec<&str>;
72
73 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
74
75 #[inline(always)]
76 fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
77 if index >= graphemes.len() {
78 return RUMString::from(EMPTY_STRING);
79 }
80 RUMString::from(graphemes[index])
81 }
82
83 #[inline(always)]
84 fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
85 let mut window: RUMString = RUMString::with_capacity(max - min);
86 let start = min + offset;
87 let end = max + offset;
88 let graphemes = self.get_graphemes();
89 for i in start..end {
90 window += &self.take_grapheme(&graphemes, i);
91 }
92 window
93 }
94
95 #[inline(always)]
96 fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
97 let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
98 for grapheme in self.get_grapheme_chunk(offset) {
99 if grapheme == end_pattern {
100 return RUMString::from(window);
101 } else {
102 window += grapheme;
103 }
104 }
105 RUMString::from(window)
106 }
107
108 #[inline(always)]
109 fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
110 for grapheme in self.get_grapheme_chunk(offset) {
111 if grapheme == pattern {
112 return grapheme;
113 }
114 }
115 EMPTY_STRING
116 }
117
118 #[inline(always)]
119 fn truncate(&self, max_size: usize) -> RUMString {
120 self.get_grapheme_window(0, max_size, 0)
121 }
122}
123
124pub trait AsStr {
125 fn as_str(&self) -> &str;
126}
127
128pub trait RUMStringConversions: ToString {
129 fn to_rumstring(&self) -> RUMString {
130 RUMString::from(self.to_string())
131 }
132
133 fn to_raw(&self) -> Vec<u8> {
134 self.to_string().as_bytes().to_vec()
135 }
136}
137
138pub trait StringUtils: AsStr + UTFStringExtensions {
139 #[inline(always)]
140 fn duplicate(&self, count: usize) -> RUMString {
141 let mut duplicated = RUMString::with_capacity(count);
142 for i in 0..count {
143 duplicated += &self.as_str();
144 }
145 duplicated
146 }
147
148 fn is_unique(&self) -> bool {
149 let graphemes = self.get_graphemes();
150 is_unique(&graphemes)
151 }
152}
153
154impl UTFStringExtensions for RUMString {
155 #[inline(always)]
156 fn count_graphemes(&self) -> usize {
157 self.graphemes(true).count()
158 }
159
160 #[inline(always)]
161 fn get_grapheme(&self, index: usize) -> &str {
162 self.graphemes(true)
163 .nth(index)
164 .or(EMPTY_STRING_OPTION)
165 .unwrap()
166 }
167
168 #[inline(always)]
169 fn get_graphemes(&self) -> Vec<&str> {
170 self.graphemes(true).collect::<Vec<&str>>()
171 }
172
173 #[inline(always)]
174 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
175 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
176 }
177}
178
179impl RUMStringConversions for RUMString {}
180impl AsStr for RUMString {
181 fn as_str(&self) -> &str {
182 self.as_str()
183 }
184}
185impl StringUtils for RUMString {}
186
187impl UTFStringExtensions for str {
188 #[inline(always)]
189 fn count_graphemes(&self) -> usize {
190 self.graphemes(true).count()
191 }
192
193 #[inline(always)]
194 fn get_grapheme(&self, index: usize) -> &str {
195 self.graphemes(true)
196 .nth(index)
197 .or(EMPTY_STRING_OPTION)
198 .unwrap()
199 }
200
201 #[inline(always)]
202 fn get_graphemes(&self) -> Vec<&str> {
203 self.graphemes(true).collect::<Vec<&str>>()
204 }
205
206 #[inline(always)]
207 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
208 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
209 }
210}
211
212impl RUMStringConversions for str {}
213
214impl AsStr for str {
215 fn as_str(&self) -> &str {
216 self
217 }
218}
219
220impl StringUtils for str {}
221
222impl RUMStringConversions for char {}
223
224pub trait RUMArrayConversions {
225 fn to_rumstring(&self) -> RUMString;
226}
227
228impl RUMArrayConversions for Vec<u8> {
229 fn to_rumstring(&self) -> RUMString {
230 self.as_slice().to_rumstring()
231 }
232}
233
234impl RUMArrayConversions for &[u8] {
235 fn to_rumstring(&self) -> RUMString {
236 RUMString::from_utf8(&self).unwrap()
237 }
238}
239
240pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
243 let mut count: usize = 0;
244 for tok in vector.iter() {
245 if string_token != tok {
246 count += 1;
247 }
248 }
249 count
250}
251
252pub fn try_decode(src: &[u8]) -> RUMString {
259 let mut detector = EncodingDetector::new();
260 detector.feed(&src, true);
261 let encoding = detector.guess(None, true);
262 decode(src, encoding)
263}
264
265pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
271 let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
272 Some(v) => v,
273 None => return RUMString::from(""),
274 };
275 decode(src, encoding)
276}
277
278fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
284 match encoding.decode_without_bom_handling_and_without_replacement(&src) {
285 Some(res) => RUMString::from(res),
286 None => RUMString::from_utf8(src).unwrap(),
287 }
288}
289
290pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
299 let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
300 let str_size = graphemes.len();
301 let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
302 let mut i = 0;
303 while i < str_size {
304 let seq_start = graphemes[i];
305 match seq_start {
306 "\\" => {
307 let escape_seq = get_grapheme_string(&graphemes, " ", i);
308 let mut c = match unescape(&escape_seq) {
309 Ok(c) => c,
310 Err(_why) => Vec::from(escape_seq.as_bytes()),
311 };
312 result.append(&mut c);
313 i += &escape_seq.count_graphemes();
314 }
315 _ => {
316 result.append(&mut Vec::from(seq_start.as_bytes()));
317 i += 1;
318 }
319 }
320 }
321 Ok(try_decode(result.as_slice()))
322}
323
324pub fn get_grapheme_string<'a>(
328 graphemes: &Vec<&'a str>,
329 end_grapheme: &str,
330 start_index: usize,
331) -> RUMString {
332 get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
333}
334
/// Collects the graphemes from `start_index` up to (not including) the first grapheme equal
/// to `end_grapheme`, or to the end of `graphemes` when no terminator is found.
///
/// A `start_index` past the end yields an empty vector.
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
356
357pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
368 let lower_case = escaped_str.to_lowercase();
369 let mut bytes: Vec<u8> = Vec::with_capacity(3);
370 match &lower_case[0..2] {
371 "\\x" => {
373 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
374 bytes.append(&mut byte_str.as_bytes().to_vec());
375 }
376 "\\u" => {
378 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
379 bytes.append(&mut byte_str.as_bytes().to_vec());
380 }
381 "\\c" => {
383 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
384 bytes.append(&mut byte_str.as_bytes().to_vec());
385 }
386 "\\o" => {
388 let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
389 bytes.append(&mut byte_str.as_bytes().to_vec());
390 }
391 "\\m" => match lower_case.count_graphemes() {
393 8 => {
394 bytes.push(hex_to_byte(&lower_case[2..4])?);
395 bytes.push(hex_to_byte(&lower_case[4..6])?);
396 bytes.push(hex_to_byte(&lower_case[6..8])?);
397 }
398 6 => {
399 bytes.push(hex_to_byte(&lower_case[2..4])?);
400 bytes.push(hex_to_byte(&lower_case[4..6])?);
401 }
402 _ => {
403 return Err(format_compact!(
404 "Unknown multibyte sequence. Cannot decode {}",
405 lower_case
406 ))
407 }
408 },
409 "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
411 _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
413 }
414 Ok(bytes)
415}
416
417fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
422 match escaped_str {
423 "\\t" => Ok('\t'),
425 "\\b" => Ok('\x08'),
426 "\\n" => Ok('\n'),
427 "\\r" => Ok('\r'),
428 "\\f" => Ok('\x14'),
429 "\\s" => Ok('\x20'),
430 "\\\\" => Ok(ASCII_ESCAPE_CHAR),
431 "\\'" => Ok('\''),
432 "\\\"" => Ok('"'),
433 "\\0" => Ok('\0'),
434 "\\v" => Ok('\x0B'),
435 "\\a" => Ok('\x07'),
436 _ => Err(format_compact!(
438 "Unknown escape sequence? Sequence: {}!",
439 escaped_str
440 )),
441 }
442}
443
444fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
449 match escaped_str {
450 "\\t" => Ok(9), "\\b" => Ok(8), "\\n" => Ok(10), "\\r" => Ok(13), "\\f" => Ok(12), "\\s" => Ok(32), "\\\\" => Ok(27), "\\'" => Ok(39), "\\\"" => Ok(34), "\\0" => Ok(0), "\\v" => Ok(11), "\\a" => Ok(7), _ => hex_to_byte(escaped_str),
466 }
467}
468
469fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
473 match u32::from_str_radix(&hex_str, 16) {
474 Ok(result) => Ok(result),
475 Err(val) => Err(format_compact!(
476 "Failed to parse string with error {}! Input string {} \
477 is not hex string!",
478 val,
479 hex_str
480 )),
481 }
482}
483
484fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
488 match u8::from_str_radix(&hex_str, 16) {
489 Ok(result) => Ok(result),
490 Err(val) => Err(format_compact!(
491 "Failed to parse string with error {}! Input string {} \
492 is not hex string!",
493 val,
494 hex_str
495 )),
496 }
497}
498
499fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
503 match u32::from_str_radix(&hoctal_str, 8) {
504 Ok(result) => Ok(result),
505 Err(val) => Err(format_compact!(
506 "Failed to parse string with error {}! Input string {} \
507 is not an octal string!",
508 val,
509 hoctal_str
510 )),
511 }
512}
513
514fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
518 match u8::from_str_radix(&hoctal_str, 8) {
519 Ok(result) => Ok(result),
520 Err(val) => Err(format_compact!(
521 "Failed to parse string with error {}! Input string {} \
522 is not an octal string!",
523 val,
524 hoctal_str
525 )),
526 }
527}
528
529fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
533 match char::from_u32(*num) {
534 Some(result) => Ok(result.to_rumstring()),
535 None => Err(format_compact!(
536 "Failed to cast number to character! Number {}",
537 num
538 )),
539 }
540}
541
542fn number_to_char_unchecked(num: &u32) -> RUMString {
548 unsafe { char::from_u32_unchecked(*num).to_rumstring() }
549}
550
551pub fn escape(unescaped_str: &str) -> RUMString {
563 basic_escape(unescaped_str)
564 .replace("{", "")
565 .replace("}", "")
566 .to_rumstring()
567}
568
569pub fn basic_escape(unescaped_str: &str) -> RUMString {
580 let escaped = is_escaped_str(unescaped_str);
581 if !escaped {
582 return unescaped_str.escape_default().to_compact_string();
583 }
584 unescaped_str.to_rumstring()
585}
586
/// Returns `true` when every byte of `unescaped_str` is in the ASCII range (an empty string
/// qualifies).
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
596
597pub fn is_escaped_str(unescaped_str: &str) -> bool {
606 if !is_ascii_str(unescaped_str) {
607 return false;
608 }
609
610 for c in unescaped_str.chars() {
611 if !is_printable_char(&c) {
612 return false;
613 }
614 }
615 true
616}
617
618pub fn is_printable_char(c: &char) -> bool {
622 &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
623}
624
625pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
629 let mut filtered = unescaped_str.to_rumstring();
630 filtered.retain(closure);
631 filtered
632}
633
634pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
638 filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
639}