1use crate::core::{is_unique, RUMResult};
21use chardetng::EncodingDetector;
22pub use compact_str::{format_compact, CompactString, CompactStringExt, ToCompactString};
23use encoding_rs::Encoding;
24use std::fmt::Display;
25use unicode_segmentation::UnicodeSegmentation;
/// Initial capacity reserved for the buffer that accumulates a grapheme string
/// (see `UTFStringExtensions::get_grapheme_string`).
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash character that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest printable ASCII character (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest printable ASCII character (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string slice, used as the "nothing found" value by the grapheme helpers.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some("")`, used as the fallback when a grapheme lookup yields nothing.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// Every printable ASCII character from space to tilde, in code point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

/// String type used throughout this module; an alias for `CompactString`.
pub type RUMString = CompactString;
38
39pub trait UTFStringExtensions {
49 fn count_graphemes(&self) -> usize;
50
51 fn get_grapheme(&self, index: usize) -> &str;
70
71 fn get_graphemes(&self) -> Vec<&str>;
72
73 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
74
75 #[inline(always)]
76 fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
77 if index >= graphemes.len() {
78 return RUMString::from(EMPTY_STRING);
79 }
80 RUMString::from(graphemes[index])
81 }
82
83 #[inline(always)]
84 fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
85 let mut window: RUMString = RUMString::with_capacity(max - min);
86 let start = min + offset;
87 let end = max + offset;
88 let graphemes = self.get_graphemes();
89 for i in start..end {
90 window += &self.take_grapheme(&graphemes, i);
91 }
92 window
93 }
94
95 #[inline(always)]
96 fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
97 let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
98 for grapheme in self.get_grapheme_chunk(offset) {
99 if grapheme == end_pattern {
100 return RUMString::from(window);
101 } else {
102 window += grapheme;
103 }
104 }
105 RUMString::from(window)
106 }
107
108 #[inline(always)]
109 fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
110 for grapheme in self.get_grapheme_chunk(offset) {
111 if grapheme == pattern {
112 return grapheme;
113 }
114 }
115 EMPTY_STRING
116 }
117
118 #[inline(always)]
119 fn truncate(&self, max_size: usize) -> RUMString {
120 self.get_grapheme_window(0, max_size, 0)
121 }
122}
123
/// Types that can expose their contents as a borrowed string slice.
pub trait AsStr {
    /// Borrows the value as a `&str`.
    fn as_str(&self) -> &str;
}
127
128pub trait RUMStringConversions: ToString {
129 fn to_rumstring(&self) -> RUMString {
130 RUMString::from(self.to_string())
131 }
132
133 fn to_raw(&self) -> Vec<u8> {
134 self.to_string().as_bytes().to_vec()
135 }
136}
137
138pub trait StringUtils: AsStr + UTFStringExtensions {
139 #[inline(always)]
140 fn duplicate(&self, count: usize) -> RUMString {
141 let mut duplicated = RUMString::with_capacity(count);
142 for i in 0..count {
143 duplicated += &self.as_str();
144 }
145 duplicated
146 }
147
148 fn is_unique(&self) -> bool {
149 let graphemes = self.get_graphemes();
150 is_unique(&graphemes)
151 }
152}
153
154impl UTFStringExtensions for RUMString {
155 #[inline(always)]
156 fn count_graphemes(&self) -> usize {
157 self.graphemes(true).count()
158 }
159
160 #[inline(always)]
161 fn get_grapheme(&self, index: usize) -> &str {
162 self.graphemes(true)
163 .nth(index)
164 .or(EMPTY_STRING_OPTION)
165 .unwrap()
166 }
167
168 #[inline(always)]
169 fn get_graphemes(&self) -> Vec<&str> {
170 self.graphemes(true).collect::<Vec<&str>>()
171 }
172
173 #[inline(always)]
174 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
175 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
176 }
177}
178
/// `RUMString` uses the default `to_rumstring` / `to_raw` implementations.
impl RUMStringConversions for RUMString {}
impl AsStr for RUMString {
    /// Borrows the contents as a `&str`.
    fn as_str(&self) -> &str {
        // NOTE(review): this relies on method resolution picking the string
        // type's own inherent `as_str` over this trait method; otherwise the
        // call would recurse. Confirm if the alias ever changes.
        self.as_str()
    }
}
/// `RUMString` uses the default `duplicate` / `is_unique` implementations.
impl StringUtils for RUMString {}
186
187impl UTFStringExtensions for str {
188 #[inline(always)]
189 fn count_graphemes(&self) -> usize {
190 self.graphemes(true).count()
191 }
192
193 #[inline(always)]
194 fn get_grapheme(&self, index: usize) -> &str {
195 self.graphemes(true)
196 .nth(index)
197 .or(EMPTY_STRING_OPTION)
198 .unwrap()
199 }
200
201 #[inline(always)]
202 fn get_graphemes(&self) -> Vec<&str> {
203 self.graphemes(true).collect::<Vec<&str>>()
204 }
205
206 #[inline(always)]
207 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
208 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
209 }
210}
211
/// `str` uses the default `to_rumstring` / `to_raw` implementations.
impl RUMStringConversions for str {}

impl AsStr for str {
    /// A `str` is already a string slice, so it is returned unchanged.
    fn as_str(&self) -> &str {
        self
    }
}

/// `str` uses the default `duplicate` / `is_unique` implementations.
impl StringUtils for str {}

/// `char` uses the default `to_rumstring` / `to_raw` implementations.
impl RUMStringConversions for char {}
223
/// Conversion from byte containers into a [`RUMString`].
pub trait RUMArrayConversions {
    /// Builds a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
227
228impl RUMArrayConversions for Vec<u8> {
229 fn to_rumstring(&self) -> RUMString {
230 self.as_slice().to_rumstring()
231 }
232}
233
234impl RUMArrayConversions for &[u8] {
235 fn to_rumstring(&self) -> RUMString {
236 RUMString::from_utf8(&self).unwrap()
237 }
238}
239
240pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
243 let mut count: usize = 0;
244 for tok in vector.iter() {
245 if string_token != tok {
246 count += 1;
247 }
248 }
249 count
250}
251
252pub fn try_decode(src: &[u8]) -> RUMString {
259 let mut detector = EncodingDetector::new();
260 detector.feed(&src, true);
261 let encoding = detector.guess(None, true);
262 decode(src, encoding)
263}
264
265pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
271 let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
272 Some(v) => v,
273 None => return RUMString::from(""),
274 };
275 decode(src, encoding)
276}
277
278fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
284 match encoding.decode_without_bom_handling_and_without_replacement(&src) {
285 Some(res) => RUMString::from(res),
286 None => RUMString::from_utf8(src).unwrap(),
287 }
288}
289
290pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
299 let str_size = escaped_str.count_graphemes();
300 let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
301 let mut i = 0;
302 while i < str_size {
303 let seq_start = escaped_str.get_grapheme(i);
304 match seq_start {
305 "\\" => {
306 let escape_seq = escaped_str.get_grapheme_string(" ", i);
307 let mut c = match unescape(&escape_seq) {
308 Ok(c) => c,
309 Err(_why) => Vec::from(escape_seq.as_bytes()),
310 };
311 result.append(&mut c);
312 i += &escape_seq.count_graphemes();
313 }
314 _ => {
315 result.append(&mut Vec::from(seq_start.as_bytes()));
316 i += 1;
317 }
318 }
319 }
320 Ok(try_decode(result.as_slice()))
321}
322
323pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
334 let lower_case = escaped_str.to_lowercase();
335 let mut bytes: Vec<u8> = Vec::with_capacity(3);
336 match &lower_case[0..2] {
337 "\\x" => {
339 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
340 bytes.append(&mut byte_str.as_bytes().to_vec());
341 }
342 "\\u" => {
344 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
345 bytes.append(&mut byte_str.as_bytes().to_vec());
346 }
347 "\\c" => {
349 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
350 bytes.append(&mut byte_str.as_bytes().to_vec());
351 }
352 "\\o" => {
354 let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
355 bytes.append(&mut byte_str.as_bytes().to_vec());
356 }
357 "\\m" => match lower_case.count_graphemes() {
359 8 => {
360 bytes.push(hex_to_byte(&lower_case[2..4])?);
361 bytes.push(hex_to_byte(&lower_case[4..6])?);
362 bytes.push(hex_to_byte(&lower_case[6..8])?);
363 }
364 6 => {
365 bytes.push(hex_to_byte(&lower_case[2..4])?);
366 bytes.push(hex_to_byte(&lower_case[4..6])?);
367 }
368 _ => {
369 return Err(format_compact!(
370 "Unknown multibyte sequence. Cannot decode {}",
371 lower_case
372 ))
373 }
374 },
375 "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
377 _ => bytes.push(unescape_control_byte(&lower_case)?),
379 }
380 Ok(bytes)
381}
382
383fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
388 match escaped_str {
389 "\\t" => Ok('\t'),
391 "\\b" => Ok('\x08'),
392 "\\n" => Ok('\n'),
393 "\\r" => Ok('\r'),
394 "\\f" => Ok('\x14'),
395 "\\s" => Ok('\x20'),
396 "\\\\" => Ok(ASCII_ESCAPE_CHAR),
397 "\\'" => Ok('\''),
398 "\\\"" => Ok('\"'),
399 "\\0" => Ok('\0'),
400 "\\v" => Ok('\x0B'),
401 "\\a" => Ok('\x07'),
402 _ => Err(format_compact!(
404 "Unknown escape sequence? Sequence: {}!",
405 escaped_str
406 )),
407 }
408}
409
410fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
415 match escaped_str {
416 "\\t" => Ok(9), "\\b" => Ok(8), "\\n" => Ok(10), "\\r" => Ok(13), "\\f" => Ok(12), "\\s" => Ok(32), "\\\\" => Ok(27), "\\'" => Ok(39), "\\\"" => Ok(34), "\\0" => Ok(0), "\\v" => Ok(11), "\\a" => Ok(7), _ => hex_to_byte(&escaped_str[2..]),
432 }
433}
434
435fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
439 match u32::from_str_radix(&hex_str, 16) {
440 Ok(result) => Ok(result),
441 Err(val) => Err(format_compact!(
442 "Failed to parse string with error {}! Input string {} \
443 is not hex string!",
444 val,
445 hex_str
446 )),
447 }
448}
449
450fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
454 match u8::from_str_radix(&hex_str, 16) {
455 Ok(result) => Ok(result),
456 Err(val) => Err(format_compact!(
457 "Failed to parse string with error {}! Input string {} \
458 is not hex string!",
459 val,
460 hex_str
461 )),
462 }
463}
464
465fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
469 match u32::from_str_radix(&hoctal_str, 8) {
470 Ok(result) => Ok(result),
471 Err(val) => Err(format_compact!(
472 "Failed to parse string with error {}! Input string {} \
473 is not an octal string!",
474 val,
475 hoctal_str
476 )),
477 }
478}
479
480fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
484 match u8::from_str_radix(&hoctal_str, 8) {
485 Ok(result) => Ok(result),
486 Err(val) => Err(format_compact!(
487 "Failed to parse string with error {}! Input string {} \
488 is not an octal string!",
489 val,
490 hoctal_str
491 )),
492 }
493}
494
495fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
499 match char::from_u32(*num) {
500 Some(result) => Ok(result.to_rumstring()),
501 None => Err(format_compact!(
502 "Failed to cast number to character! Number {}",
503 num
504 )),
505 }
506}
507
508fn number_to_char_unchecked(num: &u32) -> RUMString {
514 unsafe { char::from_u32_unchecked(*num).to_rumstring() }
515}
516
517pub fn escape(unescaped_str: &str) -> RUMString {
529 basic_escape(unescaped_str)
530 .replace("{", "")
531 .replace("}", "")
532 .to_rumstring()
533}
534
535pub fn basic_escape(unescaped_str: &str) -> RUMString {
546 unescaped_str.escape_default().to_compact_string()
547}
548
549pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
553 let mut filtered = unescaped_str.to_rumstring();
554 filtered.retain(closure);
555 filtered
556}
557
558pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
562 filter_ascii(unescaped_str, |c: char| {
563 !c.is_ascii() && (' ' <= c || c <= '~')
564 })
565}