1use crate::core::{is_unique, RUMResult};
22use chardetng::EncodingDetector;
23pub use compact_str::{
24 format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
25};
26use encoding_rs::Encoding;
27use unicode_segmentation::UnicodeSegmentation;
/// Initial byte capacity reserved for buffers that accumulate one escape sequence.
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash character.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character accepted as printable ASCII (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character accepted as printable ASCII (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string slice.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// [`EMPTY_STRING`] wrapped in `Some`, usable as an `Option` fallback.
pub const EMPTY_STRING_OPTION: Option<&str> = Some(EMPTY_STRING);
/// Every printable ASCII character, from space through tilde, in code-point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
37
/// String type used throughout this module; an alias for [`CompactString`].
pub type RUMString = CompactString;
/// A `(from, to)` replacement pair applied to text after it has been escaped.
pub type EscapeException<'a> = (&'a str, &'a str);
/// A borrowed list of [`EscapeException`] pairs.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
42
43pub trait UTFStringExtensions {
53 fn count_graphemes(&self) -> usize;
54
55 fn get_grapheme(&self, index: usize) -> &str;
74
75 fn get_graphemes(&self) -> Vec<&str>;
76
77 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
78
79 #[inline(always)]
80 fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
81 if index >= graphemes.len() {
82 return RUMString::from(EMPTY_STRING);
83 }
84 RUMString::from(graphemes[index])
85 }
86
87 #[inline(always)]
88 fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
89 let mut window: RUMString = RUMString::with_capacity(max - min);
90 let start = min + offset;
91 let end = max + offset;
92 let graphemes = self.get_graphemes();
93 for i in start..end {
94 window += &self.take_grapheme(&graphemes, i);
95 }
96 window
97 }
98
99 #[inline(always)]
100 fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
101 let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
102 for grapheme in self.get_grapheme_chunk(offset) {
103 if grapheme == end_pattern {
104 return RUMString::from(window);
105 } else {
106 window += grapheme;
107 }
108 }
109 RUMString::from(window)
110 }
111
112 #[inline(always)]
113 fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
114 for grapheme in self.get_grapheme_chunk(offset) {
115 if grapheme == pattern {
116 return grapheme;
117 }
118 }
119 EMPTY_STRING
120 }
121
122 #[inline(always)]
123 fn truncate(&self, max_size: usize) -> RUMString {
124 self.get_grapheme_window(0, max_size, 0)
125 }
126}
127
/// Types that can expose their contents as a borrowed string slice.
pub trait AsStr {
    /// Returns the contents as a `&str`.
    fn as_str(&self) -> &str;
}
131
132pub trait RUMStringConversions: ToString {
133 fn to_rumstring(&self) -> RUMString {
134 RUMString::from(self.to_string())
135 }
136
137 fn to_raw(&self) -> Vec<u8> {
138 self.to_string().as_bytes().to_vec()
139 }
140}
141
142pub trait StringUtils: AsStr + UTFStringExtensions {
143 #[inline(always)]
144 fn duplicate(&self, count: usize) -> RUMString {
145 let mut duplicated = RUMString::with_capacity(count);
146 for i in 0..count {
147 duplicated += &self.as_str();
148 }
149 duplicated
150 }
151
152 fn is_unique(&self) -> bool {
153 let graphemes = self.get_graphemes();
154 is_unique(&graphemes)
155 }
156}
157
158impl UTFStringExtensions for RUMString {
159 #[inline(always)]
160 fn count_graphemes(&self) -> usize {
161 self.graphemes(true).count()
162 }
163
164 #[inline(always)]
165 fn get_grapheme(&self, index: usize) -> &str {
166 self.graphemes(true)
167 .nth(index)
168 .or(EMPTY_STRING_OPTION)
169 .unwrap()
170 }
171
172 #[inline(always)]
173 fn get_graphemes(&self) -> Vec<&str> {
174 self.graphemes(true).collect::<Vec<&str>>()
175 }
176
177 #[inline(always)]
178 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
179 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
180 }
181}
182
183impl RUMStringConversions for RUMString {}
184impl AsStr for RUMString {
185 fn as_str(&self) -> &str {
186 self.as_str()
187 }
188}
189impl StringUtils for RUMString {}
190
191impl UTFStringExtensions for str {
192 #[inline(always)]
193 fn count_graphemes(&self) -> usize {
194 self.graphemes(true).count()
195 }
196
197 #[inline(always)]
198 fn get_grapheme(&self, index: usize) -> &str {
199 self.graphemes(true)
200 .nth(index)
201 .or(EMPTY_STRING_OPTION)
202 .unwrap()
203 }
204
205 #[inline(always)]
206 fn get_graphemes(&self) -> Vec<&str> {
207 self.graphemes(true).collect::<Vec<&str>>()
208 }
209
210 #[inline(always)]
211 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
212 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
213 }
214}
215
/// `str` uses the default `to_rumstring` / `to_raw` conversions.
impl RUMStringConversions for str {}

/// A `str` is already a string slice, so `as_str` returns it unchanged.
impl AsStr for str {
    fn as_str(&self) -> &str {
        self
    }
}

/// `str` uses the default `duplicate` / `is_unique` helpers.
impl StringUtils for str {}

/// `char` uses the default conversions, rendered through its `ToString` impl.
impl RUMStringConversions for char {}
227
/// Conversion of byte containers into a [`RUMString`].
///
/// The implementations in this file interpret the bytes as UTF-8 and panic
/// when they are not valid UTF-8.
pub trait RUMArrayConversions {
    /// Builds a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
231
232impl RUMArrayConversions for Vec<u8> {
233 fn to_rumstring(&self) -> RUMString {
234 self.as_slice().to_rumstring()
235 }
236}
237
238impl RUMArrayConversions for &[u8] {
239 fn to_rumstring(&self) -> RUMString {
240 RUMString::from_utf8(&self).unwrap()
241 }
242}
243
244pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
247 let mut count: usize = 0;
248 for tok in vector.iter() {
249 if string_token != tok {
250 count += 1;
251 }
252 }
253 count
254}
255
256pub fn try_decode(src: &[u8]) -> RUMString {
263 let mut detector = EncodingDetector::new();
264 detector.feed(&src, true);
265 let encoding = detector.guess(None, true);
266 decode(src, encoding)
267}
268
269pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
275 let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
276 Some(v) => v,
277 None => return RUMString::from(""),
278 };
279 decode(src, encoding)
280}
281
282fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
288 match encoding.decode_without_bom_handling_and_without_replacement(&src) {
289 Some(res) => RUMString::from(res),
290 None => RUMString::from_utf8(src).unwrap(),
291 }
292}
293
294pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
303 let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
304 let str_size = graphemes.len();
305 let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
306 let mut i = 0;
307 while i < str_size {
308 let seq_start = graphemes[i];
309 match seq_start {
310 "\\" => {
311 let escape_seq = get_grapheme_string(&graphemes, " ", i);
312 let mut c = match unescape(&escape_seq) {
313 Ok(c) => c,
314 Err(_why) => Vec::from(escape_seq.as_bytes()),
315 };
316 result.append(&mut c);
317 i += &escape_seq.count_graphemes();
318 }
319 _ => {
320 result.append(&mut Vec::from(seq_start.as_bytes()));
321 i += 1;
322 }
323 }
324 }
325 Ok(try_decode(result.as_slice()))
326}
327
328pub fn get_grapheme_string<'a>(
332 graphemes: &Vec<&'a str>,
333 end_grapheme: &str,
334 start_index: usize,
335) -> RUMString {
336 get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
337}
338
/// Returns the graphemes from `start_index` up to (not including) the first
/// one equal to `end_grapheme`, or through the end of the list when the
/// terminator never appears.
///
/// A `start_index` past the end yields an empty vector.
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
360
361pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
372 let lower_case = escaped_str.to_lowercase();
373 let mut bytes: Vec<u8> = Vec::with_capacity(3);
374 match &lower_case[0..2] {
375 "\\x" => {
377 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
378 bytes.append(&mut byte_str.as_bytes().to_vec());
379 }
380 "\\u" => {
382 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
383 bytes.append(&mut byte_str.as_bytes().to_vec());
384 }
385 "\\c" => {
387 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
388 bytes.append(&mut byte_str.as_bytes().to_vec());
389 }
390 "\\o" => {
392 let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
393 bytes.append(&mut byte_str.as_bytes().to_vec());
394 }
395 "\\m" => match lower_case.count_graphemes() {
397 8 => {
398 bytes.push(hex_to_byte(&lower_case[2..4])?);
399 bytes.push(hex_to_byte(&lower_case[4..6])?);
400 bytes.push(hex_to_byte(&lower_case[6..8])?);
401 }
402 6 => {
403 bytes.push(hex_to_byte(&lower_case[2..4])?);
404 bytes.push(hex_to_byte(&lower_case[4..6])?);
405 }
406 _ => {
407 return Err(rumtk_format!(
408 "Unknown multibyte sequence. Cannot decode {}",
409 lower_case
410 ))
411 }
412 },
413 "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
415 _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
417 }
418 Ok(bytes)
419}
420
421fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
426 match escaped_str {
427 "\\t" => Ok('\t'),
429 "\\b" => Ok('\x08'),
430 "\\n" => Ok('\n'),
431 "\\r" => Ok('\r'),
432 "\\f" => Ok('\x14'),
433 "\\s" => Ok('\x20'),
434 "\\\\" => Ok(ASCII_ESCAPE_CHAR),
435 "\\'" => Ok('\''),
436 "\\\"" => Ok('"'),
437 "\\0" => Ok('\0'),
438 "\\v" => Ok('\x0B'),
439 "\\a" => Ok('\x07'),
440 _ => Err(rumtk_format!(
442 "Unknown escape sequence? Sequence: {}!",
443 escaped_str
444 )),
445 }
446}
447
448fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
453 match escaped_str {
454 "\\t" => Ok(9), "\\b" => Ok(8), "\\n" => Ok(10), "\\r" => Ok(13), "\\f" => Ok(12), "\\s" => Ok(32), "\\\\" => Ok(27), "\\'" => Ok(39), "\\\"" => Ok(34), "\\0" => Ok(0), "\\v" => Ok(11), "\\a" => Ok(7), _ => hex_to_byte(escaped_str),
470 }
471}
472
473fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
477 match u32::from_str_radix(&hex_str, 16) {
478 Ok(result) => Ok(result),
479 Err(val) => Err(rumtk_format!(
480 "Failed to parse string with error {}! Input string {} \
481 is not hex string!",
482 val,
483 hex_str
484 )),
485 }
486}
487
488fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
492 match u8::from_str_radix(&hex_str, 16) {
493 Ok(result) => Ok(result),
494 Err(val) => Err(rumtk_format!(
495 "Failed to parse string with error {}! Input string {} \
496 is not hex string!",
497 val,
498 hex_str
499 )),
500 }
501}
502
503fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
507 match u32::from_str_radix(&hoctal_str, 8) {
508 Ok(result) => Ok(result),
509 Err(val) => Err(rumtk_format!(
510 "Failed to parse string with error {}! Input string {} \
511 is not an octal string!",
512 val,
513 hoctal_str
514 )),
515 }
516}
517
518fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
522 match u8::from_str_radix(&hoctal_str, 8) {
523 Ok(result) => Ok(result),
524 Err(val) => Err(rumtk_format!(
525 "Failed to parse string with error {}! Input string {} \
526 is not an octal string!",
527 val,
528 hoctal_str
529 )),
530 }
531}
532
533fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
537 match char::from_u32(*num) {
538 Some(result) => Ok(result.to_rumstring()),
539 None => Err(rumtk_format!(
540 "Failed to cast number to character! Number {}",
541 num
542 )),
543 }
544}
545
546fn number_to_char_unchecked(num: &u32) -> RUMString {
552 unsafe { char::from_u32_unchecked(*num).to_rumstring() }
553}
554
555pub fn escape(unescaped_str: &str) -> RUMString {
567 basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
568}
569
570pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
581 let escaped = is_escaped_str(unescaped_str);
582 if !escaped {
583 let mut escaped_str = unescaped_str.escape_default().to_string();
584 for (from, to) in except {
585 escaped_str = escaped_str.replace(from, to);
586 }
587 return escaped_str.to_rumstring();
588 }
589 unescaped_str.to_rumstring()
590}
591
/// Returns `true` when every byte of `unescaped_str` is ASCII (an empty
/// string qualifies).
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
601
602pub fn is_escaped_str(unescaped_str: &str) -> bool {
611 if !is_ascii_str(unescaped_str) {
612 return false;
613 }
614
615 for c in unescaped_str.chars() {
616 if !is_printable_char(&c) {
617 return false;
618 }
619 }
620 true
621}
622
623pub fn is_printable_char(c: &char) -> bool {
627 &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
628}
629
630pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
634 let mut filtered = unescaped_str.to_rumstring();
635 filtered.retain(closure);
636 filtered
637}
638
639pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
643 filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
644}