1use crate::core::{is_unique, RUMResult};
22use chardetng::EncodingDetector;
23pub use compact_str::{
24 format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
25};
26use encoding_rs::Encoding;
27use unicode_segmentation::UnicodeSegmentation;
28const ESCAPED_STRING_WINDOW: usize = 6;
30const ASCII_ESCAPE_CHAR: char = '\\';
31const MIN_ASCII_READABLE: char = ' ';
32const MAX_ASCII_READABLE: char = '~';
33pub const EMPTY_STRING: &str = "";
34pub const DOT_STR: &str = ".";
35pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
36pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
37
38pub type RUMString = CompactString;
40
41pub trait UTFStringExtensions {
51 fn count_graphemes(&self) -> usize;
52
53 fn get_grapheme(&self, index: usize) -> &str;
72
73 fn get_graphemes(&self) -> Vec<&str>;
74
75 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
76
77 #[inline(always)]
78 fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
79 if index >= graphemes.len() {
80 return RUMString::from(EMPTY_STRING);
81 }
82 RUMString::from(graphemes[index])
83 }
84
85 #[inline(always)]
86 fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
87 let mut window: RUMString = RUMString::with_capacity(max - min);
88 let start = min + offset;
89 let end = max + offset;
90 let graphemes = self.get_graphemes();
91 for i in start..end {
92 window += &self.take_grapheme(&graphemes, i);
93 }
94 window
95 }
96
97 #[inline(always)]
98 fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
99 let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
100 for grapheme in self.get_grapheme_chunk(offset) {
101 if grapheme == end_pattern {
102 return RUMString::from(window);
103 } else {
104 window += grapheme;
105 }
106 }
107 RUMString::from(window)
108 }
109
110 #[inline(always)]
111 fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
112 for grapheme in self.get_grapheme_chunk(offset) {
113 if grapheme == pattern {
114 return grapheme;
115 }
116 }
117 EMPTY_STRING
118 }
119
120 #[inline(always)]
121 fn truncate(&self, max_size: usize) -> RUMString {
122 self.get_grapheme_window(0, max_size, 0)
123 }
124}
125
126pub trait AsStr {
127 fn as_str(&self) -> &str;
128}
129
130pub trait RUMStringConversions: ToString {
131 fn to_rumstring(&self) -> RUMString {
132 RUMString::from(self.to_string())
133 }
134
135 fn to_raw(&self) -> Vec<u8> {
136 self.to_string().as_bytes().to_vec()
137 }
138}
139
140pub trait StringUtils: AsStr + UTFStringExtensions {
141 #[inline(always)]
142 fn duplicate(&self, count: usize) -> RUMString {
143 let mut duplicated = RUMString::with_capacity(count);
144 for i in 0..count {
145 duplicated += &self.as_str();
146 }
147 duplicated
148 }
149
150 fn is_unique(&self) -> bool {
151 let graphemes = self.get_graphemes();
152 is_unique(&graphemes)
153 }
154}
155
156impl UTFStringExtensions for RUMString {
157 #[inline(always)]
158 fn count_graphemes(&self) -> usize {
159 self.graphemes(true).count()
160 }
161
162 #[inline(always)]
163 fn get_grapheme(&self, index: usize) -> &str {
164 self.graphemes(true)
165 .nth(index)
166 .or(EMPTY_STRING_OPTION)
167 .unwrap()
168 }
169
170 #[inline(always)]
171 fn get_graphemes(&self) -> Vec<&str> {
172 self.graphemes(true).collect::<Vec<&str>>()
173 }
174
175 #[inline(always)]
176 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
177 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
178 }
179}
180
181impl RUMStringConversions for RUMString {}
182impl AsStr for RUMString {
183 fn as_str(&self) -> &str {
184 self.as_str()
185 }
186}
187impl StringUtils for RUMString {}
188
189impl UTFStringExtensions for str {
190 #[inline(always)]
191 fn count_graphemes(&self) -> usize {
192 self.graphemes(true).count()
193 }
194
195 #[inline(always)]
196 fn get_grapheme(&self, index: usize) -> &str {
197 self.graphemes(true)
198 .nth(index)
199 .or(EMPTY_STRING_OPTION)
200 .unwrap()
201 }
202
203 #[inline(always)]
204 fn get_graphemes(&self) -> Vec<&str> {
205 self.graphemes(true).collect::<Vec<&str>>()
206 }
207
208 #[inline(always)]
209 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
210 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
211 }
212}
213
214impl RUMStringConversions for str {}
215
216impl AsStr for str {
217 fn as_str(&self) -> &str {
218 self
219 }
220}
221
222impl StringUtils for str {}
223
224impl RUMStringConversions for char {}
225
226pub trait RUMArrayConversions {
227 fn to_rumstring(&self) -> RUMString;
228}
229
230impl RUMArrayConversions for Vec<u8> {
231 fn to_rumstring(&self) -> RUMString {
232 self.as_slice().to_rumstring()
233 }
234}
235
236impl RUMArrayConversions for &[u8] {
237 fn to_rumstring(&self) -> RUMString {
238 RUMString::from_utf8(&self).unwrap()
239 }
240}
241
242pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
245 let mut count: usize = 0;
246 for tok in vector.iter() {
247 if string_token != tok {
248 count += 1;
249 }
250 }
251 count
252}
253
254pub fn try_decode(src: &[u8]) -> RUMString {
261 let mut detector = EncodingDetector::new();
262 detector.feed(&src, true);
263 let encoding = detector.guess(None, true);
264 decode(src, encoding)
265}
266
267pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
273 let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
274 Some(v) => v,
275 None => return RUMString::from(""),
276 };
277 decode(src, encoding)
278}
279
280fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
286 match encoding.decode_without_bom_handling_and_without_replacement(&src) {
287 Some(res) => RUMString::from(res),
288 None => RUMString::from_utf8(src).unwrap(),
289 }
290}
291
292pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
301 let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
302 let str_size = graphemes.len();
303 let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
304 let mut i = 0;
305 while i < str_size {
306 let seq_start = graphemes[i];
307 match seq_start {
308 "\\" => {
309 let escape_seq = get_grapheme_string(&graphemes, " ", i);
310 let mut c = match unescape(&escape_seq) {
311 Ok(c) => c,
312 Err(_why) => Vec::from(escape_seq.as_bytes()),
313 };
314 result.append(&mut c);
315 i += &escape_seq.count_graphemes();
316 }
317 _ => {
318 result.append(&mut Vec::from(seq_start.as_bytes()));
319 i += 1;
320 }
321 }
322 }
323 Ok(try_decode(result.as_slice()))
324}
325
326pub fn get_grapheme_string<'a>(
330 graphemes: &Vec<&'a str>,
331 end_grapheme: &str,
332 start_index: usize,
333) -> RUMString {
334 get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
335}
336
/// Collects the graphemes from `start_index` up to (not including) the first
/// occurrence of `end_grapheme`.
///
/// Returns an empty vector when `start_index` is past the end or when the
/// grapheme at `start_index` is itself `end_grapheme`.
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
358
359pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
370 let lower_case = escaped_str.to_lowercase();
371 let mut bytes: Vec<u8> = Vec::with_capacity(3);
372 match &lower_case[0..2] {
373 "\\x" => {
375 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
376 bytes.append(&mut byte_str.as_bytes().to_vec());
377 }
378 "\\u" => {
380 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
381 bytes.append(&mut byte_str.as_bytes().to_vec());
382 }
383 "\\c" => {
385 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
386 bytes.append(&mut byte_str.as_bytes().to_vec());
387 }
388 "\\o" => {
390 let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
391 bytes.append(&mut byte_str.as_bytes().to_vec());
392 }
393 "\\m" => match lower_case.count_graphemes() {
395 8 => {
396 bytes.push(hex_to_byte(&lower_case[2..4])?);
397 bytes.push(hex_to_byte(&lower_case[4..6])?);
398 bytes.push(hex_to_byte(&lower_case[6..8])?);
399 }
400 6 => {
401 bytes.push(hex_to_byte(&lower_case[2..4])?);
402 bytes.push(hex_to_byte(&lower_case[4..6])?);
403 }
404 _ => {
405 return Err(rumtk_format!(
406 "Unknown multibyte sequence. Cannot decode {}",
407 lower_case
408 ))
409 }
410 },
411 "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
413 _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
415 }
416 Ok(bytes)
417}
418
419fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
424 match escaped_str {
425 "\\t" => Ok('\t'),
427 "\\b" => Ok('\x08'),
428 "\\n" => Ok('\n'),
429 "\\r" => Ok('\r'),
430 "\\f" => Ok('\x14'),
431 "\\s" => Ok('\x20'),
432 "\\\\" => Ok(ASCII_ESCAPE_CHAR),
433 "\\'" => Ok('\''),
434 "\\\"" => Ok('"'),
435 "\\0" => Ok('\0'),
436 "\\v" => Ok('\x0B'),
437 "\\a" => Ok('\x07'),
438 _ => Err(rumtk_format!(
440 "Unknown escape sequence? Sequence: {}!",
441 escaped_str
442 )),
443 }
444}
445
446fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
451 match escaped_str {
452 "\\t" => Ok(9), "\\b" => Ok(8), "\\n" => Ok(10), "\\r" => Ok(13), "\\f" => Ok(12), "\\s" => Ok(32), "\\\\" => Ok(27), "\\'" => Ok(39), "\\\"" => Ok(34), "\\0" => Ok(0), "\\v" => Ok(11), "\\a" => Ok(7), _ => hex_to_byte(escaped_str),
468 }
469}
470
471fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
475 match u32::from_str_radix(&hex_str, 16) {
476 Ok(result) => Ok(result),
477 Err(val) => Err(rumtk_format!(
478 "Failed to parse string with error {}! Input string {} \
479 is not hex string!",
480 val,
481 hex_str
482 )),
483 }
484}
485
486fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
490 match u8::from_str_radix(&hex_str, 16) {
491 Ok(result) => Ok(result),
492 Err(val) => Err(rumtk_format!(
493 "Failed to parse string with error {}! Input string {} \
494 is not hex string!",
495 val,
496 hex_str
497 )),
498 }
499}
500
501fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
505 match u32::from_str_radix(&hoctal_str, 8) {
506 Ok(result) => Ok(result),
507 Err(val) => Err(rumtk_format!(
508 "Failed to parse string with error {}! Input string {} \
509 is not an octal string!",
510 val,
511 hoctal_str
512 )),
513 }
514}
515
516fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
520 match u8::from_str_radix(&hoctal_str, 8) {
521 Ok(result) => Ok(result),
522 Err(val) => Err(rumtk_format!(
523 "Failed to parse string with error {}! Input string {} \
524 is not an octal string!",
525 val,
526 hoctal_str
527 )),
528 }
529}
530
531fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
535 match char::from_u32(*num) {
536 Some(result) => Ok(result.to_rumstring()),
537 None => Err(rumtk_format!(
538 "Failed to cast number to character! Number {}",
539 num
540 )),
541 }
542}
543
544fn number_to_char_unchecked(num: &u32) -> RUMString {
550 unsafe { char::from_u32_unchecked(*num).to_rumstring() }
551}
552
553pub fn escape(unescaped_str: &str) -> RUMString {
565 basic_escape(unescaped_str)
566 .replace("{", "")
567 .replace("}", "")
568 .to_rumstring()
569}
570
571pub fn basic_escape(unescaped_str: &str) -> RUMString {
582 let escaped = is_escaped_str(unescaped_str);
583 if !escaped {
584 return unescaped_str.escape_default().to_compact_string();
585 }
586 unescaped_str.to_rumstring()
587}
588
/// Returns `true` when every byte of `unescaped_str` is ASCII.
///
/// The empty string counts as ASCII.
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
598
599pub fn is_escaped_str(unescaped_str: &str) -> bool {
608 if !is_ascii_str(unescaped_str) {
609 return false;
610 }
611
612 for c in unescaped_str.chars() {
613 if !is_printable_char(&c) {
614 return false;
615 }
616 }
617 true
618}
619
620pub fn is_printable_char(c: &char) -> bool {
624 &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
625}
626
627pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
631 let mut filtered = unescaped_str.to_rumstring();
632 filtered.retain(closure);
633 filtered
634}
635
636pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
640 filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
641}