1use crate::core::{is_unique, RUMResult, RUMVec};
22use crate::types::RUMBuffer;
23use chardetng::EncodingDetector;
24pub use compact_str::{
25 format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
26};
27use encoding_rs::Encoding;
28use unicode_segmentation::UnicodeSegmentation;
/// Initial capacity reserved when collecting an escape sequence into a string.
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash character that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character treated as printable ASCII (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character treated as printable ASCII (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string, used as the "nothing found" value by the grapheme helpers.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some("")`, used as the fallback when a grapheme lookup yields nothing.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// Every printable ASCII character from space through tilde, in code-point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
38
/// The string type used throughout this module (an alias for `CompactString`).
pub type RUMString = CompactString;
/// A `(from, to)` replacement pair applied after escaping.
pub type EscapeException<'a> = (&'a str, &'a str);
/// A borrowed list of [`EscapeException`] pairs.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
43
44pub trait UTFStringExtensions {
54 fn count_graphemes(&self) -> usize;
55
56 fn get_grapheme(&self, index: usize) -> &str;
75
76 fn get_graphemes(&self) -> Vec<&str>;
77
78 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
79
80 #[inline(always)]
81 fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
82 if index >= graphemes.len() {
83 return RUMString::from(EMPTY_STRING);
84 }
85 RUMString::from(graphemes[index])
86 }
87
88 #[inline(always)]
89 fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
90 let mut window: RUMString = RUMString::with_capacity(max - min);
91 let start = min + offset;
92 let end = max + offset;
93 let graphemes = self.get_graphemes();
94 for i in start..end {
95 window += &self.take_grapheme(&graphemes, i);
96 }
97 window
98 }
99
100 #[inline(always)]
101 fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
102 let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
103 for grapheme in self.get_grapheme_chunk(offset) {
104 if grapheme == end_pattern {
105 return RUMString::from(window);
106 } else {
107 window += grapheme;
108 }
109 }
110 RUMString::from(window)
111 }
112
113 #[inline(always)]
114 fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
115 for grapheme in self.get_grapheme_chunk(offset) {
116 if grapheme == pattern {
117 return grapheme;
118 }
119 }
120 EMPTY_STRING
121 }
122
123 #[inline(always)]
124 fn truncate(&self, max_size: usize) -> RUMString {
125 self.get_grapheme_window(0, max_size, 0)
126 }
127}
128
/// Types that can expose their contents as a borrowed string slice.
pub trait AsStr {
    /// Returns the contents as a `&str`.
    fn as_str(&self) -> &str;
}
132
133pub trait RUMStringConversions: ToString {
134 #[inline(always)]
135 fn to_rumstring(&self) -> RUMString {
136 RUMString::from(self.to_string())
137 }
138
139 #[inline(always)]
140 fn to_raw(&self) -> RUMVec<u8> {
141 self.to_string().as_bytes().to_vec()
142 }
143
144 #[inline(always)]
145 fn to_buffer(&self) -> RUMBuffer {
146 RUMBuffer::from(self.to_string())
147 }
148}
149
150pub trait StringUtils: AsStr + UTFStringExtensions {
151 #[inline(always)]
152 fn duplicate(&self, count: usize) -> RUMString {
153 let mut duplicated = RUMString::with_capacity(count);
154 for i in 0..count {
155 duplicated += &self.as_str();
156 }
157 duplicated
158 }
159
160 fn is_unique(&self) -> bool {
161 let graphemes = self.get_graphemes();
162 is_unique(&graphemes)
163 }
164}
165
166impl UTFStringExtensions for RUMString {
167 #[inline(always)]
168 fn count_graphemes(&self) -> usize {
169 self.graphemes(true).count()
170 }
171
172 #[inline(always)]
173 fn get_grapheme(&self, index: usize) -> &str {
174 self.graphemes(true)
175 .nth(index)
176 .or(EMPTY_STRING_OPTION)
177 .unwrap()
178 }
179
180 #[inline(always)]
181 fn get_graphemes(&self) -> Vec<&str> {
182 self.graphemes(true).collect::<Vec<&str>>()
183 }
184
185 #[inline(always)]
186 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
187 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
188 }
189}
190
191impl RUMStringConversions for RUMString {}
192impl AsStr for RUMString {
193 fn as_str(&self) -> &str {
194 self.as_str()
195 }
196}
197impl StringUtils for RUMString {}
198
199impl UTFStringExtensions for str {
200 #[inline(always)]
201 fn count_graphemes(&self) -> usize {
202 self.graphemes(true).count()
203 }
204
205 #[inline(always)]
206 fn get_grapheme(&self, index: usize) -> &str {
207 self.graphemes(true)
208 .nth(index)
209 .or(EMPTY_STRING_OPTION)
210 .unwrap()
211 }
212
213 #[inline(always)]
214 fn get_graphemes(&self) -> Vec<&str> {
215 self.graphemes(true).collect::<Vec<&str>>()
216 }
217
218 #[inline(always)]
219 fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
220 self.graphemes(true).skip(offset).collect::<Vec<&str>>()
221 }
222}
223
// `str` uses the default conversion methods (`to_rumstring`, `to_raw`, `to_buffer`).
impl RUMStringConversions for str {}

impl AsStr for str {
    /// Returns the slice itself.
    fn as_str(&self) -> &str {
        self
    }
}

// `str` uses the default `StringUtils` helpers (`duplicate`, `is_unique`).
impl StringUtils for str {}

// `char` uses the default conversion methods (`to_rumstring`, `to_raw`, `to_buffer`).
impl RUMStringConversions for char {}
235
/// Conversion of byte containers into a [`RUMString`].
pub trait RUMArrayConversions {
    /// Builds a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
239
240impl RUMArrayConversions for Vec<u8> {
241 fn to_rumstring(&self) -> RUMString {
242 self.as_slice().to_rumstring()
243 }
244}
245
246impl RUMArrayConversions for &[u8] {
247 fn to_rumstring(&self) -> RUMString {
248 RUMString::from_utf8(&self).unwrap()
249 }
250}
251
252pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
255 let mut count: usize = 0;
256 for tok in vector.iter() {
257 if string_token != tok {
258 count += 1;
259 }
260 }
261 count
262}
263
264pub fn try_decode(src: &[u8]) -> RUMString {
271 let mut detector = EncodingDetector::new();
272 detector.feed(&src, true);
273 let encoding = detector.guess(None, true);
274 decode(src, encoding)
275}
276
277pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
283 let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
284 Some(v) => v,
285 None => return RUMString::from(""),
286 };
287 decode(src, encoding)
288}
289
290fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
296 match encoding.decode_without_bom_handling_and_without_replacement(&src) {
297 Some(res) => RUMString::from(res),
298 None => RUMString::from_utf8(src).unwrap(),
299 }
300}
301
302pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
311 let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
312 let str_size = graphemes.len();
313 let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
314 let mut i = 0;
315 while i < str_size {
316 let seq_start = graphemes[i];
317 match seq_start {
318 "\\" => {
319 let escape_seq = get_grapheme_string(&graphemes, " ", i);
320 let mut c = match unescape(&escape_seq) {
321 Ok(c) => c,
322 Err(_why) => Vec::from(escape_seq.as_bytes()),
323 };
324 result.append(&mut c);
325 i += &escape_seq.count_graphemes();
326 }
327 _ => {
328 result.append(&mut Vec::from(seq_start.as_bytes()));
329 i += 1;
330 }
331 }
332 }
333 Ok(try_decode(result.as_slice()))
334}
335
336pub fn get_grapheme_string<'a>(
340 graphemes: &Vec<&'a str>,
341 end_grapheme: &str,
342 start_index: usize,
343) -> RUMString {
344 get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
345}
346
/// Collects the graphemes from `start_index` up to (not including) the first
/// one equal to `end_grapheme`.
///
/// Returns everything from `start_index` onwards when `end_grapheme` never
/// occurs, and an empty vector when `start_index` is past the end.
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
368
369pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
380 let lower_case = escaped_str.to_lowercase();
381 let mut bytes: Vec<u8> = Vec::with_capacity(3);
382 match &lower_case[0..2] {
383 "\\x" => {
385 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
386 bytes.append(&mut byte_str.as_bytes().to_vec());
387 }
388 "\\u" => {
390 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
391 bytes.append(&mut byte_str.as_bytes().to_vec());
392 }
393 "\\c" => {
395 let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
396 bytes.append(&mut byte_str.as_bytes().to_vec());
397 }
398 "\\o" => {
400 let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
401 bytes.append(&mut byte_str.as_bytes().to_vec());
402 }
403 "\\m" => match lower_case.count_graphemes() {
405 8 => {
406 bytes.push(hex_to_byte(&lower_case[2..4])?);
407 bytes.push(hex_to_byte(&lower_case[4..6])?);
408 bytes.push(hex_to_byte(&lower_case[6..8])?);
409 }
410 6 => {
411 bytes.push(hex_to_byte(&lower_case[2..4])?);
412 bytes.push(hex_to_byte(&lower_case[4..6])?);
413 }
414 _ => {
415 return Err(rumtk_format!(
416 "Unknown multibyte sequence. Cannot decode {}",
417 lower_case
418 ))
419 }
420 },
421 "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
423 _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
425 }
426 Ok(bytes)
427}
428
429fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
434 match escaped_str {
435 "\\t" => Ok('\t'),
437 "\\b" => Ok('\x08'),
438 "\\n" => Ok('\n'),
439 "\\r" => Ok('\r'),
440 "\\f" => Ok('\x14'),
441 "\\s" => Ok('\x20'),
442 "\\\\" => Ok(ASCII_ESCAPE_CHAR),
443 "\\'" => Ok('\''),
444 "\\\"" => Ok('"'),
445 "\\0" => Ok('\0'),
446 "\\v" => Ok('\x0B'),
447 "\\a" => Ok('\x07'),
448 _ => Err(rumtk_format!(
450 "Unknown escape sequence? Sequence: {}!",
451 escaped_str
452 )),
453 }
454}
455
456fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
461 match escaped_str {
462 "\\t" => Ok(9), "\\b" => Ok(8), "\\n" => Ok(10), "\\r" => Ok(13), "\\f" => Ok(12), "\\s" => Ok(32), "\\\\" => Ok(27), "\\'" => Ok(39), "\\\"" => Ok(34), "\\0" => Ok(0), "\\v" => Ok(11), "\\a" => Ok(7), _ => hex_to_byte(escaped_str),
478 }
479}
480
481fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
485 match u32::from_str_radix(&hex_str, 16) {
486 Ok(result) => Ok(result),
487 Err(val) => Err(rumtk_format!(
488 "Failed to parse string with error {}! Input string {} \
489 is not hex string!",
490 val,
491 hex_str
492 )),
493 }
494}
495
496fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
500 match u8::from_str_radix(&hex_str, 16) {
501 Ok(result) => Ok(result),
502 Err(val) => Err(rumtk_format!(
503 "Failed to parse string with error {}! Input string {} \
504 is not hex string!",
505 val,
506 hex_str
507 )),
508 }
509}
510
511fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
515 match u32::from_str_radix(&hoctal_str, 8) {
516 Ok(result) => Ok(result),
517 Err(val) => Err(rumtk_format!(
518 "Failed to parse string with error {}! Input string {} \
519 is not an octal string!",
520 val,
521 hoctal_str
522 )),
523 }
524}
525
526fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
530 match u8::from_str_radix(&hoctal_str, 8) {
531 Ok(result) => Ok(result),
532 Err(val) => Err(rumtk_format!(
533 "Failed to parse string with error {}! Input string {} \
534 is not an octal string!",
535 val,
536 hoctal_str
537 )),
538 }
539}
540
541fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
545 match char::from_u32(*num) {
546 Some(result) => Ok(result.to_rumstring()),
547 None => Err(rumtk_format!(
548 "Failed to cast number to character! Number {}",
549 num
550 )),
551 }
552}
553
554fn number_to_char_unchecked(num: &u32) -> RUMString {
560 unsafe { char::from_u32_unchecked(*num).to_rumstring() }
561}
562
563pub fn escape(unescaped_str: &str) -> RUMString {
575 basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
576}
577
578pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
589 let escaped = is_escaped_str(unescaped_str);
590 if !escaped {
591 let mut escaped_str = unescaped_str.escape_default().to_string();
592 for (from, to) in except {
593 escaped_str = escaped_str.replace(from, to);
594 }
595 return escaped_str.to_rumstring();
596 }
597 unescaped_str.to_rumstring()
598}
599
/// Returns `true` when every byte of `unescaped_str` is ASCII (trivially true
/// for the empty string).
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
609
610pub fn is_escaped_str(unescaped_str: &str) -> bool {
619 if !is_ascii_str(unescaped_str) {
620 return false;
621 }
622
623 for c in unescaped_str.chars() {
624 if !is_printable_char(&c) {
625 return false;
626 }
627 }
628 true
629}
630
631pub fn is_printable_char(c: &char) -> bool {
635 &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
636}
637
638pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
642 let mut filtered = unescaped_str.to_rumstring();
643 filtered.retain(closure);
644 filtered
645}
646
647pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
651 filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
652}