Skip to main content

rumtk_core/
strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D. <lsantos@medicalmasses.com>
5 * Copyright (C) 2025  MedicalMasses L.L.C. <contact@medicalmasses.com>
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
19 */
20use crate::core::{is_unique, RUMResult, RUMVec};
21use crate::types::RUMBuffer;
22use chardetng::EncodingDetector;
23pub use compact_str::{
24    format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
25};
26use encoding_rs::Encoding;
27use std::cmp::min;
28use unicode_segmentation::UnicodeSegmentation;
29/**************************** Constants**************************************/
/// Length of an escape-sequence window, in characters — presumably the backslash, the notation
/// letter and four digits; TODO confirm against callers.
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character accepted by [`is_printable_char`] (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character accepted by [`is_printable_char`] (tilde).
const MAX_ASCII_READABLE: char = '~';
/// The empty string.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some` wrapping the empty string.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// Every ASCII character from space through tilde, in code point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
38
39/**************************** Types *****************************************/
/// The toolkit's owned string type (an alias for `CompactString`).
pub type RUMString = CompactString;
/// A `(from, to)` replacement applied to text after it has been escaped (see [`basic_escape`]).
pub type EscapeException<'a> = (&'a str, &'a str);
/// A borrowed list of [`EscapeException`] replacements.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
/// One grapheme. It is a string slice because a grapheme may span several code points.
pub type Grapheme<'a> = &'a str;
/// An owned sequence of graphemes borrowed from a single source string.
pub type GraphemeStringView<'a> = RUMVec<Grapheme<'a>>;
/// A sequence of graphemes to search for.
pub type GraphemePattern<'a> = &'a [Grapheme<'a>];
/// A borrowed run of graphemes; `'b` is the borrow of the view, `'a` the borrow of the text.
pub type GraphemeSlice<'b, 'a> = &'b [Grapheme<'a>];
/// A `(left, right)` pair of patterns, as consumed by `GraphemeStr::trim` and
/// `GraphemeStr::splice`.
pub type GraphemePatternPair<'a> = (GraphemePattern<'a>, GraphemePattern<'a>);
48
///
/// The equivalent of a `stringview`, but at the grapheme level: the string is split into
/// graphemes once and the view exposes a window over them, so text can be walked one user
/// perceived character at a time instead of one byte or code point at a time.
///
#[derive(Default, Debug, PartialEq, Clone)]
pub struct GraphemeStr<'a> {
    // All graphemes of the source string, in order.
    view: GraphemeStringView<'a>,
    // Index into `view` of the first grapheme inside the window.
    start: usize,
    // Index into `view` one past the last grapheme inside the window.
    end: usize,
}
59
60impl<'a> GraphemeStr<'a> {
61    pub fn from(string: &'a str) -> Self {
62        let view = string.graphemes(true).collect::<GraphemeStringView>();
63        Self::from_view(view)
64    }
65
66    pub fn from_view(view: GraphemeStringView<'a>) -> Self {
67        let start = 0;
68        let end = view.len();
69        Self { view, start, end }
70    }
71
72    pub fn at(&self, index: usize) -> Grapheme<'a> {
73        self.view[index]
74    }
75
76    pub fn trim(&self, pattern: &GraphemePatternPair<'a>) -> Self {
77        let (left_pattern, right_pattern) = pattern;
78        self.trim_left(left_pattern).trim_right(right_pattern)
79    }
80
81    pub fn trim_left(&self, pattern: &GraphemePattern<'a>) -> Self {
82        let new_offset = self.find(pattern, self.start);
83        Self {
84            view: self.view.clone(),
85            start: new_offset,
86            end: self.end,
87        }
88    }
89
90    pub fn trim_right(&self, pattern: &GraphemePattern<'a>) -> Self {
91        let new_offset = self.rfind(pattern, self.end);
92        Self {
93            view: self.view.clone(),
94            start: self.start,
95            end: new_offset,
96        }
97    }
98
99    pub fn splice(&self, skip_pattern: &GraphemePatternPair<'a>) -> Self {
100        let (left_pattern, right_pattern) = skip_pattern;
101        let mut new_view = GraphemeStringView::with_capacity(self.end - self.start);
102        let mut offset = self.start;
103        let l_pattern_s = left_pattern.len();
104
105        while offset < self.end {
106            let target_s = self.find(left_pattern, offset) + l_pattern_s;
107            for i in offset..target_s {
108                new_view.push(self.view[i]);
109            }
110            offset = self.find(right_pattern, target_s);
111        }
112
113        GraphemeStr::from_view(new_view)
114    }
115
116    pub fn find(&self, pattern: &GraphemePattern<'a>, offset: usize) -> usize {
117        let pattern_s = pattern.len();
118        let mut new_offset = offset;
119        let mut pattern_end = new_offset + pattern_s;
120
121        while new_offset < self.end && pattern_end < self.end {
122            if self.view[new_offset..pattern_end] == **pattern {
123                break;
124            }
125
126            new_offset += 1;
127            pattern_end = new_offset + pattern_s;
128        }
129
130        new_offset
131    }
132
133    pub fn rfind(&self, pattern: &GraphemePattern<'a>, offset: usize) -> usize {
134        let pattern_s = pattern.len();
135        let mut new_offset = offset;
136        while new_offset > self.start {
137            if self.view[new_offset - pattern_s..new_offset] == **pattern {
138                break;
139            }
140
141            new_offset -= 1;
142        }
143
144        new_offset
145    }
146
147    pub fn len(&self) -> usize {
148        self.end - self.start
149    }
150
151    pub fn get_graphemes(&self) -> GraphemeSlice<'_, 'a> {
152        &self.view[self.start..self.end]
153    }
154
155    pub fn truncate(&self, size: usize) -> Self {
156        let end = min(size, self.end);
157        Self {
158            view: self.view.clone(),
159            start: self.start,
160            end,
161        }
162    }
163
164    pub fn is_unique(&self) -> bool {
165        is_unique(&self.view)
166    }
167}
168
169impl ToString for GraphemeStr<'_> {
170    fn to_string(&self) -> String {
171        let mut new_string = String::with_capacity(self.len());
172
173        for grapheme in self.view[self.start..self.end].iter() {
174            new_string.push_str(grapheme);
175        }
176
177        new_string
178    }
179}
180
// `GraphemeStr` uses the default conversions of `RUMStringConversions`, which are all built on
// its `ToString` implementation.
impl RUMStringConversions for GraphemeStr<'_> {}
182
183/**************************** Traits ****************************************/
184
/// Minimal interface of a growable string buffer.
pub trait StringLike {
    /// Creates a buffer; `capacity` is presumably a size hint in bytes — TODO confirm with implementors.
    fn with_capacity(capacity: usize) -> Self;
    /// Appends `string` to the buffer.
    fn push_str(&mut self, string: &str);
}
189
190pub trait AsStr {
191    fn as_str(&self) -> &str;
192    fn as_grapheme_str(&self) -> GraphemeStr {
193        GraphemeStr::from(self.as_str())
194    }
195}
196
197pub trait RUMStringConversions: ToString {
198    #[inline(always)]
199    fn to_rumstring(&self) -> RUMString {
200        RUMString::from(self.to_string())
201    }
202
203    #[inline(always)]
204    fn to_raw(&self) -> RUMVec<u8> {
205        self.to_string().as_bytes().to_vec()
206    }
207
208    #[inline(always)]
209    fn to_buffer(&self) -> RUMBuffer {
210        RUMBuffer::from(self.to_string())
211    }
212}
213
214pub trait StringUtils: AsStr + RUMStringConversions {
215    #[inline(always)]
216    fn duplicate(&self, count: usize) -> RUMString {
217        let mut duplicated = RUMString::with_capacity(count);
218        for i in 0..count {
219            duplicated += &self.as_str();
220        }
221        duplicated
222    }
223
224    fn truncate(&self, count: usize) -> RUMString {
225        self.as_grapheme_str().truncate(count).to_rumstring()
226    }
227}
228
229impl AsStr for String {
230    fn as_str(&self) -> &str {
231        self.as_str()
232    }
233}
234
235impl RUMStringConversions for RUMString {}
236impl AsStr for RUMString {
237    fn as_str(&self) -> &str {
238        self.as_str()
239    }
240}
241impl StringUtils for RUMString {}
242
// String slices use the default conversions.
impl RUMStringConversions for str {}

impl AsStr for str {
    // A `str` is already the borrowed form, so it is returned as is.
    fn as_str(&self) -> &str {
        self
    }
}

// String slices use the default helpers.
impl StringUtils for str {}

// Single characters use the default conversions.
impl RUMStringConversions for char {}
254
/// Conversion of byte containers into a [`RUMString`].
pub trait RUMArrayConversions {
    /// Builds a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
258
259impl RUMArrayConversions for Vec<u8> {
260    fn to_rumstring(&self) -> RUMString {
261        self.as_slice().to_rumstring()
262    }
263}
264
265impl RUMArrayConversions for &[u8] {
266    fn to_rumstring(&self) -> RUMString {
267        RUMString::from_utf8(&self).unwrap()
268    }
269}
270
271/**************************** Helpers ***************************************/
272
273pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
274    let mut count: usize = 0;
275    for tok in vector.iter() {
276        if string_token != tok {
277            count += 1;
278        }
279    }
280    count
281}
282
283///
284/// Implements decoding this string from its auto-detected encoding to UTF-8.
285/// Failing that we assume the string was encoded in UTF-8 and return a copy.
286///
287/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
288///
289pub fn try_decode(src: &[u8]) -> RUMString {
290    let mut detector = EncodingDetector::new();
291    detector.feed(&src, true);
292    let encoding = detector.guess(None, true);
293    decode(src, encoding)
294}
295
296///
297/// Implements decoding this string from a specific encoding to UTF-8.
298///
299/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
300///
301pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
302    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
303        Some(v) => v,
304        None => return RUMString::from(""),
305    };
306    decode(src, encoding)
307}
308
309///
310/// Implements decoding of input with encoder.
311///
312/// Note => Decoding is facilitated via the crate encoding_rs.
313///
314fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
315    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
316        Some(res) => RUMString::from(res),
317        None => RUMString::from_utf8(src).unwrap(),
318    }
319}
320
321///
322/// This function will scan through an escaped string and unescape any escaped characters.
323/// We collect these characters as a byte vector.
324/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
325/// valid UTF-8 string.
326///
327/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
328///
329pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
330    let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
331    let str_size = graphemes.len();
332    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
333    let mut i = 0;
334    while i < str_size {
335        let seq_start = graphemes[i];
336        match seq_start {
337            "\\" => {
338                let escape_seq = get_grapheme_string(&graphemes, " ", i);
339                let mut c = match unescape(&escape_seq) {
340                    Ok(c) => c,
341                    Err(_why) => Vec::from(escape_seq.as_bytes()),
342                };
343                result.append(&mut c);
344                i += &escape_seq.as_grapheme_str().len();
345            }
346            _ => {
347                result.append(&mut Vec::from(seq_start.as_bytes()));
348                i += 1;
349            }
350        }
351    }
352    Ok(try_decode(result.as_slice()))
353}
354
355///
356/// Get the grapheme block and concatenate it into a newly allocated [`RUMString`].
357///
358pub fn get_grapheme_string<'a>(
359    graphemes: &Vec<&'a str>,
360    end_grapheme: &str,
361    start_index: usize,
362) -> RUMString {
363    get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
364}
365
///
/// Returns the graphemes from `start_index` up to, but not including, the first grapheme equal
/// to `end_grapheme`. Without such a grapheme, everything from `start_index` on is returned.
///
/// Graphemes are handled as string slices because one grapheme may consist of several code
/// points.
///
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
387
388///
389/// Turn escaped character sequence into the equivalent UTF-8 character
390/// This function accepts \o, \x and \u formats.
391/// This function will also attempt to unescape the common C style control characters.
392/// Anything else needs to be expressed as hex or octal patterns with the formats above.
393///
394/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
395/// We can then use the bytes and attempt a decode() to figure out the string encoding and
396/// get the correct conversion to UTF-8. **Fingers crossed**
397///
398pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
399    let lower_case = escaped_str.to_lowercase();
400    let mut bytes: Vec<u8> = Vec::with_capacity(3);
401    match &lower_case[0..2] {
402        // Hex notation case. Assume we are getting xxyy bytes
403        "\\x" => {
404            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
405            bytes.append(&mut byte_str.as_bytes().to_vec());
406        }
407        // Unicode notation case, we need to do an extra step or we will lose key bytes.
408        "\\u" => {
409            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
410            bytes.append(&mut byte_str.as_bytes().to_vec());
411        }
412        // Single byte notation case
413        "\\c" => {
414            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
415            bytes.append(&mut byte_str.as_bytes().to_vec());
416        }
417        // Unicode notation case
418        "\\o" => {
419            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
420            bytes.append(&mut byte_str.as_bytes().to_vec());
421        }
422        // Multibyte notation case
423        "\\m" => match lower_case.as_grapheme_str().len() {
424            8 => {
425                bytes.push(hex_to_byte(&lower_case[2..4])?);
426                bytes.push(hex_to_byte(&lower_case[4..6])?);
427                bytes.push(hex_to_byte(&lower_case[6..8])?);
428            }
429            6 => {
430                bytes.push(hex_to_byte(&lower_case[2..4])?);
431                bytes.push(hex_to_byte(&lower_case[4..6])?);
432            }
433            _ => {
434                return Err(rumtk_format!(
435                    "Unknown multibyte sequence. Cannot decode {}",
436                    lower_case
437                ))
438            }
439        },
440        // Custom encoding
441        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
442        // Single byte codes.
443        _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
444    }
445    Ok(bytes)
446}
447
448///
449/// Unescape basic character
450/// We use pattern matching to map the basic escape character to its corresponding integer value.
451///
452fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
453    match escaped_str {
454        // Common control sequences
455        "\\t" => Ok('\t'),
456        "\\b" => Ok('\x08'),
457        "\\n" => Ok('\n'),
458        "\\r" => Ok('\r'),
459        "\\f" => Ok('\x14'),
460        "\\s" => Ok('\x20'),
461        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
462        "\\'" => Ok('\''),
463        "\\\"" => Ok('"'),
464        "\\0" => Ok('\0'),
465        "\\v" => Ok('\x0B'),
466        "\\a" => Ok('\x07'),
467        // Control sequences by
468        _ => Err(rumtk_format!(
469            "Unknown escape sequence? Sequence: {}!",
470            escaped_str
471        )),
472    }
473}
474
475///
476/// Unescape basic character
477/// We use pattern matching to map the basic escape character to its corresponding integer value.
478///
479fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
480    match escaped_str {
481        // Common control sequences
482        "\\t" => Ok(9),   // Tab/Character Tabulation
483        "\\b" => Ok(8),   // Backspace
484        "\\n" => Ok(10),  // New line/ Line Feed character
485        "\\r" => Ok(13),  // Carriage Return character
486        "\\f" => Ok(12),  // Form Feed
487        "\\s" => Ok(32),  // Space
488        "\\\\" => Ok(27), // Escape
489        "\\'" => Ok(39),  // Single quote
490        "\\\"" => Ok(34), // Double quote
491        "\\0" => Ok(0),   // Null character
492        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
493        "\\a" => Ok(7),   // Alert bell
494        // Control sequences by hex
495        //Err(rumtk_format!("Unknown escape sequence? Sequence: {}!", escaped_str))
496        _ => hex_to_byte(escaped_str),
497    }
498}
499
500///
501/// Turn hex string to number (u32)
502///
503fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
504    match u32::from_str_radix(&hex_str, 16) {
505        Ok(result) => Ok(result),
506        Err(val) => Err(rumtk_format!(
507            "Failed to parse string with error {}! Input string {} \
508        is not hex string!",
509            val,
510            hex_str
511        )),
512    }
513}
514
515///
516/// Turn hex string to byte (u8)
517///
518fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
519    match u8::from_str_radix(&hex_str, 16) {
520        Ok(result) => Ok(result),
521        Err(val) => Err(rumtk_format!(
522            "Failed to parse string with error {}! Input string {} \
523        is not hex string!",
524            val,
525            hex_str
526        )),
527    }
528}
529
530///
531/// Turn octal string to number (u32)
532///
533fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
534    match u32::from_str_radix(&hoctal_str, 8) {
535        Ok(result) => Ok(result),
536        Err(val) => Err(rumtk_format!(
537            "Failed to parse string with error {}! Input string {} \
538        is not an octal string!",
539            val,
540            hoctal_str
541        )),
542    }
543}
544
545///
546/// Turn octal string to byte (u32)
547///
548fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
549    match u8::from_str_radix(&hoctal_str, 8) {
550        Ok(result) => Ok(result),
551        Err(val) => Err(rumtk_format!(
552            "Failed to parse string with error {}! Input string {} \
553        is not an octal string!",
554            val,
555            hoctal_str
556        )),
557    }
558}
559
560///
561/// Turn number to UTF-8 char
562///
563fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
564    match char::from_u32(*num) {
565        Some(result) => Ok(result.to_rumstring()),
566        None => Err(rumtk_format!(
567            "Failed to cast number to character! Number {}",
568            num
569        )),
570    }
571}
572
573///
574/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
575/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
576/// and deal with validity at a higher layer.
577///
578fn number_to_char_unchecked(num: &u32) -> RUMString {
579    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
580}
581
582///
583/// Turn UTF-8 character into escaped character sequence as expected in HL7
584///
585/// # Example
586/// ```
587///  use rumtk_core::strings::{escape};
588///  let message = "I ❤ my wife!";
589///  let escaped_message = escape(&message);
590///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
591///```
592///
593pub fn escape(unescaped_str: &str) -> RUMString {
594    basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
595}
596
597///
598/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
599///
600/// # Example
601/// ```
602///  use rumtk_core::strings::basic_escape;
603///  let message = "I ❤ my wife!";
604///  let escaped_message = basic_escape(&message, &vec![]);
605///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
606///```
607pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
608    let escaped = is_escaped_str(unescaped_str);
609    if !escaped {
610        let mut escaped_str = unescaped_str.escape_default().to_string();
611        for (from, to) in except {
612            escaped_str = escaped_str.replace(from, to);
613        }
614        return escaped_str.to_rumstring();
615    }
616    unescaped_str.to_rumstring()
617}
618
///
/// Checks whether every byte of the given string is within the ASCII range.
///
/// Remember: all strings are UTF-8 encoded in Rust, and ASCII text is encoded identically in
/// UTF-8, one byte per character.
///
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
628
629///
630/// Checks if an input string is already escaped.
631/// The idea is to avoid escaping the escaped string thus making it a nightmare to undo the
632/// escaping later on.
633///
634/// Basically, if you were to blindly escape the input string, back slashes keep getting escaped.
635/// For example `\r -> \\r -> \\\\r -> ...`.
636///
637pub fn is_escaped_str(unescaped_str: &str) -> bool {
638    if !is_ascii_str(unescaped_str) {
639        return false;
640    }
641
642    for c in unescaped_str.chars() {
643        if !is_printable_char(&c) {
644            return false;
645        }
646    }
647    true
648}
649
650///
651/// Returns whether a character is in the ASCII printable range.
652///
653pub fn is_printable_char(c: &char) -> bool {
654    &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
655}
656
657///
658/// Removes all non ASCII and all non printable characters from string.
659///
660pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
661    let mut filtered = unescaped_str.to_rumstring();
662    filtered.retain(closure);
663    filtered
664}
665
666///
667/// Removes all non ASCII and all non printable characters from string.
668///
669pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
670    filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
671}