Skip to main content

rumtk_core/
strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D.
5 * Copyright (C) 2025  MedicalMasses L.L.C.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 */
21use crate::core::{is_unique, RUMResult, RUMVec};
22use crate::types::RUMBuffer;
23use chardetng::EncodingDetector;
24pub use compact_str::{
25    format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
26};
27use encoding_rs::Encoding;
28use unicode_segmentation::UnicodeSegmentation;
/**************************** Constants**************************************/
// Initial capacity reserved by `UTFStringExtensions::get_grapheme_string` for its output.
const ESCAPED_STRING_WINDOW: usize = 6;
// The backslash; `unescape_control` yields it for an escaped backslash.
const ASCII_ESCAPE_CHAR: char = '\\';
// Inclusive lower bound tested by `is_printable_char`.
const MIN_ASCII_READABLE: char = ' ';
// Inclusive upper bound tested by `is_printable_char`.
const MAX_ASCII_READABLE: char = '~';
/// The empty string slice; returned by the grapheme helpers when nothing is found.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some("")`; the fallback `get_grapheme` substitutes when the index is out of range.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// The ASCII characters from space (`' '`) through tilde (`'~'`), in code-point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
38
/**************************** Types *****************************************/
/// String type used throughout this module; an alias for [`CompactString`].
pub type RUMString = CompactString;
/// A `(from, to)` pair; [`basic_escape`] replaces every `from` with `to` after escaping.
pub type EscapeException<'a> = (&'a str, &'a str);
/// A borrowed list of [`EscapeException`] pairs.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
43
44/**************************** Traits ****************************************/
45
46///
47/// Implemented indexing trait for String and str which uses the UnicodeSegmentation facilities to
48/// enable grapheme iteration by default. There could be some performance penalty, but it will allow
49/// for native Unicode support to the best extent possible.
50///
51/// We also enable decoding from Encoding Standard encodings to UTF-8.
52///
53pub trait UTFStringExtensions {
54    fn count_graphemes(&self) -> usize;
55
56    ///
57    /// Return a grapheme unit which could span multiple Unicode codepoints or "characters".
58    ///
59    /// # Note
60    /// ```text
61    ///     If the grapheme requested does not exists, this method will return a blank string.
62    /// ```
63    ///
64    /// Instead of just retrieving a codepoint as character, I decided to take it a step further and
65    /// have support for grapheme selection such that characters in written language like sanskrit
66    /// can be properly selected and evaluated.
67    ///
68    /// [!CAUTION]
69    /// This can be an extremely slow operation over large strings since each call to this method
70    /// will need to rescan the input string every time we need to look up a grapheme. Unfortunately,
71    /// this is a side effect of convenience. To improve performance, call .get_graphemes() once and
72    /// then call take_grapheme() over that iterator.
73    ///
74    fn get_grapheme(&self, index: usize) -> &str;
75
76    fn get_graphemes(&self) -> Vec<&str>;
77
78    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
79
80    #[inline(always)]
81    fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
82        if index >= graphemes.len() {
83            return RUMString::from(EMPTY_STRING);
84        }
85        RUMString::from(graphemes[index])
86    }
87
88    #[inline(always)]
89    fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
90        let mut window: RUMString = RUMString::with_capacity(max - min);
91        let start = min + offset;
92        let end = max + offset;
93        let graphemes = self.get_graphemes();
94        for i in start..end {
95            window += &self.take_grapheme(&graphemes, i);
96        }
97        window
98    }
99
100    #[inline(always)]
101    fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
102        let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
103        for grapheme in self.get_grapheme_chunk(offset) {
104            if grapheme == end_pattern {
105                return RUMString::from(window);
106            } else {
107                window += grapheme;
108            }
109        }
110        RUMString::from(window)
111    }
112
113    #[inline(always)]
114    fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
115        for grapheme in self.get_grapheme_chunk(offset) {
116            if grapheme == pattern {
117                return grapheme;
118            }
119        }
120        EMPTY_STRING
121    }
122
123    #[inline(always)]
124    fn truncate(&self, max_size: usize) -> RUMString {
125        self.get_grapheme_window(0, max_size, 0)
126    }
127}
128
///
/// Uniform access to the underlying string slice of the string types in this module.
///
pub trait AsStr {
    /// Borrow the value as a `&str`.
    fn as_str(&self) -> &str;
}
132
133pub trait RUMStringConversions: ToString {
134    #[inline(always)]
135    fn to_rumstring(&self) -> RUMString {
136        RUMString::from(self.to_string())
137    }
138
139    #[inline(always)]
140    fn to_raw(&self) -> RUMVec<u8> {
141        self.to_string().as_bytes().to_vec()
142    }
143
144    #[inline(always)]
145    fn to_buffer(&self) -> RUMBuffer {
146        RUMBuffer::from(self.to_string())
147    }
148}
149
150pub trait StringUtils: AsStr + UTFStringExtensions {
151    #[inline(always)]
152    fn duplicate(&self, count: usize) -> RUMString {
153        let mut duplicated = RUMString::with_capacity(count);
154        for i in 0..count {
155            duplicated += &self.as_str();
156        }
157        duplicated
158    }
159
160    fn is_unique(&self) -> bool {
161        let graphemes = self.get_graphemes();
162        is_unique(&graphemes)
163    }
164}
165
166impl UTFStringExtensions for RUMString {
167    #[inline(always)]
168    fn count_graphemes(&self) -> usize {
169        self.graphemes(true).count()
170    }
171
172    #[inline(always)]
173    fn get_grapheme(&self, index: usize) -> &str {
174        self.graphemes(true)
175            .nth(index)
176            .or(EMPTY_STRING_OPTION)
177            .unwrap()
178    }
179
180    #[inline(always)]
181    fn get_graphemes(&self) -> Vec<&str> {
182        self.graphemes(true).collect::<Vec<&str>>()
183    }
184
185    #[inline(always)]
186    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
187        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
188    }
189}
190
// `RUMString` uses the default conversion methods of `RUMStringConversions` unchanged.
impl RUMStringConversions for RUMString {}
192impl AsStr for RUMString {
193    fn as_str(&self) -> &str {
194        self.as_str()
195    }
196}
// `RUMString` uses the default `StringUtils` methods (`duplicate`, `is_unique`) unchanged.
impl StringUtils for RUMString {}
198
199impl UTFStringExtensions for str {
200    #[inline(always)]
201    fn count_graphemes(&self) -> usize {
202        self.graphemes(true).count()
203    }
204
205    #[inline(always)]
206    fn get_grapheme(&self, index: usize) -> &str {
207        self.graphemes(true)
208            .nth(index)
209            .or(EMPTY_STRING_OPTION)
210            .unwrap()
211    }
212
213    #[inline(always)]
214    fn get_graphemes(&self) -> Vec<&str> {
215        self.graphemes(true).collect::<Vec<&str>>()
216    }
217
218    #[inline(always)]
219    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
220        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
221    }
222}
223
// `str` uses the default conversion methods of `RUMStringConversions` unchanged.
impl RUMStringConversions for str {}
225
impl AsStr for str {
    /// A `str` is already a string slice, so it is returned as-is.
    fn as_str(&self) -> &str {
        self
    }
}
231
// `str` uses the default `StringUtils` methods (`duplicate`, `is_unique`) unchanged.
impl StringUtils for str {}
233
// `char` uses the default conversion methods of `RUMStringConversions` unchanged.
impl RUMStringConversions for char {}
235
///
/// Conversion of byte containers into a [`RUMString`].
///
pub trait RUMArrayConversions {
    /// Build a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
239
240impl RUMArrayConversions for Vec<u8> {
241    fn to_rumstring(&self) -> RUMString {
242        self.as_slice().to_rumstring()
243    }
244}
245
246impl RUMArrayConversions for &[u8] {
247    fn to_rumstring(&self) -> RUMString {
248        RUMString::from_utf8(&self).unwrap()
249    }
250}
251
252/**************************** Helpers ***************************************/
253
254pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
255    let mut count: usize = 0;
256    for tok in vector.iter() {
257        if string_token != tok {
258            count += 1;
259        }
260    }
261    count
262}
263
264///
265/// Implements decoding this string from its auto-detected encoding to UTF-8.
266/// Failing that we assume the string was encoded in UTF-8 and return a copy.
267///
268/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
269///
270pub fn try_decode(src: &[u8]) -> RUMString {
271    let mut detector = EncodingDetector::new();
272    detector.feed(&src, true);
273    let encoding = detector.guess(None, true);
274    decode(src, encoding)
275}
276
277///
278/// Implements decoding this string from a specific encoding to UTF-8.
279///
280/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
281///
282pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
283    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
284        Some(v) => v,
285        None => return RUMString::from(""),
286    };
287    decode(src, encoding)
288}
289
290///
291/// Implements decoding of input with encoder.
292///
293/// Note => Decoding is facilitated via the crate encoding_rs.
294///
295fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
296    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
297        Some(res) => RUMString::from(res),
298        None => RUMString::from_utf8(src).unwrap(),
299    }
300}
301
302///
303/// This function will scan through an escaped string and unescape any escaped characters.
304/// We collect these characters as a byte vector.
305/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
306/// valid UTF-8 string.
307///
308/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
309///
310pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
311    let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
312    let str_size = graphemes.len();
313    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
314    let mut i = 0;
315    while i < str_size {
316        let seq_start = graphemes[i];
317        match seq_start {
318            "\\" => {
319                let escape_seq = get_grapheme_string(&graphemes, " ", i);
320                let mut c = match unescape(&escape_seq) {
321                    Ok(c) => c,
322                    Err(_why) => Vec::from(escape_seq.as_bytes()),
323                };
324                result.append(&mut c);
325                i += &escape_seq.count_graphemes();
326            }
327            _ => {
328                result.append(&mut Vec::from(seq_start.as_bytes()));
329                i += 1;
330            }
331        }
332    }
333    Ok(try_decode(result.as_slice()))
334}
335
336///
337/// Get the grapheme block and concatenate it into a newly allocated [`RUMString`].
338///
339pub fn get_grapheme_string<'a>(
340    graphemes: &Vec<&'a str>,
341    end_grapheme: &str,
342    start_index: usize,
343) -> RUMString {
344    get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
345}
346
///
/// Return the graphemes from `start_index` onwards, stopping before the first one that equals
/// `end_grapheme`.
///
/// Because a grapheme may span more than one code point, each one is represented as a string
/// slice rather than a `char`. A `start_index` beyond the end of `graphemes` yields an empty
/// vector.
///
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|candidate| *candidate != end_grapheme)
        .collect()
}
368
369///
370/// Turn escaped character sequence into the equivalent UTF-8 character
371/// This function accepts \o, \x and \u formats.
372/// This function will also attempt to unescape the common C style control characters.
373/// Anything else needs to be expressed as hex or octal patterns with the formats above.
374///
375/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
376/// We can then use the bytes and attempt a decode() to figure out the string encoding and
377/// get the correct conversion to UTF-8. **Fingers crossed**
378///
379pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
380    let lower_case = escaped_str.to_lowercase();
381    let mut bytes: Vec<u8> = Vec::with_capacity(3);
382    match &lower_case[0..2] {
383        // Hex notation case. Assume we are getting xxyy bytes
384        "\\x" => {
385            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
386            bytes.append(&mut byte_str.as_bytes().to_vec());
387        }
388        // Unicode notation case, we need to do an extra step or we will lose key bytes.
389        "\\u" => {
390            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
391            bytes.append(&mut byte_str.as_bytes().to_vec());
392        }
393        // Single byte notation case
394        "\\c" => {
395            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
396            bytes.append(&mut byte_str.as_bytes().to_vec());
397        }
398        // Unicode notation case
399        "\\o" => {
400            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
401            bytes.append(&mut byte_str.as_bytes().to_vec());
402        }
403        // Multibyte notation case
404        "\\m" => match lower_case.count_graphemes() {
405            8 => {
406                bytes.push(hex_to_byte(&lower_case[2..4])?);
407                bytes.push(hex_to_byte(&lower_case[4..6])?);
408                bytes.push(hex_to_byte(&lower_case[6..8])?);
409            }
410            6 => {
411                bytes.push(hex_to_byte(&lower_case[2..4])?);
412                bytes.push(hex_to_byte(&lower_case[4..6])?);
413            }
414            _ => {
415                return Err(rumtk_format!(
416                    "Unknown multibyte sequence. Cannot decode {}",
417                    lower_case
418                ))
419            }
420        },
421        // Custom encoding
422        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
423        // Single byte codes.
424        _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
425    }
426    Ok(bytes)
427}
428
429///
430/// Unescape basic character
431/// We use pattern matching to map the basic escape character to its corresponding integer value.
432///
433fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
434    match escaped_str {
435        // Common control sequences
436        "\\t" => Ok('\t'),
437        "\\b" => Ok('\x08'),
438        "\\n" => Ok('\n'),
439        "\\r" => Ok('\r'),
440        "\\f" => Ok('\x14'),
441        "\\s" => Ok('\x20'),
442        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
443        "\\'" => Ok('\''),
444        "\\\"" => Ok('"'),
445        "\\0" => Ok('\0'),
446        "\\v" => Ok('\x0B'),
447        "\\a" => Ok('\x07'),
448        // Control sequences by
449        _ => Err(rumtk_format!(
450            "Unknown escape sequence? Sequence: {}!",
451            escaped_str
452        )),
453    }
454}
455
456///
457/// Unescape basic character
458/// We use pattern matching to map the basic escape character to its corresponding integer value.
459///
460fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
461    match escaped_str {
462        // Common control sequences
463        "\\t" => Ok(9),   // Tab/Character Tabulation
464        "\\b" => Ok(8),   // Backspace
465        "\\n" => Ok(10),  // New line/ Line Feed character
466        "\\r" => Ok(13),  // Carriage Return character
467        "\\f" => Ok(12),  // Form Feed
468        "\\s" => Ok(32),  // Space
469        "\\\\" => Ok(27), // Escape
470        "\\'" => Ok(39),  // Single quote
471        "\\\"" => Ok(34), // Double quote
472        "\\0" => Ok(0),   // Null character
473        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
474        "\\a" => Ok(7),   // Alert bell
475        // Control sequences by hex
476        //Err(rumtk_format!("Unknown escape sequence? Sequence: {}!", escaped_str))
477        _ => hex_to_byte(escaped_str),
478    }
479}
480
481///
482/// Turn hex string to number (u32)
483///
484fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
485    match u32::from_str_radix(&hex_str, 16) {
486        Ok(result) => Ok(result),
487        Err(val) => Err(rumtk_format!(
488            "Failed to parse string with error {}! Input string {} \
489        is not hex string!",
490            val,
491            hex_str
492        )),
493    }
494}
495
496///
497/// Turn hex string to byte (u8)
498///
499fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
500    match u8::from_str_radix(&hex_str, 16) {
501        Ok(result) => Ok(result),
502        Err(val) => Err(rumtk_format!(
503            "Failed to parse string with error {}! Input string {} \
504        is not hex string!",
505            val,
506            hex_str
507        )),
508    }
509}
510
511///
512/// Turn octal string to number (u32)
513///
514fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
515    match u32::from_str_radix(&hoctal_str, 8) {
516        Ok(result) => Ok(result),
517        Err(val) => Err(rumtk_format!(
518            "Failed to parse string with error {}! Input string {} \
519        is not an octal string!",
520            val,
521            hoctal_str
522        )),
523    }
524}
525
526///
527/// Turn octal string to byte (u32)
528///
529fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
530    match u8::from_str_radix(&hoctal_str, 8) {
531        Ok(result) => Ok(result),
532        Err(val) => Err(rumtk_format!(
533            "Failed to parse string with error {}! Input string {} \
534        is not an octal string!",
535            val,
536            hoctal_str
537        )),
538    }
539}
540
541///
542/// Turn number to UTF-8 char
543///
544fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
545    match char::from_u32(*num) {
546        Some(result) => Ok(result.to_rumstring()),
547        None => Err(rumtk_format!(
548            "Failed to cast number to character! Number {}",
549            num
550        )),
551    }
552}
553
554///
555/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
556/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
557/// and deal with validity at a higher layer.
558///
559fn number_to_char_unchecked(num: &u32) -> RUMString {
560    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
561}
562
563///
564/// Turn UTF-8 character into escaped character sequence as expected in HL7
565///
566/// # Example
567/// ```
568///  use rumtk_core::strings::{escape};
569///  let message = "I ❤ my wife!";
570///  let escaped_message = escape(&message);
571///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
572///```
573///
574pub fn escape(unescaped_str: &str) -> RUMString {
575    basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
576}
577
578///
579/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
580///
581/// # Example
582/// ```
583///  use rumtk_core::strings::basic_escape;
584///  let message = "I ❤ my wife!";
585///  let escaped_message = basic_escape(&message, &vec![]);
586///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
587///```
588pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
589    let escaped = is_escaped_str(unescaped_str);
590    if !escaped {
591        let mut escaped_str = unescaped_str.escape_default().to_string();
592        for (from, to) in except {
593            escaped_str = escaped_str.replace(from, to);
594        }
595        return escaped_str.to_rumstring();
596    }
597    unescaped_str.to_rumstring()
598}
599
///
/// Reports whether every byte of the string is within the ASCII range.
///
/// Remember: all strings are UTF-8 encoded in Rust, and ASCII text is valid UTF-8 as-is.
/// The empty string counts as ASCII.
///
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
609
610///
611/// Checks if an input string is already escaped.
612/// The idea is to avoid escaping the escaped string thus making it a nightmare to undo the
613/// escaping later on.
614///
615/// Basically, if you were to blindly escape the input string, back slashes keep getting escaped.
616/// For example `\r -> \\r -> \\\\r -> ...`.
617///
618pub fn is_escaped_str(unescaped_str: &str) -> bool {
619    if !is_ascii_str(unescaped_str) {
620        return false;
621    }
622
623    for c in unescaped_str.chars() {
624        if !is_printable_char(&c) {
625            return false;
626        }
627    }
628    true
629}
630
631///
632/// Returns whether a character is in the ASCII printable range.
633///
634pub fn is_printable_char(c: &char) -> bool {
635    &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
636}
637
638///
639/// Removes all non ASCII and all non printable characters from string.
640///
641pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
642    let mut filtered = unescaped_str.to_rumstring();
643    filtered.retain(closure);
644    filtered
645}
646
647///
648/// Removes all non ASCII and all non printable characters from string.
649///
650pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
651    filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
652}