rumtk_core/
strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D.
5 * Copyright (C) 2025  MedicalMasses L.L.C.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 */
21use crate::core::{is_unique, RUMResult};
22use chardetng::EncodingDetector;
23pub use compact_str::{
24    format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
25};
26use encoding_rs::Encoding;
27use unicode_segmentation::UnicodeSegmentation;
28/**************************** Constants**************************************/
/// Initial capacity reserved for the buffer built by
/// `UTFStringExtensions::get_grapheme_string`.
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash character; returned by `unescape_control` for an escaped backslash.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Inclusive lower bound of the range accepted by `is_printable_char`.
const MIN_ASCII_READABLE: char = ' ';
/// Inclusive upper bound of the range accepted by `is_printable_char`.
const MAX_ASCII_READABLE: char = '~';
/// The empty string slice.
pub const EMPTY_STRING: &str = "";
/// A single dot as a string slice.
pub const DOT_STR: &str = ".";
/// `Some("")`; used as the fallback when a requested grapheme does not exist.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// Every character from space (`' '`) through tilde (`'~'`), in ascending order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
37
38/**************************** Types *****************************************/
/// String type used throughout this module; an alias for [`CompactString`].
pub type RUMString = CompactString;
40
41/**************************** Traits ****************************************/
42
43///
44/// Implemented indexing trait for String and str which uses the UnicodeSegmentation facilities to
45/// enable grapheme iteration by default. There could be some performance penalty, but it will allow
46/// for native Unicode support to the best extent possible.
47///
48/// We also enable decoding from Encoding Standard encodings to UTF-8.
49///
50pub trait UTFStringExtensions {
51    fn count_graphemes(&self) -> usize;
52
53    ///
54    /// Return a grapheme unit which could span multiple Unicode codepoints or "characters".
55    ///
56    /// # Note
57    /// ```text
58    ///     If the grapheme requested does not exists, this method will return a blank string.
59    /// ```
60    ///
61    /// Instead of just retrieving a codepoint as character, I decided to take it a step further and
62    /// have support for grapheme selection such that characters in written language like sanskrit
63    /// can be properly selected and evaluated.
64    ///
65    /// [!CAUTION]
66    /// This can be an extremely slow operation over large strings since each call to this method
67    /// will need to rescan the input string every time we need to look up a grapheme. Unfortunately,
68    /// this is a side effect of convenience. To improve performance, call .get_graphemes() once and
69    /// then call take_grapheme() over that iterator.
70    ///
71    fn get_grapheme(&self, index: usize) -> &str;
72
73    fn get_graphemes(&self) -> Vec<&str>;
74
75    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
76
77    #[inline(always)]
78    fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
79        if index >= graphemes.len() {
80            return RUMString::from(EMPTY_STRING);
81        }
82        RUMString::from(graphemes[index])
83    }
84
85    #[inline(always)]
86    fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
87        let mut window: RUMString = RUMString::with_capacity(max - min);
88        let start = min + offset;
89        let end = max + offset;
90        let graphemes = self.get_graphemes();
91        for i in start..end {
92            window += &self.take_grapheme(&graphemes, i);
93        }
94        window
95    }
96
97    #[inline(always)]
98    fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
99        let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
100        for grapheme in self.get_grapheme_chunk(offset) {
101            if grapheme == end_pattern {
102                return RUMString::from(window);
103            } else {
104                window += grapheme;
105            }
106        }
107        RUMString::from(window)
108    }
109
110    #[inline(always)]
111    fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
112        for grapheme in self.get_grapheme_chunk(offset) {
113            if grapheme == pattern {
114                return grapheme;
115            }
116        }
117        EMPTY_STRING
118    }
119
120    #[inline(always)]
121    fn truncate(&self, max_size: usize) -> RUMString {
122        self.get_grapheme_window(0, max_size, 0)
123    }
124}
125
///
/// Types that can expose their contents as a borrowed `&str`.
///
pub trait AsStr {
    /// Borrow the value as a string slice.
    fn as_str(&self) -> &str;
}
129
130pub trait RUMStringConversions: ToString {
131    fn to_rumstring(&self) -> RUMString {
132        RUMString::from(self.to_string())
133    }
134
135    fn to_raw(&self) -> Vec<u8> {
136        self.to_string().as_bytes().to_vec()
137    }
138}
139
140pub trait StringUtils: AsStr + UTFStringExtensions {
141    #[inline(always)]
142    fn duplicate(&self, count: usize) -> RUMString {
143        let mut duplicated = RUMString::with_capacity(count);
144        for i in 0..count {
145            duplicated += &self.as_str();
146        }
147        duplicated
148    }
149
150    fn is_unique(&self) -> bool {
151        let graphemes = self.get_graphemes();
152        is_unique(&graphemes)
153    }
154}
155
156impl UTFStringExtensions for RUMString {
157    #[inline(always)]
158    fn count_graphemes(&self) -> usize {
159        self.graphemes(true).count()
160    }
161
162    #[inline(always)]
163    fn get_grapheme(&self, index: usize) -> &str {
164        self.graphemes(true)
165            .nth(index)
166            .or(EMPTY_STRING_OPTION)
167            .unwrap()
168    }
169
170    #[inline(always)]
171    fn get_graphemes(&self) -> Vec<&str> {
172        self.graphemes(true).collect::<Vec<&str>>()
173    }
174
175    #[inline(always)]
176    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
177        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
178    }
179}
180
// `RUMString` uses the default method bodies provided by `RUMStringConversions`.
impl RUMStringConversions for RUMString {}
// Expose the contents of a `RUMString` as `&str` through the `AsStr` trait.
impl AsStr for RUMString {
    fn as_str(&self) -> &str {
        // NOTE(review): presumably this resolves to the string type's own inherent `as_str`
        // (inherent methods take precedence over trait methods) rather than recursing into
        // this trait method -- confirm against the compact_str API.
        self.as_str()
    }
}
// `RUMString` uses the default method bodies provided by `StringUtils`.
impl StringUtils for RUMString {}
188
189impl UTFStringExtensions for str {
190    #[inline(always)]
191    fn count_graphemes(&self) -> usize {
192        self.graphemes(true).count()
193    }
194
195    #[inline(always)]
196    fn get_grapheme(&self, index: usize) -> &str {
197        self.graphemes(true)
198            .nth(index)
199            .or(EMPTY_STRING_OPTION)
200            .unwrap()
201    }
202
203    #[inline(always)]
204    fn get_graphemes(&self) -> Vec<&str> {
205        self.graphemes(true).collect::<Vec<&str>>()
206    }
207
208    #[inline(always)]
209    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
210        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
211    }
212}
213
// `str` uses the default method bodies provided by `RUMStringConversions`.
impl RUMStringConversions for str {}

// A string slice is already a `&str`, so it is returned unchanged.
impl AsStr for str {
    fn as_str(&self) -> &str {
        self
    }
}

// `str` uses the default method bodies provided by `StringUtils`.
impl StringUtils for str {}

// `char` uses the default method bodies provided by `RUMStringConversions`.
impl RUMStringConversions for char {}
225
///
/// Conversion of byte containers into a [`RUMString`].
///
pub trait RUMArrayConversions {
    /// Build a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}

/// Delegates to the `&[u8]` implementation by viewing the vector as a slice.
impl RUMArrayConversions for Vec<u8> {
    fn to_rumstring(&self) -> RUMString {
        self.as_slice().to_rumstring()
    }
}

/// Interprets the bytes as UTF-8.
///
/// # Panics
/// Panics when the bytes are not valid UTF-8, because the conversion result is unwrapped.
impl RUMArrayConversions for &[u8] {
    fn to_rumstring(&self) -> RUMString {
        RUMString::from_utf8(&self).unwrap()
    }
}
241
242/**************************** Helpers ***************************************/
243
244pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
245    let mut count: usize = 0;
246    for tok in vector.iter() {
247        if string_token != tok {
248            count += 1;
249        }
250    }
251    count
252}
253
254///
255/// Implements decoding this string from its auto-detected encoding to UTF-8.
256/// Failing that we assume the string was encoded in UTF-8 and return a copy.
257///
258/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
259///
260pub fn try_decode(src: &[u8]) -> RUMString {
261    let mut detector = EncodingDetector::new();
262    detector.feed(&src, true);
263    let encoding = detector.guess(None, true);
264    decode(src, encoding)
265}
266
267///
268/// Implements decoding this string from a specific encoding to UTF-8.
269///
270/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
271///
272pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
273    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
274        Some(v) => v,
275        None => return RUMString::from(""),
276    };
277    decode(src, encoding)
278}
279
280///
281/// Implements decoding of input with encoder.
282///
283/// Note => Decoding is facilitated via the crate encoding_rs.
284///
285fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
286    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
287        Some(res) => RUMString::from(res),
288        None => RUMString::from_utf8(src).unwrap(),
289    }
290}
291
292///
293/// This function will scan through an escaped string and unescape any escaped characters.
294/// We collect these characters as a byte vector.
295/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
296/// valid UTF-8 string.
297///
298/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
299///
300pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
301    let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
302    let str_size = graphemes.len();
303    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
304    let mut i = 0;
305    while i < str_size {
306        let seq_start = graphemes[i];
307        match seq_start {
308            "\\" => {
309                let escape_seq = get_grapheme_string(&graphemes, " ", i);
310                let mut c = match unescape(&escape_seq) {
311                    Ok(c) => c,
312                    Err(_why) => Vec::from(escape_seq.as_bytes()),
313                };
314                result.append(&mut c);
315                i += &escape_seq.count_graphemes();
316            }
317            _ => {
318                result.append(&mut Vec::from(seq_start.as_bytes()));
319                i += 1;
320            }
321        }
322    }
323    Ok(try_decode(result.as_slice()))
324}
325
326///
327/// Get the grapheme block and concatenate it into a newly allocated [`RUMString`].
328///
329pub fn get_grapheme_string<'a>(
330    graphemes: &Vec<&'a str>,
331    end_grapheme: &str,
332    start_index: usize,
333) -> RUMString {
334    get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
335}
336
///
/// Collect the graphemes from `start_index` onwards, stopping before the first grapheme equal
/// to `end_grapheme` (or at the end of the list when it never appears).
///
/// A grapheme may span more than one codepoint, so each one is kept as a string slice.
///
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
358
359///
360/// Turn escaped character sequence into the equivalent UTF-8 character
361/// This function accepts \o, \x and \u formats.
362/// This function will also attempt to unescape the common C style control characters.
363/// Anything else needs to be expressed as hex or octal patterns with the formats above.
364///
365/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
366/// We can then use the bytes and attempt a decode() to figure out the string encoding and
367/// get the correct conversion to UTF-8. **Fingers crossed**
368///
369pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
370    let lower_case = escaped_str.to_lowercase();
371    let mut bytes: Vec<u8> = Vec::with_capacity(3);
372    match &lower_case[0..2] {
373        // Hex notation case. Assume we are getting xxyy bytes
374        "\\x" => {
375            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
376            bytes.append(&mut byte_str.as_bytes().to_vec());
377        }
378        // Unicode notation case, we need to do an extra step or we will lose key bytes.
379        "\\u" => {
380            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
381            bytes.append(&mut byte_str.as_bytes().to_vec());
382        }
383        // Single byte notation case
384        "\\c" => {
385            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
386            bytes.append(&mut byte_str.as_bytes().to_vec());
387        }
388        // Unicode notation case
389        "\\o" => {
390            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
391            bytes.append(&mut byte_str.as_bytes().to_vec());
392        }
393        // Multibyte notation case
394        "\\m" => match lower_case.count_graphemes() {
395            8 => {
396                bytes.push(hex_to_byte(&lower_case[2..4])?);
397                bytes.push(hex_to_byte(&lower_case[4..6])?);
398                bytes.push(hex_to_byte(&lower_case[6..8])?);
399            }
400            6 => {
401                bytes.push(hex_to_byte(&lower_case[2..4])?);
402                bytes.push(hex_to_byte(&lower_case[4..6])?);
403            }
404            _ => {
405                return Err(rumtk_format!(
406                    "Unknown multibyte sequence. Cannot decode {}",
407                    lower_case
408                ))
409            }
410        },
411        // Custom encoding
412        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
413        // Single byte codes.
414        _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
415    }
416    Ok(bytes)
417}
418
419///
420/// Unescape basic character
421/// We use pattern matching to map the basic escape character to its corresponding integer value.
422///
423fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
424    match escaped_str {
425        // Common control sequences
426        "\\t" => Ok('\t'),
427        "\\b" => Ok('\x08'),
428        "\\n" => Ok('\n'),
429        "\\r" => Ok('\r'),
430        "\\f" => Ok('\x14'),
431        "\\s" => Ok('\x20'),
432        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
433        "\\'" => Ok('\''),
434        "\\\"" => Ok('"'),
435        "\\0" => Ok('\0'),
436        "\\v" => Ok('\x0B'),
437        "\\a" => Ok('\x07'),
438        // Control sequences by
439        _ => Err(rumtk_format!(
440            "Unknown escape sequence? Sequence: {}!",
441            escaped_str
442        )),
443    }
444}
445
446///
447/// Unescape basic character
448/// We use pattern matching to map the basic escape character to its corresponding integer value.
449///
450fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
451    match escaped_str {
452        // Common control sequences
453        "\\t" => Ok(9),   // Tab/Character Tabulation
454        "\\b" => Ok(8),   // Backspace
455        "\\n" => Ok(10),  // New line/ Line Feed character
456        "\\r" => Ok(13),  // Carriage Return character
457        "\\f" => Ok(12),  // Form Feed
458        "\\s" => Ok(32),  // Space
459        "\\\\" => Ok(27), // Escape
460        "\\'" => Ok(39),  // Single quote
461        "\\\"" => Ok(34), // Double quote
462        "\\0" => Ok(0),   // Null character
463        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
464        "\\a" => Ok(7),   // Alert bell
465        // Control sequences by hex
466        //Err(rumtk_format!("Unknown escape sequence? Sequence: {}!", escaped_str))
467        _ => hex_to_byte(escaped_str),
468    }
469}
470
471///
472/// Turn hex string to number (u32)
473///
474fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
475    match u32::from_str_radix(&hex_str, 16) {
476        Ok(result) => Ok(result),
477        Err(val) => Err(rumtk_format!(
478            "Failed to parse string with error {}! Input string {} \
479        is not hex string!",
480            val,
481            hex_str
482        )),
483    }
484}
485
486///
487/// Turn hex string to byte (u8)
488///
489fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
490    match u8::from_str_radix(&hex_str, 16) {
491        Ok(result) => Ok(result),
492        Err(val) => Err(rumtk_format!(
493            "Failed to parse string with error {}! Input string {} \
494        is not hex string!",
495            val,
496            hex_str
497        )),
498    }
499}
500
501///
502/// Turn octal string to number (u32)
503///
504fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
505    match u32::from_str_radix(&hoctal_str, 8) {
506        Ok(result) => Ok(result),
507        Err(val) => Err(rumtk_format!(
508            "Failed to parse string with error {}! Input string {} \
509        is not an octal string!",
510            val,
511            hoctal_str
512        )),
513    }
514}
515
516///
517/// Turn octal string to byte (u32)
518///
519fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
520    match u8::from_str_radix(&hoctal_str, 8) {
521        Ok(result) => Ok(result),
522        Err(val) => Err(rumtk_format!(
523            "Failed to parse string with error {}! Input string {} \
524        is not an octal string!",
525            val,
526            hoctal_str
527        )),
528    }
529}
530
531///
532/// Turn number to UTF-8 char
533///
534fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
535    match char::from_u32(*num) {
536        Some(result) => Ok(result.to_rumstring()),
537        None => Err(rumtk_format!(
538            "Failed to cast number to character! Number {}",
539            num
540        )),
541    }
542}
543
544///
545/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
546/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
547/// and deal with validity at a higher layer.
548///
549fn number_to_char_unchecked(num: &u32) -> RUMString {
550    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
551}
552
553///
554/// Turn UTF-8 character into escaped character sequence as expected in HL7
555///
556/// # Example
557/// ```
558///  use rumtk_core::strings::{escape};
559///  let message = "I ❤ my wife!";
560///  let escaped_message = escape(&message);
561///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
562///```
563///
564pub fn escape(unescaped_str: &str) -> RUMString {
565    basic_escape(unescaped_str)
566        .replace("{", "")
567        .replace("}", "")
568        .to_rumstring()
569}
570
571///
572/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
573///
574/// # Example
575/// ```
576///  use rumtk_core::strings::basic_escape;
577///  let message = "I ❤ my wife!";
578///  let escaped_message = basic_escape(&message);
579///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
580///```
581pub fn basic_escape(unescaped_str: &str) -> RUMString {
582    let escaped = is_escaped_str(unescaped_str);
583    if !escaped {
584        return unescaped_str.escape_default().to_compact_string();
585    }
586    unescaped_str.to_rumstring()
587}
588
///
/// Reports whether every byte of the string is within the ASCII range.
///
/// Remember: all strings are UTF-8 encoded in Rust, and ASCII text is valid UTF-8 as-is.
///
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
598
599///
600/// Checks if an input string is already escaped.
601/// The idea is to avoid escaping the escaped string thus making it a nightmare to undo the
602/// escaping later on.
603///
604/// Basically, if you were to blindly escape the input string, back slashes keep getting escaped.
605/// For example `\r -> \\r -> \\\\r -> ...`.
606///
607pub fn is_escaped_str(unescaped_str: &str) -> bool {
608    if !is_ascii_str(unescaped_str) {
609        return false;
610    }
611
612    for c in unescaped_str.chars() {
613        if !is_printable_char(&c) {
614            return false;
615        }
616    }
617    true
618}
619
620///
621/// Returns whether a character is in the ASCII printable range.
622///
623pub fn is_printable_char(c: &char) -> bool {
624    &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
625}
626
627///
628/// Removes all non ASCII and all non printable characters from string.
629///
630pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
631    let mut filtered = unescaped_str.to_rumstring();
632    filtered.retain(closure);
633    filtered
634}
635
636///
637/// Removes all non ASCII and all non printable characters from string.
638///
639pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
640    filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
641}
rumtk_core/strings.rs

rumtk_core/
strings.rs