// rumtk_core/strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D.
5 * Copyright (C) 2025  MedicalMasses L.L.C.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
20 */
21use crate::core::{is_unique, RUMResult};
22use chardetng::EncodingDetector;
23pub use compact_str::{
24    format_compact as rumtk_format, CompactString, CompactStringExt, ToCompactString,
25};
26use encoding_rs::Encoding;
27use unicode_segmentation::UnicodeSegmentation;
/**************************** Constants**************************************/
/// Initial capacity reserved when collecting an escape sequence grapheme by grapheme.
const ESCAPED_STRING_WINDOW: usize = 6;
/// The backslash that introduces an escape sequence.
const ASCII_ESCAPE_CHAR: char = '\\';
/// Lowest character treated as printable ASCII (space).
const MIN_ASCII_READABLE: char = ' ';
/// Highest character treated as printable ASCII (tilde).
const MAX_ASCII_READABLE: char = '~';
/// Shared empty string slice.
pub const EMPTY_STRING: &str = "";
/// A single dot.
pub const DOT_STR: &str = ".";
/// `Some("")`, used as the fallback when a grapheme lookup finds nothing.
pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
/// The printable ASCII characters from space through tilde, in code point order.
pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
37
/**************************** Types *****************************************/
/// String type used throughout this module; an alias for [`CompactString`].
pub type RUMString = CompactString;
/// A `(from, to)` replacement pair applied to escaped output by [`basic_escape`].
pub type EscapeException<'a> = (&'a str, &'a str);
/// A borrowed list of [`EscapeException`] pairs.
pub type EscapeExceptions<'a> = &'a [EscapeException<'a>];
42
43/**************************** Traits ****************************************/
44
45///
46/// Implemented indexing trait for String and str which uses the UnicodeSegmentation facilities to
47/// enable grapheme iteration by default. There could be some performance penalty, but it will allow
48/// for native Unicode support to the best extent possible.
49///
50/// We also enable decoding from Encoding Standard encodings to UTF-8.
51///
52pub trait UTFStringExtensions {
53    fn count_graphemes(&self) -> usize;
54
55    ///
56    /// Return a grapheme unit which could span multiple Unicode codepoints or "characters".
57    ///
58    /// # Note
59    /// ```text
60    ///     If the grapheme requested does not exists, this method will return a blank string.
61    /// ```
62    ///
63    /// Instead of just retrieving a codepoint as character, I decided to take it a step further and
64    /// have support for grapheme selection such that characters in written language like sanskrit
65    /// can be properly selected and evaluated.
66    ///
67    /// [!CAUTION]
68    /// This can be an extremely slow operation over large strings since each call to this method
69    /// will need to rescan the input string every time we need to look up a grapheme. Unfortunately,
70    /// this is a side effect of convenience. To improve performance, call .get_graphemes() once and
71    /// then call take_grapheme() over that iterator.
72    ///
73    fn get_grapheme(&self, index: usize) -> &str;
74
75    fn get_graphemes(&self) -> Vec<&str>;
76
77    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
78
79    #[inline(always)]
80    fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
81        if index >= graphemes.len() {
82            return RUMString::from(EMPTY_STRING);
83        }
84        RUMString::from(graphemes[index])
85    }
86
87    #[inline(always)]
88    fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
89        let mut window: RUMString = RUMString::with_capacity(max - min);
90        let start = min + offset;
91        let end = max + offset;
92        let graphemes = self.get_graphemes();
93        for i in start..end {
94            window += &self.take_grapheme(&graphemes, i);
95        }
96        window
97    }
98
99    #[inline(always)]
100    fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
101        let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
102        for grapheme in self.get_grapheme_chunk(offset) {
103            if grapheme == end_pattern {
104                return RUMString::from(window);
105            } else {
106                window += grapheme;
107            }
108        }
109        RUMString::from(window)
110    }
111
112    #[inline(always)]
113    fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
114        for grapheme in self.get_grapheme_chunk(offset) {
115            if grapheme == pattern {
116                return grapheme;
117            }
118        }
119        EMPTY_STRING
120    }
121
122    #[inline(always)]
123    fn truncate(&self, max_size: usize) -> RUMString {
124        self.get_grapheme_window(0, max_size, 0)
125    }
126}
127
///
/// Uniform access to the underlying string slice, so generic helpers can work with both owned
/// and borrowed string types.
///
pub trait AsStr {
    /// Borrow the contents as a `&str`.
    fn as_str(&self) -> &str;
}
131
132pub trait RUMStringConversions: ToString {
133    fn to_rumstring(&self) -> RUMString {
134        RUMString::from(self.to_string())
135    }
136
137    fn to_raw(&self) -> Vec<u8> {
138        self.to_string().as_bytes().to_vec()
139    }
140}
141
142pub trait StringUtils: AsStr + UTFStringExtensions {
143    #[inline(always)]
144    fn duplicate(&self, count: usize) -> RUMString {
145        let mut duplicated = RUMString::with_capacity(count);
146        for i in 0..count {
147            duplicated += &self.as_str();
148        }
149        duplicated
150    }
151
152    fn is_unique(&self) -> bool {
153        let graphemes = self.get_graphemes();
154        is_unique(&graphemes)
155    }
156}
157
158impl UTFStringExtensions for RUMString {
159    #[inline(always)]
160    fn count_graphemes(&self) -> usize {
161        self.graphemes(true).count()
162    }
163
164    #[inline(always)]
165    fn get_grapheme(&self, index: usize) -> &str {
166        self.graphemes(true)
167            .nth(index)
168            .or(EMPTY_STRING_OPTION)
169            .unwrap()
170    }
171
172    #[inline(always)]
173    fn get_graphemes(&self) -> Vec<&str> {
174        self.graphemes(true).collect::<Vec<&str>>()
175    }
176
177    #[inline(always)]
178    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
179        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
180    }
181}
182
183impl RUMStringConversions for RUMString {}
184impl AsStr for RUMString {
185    fn as_str(&self) -> &str {
186        self.as_str()
187    }
188}
189impl StringUtils for RUMString {}
190
191impl UTFStringExtensions for str {
192    #[inline(always)]
193    fn count_graphemes(&self) -> usize {
194        self.graphemes(true).count()
195    }
196
197    #[inline(always)]
198    fn get_grapheme(&self, index: usize) -> &str {
199        self.graphemes(true)
200            .nth(index)
201            .or(EMPTY_STRING_OPTION)
202            .unwrap()
203    }
204
205    #[inline(always)]
206    fn get_graphemes(&self) -> Vec<&str> {
207        self.graphemes(true).collect::<Vec<&str>>()
208    }
209
210    #[inline(always)]
211    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
212        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
213    }
214}
215
216impl RUMStringConversions for str {}
217
218impl AsStr for str {
219    fn as_str(&self) -> &str {
220        self
221    }
222}
223
224impl StringUtils for str {}
225
226impl RUMStringConversions for char {}
227
228pub trait RUMArrayConversions {
229    fn to_rumstring(&self) -> RUMString;
230}
231
232impl RUMArrayConversions for Vec<u8> {
233    fn to_rumstring(&self) -> RUMString {
234        self.as_slice().to_rumstring()
235    }
236}
237
238impl RUMArrayConversions for &[u8] {
239    fn to_rumstring(&self) -> RUMString {
240        RUMString::from_utf8(&self).unwrap()
241    }
242}
243
244/**************************** Helpers ***************************************/
245
246pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
247    let mut count: usize = 0;
248    for tok in vector.iter() {
249        if string_token != tok {
250            count += 1;
251        }
252    }
253    count
254}
255
256///
257/// Implements decoding this string from its auto-detected encoding to UTF-8.
258/// Failing that we assume the string was encoded in UTF-8 and return a copy.
259///
260/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
261///
262pub fn try_decode(src: &[u8]) -> RUMString {
263    let mut detector = EncodingDetector::new();
264    detector.feed(&src, true);
265    let encoding = detector.guess(None, true);
266    decode(src, encoding)
267}
268
269///
270/// Implements decoding this string from a specific encoding to UTF-8.
271///
272/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
273///
274pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
275    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
276        Some(v) => v,
277        None => return RUMString::from(""),
278    };
279    decode(src, encoding)
280}
281
282///
283/// Implements decoding of input with encoder.
284///
285/// Note => Decoding is facilitated via the crate encoding_rs.
286///
287fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
288    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
289        Some(res) => RUMString::from(res),
290        None => RUMString::from_utf8(src).unwrap(),
291    }
292}
293
294///
295/// This function will scan through an escaped string and unescape any escaped characters.
296/// We collect these characters as a byte vector.
297/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
298/// valid UTF-8 string.
299///
300/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
301///
302pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
303    let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
304    let str_size = graphemes.len();
305    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
306    let mut i = 0;
307    while i < str_size {
308        let seq_start = graphemes[i];
309        match seq_start {
310            "\\" => {
311                let escape_seq = get_grapheme_string(&graphemes, " ", i);
312                let mut c = match unescape(&escape_seq) {
313                    Ok(c) => c,
314                    Err(_why) => Vec::from(escape_seq.as_bytes()),
315                };
316                result.append(&mut c);
317                i += &escape_seq.count_graphemes();
318            }
319            _ => {
320                result.append(&mut Vec::from(seq_start.as_bytes()));
321                i += 1;
322            }
323        }
324    }
325    Ok(try_decode(result.as_slice()))
326}
327
328///
329/// Get the grapheme block and concatenate it into a newly allocated [`RUMString`].
330///
331pub fn get_grapheme_string<'a>(
332    graphemes: &Vec<&'a str>,
333    end_grapheme: &str,
334    start_index: usize,
335) -> RUMString {
336    get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
337}
338
///
/// Return the graphemes from `start_index` onwards, stopping before the first one equal to
/// `end_grapheme`.
///
/// A grapheme may span more than one codepoint, so each one is handled as a string slice rather
/// than a `char`. A `start_index` past the end yields an empty vector.
///
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|grapheme| *grapheme != end_grapheme)
        .collect()
}
360
361///
362/// Turn escaped character sequence into the equivalent UTF-8 character
363/// This function accepts \o, \x and \u formats.
364/// This function will also attempt to unescape the common C style control characters.
365/// Anything else needs to be expressed as hex or octal patterns with the formats above.
366///
367/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
368/// We can then use the bytes and attempt a decode() to figure out the string encoding and
369/// get the correct conversion to UTF-8. **Fingers crossed**
370///
371pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
372    let lower_case = escaped_str.to_lowercase();
373    let mut bytes: Vec<u8> = Vec::with_capacity(3);
374    match &lower_case[0..2] {
375        // Hex notation case. Assume we are getting xxyy bytes
376        "\\x" => {
377            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
378            bytes.append(&mut byte_str.as_bytes().to_vec());
379        }
380        // Unicode notation case, we need to do an extra step or we will lose key bytes.
381        "\\u" => {
382            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
383            bytes.append(&mut byte_str.as_bytes().to_vec());
384        }
385        // Single byte notation case
386        "\\c" => {
387            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
388            bytes.append(&mut byte_str.as_bytes().to_vec());
389        }
390        // Unicode notation case
391        "\\o" => {
392            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
393            bytes.append(&mut byte_str.as_bytes().to_vec());
394        }
395        // Multibyte notation case
396        "\\m" => match lower_case.count_graphemes() {
397            8 => {
398                bytes.push(hex_to_byte(&lower_case[2..4])?);
399                bytes.push(hex_to_byte(&lower_case[4..6])?);
400                bytes.push(hex_to_byte(&lower_case[6..8])?);
401            }
402            6 => {
403                bytes.push(hex_to_byte(&lower_case[2..4])?);
404                bytes.push(hex_to_byte(&lower_case[4..6])?);
405            }
406            _ => {
407                return Err(rumtk_format!(
408                    "Unknown multibyte sequence. Cannot decode {}",
409                    lower_case
410                ))
411            }
412        },
413        // Custom encoding
414        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
415        // Single byte codes.
416        _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
417    }
418    Ok(bytes)
419}
420
421///
422/// Unescape basic character
423/// We use pattern matching to map the basic escape character to its corresponding integer value.
424///
425fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
426    match escaped_str {
427        // Common control sequences
428        "\\t" => Ok('\t'),
429        "\\b" => Ok('\x08'),
430        "\\n" => Ok('\n'),
431        "\\r" => Ok('\r'),
432        "\\f" => Ok('\x14'),
433        "\\s" => Ok('\x20'),
434        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
435        "\\'" => Ok('\''),
436        "\\\"" => Ok('"'),
437        "\\0" => Ok('\0'),
438        "\\v" => Ok('\x0B'),
439        "\\a" => Ok('\x07'),
440        // Control sequences by
441        _ => Err(rumtk_format!(
442            "Unknown escape sequence? Sequence: {}!",
443            escaped_str
444        )),
445    }
446}
447
448///
449/// Unescape basic character
450/// We use pattern matching to map the basic escape character to its corresponding integer value.
451///
452fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
453    match escaped_str {
454        // Common control sequences
455        "\\t" => Ok(9),   // Tab/Character Tabulation
456        "\\b" => Ok(8),   // Backspace
457        "\\n" => Ok(10),  // New line/ Line Feed character
458        "\\r" => Ok(13),  // Carriage Return character
459        "\\f" => Ok(12),  // Form Feed
460        "\\s" => Ok(32),  // Space
461        "\\\\" => Ok(27), // Escape
462        "\\'" => Ok(39),  // Single quote
463        "\\\"" => Ok(34), // Double quote
464        "\\0" => Ok(0),   // Null character
465        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
466        "\\a" => Ok(7),   // Alert bell
467        // Control sequences by hex
468        //Err(rumtk_format!("Unknown escape sequence? Sequence: {}!", escaped_str))
469        _ => hex_to_byte(escaped_str),
470    }
471}
472
473///
474/// Turn hex string to number (u32)
475///
476fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
477    match u32::from_str_radix(&hex_str, 16) {
478        Ok(result) => Ok(result),
479        Err(val) => Err(rumtk_format!(
480            "Failed to parse string with error {}! Input string {} \
481        is not hex string!",
482            val,
483            hex_str
484        )),
485    }
486}
487
488///
489/// Turn hex string to byte (u8)
490///
491fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
492    match u8::from_str_radix(&hex_str, 16) {
493        Ok(result) => Ok(result),
494        Err(val) => Err(rumtk_format!(
495            "Failed to parse string with error {}! Input string {} \
496        is not hex string!",
497            val,
498            hex_str
499        )),
500    }
501}
502
503///
504/// Turn octal string to number (u32)
505///
506fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
507    match u32::from_str_radix(&hoctal_str, 8) {
508        Ok(result) => Ok(result),
509        Err(val) => Err(rumtk_format!(
510            "Failed to parse string with error {}! Input string {} \
511        is not an octal string!",
512            val,
513            hoctal_str
514        )),
515    }
516}
517
518///
519/// Turn octal string to byte (u32)
520///
521fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
522    match u8::from_str_radix(&hoctal_str, 8) {
523        Ok(result) => Ok(result),
524        Err(val) => Err(rumtk_format!(
525            "Failed to parse string with error {}! Input string {} \
526        is not an octal string!",
527            val,
528            hoctal_str
529        )),
530    }
531}
532
533///
534/// Turn number to UTF-8 char
535///
536fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
537    match char::from_u32(*num) {
538        Some(result) => Ok(result.to_rumstring()),
539        None => Err(rumtk_format!(
540            "Failed to cast number to character! Number {}",
541            num
542        )),
543    }
544}
545
546///
547/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
548/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
549/// and deal with validity at a higher layer.
550///
551fn number_to_char_unchecked(num: &u32) -> RUMString {
552    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
553}
554
555///
556/// Turn UTF-8 character into escaped character sequence as expected in HL7
557///
558/// # Example
559/// ```
560///  use rumtk_core::strings::{escape};
561///  let message = "I ❤ my wife!";
562///  let escaped_message = escape(&message);
563///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
564///```
565///
566pub fn escape(unescaped_str: &str) -> RUMString {
567    basic_escape(unescaped_str, &vec![("{", ""), ("}", "")])
568}
569
570///
571/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
572///
573/// # Example
574/// ```
575///  use rumtk_core::strings::basic_escape;
576///  let message = "I ❤ my wife!";
577///  let escaped_message = basic_escape(&message, &vec![]);
578///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
579///```
580pub fn basic_escape(unescaped_str: &str, except: EscapeExceptions) -> RUMString {
581    let escaped = is_escaped_str(unescaped_str);
582    if !escaped {
583        let mut escaped_str = unescaped_str.escape_default().to_string();
584        for (from, to) in except {
585            escaped_str = escaped_str.replace(from, to);
586        }
587        return escaped_str.to_rumstring();
588    }
589    unescaped_str.to_rumstring()
590}
591
///
/// Reports whether every byte of the string is within the ASCII range.
///
/// Remember: all strings are UTF-8 encoded in Rust, and ASCII text is valid UTF-8 as is.
///
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
601
602///
603/// Checks if an input string is already escaped.
604/// The idea is to avoid escaping the escaped string thus making it a nightmare to undo the
605/// escaping later on.
606///
607/// Basically, if you were to blindly escape the input string, back slashes keep getting escaped.
608/// For example `\r -> \\r -> \\\\r -> ...`.
609///
610pub fn is_escaped_str(unescaped_str: &str) -> bool {
611    if !is_ascii_str(unescaped_str) {
612        return false;
613    }
614
615    for c in unescaped_str.chars() {
616        if !is_printable_char(&c) {
617            return false;
618        }
619    }
620    true
621}
622
623///
624/// Returns whether a character is in the ASCII printable range.
625///
626pub fn is_printable_char(c: &char) -> bool {
627    &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
628}
629
630///
631/// Removes all non ASCII and all non printable characters from string.
632///
633pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
634    let mut filtered = unescaped_str.to_rumstring();
635    filtered.retain(closure);
636    filtered
637}
638
639///
640/// Removes all non ASCII and all non printable characters from string.
641///
642pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
643    filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
644}