rumtk_core/
strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19 */
20use crate::core::{is_unique, RUMResult};
21use chardetng::EncodingDetector;
22pub use compact_str::{format_compact, CompactString, CompactStringExt, ToCompactString};
23use encoding_rs::Encoding;
24use std::fmt::Display;
25use unicode_segmentation::UnicodeSegmentation;
26/**************************** Constants**************************************/
27const ESCAPED_STRING_WINDOW: usize = 6;
28const ASCII_ESCAPE_CHAR: char = '\\';
29const MIN_ASCII_READABLE: char = ' ';
30const MAX_ASCII_READABLE: char = '~';
31pub const EMPTY_STRING: &str = "";
32pub const DOT_STR: &str = ".";
33pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
34pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
35
36/**************************** Types *****************************************/
37pub type RUMString = CompactString;
38
39/**************************** Traits ****************************************/
40
41///
42/// Implemented indexing trait for String and str which uses the UnicodeSegmentation facilities to
43/// enable grapheme iteration by default. There could be some performance penalty, but it will allow
44/// for native Unicode support to the best extent possible.
45///
46/// We also enable decoding from Encoding Standard encodings to UTF-8.
47///
48pub trait UTFStringExtensions {
49    fn count_graphemes(&self) -> usize;
50
51    ///
52    /// Return a grapheme unit which could span multiple Unicode codepoints or "characters".
53    ///
54    /// # Note
55    /// ```text
56    ///     If the grapheme requested does not exists, this method will return a blank string.
57    /// ```
58    ///
59    /// Instead of just retrieving a codepoint as character, I decided to take it a step further and
60    /// have support for grapheme selection such that characters in written language like sanskrit
61    /// can be properly selected and evaluated.
62    ///
63    /// [!CAUTION]
64    /// This can be an extremely slow operation over large strings since each call to this method
65    /// will need to rescan the input string every time we need to look up a grapheme. Unfortunately,
66    /// this is a side effect of convenience. To improve performance, call .get_graphemes() once and
67    /// then call take_grapheme() over that iterator.
68    ///
69    fn get_grapheme(&self, index: usize) -> &str;
70
71    fn get_graphemes(&self) -> Vec<&str>;
72
73    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
74
75    #[inline(always)]
76    fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
77        if index >= graphemes.len() {
78            return RUMString::from(EMPTY_STRING);
79        }
80        RUMString::from(graphemes[index])
81    }
82
83    #[inline(always)]
84    fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
85        let mut window: RUMString = RUMString::with_capacity(max - min);
86        let start = min + offset;
87        let end = max + offset;
88        let graphemes = self.get_graphemes();
89        for i in start..end {
90            window += &self.take_grapheme(&graphemes, i);
91        }
92        window
93    }
94
95    #[inline(always)]
96    fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
97        let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
98        for grapheme in self.get_grapheme_chunk(offset) {
99            if grapheme == end_pattern {
100                return RUMString::from(window);
101            } else {
102                window += grapheme;
103            }
104        }
105        RUMString::from(window)
106    }
107
108    #[inline(always)]
109    fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
110        for grapheme in self.get_grapheme_chunk(offset) {
111            if grapheme == pattern {
112                return grapheme;
113            }
114        }
115        EMPTY_STRING
116    }
117
118    #[inline(always)]
119    fn truncate(&self, max_size: usize) -> RUMString {
120        self.get_grapheme_window(0, max_size, 0)
121    }
122}
123
/// Gives access to a borrowed `&str` view of a string-like value.
pub trait AsStr {
    /// Returns the contents of `self` as a string slice.
    fn as_str(&self) -> &str;
}
127
128pub trait RUMStringConversions: ToString {
129    fn to_rumstring(&self) -> RUMString {
130        RUMString::from(self.to_string())
131    }
132
133    fn to_raw(&self) -> Vec<u8> {
134        self.to_string().as_bytes().to_vec()
135    }
136}
137
138pub trait StringUtils: AsStr + UTFStringExtensions {
139    #[inline(always)]
140    fn duplicate(&self, count: usize) -> RUMString {
141        let mut duplicated = RUMString::with_capacity(count);
142        for i in 0..count {
143            duplicated += &self.as_str();
144        }
145        duplicated
146    }
147
148    fn is_unique(&self) -> bool {
149        let graphemes = self.get_graphemes();
150        is_unique(&graphemes)
151    }
152}
153
154impl UTFStringExtensions for RUMString {
155    #[inline(always)]
156    fn count_graphemes(&self) -> usize {
157        self.graphemes(true).count()
158    }
159
160    #[inline(always)]
161    fn get_grapheme(&self, index: usize) -> &str {
162        self.graphemes(true)
163            .nth(index)
164            .or(EMPTY_STRING_OPTION)
165            .unwrap()
166    }
167
168    #[inline(always)]
169    fn get_graphemes(&self) -> Vec<&str> {
170        self.graphemes(true).collect::<Vec<&str>>()
171    }
172
173    #[inline(always)]
174    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
175        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
176    }
177}
178
179impl RUMStringConversions for RUMString {}
180impl AsStr for RUMString {
181    fn as_str(&self) -> &str {
182        self.as_str()
183    }
184}
185impl StringUtils for RUMString {}
186
187impl UTFStringExtensions for str {
188    #[inline(always)]
189    fn count_graphemes(&self) -> usize {
190        self.graphemes(true).count()
191    }
192
193    #[inline(always)]
194    fn get_grapheme(&self, index: usize) -> &str {
195        self.graphemes(true)
196            .nth(index)
197            .or(EMPTY_STRING_OPTION)
198            .unwrap()
199    }
200
201    #[inline(always)]
202    fn get_graphemes(&self) -> Vec<&str> {
203        self.graphemes(true).collect::<Vec<&str>>()
204    }
205
206    #[inline(always)]
207    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
208        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
209    }
210}
211
// `str` uses the provided `to_rumstring` / `to_raw` methods unchanged.
impl RUMStringConversions for str {}

impl AsStr for str {
    /// A `str` is already a string slice, so it is returned as is.
    fn as_str(&self) -> &str {
        self
    }
}

// `str` uses the provided `duplicate` / `is_unique` methods unchanged.
impl StringUtils for str {}

// `char` uses the provided `to_rumstring` / `to_raw` methods unchanged.
impl RUMStringConversions for char {}
223
/// Conversion of byte containers into a [`RUMString`].
pub trait RUMArrayConversions {
    /// Builds a [`RUMString`] from the bytes held by `self`.
    fn to_rumstring(&self) -> RUMString;
}
227
228impl RUMArrayConversions for Vec<u8> {
229    fn to_rumstring(&self) -> RUMString {
230        self.as_slice().to_rumstring()
231    }
232}
233
234impl RUMArrayConversions for &[u8] {
235    fn to_rumstring(&self) -> RUMString {
236        RUMString::from_utf8(&self).unwrap()
237    }
238}
239
240/**************************** Helpers ***************************************/
241
242pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
243    let mut count: usize = 0;
244    for tok in vector.iter() {
245        if string_token != tok {
246            count += 1;
247        }
248    }
249    count
250}
251
252///
253/// Implements decoding this string from its auto-detected encoding to UTF-8.
254/// Failing that we assume the string was encoded in UTF-8 and return a copy.
255///
256/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
257///
258pub fn try_decode(src: &[u8]) -> RUMString {
259    let mut detector = EncodingDetector::new();
260    detector.feed(&src, true);
261    let encoding = detector.guess(None, true);
262    decode(src, encoding)
263}
264
265///
266/// Implements decoding this string from a specific encoding to UTF-8.
267///
268/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
269///
270pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
271    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
272        Some(v) => v,
273        None => return RUMString::from(""),
274    };
275    decode(src, encoding)
276}
277
278///
279/// Implements decoding of input with encoder.
280///
281/// Note => Decoding is facilitated via the crate encoding_rs.
282///
283fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
284    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
285        Some(res) => RUMString::from(res),
286        None => RUMString::from_utf8(src).unwrap(),
287    }
288}
289
290///
291/// This function will scan through an escaped string and unescape any escaped characters.
292/// We collect these characters as a byte vector.
293/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
294/// valid UTF-8 string.
295///
296/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
297///
298pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
299    let graphemes = escaped_str.graphemes(true).collect::<Vec<&str>>();
300    let str_size = graphemes.len();
301    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
302    let mut i = 0;
303    while i < str_size {
304        let seq_start = graphemes[i];
305        match seq_start {
306            "\\" => {
307                let escape_seq = get_grapheme_string(&graphemes, " ", i);
308                let mut c = match unescape(&escape_seq) {
309                    Ok(c) => c,
310                    Err(_why) => Vec::from(escape_seq.as_bytes()),
311                };
312                result.append(&mut c);
313                i += &escape_seq.count_graphemes();
314            }
315            _ => {
316                result.append(&mut Vec::from(seq_start.as_bytes()));
317                i += 1;
318            }
319        }
320    }
321    Ok(try_decode(result.as_slice()))
322}
323
324///
325/// Get the grapheme block and concatenate it into a newly allocated [`RUMString`].
326///
327pub fn get_grapheme_string<'a>(
328    graphemes: &Vec<&'a str>,
329    end_grapheme: &str,
330    start_index: usize,
331) -> RUMString {
332    get_grapheme_collection(graphemes, end_grapheme, start_index).join_compact("")
333}
334
///
/// Returns the graphemes from `start_index` onward, stopping before the first grapheme that
/// equals `end_grapheme`.
///
/// The end grapheme itself is not part of the result. If it never occurs, everything from
/// `start_index` to the end is returned; a `start_index` past the end gives an empty vector.
///
/// A grapheme may span several codepoints, which is why the items are string slices rather
/// than `char`s.
///
pub fn get_grapheme_collection<'a>(
    graphemes: &Vec<&'a str>,
    end_grapheme: &str,
    start_index: usize,
) -> Vec<&'a str> {
    graphemes
        .iter()
        .skip(start_index)
        .copied()
        .take_while(|item| *item != end_grapheme)
        .collect()
}
356
357///
358/// Turn escaped character sequence into the equivalent UTF-8 character
359/// This function accepts \o, \x and \u formats.
360/// This function will also attempt to unescape the common C style control characters.
361/// Anything else needs to be expressed as hex or octal patterns with the formats above.
362///
363/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
364/// We can then use the bytes and attempt a decode() to figure out the string encoding and
365/// get the correct conversion to UTF-8. **Fingers crossed**
366///
367pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
368    let lower_case = escaped_str.to_lowercase();
369    let mut bytes: Vec<u8> = Vec::with_capacity(3);
370    match &lower_case[0..2] {
371        // Hex notation case. Assume we are getting xxyy bytes
372        "\\x" => {
373            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
374            bytes.append(&mut byte_str.as_bytes().to_vec());
375        }
376        // Unicode notation case, we need to do an extra step or we will lose key bytes.
377        "\\u" => {
378            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
379            bytes.append(&mut byte_str.as_bytes().to_vec());
380        }
381        // Single byte notation case
382        "\\c" => {
383            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
384            bytes.append(&mut byte_str.as_bytes().to_vec());
385        }
386        // Unicode notation case
387        "\\o" => {
388            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
389            bytes.append(&mut byte_str.as_bytes().to_vec());
390        }
391        // Multibyte notation case
392        "\\m" => match lower_case.count_graphemes() {
393            8 => {
394                bytes.push(hex_to_byte(&lower_case[2..4])?);
395                bytes.push(hex_to_byte(&lower_case[4..6])?);
396                bytes.push(hex_to_byte(&lower_case[6..8])?);
397            }
398            6 => {
399                bytes.push(hex_to_byte(&lower_case[2..4])?);
400                bytes.push(hex_to_byte(&lower_case[4..6])?);
401            }
402            _ => {
403                return Err(format_compact!(
404                    "Unknown multibyte sequence. Cannot decode {}",
405                    lower_case
406                ))
407            }
408        },
409        // Custom encoding
410        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
411        // Single byte codes.
412        _ => bytes.push(unescape_control_byte(&lower_case[0..2])?),
413    }
414    Ok(bytes)
415}
416
417///
418/// Unescape basic character
419/// We use pattern matching to map the basic escape character to its corresponding integer value.
420///
421fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
422    match escaped_str {
423        // Common control sequences
424        "\\t" => Ok('\t'),
425        "\\b" => Ok('\x08'),
426        "\\n" => Ok('\n'),
427        "\\r" => Ok('\r'),
428        "\\f" => Ok('\x14'),
429        "\\s" => Ok('\x20'),
430        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
431        "\\'" => Ok('\''),
432        "\\\"" => Ok('"'),
433        "\\0" => Ok('\0'),
434        "\\v" => Ok('\x0B'),
435        "\\a" => Ok('\x07'),
436        // Control sequences by
437        _ => Err(format_compact!(
438            "Unknown escape sequence? Sequence: {}!",
439            escaped_str
440        )),
441    }
442}
443
444///
445/// Unescape basic character
446/// We use pattern matching to map the basic escape character to its corresponding integer value.
447///
448fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
449    match escaped_str {
450        // Common control sequences
451        "\\t" => Ok(9),   // Tab/Character Tabulation
452        "\\b" => Ok(8),   // Backspace
453        "\\n" => Ok(10),  // New line/ Line Feed character
454        "\\r" => Ok(13),  // Carriage Return character
455        "\\f" => Ok(12),  // Form Feed
456        "\\s" => Ok(32),  // Space
457        "\\\\" => Ok(27), // Escape
458        "\\'" => Ok(39),  // Single quote
459        "\\\"" => Ok(34), // Double quote
460        "\\0" => Ok(0),   // Null character
461        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
462        "\\a" => Ok(7),   // Alert bell
463        // Control sequences by hex
464        //Err(format_compact!("Unknown escape sequence? Sequence: {}!", escaped_str))
465        _ => hex_to_byte(escaped_str),
466    }
467}
468
469///
470/// Turn hex string to number (u32)
471///
472fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
473    match u32::from_str_radix(&hex_str, 16) {
474        Ok(result) => Ok(result),
475        Err(val) => Err(format_compact!(
476            "Failed to parse string with error {}! Input string {} \
477        is not hex string!",
478            val,
479            hex_str
480        )),
481    }
482}
483
484///
485/// Turn hex string to byte (u8)
486///
487fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
488    match u8::from_str_radix(&hex_str, 16) {
489        Ok(result) => Ok(result),
490        Err(val) => Err(format_compact!(
491            "Failed to parse string with error {}! Input string {} \
492        is not hex string!",
493            val,
494            hex_str
495        )),
496    }
497}
498
499///
500/// Turn octal string to number (u32)
501///
502fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
503    match u32::from_str_radix(&hoctal_str, 8) {
504        Ok(result) => Ok(result),
505        Err(val) => Err(format_compact!(
506            "Failed to parse string with error {}! Input string {} \
507        is not an octal string!",
508            val,
509            hoctal_str
510        )),
511    }
512}
513
514///
515/// Turn octal string to byte (u32)
516///
517fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
518    match u8::from_str_radix(&hoctal_str, 8) {
519        Ok(result) => Ok(result),
520        Err(val) => Err(format_compact!(
521            "Failed to parse string with error {}! Input string {} \
522        is not an octal string!",
523            val,
524            hoctal_str
525        )),
526    }
527}
528
529///
530/// Turn number to UTF-8 char
531///
532fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
533    match char::from_u32(*num) {
534        Some(result) => Ok(result.to_rumstring()),
535        None => Err(format_compact!(
536            "Failed to cast number to character! Number {}",
537            num
538        )),
539    }
540}
541
542///
543/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
544/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
545/// and deal with validity at a higher layer.
546///
547fn number_to_char_unchecked(num: &u32) -> RUMString {
548    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
549}
550
551///
552/// Turn UTF-8 character into escaped character sequence as expected in HL7
553///
554/// # Example
555/// ```
556///  use rumtk_core::strings::{escape};
557///  let message = "I ❤ my wife!";
558///  let escaped_message = escape(&message);
559///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
560///```
561///
562pub fn escape(unescaped_str: &str) -> RUMString {
563    basic_escape(unescaped_str)
564        .replace("{", "")
565        .replace("}", "")
566        .to_rumstring()
567}
568
569///
570/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
571///
572/// # Example
573/// ```
574///  use rumtk_core::strings::basic_escape;
575///  let message = "I ❤ my wife!";
576///  let escaped_message = basic_escape(&message);
577///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
578///```
579pub fn basic_escape(unescaped_str: &str) -> RUMString {
580    let escaped = is_escaped_str(unescaped_str);
581    if !escaped {
582        return unescaped_str.escape_default().to_compact_string();
583    }
584    unescaped_str.to_rumstring()
585}
586
///
/// Tells whether every byte of the string is within the ASCII range.
///
/// Remember: all strings are UTF-8 encoded in Rust, and ASCII text is encoded identically in
/// UTF-8. An empty string counts as ASCII.
///
pub fn is_ascii_str(unescaped_str: &str) -> bool {
    unescaped_str.bytes().all(|byte| byte.is_ascii())
}
596
597///
598/// Checks if an input string is already escaped.
599/// The idea is to avoid escaping the escaped string thus making it a nightmare to undo the
600/// escaping later on.
601///
602/// Basically, if you were to blindly escape the input string, back slashes keep getting escaped.
603/// For example `\r -> \\r -> \\\\r -> ...`.
604///
605pub fn is_escaped_str(unescaped_str: &str) -> bool {
606    if !is_ascii_str(unescaped_str) {
607        return false;
608    }
609
610    for c in unescaped_str.chars() {
611        if !is_printable_char(&c) {
612            return false;
613        }
614    }
615    true
616}
617
618///
619/// Returns whether a character is in the ASCII printable range.
620///
621pub fn is_printable_char(c: &char) -> bool {
622    &MIN_ASCII_READABLE <= c && c <= &MAX_ASCII_READABLE
623}
624
625///
626/// Removes all non ASCII and all non printable characters from string.
627///
628pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
629    let mut filtered = unescaped_str.to_rumstring();
630    filtered.retain(closure);
631    filtered
632}
633
634///
635/// Removes all non ASCII and all non printable characters from string.
636///
637pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
638    filter_ascii(unescaped_str, |c: char| is_printable_char(&c))
639}