rumtk_core/
strings.rs

1/*
2 * rumtk attempts to implement HL7 and medical protocols for interoperability in medicine.
3 * This toolkit aims to be reliable, simple, performant, and standards compliant.
4 * Copyright (C) 2024  Luis M. Santos, M.D.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19 */
20use crate::core::{is_unique, RUMResult};
21use chardetng::EncodingDetector;
22pub use compact_str::{format_compact, CompactString, CompactStringExt, ToCompactString};
23use encoding_rs::Encoding;
24use std::fmt::Display;
25use unicode_segmentation::UnicodeSegmentation;
26/**************************** Constants**************************************/
27const ESCAPED_STRING_WINDOW: usize = 6;
28const ASCII_ESCAPE_CHAR: char = '\\';
29const MIN_ASCII_READABLE: char = ' ';
30const MAX_ASCII_READABLE: char = '~';
31pub const EMPTY_STRING: &str = "";
32pub const DOT_STR: &str = ".";
33pub const EMPTY_STRING_OPTION: Option<&str> = Some("");
34pub const READABLE_ASCII: &str = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
35
36/**************************** Types *****************************************/
37pub type RUMString = CompactString;
38
39/**************************** Traits ****************************************/
40
41///
42/// Implemented indexing trait for String and str which uses the UnicodeSegmentation facilities to
43/// enable grapheme iteration by default. There could be some performance penalty, but it will allow
44/// for native Unicode support to the best extent possible.
45///
46/// We also enable decoding from Encoding Standard encodings to UTF-8.
47///
48pub trait UTFStringExtensions {
49    fn count_graphemes(&self) -> usize;
50
51    ///
52    /// Return a grapheme unit which could span multiple Unicode codepoints or "characters".
53    ///
54    /// # Note
55    /// ```text
56    ///     If the grapheme requested does not exists, this method will return a blank string.
57    /// ```
58    ///
59    /// Instead of just retrieving a codepoint as character, I decided to take it a step further and
60    /// have support for grapheme selection such that characters in written language like sanskrit
61    /// can be properly selected and evaluated.
62    ///
63    /// [!CAUTION]
64    /// This can be an extremely slow operation over large strings since each call to this method
65    /// will need to rescan the input string every time we need to look up a grapheme. Unfortunately,
66    /// this is a side effect of convenience. To improve performance, call .get_graphemes() once and
67    /// then call take_grapheme() over that iterator.
68    ///
69    fn get_grapheme(&self, index: usize) -> &str;
70
71    fn get_graphemes(&self) -> Vec<&str>;
72
73    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str>;
74
75    #[inline(always)]
76    fn take_grapheme<'a>(&self, graphemes: &Vec<&'a str>, index: usize) -> RUMString {
77        if index >= graphemes.len() {
78            return RUMString::from(EMPTY_STRING);
79        }
80        RUMString::from(graphemes[index])
81    }
82
83    #[inline(always)]
84    fn get_grapheme_window(&self, min: usize, max: usize, offset: usize) -> RUMString {
85        let mut window: RUMString = RUMString::with_capacity(max - min);
86        let start = min + offset;
87        let end = max + offset;
88        let graphemes = self.get_graphemes();
89        for i in start..end {
90            window += &self.take_grapheme(&graphemes, i);
91        }
92        window
93    }
94
95    #[inline(always)]
96    fn get_grapheme_string(&self, end_pattern: &str, offset: usize) -> RUMString {
97        let mut window: RUMString = RUMString::with_capacity(ESCAPED_STRING_WINDOW);
98        for grapheme in self.get_grapheme_chunk(offset) {
99            if grapheme == end_pattern {
100                return RUMString::from(window);
101            } else {
102                window += grapheme;
103            }
104        }
105        RUMString::from(window)
106    }
107
108    #[inline(always)]
109    fn find_grapheme(&self, pattern: &str, offset: usize) -> &str {
110        for grapheme in self.get_grapheme_chunk(offset) {
111            if grapheme == pattern {
112                return grapheme;
113            }
114        }
115        EMPTY_STRING
116    }
117
118    #[inline(always)]
119    fn truncate(&self, max_size: usize) -> RUMString {
120        self.get_grapheme_window(0, max_size, 0)
121    }
122}
123
///
/// Uniform access to the borrowed `&str` view of a string-like type.
///
pub trait AsStr {
    /// Borrow the value as a string slice.
    fn as_str(&self) -> &str;
}
127
128pub trait RUMStringConversions: ToString {
129    fn to_rumstring(&self) -> RUMString {
130        RUMString::from(self.to_string())
131    }
132
133    fn to_raw(&self) -> Vec<u8> {
134        self.to_string().as_bytes().to_vec()
135    }
136}
137
138pub trait StringUtils: AsStr + UTFStringExtensions {
139    #[inline(always)]
140    fn duplicate(&self, count: usize) -> RUMString {
141        let mut duplicated = RUMString::with_capacity(count);
142        for i in 0..count {
143            duplicated += &self.as_str();
144        }
145        duplicated
146    }
147
148    fn is_unique(&self) -> bool {
149        let graphemes = self.get_graphemes();
150        is_unique(&graphemes)
151    }
152}
153
154impl UTFStringExtensions for RUMString {
155    #[inline(always)]
156    fn count_graphemes(&self) -> usize {
157        self.graphemes(true).count()
158    }
159
160    #[inline(always)]
161    fn get_grapheme(&self, index: usize) -> &str {
162        self.graphemes(true)
163            .nth(index)
164            .or(EMPTY_STRING_OPTION)
165            .unwrap()
166    }
167
168    #[inline(always)]
169    fn get_graphemes(&self) -> Vec<&str> {
170        self.graphemes(true).collect::<Vec<&str>>()
171    }
172
173    #[inline(always)]
174    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
175        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
176    }
177}
178
179impl RUMStringConversions for RUMString {}
180impl AsStr for RUMString {
181    fn as_str(&self) -> &str {
182        self.as_str()
183    }
184}
185impl StringUtils for RUMString {}
186
187impl UTFStringExtensions for str {
188    #[inline(always)]
189    fn count_graphemes(&self) -> usize {
190        self.graphemes(true).count()
191    }
192
193    #[inline(always)]
194    fn get_grapheme(&self, index: usize) -> &str {
195        self.graphemes(true)
196            .nth(index)
197            .or(EMPTY_STRING_OPTION)
198            .unwrap()
199    }
200
201    #[inline(always)]
202    fn get_graphemes(&self) -> Vec<&str> {
203        self.graphemes(true).collect::<Vec<&str>>()
204    }
205
206    #[inline(always)]
207    fn get_grapheme_chunk(&self, offset: usize) -> Vec<&str> {
208        self.graphemes(true).skip(offset).collect::<Vec<&str>>()
209    }
210}
211
212impl RUMStringConversions for str {}
213
214impl AsStr for str {
215    fn as_str(&self) -> &str {
216        self
217    }
218}
219
220impl StringUtils for str {}
221
222impl RUMStringConversions for char {}
223
224pub trait RUMArrayConversions {
225    fn to_rumstring(&self) -> RUMString;
226}
227
228impl RUMArrayConversions for Vec<u8> {
229    fn to_rumstring(&self) -> RUMString {
230        self.as_slice().to_rumstring()
231    }
232}
233
234impl RUMArrayConversions for &[u8] {
235    fn to_rumstring(&self) -> RUMString {
236        RUMString::from_utf8(&self).unwrap()
237    }
238}
239
240/**************************** Helpers ***************************************/
241
242pub fn count_tokens_ignoring_pattern(vector: &Vec<&str>, string_token: &RUMString) -> usize {
243    let mut count: usize = 0;
244    for tok in vector.iter() {
245        if string_token != tok {
246            count += 1;
247        }
248    }
249    count
250}
251
252///
253/// Implements decoding this string from its auto-detected encoding to UTF-8.
254/// Failing that we assume the string was encoded in UTF-8 and return a copy.
255///
256/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
257///
258pub fn try_decode(src: &[u8]) -> RUMString {
259    let mut detector = EncodingDetector::new();
260    detector.feed(&src, true);
261    let encoding = detector.guess(None, true);
262    decode(src, encoding)
263}
264
265///
266/// Implements decoding this string from a specific encoding to UTF-8.
267///
268/// Note => Decoding is facilitated via the crates chardet-ng and encoding_rs.
269///
270pub fn try_decode_with(src: &[u8], encoding_name: &str) -> RUMString {
271    let encoding = match Encoding::for_label(encoding_name.as_bytes()) {
272        Some(v) => v,
273        None => return RUMString::from(""),
274    };
275    decode(src, encoding)
276}
277
278///
279/// Implements decoding of input with encoder.
280///
281/// Note => Decoding is facilitated via the crate encoding_rs.
282///
283fn decode(src: &[u8], encoding: &'static Encoding) -> RUMString {
284    match encoding.decode_without_bom_handling_and_without_replacement(&src) {
285        Some(res) => RUMString::from(res),
286        None => RUMString::from_utf8(src).unwrap(),
287    }
288}
289
290///
291/// This function will scan through an escaped string and unescape any escaped characters.
292/// We collect these characters as a byte vector.
293/// Finally, we do a decode pass on the vector to re-encode the bytes **hopefully right** into a
294/// valid UTF-8 string.
295///
296/// This function focuses on reverting the result of [escape], whose output is meant for HL7.
297///
298pub fn unescape_string(escaped_str: &str) -> RUMResult<RUMString> {
299    let str_size = escaped_str.count_graphemes();
300    let mut result: Vec<u8> = Vec::with_capacity(escaped_str.len());
301    let mut i = 0;
302    while i < str_size {
303        let seq_start = escaped_str.get_grapheme(i);
304        match seq_start {
305            "\\" => {
306                let escape_seq = escaped_str.get_grapheme_string(" ", i);
307                let mut c = match unescape(&escape_seq) {
308                    Ok(c) => c,
309                    Err(_why) => Vec::from(escape_seq.as_bytes()),
310                };
311                result.append(&mut c);
312                i += &escape_seq.count_graphemes();
313            }
314            _ => {
315                result.append(&mut Vec::from(seq_start.as_bytes()));
316                i += 1;
317            }
318        }
319    }
320    Ok(try_decode(result.as_slice()))
321}
322
323///
324/// Turn escaped character sequence into the equivalent UTF-8 character
325/// This function accepts \o, \x and \u formats.
326/// This function will also attempt to unescape the common C style control characters.
327/// Anything else needs to be expressed as hex or octal patterns with the formats above.
328///
329/// If I did this right, I should get the "raw" byte sequence out of the escaped string.
330/// We can then use the bytes and attempt a decode() to figure out the string encoding and
331/// get the correct conversion to UTF-8. **Fingers crossed**
332///
333pub fn unescape(escaped_str: &str) -> Result<Vec<u8>, RUMString> {
334    let lower_case = escaped_str.to_lowercase();
335    let mut bytes: Vec<u8> = Vec::with_capacity(3);
336    match &lower_case[0..2] {
337        // Hex notation case. Assume we are getting xxyy bytes
338        "\\x" => {
339            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
340            bytes.append(&mut byte_str.as_bytes().to_vec());
341        }
342        // Unicode notation case, we need to do an extra step or we will lose key bytes.
343        "\\u" => {
344            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
345            bytes.append(&mut byte_str.as_bytes().to_vec());
346        }
347        // Single byte notation case
348        "\\c" => {
349            let byte_str = number_to_char_unchecked(&hex_to_number(&lower_case[2..6])?);
350            bytes.append(&mut byte_str.as_bytes().to_vec());
351        }
352        // Unicode notation case
353        "\\o" => {
354            let byte_str = number_to_char_unchecked(&octal_to_number(&lower_case[2..6])?);
355            bytes.append(&mut byte_str.as_bytes().to_vec());
356        }
357        // Multibyte notation case
358        "\\m" => match lower_case.count_graphemes() {
359            8 => {
360                bytes.push(hex_to_byte(&lower_case[2..4])?);
361                bytes.push(hex_to_byte(&lower_case[4..6])?);
362                bytes.push(hex_to_byte(&lower_case[6..8])?);
363            }
364            6 => {
365                bytes.push(hex_to_byte(&lower_case[2..4])?);
366                bytes.push(hex_to_byte(&lower_case[4..6])?);
367            }
368            _ => {
369                return Err(format_compact!(
370                    "Unknown multibyte sequence. Cannot decode {}",
371                    lower_case
372                ))
373            }
374        },
375        // Custom encoding
376        "\\z" => bytes.append(&mut lower_case.as_bytes().to_vec()),
377        // Single byte codes.
378        _ => bytes.push(unescape_control_byte(&lower_case)?),
379    }
380    Ok(bytes)
381}
382
383///
384/// Unescape basic character
385/// We use pattern matching to map the basic escape character to its corresponding integer value.
386///
387fn unescape_control(escaped_str: &str) -> Result<char, RUMString> {
388    match escaped_str {
389        // Common control sequences
390        "\\t" => Ok('\t'),
391        "\\b" => Ok('\x08'),
392        "\\n" => Ok('\n'),
393        "\\r" => Ok('\r'),
394        "\\f" => Ok('\x14'),
395        "\\s" => Ok('\x20'),
396        "\\\\" => Ok(ASCII_ESCAPE_CHAR),
397        "\\'" => Ok('\''),
398        "\\\"" => Ok('\"'),
399        "\\0" => Ok('\0'),
400        "\\v" => Ok('\x0B'),
401        "\\a" => Ok('\x07'),
402        // Control sequences by
403        _ => Err(format_compact!(
404            "Unknown escape sequence? Sequence: {}!",
405            escaped_str
406        )),
407    }
408}
409
410///
411/// Unescape basic character
412/// We use pattern matching to map the basic escape character to its corresponding integer value.
413///
414fn unescape_control_byte(escaped_str: &str) -> Result<u8, RUMString> {
415    match escaped_str {
416        // Common control sequences
417        "\\t" => Ok(9),   // Tab/Character Tabulation
418        "\\b" => Ok(8),   // Backspace
419        "\\n" => Ok(10),  // New line/ Line Feed character
420        "\\r" => Ok(13),  // Carriage Return character
421        "\\f" => Ok(12),  // Form Feed
422        "\\s" => Ok(32),  // Space
423        "\\\\" => Ok(27), // Escape
424        "\\'" => Ok(39),  // Single quote
425        "\\\"" => Ok(34), // Double quote
426        "\\0" => Ok(0),   // Null character
427        "\\v" => Ok(11),  // Vertical Tab/Line Tabulation
428        "\\a" => Ok(7),   // Alert bell
429        // Control sequences by hex
430        //Err(format_compact!("Unknown escape sequence? Sequence: {}!", escaped_str))
431        _ => hex_to_byte(&escaped_str[2..]),
432    }
433}
434
435///
436/// Turn hex string to number (u32)
437///
438fn hex_to_number(hex_str: &str) -> Result<u32, RUMString> {
439    match u32::from_str_radix(&hex_str, 16) {
440        Ok(result) => Ok(result),
441        Err(val) => Err(format_compact!(
442            "Failed to parse string with error {}! Input string {} \
443        is not hex string!",
444            val,
445            hex_str
446        )),
447    }
448}
449
450///
451/// Turn hex string to byte (u8)
452///
453fn hex_to_byte(hex_str: &str) -> Result<u8, RUMString> {
454    match u8::from_str_radix(&hex_str, 16) {
455        Ok(result) => Ok(result),
456        Err(val) => Err(format_compact!(
457            "Failed to parse string with error {}! Input string {} \
458        is not hex string!",
459            val,
460            hex_str
461        )),
462    }
463}
464
465///
466/// Turn octal string to number (u32)
467///
468fn octal_to_number(hoctal_str: &str) -> Result<u32, RUMString> {
469    match u32::from_str_radix(&hoctal_str, 8) {
470        Ok(result) => Ok(result),
471        Err(val) => Err(format_compact!(
472            "Failed to parse string with error {}! Input string {} \
473        is not an octal string!",
474            val,
475            hoctal_str
476        )),
477    }
478}
479
480///
481/// Turn octal string to byte (u32)
482///
483fn octal_to_byte(hoctal_str: &str) -> Result<u8, RUMString> {
484    match u8::from_str_radix(&hoctal_str, 8) {
485        Ok(result) => Ok(result),
486        Err(val) => Err(format_compact!(
487            "Failed to parse string with error {}! Input string {} \
488        is not an octal string!",
489            val,
490            hoctal_str
491        )),
492    }
493}
494
495///
496/// Turn number to UTF-8 char
497///
498fn number_to_char(num: &u32) -> Result<RUMString, RUMString> {
499    match char::from_u32(*num) {
500        Some(result) => Ok(result.to_rumstring()),
501        None => Err(format_compact!(
502            "Failed to cast number to character! Number {}",
503            num
504        )),
505    }
506}
507
508///
509/// Turn number to UTF-8 char. Normally, calling from_u32 checks if the value is a valid character.
510/// This version uses the less safe from_u32_unchecked() function because we want to get the bytes
511/// and deal with validity at a higher layer.
512///
513fn number_to_char_unchecked(num: &u32) -> RUMString {
514    unsafe { char::from_u32_unchecked(*num).to_rumstring() }
515}
516
517///
518/// Turn UTF-8 character into escaped character sequence as expected in HL7
519///
520/// # Example
521/// ```
522///  use rumtk_core::strings::{escape};
523///  let message = "I ❤ my wife!";
524///  let escaped_message = escape(&message);
525///  assert_eq!("I \\u2764 my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
526///```
527///
528pub fn escape(unescaped_str: &str) -> RUMString {
529    basic_escape(unescaped_str)
530        .replace("{", "")
531        .replace("}", "")
532        .to_rumstring()
533}
534
535///
536/// Escape UTF-8 characters in UTF-8 string that are beyond ascii range
537///
538/// # Example
539/// ```
540///  use rumtk_core::strings::basic_escape;
541///  let message = "I ❤ my wife!";
542///  let escaped_message = basic_escape(&message);
543///  assert_eq!("I \\u{2764} my wife!", &escaped_message, "Did not get expected escaped string! Got {}!", &escaped_message);
544///```
545pub fn basic_escape(unescaped_str: &str) -> RUMString {
546    unescaped_str.escape_default().to_compact_string()
547}
548
549///
550/// Removes all non ASCII and all non printable characters from string.
551///
552pub fn filter_ascii(unescaped_str: &str, closure: fn(char) -> bool) -> RUMString {
553    let mut filtered = unescaped_str.to_rumstring();
554    filtered.retain(closure);
555    filtered
556}
557
558///
559/// Removes all non ASCII and all non printable characters from string.
560///
561pub fn filter_non_printable_ascii(unescaped_str: &str) -> RUMString {
562    filter_ascii(unescaped_str, |c: char| {
563        !c.is_ascii() && (' ' <= c || c <= '~')
564    })
565}