rusthl7/
escape_sequence.rs

1use log::{debug, trace};
2use regex::Regex;
3
4use crate::separators::Separators;
5use std::borrow::Cow;
6
7/// This struct provides the decoding functionality to parse escape sequences from the source string back to their original chars.
8///
9/// For more info see [here](https://www.lyniate.com/knowledge-hub/hl7-escape-sequences/) or [here](https://confluence.hl7australia.com/display/OOADRM20181/Appendix+1+Parsing+HL7v2#Appendix1ParsingHL7v2-Dealingwithreservedcharactersanddelimiters)
10///
11/// ## Example:
12/// ```
13/// # use rusthl7::escape_sequence::EscapeSequence;
14/// # use rusthl7::separators::Separators;
15/// let delims = Separators::default();
16/// let decoder = EscapeSequence::new(delims);
17/// let hl7_field_value = r#"Obstetrician \T\ Gynaecologist"#;
18/// let decoded = decoder.decode(hl7_field_value);
19/// assert_eq!(decoded, r#"Obstetrician & Gynaecologist"#);
20/// ```
21///
22/// ## Details
23///
24/// This decoder will replace some, **but not all** of the standard HL7 escape sequences.
25/// - `\E\`,`\F\`, '\R\`, `\S\`, `\T\` are all handled, and replaced with the Escape, Field, Repeat, Component and Sub-Component separator chars respectively
26/// - `\X..\` hexidecimal erscape sequences are supported (2 hex digits per char)
27///
28/// The following sequences are **NOT** replaced by design and will be left in the string:
29/// - `\H\` Indicates the start of highlighted text, this is a consuming application problem and will not be replaced.
30/// - `\N\` Indicates the end of highlighted text and resumption of normal text.  This is a consuming application problem and will not be replaced.
31/// - `\Z...\` Custom application escape sequences, these are custom (as are most `Z` items in HL7) and will not be replaced.
32///
33/// Also, not all of the sequences that _should_ be replaced are currently being handled, specifically:
34/// /// - `\Cxxyy\`, '\Mxxyyzz\ arguably _should_ be handled, but aren't currently.  There's [some suggestion](https://confluence.hl7australia.com/display/OOADRM20181/Appendix+1+Parsing+HL7v2#Appendix1ParsingHL7v2-Unicodecharacters) that these are discouraged in lieu of html-escaped values
35///
36/// If there's _no possibility_ of escape sequences (because there's no escape characters, typically backslashes) in the value, this function short circuits as early as possible and returns the original string slice for optimum performance.
37pub struct EscapeSequence {
38    escape_buf: [u8; 1],
39    field_buf: [u8; 1],
40    repeat_buf: [u8; 1],
41    component_buf: [u8; 1],
42    subcomponent_buf: [u8; 1],
43    escape_regex: Regex,
44}
45
46impl<'a> EscapeSequence {
47    /// Create a new struct ready for processing of escape sequences.
48    /// Escape sequences in HL7 are dependent on the actual delimiters used _for that message_, and so we need a [Separators] instance to know what chars we're working with.
49    ///
50    /// Creating a new [EscapeSequence] does involve some non-trivial work in order to improve the performance of the `decode()` operations.  It's expected that instances of this struct will be cached
51    /// per message, or per sending application if it will always use the same separators, or for the lifetime of the process if you're only dealing with known (often default) separators.
52    pub fn new(delims: Separators) -> EscapeSequence {
53        let regex = if delims.escape_char == '\\' {
54            Regex::new(r#"\\"#) // needs special handling because backslashes have meaning in regexes, and need to be escaped
55        } else {
56            Regex::new(String::from(delims.escape_char).as_str()) //everything else just works (I hope!)
57        }
58        .unwrap();
59
60        let mut return_val = EscapeSequence {
61            escape_buf: [0; 1], // The spec specifically requires single byte (actually 7-bit ASCII) delim chars
62            field_buf: [0; 1],
63            repeat_buf: [0; 1],
64            component_buf: [0; 1],
65            subcomponent_buf: [0; 1],
66            escape_regex: regex,
67        };
68
69        // We need &str to inject into the output buffer, convert the `Char` here
70        let _bytes = delims.escape_char.encode_utf8(&mut return_val.escape_buf);
71        let _bytes = delims.field.encode_utf8(&mut return_val.field_buf);
72        let _bytes = delims.repeat.encode_utf8(&mut return_val.repeat_buf);
73        let _bytes = delims.component.encode_utf8(&mut return_val.component_buf);
74        let _bytes = delims
75            .subcomponent
76            .encode_utf8(&mut return_val.subcomponent_buf);
77
78        return_val
79    }
80
81    /// This is where the magic happens.  Call this to update any escape sequences in the given &str.
82    pub fn decode<S>(&self, input: S) -> Cow<'a, str>
83    where
84        S: Into<Cow<'a, str>>,
85    {
86        // The comments below will almost certainly reference backslashes as that is by far the most common escape character
87        // the reality is any reference to "backslash" is actually referencing the escape char in the MSH segemnt, and stored in `self.delims.escape_char`
88
89        let input = input.into();
90        let first = self.escape_regex.find(&input); //using `regex.find` here is about twice as fast for the 'no sequences' benchmark as using &str.find()...
91
92        match first {
93            Some(first) => {
94                let first = first.start();
95
96                // We know there's a backslash, so we need to process stuff
97
98                // we're going to be replacing (mainly) 3 char escape sequences (eg `\F\`) with a single char sequence (eg `|`) so the initial length of the input should be sufficient
99                let mut output: Vec<u8> = Vec::with_capacity(input.len());
100                output.extend_from_slice(input[0..first].as_bytes()); // this doesn't include the escape char we found
101
102                // index in input that we're up to
103                let mut i = first;
104
105                debug!("Found first escape char at {}", first);
106
107                while i < input.len() {
108                    let start_of_sequence = self.escape_regex.find(&input[i..]);
109                    if start_of_sequence.is_none() {
110                        // there's nothing left to process, no more backslashes in the rest of the buffer
111
112                        trace!("No more sequence starts in input, completing...");
113                        output.extend_from_slice(input[i..].as_bytes()); // add the rest of the input
114                        break; // break out of while loop
115                    }
116
117                    let start_index = start_of_sequence.unwrap().start() + i; // index is offset into input by i chars as that's what's we subsliced above
118                    trace!("Found the next escape char at {}", start_index);
119
120                    let end_of_sequence = self.escape_regex.find(&input[start_index + 1..]);
121
122                    if end_of_sequence.is_none() {
123                        // there's nothing left to process, the backslash we are curently looking at is NOT an escape sequence
124                        trace!("No more sequence ends in input, completing...");
125                        output.extend_from_slice(input[start_index..].as_bytes()); // add the rest of the input (including the escape char that brought us here) in one go
126                        break; // break out of while loop
127                    }
128
129                    // else we have found another escape char, get the slice in between
130                    let end_index = end_of_sequence.unwrap().start() + start_index + 1; // the end is the number of chars after the start_index, not from the start of input
131                    trace!("Found end of sequence at {}", end_index);
132
133                    let sequence = &input[start_index + 1..end_index];
134                    trace!("Found escape sequence: '{}'", sequence);
135
136                    // we have a possible window of data between i and start_index that we've just read through as text, but isn't yet in output... append it now
137                    output.extend_from_slice(input[i..start_index].as_bytes());
138
139                    match sequence {
140                        "E" => output.extend_from_slice(&self.escape_buf),
141                        "F" => output.extend_from_slice(&self.field_buf),
142                        "R" => output.extend_from_slice(&self.repeat_buf),
143                        "S" => output.extend_from_slice(&self.component_buf),
144                        "T" => output.extend_from_slice(&self.subcomponent_buf),
145
146                        // Highlighted/Normal text sequences need to remain for consuming libraries to act on as they see fit
147                        "H" | "N" => {
148                            output.extend_from_slice(&self.escape_buf);
149                            output.extend_from_slice(sequence.as_bytes());
150                            output.extend_from_slice(&self.escape_buf);
151                        }
152
153                        _ => {
154                            if sequence.starts_with('Z') {
155                                trace!("Into custom escape sequence, ignoring...");
156                                output.extend_from_slice(&self.escape_buf);
157                                output.extend_from_slice(sequence.as_bytes());
158                                output.extend_from_slice(&self.escape_buf);
159                            } else if let Some(hex_code) = sequence.strip_prefix('X') {
160                                let hex = hex::decode(hex_code)
161                                    .expect("Unable to parse X-value into valid hex");
162                                println!("Converted hex code {} to {:?}", hex_code, hex);
163                                output.extend_from_slice(&hex);
164
165                            // TODO: Add more sequences
166                            } else {
167                                // not a known sequence, must just be two backslashes randomly in a string
168                                trace!("Unknown sequence, extending output...");
169                                output.extend_from_slice(
170                                    input[start_index - 1..end_index].as_bytes(),
171                                );
172                                // include both the initial escape char, and also the final one.
173                            }
174                        }
175                    }
176
177                    i = end_index + 1; // move through buffer, we we've covered everything up to this point now
178                } // while more chars in input to loop through
179
180                Cow::Owned(String::from_utf8(output).unwrap())
181            }
182            None => {
183                // no escape char in the string at all, just return what we have
184                input
185            }
186        }
187    }
188}
189
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::*;

    /// Builds a decoder that uses the default HL7 separators.
    fn default_decoder() -> EscapeSequence {
        EscapeSequence::new(Separators::default())
    }

    #[test]
    fn test_decode_does_nothing_if_not_required() {
        let text = "There are no escape sequences here/there/.";

        let decoded = default_decoder().decode(text);

        assert_eq!(decoded, text);
    }

    #[test]
    fn test_decode_handles_simple_x_codes() {
        let decoded = default_decoder().decode("Escape sequence with \\X0D\\.");

        assert_eq!(decoded, "Escape sequence with \r.");
    }

    #[test]
    fn test_decode_handles_multi_byte_x_codes() {
        let decoded = default_decoder().decode("Sentence 1.\\X0D0A\\Sentence 2.");

        assert_eq!(decoded, "Sentence 1.\r\nSentence 2.");
    }

    #[test]
    fn test_decode_does_nothing_if_backslash_is_not_escape_sequence() {
        let text = r#"There are no escape sequences here\there."#;

        let decoded = default_decoder().decode(text);

        assert_eq!(decoded, text);
    }

    #[test]
    fn test_decode_handles_field_sequence() {
        let decoded = default_decoder().decode(r#"Escape this \F\ please"#);

        assert_eq!(decoded, "Escape this | please");
    }

    #[test]
    fn ensure_decode_does_not_eat_chars_it_shouldnt() {
        // a lone escape char with no closing partner must be passed through as-is
        let text = r#"Escape this \F please"#;

        let decoded = default_decoder().decode(text);

        assert_eq!(decoded, text);
    }

    #[test]
    fn ensure_decode_handles_custom_delims() {
        let custom = Separators::from_str("MSH^!@#$").unwrap();
        let decoder = EscapeSequence::new(custom);

        let decoded = decoder.decode(r#"Escape this #F# please"#);

        assert_eq!(decoded, "Escape this ^ please");
    }

    #[test]
    fn ensure_decode_handles_eescape_sequence() {
        let decoder = default_decoder();

        // the escape sequence becomes a single escape char
        let decoded = decoder.decode(r#"Escape this \E\ please"#);
        assert_eq!(decoded, r#"Escape this \ please"#);

        // the decoder moves on past the char it just added and still handles later sequences
        let decoded = decoder.decode(r#"Escape this \E\ pretty \F\ please"#);
        assert_eq!(decoded, r#"Escape this \ pretty | please"#);
    }

    #[test]
    fn test_decode_handles_repeat_sequence() {
        let decoded = default_decoder().decode(r#"Escape this \R\ please"#);

        assert_eq!(decoded, "Escape this ~ please");
    }

    #[test]
    fn test_decode_handles_component_sequence() {
        let decoded = default_decoder().decode(r#"Escape this \S\ please"#);

        assert_eq!(decoded, "Escape this ^ please");
    }

    #[test]
    fn test_decode_handles_subcomponent_sequence() {
        let decoded = default_decoder().decode(r#"Obstetrician \T\ Gynaecologist"#);

        assert_eq!(decoded, "Obstetrician & Gynaecologist");
    }

    #[test]
    fn ensure_decode_ignores_highlighting_sequence() {
        let text = r#"Don't escape this \H\highlighted text\N\ please"#;

        let decoded = default_decoder().decode(text);

        assert_eq!(decoded, text);
    }

    #[test]
    fn ensure_decode_ignores_custom_sequence() {
        let text = r#"Don't escape this custom sequence \Z1234\ please"#;

        let decoded = default_decoder().decode(text);

        assert_eq!(decoded, text);
    }
}