rusthl7/escape_sequence.rs
1use log::{debug, trace};
2use regex::Regex;
3
4use crate::separators::Separators;
5use std::borrow::Cow;
6
/// This struct provides the decoding functionality to parse escape sequences from the source string back to their original chars.
///
/// For more info see [here](https://www.lyniate.com/knowledge-hub/hl7-escape-sequences/) or [here](https://confluence.hl7australia.com/display/OOADRM20181/Appendix+1+Parsing+HL7v2#Appendix1ParsingHL7v2-Dealingwithreservedcharactersanddelimiters)
///
/// ## Example:
/// ```
/// # use rusthl7::escape_sequence::EscapeSequence;
/// # use rusthl7::separators::Separators;
/// let delims = Separators::default();
/// let decoder = EscapeSequence::new(delims);
/// let hl7_field_value = r#"Obstetrician \T\ Gynaecologist"#;
/// let decoded = decoder.decode(hl7_field_value);
/// assert_eq!(decoded, r#"Obstetrician & Gynaecologist"#);
/// ```
///
/// ## Details
///
/// This decoder will replace some, **but not all** of the standard HL7 escape sequences.
/// - `\E\`, `\F\`, `\R\`, `\S\`, `\T\` are all handled, and replaced with the Escape, Field, Repeat, Component and Sub-Component separator chars respectively
/// - `\X..\` hexadecimal escape sequences are supported (2 hex digits per byte)
///
/// The following sequences are **NOT** replaced by design and will be left in the string:
/// - `\H\` Indicates the start of highlighted text, this is a consuming application problem and will not be replaced.
/// - `\N\` Indicates the end of highlighted text and resumption of normal text. This is a consuming application problem and will not be replaced.
/// - `\Z...\` Custom application escape sequences, these are custom (as are most `Z` items in HL7) and will not be replaced.
///
/// Also, not all of the sequences that _should_ be replaced are currently being handled, specifically:
/// - `\Cxxyy\`, `\Mxxyyzz\` arguably _should_ be handled, but aren't currently. There's [some suggestion](https://confluence.hl7australia.com/display/OOADRM20181/Appendix+1+Parsing+HL7v2#Appendix1ParsingHL7v2-Unicodecharacters) that these are discouraged in lieu of html-escaped values
///
/// If there's _no possibility_ of escape sequences (because there are no escape characters, typically backslashes) in the value, `decode()` short circuits as early as possible and returns the original string slice for optimum performance.
pub struct EscapeSequence {
    // Each `*_buf` holds the single-byte UTF-8 encoding of one delimiter char, ready to be
    // appended straight onto the decoder's byte output buffer.
    /// Encoded escape char; emitted for `\E\` and used to re-wrap sequences that are passed through.
    escape_buf: [u8; 1],
    /// Encoded field separator; emitted for `\F\`.
    field_buf: [u8; 1],
    /// Encoded repeat separator; emitted for `\R\`.
    repeat_buf: [u8; 1],
    /// Encoded component separator; emitted for `\S\`.
    component_buf: [u8; 1],
    /// Encoded sub-component separator; emitted for `\T\`.
    subcomponent_buf: [u8; 1],
    /// Pre-compiled regex that matches a single escape char; used to locate sequence boundaries.
    escape_regex: Regex,
}
45
46impl<'a> EscapeSequence {
47 /// Create a new struct ready for processing of escape sequences.
48 /// Escape sequences in HL7 are dependent on the actual delimiters used _for that message_, and so we need a [Separators] instance to know what chars we're working with.
49 ///
50 /// Creating a new [EscapeSequence] does involve some non-trivial work in order to improve the performance of the `decode()` operations. It's expected that instances of this struct will be cached
51 /// per message, or per sending application if it will always use the same separators, or for the lifetime of the process if you're only dealing with known (often default) separators.
52 pub fn new(delims: Separators) -> EscapeSequence {
53 let regex = if delims.escape_char == '\\' {
54 Regex::new(r#"\\"#) // needs special handling because backslashes have meaning in regexes, and need to be escaped
55 } else {
56 Regex::new(String::from(delims.escape_char).as_str()) //everything else just works (I hope!)
57 }
58 .unwrap();
59
60 let mut return_val = EscapeSequence {
61 escape_buf: [0; 1], // The spec specifically requires single byte (actually 7-bit ASCII) delim chars
62 field_buf: [0; 1],
63 repeat_buf: [0; 1],
64 component_buf: [0; 1],
65 subcomponent_buf: [0; 1],
66 escape_regex: regex,
67 };
68
69 // We need &str to inject into the output buffer, convert the `Char` here
70 let _bytes = delims.escape_char.encode_utf8(&mut return_val.escape_buf);
71 let _bytes = delims.field.encode_utf8(&mut return_val.field_buf);
72 let _bytes = delims.repeat.encode_utf8(&mut return_val.repeat_buf);
73 let _bytes = delims.component.encode_utf8(&mut return_val.component_buf);
74 let _bytes = delims
75 .subcomponent
76 .encode_utf8(&mut return_val.subcomponent_buf);
77
78 return_val
79 }
80
81 /// This is where the magic happens. Call this to update any escape sequences in the given &str.
82 pub fn decode<S>(&self, input: S) -> Cow<'a, str>
83 where
84 S: Into<Cow<'a, str>>,
85 {
86 // The comments below will almost certainly reference backslashes as that is by far the most common escape character
87 // the reality is any reference to "backslash" is actually referencing the escape char in the MSH segemnt, and stored in `self.delims.escape_char`
88
89 let input = input.into();
90 let first = self.escape_regex.find(&input); //using `regex.find` here is about twice as fast for the 'no sequences' benchmark as using &str.find()...
91
92 match first {
93 Some(first) => {
94 let first = first.start();
95
96 // We know there's a backslash, so we need to process stuff
97
98 // we're going to be replacing (mainly) 3 char escape sequences (eg `\F\`) with a single char sequence (eg `|`) so the initial length of the input should be sufficient
99 let mut output: Vec<u8> = Vec::with_capacity(input.len());
100 output.extend_from_slice(input[0..first].as_bytes()); // this doesn't include the escape char we found
101
102 // index in input that we're up to
103 let mut i = first;
104
105 debug!("Found first escape char at {}", first);
106
107 while i < input.len() {
108 let start_of_sequence = self.escape_regex.find(&input[i..]);
109 if start_of_sequence.is_none() {
110 // there's nothing left to process, no more backslashes in the rest of the buffer
111
112 trace!("No more sequence starts in input, completing...");
113 output.extend_from_slice(input[i..].as_bytes()); // add the rest of the input
114 break; // break out of while loop
115 }
116
117 let start_index = start_of_sequence.unwrap().start() + i; // index is offset into input by i chars as that's what's we subsliced above
118 trace!("Found the next escape char at {}", start_index);
119
120 let end_of_sequence = self.escape_regex.find(&input[start_index + 1..]);
121
122 if end_of_sequence.is_none() {
123 // there's nothing left to process, the backslash we are curently looking at is NOT an escape sequence
124 trace!("No more sequence ends in input, completing...");
125 output.extend_from_slice(input[start_index..].as_bytes()); // add the rest of the input (including the escape char that brought us here) in one go
126 break; // break out of while loop
127 }
128
129 // else we have found another escape char, get the slice in between
130 let end_index = end_of_sequence.unwrap().start() + start_index + 1; // the end is the number of chars after the start_index, not from the start of input
131 trace!("Found end of sequence at {}", end_index);
132
133 let sequence = &input[start_index + 1..end_index];
134 trace!("Found escape sequence: '{}'", sequence);
135
136 // we have a possible window of data between i and start_index that we've just read through as text, but isn't yet in output... append it now
137 output.extend_from_slice(input[i..start_index].as_bytes());
138
139 match sequence {
140 "E" => output.extend_from_slice(&self.escape_buf),
141 "F" => output.extend_from_slice(&self.field_buf),
142 "R" => output.extend_from_slice(&self.repeat_buf),
143 "S" => output.extend_from_slice(&self.component_buf),
144 "T" => output.extend_from_slice(&self.subcomponent_buf),
145
146 // Highlighted/Normal text sequences need to remain for consuming libraries to act on as they see fit
147 "H" | "N" => {
148 output.extend_from_slice(&self.escape_buf);
149 output.extend_from_slice(sequence.as_bytes());
150 output.extend_from_slice(&self.escape_buf);
151 }
152
153 _ => {
154 if sequence.starts_with('Z') {
155 trace!("Into custom escape sequence, ignoring...");
156 output.extend_from_slice(&self.escape_buf);
157 output.extend_from_slice(sequence.as_bytes());
158 output.extend_from_slice(&self.escape_buf);
159 } else if let Some(hex_code) = sequence.strip_prefix('X') {
160 let hex = hex::decode(hex_code)
161 .expect("Unable to parse X-value into valid hex");
162 println!("Converted hex code {} to {:?}", hex_code, hex);
163 output.extend_from_slice(&hex);
164
165 // TODO: Add more sequences
166 } else {
167 // not a known sequence, must just be two backslashes randomly in a string
168 trace!("Unknown sequence, extending output...");
169 output.extend_from_slice(
170 input[start_index - 1..end_index].as_bytes(),
171 );
172 // include both the initial escape char, and also the final one.
173 }
174 }
175 }
176
177 i = end_index + 1; // move through buffer, we we've covered everything up to this point now
178 } // while more chars in input to loop through
179
180 Cow::Owned(String::from_utf8(output).unwrap())
181 }
182 None => {
183 // no escape char in the string at all, just return what we have
184 input
185 }
186 }
187 }
188}
189
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::*;

    /// Builds a decoder configured with the default HL7 separators.
    fn default_decoder() -> EscapeSequence {
        EscapeSequence::new(Separators::default())
    }

    #[test]
    fn test_decode_does_nothing_if_not_required() {
        let decoder = default_decoder();

        // forward slashes are not the escape char, so nothing should change
        let raw = "There are no escape sequences here/there/.";
        assert_eq!(decoder.decode(raw), raw);
    }

    #[test]
    fn test_decode_handles_simple_x_codes() {
        let decoder = default_decoder();

        let decoded = decoder.decode("Escape sequence with \\X0D\\.");
        assert_eq!(decoded, "Escape sequence with \r.");
    }

    #[test]
    fn test_decode_handles_multi_byte_x_codes() {
        let decoder = default_decoder();

        let decoded = decoder.decode("Sentence 1.\\X0D0A\\Sentence 2.");
        assert_eq!(decoded, "Sentence 1.\r\nSentence 2.");
    }

    #[test]
    fn test_decode_does_nothing_if_backslash_is_not_escape_sequence() {
        let decoder = default_decoder();

        // a lone escape char with no partner is plain text
        let raw = r#"There are no escape sequences here\there."#;
        assert_eq!(decoder.decode(raw), raw);
    }

    #[test]
    fn test_decode_handles_field_sequence() {
        let decoder = default_decoder();

        let decoded = decoder.decode(r#"Escape this \F\ please"#);
        assert_eq!(decoded, "Escape this | please");
    }

    #[test]
    fn ensure_decode_does_not_eat_chars_it_shouldnt() {
        let decoder = default_decoder();

        // the sequence is never closed, so it must survive untouched
        let raw = r#"Escape this \F please"#;
        assert_eq!(decoder.decode(raw), raw);
    }

    #[test]
    fn ensure_decode_handles_custom_delims() {
        let custom = Separators::from_str("MSH^!@#$").unwrap();
        let decoder = EscapeSequence::new(custom);

        let decoded = decoder.decode(r#"Escape this #F# please"#);
        assert_eq!(decoded, "Escape this ^ please");
    }

    #[test]
    fn ensure_decode_handles_eescape_sequence() {
        let decoder = default_decoder();

        // the escape sequence becomes a single escape char
        let decoded = decoder.decode(r#"Escape this \E\ please"#);
        assert_eq!(decoded, r#"Escape this \ please"#);

        // the decoder must move on past the char it just added and still handle later sequences
        let decoded = decoder.decode(r#"Escape this \E\ pretty \F\ please"#);
        assert_eq!(decoded, r#"Escape this \ pretty | please"#);
    }

    #[test]
    fn test_decode_handles_repeat_sequence() {
        let decoder = default_decoder();

        let decoded = decoder.decode(r#"Escape this \R\ please"#);
        assert_eq!(decoded, "Escape this ~ please");
    }

    #[test]
    fn test_decode_handles_component_sequence() {
        let decoder = default_decoder();

        let decoded = decoder.decode(r#"Escape this \S\ please"#);
        assert_eq!(decoded, "Escape this ^ please");
    }

    #[test]
    fn test_decode_handles_subcomponent_sequence() {
        let decoder = default_decoder();

        let decoded = decoder.decode(r#"Obstetrician \T\ Gynaecologist"#);
        assert_eq!(decoded, "Obstetrician & Gynaecologist");
    }

    #[test]
    fn ensure_decode_ignores_highlighting_sequence() {
        let decoder = default_decoder();

        // highlight start/end markers are left for the consuming application
        let raw = r#"Don't escape this \H\highlighted text\N\ please"#;
        assert_eq!(decoder.decode(raw), raw);
    }

    #[test]
    fn ensure_decode_ignores_custom_sequence() {
        let decoder = default_decoder();

        // `Z` sequences are application specific and passed through
        let raw = r#"Don't escape this custom sequence \Z1234\ please"#;
        assert_eq!(decoder.decode(raw), raw);
    }
}