1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
//! This module contains a collection of sanitizers which is really just a fancy way of saying
//! that this is a bunch of functions which take strings, change them, and give them back.

use deunicode::deunicode_char;
use pest::Parser;

use crate::MT940Parser;
use crate::Rule;

/// Apply every sanitizer in this module, in an order that makes sense.
///
/// Use this when you do not care exactly _how_ your input gets cleaned up and simply want
/// something that parses. The steps are, in order:
///
/// 1. [`to_swift_charset`] — map every character into the SWIFT charset,
/// 2. [`strip_stuff_between_messages`] — drop separator lines between messages,
/// 3. [`strip_excess_tag86_lines`] — cut tag 86 blocks down to the allowed line count.
///
/// Be aware that data may be truncated along the way in order to produce valid statements.
pub fn sanitize(s: &str) -> String {
    let swift_only = to_swift_charset(s);
    let without_separators = strip_stuff_between_messages(&swift_only);
    strip_excess_tag86_lines(&without_separators)
}

/// Try to make a given input conform to the SWIFT MT101 allowed charset.
///
/// Every character that is already accepted by the `swift_char` grammar rule is kept as is.
/// All other characters are run through `deunicode_char`, which turns characters like
/// 'ä', 'ö', 'ü', 'ú' and so on into their sensible ASCII equivalents (note that one input
/// character may become several output characters, e.g. 'ß' becomes "ss").
/// Any remaining non-SWIFT characters (like '!', '=', etc) will be replaced with a dot ('.') each.
/// [SWIFT MT101 characters reference here](http://www.sepaforcorporates.com/swift-for-corporates/quick-guide-swift-mt101-format/).
pub fn to_swift_charset(s: &str) -> String {
    // The output is at least roughly as long as the input, so reserve that much up front
    // instead of growing (and allocating one `String` per character) as we go.
    let mut sanitized = String::with_capacity(s.len());
    // Scratch space to view a single `char` as a `&str` without a heap allocation.
    let mut utf8_buf = [0u8; 4];

    for c in s.chars() {
        let single: &str = c.encode_utf8(&mut utf8_buf);

        // If parsing succeeds, we already have a SWIFT charset allowable character, yay!
        if MT940Parser::parse(Rule::swift_char, single).is_ok() {
            sanitized.push(c);
            continue;
        }

        // Otherwise try the ASCII transliteration. We have to check it with
        // `Rule::swift_chars` (plural) because a single Unicode character might be deunicoded
        // to multiple ASCII chars, and the transliteration itself may still contain
        // characters outside of the SWIFT charset.
        match deunicode_char(c) {
            Some(ascii) if MT940Parser::parse(Rule::swift_chars, ascii).is_ok() => {
                sanitized.push_str(ascii)
            }
            // No transliteration available, or it is not SWIFT-clean either: if all else
            // fails, we can only replace this character with a dot and move on.
            _ => sanitized.push('.'),
        }
    }

    sanitized
}

/// Remove stuff between messages.
///
/// Sometimes, statements will have messages separated with `-` or the like to keep them
/// visually separate like this:
///
/// ```ignore
/// ...
/// :62F:D070904EUR1237628,23
/// :64:D070904EUR1237628,23
/// -
/// :20:T089413956000001
/// :25:50880050/0194777100888
/// ...
/// ```
///
/// However, this makes them non-compliant.
///
/// This sanitizer gets rid of that: every run of non-tag lines sitting directly in front of a
/// `:20:` line is removed. In addition, unless the very last tag in the input is a tag 86
/// (which may legitimately span multiple lines), all lines following the last tag are removed
/// as well.
///
/// The returned string always uses `\r\n` line endings, and every kept line (including the
/// last one) is terminated by `\r\n`.
///
/// NOTE(review): if no line at all parses as a field, the first line is kept and every
/// other line is dropped. This mirrors the long-standing behaviour; confirm whether it is
/// intended.
pub fn strip_stuff_between_messages(s: &str) -> String {
    let lines: Vec<&str> = s.lines().collect();

    // Per-line flags indexed by line number. Using flags instead of index lists keeps every
    // membership check O(1), so the whole function is linear in the number of lines.
    let mut is_tag_line = vec![false; lines.len()];
    let mut delete_line = vec![false; lines.len()];

    let mut lines_with_tag_20 = vec![];
    let mut last_tag_line = None;
    let mut last_tag = "20";

    // Do one pass to classify each line as tag line / non-tag line and remember where the
    // tag 20 lines are.
    for (i, line) in lines.iter().copied().enumerate() {
        if let Ok(mut parsed) = MT940Parser::parse(Rule::field, line) {
            // Drill down from the parsed field to the text of its tag number.
            // Assumes the `field` rule nests as field -> tag -> tag number, so none of these
            // `unwrap`s can fail after a successful parse — TODO confirm against the grammar.
            last_tag = parsed
                .next()
                .unwrap()
                .into_inner()
                .next()
                .unwrap()
                .into_inner()
                .next()
                .unwrap()
                .as_str();
            if last_tag == "20" {
                lines_with_tag_20.push(i);
            }
            is_tag_line[i] = true;
            last_tag_line = Some(i);
        }
    }

    // Do a second pass to figure out which lines we don't want: from each tag 20 line, travel
    // upwards and mark lines for deletion until hitting another tag line or the start of the
    // input. The runs visited for different tag 20 lines never overlap.
    for tag_20_index in lines_with_tag_20 {
        for i in (0..tag_20_index).rev() {
            if is_tag_line[i] {
                break;
            }
            delete_line[i] = true;
        }
    }

    // There is a special case to handle for the end of the file:
    // If the very last tag is a tag 86 then we'll allow any non-tag lines after it towards the end
    // of the file. However, if the last tag is a non-86-tag then we'll remove any additional lines
    // after the last tag.
    if last_tag != "86" {
        let first_trailing_line = last_tag_line.unwrap_or(0) + 1;
        for flag in delete_line.iter_mut().skip(first_trailing_line) {
            *flag = true;
        }
    }

    // Finally copy only the wanted lines from the input to the output, terminating each
    // of them with CRLF.
    let mut output = String::with_capacity(s.len() + 2);
    for (line, &deleted) in lines.iter().zip(&delete_line) {
        if !deleted {
            output.push_str(line);
            output.push_str("\r\n");
        }
    }
    output
}

/// Remove excess lines on tag 86 statements beyond the 6 allowed.
///
/// A tag 86 block starts at a line beginning with `:86:` and continues until the next line
/// that begins with `:` (or the end of the input). The `:86:` line itself plus its first five
/// continuation lines are kept; any further continuation lines of that block are dropped.
/// Lines outside of tag 86 blocks are never removed.
///
/// The returned string always uses `\r\n` line endings, and every kept line (including the
/// last one) is terminated by `\r\n`. An empty input yields an empty output.
///
/// Note that you potentially lose information with this sanitizer.
pub fn strip_excess_tag86_lines(input: &str) -> String {
    // Continuation lines allowed after the `:86:` line itself (6 lines in total).
    const MAX_CONTINUATION_LINES: usize = 5;

    let mut output = String::with_capacity(input.len() + 2);
    // `Some(n)` while inside a tag 86 block, with `n` continuation lines seen so far;
    // `None` while outside of one.
    let mut continuation_lines_seen: Option<usize> = None;

    // A single pass is enough: we only need to know how deep into the current tag 86 block
    // we are, so there is no need to rescan the input for every `:86:` line.
    for line in input.lines() {
        if line.starts_with(":86:") {
            continuation_lines_seen = Some(0);
        } else if line.starts_with(':') {
            // Any other line starting with a colon ends the current tag 86 block.
            continuation_lines_seen = None;
        } else if let Some(seen) = continuation_lines_seen.as_mut() {
            *seen += 1;
            if *seen > MAX_CONTINUATION_LINES {
                // Excess continuation line: drop it.
                continue;
            }
        }

        output.push_str(line);
        output.push_str("\r\n");
    }

    output
}

// Unit tests for the sanitizers in this module.
//
// The multi-line fixtures below rely on `\` line continuations inside the string literals:
// the line break and the leading whitespace of the following source line are not part of the
// string, so the differing indentation of `input` and `expected` does not matter.
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;
    use proptest::{prop_assert, proptest};
    use rstest::rstest;

    use super::*;

    proptest! {
        // Property: whatever `to_swift_charset` returns for a generated input must be accepted
        // by the `swift_chars` grammar rule.
        #[test]
        fn to_swift_charset_no_parsing_failure_after_conversion(input in r".+") {
            let result = to_swift_charset(&input);
            let parsed = MT940Parser::parse(Rule::swift_chars, &result);
            prop_assert!(parsed.is_ok());
        }
    }

    /// A whole sentence with accented characters is transliterated to plain ASCII, with 'ß'
    /// expanding to two characters.
    #[test]
    fn to_swift_charset_sanitize_sentence() {
        let input = "hällö waß íst lös";
        let result = to_swift_charset(input);
        let expected = "hallo wass ist los";
        assert_eq!(result, expected);
    }

    // Single-character conversions: accented letters map to their ASCII base letter(s), while
    // '=' and '!' are expected to be replaced by a dot.
    #[rstest(
        input,
        expected,
        case("ä", "a"),
        case("ö", "o"),
        case("ú", "u"),
        case("é", "e"),
        case("å", "a"),
        case("á", "a"),
        case("ß", "ss"),
        case("ú", "u"),
        case("ó", "o"),
        case("í", "i"),
        case("ë", "e"),
        case("=", "."),
        case("!", ".")
    )]
    fn to_swift_charset_special_char_conversions(input: &str, expected: &str) {
        let result = to_swift_charset(input);
        assert_eq!(result, expected);
    }

    /// Non-tag lines directly in front of a `:20:` line are removed, as are the lines after
    /// the final tag (a `:64:` here), while the multi-line content of the `:86:` tag in the
    /// middle of a message is kept.
    #[test]
    fn strip_stuff_between_messages_success() {
        let input = "\
            :86:asdasdads\r\n\
            ------\r\n\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            here is ok\r\n\
            :64:end of message\r\n\
            stuff between messages\r\n\
            should be removed\r\n\
            :20:aaaaa\r\n\
            :64:some more\r\n\
            ö»»«»«äää\r\n\
            :20:lolab\r\n\
            :86:zzzz\r\n\
            :64:asda\r\n\
            --\r\n\
        ";
        let expected = "\
                        :86:asdasdads\r\n\
                        :20:vvvvv\r\n\
                        :86:hello\r\n\
                        multi line string\r\n\
                        here is ok\r\n\
                        :64:end of message\r\n\
                        :20:aaaaa\r\n\
                        :64:some more\r\n\
                        :20:lolab\r\n\
                        :86:zzzz\r\n\
                        :64:asda\r\n\
                        ";
        let result = strip_stuff_between_messages(input);
        assert_eq!(result, expected);
    }

    /// Last lines in the file will be stripped if last tag is not tag 86.
    /// Tag 86 is a multiline tag and can validly be placed at the end of a message.
    /// Here the last tag is a tag 86, so the trailing `--` line is expected to survive.
    #[test]
    fn strip_stuff_between_messages_last_is_86() {
        let input = "\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            here is ok\r\n\
            --\r\n\
        ";
        let expected = "\
                        :20:vvvvv\r\n\
                        :86:hello\r\n\
                        multi line string\r\n\
                        here is ok\r\n\
                        --\r\n\
                        ";
        let result = strip_stuff_between_messages(input);
        assert_eq!(result, expected);
    }

    /// Tag 86 blocks keep the `:86:` line plus five continuation lines; anything beyond that
    /// is dropped. Blocks that are short enough stay untouched, and continuation lines that
    /// merely contain colons (like `20:10:43`) are not treated as the start of a new tag.
    #[test]
    fn excess_tag86_are_stripped() {
        let input = "\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            here is ok even with date that looks like a tag 20:10:43\r\n\
            but not when\r\n\
            it is way too many\r\n\
            lines\r\n\
            in fact i shouldnt be here\r\n\
            and i shouldnt either\r\n\
            :62F:C123EUR321,98\r\n\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            but not many lines\r\n\
            :62F:C123EUR321,98\r\n\
            :20:vvvvv\r\n\
            :86:hi there\r\n\
            a very multi lined string\r\n\
            here is ok even with date that looks like a tag 20:86:43\r\n\
            but not when\r\n\
            it is way too many\r\n\
            lines\r\n\
            in fact i shouldnt be here\r\n\
            and i shouldnt either\r\n\
            and i certainly aint supposed to be here as well\r\n\
            :62F:C321EUR123,98\r\n\
        ";
        let expected = "\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            here is ok even with date that looks like a tag 20:10:43\r\n\
            but not when\r\n\
            it is way too many\r\n\
            lines\r\n\
            :62F:C123EUR321,98\r\n\
            :20:vvvvv\r\n\
            :86:hello\r\n\
            multi line string\r\n\
            but not many lines\r\n\
            :62F:C123EUR321,98\r\n\
            :20:vvvvv\r\n\
            :86:hi there\r\n\
            a very multi lined string\r\n\
            here is ok even with date that looks like a tag 20:86:43\r\n\
            but not when\r\n\
            it is way too many\r\n\
            lines\r\n\
            :62F:C321EUR123,98\r\n\
        ";
        let result = strip_excess_tag86_lines(input);
        assert_eq!(result, expected);
    }
}