tremor_kv/
lib.rs

1// Copyright 2020-2021, The Tremor Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// KV parsing
16//
17// Parses a string into a map. It is possible to split based on different characters that represent
18// either field or key value boundaries.
19//
20// A good part of the logstash functionality will be handled outside of this function and in a
21// generic way in tremor script.
22//
23// Features (in relation to LS):
24//
25// | Setting                | Translation                                             | Supported |
26// |------------------------|---------------------------------------------------------|-----------|
27// | allow_duplicate_values | not supported, since we deal with JSON maps             | No        |
28// | default_keys           | should be handled in TS (via assignment)                | TS        |
// | exclude_keys           | should be handled in TS (via delete_keys?)              | TS        |
30// | field_split            | supported, array of strings                             | Yes       |
31// | field_split_pattern    | not supported                                           | No        |
32// | include_brackets       | should be handled in TS (via map + dissect?)            | TS        |
33// | include_keys           | should be handled in TS (via select)                    | TS        |
34// | prefix                 | should be handled in TS (via map + string::format)      | TS        |
35// | recursive              | not supported                                           | No        |
36// | remove_char_key        | should be handled in TS (via map + re::replace)         | TS        |
37// | remove_char_value      | should be handled in TS (via map + re::replace)         | TS        |
38// | source                 | handled in TS at call time                              | TS        |
39// | target                 | handled in TS at return time                            | TS        |
40// | tag_on_failure         | handled in TS at return time                            | TS        |
41// | tag_on_timeout         | currently not supported                                 | No        |
42// | timeout_millis         | currently not supported                                 | No        |
43// | transform_key          | should be handled in TS (via map + ?)                   | TS        |
44// | transform_value        | should be handled in TS (via map + ?)                   | TS        |
45// | trim_key               | should be handled in TS (via map + ?)                   | TS        |
46// | trim_value             | should be handled in TS (via map + ?)                   | TS        |
47// | value_split            | supported, array of strings                             | Yes       |
48// | value_split_pattern    | not supported                                           | No        |
49// | whitespace             | we always run in 'lenient mode' as is the default of LS | No        |
50#![deny(warnings)]
51#![recursion_limit = "1024"]
52#![deny(
53    clippy::all,
54    clippy::unwrap_used,
55    clippy::unnecessary_unwrap,
56    clippy::pedantic
57)]
58#![allow(clippy::must_use_candidate)]
59
60use serde::{Deserialize, Serialize};
61use simd_json::prelude::{MutableObject, *};
62use std::fmt;
63
/// Errors that can occur while compiling a pattern.
#[derive(Debug, Eq, PartialEq)]
pub enum Error {
    /// The pattern is malformed; the payload is the position inside the
    /// pattern at which the problem was detected.
    InvalidPattern(usize),
    /// The contained separator clashes with another configured separator.
    DoubleSeperator(String),
    /// A backslash was followed by the contained, unsupported character.
    InvalidEscape(char),
    /// A backslash was the last character of a separator.
    UnterminatedEscape,
}
71impl fmt::Display for Error {
72    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
73        match self {
74            Self::InvalidPattern(p) => write!(f, "invalid pattern at character {p}"),
75            Self::DoubleSeperator(s) => {
76                write!(
77                    f,
78                    "The seperator '{s}' is used for both key value seperation as well as pair seperation."
79                )
80            }
81            Self::InvalidEscape(s) => write!(f, "Invalid escape sequence \\'{s}' is not valid."),
82            Self::UnterminatedEscape => write!(
83                f,
84                "Unterminated escape at the end of line or of a delimiter %{{ can't be escaped"
85            ),
86        }
87    }
88}
89
90impl std::error::Error for Error {}
91
92#[derive(PartialEq, Debug, Clone, Serialize, Deserialize, Eq)]
93pub struct Pattern {
94    field_seperators: Vec<String>,
95    key_seperators: Vec<String>,
96}
97
98impl std::default::Default for Pattern {
99    fn default() -> Self {
100        Self {
101            field_seperators: vec![" ".to_string()],
102            key_seperators: vec![":".to_string()],
103        }
104    }
105}
106
107fn handle_escapes(s: &str) -> Result<String, Error> {
108    let mut res = String::with_capacity(s.len());
109    let mut cs = s.chars();
110    while let Some(c) = cs.next() {
111        match c {
112            '\\' => {
113                if let Some(c1) = cs.next() {
114                    match c1 {
115                        '\\' => res.push(c1),
116                        'n' => res.push('\n'),
117                        't' => res.push('\t'),
118                        'r' => res.push('\r'),
119                        other => return Err(Error::InvalidEscape(other)),
120                    }
121                } else {
122                    return Err(Error::UnterminatedEscape);
123                }
124            }
125            c => res.push(c),
126        }
127    }
128    Ok(res)
129}
130
131impl Pattern {
132    /// compiles a pattern
133    /// # Errors
134    /// fails if the pattern is invalid
135    pub fn compile(pattern: &str) -> Result<Self, Error> {
136        let mut field_seperators = Vec::new();
137        let mut key_seperators = Vec::new();
138        let mut i = 0;
139        loop {
140            if pattern[i..].starts_with("%{key}") {
141                i += 6;
142                if let Some(i1) = pattern[i..].find("%{val}") {
143                    if i1 != 0 {
144                        key_seperators.push(handle_escapes(&pattern[i..i + i1])?);
145                    }
146                    i += i1 + 6;
147                } else {
148                    return Err(Error::InvalidPattern(i));
149                }
150            } else if let Some(i1) = pattern[i..].find("%{key}") {
151                if i1 != 0 {
152                    field_seperators.push(handle_escapes(&pattern[i..i + i1])?);
153                }
154                i += i1;
155            } else if pattern[i..].is_empty() {
156                break;
157            } else {
158                field_seperators.push(handle_escapes(&pattern[i..])?);
159                break;
160            }
161        }
162        if field_seperators.is_empty() {
163            field_seperators.push(" ".to_string());
164        }
165        if key_seperators.is_empty() {
166            key_seperators.push(":".to_string());
167        }
168        field_seperators.sort();
169        key_seperators.sort();
170        field_seperators.dedup();
171        key_seperators.dedup();
172
173        for fs in &field_seperators {
174            if key_seperators.iter().any(|ks| ks.contains(fs)) {
175                return Err(Error::DoubleSeperator(fs.to_string()));
176            }
177
178            if field_seperators
179                .iter()
180                .any(|fs2| fs2 != fs && fs2.contains(fs))
181            {
182                return Err(Error::DoubleSeperator(fs.to_string()));
183            }
184        }
185
186        for ks in &key_seperators {
187            if field_seperators.iter().any(|fs| fs.contains(ks)) {
188                return Err(Error::DoubleSeperator(ks.to_string()));
189            }
190
191            if key_seperators
192                .iter()
193                .any(|ks2| ks2 != ks && ks2.contains(ks))
194            {
195                return Err(Error::DoubleSeperator(ks.to_string()));
196            }
197        }
198
199        Ok(Self {
200            field_seperators,
201            key_seperators,
202        })
203    }
204    /// Splits a string that represents KV pairs.
205    ///
206    /// * `input` - The input string
207    ///
208    /// Note: Fields that have on value are dropped.
209    pub fn run<'input, V>(&self, input: &'input str) -> Option<V>
210    where
211        V: ValueBuilder<'input> + MutableObject + 'input,
212        <V as MutableObject>::Key: std::hash::Hash + Eq + From<&'input str>,
213        <V as MutableObject>::Target: std::convert::From<&'input str>,
214    {
215        let mut r = V::object();
216        let mut empty = true;
217        for field in multi_split(input, &self.field_seperators) {
218            let kv: Vec<&str> = multi_split(field, &self.key_seperators);
219            if kv.len() == 2 {
220                empty = false;
221                r.insert(kv[0], kv[1]).ok()?;
222            }
223        }
224        if empty { None } else { Some(r) }
225    }
226}
227
/// Splits `input` at every occurrence of every separator.
///
/// The separators are applied one after another: the pieces produced by one
/// separator are split again by the next one. Empty pieces are kept, and
/// without any separator the result is just `input` itself.
fn multi_split<'input>(input: &'input str, seperators: &[String]) -> Vec<&'input str> {
    seperators.iter().fold(vec![input], |pieces, seperator| {
        pieces
            .into_iter()
            .flat_map(|piece| piece.split(seperator.as_str()))
            .collect()
    })
}
243
#[cfg(test)]
mod test {
    use super::*;
    use simd_json::BorrowedValue;
    use simd_json::borrowed::Object;

    // The explicit `%{key}:%{val}` pattern compiles to the default pattern.
    #[test]
    fn default() {
        let d = Pattern::default();
        let p = Pattern::compile("%{key}:%{val}").expect("compile");
        assert_eq!(d, p);
    }

    // Every separator is applied to the input.
    #[test]
    fn test_multisplit() {
        let seps = vec![String::from(" "), String::from(";")];
        let input = "this=is;a=test for:seperators";

        let i = multi_split(input, &seps);
        assert_eq!(i, vec!["this=is", "a=test", "for:seperators"]);
    }

    // Custom key/value separator with the default field separator.
    #[test]
    fn simple_split() {
        let kv = Pattern::compile("%{key}=%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this=is a=test").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // Text in front of `%{key}` is used as field separator.
    #[test]
    fn simple_split2() {
        let kv = Pattern::compile("&%{key}=%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this=is&a=test").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // An escaped newline can be used as field separator.
    #[test]
    fn newline_simple_() {
        let kv = Pattern::compile(r"\n%{key}=%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this=is\na=test").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // A pattern without `%{key}` is a field separator; `:` stays the key/value separator.
    #[test]
    fn simple_split3() {
        let kv = Pattern::compile("&").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this:is&a:test").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // `%{` on its own is a valid key/value separator.
    #[test]
    fn simple_split4() {
        let kv = Pattern::compile("%{key}%{%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this%{is a%{test").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // A second `%{key}` before `%{val}` is taken literally as key/value separator.
    #[test]
    fn simple_split5() {
        let kv = Pattern::compile("%{key}%{key}%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv
            .run("this%{key}is a%{key}test")
            .expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 2);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
    }

    // Every error variant that `compile` can return; printing exercises `Display`.
    #[test]
    fn invalid_pattern() {
        let kv = Pattern::compile("%{key} ");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::InvalidPattern(6));
        println!("{e}");

        let kv = Pattern::compile("%{key} %{val} \\8");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::InvalidEscape('8'));
        println!("{e}");

        let kv = Pattern::compile("%{key} %{val} ");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::DoubleSeperator(String::from(" ")));
        println!("{e}");

        let kv = Pattern::compile("%{key}=%{val} %{key}==%{val}");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::DoubleSeperator(String::from("=")));
        println!("{e}");

        let kv = Pattern::compile("%{key}=%{val}; %{key}:%{val} %{key}:%{val}");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::DoubleSeperator(String::from(" ")));
        println!("{e}");

        let kv = Pattern::compile("%{key}=%{val};%{key}:%{val} :%{key}:%{val}");
        let e = kv.expect_err("no error");
        assert_eq!(e, Error::DoubleSeperator(String::from(":")));
        println!("{e}");
    }

    // A single key/value pair yields an object with one entry.
    #[test]
    fn one_field() {
        let kv = Pattern::compile("%{key}=%{val}").expect("Failed to build pattern");
        let r: BorrowedValue = kv.run("this=is").expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 1);
        assert_eq!(r["this"], "is");
    }

    // Input without any key/value pair yields `None`.
    #[test]
    fn no_split() {
        let kv = Pattern::compile("%{key}=%{val}").expect("Failed to build pattern");
        let r: Option<BorrowedValue> = kv.run("this is a test");
        assert!(r.is_none());
    }

    // Several field and key/value separators in one pattern.
    #[test]
    fn different_seperators() {
        let kv = Pattern::compile("%{key}=%{val};%{key}:%{val} %{key}:%{val}")
            .expect("Failed to build pattern");
        let r: BorrowedValue = kv
            .run("this=is;a=test for:seperators")
            .expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 3);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
        assert_eq!(r["for"], "seperators");
    }

    // Same separators as above, with the `;` field separator trailing the pattern.
    #[test]
    fn different_seperators2() {
        let kv = Pattern::compile("%{key}=%{val}%{key}:%{val} %{key}:%{val};")
            .expect("Failed to build pattern");
        let r: BorrowedValue = kv
            .run("this=is;a=test for:seperators")
            .expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 3);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
        assert_eq!(r["for"], "seperators");
    }

    // NOTE(review): despite its name this compiles a valid pattern and uses the
    // same pattern and input as `different_seperators`.
    #[test]
    fn invalid_pattern2() {
        let kv = Pattern::compile("%{key}=%{val};%{key}:%{val} %{key}:%{val}")
            .expect("Failed to build pattern");
        let r: BorrowedValue = kv
            .run("this=is;a=test for:seperators")
            .expect("Failed to split input");
        assert_eq!(r.as_object().map(Object::len).unwrap_or_default(), 3);
        assert_eq!(r["this"], "is");
        assert_eq!(r["a"], "test");
        assert_eq!(r["for"], "seperators");
    }

    // A separator ending in a lone backslash is rejected.
    #[test]
    fn unfinished_escape_in_pattern() {
        let res = Pattern::compile(r"%{key}=%{val}; \\\r\n\t\");
        assert_eq!(Err(Error::UnterminatedEscape), res);
        if let Err(e) = res {
            assert_eq!(
                "Unterminated escape at the end of line or of a delimiter %{ can't be escaped",
                &e.to_string()
            );
        }
    }
}