datafu/lib.rs
1/*
2 * Datafu - Rust library for extracting data from object graphs.
3 * Copyright (C) 2021 Soni L.
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Affero General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Affero General Public License for more details.
14 *
15 * You should have received a copy of the GNU Affero General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 */
18#![warn(rust_2018_idioms)]
19#![cfg_attr(not(feature = "stable"), feature(label_break_value))]
20
21//! Datafu is a regex-inspired query language. It was primarily
22//! designed for processing object trees parsed from configuration files, but
23//! can also be used with JSON APIs, and even XML.
24//!
//! # Language Reference
26//!
27//! Datafu expressions have the ability to iterate, index, validate and filter
28//! data structures, through the use of the syntax elements below.
29//!
30//! ## Syntax Elements of Datafu Expressions
31//!
32//! An arrow is `->` and indicates indexing/iteration. Whether indexing or
33//! iteration is used is defined by the elements that follow, with iteration
34//! being used by default.
35//!
36//! A variable is a sequence of alphanumeric characters, not starting with
37//! a digit. A `(key, value)` tuple containing the respective matched
38//! element will be identified by this name in the results map.
39//!
40//! A literal is a sequence of characters delimited by `'`, optionally
41//! followed by `?`, with `%` as the escape character, and defines a
42//! string-keyed indexing operation. A literal can contain any character,
43//! except unescaped `%` or `'` symbols, which must be escaped as
44//! `%%` and `%'`, respectively. The sequence of characters defined by
45//! a literal is used as the string object in the indexing operation.
46//!
47//! A parameter is `$`, optionally followed by `?`, followed by a
48//! sequence of alphanumeric characters, not starting with a digit, and
49//! defines an object-keyed indexing operation. The sequence of characters
50//! defined by a parameter is used to retrieve, from the pattern's
51//! definitions, the object to be used in the indexing operation.
52//!
53//! A regex is a sequence of characters delimited by `/`, optionally
54//! followed by `?`, with `%` as the escape character. A regex can
55//! contain any character, except unescaped `%` or `/` symbols, which
56//! must be escaped as `%%` and `%/`, respectively. The sequence of
57//! characters defined by a regex is passed to the `regex` crate, which
58//! may apply further restrictions on the characters used, and is used to
59//! accept the respective keys processed by the iterator.
60//!
//! A predicate is `:`, optionally followed by `?`, followed by a
//! `$` and a sequence of alphanumeric characters, not starting with a
63//! digit, and is used to accept values to be processed based on an
64//! external [`Predicate`].
65//!
66//! A key match is a datafu expression (including, but not limited to, the
67//! empty datafu expression) enclosed within `[` and `]`, optionally
68//! prefixed with one or more predicates, and applies the enclosed
69//! predicates and datafu expression to the key (or index) being processed.
70//! A key match enables additional validation of keys and/or extraction of
71//! values from keys, and accepts a key if and only if the enclosed
72//! predicates accept the key and the enclosed expression matches the key.
73//!
74//! A subvalue is a datafu expression (including, but not limited to, the
75//! empty datafu expression) enclosed within `(` and `)`, and applies
76//! the enclosed datafu expression to the value (or index) being processed.
77//! A subvalue enables the ability to match multiple values on the same
//! object, and accepts a value if and only if the enclosed expression
79//! matches the value. A subvalue can be made optional by the presence of
80//! a `?` after the subvalue - in case of no match, it will just omit
81//! the relevant keys in the result. Optional subvalues are unrelated to
82//! non-validating syntax elements (see below), they just use the same
83//! syntax.
84//!
85//! Some syntax elements can be validating or non-validating. Validating
86//! syntax elements will return a [`errors::MatchError::ValidationError`]
87//! whenever a non-accepted element is encountered, whereas non-validating
88//! ones will skip them. Whether an element is validating is determined by
89//! the absence of an optional `?` in the documented position. Note that
90//! it is possible for a validating syntax element to still yield results
91//! before returning a [`errors::MatchError::ValidationError`], so one
92//! needs to be careful when writing code where such behaviour could
93//! result in a security vulnerability.
94//!
95//! The empty pattern matches anything, but only does so once.
96//!
97//! ## Syntax of Datafu Expressions
98//!
99//! Datafu Expressions follow the given syntax, in (pseudo-)extended BNF:
100//!
101//! ```text
102//! expression ::= {arrow tag} {subvalue}
103//! tag ::= identifier [arg] {predicate} | arg {predicate}
104//! arg ::= parameter | literal | regex | keymatch
105//!
106//! arrow ::= '->'
107//! keymatch ::= '[' {predicate} expression ']'
108//! subvalue ::= '(' {predicate} expression ')' ['?']
109//! ```
110//!
111//! For a description of the terminals "parameter", "literal", "regex" and
112//! "predicate", see "Syntax Elements of Datafu Expressions" above.
113//!
114//! # Examples
115//!
116//! <!-- TODO -->
117
118extern crate regex;
119
120#[cfg(test)]
121extern crate proptest;
122
123pub mod errors;
124mod parser;
125mod pattern;
126mod vm;
127
128pub use pattern::Pattern;
129
130pub use vm::Matcher;
131
// TODO replace with GATs
/// Either a borrow of `T`, a borrowed string slice, or an owned `U`.
///
/// This type is only a stopgap until Rust has GATs.
#[derive(Debug)]
pub enum RefOwn<'b, T, U>
where
    T: ?Sized,
{
    /// A reference to a `T` that is valid for `'b`.
    Ref(&'b T),
    /// A string slice that is valid for `'b`.
    Str(&'b str),
    /// A `U` held by value.
    Own(U),
}
145
146impl<'b, T, U> PartialEq for RefOwn<'b, T, U>
147where
148 T: ?Sized + PartialEq<T> + PartialEq<U> + PartialEq<str>,
149 U: PartialEq<T> + PartialEq<U> + PartialEq<str>,
150 str: PartialEq<T> + PartialEq<U> + PartialEq<str>
151{
152 fn eq(&self, other: &Self) -> bool {
153 match (self, other) {
154 (RefOwn::Ref(l), RefOwn::Ref(r)) => l.eq(r),
155 (RefOwn::Own(l), RefOwn::Own(r)) => l.eq(r),
156 (RefOwn::Str(l), RefOwn::Str(r)) => l.eq(r),
157 (RefOwn::Ref(l), RefOwn::Own(r)) => PartialEq::eq(*l, r),
158 (RefOwn::Own(l), RefOwn::Str(r)) => PartialEq::eq(l, *r),
159 (RefOwn::Str(l), RefOwn::Ref(r)) => l.eq(r),
160 (RefOwn::Ref(l), RefOwn::Str(r)) => l.eq(r),
161 (RefOwn::Own(l), RefOwn::Ref(r)) => PartialEq::eq(l, *r),
162 (RefOwn::Str(l), RefOwn::Own(r)) => PartialEq::eq(*l, r),
163 }
164 }
165}
166
167impl<'b, T: ?Sized, U: Copy> Copy for RefOwn<'b, T, U> {
168}
169
170impl<'b, T: ?Sized, U: Clone> Clone for RefOwn<'b, T, U> {
171 fn clone(&self) -> Self {
172 match self {
173 RefOwn::Ref(r) => RefOwn::Ref(r),
174 RefOwn::Str(r) => RefOwn::Str(r),
175 RefOwn::Own(v) => RefOwn::Own(v.clone()),
176 }
177 }
178}
179
180/// A tuple representing a key-value pair.
181pub type KVPair<'b, T> = (RefOwn<'b, <T as PatternTypes>::Ref, <T as PatternTypes>::Own>, RefOwn<'b, <T as PatternTypes>::Ref, <T as PatternTypes>::Own>);
182
183impl<'b, T, U> From<&'b T> for RefOwn<'b, T, U> {
184 fn from(x: &'b T) -> RefOwn<'b, T, U> {
185 RefOwn::Ref(x)
186 }
187}
188
189// TODO investigate if this should be PatternTypes: Default
190/// Defines the types and operations used for matching.
191pub trait PatternTypes {
192 /// The borrowed type.
193 type Ref: ?Sized;
194
195 // TODO replace with GATs.
196 // TODO potentially relax with Clone?
197 /// The owned type.
198 type Own: Copy + 'static;
199
200 /// Returns an iterator over key-value pairs contained within an item, or
201 /// None if this operation is unsupported for the given value.
202 fn pairs<'b>(
203 item: RefOwn<'b, Self::Ref, Self::Own>
204 ) -> Option<Box<dyn Iterator<Item=KVPair<'b, Self>> + 'b>>;
205
206 /// Returns an optional key-value pair keyed by the given key, or None if
207 /// this operation is unsupported for the given value.
208 fn get<'a, 'b>(
209 item: RefOwn<'b, Self::Ref, Self::Own>,
210 key: RefOwn<'a, Self::Ref, Self::Own>
211 ) -> Option<Option<KVPair<'b, Self>>>;
212
213 // TODO replace with GATs + newtypes
214 /// Returns whether two keys/values are the same/equivalent. This must provide
215 /// the same guarantees as PartialEq. In fact, this is a replacement for
216 /// PartialEq for cases where it's not possible to just use PartialEq.
217 fn matches(
218 left: RefOwn<'_, Self::Ref, Self::Own>,
219 right: RefOwn<'_, Self::Ref, Self::Own>
220 ) -> bool;
221
222 /// Returns the value as an &str.
223 fn as_str<'b>(
224 value: RefOwn<'b, Self::Ref, Self::Own>
225 ) -> Option<&'b str>;
226}
227
228/// A predicate for keys and values.
229pub type Predicate<T> = dyn (Fn(RefOwn<'_, <T as PatternTypes>::Ref, <T as PatternTypes>::Own>) -> bool) + Send + Sync;