goat_cli/utils/
expression.rs

1use crate::utils::tax_ranks::TaxRanks;
2use crate::utils::utils::{did_you_mean, switch_string_to_url_encoding};
3
4use anyhow::{bail, ensure, Result};
5use regex::{CaptureMatches, Captures, Regex};
6use std::{collections::BTreeMap, fmt};
7use tabled::{object::Rows, Panel, Width, Modify, Table, Tabled};
8
9/// Serialize GoaT variables into their types.
10///
11/// See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html)
12/// for more details.
13#[derive(Tabled)]
14pub enum TypeOf<'a> {
15    /// Signed 64 bit int.
16    Long,
17    /// Signed 16 bit int.
18    Short,
19    /// Float with one decimal place.
20    OneDP,
21    /// Float with two decimal places.
22    TwoDP,
23    /// Signed 32 bit int.
24    Integer,
25    /// A date.
26    Date,
27    /// Half precision 16 bit float.
28    HalfFloat,
29    /// A variable which itself is an enumeration.
30    Keyword(Vec<&'a str>),
31    /// None to catch parsing errors
32    None,
33}
34
35impl<'a> TypeOf<'a> {
36    /// Check the values input by a user, so `goat-cli` displays meaningful help.
37    fn check(&self, other: &str, variable: &str) -> Result<()> {
38        // we will have to parse the `other` conditionally on what the
39        // `TypeOf` is.
40        match self {
41            TypeOf::Long => match other.parse::<i64>() {
42                Ok(_) => (),
43                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass an integer as a value.")),
44            },
45            TypeOf::Short => match other.parse::<i16>() {
46                Ok(_) => (),
47                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass an integer as a value.")),
48            },
49            TypeOf::OneDP => match other.parse::<f32>() {
50                Ok(_) => (),
51                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass a float as a value.")),
52            },
53            TypeOf::TwoDP => match other.parse::<f32>() {
54                Ok(_) => (),
55                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass a float as a value.")),
56            },
57            TypeOf::Integer => match other.parse::<i32>() {
58                Ok(_) => (),
59                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass an integer as a value.")),
60            },
61            // dates should be in a specified format
62            // yyyy-mm-dd
63            TypeOf::Date => {
64                let tokens = other.split('-').collect::<Vec<_>>();
65                ensure!(
66                    tokens.len() == 1 || tokens.len() == 3,
67                    "Improperly formatted date. Please make sure date is in the format yyyy-mm-dd, or yyyy."
68                )
69            }
70            TypeOf::HalfFloat => match other.parse::<f32>() {
71                Ok(_) => (),
72                Err(_) => bail!(format!("For variable \"{variable}\" in the expression, an input error was found. Pass a float as a value.")),
73            },
74            // keywords handled elsewhere
75            TypeOf::Keyword(_) => (),
76            // None to catch errors.
77            TypeOf::None => (),
78        };
79        Ok(())
80    }
81}
82
83impl<'a> fmt::Display for TypeOf<'a> {
84    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
85        match self {
86            // do nothing with None at the moment.
87            TypeOf::None => write!(f, "Please don't use yet! This variable needs fixing."),
88            TypeOf::Long => write!(f, "!=, <, <=, =, ==, >, >="),
89            TypeOf::Short => write!(f, "!=, <, <=, =, ==, >, >="),
90            TypeOf::OneDP => write!(f, "!=, <, <=, =, ==, >, >="),
91            TypeOf::TwoDP => write!(f, "!=, <, <=, =, ==, >, >="),
92            TypeOf::Integer => write!(f, "!=, <, <=, =, ==, >, >="),
93            TypeOf::Date => write!(f, "!=, <, <=, =, ==, >, >="),
94            TypeOf::HalfFloat => write!(f, "!=, <, <=, =, ==, >, >="),
95            TypeOf::Keyword(k) => match k[0] {
96                "" => write!(f, ""),
97                _ => write!(f, "== {}", k.join(", ")),
98            },
99        }
100    }
101}
102
/// Kind of an option alias. Does a particular variable have a function
/// associated with it? Usually min/max.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Function<'a> {
    /// The variable has no associated functions.
    None,
    /// The names of the functions which can wrap the variable.
    Some(Vec<&'a str>),
}

/// Renders the function names separated by `", "`; [`Function::None`]
/// renders as an empty string.
impl<'a> fmt::Display for Function<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Function::None => Ok(()),
            Function::Some(names) => f.write_str(&names.join(", ")),
        }
    }
}
119
120/// The GoaT variable of interest.
121#[derive(Tabled)]
122pub struct Variable<'a> {
123    #[tabled(rename = "Display Name")]
124    pub display_name: &'a str,
125    #[tabled(rename = "Operators/Keywords")]
126    pub type_of: TypeOf<'a>,
127    #[tabled(rename = "Function(s)")]
128    pub functions: Function<'a>,
129}
130
131/// The column headers for `goat-cli search --print-expression`.
132#[derive(Tabled)]
133struct ColHeader(#[tabled(rename = "Expression Name")] &'static str);
134
135/// Print the table of GoaT variable data.
136pub fn print_variable_data(data: &BTreeMap<&'static str, Variable<'static>>) {
137    // for some space
138    println!();
139    // map the header to a tuple combination
140    // see https://github.com/zhiburt/tabled/blob/master/README.md
141    let table_data = data
142        .iter()
143        .map(|(e, f)| (ColHeader(e), f))
144        .collect::<Vec<(ColHeader, &Variable)>>();
145    // add taxon ranks at end...
146    let footer_data = TaxRanks::init();
147
148    let table_string = Table::new(&table_data)
149        .with(Panel::footer(format!("NCBI taxon ranks:\n\n{}", footer_data)))
150        .with(
151            Modify::new(Rows::new(1..table_data.len() - 1))
152                .with(Width::wrap(30).keep_words()),
153        )
154        // 4 rows
155        .with(
156            Modify::new(Rows::new(table_data.len()..))
157                .with(Width::wrap(30 * 4).keep_words()),
158        )
159        .to_string();
160
161    println!("{}", table_string);
162}
163
/// The CLI expression which needs to be parsed.
#[derive(Debug, Clone)]
pub struct CLIexpression<'a> {
    /// The raw expression string exactly as it was passed in.
    pub inner: &'a str,
    /// Length of `inner` in bytes; these queries can't be crazy long.
    pub length: usize,
    /// The pieces of `inner` after splitting on the `AND` keyword. Empty
    /// until the expression has been split.
    pub expression: Vec<&'a str>,
}
170
171impl<'a> CLIexpression<'a> {
172    /// Constructor for [`CLIexpression`].
173    pub fn new(string: &'a str) -> Self {
174        Self {
175            inner: string,
176            length: string.len(),
177            expression: Vec::new(),
178        }
179    }
180
181    /// The initial split on the keyword `AND`.
182    fn split(&self) -> Self {
183        let mut res_vec = Vec::new();
184        // commands only accept AND? Rich!
185        let re = Regex::new("AND").unwrap();
186        let splitter = SplitCaptures::new(&re, self.inner);
187        for state in splitter {
188            let el = match state {
189                SplitState::Unmatched(s) => s,
190                SplitState::Captured(s) => s.get(0).map_or("", |m| m.as_str()),
191            };
192            res_vec.push(el);
193        }
194        Self {
195            inner: self.inner,
196            length: self.length,
197            expression: res_vec,
198        }
199    }
200
201    /// The main function which parses a [`CLIexpression`]. A bit of a
202    /// monster of a function. Might need cleaning up at some point.
203    pub fn parse(
204        &mut self,
205        reference_data: &BTreeMap<&'static str, Variable<'static>>,
206    ) -> Result<String> {
207        let expression_length_limit = 100;
208        if self.length > expression_length_limit {
209            bail!(
210                "The expression query provided is greater than {} chars.",
211                expression_length_limit
212            )
213        }
214        if self.inner.contains("&&") {
215            bail!("Use AND keyword, not && for expression queries.")
216        }
217        if self.inner.contains(" contains") {
218            bail!("Using the \"contains\" keyword is not yet supported.")
219        }
220        if self.inner.contains("||") || self.inner.contains("OR") {
221            bail!("OR (or ||) keyword is not supported.")
222        }
223        if self.inner.contains("tax_name")
224            || self.inner.contains("tax_tree")
225            || self.inner.contains("tax_lineage")
226        {
227            bail!("Set tax_name through -t <taxon_name>, tax_tree by -d flag, and tax_lineage by -l flag.")
228        }
229        let split_vec = &self.split();
230        let exp_vec = &split_vec.expression;
231
232        // split the expression vector into parts
233        let mut index = 0;
234        let exp_vec_len = exp_vec.len();
235        let mut expression = String::new();
236        // regular expression splitter
237        // precedence here matters
238        let re = Regex::new(r"!=|<=|<|==|=|>=|>").unwrap();
239        if !re.is_match(self.inner) {
240            bail!("No operators were found in the expression.")
241        }
242
243        // must always start with a space and AND
244        expression += "%20AND";
245        // vector of variables to check against
246        let var_vec_check = &reference_data
247            .iter()
248            .map(|(e, _)| *e)
249            .collect::<Vec<&str>>();
250        // we can also create another vector of variables
251        // with the appropriate max/min attached.
252        // TODO: this seems like a crazy way of doing this - any better ideas?
253        let var_vec_min_max_check = {
254            let mut collector = Vec::new();
255            for (goat_var, el) in reference_data {
256                match &el.functions {
257                    Function::None => (),
258                    Function::Some(f) => {
259                        for pos in f {
260                            let format_pos = format!("{}({})", pos, goat_var);
261                            collector.push(format_pos);
262                        }
263                    }
264                }
265            }
266            collector
267        };
268
269        // loop over the expression vector
270        // splitting into further vectors
271        // to evaluate each argument.
272        loop {
273            if index == exp_vec_len {
274                break;
275            }
276            // expected to be in format
277            // variable <operator> number/enum
278            let curr_el = exp_vec[index];
279
280            let mut curr_el_vec = Vec::new();
281            // split this on the operator
282            // do we need to check whether this operator actually exists?
283            // I can imagine that this will break down otherwise...
284            let splitter = SplitCaptures::new(&re, curr_el);
285
286            for state in splitter {
287                match state {
288                    SplitState::Unmatched(s) => {
289                        curr_el_vec.push(s);
290                    }
291                    SplitState::Captured(s) => {
292                        curr_el_vec.push(s.get(0).map_or("", |m| m.as_str()));
293                    }
294                };
295            }
296
297            // check this vector is length 3 or 1
298            ensure!(
299                    curr_el_vec.len() == 3 || curr_el_vec.len() == 1,
300                    "Split vector on single expression is invalid - length = {}. Are the input variables or operands correct?",
301                    curr_el_vec.len()
302                );
303            match curr_el_vec.len() {
304                3 => {
305                    // trim strings
306                    // replace rogue quotes (not sure why this is happening now, but was not before...)
307                    // manually escape these...
308                    let variable = &curr_el_vec[0].trim().replace('\"', "").replace('\'', "")[..];
309                    let operator = switch_string_to_url_encoding(curr_el_vec[1])?.trim();
310                    let value = &curr_el_vec[2].trim().replace('\"', "").replace('\'', "")[..];
311
312                    if !var_vec_check.contains(&variable)
313                        && !var_vec_min_max_check.contains(&variable.to_string())
314                    {
315                        // ew
316                        // just combining the min/max and normal variable vectors
317                        // into a single vector.
318                        let combined_checks = var_vec_check
319                            .iter()
320                            .map(|e| String::from(*e))
321                            .collect::<Vec<String>>()
322                            .iter()
323                            .chain(
324                                var_vec_min_max_check
325                                    .iter()
326                                    .map(String::from)
327                                    .collect::<Vec<String>>()
328                                    .iter(),
329                            )
330                            .map(String::from)
331                            .collect::<Vec<String>>();
332
333                        let var_vec_mean = did_you_mean(&combined_checks, variable);
334
335                        if let Some(value) = var_vec_mean {
336                            bail!(
337                                "In your expression (LHS) you typed \"{}\" - did you mean \"{}\"?",
338                                variable,
339                                value
340                            )
341                        }
342                    }
343
344                    // this panics with min/max.
345                    // if min/max present, extract within the parentheses.
346                    let keyword_enums = match var_vec_min_max_check.contains(&variable.to_string())
347                    {
348                        true => {
349                            // this means we have min/max
350                            let re = Regex::new(r"\((.*?)\)").unwrap();
351                            // we guarantee getting here with a variable, so unwrap is fine
352                            // the second unwrap is always guaranteed too?
353                            let extract_var =
354                                re.captures(variable).unwrap().get(1).unwrap().as_str();
355                            &reference_data.get(extract_var).unwrap().type_of
356                        }
357                        false => &reference_data.get(variable).unwrap().type_of,
358                    };
359
360                    // if there are parentheses - i.e. in min()/max() functions
361                    let url_encoded_variable = variable.replace('(', "%28");
362                    let url_encoded_variable = url_encoded_variable.replace(')', "%29");
363
364                    // if there are keywords, make sure they are a match
365                    match keyword_enums {
366                        TypeOf::Keyword(k) => {
367                            // split on commas here
368                            // and trim
369                            let value_split_commas = value
370                                .split(',')
371                                .map(|e| {
372                                    let trimmed = e.trim();
373                                    trimmed.replace('!', "")
374                                })
375                                .collect::<Vec<String>>();
376
377                            // now check our keyword enums
378                            for val in &value_split_commas {
379                                let possibilities =
380                                    k.iter().map(|e| String::from(*e)).collect::<Vec<_>>();
381                                let did_you_mean_str = did_you_mean(&possibilities, val);
382
383                                if let Some(value) = did_you_mean_str {
384                                    if value != *val {
385                                        bail!("In your expression (RHS) you typed \"{}\" - did you mean \"{}\"?", val, value)
386                                    }
387                                }
388                            }
389
390                            // now modify value_split_commas to parse parentheses
391                            let parsed_value_split_commas = value
392                                .split(',')
393                                .map(|e| {
394                                    // trim again but keep bool flags
395                                    let f = e.trim();
396                                    // janky but will do for now.
397                                    let f = f.replace('(', "%28");
398                                    let f = f.replace(')', "%29");
399                                    let f = f.replace(' ', "%20");
400                                    f.replace('!', "%21")
401                                })
402                                .collect::<Vec<String>>();
403                            // build expression
404                            expression += "%20";
405                            expression += &url_encoded_variable;
406                            // do operators need to be translated?
407                            expression += "%20";
408                            expression += operator;
409                            expression += "%20";
410                            expression += &parsed_value_split_commas.join("%2C");
411                            expression += "%20";
412                            // end of sub expression
413                            // assume there is another expression to follow
414                            expression += "AND%20"
415                        }
416                        t => {
417                            // here can we type check input
418                            TypeOf::check(t, value, variable)?;
419
420                            // build expression
421                            expression += "%20";
422                            expression += &url_encoded_variable;
423                            // do operators need to be translated?
424                            expression += "%20";
425                            expression += operator;
426                            expression += "%20";
427                            expression += value;
428                            expression += "%20";
429                            // end of sub expression
430                            // assume there is another expression to follow
431                            expression += "AND%20"
432                        }
433                    }
434                }
435                1 => (),
436                _ => unreachable!(),
437            }
438
439            index += 1;
440        }
441        // remove trailing AND%20
442        match expression.len() - 6 > 0 {
443            true => {
444                expression.drain(expression.len() - 6..);
445                Ok(expression)
446            }
447            false => {
448                bail!("Error in expression format. Expressions must be in the format:\n\t<variable> <operator> <value> AND ...")
449            }
450        }
451    }
452}
453
454/// Split a string and keep the delimiter.
455/// Thanks [`BurntSushi`](https://github.com/rust-lang/regex/issues/330)
456#[derive(Debug)]
457struct SplitCaptures<'r, 't> {
458    finder: CaptureMatches<'r, 't>,
459    text: &'t str,
460    last: usize,
461    caps: Option<Captures<'t>>,
462}
463
464impl<'r, 't> SplitCaptures<'r, 't> {
465    pub fn new(re: &'r Regex, text: &'t str) -> SplitCaptures<'r, 't> {
466        SplitCaptures {
467            finder: re.captures_iter(text),
468            text,
469            last: 0,
470            caps: None,
471        }
472    }
473}
474
475#[derive(Debug)]
476enum SplitState<'t> {
477    Unmatched(&'t str),
478    Captured(Captures<'t>),
479}
480
481impl<'r, 't> Iterator for SplitCaptures<'r, 't> {
482    type Item = SplitState<'t>;
483
484    fn next(&mut self) -> Option<SplitState<'t>> {
485        if let Some(caps) = self.caps.take() {
486            return Some(SplitState::Captured(caps));
487        }
488        match self.finder.next() {
489            None => {
490                if self.last >= self.text.len() {
491                    None
492                } else {
493                    let s = &self.text[self.last..];
494                    self.last = self.text.len();
495                    Some(SplitState::Unmatched(s))
496                }
497            }
498            Some(caps) => {
499                let m = caps.get(0).unwrap();
500                let unmatched = &self.text[self.last..m.start()];
501                self.last = m.end();
502                self.caps = Some(caps);
503                Some(SplitState::Unmatched(unmatched))
504            }
505        }
506    }
507}