goat_cli/utils/
utils.rs

1use std::{
2    fs::File,
3    io::{BufRead, BufReader},
4    path::{Path, PathBuf},
5};
6
7use crate::{
8    utils::expression,
9    utils::variable_data::{GOAT_ASSEMBLY_VARIABLE_DATA, GOAT_TAXON_VARIABLE_DATA},
10    IndexType, UPPER_CLI_FILE_LIMIT,
11};
12use anyhow::{bail, Context, Result};
13use rand::distributions::Alphanumeric;
14use rand::{thread_rng, Rng};
15
16/// Determine from the CLI matches how many URLs
17/// are needing to be generated, and return a
18/// vector of random character strings to use as
19/// unique identifiers.
20pub fn generate_unique_strings(
21    matches: &clap::ArgMatches,
22    index_type: IndexType,
23) -> Result<Vec<String>> {
24    let tax_name_op = matches.get_one::<String>("taxon");
25    let filename_op = matches.get_one::<PathBuf>("file");
26    // print expression table
27    // got to include this here, otherwise we error.
28    // reports don't include this.
29    let print_expression = matches.get_one::<bool>("print-expression");
30
31    if let Some(p) = print_expression {
32        if *p {
33            match index_type {
34                IndexType::Taxon => expression::print_variable_data(&*GOAT_TAXON_VARIABLE_DATA),
35                IndexType::Assembly => {
36                    expression::print_variable_data(&*GOAT_ASSEMBLY_VARIABLE_DATA)
37                }
38            }
39            std::process::exit(0);
40        }
41    }
42
43    let url_vector: Vec<String>;
44    // if -t use this
45    match tax_name_op {
46        Some(s) => {
47            // catch empty string hanging here.
48            if s.is_empty() {
49                bail!("Empty string found, please specify a taxon.");
50            }
51            url_vector = parse_comma_separated(s);
52        }
53        None => match filename_op {
54            Some(s) => {
55                url_vector = lines_from_file(s)?;
56                // check length of vector and bail if > 1000
57                if url_vector.len() > *UPPER_CLI_FILE_LIMIT {
58                    let limit_string = pretty_print_usize(*UPPER_CLI_FILE_LIMIT);
59                    bail!("Number of taxa specified cannot exceed {}.", limit_string)
60                }
61            }
62            None => bail!("One of -f (--file) or -t (--taxon) should be specified."),
63        },
64    }
65
66    let url_vector_len = url_vector.len();
67
68    let mut chars_vec = vec![];
69    for _ in 0..url_vector_len {
70        let mut rng = thread_rng();
71        let chars: String = (0..15).map(|_| rng.sample(Alphanumeric) as char).collect();
72        chars_vec.push(chars.clone());
73    }
74
75    Ok(chars_vec)
76}
77
78/// Read NCBI taxon ID's or binomial names of species,
79/// or higher order taxa from a file.
80pub fn lines_from_file(filename: impl AsRef<Path>) -> Result<Vec<String>> {
81    let file = File::open(&filename)
82        .with_context(|| format!("Could not open {:?}", filename.as_ref().as_os_str()))?;
83    let buf = BufReader::new(file);
84    let buf_res: Result<Vec<String>> = buf
85        .lines()
86        .map(|l| {
87            l.with_context(|| {
88                format!(
89                    "Error in mapping buf_lines from {:?}",
90                    filename.as_ref().as_os_str()
91                )
92            })
93        })
94        .collect();
95    buf_res
96}
97
98// taxids should be comma separated
99// remove whitespace from beginning and end of each element of the vec.
100// TODO: check structure of each element in vec.
101
/// Parse a comma separated string and return each of the elements
/// stripped of whitespace in a vector.
///
/// Each element has its surrounding whitespace trimmed first, and then every
/// single or double quote character removed, so that input such as
/// `-v"assembly_level"` (no space between the flag and its quoted value)
/// parses cleanly. The result is sorted and de-duplicated.
///
/// Empty elements are kept: `""` yields a vector holding one empty string,
/// and `"a,,b"` yields `["", "a", "b"]`.
pub fn parse_comma_separated(taxids: &str) -> Vec<String> {
    let mut elements: Vec<String> = taxids
        .split(',')
        .map(|element| {
            element
                // `trim` removes any amount of leading/trailing whitespace in
                // one step, without the byte-index arithmetic that used to
                // eat leading characters or panic on several leading spaces.
                .trim()
                .chars()
                .filter(|c| !matches!(c, '"' | '\''))
                .collect()
        })
        .collect();

    elements.sort_unstable();
    elements.dedup();
    elements
}
133
/// Build the list of taxon ranks that will become the rank headers of the
/// returned TSV file.
///
/// Ranks are ordered from `subspecies` up to `superkingdom`; the result starts
/// at `r` and runs to the end of that list. When `r` is not a known rank, a
/// vector containing a single empty string is returned.
pub fn get_rank_vector(r: &str) -> Vec<String> {
    const RANKS: [&str; 9] = [
        "subspecies",
        "species",
        "genus",
        "family",
        "order",
        "class",
        "phylum",
        "kingdom",
        "superkingdom",
    ];

    // Drop every rank below the requested one; nothing is left over when the
    // requested rank is unknown.
    let selected: Vec<String> = RANKS
        .iter()
        .skip_while(|rank| **rank != r)
        .map(|rank| rank.to_string())
        .collect();

    if selected.is_empty() {
        vec![String::new()]
    } else {
        selected
    }
}
154
155/// If multiple taxa are queried at once, headers will return for every new taxon.
156/// We can suppress this by storing the whole return as a string.
157pub fn format_tsv_output(awaited_fetches: Vec<Result<String, anyhow::Error>>) -> Result<()> {
158    // if there is a single element, return this.
159    // is there a way to get all the headers, and compare them...
160    let mut headers = Vec::new();
161    for el in &awaited_fetches {
162        let tsv = match el {
163            Ok(ref e) => e,
164            Err(e) => bail!("{}", e),
165        };
166        headers.push(tsv.split('\n').next());
167    }
168
169    // mainly a guard - but Rich I think fixed this so shouldn't need to be done.
170    let header = headers.iter().fold(headers[0], |acc, &item| {
171        let acc = acc?;
172        let item = item?;
173        if item.len() > acc.len() {
174            Some(item)
175        } else {
176            Some(acc)
177        }
178    });
179
180    match header {
181        Some(h) => println!("{}", h),
182        None => bail!("No header found."),
183    }
184
185    for el in awaited_fetches {
186        let tsv = match el {
187            Ok(ref e) => e,
188            Err(e) => bail!("{}", e),
189        };
190
191        let tsv_iter = tsv.split('\n');
192        for row in tsv_iter.skip(1) {
193            println!("{}", row)
194        }
195    }
196
197    Ok(())
198}
199
/// Return `s` with its first character converted to uppercase; the rest of
/// the string is left untouched and an empty input gives an empty string.
///
/// Thanks to [this](https://stackoverflow.com/questions/38406793/why-is-capitalizing-the-first-letter-of-a-string-so-convoluted-in-rust)
/// post on stack overflow.
pub fn some_kind_of_uppercase_first_letter(s: &str) -> String {
    let mut letters = s.chars();

    if let Some(initial) = letters.next() {
        // Uppercasing one character may produce several (e.g. `ß` -> `SS`).
        let mut capitalised: String = initial.to_uppercase().collect();
        capitalised.push_str(letters.as_str());
        capitalised
    } else {
        String::new()
    }
}
209
/// Format a [`usize`] with a comma between every group of three digits
/// (e.g. `1234567` becomes `1,234,567`). Used for the error message shown
/// when a query goes above the CLI limit.
///
/// Thanks to [`this`](https://stackoverflow.com/questions/26998485/is-it-possible-to-print-a-number-formatted-with-thousand-separator-in-rust)
/// post on stack overflow.
pub fn pretty_print_usize(i: usize) -> String {
    let digits = i.to_string();
    let total = digits.len();
    let mut grouped = String::with_capacity(total + total / 3);

    for (position, digit) in digits.chars().enumerate() {
        // A separator goes before a digit whenever the number of digits still
        // to be written (this one included) is a multiple of three.
        let remaining = total - position;
        if position != 0 && remaining % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    grouped
}
225
226/// A function to replace certain combinations of characters
227/// as their URL encoded variations. Not entirely sure if this is
228/// necessary.
229pub fn switch_string_to_url_encoding(string: &str) -> Result<&str> {
230    let res = match string {
231        // "!=" => "%21%3D",
232        "!=" => "!%3D",
233        // "<" => "%3C",
234        "<" => "%3C",
235        // "<=" => "%3C%3D",
236        "<=" => "<%3D",
237        "=" => "%3D",
238        "==" => "%3D%3D",
239        // ">" => "%3E",
240        ">" => "%3E",
241        // ">=" => "%3E%3D",
242        ">=" => ">%3D",
243        _ => bail!("Should not reach here."),
244    };
245    Ok(res)
246}
247
248/// Shamelessly poached from the [Nushell core code](https://github.com/nushell/nushell/blob/690ec9abfa994e6cf8b85ec38173ee5f0c91011c/crates/nu-protocol/src/shell_error.rs).
249/// Suggest the closest match to a string.
250pub fn did_you_mean(possibilities: &[String], tried: &str) -> Option<String> {
251    let mut possible_matches: Vec<_> = possibilities
252        .iter()
253        .map(|word| {
254            let edit_distance = levenshtein_distance(&word.to_lowercase(), &tried.to_lowercase());
255            (edit_distance, word.to_owned())
256        })
257        .collect();
258
259    possible_matches.sort();
260
261    if let Some((_, first)) = possible_matches.into_iter().next() {
262        Some(first)
263    } else {
264        None
265    }
266}
267
/// Compute the Levenshtein (edit) distance between two strings, counted in
/// `char`s rather than bytes.
///
/// Borrowed from [here](https://github.com/wooorm/levenshtein-rs).
fn levenshtein_distance(a: &str, b: &str) -> usize {
    // Shortcuts for the degenerate cases.
    if a == b {
        return 0;
    }

    let length_a = a.chars().count();
    let length_b = b.chars().count();

    if length_a == 0 {
        return length_b;
    }
    if length_b == 0 {
        return length_a;
    }

    // Only a single row of the usual distance matrix is kept and updated in
    // place: `row[i]` holds the distance between the first `i + 1` characters
    // of `a` and the prefix of `b` processed so far.
    let mut row: Vec<usize> = (1..=length_a).collect();
    let mut current = 0;

    for (index_b, char_b) in b.chars().enumerate() {
        // Distances against the empty prefix of `a`.
        current = index_b;
        let mut diagonal = index_b;

        for (cell, char_a) in row.iter_mut().zip(a.chars()) {
            // Cost of reaching this cell by matching/substituting.
            let substitution = if char_a == char_b {
                diagonal
            } else {
                diagonal + 1
            };

            // The old value of this cell is the "above" neighbour now and
            // becomes the diagonal for the next cell.
            diagonal = *cell;

            // Cheapest of substitution, or insertion/deletion from the
            // neighbouring cells (`diagonal` = above, `current` = left).
            current = substitution.min(diagonal.min(current) + 1);

            *cell = current;
        }
    }

    current
}