goat_cli/utils/
url.rs

1use crate::{
2    utils::{
3        expression::CLIexpression,
4        variable_data::{GOAT_ASSEMBLY_VARIABLE_DATA, GOAT_TAXON_VARIABLE_DATA},
5        variables::Variables,
6    },
7    IndexType,
8};
9
10use anyhow::Result;
11
12// format the ranks for the URL.
13
/// Build the `&ranks=` segment of a GoaT URL.
///
/// `r` is looked up in a fixed rank list that runs from `subspecies` to
/// `superkingdom`. When it is found, the segment lists that rank followed by
/// every later rank in the list, separated by `%2C`. Any other value
/// (for example `"none"`) produces an empty string, i.e. no segment at all.
fn format_rank(r: &str) -> String {
    // The order matters: the output is always a suffix of this list.
    const RANKS: [&str; 9] = [
        "subspecies",
        "species",
        "genus",
        "family",
        "order",
        "class",
        "phylum",
        "kingdom",
        "superkingdom",
    ];

    match RANKS.iter().position(|rank| *rank == r) {
        Some(start) => format!("&ranks={}", RANKS[start..].join("%2C")),
        None => String::new(),
    }
}
41
/// Produce the `&names=` segment of a GoaT URL.
///
/// When `flag` is set, the segment requests the synonym, ToL ID and common
/// name columns; otherwise nothing is added to the URL.
fn format_names(flag: bool) -> String {
    if flag {
        String::from("&names=synonym%2Ctol_id%2Ccommon_name")
    } else {
        String::new()
    }
}
50
51/// Format an expression put into the `-e` flag on the CLI.
52pub fn format_expression(exp: &str, index_type: IndexType) -> Result<String> {
53    let mut new_exp = CLIexpression::new(exp);
54    let parsed_string = match index_type {
55        IndexType::Taxon => new_exp.parse(&*GOAT_TAXON_VARIABLE_DATA)?,
56        IndexType::Assembly => new_exp.parse(&*GOAT_ASSEMBLY_VARIABLE_DATA)?,
57    };
58    Ok(parsed_string)
59}
60
/// Boolean struct containing all of the CLI flag information
/// passed from the user. This struct has been expanded to include
/// both `taxon` and `assembly` indexes.
///
/// Each flag (apart from `taxon_names` and `taxon_tidy`) is mapped to one or
/// more GoaT variable names by `to_vec_tuples`; the names quoted on the
/// fields below are exactly the strings sent to GoaT.
#[derive(Copy, Clone)]
pub struct FieldBuilder {
    /// Add the `assembly_level` and `assembly_span` GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_assembly: bool,
    /// Add the `bioproject` and `biosample` GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_bioproject: bool,
    /// Add the BUSCO GoaT fields: `busco_completeness`, `odb10_lineage`,
    /// `busco_lineage` and `busco_string`.
    ///
    /// A taxon index flag.
    pub taxon_busco: bool,
    /// Add the `country_list` GoaT field.
    ///
    /// A taxon index flag.
    pub taxon_country_list: bool,
    /// Add the C-value GoaT field (`c_value`).
    ///
    /// A taxon index flag.
    pub taxon_cvalues: bool,
    /// Add the `assembly_date` and `ebp_metric_date` GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_date: bool,
    /// Add the `gc_percent` GoaT field.
    ///
    /// A taxon index flag.
    pub taxon_gc_percent: bool,
    /// Add the `gene_count` GoaT field.
    ///
    /// A taxon index flag.
    pub taxon_gene_count: bool,
    /// Add the genome size GoaT fields: `genome_size`,
    /// `genome_size_kmer` and `genome_size_draft`.
    ///
    /// A taxon index flag.
    pub taxon_gs: bool,
    /// Add the karyotype GoaT fields: `chromosome_number` and
    /// `haploid_number`.
    ///
    /// A taxon index flag.
    pub taxon_karyotype: bool,
    /// Add the legislation GoaT fields: `isb_wildlife_act_1976`,
    /// `HabReg_2017`, `MarHabReg-2017`, `waca_1981`,
    /// `Protection_of_Badgers_Act_1992` and `ECHabs92`
    /// (spelled here exactly as they are sent to GoaT).
    ///
    /// A taxon index flag.
    pub taxon_legislation: bool,
    /// Add the `mitochondrion_assembly_span` and
    /// `mitochondrion_gc_percent` GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_mitochondrion: bool,
    /// Add the `scaffold_n50` and `contig_n50` GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_n50: bool,
    /// Add synonym, tolID, and common name GoaT fields.
    ///
    /// This flag is not part of the `to_vec_tuples` table, so it never
    /// contributes to the `&fields=` segment. `make_goat_urls` passes it to
    /// `format_names`, which emits a separate `&names=` segment.
    ///
    /// A taxon index flag.
    pub taxon_names: bool,
    /// Add the `plastid_assembly_span` and `plastid_gc_percent` GoaT
    /// fields.
    ///
    /// A taxon index flag.
    pub taxon_plastid: bool,
    /// Add the `ploidy` GoaT field.
    ///
    /// A taxon index flag.
    pub taxon_ploidy: bool,
    /// Add the `sex_determination` GoaT field.
    ///
    /// A taxon index flag.
    pub taxon_sex_determination: bool,
    /// Add the sample tracking GoaT fields: `sequencing_status`,
    /// `sample_collected`, `sample_acquired`, `in_progress`,
    /// `insdc_submitted`, `insdc_open`, `published` and
    /// `sample_collected_by`.
    ///
    /// A taxon index flag.
    pub taxon_status: bool,
    /// Add `long_list`, `other_priority`, and `family_representative`
    /// GoaT fields.
    ///
    /// A taxon index flag.
    pub taxon_target_lists: bool,
    /// Render output in tidy format?
    ///
    /// This flag is not part of the `to_vec_tuples` table. When it is set,
    /// `make_goat_urls` appends `&tidyData=true` to every URL.
    ///
    /// A taxon index flag.
    pub taxon_tidy: bool,
    /// Assembly span and level (`assembly_level`, `assembly_span`).
    ///
    /// An assembly index flag.
    pub assembly_assembly: bool,
    /// Only chromosome count (`chromosome_count`).
    ///
    /// An assembly index flag.
    pub assembly_karyotype: bool,
    /// All the contig information: `contig_count`, `contig_l50` and
    /// `contig_n50`.
    ///
    /// An assembly index flag.
    pub assembly_contig: bool,
    /// All scaffold information: `scaffold_count`, `scaffold_l50` and
    /// `scaffold_n50`.
    ///
    /// An assembly index flag.
    pub assembly_scaffold: bool,
    /// GC content (`gc_percent`).
    ///
    /// An assembly index flag.
    pub assembly_gc: bool,
    /// Gene and non-coding gene count (`gene_count`,
    /// `noncoding_gene_count`).
    ///
    /// An assembly index flag.
    pub assembly_gene: bool,
    /// BUSCO completeness, lineage and string (`busco_completeness`,
    /// `busco_lineage`, `busco_string`).
    ///
    /// An assembly index flag.
    pub assembly_busco: bool,
    /// BlobToolKit stats(?). No hit/target (`nohit`, `target`).
    ///
    /// An assembly index flag.
    pub assembly_btk: bool,
}
189
190impl FieldBuilder {
191    /// A function to turn all of the fields into a small data structure.
192    ///
193    /// This is hardcoded, but could be modified to be read in from
194    /// the goat standard variables JSON in the future.
195    ///
196    /// It's a [`Vec`] of a tuple of:
197    /// - [`bool`] which shows whether the user chose this flag or not
198    /// - [`Vec<&str>`] which enumerates the variable strings (as GoaT
199    /// would recognise) that correspond to this field.
200    ///
201    /// It's a bit of a judgement call on my part but happy to change if
202    /// there is a compelling argument.
203    fn to_vec_tuples(&self) -> Vec<(bool, Vec<&str>)> {
204        vec![
205            // Add all of the taxon_* fields
206            (self.taxon_assembly, vec!["assembly_level", "assembly_span"]),
207            (self.taxon_bioproject, vec!["bioproject", "biosample"]),
208            // testing all these busco fields.
209            (
210                self.taxon_busco,
211                vec![
212                    "busco_completeness",
213                    "odb10_lineage",
214                    "busco_lineage",
215                    "busco_string",
216                ],
217            ),
218            (self.taxon_country_list, vec!["country_list"]),
219            (self.taxon_cvalues, vec!["c_value"]),
220            (self.taxon_date, vec!["assembly_date", "ebp_metric_date"]),
221            (self.taxon_gc_percent, vec!["gc_percent"]),
222            (self.taxon_gene_count, vec!["gene_count"]),
223            (
224                self.taxon_gs,
225                vec!["genome_size", "genome_size_kmer", "genome_size_draft"],
226            ),
227            (
228                self.taxon_karyotype,
229                vec!["chromosome_number", "haploid_number"],
230            ),
231            (
232                self.taxon_legislation,
233                vec![
234                    "isb_wildlife_act_1976",
235                    "HabReg_2017",
236                    "MarHabReg-2017",
237                    "waca_1981",
238                    "Protection_of_Badgers_Act_1992",
239                    "ECHabs92",
240                ],
241            ),
242            (
243                self.taxon_mitochondrion,
244                vec!["mitochondrion_assembly_span", "mitochondrion_gc_percent"],
245            ),
246            (self.taxon_n50, vec!["scaffold_n50", "contig_n50"]),
247            (
248                self.taxon_plastid,
249                vec!["plastid_assembly_span", "plastid_gc_percent"],
250            ),
251            (self.taxon_ploidy, vec!["ploidy"]),
252            (self.taxon_sex_determination, vec!["sex_determination"]),
253            // there's now a bunch of sequencing status_asg/b10k/cbp... etc
254            // don't know if these should go here.
255            (
256                self.taxon_status,
257                vec![
258                    "sequencing_status",
259                    "sample_collected",
260                    "sample_acquired",
261                    "in_progress",
262                    "insdc_submitted",
263                    "insdc_open",
264                    "published",
265                    "sample_collected_by",
266                ],
267            ),
268            (
269                self.taxon_target_lists,
270                vec!["long_list", "other_priority", "family_representative"],
271            ),
272            // Add all of the assembly_* fields
273            (
274                self.assembly_assembly,
275                vec!["assembly_level", "assembly_span"],
276            ),
277            (self.assembly_btk, vec!["nohit", "target"]),
278            (
279                self.assembly_busco,
280                vec!["busco_completeness", "busco_lineage", "busco_string"],
281            ),
282            (
283                self.assembly_contig,
284                vec!["contig_count", "contig_l50", "contig_n50"],
285            ),
286            (self.assembly_gc, vec!["gc_percent"]),
287            (
288                self.assembly_gene,
289                vec!["gene_count", "noncoding_gene_count"],
290            ),
291            (self.assembly_karyotype, vec!["chromosome_count"]),
292            (
293                self.assembly_scaffold,
294                vec!["scaffold_count", "scaffold_l50", "scaffold_n50"],
295            ),
296        ]
297    }
298
299    /// A function which formats all of the GoaT fields
300    /// together into a URL segment.
301    pub fn build_fields_string(&self) -> String {
302        const BASE: &str = "&fields=";
303        const DELIMITER: &str = "%2C";
304
305        // build the little data base
306        let data = self.to_vec_tuples();
307
308        // and now build the string
309        let mut field_string = String::new();
310        // add the base
311        field_string += BASE;
312        for (field_present, field_vec) in data.iter() {
313            match field_present {
314                true => {
315                    field_string += &field_vec.join(DELIMITER);
316                    field_string += DELIMITER;
317                }
318                false => continue,
319            }
320        }
321
322        // remove the last three chars == '&2C'
323        field_string.drain(field_string.len() - 3..);
324        // check for blanks
325        let any_true = data.iter().map(|e| e.0).any(|e| e);
326        if !any_true {
327            // remove everything
328            field_string.drain(..);
329        }
330
331        field_string
332    }
333
334    /// An implementation of exculding values returned if they are missing or ancestral values inferred by GoaT.
335    fn generate_exculde_flags(&self) -> String {
336        const ANCESTRAL: &str = "&excludeAncestral";
337        const MISSING: &str = "&excludeMissing";
338        const OPEN_ANGLE_BRACE: &str = "%5B";
339        const CLOSE_ANGLE_BRACE: &str = "%5D";
340
341        let data = self.to_vec_tuples();
342        let mut exclusion_string = String::new();
343
344        let mut exclude_index: i32 = 0;
345        for (field_present, field_vec) in data.iter() {
346            match field_present {
347                true => {
348                    for field in field_vec {
349                        // e.g. &excludeAncestral%5B0%5D=assembly_span
350                        // add ancestral
351                        exclusion_string += ANCESTRAL;
352                        exclusion_string += OPEN_ANGLE_BRACE;
353                        exclusion_string += &exclude_index.to_string();
354                        exclusion_string += CLOSE_ANGLE_BRACE;
355                        exclusion_string += &format!("={field}");
356
357                        // add missing
358                        exclusion_string += MISSING;
359                        exclusion_string += OPEN_ANGLE_BRACE;
360                        exclusion_string += &exclude_index.to_string();
361                        exclusion_string += CLOSE_ANGLE_BRACE;
362                        exclusion_string += &format!("={field}");
363
364                        exclude_index += 1;
365                    }
366                }
367                false => continue,
368            }
369        }
370
371        exclusion_string
372    }
373}
374
375/// The function which creats the GoaT API URLs
376/// which are then used as GET requests.
377pub fn make_goat_urls(
378    api: &str,
379    taxids: &[String],
380    goat_url: &str,
381    tax_tree: &str,
382    include_estimates: bool,
383    include_raw_values: bool,
384    exclude: bool,
385    summarise_values_by: &str,
386    result: &str,
387    taxonomy: &str,
388    size: u64,
389    ranks: &str,
390    fields: FieldBuilder,
391    variables: Option<&str>,
392    expression: &str,
393    tax_rank: &str,
394    unique_ids: Vec<String>,
395    index_type: IndexType,
396) -> Result<Vec<String>> {
397    let mut res = Vec::new();
398
399    // make the rank string
400    let rank_string = format_rank(ranks);
401    // due to variables being created independently of the fields/FieldBuilder
402    // this code is a lot less nice than it could be.
403    // FIXME: this means that if you supply both a variable string and some flags, only the variable string will be
404    // considered.
405    let fields_string = match variables {
406        Some(v) => match index_type {
407            IndexType::Taxon => Variables::new(v).parse(&*GOAT_TAXON_VARIABLE_DATA)?,
408            IndexType::Assembly => Variables::new(v).parse(&*GOAT_ASSEMBLY_VARIABLE_DATA)?,
409        },
410        None => fields.build_fields_string(),
411    };
412
413    let exclude_missing_or_ancestral = if exclude {
414        match variables {
415            Some(v) => match index_type {
416                IndexType::Taxon => Variables::new(v).parse_exclude(&*GOAT_TAXON_VARIABLE_DATA)?,
417                IndexType::Assembly => Variables::new(v).parse_exclude(&*GOAT_ASSEMBLY_VARIABLE_DATA)?,
418            },
419            None => fields.generate_exculde_flags(),
420        }
421    } else {
422        "".into()
423    };
424
425    let names = format_names(fields.taxon_names);
426
427    let tidy_data: &str = match fields.taxon_tidy {
428        true => "&tidyData=true",
429        false => "",
430    };
431
432    // enumeration of the taxa will be 0 -> n,
433    // corresponding to alphabetical order of taxa
434    for (taxon, chars) in taxids.iter().zip(unique_ids.iter()) {
435        let query_id = format!("&queryId=goat_cli_{}", chars);
436        let url = format!(
437        // hardcode tidy data for now.
438        "{goat_url}{api}?query=tax_{tax_tree}%28{taxon}%29{tax_rank}{expression}&includeEstimates={include_estimates}&includeRawValues={include_raw_values}&summaryValues={summarise_values_by}&result={result}&taxonomy={taxonomy}&size={size}{rank_string}{fields_string}{tidy_data}{names}{query_id}{exclude_missing_or_ancestral}"
439    );
440        res.push(url);
441    }
442    Ok(res)
443}