//! Builders for GoaT API query URLs and their query-string fragments.
use crate::utils::variables::Variables;
use anyhow::Result;
use lazy_static::lazy_static;

/// Root of the GoaT API; the version segment is appended to it in `GOAT_URL`.
const GOAT_URL_BASE: &str = "https://goat.genomehubs.org/api/";
/// API version path segment (with trailing slash) appended to `GOAT_URL_BASE`.
const GOAT_API_VERSION: &str = "v0.0.1/";

lazy_static! {
    /// `GOAT_URL_BASE` followed by `GOAT_API_VERSION`, built once on first use.
    pub static ref GOAT_URL: String = format!("{}{}", GOAT_URL_BASE, GOAT_API_VERSION);
    /// Taxonomy identifier string (`"ncbi"`).
    // NOTE(review): presumably supplied as the `taxonomy` argument of
    // `make_goat_urls` — confirm against callers.
    pub static ref TAXONOMY: String = "ncbi".to_string();
}

/// Build the `&ranks=` query fragment for the rank `r`.
///
/// The fragment lists `r` followed by every rank that comes after it in the
/// fixed table below (ending at `superkingdom`), separated by `%2C`
/// (a URL-encoded comma).
///
/// Returns an empty string when `r` is not an exact match for one of the
/// table entries — this is what happens for the `"none"` value.
fn format_rank(r: &str) -> String {
    // Ordered from the most specific rank to the most general one.
    const RANKS: [&str; 9] = [
        "subspecies",
        "species",
        "genus",
        "family",
        "order",
        "class",
        "phylum",
        "kingdom",
        "superkingdom",
    ];

    match RANKS.iter().position(|&rank| rank == r) {
        Some(start) => format!("&ranks={}", RANKS[start..].join("%2C")),
        None => String::new(),
    }
}

/// Build the `&names=` query fragment.
///
/// When `flag` is set, all three name classes (`synonym`, `tol_id`,
/// `common_name`) are requested at once; otherwise the fragment is empty.
fn format_names(flag: bool) -> String {
    let fragment = if flag {
        "&names=synonym%2Ctol_id%2Ccommon_name"
    } else {
        ""
    };
    fragment.to_string()
}

// keep contained here for now
use crate::utils::expression::CLIexpression;
/// Run `exp` through [`CLIexpression`] and return the string its `parse`
/// method produces.
///
/// # Errors
///
/// Any error returned by `CLIexpression::parse` is propagated to the caller.
pub fn format_expression(exp: &str) -> Result<String> {
    let mut cli_expression = CLIexpression::new(exp);
    Ok(cli_expression.parse()?)
}

/// Switches selecting which groups of GoaT fields are requested in a search.
///
/// Every flag except `names` and `tidy` enables one group of field names in
/// [`FieldBuilder::build_fields_string`]; `all` enables every group at once.
/// `names` and `tidy` contribute no field names: `make_goat_urls` reads them
/// separately to append the `&names=` and `&tidyData=true` fragments.
#[derive(Copy, Clone)]
pub struct FieldBuilder {
    /// Enable every field group.
    pub all: bool,
    /// `assembly_level`, `assembly_span`.
    pub assembly: bool,
    /// `bioproject`, `biosample`.
    pub bioproject: bool,
    /// `busco_completeness`.
    pub busco: bool,
    /// `country_list`.
    pub country_list: bool,
    /// `c_value`.
    pub cvalues: bool,
    /// `assembly_date`, `ebp_metric_date`.
    pub date: bool,
    /// `gc_percent`.
    pub gc_percent: bool,
    /// `gene_count`.
    pub gene_count: bool,
    /// `genome_size`.
    pub gs: bool,
    /// `chromosome_number`, `haploid_number`.
    pub karyotype: bool,
    /// The six legislation fields (`isb_wildlife_act_1976` … `ECHabs92`).
    pub legislation: bool,
    /// `mitochondrion_assembly_span`, `mitochondrion_gc_percent`.
    pub mitochondrion: bool,
    /// `contig_n50`, `scaffold_n50`.
    pub n50: bool,
    /// Adds no field names; read by `make_goat_urls` for the `&names=` fragment.
    pub names: bool,
    /// `plastid_assembly_span`, `plastid_gc_percent`.
    pub plastid: bool,
    /// `ploidy`.
    pub ploidy: bool,
    /// `sex_determination`.
    pub sex_determination: bool,
    /// `sequencing_status` and the per-stage status fields.
    pub status: bool,
    /// `long_list`, `other_priority`, `family_representative`.
    pub target_lists: bool,
    /// Adds no field names; read by `make_goat_urls` for `&tidyData=true`.
    pub tidy: bool,
}

impl FieldBuilder {
    /// All flags as a fixed-size array.
    ///
    /// Only used to ask "is anything set at all?", so the element order is
    /// not significant to `build_fields_string`.
    fn as_array(&self) -> [bool; 21] {
        [
            self.all,
            self.assembly,
            self.bioproject,
            self.busco,
            self.country_list,
            self.cvalues,
            self.date,
            self.gc_percent,
            self.gene_count,
            self.gs,
            self.karyotype,
            self.legislation,
            self.mitochondrion,
            self.names,
            self.n50,
            self.plastid,
            self.ploidy,
            self.sex_determination,
            self.status,
            self.target_lists,
            self.tidy,
        ]
    }

    /// Build the `&fields=` query fragment for the enabled field groups.
    ///
    /// Field names are emitted in a fixed group order and separated by `%2C`
    /// (a URL-encoded comma). A group is included when its own flag or `all`
    /// is set.
    ///
    /// Returns an empty string when no field name was selected. That covers
    /// both "no flag set" and "only `names`/`tidy` set": those two flags add
    /// no field names, so there is nothing to put after `&fields=`.
    pub fn build_fields_string(&self) -> String {
        // Nothing requested at all: skip the table walk entirely.
        if !self.as_array().contains(&true) {
            return String::new();
        }

        // (flag, field names) in the order they must appear in the URL.
        let groups: [(bool, &[&str]); 18] = [
            // default assembly stats
            (self.assembly, &["assembly_level", "assembly_span"]),
            // busco stats
            (self.busco, &["busco_completeness"]),
            (self.gc_percent, &["gc_percent"]),
            // default karyotype stats
            (self.karyotype, &["chromosome_number", "haploid_number"]),
            // additional karyotype
            (self.ploidy, &["ploidy"]),
            (self.sex_determination, &["sex_determination"]),
            // genome size and c-value are split here even though both are
            // default in GoaT
            (self.gs, &["genome_size"]),
            (self.cvalues, &["c_value"]),
            // organelle data
            (
                self.mitochondrion,
                &["mitochondrion_assembly_span", "mitochondrion_gc_percent"],
            ),
            (
                self.plastid,
                &["plastid_assembly_span", "plastid_gc_percent"],
            ),
            // all legislation data
            (
                self.legislation,
                &[
                    "isb_wildlife_act_1976",
                    "HabReg_2017",
                    "MarHabReg-2017",
                    "waca_1981",
                    "Protection_of_Badgers_Act_1992",
                    "ECHabs92",
                ],
            ),
            // all target lists
            (
                self.target_lists,
                &["long_list", "other_priority", "family_representative"],
            ),
            (self.n50, &["contig_n50", "scaffold_n50"]),
            (self.bioproject, &["bioproject", "biosample"]),
            (self.gene_count, &["gene_count"]),
            (self.date, &["assembly_date", "ebp_metric_date"]),
            (self.country_list, &["country_list"]),
            // sequencing status of the taxon; lumped together for now
            (
                self.status,
                &[
                    "sequencing_status",
                    "sample_collected",
                    "sample_acquired",
                    "in_progress",
                    "insdc_submitted",
                    "insdc_open",
                    "published",
                    "sample_collected_by",
                ],
            ),
        ];

        let selected: Vec<&str> = groups
            .iter()
            .filter(|(enabled, _)| self.all || *enabled)
            .flat_map(|(_, field_names)| field_names.iter().copied())
            .collect();

        // Only `names` and/or `tidy` were set: emit no `&fields=` at all
        // rather than a truncated fragment.
        if selected.is_empty() {
            return String::new();
        }

        format!("&fields={}", selected.join("%2C"))
    }
}

/// Build one GoaT query URL per entry in `taxids`.
///
/// Each URL has the shape
/// `{goat_url}{api}?query=tax_{tax_tree}%28{taxon}%29{tax_rank}{expression}…`
/// followed by the `includeEstimates`, `includeRawValues`, `summaryValues`,
/// `result`, `taxonomy` and `size` parameters, then the optional rank, field,
/// tidy-data and names fragments, and finally `&queryId=goat_cli_{index}`.
///
/// * `ranks` is turned into a `&ranks=` fragment by `format_rank`.
/// * `variables`, when `Some`, is parsed with `Variables::new(v).parse()` and
///   used as the fields fragment; otherwise the fragment is built from the
///   `fields` flags.
/// * `fields.names` and `fields.tidy` are always read from `fields`, whether
///   or not `variables` is given.
/// * `tax_rank` and `expression` are inserted verbatim after the taxon clause.
///
/// The `queryId` index is the position of the taxon in `taxids`.
// NOTE(review): the previous comment stated this position corresponds to the
// alphabetical order of the taxa — that depends on the caller; confirm.
///
/// # Errors
///
/// Propagates any error from parsing `variables`.
pub fn make_goat_urls(
    api: &str,
    taxids: &Vec<String>,
    goat_url: &str,
    tax_tree: &str,
    include_estimates: bool,
    include_raw_values: bool,
    summarise_values_by: &str,
    result: &str,
    taxonomy: &str,
    size: &str,
    ranks: &str,
    fields: FieldBuilder,
    variables: Option<&str>,
    expression: &str,
    tax_rank: &str,
) -> Result<Vec<String>> {
    let rank_string = format_rank(ranks);

    // Hand-written variables take precedence over the flag switches.
    let fields_string = if let Some(v) = variables {
        Variables::new(v).parse()?
    } else {
        fields.build_fields_string()
    };

    let names = format_names(fields.names);
    let tidy_data = if fields.tidy { "&tidyData=true" } else { "" };

    let urls: Vec<String> = taxids
        .iter()
        .enumerate()
        .map(|(index, taxon)| {
            format!(
                "{goat_url}{api}?query=tax_{tax_tree}%28{taxon}%29{tax_rank}{expression}&includeEstimates={include_estimates}&includeRawValues={include_raw_values}&summaryValues={summarise_values_by}&result={result}&taxonomy={taxonomy}&size={size}{rank_string}{fields_string}{tidy_data}{names}&queryId=goat_cli_{index}",
                goat_url = goat_url,
                api = api,
                tax_tree = tax_tree,
                taxon = taxon,
                tax_rank = tax_rank,
                expression = expression,
                include_estimates = include_estimates,
                include_raw_values = include_raw_values,
                summarise_values_by = summarise_values_by,
                result = result,
                taxonomy = taxonomy,
                size = size,
                rank_string = rank_string,
                fields_string = fields_string,
                tidy_data = tidy_data,
                names = names,
                index = index
            )
        })
        .collect();

    Ok(urls)
}