phcue_ck/
lib.rs

1use clap::{Parser, ValueEnum};
2use futures::StreamExt;
3use serde::{Deserialize, Serialize};
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::PathBuf;
7use std::process::exit;
8
/// A struct to hold the data returned from the ENA API
///
/// `Run` is deserialized through this type (`#[serde(from = "ENAApiResponse")]`),
/// so one value corresponds to one record of the JSON file report. Every field
/// is a plain string and none of them carries a serde default.
#[derive(Serialize, Deserialize, Clone, Debug)]
struct ENAApiResponse {
    /// Run accession; copied into `Run::accession`.
    run_accession: String,
    /// `;`-separated list of FASTQ file addresses; `ftp://` is prepended to
    /// each entry when converting into `Run`.
    fastq_ftp: String,
    /// `;`-separated list of FASTQ file sizes, parsed as integers during conversion.
    fastq_bytes: String,
    /// `;`-separated list of FASTQ MD5 checksums.
    fastq_md5: String,
    // The `submitted_*` and `sra_*` fields below are deserialized but not read
    // by the conversion into `Run`.
    // NOTE(review): presumably they mirror the three `fastq_*` fields for the
    // submitted files and the SRA archive — confirm against the ENA API docs.
    submitted_ftp: String,
    submitted_md5: String,
    submitted_bytes: String,
    sra_ftp: String,
    sra_bytes: String,
    sra_md5: String,
}
23
/// A struct to hold the parsed data from the ENA API and return it to the user
///
/// Deserialization is routed through `ENAApiResponse` (see the `serde(from)`
/// attribute below), so incoming JSON must have the shape of that struct.
/// Serialization uses the fields of this struct directly.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(from = "ENAApiResponse")]
pub struct Run {
    /// Run accession, taken from `ENAApiResponse::run_accession`.
    pub accession: String,
    /// Files belonging to this run, in the order they appear in `fastq_ftp`.
    reads: Vec<Reads>,
}
31
32impl Run {
33    /// Clean single end reads if there are paired end reads too
34    /// This is if the user does not wish to have the single end reads, and
35    /// keep only the paired end reads
36    pub fn clean_single_end(&mut self) {
37        if self.reads.len() == 3 {
38            self.reads.remove(0);
39        }
40    }
41}
42
/// Output formats selectable on the command line (see `Args::format`).
// Variant notes are plain `//` comments on purpose, so they are not picked up
// as documentation attributes by the `ValueEnum` derive.
#[derive(Debug, ValueEnum, Clone)]
pub enum OutputFormat {
    // Default value of `Args::format`.
    Json,
    // NOTE(review): presumably rendered by `print_csv` (one read per line) — confirm in the caller.
    Csv,
    // NOTE(review): presumably rendered by `print_csv_wide` (one run per line) — confirm in the caller.
    CsvWide,
    // NOTE(review): presumably rendered by `print_csv_long` (one variable per line) — confirm in the caller.
    CsvLong,
}
50
51/// Here, we implement the From trait for the Run struct, so that Run instances
52/// can be derived from instances of the ENAApiResponse type.
53/// Full example here: https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=6d15ef7f0834dae23b1bcea336c627f2
54impl From<ENAApiResponse> for Run {
55    fn from(response: ENAApiResponse) -> Self {
56        let fastq_ftp_array = response.fastq_ftp.split(';').collect::<Vec<&str>>();
57        let fastq_bytes_array = response.fastq_bytes.split(';').collect::<Vec<&str>>();
58        let fastq_md5_array = response.fastq_md5.split(';').collect::<Vec<&str>>();
59        let mut reads: Vec<Reads> = Vec::new();
60        for i in 0..fastq_ftp_array.len() {
61            reads.push(Reads {
62                url: format!(
63                    "ftp://{address}",
64                    address = fastq_ftp_array[i].to_string().to_owned()
65                ),
66                bytes: match fastq_bytes_array[i].parse::<u32>() {
67                    Ok(n) => n,
68                    Err(_) => {
69                        eprintln!(
70                            "Could not parse {} as a number of bytes for accession {}",
71                            fastq_bytes_array[i], response.run_accession
72                        );
73                        0
74                    }
75                },
76                md5: fastq_md5_array[i].to_string().to_owned(),
77            });
78        }
79        Self {
80            accession: response.run_accession,
81            reads,
82        }
83    }
84}
85
86#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
87struct Reads {
88    url: String,
89    md5: String,
90    bytes: u32,
91}
92
93/// A function to query the ENA API and return a vector of Run instances
94async fn query_ena(
95    accession: &String,
96    client: &reqwest::Client,
97) -> Result<Vec<Run>, reqwest::Error> {
98    let request_url = format!("https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run&format=json", accession = accession);
99    let response = client.get(&request_url).send().await?;
100    let runs: Vec<Run> = response.json().await?;
101    Ok(runs)
102}
103
104/// A function to query the ENA API and return a vector of Run instances
105/// This function is used to query the ENA API concurrently across multiple accessions
106pub async fn concurrent_query_ena(accessions: Vec<String>, num_requests: usize) -> Vec<Run> {
107    let client = reqwest::Client::new();
108    futures::stream::iter({
109        accessions.iter().map(|accession| {
110            let client = client.clone();
111            eprintln!("Querying ENA for accession: {}", accession);
112            async move {
113                match query_ena(accession, &client).await {
114                    Ok(run) => Some(run),
115                    Err(e) => {
116                        eprintln!("Error querying ENA for accession: {}", accession);
117                        eprintln!("Error: {}", e);
118                        None
119                    }
120                }
121            }
122        })
123    })
124    .buffer_unordered(num_requests)
125    .collect::<Vec<_>>()
126    .await
127    .into_iter()
128    .filter_map(|run| run)
129    .flatten()
130    .collect::<Vec<Run>>()
131}
132
/// CLI options and arguments
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
pub struct Args {
    // Every value is checked with `validate_accession`; the option may be
    // omitted only when `--file` is given.
    #[clap(short, long, value_parser, multiple = true, validator = validate_accession, required_unless_present = "file")]
    /// The accession of the run to query (must be an SRR, ERR or DRR accession)
    pub accession: Vec<String>,

    // NOTE(review): nothing in this attribute enforces the advertised 1..=10
    // range; presumably the caller clamps the value with `check_num_requests`.
    #[clap(
        short = 'n',
        long = "num-requests",
        value_name = "NUM",
        default_value = "1",
        help = "Maximum number of concurrent requests to make to the ENA API (max of 10 are allowed)"
    )]
    /// The maximum number of concurrent requests to make to the ENA API
    /// Default: 1
    /// Maximum: 10
    /// Minimum: 1
    pub num_requests: u8,

    // May be omitted only when at least one `--accession` is given.
    #[clap(
        short,
        long,
        value_name = "FILE",
        help = "File containing accessions to query",
        required_unless_present = "accession"
    )]
    /// The file containing accessions to query
    /// If this is specified, the accessions will be read from this file
    /// If this is not specified, the accessions will be read from the command line
    pub file: Option<PathBuf>,

    #[clap(
        short,
        long,
        help = "Keep single end reads if there are paired end reads too"
    )]
    /// Keep single end reads if there are paired end reads too
    /// By default, we discard single end reads if there are paired end reads too.
    /// This is if the user does wish to have the single end reads
    pub keep_single_end: bool,

    #[clap(
        value_enum,
        short = 'o',
        long = "output-format",
        value_name = "FORMAT",
        default_value_t = OutputFormat::Json,
        help = "Format for output of data."
    )]
    /// The output format for the download links
    /// If this is not specified, JSON is used
    pub format: OutputFormat,
}
189
190pub fn parse_args() -> Args {
191    Args::parse()
192}
193
/// Validate the accession number to make sure it starts with SRR, ERR,
/// or DRR and is followed by 6 to 10 ASCII digits and nothing else.
///
/// This is equivalent to matching `^(SRR|ERR|DRR)[0-9]{6,10}$`, but it is
/// checked directly on the string so that no regular expression has to be
/// compiled on every call (the function runs once per input line when
/// accessions are read from a file).
///
/// # Errors
///
/// Returns `"<accession> is not a valid accession number"` when the input
/// does not have that shape.
fn validate_accession(accession: &str) -> Result<(), String> {
    // The three prefixes are mutually exclusive, so at most one can match.
    let digits = ["SRR", "ERR", "DRR"]
        .iter()
        .find_map(|prefix| accession.strip_prefix(*prefix));

    let is_valid = match digits {
        Some(rest) => {
            (6..=10).contains(&rest.len()) && rest.bytes().all(|byte| byte.is_ascii_digit())
        }
        None => false,
    };

    if is_valid {
        Ok(())
    } else {
        Err(format!("{} is not a valid accession number", accession))
    }
}
204
/// Clamps the requested number of concurrent ENA requests to the range 1..=10.
///
/// * `0` is raised to `1`, with a notice on stderr.
/// * Anything above `10` is lowered to `10`, with a notice on stderr. The cap
///   exists to be nice to the ENA API.
/// * Values already inside the range are returned unchanged.
pub fn check_num_requests(num_requests: u8) -> usize {
    match num_requests {
        0 => {
            eprintln!("Number of requests should be at least 1. Setting number of requests to 1.");
            1
        }
        1..=10 => usize::from(num_requests),
        _ => {
            eprintln!("To be nice to ENA, we only allow up to 10 concurrent requests. Setting number of requests to 10.");
            10
        }
    }
}
221
222/// A function to read accessions from a file and return a vector of validated
223/// accessions. The function skips any empty lines, and will issue a warning
224/// if it encounters an invalid accession. This deals with any potential header
225/// lines in the file.
226pub fn read_accessions(file: &PathBuf) -> Vec<String> {
227    let file = match File::open(file) {
228        Ok(file) => file,
229        Err(e) => {
230            eprintln!("Error opening file: {}", e);
231            exit(1);
232        }
233    };
234    let reader = BufReader::new(file);
235    reader
236        .lines()
237        .into_iter()
238        .filter_map(|line| line.ok())
239        .filter_map(|line| if line.is_empty() { None } else { Some(line) })
240        .filter_map(|line| match validate_accession(line.as_str()) {
241            Ok(_) => Some(line),
242            Err(e) => {
243                eprintln!("Error validating accession: {}. Ignoring this value...", e);
244                None
245            }
246        })
247        .collect()
248}
249
250/// A function to handle output in the csv format. This function outputs one read per line.
251pub fn print_csv<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>) -> Result<(), std::io::Error> {
252    for run in runs {
253        wtr.write_record(&["accession", "url", "md5", "bytes"])?;
254        for read in run.reads {
255            wtr.write_record(&[&run.accession, &read.url, &read.md5, &read.bytes.to_string()])?;
256        }
257    }
258    wtr.flush()?;
259    Ok(())
260}
261
262/// A function to handle output in the wide csv format. This function outputs one run per line.
263pub fn print_csv_wide<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>, keep_single_end: bool) -> Result<(), std::io::Error> {
264    wtr.write_record(&["accession", "url_se", "md5_se", "bytes_1", "url_1", "md5_1", "bytes_se", "url_2", "md5_2", "bytes_2"])?;
265    for run in runs {
266        match run.reads.len() {
267            1 if keep_single_end==true => wtr.write_record(&[&run.accession, &run.reads[0].url,  &run.reads[0].md5, &run.reads[0].bytes.to_string(), "", "", "", "", "", ""])?,
268            2 => wtr.write_record(&[&run.accession, "", "", "", &run.reads[0].url, &run.reads[0].md5, &run.reads[0].bytes.to_string(), &run.reads[1].url, &run.reads[1].md5,  &run.reads[1].bytes.to_string()])?,
269            3 if keep_single_end==true => wtr.write_record(&[&run.accession, &run.reads[0].url, &run.reads[0].md5, &run.reads[0].bytes.to_string(), &run.reads[1].url, &run.reads[1].md5, &run.reads[1].bytes.to_string(), &run.reads[2].url, &run.reads[2].md5, &run.reads[2].bytes.to_string()])?,
270            _ => {
271                eprintln!("Found too many or too few reads for {}", &run.accession);
272                exit(1);
273            }
274        }
275    }
276    wtr.flush()?;
277    Ok(())
278}
279
280///A function to handle output in the long csv format. This function prints one variable per line.
281pub fn print_csv_long<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>) -> Result<(), std::io::Error> {
282    wtr.write_record(&["accession", "variable", "value"])?;
283    for run in runs {
284        match run.reads.len() {
285            1 => {
286                wtr.write_record(&[&run.accession, "url_se", &run.reads[0].url])?;
287                wtr.write_record(&[&run.accession, "md5_se", &run.reads[0].md5])?;
288                wtr.write_record(&[&run.accession, "bytes_se", &run.reads[0].bytes.to_string()])?; 
289            },
290            2 => {
291                wtr.write_record(&[&run.accession, "url_1", &run.reads[0].url])?;
292                wtr.write_record(&[&run.accession, "md5_1", &run.reads[0].md5])?;
293                wtr.write_record(&[&run.accession, "bytes_1", &run.reads[0].bytes.to_string()])?;
294                wtr.write_record(&[&run.accession, "url_2", &run.reads[1].url])?;
295                wtr.write_record(&[&run.accession, "md5_2", &run.reads[1].md5])?;
296                wtr.write_record(&[&run.accession, "bytes_2", &run.reads[1].bytes.to_string()])?;
297            },
298            3 => {
299                wtr.write_record(&[&run.accession, "url_se", &run.reads[0].url])?;
300                wtr.write_record(&[&run.accession, "md5_se", &run.reads[0].md5])?;
301                wtr.write_record(&[&run.accession, "bytes_se", &run.reads[0].bytes.to_string()])?; 
302                wtr.write_record(&[&run.accession, "url_1", &run.reads[1].url])?;
303                wtr.write_record(&[&run.accession, "md5_1", &run.reads[1].md5])?;
304                wtr.write_record(&[&run.accession, "bytes_1", &run.reads[1].bytes.to_string()])?;
305                wtr.write_record(&[&run.accession, "url_2", &run.reads[2].url])?;
306                wtr.write_record(&[&run.accession, "md5_2", &run.reads[2].md5])?;
307                wtr.write_record(&[&run.accession, "bytes_2", &run.reads[2].bytes.to_string()])?;
308            },
309            _ => {
310                eprintln!("Found too many or too few reads for {}", &run.accession);
311                exit(1);
312            }
313        }
314    }
315    wtr.flush()?;
316    Ok(())
317}
318
319
320#[cfg(test)]
321mod tests {
322    use super::*;
323
324    #[test]
325    fn test_validate_srr_accession() {
326        let accession = "SRR1234567";
327        let result = validate_accession(accession);
328        assert!(result.is_ok());
329    }
330
331    #[test]
332    fn test_validate_err_accession() {
333        let accession = "ERR1234567";
334        let result = validate_accession(accession);
335        assert!(result.is_ok());
336    }
337
338    #[test]
339    fn test_validate_drr_accession() {
340        let accession = "DRR1234567";
341        let result = validate_accession(accession);
342        assert!(result.is_ok());
343    }
344
345    #[test]
346    fn test_validate_invalid_accession() {
347        let accession = "1234567";
348        let result = validate_accession(accession);
349        assert!(result.is_err());
350    }
351
352    #[test]
353    fn test_check_num_requests_valid() {
354        let num_requests = 5;
355        let result = check_num_requests(num_requests);
356        assert_eq!(result, 5);
357    }
358
359    #[test]
360    fn test_check_num_requests_invalid_less_than_1() {
361        let num_requests = 0;
362        let result = check_num_requests(num_requests);
363        assert_eq!(result, 1);
364    }
365
366    #[test]
367    fn test_check_num_requests_invalid_greater_than_10() {
368        let num_requests = 11;
369        let result = check_num_requests(num_requests);
370        assert_eq!(result, 10);
371    }
372
373    #[test]
374    fn test_removal_single_reads() {
375        let read_se = Reads {
376            url: "read.fastq.gz".to_string(),
377            md5: "md5".to_string(),
378            bytes: 123,
379        };
380        let read_pe_1 = Reads {
381            url: "read_1.fastq.gz".to_string(),
382            md5: "md5".to_string(),
383            bytes: 123,
384        };
385        let read_pe_2 = Reads {
386            url: "read_2.fastq.gz".to_string(),
387            md5: "md5".to_string(),
388            bytes: 123,
389        };
390        let reads_se = vec![read_se.clone()];
391        let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
392        let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
393        let run_se = Run {
394            accession: "SRR1234567".to_string(),
395            reads: reads_se,
396        };
397        let run_pe = Run {
398            accession: "SRR1234567".to_string(),
399            reads: reads_pe,
400        };
401        let run_pe_se = Run {
402            accession: "SRR1234567".to_string(),
403            reads: reads_pe_se,
404        };
405        let mut runs = vec![run_se, run_pe, run_pe_se];
406        runs.iter_mut().for_each(|run| run.clean_single_end());
407        assert_eq!(runs[0].reads[0], read_se);
408        assert_eq!(runs[1].reads[0], read_pe_1);
409        assert_eq!(runs[1].reads[1], read_pe_2);
410        assert_eq!(runs[2].reads[0], read_pe_1);
411        assert_eq!(runs[2].reads[1], read_pe_2);
412    }
413
414    #[test]
415    fn test_print_csv() {
416        let read = Reads {
417            url: "url".to_string(),
418            md5: "md5".to_string(),
419            bytes: 123,
420        };
421        let reads = vec![read.clone()];
422        let run = Run {
423            accession: "accession".to_string(),
424            reads: reads,
425        };
426        let runs = vec![run];
427        let mut wtr = csv::Writer::from_writer(Vec::new());
428        print_csv(&mut wtr, runs).unwrap();
429        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
430        assert_eq!(data, "accession,url,md5,bytes\naccession,url,md5,123\n");
431    }
432
433    #[test]
434    fn test_print_csv_wide() {
435        let read_se = Reads {
436            url: "url_se".to_string(),
437            md5: "md5_se".to_string(),
438            bytes: 123,
439        };
440        let read_pe_1 = Reads {
441            url: "url_pe_1".to_string(),
442            md5: "md5_pe_1".to_string(),
443            bytes: 123,
444        };
445        let read_pe_2 = Reads {
446            url: "url_pe_2".to_string(),
447            md5: "md5_pe_2".to_string(),
448            bytes: 123,
449        };
450        let reads_se = vec![read_se.clone()];
451        let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
452        let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
453        let run_se = Run {
454            accession: "SRR1234567".to_string(),
455            reads: reads_se,
456        };
457        let run_pe = Run {
458            accession: "SRR1234567".to_string(),
459            reads: reads_pe,
460        };
461        let run_pe_se = Run {
462            accession: "SRR1234567".to_string(),
463            reads: reads_pe_se,
464        };
465 
466        let runs = vec![run_se];
467        let mut wtr = csv::Writer::from_writer(Vec::new());
468        print_csv_wide(&mut wtr, runs, true).unwrap();
469        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
470        assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,url_se,md5_se,123,,,,,,\n");
471
472        let runs_pe = vec![run_pe];
473        let mut wtr = csv::Writer::from_writer(Vec::new());
474        print_csv_wide(&mut wtr, runs_pe, false).unwrap();
475        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
476        assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,,,,url_pe_1,md5_pe_1,123,url_pe_2,md5_pe_2,123\n");
477
478        let runs_pe_se = vec![run_pe_se];
479        let mut wtr = csv::Writer::from_writer(Vec::new());
480        print_csv_wide(&mut wtr, runs_pe_se, true).unwrap();
481        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
482        assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,url_se,md5_se,123,url_pe_1,md5_pe_1,123,url_pe_2,md5_pe_2,123\n");
483    }
484
485    #[test]
486    fn test_print_csv_long() {
487        let read_se = Reads {
488            url: "url_se".to_string(),
489            md5: "md5_se".to_string(),
490            bytes: 123,
491        };
492        let read_pe_1 = Reads {
493            url: "url_pe_1".to_string(),
494            md5: "md5_pe_1".to_string(),
495            bytes: 123,
496        };
497        let read_pe_2 = Reads {
498            url: "url_pe_2".to_string(),
499            md5: "md5_pe_2".to_string(),
500            bytes: 123,
501        };
502        let reads_se = vec![read_se.clone()];
503        let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
504        let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
505        let run_se = Run {
506            accession: "SRR1234567".to_string(),
507            reads: reads_se,
508        };
509        let run_pe = Run {
510            accession: "SRR1234567".to_string(),
511            reads: reads_pe,
512        };
513        let run_pe_se = Run {
514            accession: "SRR1234567".to_string(),
515            reads: reads_pe_se,
516        };
517 
518        let runs = vec![run_se];
519        let mut wtr = csv::Writer::from_writer(Vec::new());
520        print_csv_long(&mut wtr, runs).unwrap();
521        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
522        assert_eq!(data, "accession,variable,value\nSRR1234567,url_se,url_se\nSRR1234567,md5_se,md5_se\nSRR1234567,bytes_se,123\n");
523
524        let runs_pe = vec![run_pe];
525        let mut wtr = csv::Writer::from_writer(Vec::new());
526        print_csv_long(&mut wtr, runs_pe).unwrap();
527        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
528        assert_eq!(data, "accession,variable,value\nSRR1234567,url_1,url_pe_1\nSRR1234567,md5_1,md5_pe_1\nSRR1234567,bytes_1,123\nSRR1234567,url_2,url_pe_2\nSRR1234567,md5_2,md5_pe_2\nSRR1234567,bytes_2,123\n");
529
530        let runs_pe_se = vec![run_pe_se];
531        let mut wtr = csv::Writer::from_writer(Vec::new());
532        print_csv_long(&mut wtr, runs_pe_se).unwrap();
533        let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
534        assert_eq!(data, "accession,variable,value\nSRR1234567,url_se,url_se\nSRR1234567,md5_se,md5_se\nSRR1234567,bytes_se,123\nSRR1234567,url_1,url_pe_1\nSRR1234567,md5_1,md5_pe_1\nSRR1234567,bytes_1,123\nSRR1234567,url_2,url_pe_2\nSRR1234567,md5_2,md5_pe_2\nSRR1234567,bytes_2,123\n");
535    }
536}