1use clap::{Parser, ValueEnum};
2use futures::StreamExt;
3use serde::{Deserialize, Serialize};
4use std::fs::File;
5use std::io::{BufRead, BufReader};
6use std::path::PathBuf;
7use std::process::exit;
8
9#[derive(Serialize, Deserialize, Clone, Debug)]
11struct ENAApiResponse {
12 run_accession: String,
13 fastq_ftp: String,
14 fastq_bytes: String,
15 fastq_md5: String,
16 submitted_ftp: String,
17 submitted_md5: String,
18 submitted_bytes: String,
19 sra_ftp: String,
20 sra_bytes: String,
21 sra_md5: String,
22}
23
24#[derive(Clone, Debug, Serialize, Deserialize)]
26#[serde(from = "ENAApiResponse")]
27pub struct Run {
28 pub accession: String,
29 reads: Vec<Reads>,
30}
31
32impl Run {
33 pub fn clean_single_end(&mut self) {
37 if self.reads.len() == 3 {
38 self.reads.remove(0);
39 }
40 }
41}
42
43#[derive(Debug, ValueEnum, Clone)]
44pub enum OutputFormat {
45 Json,
46 Csv,
47 CsvWide,
48 CsvLong,
49}
50
51impl From<ENAApiResponse> for Run {
55 fn from(response: ENAApiResponse) -> Self {
56 let fastq_ftp_array = response.fastq_ftp.split(';').collect::<Vec<&str>>();
57 let fastq_bytes_array = response.fastq_bytes.split(';').collect::<Vec<&str>>();
58 let fastq_md5_array = response.fastq_md5.split(';').collect::<Vec<&str>>();
59 let mut reads: Vec<Reads> = Vec::new();
60 for i in 0..fastq_ftp_array.len() {
61 reads.push(Reads {
62 url: format!(
63 "ftp://{address}",
64 address = fastq_ftp_array[i].to_string().to_owned()
65 ),
66 bytes: match fastq_bytes_array[i].parse::<u32>() {
67 Ok(n) => n,
68 Err(_) => {
69 eprintln!(
70 "Could not parse {} as a number of bytes for accession {}",
71 fastq_bytes_array[i], response.run_accession
72 );
73 0
74 }
75 },
76 md5: fastq_md5_array[i].to_string().to_owned(),
77 });
78 }
79 Self {
80 accession: response.run_accession,
81 reads,
82 }
83 }
84}
85
86#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
87struct Reads {
88 url: String,
89 md5: String,
90 bytes: u32,
91}
92
93async fn query_ena(
95 accession: &String,
96 client: &reqwest::Client,
97) -> Result<Vec<Run>, reqwest::Error> {
98 let request_url = format!("https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run&format=json", accession = accession);
99 let response = client.get(&request_url).send().await?;
100 let runs: Vec<Run> = response.json().await?;
101 Ok(runs)
102}
103
104pub async fn concurrent_query_ena(accessions: Vec<String>, num_requests: usize) -> Vec<Run> {
107 let client = reqwest::Client::new();
108 futures::stream::iter({
109 accessions.iter().map(|accession| {
110 let client = client.clone();
111 eprintln!("Querying ENA for accession: {}", accession);
112 async move {
113 match query_ena(accession, &client).await {
114 Ok(run) => Some(run),
115 Err(e) => {
116 eprintln!("Error querying ENA for accession: {}", accession);
117 eprintln!("Error: {}", e);
118 None
119 }
120 }
121 }
122 })
123 })
124 .buffer_unordered(num_requests)
125 .collect::<Vec<_>>()
126 .await
127 .into_iter()
128 .filter_map(|run| run)
129 .flatten()
130 .collect::<Vec<Run>>()
131}
132
133#[derive(Parser, Debug)]
135#[clap(author, version, about, long_about = None)]
136pub struct Args {
137 #[clap(short, long, value_parser, multiple = true, validator = validate_accession, required_unless_present = "file")]
138 pub accession: Vec<String>,
140
141 #[clap(
142 short = 'n',
143 long = "num-requests",
144 value_name = "NUM",
145 default_value = "1",
146 help = "Maximum number of concurrent requests to make to the ENA API (max of 10 are allowed)"
147 )]
148 pub num_requests: u8,
153
154 #[clap(
155 short,
156 long,
157 value_name = "FILE",
158 help = "File containing accessions to query",
159 required_unless_present = "accession"
160 )]
161 pub file: Option<PathBuf>,
165
166 #[clap(
167 short,
168 long,
169 help = "Keep single end reads if there are paired end reads too"
170 )]
171 pub keep_single_end: bool,
175
176 #[clap(
177 value_enum,
178 short = 'o',
179 long = "output-format",
180 value_name = "FORMAT",
181 default_value_t = OutputFormat::Json,
182 help = "Format for output of data."
183 )]
184 pub format: OutputFormat,
188}
189
190pub fn parse_args() -> Args {
191 Args::parse()
192}
193
/// Checks that `accession` looks like a run accession.
///
/// A valid accession is one of the prefixes `SRR`, `ERR` or `DRR` followed by
/// 6 to 10 ASCII digits and nothing else (the same set of strings as the
/// pattern `^(SRR|ERR|DRR)[0-9]{6,10}$`).
///
/// The check is done with plain string operations so that no regular
/// expression has to be compiled on every call; this function runs once per
/// command-line value and once per line of an accession file.
///
/// # Errors
///
/// Returns `"<accession> is not a valid accession number"` when the input
/// does not have that shape.
fn validate_accession(accession: &str) -> Result<(), String> {
    const PREFIXES: [&str; 3] = ["SRR", "ERR", "DRR"];

    let is_valid = PREFIXES
        .iter()
        .find_map(|prefix| accession.strip_prefix(*prefix))
        .map_or(false, |digits| {
            (6..=10).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_digit())
        });

    if is_valid {
        Ok(())
    } else {
        Err(format!("{} is not a valid accession number", accession))
    }
}
204
/// Clamps the requested concurrency to the range `1..=10`.
///
/// Values inside the range are returned unchanged. `0` becomes `1` and
/// anything above `10` becomes `10`; in both cases a notice is printed to
/// stderr.
pub fn check_num_requests(num_requests: u8) -> usize {
    match num_requests {
        0 => {
            eprintln!("Number of requests should be at least 1. Setting number of requests to 1.");
            1
        }
        1..=10 => usize::from(num_requests),
        _ => {
            eprintln!("To be nice to ENA, we only allow up to 10 concurrent requests. Setting number of requests to 10.");
            10
        }
    }
}
221
222pub fn read_accessions(file: &PathBuf) -> Vec<String> {
227 let file = match File::open(file) {
228 Ok(file) => file,
229 Err(e) => {
230 eprintln!("Error opening file: {}", e);
231 exit(1);
232 }
233 };
234 let reader = BufReader::new(file);
235 reader
236 .lines()
237 .into_iter()
238 .filter_map(|line| line.ok())
239 .filter_map(|line| if line.is_empty() { None } else { Some(line) })
240 .filter_map(|line| match validate_accession(line.as_str()) {
241 Ok(_) => Some(line),
242 Err(e) => {
243 eprintln!("Error validating accession: {}. Ignoring this value...", e);
244 None
245 }
246 })
247 .collect()
248}
249
250pub fn print_csv<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>) -> Result<(), std::io::Error> {
252 for run in runs {
253 wtr.write_record(&["accession", "url", "md5", "bytes"])?;
254 for read in run.reads {
255 wtr.write_record(&[&run.accession, &read.url, &read.md5, &read.bytes.to_string()])?;
256 }
257 }
258 wtr.flush()?;
259 Ok(())
260}
261
262pub fn print_csv_wide<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>, keep_single_end: bool) -> Result<(), std::io::Error> {
264 wtr.write_record(&["accession", "url_se", "md5_se", "bytes_1", "url_1", "md5_1", "bytes_se", "url_2", "md5_2", "bytes_2"])?;
265 for run in runs {
266 match run.reads.len() {
267 1 if keep_single_end==true => wtr.write_record(&[&run.accession, &run.reads[0].url, &run.reads[0].md5, &run.reads[0].bytes.to_string(), "", "", "", "", "", ""])?,
268 2 => wtr.write_record(&[&run.accession, "", "", "", &run.reads[0].url, &run.reads[0].md5, &run.reads[0].bytes.to_string(), &run.reads[1].url, &run.reads[1].md5, &run.reads[1].bytes.to_string()])?,
269 3 if keep_single_end==true => wtr.write_record(&[&run.accession, &run.reads[0].url, &run.reads[0].md5, &run.reads[0].bytes.to_string(), &run.reads[1].url, &run.reads[1].md5, &run.reads[1].bytes.to_string(), &run.reads[2].url, &run.reads[2].md5, &run.reads[2].bytes.to_string()])?,
270 _ => {
271 eprintln!("Found too many or too few reads for {}", &run.accession);
272 exit(1);
273 }
274 }
275 }
276 wtr.flush()?;
277 Ok(())
278}
279
280pub fn print_csv_long<W: std::io::Write>(wtr: &mut csv::Writer<W>, runs: Vec<Run>) -> Result<(), std::io::Error> {
282 wtr.write_record(&["accession", "variable", "value"])?;
283 for run in runs {
284 match run.reads.len() {
285 1 => {
286 wtr.write_record(&[&run.accession, "url_se", &run.reads[0].url])?;
287 wtr.write_record(&[&run.accession, "md5_se", &run.reads[0].md5])?;
288 wtr.write_record(&[&run.accession, "bytes_se", &run.reads[0].bytes.to_string()])?;
289 },
290 2 => {
291 wtr.write_record(&[&run.accession, "url_1", &run.reads[0].url])?;
292 wtr.write_record(&[&run.accession, "md5_1", &run.reads[0].md5])?;
293 wtr.write_record(&[&run.accession, "bytes_1", &run.reads[0].bytes.to_string()])?;
294 wtr.write_record(&[&run.accession, "url_2", &run.reads[1].url])?;
295 wtr.write_record(&[&run.accession, "md5_2", &run.reads[1].md5])?;
296 wtr.write_record(&[&run.accession, "bytes_2", &run.reads[1].bytes.to_string()])?;
297 },
298 3 => {
299 wtr.write_record(&[&run.accession, "url_se", &run.reads[0].url])?;
300 wtr.write_record(&[&run.accession, "md5_se", &run.reads[0].md5])?;
301 wtr.write_record(&[&run.accession, "bytes_se", &run.reads[0].bytes.to_string()])?;
302 wtr.write_record(&[&run.accession, "url_1", &run.reads[1].url])?;
303 wtr.write_record(&[&run.accession, "md5_1", &run.reads[1].md5])?;
304 wtr.write_record(&[&run.accession, "bytes_1", &run.reads[1].bytes.to_string()])?;
305 wtr.write_record(&[&run.accession, "url_2", &run.reads[2].url])?;
306 wtr.write_record(&[&run.accession, "md5_2", &run.reads[2].md5])?;
307 wtr.write_record(&[&run.accession, "bytes_2", &run.reads[2].bytes.to_string()])?;
308 },
309 _ => {
310 eprintln!("Found too many or too few reads for {}", &run.accession);
311 exit(1);
312 }
313 }
314 }
315 wtr.flush()?;
316 Ok(())
317}
318
319
320#[cfg(test)]
321mod tests {
322 use super::*;
323
324 #[test]
325 fn test_validate_srr_accession() {
326 let accession = "SRR1234567";
327 let result = validate_accession(accession);
328 assert!(result.is_ok());
329 }
330
331 #[test]
332 fn test_validate_err_accession() {
333 let accession = "ERR1234567";
334 let result = validate_accession(accession);
335 assert!(result.is_ok());
336 }
337
338 #[test]
339 fn test_validate_drr_accession() {
340 let accession = "DRR1234567";
341 let result = validate_accession(accession);
342 assert!(result.is_ok());
343 }
344
345 #[test]
346 fn test_validate_invalid_accession() {
347 let accession = "1234567";
348 let result = validate_accession(accession);
349 assert!(result.is_err());
350 }
351
352 #[test]
353 fn test_check_num_requests_valid() {
354 let num_requests = 5;
355 let result = check_num_requests(num_requests);
356 assert_eq!(result, 5);
357 }
358
359 #[test]
360 fn test_check_num_requests_invalid_less_than_1() {
361 let num_requests = 0;
362 let result = check_num_requests(num_requests);
363 assert_eq!(result, 1);
364 }
365
366 #[test]
367 fn test_check_num_requests_invalid_greater_than_10() {
368 let num_requests = 11;
369 let result = check_num_requests(num_requests);
370 assert_eq!(result, 10);
371 }
372
373 #[test]
374 fn test_removal_single_reads() {
375 let read_se = Reads {
376 url: "read.fastq.gz".to_string(),
377 md5: "md5".to_string(),
378 bytes: 123,
379 };
380 let read_pe_1 = Reads {
381 url: "read_1.fastq.gz".to_string(),
382 md5: "md5".to_string(),
383 bytes: 123,
384 };
385 let read_pe_2 = Reads {
386 url: "read_2.fastq.gz".to_string(),
387 md5: "md5".to_string(),
388 bytes: 123,
389 };
390 let reads_se = vec![read_se.clone()];
391 let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
392 let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
393 let run_se = Run {
394 accession: "SRR1234567".to_string(),
395 reads: reads_se,
396 };
397 let run_pe = Run {
398 accession: "SRR1234567".to_string(),
399 reads: reads_pe,
400 };
401 let run_pe_se = Run {
402 accession: "SRR1234567".to_string(),
403 reads: reads_pe_se,
404 };
405 let mut runs = vec![run_se, run_pe, run_pe_se];
406 runs.iter_mut().for_each(|run| run.clean_single_end());
407 assert_eq!(runs[0].reads[0], read_se);
408 assert_eq!(runs[1].reads[0], read_pe_1);
409 assert_eq!(runs[1].reads[1], read_pe_2);
410 assert_eq!(runs[2].reads[0], read_pe_1);
411 assert_eq!(runs[2].reads[1], read_pe_2);
412 }
413
414 #[test]
415 fn test_print_csv() {
416 let read = Reads {
417 url: "url".to_string(),
418 md5: "md5".to_string(),
419 bytes: 123,
420 };
421 let reads = vec![read.clone()];
422 let run = Run {
423 accession: "accession".to_string(),
424 reads: reads,
425 };
426 let runs = vec![run];
427 let mut wtr = csv::Writer::from_writer(Vec::new());
428 print_csv(&mut wtr, runs).unwrap();
429 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
430 assert_eq!(data, "accession,url,md5,bytes\naccession,url,md5,123\n");
431 }
432
433 #[test]
434 fn test_print_csv_wide() {
435 let read_se = Reads {
436 url: "url_se".to_string(),
437 md5: "md5_se".to_string(),
438 bytes: 123,
439 };
440 let read_pe_1 = Reads {
441 url: "url_pe_1".to_string(),
442 md5: "md5_pe_1".to_string(),
443 bytes: 123,
444 };
445 let read_pe_2 = Reads {
446 url: "url_pe_2".to_string(),
447 md5: "md5_pe_2".to_string(),
448 bytes: 123,
449 };
450 let reads_se = vec![read_se.clone()];
451 let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
452 let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
453 let run_se = Run {
454 accession: "SRR1234567".to_string(),
455 reads: reads_se,
456 };
457 let run_pe = Run {
458 accession: "SRR1234567".to_string(),
459 reads: reads_pe,
460 };
461 let run_pe_se = Run {
462 accession: "SRR1234567".to_string(),
463 reads: reads_pe_se,
464 };
465
466 let runs = vec![run_se];
467 let mut wtr = csv::Writer::from_writer(Vec::new());
468 print_csv_wide(&mut wtr, runs, true).unwrap();
469 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
470 assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,url_se,md5_se,123,,,,,,\n");
471
472 let runs_pe = vec![run_pe];
473 let mut wtr = csv::Writer::from_writer(Vec::new());
474 print_csv_wide(&mut wtr, runs_pe, false).unwrap();
475 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
476 assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,,,,url_pe_1,md5_pe_1,123,url_pe_2,md5_pe_2,123\n");
477
478 let runs_pe_se = vec![run_pe_se];
479 let mut wtr = csv::Writer::from_writer(Vec::new());
480 print_csv_wide(&mut wtr, runs_pe_se, true).unwrap();
481 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
482 assert_eq!(data, "accession,url_se,md5_se,bytes_1,url_1,md5_1,bytes_se,url_2,md5_2,bytes_2\nSRR1234567,url_se,md5_se,123,url_pe_1,md5_pe_1,123,url_pe_2,md5_pe_2,123\n");
483 }
484
485 #[test]
486 fn test_print_csv_long() {
487 let read_se = Reads {
488 url: "url_se".to_string(),
489 md5: "md5_se".to_string(),
490 bytes: 123,
491 };
492 let read_pe_1 = Reads {
493 url: "url_pe_1".to_string(),
494 md5: "md5_pe_1".to_string(),
495 bytes: 123,
496 };
497 let read_pe_2 = Reads {
498 url: "url_pe_2".to_string(),
499 md5: "md5_pe_2".to_string(),
500 bytes: 123,
501 };
502 let reads_se = vec![read_se.clone()];
503 let reads_pe = vec![read_pe_1.clone(), read_pe_2.clone()];
504 let reads_pe_se = vec![read_se.clone(), read_pe_1.clone(), read_pe_2.clone()];
505 let run_se = Run {
506 accession: "SRR1234567".to_string(),
507 reads: reads_se,
508 };
509 let run_pe = Run {
510 accession: "SRR1234567".to_string(),
511 reads: reads_pe,
512 };
513 let run_pe_se = Run {
514 accession: "SRR1234567".to_string(),
515 reads: reads_pe_se,
516 };
517
518 let runs = vec![run_se];
519 let mut wtr = csv::Writer::from_writer(Vec::new());
520 print_csv_long(&mut wtr, runs).unwrap();
521 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
522 assert_eq!(data, "accession,variable,value\nSRR1234567,url_se,url_se\nSRR1234567,md5_se,md5_se\nSRR1234567,bytes_se,123\n");
523
524 let runs_pe = vec![run_pe];
525 let mut wtr = csv::Writer::from_writer(Vec::new());
526 print_csv_long(&mut wtr, runs_pe).unwrap();
527 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
528 assert_eq!(data, "accession,variable,value\nSRR1234567,url_1,url_pe_1\nSRR1234567,md5_1,md5_pe_1\nSRR1234567,bytes_1,123\nSRR1234567,url_2,url_pe_2\nSRR1234567,md5_2,md5_pe_2\nSRR1234567,bytes_2,123\n");
529
530 let runs_pe_se = vec![run_pe_se];
531 let mut wtr = csv::Writer::from_writer(Vec::new());
532 print_csv_long(&mut wtr, runs_pe_se).unwrap();
533 let data = String::from_utf8(wtr.into_inner().unwrap()).unwrap();
534 assert_eq!(data, "accession,variable,value\nSRR1234567,url_se,url_se\nSRR1234567,md5_se,md5_se\nSRR1234567,bytes_se,123\nSRR1234567,url_1,url_pe_1\nSRR1234567,md5_1,md5_pe_1\nSRR1234567,bytes_1,123\nSRR1234567,url_2,url_pe_2\nSRR1234567,md5_2,md5_pe_2\nSRR1234567,bytes_2,123\n");
535 }
536}