1use anyhow::Result;
8use clap::Parser;
9use rustalign_aligner::{AlignerParams, AlignerWithRef, PeConstraints, PePolicy};
10use rustalign_common::{Nuc, Strand};
11use rustalign_io::{FastqReader, SamConfig, SamOpt, SamRecord, SamWriter};
12
13#[derive(Debug, Parser)]
18#[command(name = "ferroux-align")]
19#[command(author = "Ben Langmead <langmea@cs.jhu.edu>")]
20#[command(version)]
21#[command(about = "Ultrafast and memory-efficient read aligner", long_about = None)]
22pub struct AlignOptions {
23 #[clap(short = 'x', long)]
26 pub index: String,
27
28 #[clap(short = 'U', long, value_name = "r")]
30 pub reads_unpaired: Vec<String>,
31
32 #[clap(short = '1', long, value_name = "m1")]
34 pub reads_1: Vec<String>,
35
36 #[clap(short = '2', long, value_name = "m2")]
38 pub reads_2: Vec<String>,
39
40 #[clap(short = 'S', long, value_name = "sam")]
42 pub sam_file: Option<String>,
43
44 #[clap(short = 'q', long, conflicts_with = "fasta")]
47 pub fastq: bool,
48
49 #[clap(short = 'f', long, conflicts_with = "fastq")]
51 pub fasta: bool,
52
53 #[clap(short = 'c', long)]
55 pub sequences: bool,
56
57 #[clap(short = 's', long, value_name = "int")]
59 pub skip: Option<usize>,
60
61 #[clap(short = 'u', long = "upto", value_name = "int")]
63 pub upto: Option<usize>,
64
65 #[clap(short = '5', long = "trim5", value_name = "int")]
67 pub trim5: Option<usize>,
68
69 #[clap(short = '3', long = "trim3", value_name = "int")]
71 pub trim3: Option<usize>,
72
73 #[clap(short = 'N', long, value_name = "int")]
76 pub seed_mismatches: Option<u8>,
77
78 #[clap(short = 'L', long, value_name = "int", default_value = "22")]
80 pub seed_length: usize,
81
82 #[clap(long, conflicts_with = "local")]
84 pub end_to_end: bool,
85
86 #[clap(long, conflicts_with = "end_to_end")]
88 pub local: bool,
89
90 #[clap(long = "nofw")]
92 pub no_forward: bool,
93
94 #[clap(long = "norc")]
96 pub no_reverse: bool,
97
98 #[clap(long = "ma", value_name = "int")]
101 pub match_bonus: Option<i32>,
102
103 #[clap(long = "mp", value_name = "int", default_value = "6")]
105 pub mismatch_penalty: i32,
106
107 #[clap(long = "np", value_name = "int", default_value = "1")]
109 pub n_penalty: i32,
110
111 #[clap(long = "rdg", value_name = "int,int")]
113 pub read_gap: Option<String>,
114
115 #[clap(long = "rfg", value_name = "int,int")]
117 pub ref_gap: Option<String>,
118
119 #[clap(short = 'k', long, value_name = "int")]
122 pub report_alignments: Option<usize>,
123
124 #[clap(short = 'a', long = "all")]
126 pub report_all: bool,
127
128 #[clap(short = 'D', long, value_name = "int", default_value = "15")]
131 pub failed_extends: usize,
132
133 #[clap(short = 'R', long, value_name = "int", default_value = "2")]
135 pub seed_sets: usize,
136
137 #[clap(short = 'I', long = "minins", value_name = "int", default_value = "0")]
140 pub min_insert: usize,
141
142 #[clap(
144 short = 'X',
145 long = "maxins",
146 value_name = "int",
147 default_value = "500"
148 )]
149 pub max_insert: usize,
150
151 #[clap(long)]
153 pub orientation: Option<String>,
154
155 #[clap(short = 'p', long, value_name = "int", default_value = "1")]
158 pub threads: usize,
159
160 #[clap(long = "reorder")]
162 pub reorder: bool,
163
164 #[clap(short = 't', long = "time")]
167 pub print_time: bool,
168
169 #[clap(long)]
171 pub quiet: bool,
172
173 #[clap(long = "no-unal")]
175 pub no_unaligned: bool,
176
177 #[clap(long = "no-head")]
179 pub no_head: bool,
180
181 #[clap(long = "no-sq")]
183 pub no_sq: bool,
184
185 #[clap(long = "xeq")]
187 pub xeq: bool,
188}
189
190pub fn run() -> Result<()> {
192 let opts = AlignOptions::parse();
193 run_with_opts(opts)
194}
195
196pub fn run_with_options(opts: AlignOptions) -> Result<()> {
198 run_with_opts(opts)
199}
200
201#[allow(dead_code)]
202fn main() -> Result<()> {
203 run()
204}
205
206fn run_with_opts(opts: AlignOptions) -> Result<()> {
207 let index_with_ref = rustalign_fmindex::EbwtWithRef::load(&opts.index)?;
209 let ebwt = index_with_ref.ebwt();
210 let ref_names = ebwt.refnames().to_vec();
211 let ref_lengths = ebwt.plen().to_vec();
212
213 if let Some(ref sam_file) = opts.sam_file {
215 let file = std::fs::File::create(sam_file)?;
216 align_with_writer(&opts, file, &ref_names, &ref_lengths, index_with_ref)?;
217 } else {
218 let stdout = std::io::stdout();
219 align_with_writer(&opts, stdout, &ref_names, &ref_lengths, index_with_ref)?;
220 }
221
222 Ok(())
223}
224
225fn align_with_writer<W: std::io::Write>(
226 opts: &AlignOptions,
227 writer: W,
228 ref_names: &[String],
229 ref_lengths: &[u32],
230 index_with_ref: rustalign_fmindex::EbwtWithRef,
231) -> Result<()> {
232 let config = SamConfig::default();
234 let mut writer = SamWriter::new(writer, config);
235 writer.write_header(ref_names, ref_lengths)?;
236
237 if !opts.reads_unpaired.is_empty() {
239 align_unpaired(opts, &mut writer, index_with_ref)?;
240 } else if !opts.reads_1.is_empty() && !opts.reads_2.is_empty() {
241 align_paired(opts, &mut writer, index_with_ref)?;
242 }
243
244 Ok(())
245}
246
247fn align_unpaired<W: std::io::Write>(
248 opts: &AlignOptions,
249 writer: &mut SamWriter<W>,
250 index_with_ref: rustalign_fmindex::EbwtWithRef,
251) -> Result<()> {
252 let reads_file = &opts.reads_unpaired[0];
253 let reader = FastqReader::from_path(reads_file)?;
254
255 let aligner_params = rustalign_aligner::AlignerParams::default();
257 let aligner = rustalign_aligner::AlignerWithRef::new(index_with_ref, aligner_params);
258
259 let index_for_resolution = aligner.index();
261
262 for record in reader {
263 let record = record?;
264
265 let nucs = record.seq.clone();
267
268 let nucs_rc: Vec<rustalign_common::Nuc> =
270 nucs.iter().rev().map(|&n| n.complement()).collect();
271
272 let result = aligner.align(&nucs, &nucs_rc);
274
275 let mut sam_rec = rustalign_io::SamRecord::new(record.id.clone());
277 sam_rec.seq = nucs.clone();
278 sam_rec.qual = record.qual;
279
280 if !result.alignments.is_empty() {
282 let read_len = nucs.len() as u64;
283 let index_is_forward = true;
284
285 let mut resolved: Vec<(usize, String, u64, Strand, String)> = Vec::new();
288
289 for (idx, aln) in result.alignments.iter().enumerate() {
290 match index_for_resolution.ebwt().resolve_position(
291 aln.bwt_row,
292 read_len,
293 index_is_forward,
294 ) {
295 Ok((chr_name, pos, _chr_len)) => {
296 let adjusted_pos = if pos > aln.seed_offset as u64 {
298 pos - aln.seed_offset as u64
299 } else {
300 1
301 };
302
303 resolved.push((idx, chr_name, adjusted_pos, aln.strand, aln.cigar.clone()));
304 }
305 Err(_) => continue,
306 }
307 }
308
309 fn chromosome_order(chr: &str) -> u32 {
312 match chr {
313 "Mt" => 1, "Pt" => 2, "1" => 3,
316 "2" => 4,
317 "3" => 5,
318 "4" => 6,
319 "5" => 7,
320 _ => 99, }
322 }
323
324 fn is_organelle(chr: &str) -> bool {
328 chr == "Mt" || chr == "Pt"
329 }
330
331 if let Some((_idx, chr_name, adjusted_pos, strand, cigar)) =
342 resolved.iter().min_by(|a, b| {
343 let aln_a = &result.alignments[a.0];
345 let aln_b = &result.alignments[b.0];
346
347 match aln_b.score.cmp(&aln_a.score) {
349 std::cmp::Ordering::Equal => {
350 let a_organelle = is_organelle(&a.1);
352 let b_organelle = is_organelle(&b.1);
353 match (a_organelle, b_organelle) {
354 (true, false) => std::cmp::Ordering::Less,
355 (false, true) => std::cmp::Ordering::Greater,
356 _ => {
357 match aln_a.seed_hit_count.cmp(&aln_b.seed_hit_count) {
360 std::cmp::Ordering::Equal => {
361 match chromosome_order(&a.1)
363 .cmp(&chromosome_order(&b.1))
364 {
365 std::cmp::Ordering::Equal => {
366 match a.2.cmp(&b.2) {
368 std::cmp::Ordering::Equal => {
369 match (aln_a.strand, aln_b.strand) {
371 (
372 Strand::Forward,
373 Strand::Reverse,
374 ) => std::cmp::Ordering::Less,
375 (
376 Strand::Reverse,
377 Strand::Forward,
378 ) => std::cmp::Ordering::Greater,
379 _ => aln_a
380 .seed_offset
381 .cmp(&aln_b.seed_offset),
382 }
383 }
384 other => other,
385 }
386 }
387 other => other,
388 }
389 }
390 other => other,
391 }
392 }
393 }
394 }
395 other => other,
396 }
397 })
398 {
399 sam_rec.rname = chr_name.clone();
400 sam_rec.pos = *adjusted_pos as i32;
401 sam_rec.flag = if *strand == Strand::Reverse { 16 } else { 0 };
402 sam_rec.mapq = 255;
403 sam_rec.cigar = if cigar.is_empty() {
404 format!("{}M", nucs.len())
405 } else {
406 cigar.clone()
407 };
408 } else {
409 if !opts.quiet {
411 eprintln!(
412 "DEBUG: Position resolution failed for all alignments of {}",
413 record.id
414 );
415 }
416 }
417 } else {
418 if !opts.quiet {
420 if let Some(err) = result.error {
421 eprintln!("DEBUG: No alignment for {}: {}", record.id, err);
422 } else {
423 eprintln!("DEBUG: No alignment for {} (unknown reason)", record.id);
424 }
425 }
426 }
427 if sam_rec.rname == "*" && opts.no_unaligned {
431 continue;
432 }
433
434 writer.write(&sam_rec)?;
435 }
436
437 Ok(())
438}
439
440#[derive(Clone)]
442#[allow(dead_code)]
443struct ResolvedAlignment {
444 idx: usize,
446 chr_name: String,
448 chr_idx: u32,
450 pos: u64,
452 strand: Strand,
454 cigar: String,
456 aln: rustalign_aligner::Alignment,
458}
459
460fn align_paired<W: std::io::Write>(
461 opts: &AlignOptions,
462 writer: &mut SamWriter<W>,
463 index_with_ref: rustalign_fmindex::EbwtWithRef,
464) -> Result<()> {
465 let r1_file = &opts.reads_1[0];
466 let r2_file = &opts.reads_2[0];
467
468 let reader1 = FastqReader::from_path(r1_file)?;
469 let reader2 = FastqReader::from_path(r2_file)?;
470
471 let aligner_params = AlignerParams::default();
473 let aligner = AlignerWithRef::new(index_with_ref, aligner_params);
474
475 let index_for_resolution = aligner.index();
477
478 let pe_policy = match opts.orientation.as_deref() {
480 Some("fr") | None => PePolicy::FR, Some("rf") => PePolicy::RF,
482 Some("ff") => PePolicy::FF,
483 Some("rr") => PePolicy::RR,
484 _ => {
485 return Err(anyhow::anyhow!(
486 "Invalid orientation: {}. Use fr, rf, ff, or rr",
487 opts.orientation.as_ref().unwrap()
488 ));
489 }
490 };
491
492 let pe_constraints =
493 PeConstraints::new(opts.min_insert as u32, opts.max_insert as u32, pe_policy);
494
495 for (record1, record2) in reader1.zip(reader2) {
497 let record1 = record1?;
498 let record2 = record2?;
499
500 let nucs1 = record1.seq.clone();
502 let nucs2 = record2.seq.clone();
503
504 let nucs1_rc: Vec<Nuc> = nucs1.iter().rev().map(|&n| n.complement()).collect();
506 let nucs2_rc: Vec<Nuc> = nucs2.iter().rev().map(|&n| n.complement()).collect();
507
508 let result1 = aligner.align(&nucs1, &nucs1_rc);
510 let result2 = aligner.align(&nucs2, &nucs2_rc);
511
512 let mut resolved1: Vec<ResolvedAlignment> = Vec::new();
514 if !result1.alignments.is_empty() {
515 for (idx, aln) in result1.alignments.iter().enumerate() {
516 if let Ok((chr_name, pos, _chr_len)) = index_for_resolution.ebwt().resolve_position(
517 aln.bwt_row,
518 nucs1.len() as u64,
519 true,
520 ) {
521 let adjusted_pos = if pos > aln.seed_offset as u64 {
522 pos - aln.seed_offset as u64
523 } else {
524 1
525 };
526
527 let chr_idx = index_for_resolution
529 .ebwt()
530 .refnames()
531 .iter()
532 .position(|n| n == &chr_name)
533 .unwrap_or(0) as u32;
534
535 resolved1.push(ResolvedAlignment {
536 idx,
537 chr_name,
538 chr_idx,
539 pos: adjusted_pos,
540 strand: aln.strand,
541 cigar: aln.cigar.clone(),
542 aln: aln.clone(),
543 });
544 }
545 }
546 }
547
548 let mut resolved2: Vec<ResolvedAlignment> = Vec::new();
550 if !result2.alignments.is_empty() {
551 for (idx, aln) in result2.alignments.iter().enumerate() {
552 if let Ok((chr_name, pos, _chr_len)) = index_for_resolution.ebwt().resolve_position(
553 aln.bwt_row,
554 nucs2.len() as u64,
555 true,
556 ) {
557 let adjusted_pos = if pos > aln.seed_offset as u64 {
558 pos - aln.seed_offset as u64
559 } else {
560 1
561 };
562
563 let chr_idx = index_for_resolution
564 .ebwt()
565 .refnames()
566 .iter()
567 .position(|n| n == &chr_name)
568 .unwrap_or(0) as u32;
569
570 resolved2.push(ResolvedAlignment {
571 idx,
572 chr_name,
573 chr_idx,
574 pos: adjusted_pos,
575 strand: aln.strand,
576 cigar: aln.cigar.clone(),
577 aln: aln.clone(),
578 });
579 }
580 }
581 }
582
583 let best_pair = find_best_pair(
585 &resolved1,
586 &resolved2,
587 &pe_constraints,
588 nucs1.len() as u32,
589 nucs2.len() as u32,
590 );
591
592 match best_pair {
594 Some((r1, r2, insert_size, is_concordant)) => {
595 let mut sam1 = SamRecord::new(record1.id.clone());
597 sam1.seq = nucs1.clone();
598 sam1.qual = record1.qual.clone();
599 sam1.rname = r1.chr_name.clone();
600 sam1.pos = r1.pos as i32;
601 sam1.flag = calculate_flag(r1.strand, r2.strand, true, true, is_concordant);
602 sam1.mapq = 255;
603 sam1.cigar = if r1.cigar.is_empty() {
604 format!("{}M", nucs1.len())
605 } else {
606 r1.cigar.clone()
607 };
608 sam1.rnext = if r1.chr_name == r2.chr_name {
609 "=".to_string()
610 } else {
611 r2.chr_name.clone()
612 };
613 sam1.pnext = r2.pos as i32;
614 sam1.tlen = insert_size;
615 sam1.add_opt(SamOpt::Int("NM".to_string(), r1.aln.edits as i64));
616 writer.write(&sam1)?;
617
618 let mut sam2 = SamRecord::new(record2.id.clone());
620 sam2.seq = nucs2.clone();
621 sam2.qual = record2.qual.clone();
622 sam2.rname = r2.chr_name.clone();
623 sam2.pos = r2.pos as i32;
624 sam2.flag = calculate_flag(r2.strand, r1.strand, false, true, is_concordant);
625 sam2.mapq = 255;
626 sam2.cigar = if r2.cigar.is_empty() {
627 format!("{}M", nucs2.len())
628 } else {
629 r2.cigar.clone()
630 };
631 sam2.rnext = if r1.chr_name == r2.chr_name {
632 "=".to_string()
633 } else {
634 r1.chr_name.clone()
635 };
636 sam2.pnext = r1.pos as i32;
637 sam2.tlen = -insert_size; sam2.add_opt(SamOpt::Int("NM".to_string(), r2.aln.edits as i64));
639 writer.write(&sam2)?;
640 }
641 None => {
642 if !opts.no_unaligned {
644 if let Some(r1) = select_best_alignment(&resolved1) {
646 let mut sam1 = SamRecord::new(record1.id.clone());
647 sam1.seq = nucs1;
648 sam1.qual = record1.qual;
649 sam1.rname = r1.chr_name;
650 sam1.pos = r1.pos as i32;
651 sam1.flag = calculate_flag(r1.strand, Strand::Forward, true, false, false);
652 sam1.mapq = 255;
653 sam1.cigar = format!("{}M", sam1.seq.len());
654 sam1.rnext = "*".to_string();
655 sam1.pnext = 0;
656 sam1.tlen = 0;
657 writer.write(&sam1)?;
658 } else {
659 let mut sam1 = SamRecord::new(record1.id.clone());
661 sam1.seq = nucs1;
662 sam1.qual = record1.qual;
663 sam1.flag = 0x4 | 0x40 | 0x8; writer.write(&sam1)?;
665 }
666
667 if let Some(r2) = select_best_alignment(&resolved2) {
668 let mut sam2 = SamRecord::new(record2.id.clone());
669 sam2.seq = nucs2;
670 sam2.qual = record2.qual;
671 sam2.rname = r2.chr_name;
672 sam2.pos = r2.pos as i32;
673 sam2.flag = calculate_flag(r2.strand, Strand::Forward, false, false, false);
674 sam2.mapq = 255;
675 sam2.cigar = format!("{}M", sam2.seq.len());
676 sam2.rnext = "*".to_string();
677 sam2.pnext = 0;
678 sam2.tlen = 0;
679 writer.write(&sam2)?;
680 } else {
681 let mut sam2 = SamRecord::new(record2.id.clone());
683 sam2.seq = nucs2;
684 sam2.qual = record2.qual;
685 sam2.flag = 0x4 | 0x80 | 0x8; writer.write(&sam2)?;
687 }
688 }
689 }
690 }
691 }
692
693 Ok(())
694}
695
696fn find_best_pair(
698 r1_alignments: &[ResolvedAlignment],
699 r2_alignments: &[ResolvedAlignment],
700 constraints: &PeConstraints,
701 r1_len: u32,
702 r2_len: u32,
703) -> Option<(ResolvedAlignment, ResolvedAlignment, i32, bool)> {
704 let mut best_pair: Option<(ResolvedAlignment, ResolvedAlignment, i32, bool, i32)> = None;
705
706 for r1 in r1_alignments {
707 for r2 in r2_alignments {
708 let strand1 = r1.strand == Strand::Forward;
710 let strand2 = r2.strand == Strand::Forward;
711
712 if let Some(insert_size) = constraints.is_concordant(
713 r1.pos, r1_len, strand1, r1.chr_idx, r2.pos, r2_len, strand2, r2.chr_idx,
714 ) {
715 let combined_score = r1.aln.score + r2.aln.score;
717
718 let is_better = match &best_pair {
720 None => true,
721 Some((_, _, _, _, best_score)) => combined_score > *best_score,
722 };
723
724 if is_better {
725 best_pair = Some((r1.clone(), r2.clone(), insert_size, true, combined_score));
726 }
727 }
728 }
729 }
730
731 if best_pair.is_none() {
733 for r1 in r1_alignments {
734 for r2 in r2_alignments {
735 if r1.chr_idx == r2.chr_idx {
736 let _strand1 = r1.strand == Strand::Forward;
737 let _strand2 = r2.strand == Strand::Forward;
738
739 let left_pos = r1.pos.min(r2.pos);
741 let right_pos = (r2.pos + r2_len as u64).max(r1.pos + r1_len as u64);
742 let insert_size = (right_pos - left_pos) as i32;
743
744 let combined_score = r1.aln.score + r2.aln.score;
745
746 let is_better = match &best_pair {
747 None => true,
748 Some((_, _, _, _, best_score)) => combined_score > *best_score,
749 };
750
751 if is_better {
752 best_pair =
753 Some((r1.clone(), r2.clone(), insert_size, false, combined_score));
754 }
755 }
756 }
757 }
758 }
759
760 best_pair.map(|(r1, r2, insert, concordant, _)| (r1, r2, insert, concordant))
761}
762
763fn select_best_alignment(alignments: &[ResolvedAlignment]) -> Option<ResolvedAlignment> {
765 alignments
766 .iter()
767 .max_by(|a, b| {
768 match a.aln.score.cmp(&b.aln.score) {
770 std::cmp::Ordering::Equal => {
771 b.aln.seed_hit_count.cmp(&a.aln.seed_hit_count)
773 }
774 other => other,
775 }
776 })
777 .cloned()
778}
779
780fn calculate_flag(
782 strand: Strand,
783 mate_strand: Strand,
784 is_first: bool,
785 both_mapped: bool,
786 is_concordant: bool,
787) -> u16 {
788 let mut flag: u16 = 0;
789
790 flag |= 0x1;
792
793 if is_concordant {
795 flag |= 0x2;
796 }
797
798 if !both_mapped {
802 flag |= 0x8;
803 }
804
805 if strand == Strand::Reverse {
807 flag |= 0x10;
808 }
809
810 if mate_strand == Strand::Reverse {
812 flag |= 0x20;
813 }
814
815 if is_first {
817 flag |= 0x40;
818 } else {
819 flag |= 0x80;
820 }
821
822 flag
823}