Skip to main content

rustalign_cli/
align.rs

1//! ferroux-align - Main alignment tool
2//!
3//! This is the main RustAlign alignment tool that reads sequence reads,
4//! aligns them to a reference index using the FM-index, and outputs
5//! SAM format alignments.
6
7use anyhow::Result;
8use clap::Parser;
9use rustalign_aligner::{AlignerParams, AlignerWithRef, PeConstraints, PePolicy};
10use rustalign_common::{Nuc, Strand};
11use rustalign_io::{FastqReader, SamConfig, SamOpt, SamRecord, SamWriter};
12
13/// Ferroux aligner - Fast and sensitive read alignment
14///
15/// Ferroux is an ultrafast and memory-efficient tool for aligning
16/// sequencing reads to long reference sequences.
17#[derive(Debug, Parser)]
18#[command(name = "ferroux-align")]
19#[command(author = "Ben Langmead <langmea@cs.jhu.edu>")]
20#[command(version)]
21#[command(about = "Ultrafast and memory-efficient read aligner", long_about = None)]
22pub struct AlignOptions {
23    // ========== Required Arguments ==========
24    /// Index filename prefix (minus trailing .X.rai)
25    #[clap(short = 'x', long)]
26    pub index: String,
27
28    /// Files with unpaired reads
29    #[clap(short = 'U', long, value_name = "r")]
30    pub reads_unpaired: Vec<String>,
31
32    /// Files with #1 mates, paired with files in <m2>
33    #[clap(short = '1', long, value_name = "m1")]
34    pub reads_1: Vec<String>,
35
36    /// Files with #2 mates, paired with files in <m1>
37    #[clap(short = '2', long, value_name = "m2")]
38    pub reads_2: Vec<String>,
39
40    /// Output SAM file (default: stdout)
41    #[clap(short = 'S', long, value_name = "sam")]
42    pub sam_file: Option<String>,
43
44    // ========== Input Options ==========
45    /// Query input files are FASTQ .fq/.fastq (default)
46    #[clap(short = 'q', long, conflicts_with = "fasta")]
47    pub fastq: bool,
48
49    /// Query input files are (multi-)FASTA .fa/.mfa
50    #[clap(short = 'f', long, conflicts_with = "fastq")]
51    pub fasta: bool,
52
53    /// <m1>, <m2>, <r> are sequences themselves, not files
54    #[clap(short = 'c', long)]
55    pub sequences: bool,
56
57    /// Skip the first <int> reads/pairs in the input
58    #[clap(short = 's', long, value_name = "int")]
59    pub skip: Option<usize>,
60
61    /// Stop after first <int> reads/pairs
62    #[clap(short = 'u', long = "upto", value_name = "int")]
63    pub upto: Option<usize>,
64
65    /// Trim <int> bases from 5'/left end of reads
66    #[clap(short = '5', long = "trim5", value_name = "int")]
67    pub trim5: Option<usize>,
68
69    /// Trim <int> bases from 3'/right end of reads
70    #[clap(short = '3', long = "trim3", value_name = "int")]
71    pub trim3: Option<usize>,
72
73    // ========== Alignment Options ==========
74    /// Max # mismatches in seed alignment; can be 0 or 1
75    #[clap(short = 'N', long, value_name = "int")]
76    pub seed_mismatches: Option<u8>,
77
78    /// Length of seed substrings; must be >3, <32
79    #[clap(short = 'L', long, value_name = "int", default_value = "22")]
80    pub seed_length: usize,
81
82    /// Entire read must align; no clipping
83    #[clap(long, conflicts_with = "local")]
84    pub end_to_end: bool,
85
86    /// Local alignment; ends might be soft clipped
87    #[clap(long, conflicts_with = "end_to_end")]
88    pub local: bool,
89
90    /// Do not align forward (original) version of read
91    #[clap(long = "nofw")]
92    pub no_forward: bool,
93
94    /// Do not align reverse-complement version of read
95    #[clap(long = "norc")]
96    pub no_reverse: bool,
97
98    // ========== Scoring Options ==========
99    /// Match bonus (0 for --end-to-end, 2 for --local)
100    #[clap(long = "ma", value_name = "int")]
101    pub match_bonus: Option<i32>,
102
103    /// Max penalty for mismatch
104    #[clap(long = "mp", value_name = "int", default_value = "6")]
105    pub mismatch_penalty: i32,
106
107    /// Penalty for non-A/C/G/Ts in read/ref
108    #[clap(long = "np", value_name = "int", default_value = "1")]
109    pub n_penalty: i32,
110
111    /// Read gap open, extend penalties
112    #[clap(long = "rdg", value_name = "int,int")]
113    pub read_gap: Option<String>,
114
115    /// Reference gap open, extend penalties
116    #[clap(long = "rfg", value_name = "int,int")]
117    pub ref_gap: Option<String>,
118
119    // ========== Reporting Options ==========
120    /// Report up to <int> alignments per read
121    #[clap(short = 'k', long, value_name = "int")]
122    pub report_alignments: Option<usize>,
123
124    /// Report all alignments
125    #[clap(short = 'a', long = "all")]
126    pub report_all: bool,
127
128    // ========== Effort Options ==========
129    /// Give up extending after <int> failed extends
130    #[clap(short = 'D', long, value_name = "int", default_value = "15")]
131    pub failed_extends: usize,
132
133    /// For reads with repetitive seeds, try <int> sets of seeds
134    #[clap(short = 'R', long, value_name = "int", default_value = "2")]
135    pub seed_sets: usize,
136
137    // ========== Paired-end Options ==========
138    /// Minimum fragment length
139    #[clap(short = 'I', long = "minins", value_name = "int", default_value = "0")]
140    pub min_insert: usize,
141
142    /// Maximum fragment length
143    #[clap(
144        short = 'X',
145        long = "maxins",
146        value_name = "int",
147        default_value = "500"
148    )]
149    pub max_insert: usize,
150
151    /// -1, -2 mates align fw/rev, rev/fw, fw/fw
152    #[clap(long)]
153    pub orientation: Option<String>,
154
155    // ========== Performance Options ==========
156    /// Number of alignment threads
157    #[clap(short = 'p', long, value_name = "int", default_value = "1")]
158    pub threads: usize,
159
160    /// Force SAM output order to match input
161    #[clap(long = "reorder")]
162    pub reorder: bool,
163
164    // ========== Output Options ==========
165    /// Print wall-clock time taken by search phases
166    #[clap(short = 't', long = "time")]
167    pub print_time: bool,
168
169    /// Print nothing to stderr except serious errors
170    #[clap(long)]
171    pub quiet: bool,
172
173    /// Suppress SAM records for unaligned reads
174    #[clap(long = "no-unal")]
175    pub no_unaligned: bool,
176
177    /// Suppress header lines (@PG, @HD, etc.)
178    #[clap(long = "no-head")]
179    pub no_head: bool,
180
181    /// Suppress @SQ header lines
182    #[clap(long = "no-sq")]
183    pub no_sq: bool,
184
185    /// Use '='/'X', instead of 'M,' to specify matches/mismatches
186    #[clap(long = "xeq")]
187    pub xeq: bool,
188}
189
190/// Run the ferroux-align tool
191pub fn run() -> Result<()> {
192    let opts = AlignOptions::parse();
193    run_with_opts(opts)
194}
195
196/// Run with explicit options (for subcommand support)
197pub fn run_with_options(opts: AlignOptions) -> Result<()> {
198    run_with_opts(opts)
199}
200
201#[allow(dead_code)]
202fn main() -> Result<()> {
203    run()
204}
205
206fn run_with_opts(opts: AlignOptions) -> Result<()> {
207    // Load index to get reference names and lengths for SAM header
208    let index_with_ref = rustalign_fmindex::EbwtWithRef::load(&opts.index)?;
209    let ebwt = index_with_ref.ebwt();
210    let ref_names = ebwt.refnames().to_vec();
211    let ref_lengths = ebwt.plen().to_vec();
212
213    // Use file output if -S is specified, otherwise stdout
214    if let Some(ref sam_file) = opts.sam_file {
215        let file = std::fs::File::create(sam_file)?;
216        align_with_writer(&opts, file, &ref_names, &ref_lengths, index_with_ref)?;
217    } else {
218        let stdout = std::io::stdout();
219        align_with_writer(&opts, stdout, &ref_names, &ref_lengths, index_with_ref)?;
220    }
221
222    Ok(())
223}
224
225fn align_with_writer<W: std::io::Write>(
226    opts: &AlignOptions,
227    writer: W,
228    ref_names: &[String],
229    ref_lengths: &[u32],
230    index_with_ref: rustalign_fmindex::EbwtWithRef,
231) -> Result<()> {
232    // Set up SAM output
233    let config = SamConfig::default();
234    let mut writer = SamWriter::new(writer, config);
235    writer.write_header(ref_names, ref_lengths)?;
236
237    // Align reads
238    if !opts.reads_unpaired.is_empty() {
239        align_unpaired(opts, &mut writer, index_with_ref)?;
240    } else if !opts.reads_1.is_empty() && !opts.reads_2.is_empty() {
241        align_paired(opts, &mut writer, index_with_ref)?;
242    }
243
244    Ok(())
245}
246
247fn align_unpaired<W: std::io::Write>(
248    opts: &AlignOptions,
249    writer: &mut SamWriter<W>,
250    index_with_ref: rustalign_fmindex::EbwtWithRef,
251) -> Result<()> {
252    let reads_file = &opts.reads_unpaired[0];
253    let reader = FastqReader::from_path(reads_file)?;
254
255    // Create aligner with reference
256    let aligner_params = rustalign_aligner::AlignerParams::default();
257    let aligner = rustalign_aligner::AlignerWithRef::new(index_with_ref, aligner_params);
258
259    // Get a reference to the index for position resolution
260    let index_for_resolution = aligner.index();
261
262    for record in reader {
263        let record = record?;
264
265        // Sequence is already parsed as Vec<Nuc>
266        let nucs = record.seq.clone();
267
268        // Create reverse complement
269        let nucs_rc: Vec<rustalign_common::Nuc> =
270            nucs.iter().rev().map(|&n| n.complement()).collect();
271
272        // Align the read with actual reference extraction
273        let result = aligner.align(&nucs, &nucs_rc);
274
275        // Create SAM record
276        let mut sam_rec = rustalign_io::SamRecord::new(record.id.clone());
277        sam_rec.seq = nucs.clone();
278        sam_rec.qual = record.qual;
279
280        // Check if we got successful alignments
281        if !result.alignments.is_empty() {
282            let read_len = nucs.len() as u64;
283            let index_is_forward = true;
284
285            // Resolve positions for ALL candidate alignments
286            // This allows us to use chromosome as a tie-breaker
287            let mut resolved: Vec<(usize, String, u64, Strand, String)> = Vec::new();
288
289            for (idx, aln) in result.alignments.iter().enumerate() {
290                match index_for_resolution.ebwt().resolve_position(
291                    aln.bwt_row,
292                    read_len,
293                    index_is_forward,
294                ) {
295                    Ok((chr_name, pos, _chr_len)) => {
296                        // Adjust position by subtracting the seed offset
297                        let adjusted_pos = if pos > aln.seed_offset as u64 {
298                            pos - aln.seed_offset as u64
299                        } else {
300                            1
301                        };
302
303                        resolved.push((idx, chr_name, adjusted_pos, aln.strand, aln.cigar.clone()));
304                    }
305                    Err(_) => continue,
306                }
307            }
308
309            // Select the best alignment using tie-breakers
310            // Order: score > organelle > hit_count > chromosome_order > position > strand > seed_offset
311            fn chromosome_order(chr: &str) -> u32 {
312                match chr {
313                    "Mt" => 1, // Mitochondria first among organelles
314                    "Pt" => 2, // Plastid second among organelles
315                    "1" => 3,
316                    "2" => 4,
317                    "3" => 5,
318                    "4" => 6,
319                    "5" => 7,
320                    _ => 99, // Unknown chromosomes come last
321                }
322            }
323
324            // For same chromosome, use position (lower wins)
325            // For different chromosomes with same organelle status (both nuclear or both organelle),
326            // use hit_count to prefer more unique alignments
327            fn is_organelle(chr: &str) -> bool {
328                chr == "Mt" || chr == "Pt"
329            }
330
331            // Find the best alignment using:
332            // 1. Score (higher is better)
333            // 2. Organelle preference (Mt/Pt > nuclear for NUMT/NUPT cases)
334            //    For reads matching NUMT/NUPT regions, the organelle genome (Mt/Pt) is
335            //    the "true" source, while nuclear copies are derived.
336            // 3. hit_count (lower is better - more unique, for cross-chromosome comparison)
337            // 4. Chromosome order (lower chr number wins)
338            // 5. Position (lower wins for same chromosome - deterministic)
339            // 6. Strand: forward preferred (bowtie2 behavior)
340            // 7. seed_offset (only used as final tie-breaker)
341            if let Some((_idx, chr_name, adjusted_pos, strand, cigar)) =
342                resolved.iter().min_by(|a, b| {
343                    // Get original alignment for comparison
344                    let aln_a = &result.alignments[a.0];
345                    let aln_b = &result.alignments[b.0];
346
347                    // Primary: score (higher is better)
348                    match aln_b.score.cmp(&aln_a.score) {
349                        std::cmp::Ordering::Equal => {
350                            // Secondary: organelle preference (Mt/Pt > nuclear)
351                            let a_organelle = is_organelle(&a.1);
352                            let b_organelle = is_organelle(&b.1);
353                            match (a_organelle, b_organelle) {
354                                (true, false) => std::cmp::Ordering::Less,
355                                (false, true) => std::cmp::Ordering::Greater,
356                                _ => {
357                                    // Tertiary: hit_count (lower is better - more unique)
358                                    // This helps differentiate between chromosomes with equally good matches
359                                    match aln_a.seed_hit_count.cmp(&aln_b.seed_hit_count) {
360                                        std::cmp::Ordering::Equal => {
361                                            // Quaternary: chromosome order (lower chr number wins)
362                                            match chromosome_order(&a.1)
363                                                .cmp(&chromosome_order(&b.1))
364                                            {
365                                                std::cmp::Ordering::Equal => {
366                                                    // Quinary: position (lower wins)
367                                                    match a.2.cmp(&b.2) {
368                                                        std::cmp::Ordering::Equal => {
369                                                            // Senary: forward strand preferred
370                                                            match (aln_a.strand, aln_b.strand) {
371                                                                (
372                                                                    Strand::Forward,
373                                                                    Strand::Reverse,
374                                                                ) => std::cmp::Ordering::Less,
375                                                                (
376                                                                    Strand::Reverse,
377                                                                    Strand::Forward,
378                                                                ) => std::cmp::Ordering::Greater,
379                                                                _ => aln_a
380                                                                    .seed_offset
381                                                                    .cmp(&aln_b.seed_offset),
382                                                            }
383                                                        }
384                                                        other => other,
385                                                    }
386                                                }
387                                                other => other,
388                                            }
389                                        }
390                                        other => other,
391                                    }
392                                }
393                            }
394                        }
395                        other => other,
396                    }
397                })
398            {
399                sam_rec.rname = chr_name.clone();
400                sam_rec.pos = *adjusted_pos as i32;
401                sam_rec.flag = if *strand == Strand::Reverse { 16 } else { 0 };
402                sam_rec.mapq = 255;
403                sam_rec.cigar = if cigar.is_empty() {
404                    format!("{}M", nucs.len())
405                } else {
406                    cigar.clone()
407                };
408            } else {
409                // All position resolutions failed
410                if !opts.quiet {
411                    eprintln!(
412                        "DEBUG: Position resolution failed for all alignments of {}",
413                        record.id
414                    );
415                }
416            }
417        } else {
418            // Debug: output why no alignment
419            if !opts.quiet {
420                if let Some(err) = result.error {
421                    eprintln!("DEBUG: No alignment for {}: {}", record.id, err);
422                } else {
423                    eprintln!("DEBUG: No alignment for {} (unknown reason)", record.id);
424                }
425            }
426        }
427        // If no alignment, record stays as unmapped (flag=0, rname=*, etc.)
428
429        // Skip writing unmapped reads if --no-unal is specified
430        if sam_rec.rname == "*" && opts.no_unaligned {
431            continue;
432        }
433
434        writer.write(&sam_rec)?;
435    }
436
437    Ok(())
438}
439
440/// Resolved alignment with position information
441#[derive(Clone)]
442#[allow(dead_code)]
443struct ResolvedAlignment {
444    /// Index into the original alignment vector
445    idx: usize,
446    /// Chromosome name
447    chr_name: String,
448    /// Chromosome index (for paired-end comparison)
449    chr_idx: u32,
450    /// Adjusted position (1-based)
451    pos: u64,
452    /// Strand
453    strand: Strand,
454    /// CIGAR string
455    cigar: String,
456    /// Original alignment
457    aln: rustalign_aligner::Alignment,
458}
459
460fn align_paired<W: std::io::Write>(
461    opts: &AlignOptions,
462    writer: &mut SamWriter<W>,
463    index_with_ref: rustalign_fmindex::EbwtWithRef,
464) -> Result<()> {
465    let r1_file = &opts.reads_1[0];
466    let r2_file = &opts.reads_2[0];
467
468    let reader1 = FastqReader::from_path(r1_file)?;
469    let reader2 = FastqReader::from_path(r2_file)?;
470
471    // Create aligner
472    let aligner_params = AlignerParams::default();
473    let aligner = AlignerWithRef::new(index_with_ref, aligner_params);
474
475    // Get reference to index for position resolution
476    let index_for_resolution = aligner.index();
477
478    // Create paired-end constraints from options
479    let pe_policy = match opts.orientation.as_deref() {
480        Some("fr") | None => PePolicy::FR, // Default is FR
481        Some("rf") => PePolicy::RF,
482        Some("ff") => PePolicy::FF,
483        Some("rr") => PePolicy::RR,
484        _ => {
485            return Err(anyhow::anyhow!(
486                "Invalid orientation: {}. Use fr, rf, ff, or rr",
487                opts.orientation.as_ref().unwrap()
488            ));
489        }
490    };
491
492    let pe_constraints =
493        PeConstraints::new(opts.min_insert as u32, opts.max_insert as u32, pe_policy);
494
495    // Iterate through paired reads
496    for (record1, record2) in reader1.zip(reader2) {
497        let record1 = record1?;
498        let record2 = record2?;
499
500        // Parse sequences
501        let nucs1 = record1.seq.clone();
502        let nucs2 = record2.seq.clone();
503
504        // Create reverse complements
505        let nucs1_rc: Vec<Nuc> = nucs1.iter().rev().map(|&n| n.complement()).collect();
506        let nucs2_rc: Vec<Nuc> = nucs2.iter().rev().map(|&n| n.complement()).collect();
507
508        // Align both mates
509        let result1 = aligner.align(&nucs1, &nucs1_rc);
510        let result2 = aligner.align(&nucs2, &nucs2_rc);
511
512        // Resolve positions for R1 alignments
513        let mut resolved1: Vec<ResolvedAlignment> = Vec::new();
514        if !result1.alignments.is_empty() {
515            for (idx, aln) in result1.alignments.iter().enumerate() {
516                if let Ok((chr_name, pos, _chr_len)) = index_for_resolution.ebwt().resolve_position(
517                    aln.bwt_row,
518                    nucs1.len() as u64,
519                    true,
520                ) {
521                    let adjusted_pos = if pos > aln.seed_offset as u64 {
522                        pos - aln.seed_offset as u64
523                    } else {
524                        1
525                    };
526
527                    // Get chromosome index
528                    let chr_idx = index_for_resolution
529                        .ebwt()
530                        .refnames()
531                        .iter()
532                        .position(|n| n == &chr_name)
533                        .unwrap_or(0) as u32;
534
535                    resolved1.push(ResolvedAlignment {
536                        idx,
537                        chr_name,
538                        chr_idx,
539                        pos: adjusted_pos,
540                        strand: aln.strand,
541                        cigar: aln.cigar.clone(),
542                        aln: aln.clone(),
543                    });
544                }
545            }
546        }
547
548        // Resolve positions for R2 alignments
549        let mut resolved2: Vec<ResolvedAlignment> = Vec::new();
550        if !result2.alignments.is_empty() {
551            for (idx, aln) in result2.alignments.iter().enumerate() {
552                if let Ok((chr_name, pos, _chr_len)) = index_for_resolution.ebwt().resolve_position(
553                    aln.bwt_row,
554                    nucs2.len() as u64,
555                    true,
556                ) {
557                    let adjusted_pos = if pos > aln.seed_offset as u64 {
558                        pos - aln.seed_offset as u64
559                    } else {
560                        1
561                    };
562
563                    let chr_idx = index_for_resolution
564                        .ebwt()
565                        .refnames()
566                        .iter()
567                        .position(|n| n == &chr_name)
568                        .unwrap_or(0) as u32;
569
570                    resolved2.push(ResolvedAlignment {
571                        idx,
572                        chr_name,
573                        chr_idx,
574                        pos: adjusted_pos,
575                        strand: aln.strand,
576                        cigar: aln.cigar.clone(),
577                        aln: aln.clone(),
578                    });
579                }
580            }
581        }
582
583        // Find best concordant pair
584        let best_pair = find_best_pair(
585            &resolved1,
586            &resolved2,
587            &pe_constraints,
588            nucs1.len() as u32,
589            nucs2.len() as u32,
590        );
591
592        // Write SAM records
593        match best_pair {
594            Some((r1, r2, insert_size, is_concordant)) => {
595                // Write R1
596                let mut sam1 = SamRecord::new(record1.id.clone());
597                sam1.seq = nucs1.clone();
598                sam1.qual = record1.qual.clone();
599                sam1.rname = r1.chr_name.clone();
600                sam1.pos = r1.pos as i32;
601                sam1.flag = calculate_flag(r1.strand, r2.strand, true, true, is_concordant);
602                sam1.mapq = 255;
603                sam1.cigar = if r1.cigar.is_empty() {
604                    format!("{}M", nucs1.len())
605                } else {
606                    r1.cigar.clone()
607                };
608                sam1.rnext = if r1.chr_name == r2.chr_name {
609                    "=".to_string()
610                } else {
611                    r2.chr_name.clone()
612                };
613                sam1.pnext = r2.pos as i32;
614                sam1.tlen = insert_size;
615                sam1.add_opt(SamOpt::Int("NM".to_string(), r1.aln.edits as i64));
616                writer.write(&sam1)?;
617
618                // Write R2
619                let mut sam2 = SamRecord::new(record2.id.clone());
620                sam2.seq = nucs2.clone();
621                sam2.qual = record2.qual.clone();
622                sam2.rname = r2.chr_name.clone();
623                sam2.pos = r2.pos as i32;
624                sam2.flag = calculate_flag(r2.strand, r1.strand, false, true, is_concordant);
625                sam2.mapq = 255;
626                sam2.cigar = if r2.cigar.is_empty() {
627                    format!("{}M", nucs2.len())
628                } else {
629                    r2.cigar.clone()
630                };
631                sam2.rnext = if r1.chr_name == r2.chr_name {
632                    "=".to_string()
633                } else {
634                    r1.chr_name.clone()
635                };
636                sam2.pnext = r1.pos as i32;
637                sam2.tlen = -insert_size; // R2 has negative TLEN
638                sam2.add_opt(SamOpt::Int("NM".to_string(), r2.aln.edits as i64));
639                writer.write(&sam2)?;
640            }
641            None => {
642                // No concordant pair found - write as unpaired
643                if !opts.no_unaligned {
644                    // Try to write single-end alignments if available
645                    if let Some(r1) = select_best_alignment(&resolved1) {
646                        let mut sam1 = SamRecord::new(record1.id.clone());
647                        sam1.seq = nucs1;
648                        sam1.qual = record1.qual;
649                        sam1.rname = r1.chr_name;
650                        sam1.pos = r1.pos as i32;
651                        sam1.flag = calculate_flag(r1.strand, Strand::Forward, true, false, false);
652                        sam1.mapq = 255;
653                        sam1.cigar = format!("{}M", sam1.seq.len());
654                        sam1.rnext = "*".to_string();
655                        sam1.pnext = 0;
656                        sam1.tlen = 0;
657                        writer.write(&sam1)?;
658                    } else {
659                        // R1 unmapped
660                        let mut sam1 = SamRecord::new(record1.id.clone());
661                        sam1.seq = nucs1;
662                        sam1.qual = record1.qual;
663                        sam1.flag = 0x4 | 0x40 | 0x8; // UNMAP | FIRST | MUNMAP
664                        writer.write(&sam1)?;
665                    }
666
667                    if let Some(r2) = select_best_alignment(&resolved2) {
668                        let mut sam2 = SamRecord::new(record2.id.clone());
669                        sam2.seq = nucs2;
670                        sam2.qual = record2.qual;
671                        sam2.rname = r2.chr_name;
672                        sam2.pos = r2.pos as i32;
673                        sam2.flag = calculate_flag(r2.strand, Strand::Forward, false, false, false);
674                        sam2.mapq = 255;
675                        sam2.cigar = format!("{}M", sam2.seq.len());
676                        sam2.rnext = "*".to_string();
677                        sam2.pnext = 0;
678                        sam2.tlen = 0;
679                        writer.write(&sam2)?;
680                    } else {
681                        // R2 unmapped
682                        let mut sam2 = SamRecord::new(record2.id.clone());
683                        sam2.seq = nucs2;
684                        sam2.qual = record2.qual;
685                        sam2.flag = 0x4 | 0x80 | 0x8; // UNMAP | SECOND | MUNMAP
686                        writer.write(&sam2)?;
687                    }
688                }
689            }
690        }
691    }
692
693    Ok(())
694}
695
696/// Find the best concordant pair from R1 and R2 alignments
697fn find_best_pair(
698    r1_alignments: &[ResolvedAlignment],
699    r2_alignments: &[ResolvedAlignment],
700    constraints: &PeConstraints,
701    r1_len: u32,
702    r2_len: u32,
703) -> Option<(ResolvedAlignment, ResolvedAlignment, i32, bool)> {
704    let mut best_pair: Option<(ResolvedAlignment, ResolvedAlignment, i32, bool, i32)> = None;
705
706    for r1 in r1_alignments {
707        for r2 in r2_alignments {
708            // Check if this pair is concordant
709            let strand1 = r1.strand == Strand::Forward;
710            let strand2 = r2.strand == Strand::Forward;
711
712            if let Some(insert_size) = constraints.is_concordant(
713                r1.pos, r1_len, strand1, r1.chr_idx, r2.pos, r2_len, strand2, r2.chr_idx,
714            ) {
715                // Calculate combined score
716                let combined_score = r1.aln.score + r2.aln.score;
717
718                // Keep the best concordant pair
719                let is_better = match &best_pair {
720                    None => true,
721                    Some((_, _, _, _, best_score)) => combined_score > *best_score,
722                };
723
724                if is_better {
725                    best_pair = Some((r1.clone(), r2.clone(), insert_size, true, combined_score));
726                }
727            }
728        }
729    }
730
731    // If no concordant pair, try to find a discordant pair (same chr but wrong orientation/insert)
732    if best_pair.is_none() {
733        for r1 in r1_alignments {
734            for r2 in r2_alignments {
735                if r1.chr_idx == r2.chr_idx {
736                    let _strand1 = r1.strand == Strand::Forward;
737                    let _strand2 = r2.strand == Strand::Forward;
738
739                    // Calculate insert size anyway
740                    let left_pos = r1.pos.min(r2.pos);
741                    let right_pos = (r2.pos + r2_len as u64).max(r1.pos + r1_len as u64);
742                    let insert_size = (right_pos - left_pos) as i32;
743
744                    let combined_score = r1.aln.score + r2.aln.score;
745
746                    let is_better = match &best_pair {
747                        None => true,
748                        Some((_, _, _, _, best_score)) => combined_score > *best_score,
749                    };
750
751                    if is_better {
752                        best_pair =
753                            Some((r1.clone(), r2.clone(), insert_size, false, combined_score));
754                    }
755                }
756            }
757        }
758    }
759
760    best_pair.map(|(r1, r2, insert, concordant, _)| (r1, r2, insert, concordant))
761}
762
763/// Select the best alignment from a list
764fn select_best_alignment(alignments: &[ResolvedAlignment]) -> Option<ResolvedAlignment> {
765    alignments
766        .iter()
767        .max_by(|a, b| {
768            // First compare by score
769            match a.aln.score.cmp(&b.aln.score) {
770                std::cmp::Ordering::Equal => {
771                    // Then by hit count (lower is better)
772                    b.aln.seed_hit_count.cmp(&a.aln.seed_hit_count)
773                }
774                other => other,
775            }
776        })
777        .cloned()
778}
779
780/// Calculate SAM flag for paired-end alignment
781fn calculate_flag(
782    strand: Strand,
783    mate_strand: Strand,
784    is_first: bool,
785    both_mapped: bool,
786    is_concordant: bool,
787) -> u16 {
788    let mut flag: u16 = 0;
789
790    // PAIRED (0x1)
791    flag |= 0x1;
792
793    // PROPER_PAIR (0x2) - set if concordant
794    if is_concordant {
795        flag |= 0x2;
796    }
797
798    // UNMAP (0x4) - not set since we have alignment
799
800    // MUNMAP (0x8) - mate unmapped
801    if !both_mapped {
802        flag |= 0x8;
803    }
804
805    // REVERSE (0x10)
806    if strand == Strand::Reverse {
807        flag |= 0x10;
808    }
809
810    // MREVERSE (0x20)
811    if mate_strand == Strand::Reverse {
812        flag |= 0x20;
813    }
814
815    // FIRST_IN_PAIR (0x40) or SECOND_IN_PAIR (0x80)
816    if is_first {
817        flag |= 0x40;
818    } else {
819        flag |= 0x80;
820    }
821
822    flag
823}