Skip to main content

gbz_base/
alignment.rs

1//! Structures for representing sequence to graph alignments.
2//!
3//! An [`Alignment`] object represents the alignment of a query sequence to a target path in a graph.
4//! It corresponds to a single line in a GAF file.
5//!
6//! An [`AlignmentBlock`] object represents an ordered collection of alignments, using column-based compression.
7//! It corresponds to a row in table `Alignments` in a [`crate::GAFBase`].
8//!
9//! The GAF format is a text-based format for representing sequence alignments to a graph.
10//! See [the specification](https://github.com/lh3/gfatools/blob/master/doc/rGFA.md) for an overview.
11//! Some details are better documented in the [minimap2 man page](https://lh3.github.io/minimap2/minimap2.html#10).
12//!
13//! # Assumptions
14//!
15//! Some details of the GAF format are not well documented, and this implementation makes some assumptions.
16//!
17//! * Softclips are included either in both the query interval and the difference string or in neither.
18//! * If a difference string is present, it overrides query/path interval ends, the number of matches, and alignment block length.
19//! * Alignment block length is the sum of matches, mismatches, insertions, and deletions.
20//!
21//! # Supported optional fields
22//!
23//! * `AS:i`: Alignment score.
24//! * `bq:Z`: Base quality values for the query sequence.
25//! * `cs:Z`: Difference string for the alignment.
26//! * `pd:b`: Properly paired flag.
27//! * `fn:Z`: Name of the next read in the pair.
28//! * `fp:Z`: Name of the previous read in the pair.
29//!
30//! Only matches, mismatches, insertions, and deletions are supported in the difference string.
31//! Matches are represented as a match length rather than as the matching sequence.
32//!
33//! The order of optional fields is not preserved.
34
35use crate::{Subgraph, Mapping, Difference};
36use crate::formats::{self, TypedField};
37use crate::utils::{self, PathStartSource};
38
39use std::io::{Read, Write};
40use std::ops::Range;
41use std::sync::Arc;
42use std::{cmp, str};
43
44use zstd::stream::Encoder as ZstdEncoder;
45use zstd::stream::Decoder as ZstdDecoder;
46
47use gbz::{GBWT, Orientation, Pos, ENDMARKER};
48use gbz::support::{self, ByteCode, ByteCodeIter, RLE, Run, RLEIter};
49
50#[cfg(test)]
51mod tests;
52
53pub mod mapping;
54
55//-----------------------------------------------------------------------------
56
57// TODO: add validate() against a graph, a subgraph, or a read set
58/// An alignment between a query sequence and a target path in a graph.
59///
60/// This object corresponds either to a line in a GAF file or to a row in table `Alignments` in [`crate::GAFBase`].
61/// When the alignment is built from a GAF line, the target path is stored explicitly.
62/// For alignments stored in a database, only the GBWT starting position is stored.
63/// See [`TargetPath`] for details.
64///
65/// A GAF line can be converted to an `Alignment` object with [`Alignment::from_gaf`].
66/// An `Alignment` object can be serialized as a GAF line with [`Alignment::to_gaf`].
67/// The conversion can be lossy (see [`crate::alignment`] for details).
68///
69/// # Examples
70///
71/// ```
72/// use gbz_base::Alignment;
73/// use gbz::Orientation;
74/// use gbz::support;
75///
76/// // Unnamed empty sequence.
77/// let empty = Alignment::new();
78/// assert!(empty.name.is_empty());
79/// assert_eq!(empty.seq_len, 0);
80/// assert!(empty.is_unaligned());
81///
82/// // Construct a GAF line.
83/// let name = "query";
84/// let seq_len = 7;
85/// let seq_interval = 0..6;
86/// let path = ">1<2>3";
87/// let path_len = 8;
88/// let path_interval = 2..8;
89/// let matches = 5;
90/// let edits = 1;
91/// let mapq = 60;
92/// let line = format!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
93///     name,
94///     seq_len, seq_interval.start, seq_interval.end,
95///     '+', path,
96///     path_len, path_interval.start, path_interval.end,
97///     matches, matches + edits, mapq
98/// );
99///
100/// // Create an alignment from the line.
101/// let aln = Alignment::from_gaf(line.as_bytes()).unwrap();
102/// assert_eq!(aln.name, name);
103/// assert_eq!(aln.seq_len, seq_len);
104/// assert_eq!(aln.seq_interval, seq_interval);
105/// assert!(aln.has_target_path());
106/// let path = vec!(
107///     support::encode_node(1, Orientation::Forward),
108///     support::encode_node(2, Orientation::Reverse),
109///     support::encode_node(3, Orientation::Forward)
110/// );
111/// assert_eq!(aln.target_path(), Some(path.as_ref()));
112/// assert_eq!(aln.path_len, path_len);
113/// assert_eq!(aln.path_interval, path_interval);
114/// assert_eq!(aln.matches, matches);
115/// assert_eq!(aln.edits, edits);
116/// assert_eq!(aln.mapq, Some(mapq));
117///
118/// // All optional fields are missing.
119/// assert!(aln.score.is_none());
120/// assert!(aln.base_quality.is_empty());
121/// assert!(aln.difference.is_empty());
122/// assert!(aln.pair.is_none());
123/// assert!(aln.optional.is_empty());
124///
125/// // Convert back to GAF.
126/// let _query_sequence = b"GATTACA"; // Not used yet.
127/// let target_sequence = b"GAGATCAC".to_vec();
128/// let from_aln = aln.to_gaf(&target_sequence);
129/// assert_eq!(&from_aln, line.as_bytes());
130/// ```
131#[derive(Clone, Debug, PartialEq)]
132pub struct Alignment {
133    /// Name of the query sequence.
134    pub name: String,
135    /// Length of the query sequence.
136    pub seq_len: usize,
137    /// Aligned interval of the query sequence.
138    pub seq_interval: Range<usize>,
139    /// Target path in the orientation of the query sequence.
140    pub path: TargetPath,
141    /// Length of the target path.
142    pub path_len: usize,
143    /// Aligned interval of the target path.
144    pub path_interval: Range<usize>,
145    /// Number of matches in the alignment.
146    pub matches: usize,
147    /// Number of mismatches and gaps in the alignment.
148    pub edits: usize,
149    /// Mapping quality.
150    pub mapq: Option<usize>,
151    /// Alignment score.
152    pub score: Option<isize>,
153    /// Base quality values for the query sequence.
154    pub base_quality: Vec<u8>,
155    /// Difference string, or an empty vector if not present.
156    pub difference: Vec<Difference>,
157    /// Information about the paired alignment.
158    pub pair: Option<PairedRead>,
159    /// Optional typed fields that have not been interpreted.
160    pub optional: Vec<TypedField>,
161}
162
163impl Default for Alignment {
164    fn default() -> Self {
165        Alignment {
166            name: String::new(),
167            seq_len: 0,
168            seq_interval: 0..0,
169            path: TargetPath::Path(Vec::new()),
170            path_len: 0,
171            path_interval: 0..0,
172            matches: 0,
173            edits: 0,
174            mapq: None,
175            score: None,
176            base_quality: Vec::new(),
177            difference: Vec::new(),
178            pair: None,
179            optional: Vec::new(),
180        }
181    }
182}
183
184//-----------------------------------------------------------------------------
185
186/// Construction; conversions between GAF lines and `Alignment` objects.
187impl Alignment {
188    // Number of mandatory fields in a GAF line.
189    const MANDATORY_FIELDS: usize = 12;
190
191    // Placeholder value for a missing mapping quality.
192    const MISSING_MAPQ: usize = 255;
193
194    // The field is empty and the value is missing; typically used with unaligned sequences.
195    const MISSING_VALUE: [u8; 1] = [b'*'];
196
197    /// Creates an empty alignment.
198    pub fn new() -> Self {
199        Alignment::default()
200    }
201
202    // Parses a string field from a GAF field.
203    fn parse_string(field: &[u8], field_name: &str) -> Result<String, String> {
204        String::from_utf8(field.to_vec()).map_err(|err| {
205            format!("Invalid {}: {}", field_name, err)
206        })
207    }
208
209    // Parses an unsigned integer from a GAF field.
210    // Returns `0` if the value is missing.
211    fn parse_usize(field: &[u8], field_name: &str) -> Result<usize, String> {
212        if field == Self::MISSING_VALUE {
213            return Ok(0);
214        }
215        let number = str::from_utf8(field).map_err(|err| {
216            format!("Invalid {}: {}", field_name, err)
217        })?;
218        number.parse().map_err(|err| {
219            format!("Invalid {}: {}", field_name, err)
220        })
221    }
222
223    // Parses an interval from two GAF fields.
224    fn parse_interval(start: &[u8], end: &[u8]) -> Result<Range<usize>, String> {
225        let start = Self::parse_usize(start, "interval start")?;
226        let end = Self::parse_usize(end, "interval end")?;
227        Ok(start..end)
228    }
229
230    // Parses an orientation from a GAF field.
231    // Returns [`Orientation::Forward`] if the value is missing.
232    fn parse_orientation(field: &[u8], field_name: &str) -> Result<Orientation, String> {
233        if field == Self::MISSING_VALUE {
234            return Ok(Orientation::Forward);
235        }
236        if field.len() != 1 {
237            return Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field)));
238        }
239        match field[0] {
240            b'+' => Ok(Orientation::Forward),
241            b'-' => Ok(Orientation::Reverse),
242            _ => Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field))),
243        }
244    }
245
246    // Parses an oriented path from a GAF field.
247    // Returns an empty path if the value is missing.
248    fn parse_path(field: &[u8]) -> Result<Vec<usize>, String> {
249        let mut result = Vec::new();
250        if field == Self::MISSING_VALUE {
251            return Ok(result);
252        }
253
254        let mut start = 0;
255        while start < field.len() {
256            let orientation = match field[start] {
257                b'>' => Orientation::Forward,
258                b'<' => Orientation::Reverse,
259                _ => return Err(format!("Invalid segment orientation: {}", String::from_utf8_lossy(field))),
260            };
261            start += 1;
262            let end = field[start..].iter().position(|&c| c == b'>' || c == b'<').map_or(field.len(), |x| start + x);
263            let node = str::from_utf8(&field[start..end]).map_err(|err| {
264                format!("Invalid segment name: {}", err)
265            })?.parse().map_err(|_| {
266                String::from("Only numerical segment names are supported")
267            })?;
268            result.push(support::encode_node(node, orientation));
269            start = end;
270        }
271        Ok(result)
272    }
273
274    // Parses a pair name from the value of a typed field.
275    fn parse_pair(value: Vec<u8>, is_next: bool, output: &mut Option<PairedRead>) -> Result<(), String> {
276        if output.is_some() {
277            return Err(String::from("Multiple pair fields"));
278        }
279        let name = String::from_utf8(value).map_err(|err| {
280            format!("Invalid pair name: {}", err)
281        })?;
282        *output = Some(PairedRead {
283            name,
284            is_next,
285            is_proper: false,
286        });
287        Ok(())
288    }
289
290    /// Parses an alignment from a GAF line.
291    ///
292    /// Returns an error if the line cannot be parsed.
293    /// The line may end with up to one endline character, which is ignored.
294    /// The returned alignment stores the target path explicitly.
295    /// Parsing is based on bytes rather than characters to avoid unnecessary UTF-8 validation.
296    ///
297    /// If a difference string is present, some numerical fields will be recalculated from it.
298    /// These include interval ends on both the query and the target, as well as the number of matches and edits.
299    /// This behavior is justified, because some aligners may not calculate these values correctly.
300    pub fn from_gaf(line: &[u8]) -> Result<Self, String> {
301        // Check for an endline character which may be present.
302        let line = if line.last() == Some(&b'\n') {
303            &line[..line.len() - 1]
304        } else {
305            line
306        };
307
308        // Split the line into fields.
309        let fields = line.split(|&c| c == b'\t').collect::<Vec<_>>();
310        if fields.len() < Self::MANDATORY_FIELDS {
311            let line = String::from_utf8_lossy(line);
312            let message = format!("GAF line with fewer than {} fields: {}", Self::MANDATORY_FIELDS, line);
313            return Err(message);
314        }
315
316        // Query sequence.
317        let name = Self::parse_string(fields[0], "query sequence name")?;
318        let seq_len = Self::parse_usize(fields[1], "query sequence length")?;
319        let mut seq_interval = Self::parse_interval(fields[2], fields[3])?;
320
321        // Target path.
322        let orientation = Self::parse_orientation(fields[4], "target orientation")?;
323        let mut path = Self::parse_path(fields[5]).map_err(|err| {
324            format!("Invalid target path: {}", err)
325        })?;
326        if orientation == Orientation::Reverse {
327            support::reverse_path_in_place(&mut path);
328        }
329        let path = TargetPath::Path(path);
330        let path_len = Self::parse_usize(fields[6], "target path length")?;
331        // TODO: This is a hack. VG sometimes gets the target interval end wrong.
332        // If we can't parse the interval, we will later try to parse the start and infer the length from the difference string.
333        let path_interval = Self::parse_interval(fields[7], fields[8]);
334
335        // Alignment statistics.
336        let mut matches = Self::parse_usize(fields[9], "matches")?;
337        let alignment_len = Self::parse_usize(fields[10], "alignment length")?;
338        let mut edits = alignment_len.saturating_sub(matches);
339        let mapq = Self::parse_usize(fields[11], "mapping quality")?;
340        let mapq = if mapq == Self::MISSING_MAPQ { None } else { Some(mapq) }; // TODO: Too large values?
341
342        // Optional fields.
343        let mut score = None;
344        let mut base_quality = Vec::new();
345        let mut difference = Vec::new();
346        let mut pair = None;
347        let mut properly_paired = None;
348        let mut optional = Vec::new();
349        for field in fields[Self::MANDATORY_FIELDS..].iter() {
350            let parsed = TypedField::parse(field)?;
351            match parsed {
352                TypedField::Int([b'A', b'S'], value) => {
353                    if score.is_some() {
354                        return Err(String::from("Multiple alignment score fields"));
355                    }
356                    score = Some(value);
357                },
358                TypedField::String([b'b', b'q'], value) => {
359                    if !base_quality.is_empty() {
360                        return Err(String::from("Multiple base quality fields"));
361                    }
362                    base_quality = value;
363                },
364                TypedField::String([b'c', b's'], value) => {
365                    if !difference.is_empty() {
366                        return Err(String::from("Multiple difference fields"));
367                    }
368                    difference = Difference::parse_normalized(&value)?;
369                },
370                TypedField::Bool([b'p', b'd'], value) => {
371                    if properly_paired.is_some() {
372                        return Err(String::from("Multiple properly paired fields"));
373                    }
374                    properly_paired = Some(value);
375                },
376                TypedField::String([b'f', b'n'], value) => {
377                    Self::parse_pair(value, true, &mut pair)?;
378                },
379                TypedField::String([b'f', b'p'], value) => {
380                    Self::parse_pair(value, false, &mut pair)?;
381                },
382                _ => { optional.push(parsed); },
383            }
384        }
385        if let (Some(pair), Some(properly_paired)) = (pair.as_mut(), properly_paired) {
386            pair.is_proper = properly_paired;
387        }
388
389        // TODO: Part of the interval end hack.
390        let mut path_interval = match path_interval {
391            Ok(interval) => interval,
392            Err(_) => {
393                if !difference.is_empty() {
394                    let target_start = Self::parse_usize(fields[7], "target interval start")?;
395                    target_start..target_start
396                } else {
397                    return Err(String::from("Target interval end cannot be parsed or inferred from the difference string"));
398                }
399            },
400        };
401
402        // If we have a difference string, recalculate the redundant numerical fields.
403        if !difference.is_empty() {
404            let (query_len, target_len, num_matches, num_edits) = Difference::stats(&difference);
405            seq_interval.end = seq_interval.start + query_len;
406            path_interval.end = path_interval.start + target_len;
407            matches = num_matches;
408            edits = num_edits;
409        }
410
411        // Now we have the final path interval. Flip its orientation if necessary.
412        if orientation == Orientation::Reverse {
413            let start = path_len.saturating_sub(path_interval.end);
414            let end = path_len.saturating_sub(path_interval.start);
415            path_interval = start..end;
416        }
417
418        Ok(Alignment {
419            name, seq_len, seq_interval,
420            path, path_len, path_interval,
421            matches, edits, mapq, score,
422            base_quality, difference, pair,
423            optional,
424        })
425    }
426
427    /// Converts the alignment to a GAF line.
428    ///
429    /// If the target path is stored as a GBWT starting position, it will be missing (`*`).
430    /// The path can be set with [`Alignment::set_target_path`] or extracted from a GBWT index with [`Alignment::extract_target_path`].
431    /// The returned line does not end with an endline character.
432    ///
433    /// A target sequence is necessary for reconstructing the difference string, if present.
434    /// It must correspond to the entire target path.
435    pub fn to_gaf(&self, target_sequence: &[u8]) -> Vec<u8> {
436        let mut result = Vec::new();
437
438        result.extend_from_slice(self.name.as_bytes());
439
440        // Coordinates in the query sequence.
441        result.push(b'\t');
442        utils::append_usize(&mut result, self.seq_len);
443        result.push(b'\t');
444        utils::append_usize(&mut result, self.seq_interval.start);
445        result.push(b'\t');
446        utils::append_usize(&mut result, self.seq_interval.end);
447
448        // Target path; always in the forward orientation.
449        result.push(b'\t');
450        result.push(b'+');
451        result.push(b'\t');
452        match &self.path {
453            TargetPath::Path(path) => {
454                if path.is_empty() {
455                    result.extend_from_slice(&Self::MISSING_VALUE);
456                } else {
457                    formats::append_walk(&mut result, path);
458                }
459            },
460            TargetPath::StartPosition(_) => {
461                result.extend_from_slice(&Self::MISSING_VALUE);
462            },
463        }
464
465        // Coordinates in the target path.
466        result.push(b'\t');
467        utils::append_usize(&mut result, self.path_len);
468        result.push(b'\t');
469        utils::append_usize(&mut result, self.path_interval.start);
470        result.push(b'\t');
471        utils::append_usize(&mut result, self.path_interval.end);
472
473        // Alignment statistics.
474        result.push(b'\t');
475        utils::append_usize(&mut result, self.matches);
476        result.push(b'\t');
477        utils::append_usize(&mut result, self.matches + self.edits);
478        result.push(b'\t');
479        let mapq = self.mapq.unwrap_or(Self::MISSING_MAPQ);
480        utils::append_usize(&mut result, mapq);
481
482        // Known optional fields.
483        if let Some(score) = self.score {
484            let field = TypedField::Int([b'A', b'S'], score);
485            field.append_to(&mut result, true);
486        }
487        if !self.base_quality.is_empty() {
488            TypedField::append_string(&mut result, [b'b', b'q'], &self.base_quality, true);
489        }
490        if !self.difference.is_empty() {
491            // TODO: Can we write the difference string directly?
492            let target_sequence = &target_sequence[self.path_interval.clone()];
493            let field = TypedField::String([b'c', b's'], Difference::to_bytes(&self.difference, target_sequence));
494            field.append_to(&mut result, true);
495        }
496        if let Some(pair) = &self.pair {
497            if pair.is_next {
498                TypedField::append_string(&mut result, [b'f', b'n'], pair.name.as_bytes(), true);
499            } else {
500                TypedField::append_string(&mut result, [b'f', b'p'], pair.name.as_bytes(), true);
501            }
502            let field = TypedField::Bool([b'p', b'd'], pair.is_proper);
503            field.append_to(&mut result, true);
504        }
505
506        // Other optional fields.
507        for field in self.optional.iter() {
508            field.append_to(&mut result, true);
509        }
510
511        result
512    }
513
514    /// Validates the internal consistency of the alignment.
515    ///
516    /// Returns an error message if the alignment is inconsistent.
517    pub fn validate(&self) -> Result<(), String> {
518        // Check that intervals are valid.
519        if self.seq_interval.start > self.seq_interval.end || self.seq_interval.end > self.seq_len {
520            return Err(format!("Query sequence interval {}..{} for a sequence of length {}", self.seq_interval.start, self.seq_interval.end, self.seq_len));
521        }
522        if self.path_interval.start > self.path_interval.end || self.path_interval.end > self.path_len {
523            return Err(format!("Target path interval {}..{} for a path of length {}", self.path_interval.start, self.path_interval.end, self.path_len));
524        }
525
526        // Check that the target path is consistent with the path length and interval.
527        if let Some(path) = self.target_path() {
528            // This check also guarantees that the path interval is empty if the path is empty.
529            if path.is_empty() && self.path_len > 0 {
530                return Err(String::from("Empty target path with a non-empty path length"));
531            }
532            // TODO: We could check that the path length matches the sum of node lengths if we had access to a graph here.
533        }
534
535        // Check that interval lengths and alignment statistics match the difference string, if present.
536        if !self.difference.is_empty() {
537            let (query_len, target_len, num_matches, num_edits) = Difference::stats(&self.difference);
538            if query_len != self.seq_interval.len() {
539                return Err(format!("Query interval length {} according to the difference string, but {} in the alignment", query_len, self.seq_interval.len()));
540            }
541            if target_len != self.path_interval.len() {
542                return Err(format!("Target path interval length {} according to the difference string, but {} in the alignment", target_len, self.path_interval.len()));
543            }
544            if num_matches != self.matches {
545                return Err(format!("Number of matches {} according to the difference string, but {} in the alignment", num_matches, self.matches));
546            }
547            if num_edits != self.edits {
548                return Err(format!("Number of edits {} according to the difference string, but {} in the alignment", num_edits, self.edits));
549            }
550        }
551
552        Ok(())
553    }
554}
555
556//-----------------------------------------------------------------------------
557
558/// Encoding / decoding the alignment in the format used in GAF-base.
559impl Alignment {
560    // Normalizes the interval so that `start <= end <= len` and returns it as (left flank, length, right flank).
561    fn normalize_coordinates(interval: Range<usize>, len: usize) -> (usize, usize, usize) {
562        let start = if interval.start <= interval.end { interval.start } else { interval.end };
563        let end = if interval.end <= len { interval.end } else { len };
564        let left_flank = start;
565        let length = end - start;
566        let right_flank = len - end;
567        (left_flank, length, right_flank)
568    }
569
570    // Encodes a signed integer. Small absolute values are represented as small numbers.
571    fn encode_signed(value: isize, encoder: &mut ByteCode) {
572        let value = if value < 0 { (-2 * value - 1) as usize } else { 2 * value as usize };
573        encoder.write(value);
574    }
575
576    // Encodes numerical fields as a blob.
577    //
578    // This includes the following fields:
579    //
580    // * Query sequence: `seq_len`, `seq_interval.start`, `seq_interval.end`.
581    // * Target path: `path_len`, `path_interval.start`, `path_interval.end`.
582    // * Alignment statistics: `matches`, `edits`, `mapq`, `score`.
583    //
584    // The coordinates are normalized so that `start <= end <= len`.
585    // If `known_length` is set, the length of the query sequence is assumed to be stored externally.
586    // We also assume that the decoder knows whether this is a full and/or exact alignment, or whether a difference string is present.
587    // Numbers that can be derived from these properties are not stored to save space.
588    //
589    // Mapping quality and alignment score are stored if they are present.
590    // Their presence should be stored separately.
591    fn encode_numbers_into(&self, encoder: &mut ByteCode, known_length: bool) {
592        let full_alignment = self.is_full();
593        let exact_alignment = self.is_exact();
594        let stored_difference_string = self.has_difference_string() & !exact_alignment;
595
596        // Query sequence coordinates. We write the aligned length first to simplify the logic.
597        let (left_flank, length, right_flank) = Self::normalize_coordinates(self.seq_interval.clone(), self.seq_len);
598        if !(known_length || stored_difference_string) {
599            encoder.write(length);
600        }
601        if !full_alignment {
602            encoder.write(left_flank);
603            encoder.write(right_flank);
604        }
605
606        // Target path coordinates. We again write the aligned length first.
607        // We do not store the right flank, because it can be derived from the path itself.
608        let (left_flank, length, _) = Self::normalize_coordinates(self.path_interval.clone(), self.path_len);
609        if !(exact_alignment || stored_difference_string) {
610            encoder.write(length);
611        }
612        encoder.write(left_flank);
613
614        // Alignment statistics.
615        if !(exact_alignment || stored_difference_string) {
616            encoder.write(self.matches);
617            encoder.write(self.edits);
618        }
619        if let Some(mapq) = self.mapq {
620            encoder.write(mapq);
621        }
622        if let Some(score) = self.score {
623            Self::encode_signed(score, encoder);
624        };
625    }
626
627    // Encodes the difference string into the given encoder.
628    //
629    // Also encodes empty / missing difference strings.
630    // [`Difference::End`] values are not included in the encoding, but one is always added at the end.
631    fn encode_difference_into(&self, encoder: &mut RLE) {
632        for diff in self.difference.iter() {
633            match diff {
634                Difference::Match(len) => encoder.write(Run::new(0, *len)),
635                Difference::Mismatch(base) => encoder.write(Run::new(1, utils::encode_base(*base))),
636                Difference::Insertion(seq) => {
637                    let len = utils::encoded_length(seq.len());
638                    encoder.write(Run::new(2, len));
639                    let encoded = utils::encode_sequence(seq);
640                    for byte in encoded {
641                        encoder.write_byte(byte);
642                    }
643                },
644                Difference::Deletion(len) => encoder.write(Run::new(3, *len)),
645                Difference::End => {},
646            }
647        }
648        encoder.write(Run::new(4, 1));
649    }
650
651    // TODO: encode optional fields
652
653    // Decodes a signed integer from the iterator.
654    fn decode_signed(iter: &mut ByteCodeIter) -> Option<isize> {
655        let value = iter.next()?;
656        if value % 2 == 0 {
657            Some((value / 2) as isize)
658        } else {
659            Some(-(value as isize + 1) / 2)
660        }
661    }
662
663    // Decodes a difference string from the iterator until an End value.
664    // Also needs the underlying slice for decoding insertions.
665    // Returns an error if the if there is no End value.
666    fn decode_difference_from(encoded: &[u8], decoder: &mut RLEIter) -> Result<Vec<Difference>, String> {
667        let mut result: Vec<Difference> = Vec::new();
668        while let Some(run) = decoder.next() {
669            match run.value {
670                0 => result.push(Difference::Match(run.len)),
671                1 => result.push(Difference::Mismatch(utils::decode_base(run.len))),
672                2 => {
673                    let offset = decoder.offset();
674                    for _ in 0..run.len {
675                        let _ = decoder.byte().ok_or_else(|| String::from("Missing insertion base"))?;
676                    }
677                    let encoded = &encoded[offset..offset + run.len];
678                    let seq = utils::decode_sequence(encoded);
679                    result.push(Difference::Insertion(seq));
680                },
681                3 => result.push(Difference::Deletion(run.len)),
682                4 => {
683                    return Ok(result);
684                },
685                _ => return Err(format!("Invalid difference string operation: {}", run.value)),
686            }
687        }
688        Err(String::from("Encoded difference string ended without an End value"))
689    }
690}
691
692//-----------------------------------------------------------------------------
693
694// TODO: Add an operation for reconstructing the query sequence.
695/// Operations on the Alignment object.
696impl Alignment {
697    /// Returns `true` if the query sequence is unaligned.
698    ///
699    /// An aligned sequence has a non-empty query interval aligned to a non-empty target interval.
700    /// NOTE: An empty sequence is by definition unaligned.
701    pub fn is_unaligned(&self) -> bool {
702        self.seq_interval.is_empty() || self.path_interval.is_empty()
703    }
704
705    /// Returns `true` if this is an alignment of the entire query sequence.
706    ///
707    /// NOTE: An empty sequence is by definition unaligned.
708    pub fn is_full(&self) -> bool {
709        self.seq_len > 0 &&
710            self.seq_interval.start == 0 &&
711            self.seq_interval.end == self.seq_len
712    }
713
714    /// Returns `true` if this is an exact alignment.
715    ///
716    /// NOTE: An empty sequence is by definition unaligned.
717    pub fn is_exact(&self) -> bool {
718        self.seq_len > 0 &&
719            !self.seq_interval.is_empty() &&
720            self.edits == 0
721    }
722
723    /// Returns `true` if there is a non-empty difference string for the alignment.
724    pub fn has_difference_string(&self) -> bool {
725        !self.difference.is_empty()
726    }
727
728    /// Returns the minimum GBWT node identifier in the target path, or [`None`] if there is no path.
729    pub fn min_handle(&self) -> Option<usize> {
730        match self.path {
731            TargetPath::Path(ref path) => path.iter().copied().min(),
732            TargetPath::StartPosition(_) => None,
733        }
734    }
735
736    /// Returns the maximum GBWT node identifier in the target path, or [`None`] if there is no path.
737    pub fn max_handle(&self) -> Option<usize> {
738        match self.path {
739            TargetPath::Path(ref path) => path.iter().copied().max(),
740            TargetPath::StartPosition(_) => None,
741        }
742    }
743
744    /// Returns `true` if the target path is stored explicitly.
745    pub fn has_target_path(&self) -> bool {
746        matches!(self.path, TargetPath::Path(_))
747    }
748
749    /// Returns `true` if the target path is stored explicitly and is non-empty.
750    pub fn has_non_empty_target_path(&self) -> bool {
751        match &self.path {
752            TargetPath::Path(path) => !path.is_empty(),
753            TargetPath::StartPosition(_) => false,
754        }
755    }
756
757    /// Returns the target path if it is stored explicitly.
758    pub fn target_path(&self) -> Option<&[usize]> {
759        match &self.path {
760            TargetPath::Path(path) => Some(path),
761            TargetPath::StartPosition(_) => None,
762        }
763    }
764
765    // TODO: Should this update an existing target path?
766    /// Sets the target path from the GBWT index if it is not already present.
767    pub fn extract_target_path(&mut self, index: &GBWT) {
768        let mut pos = match self.path {
769            TargetPath::Path(_) => return,
770            TargetPath::StartPosition(pos) => Some(pos),
771        };
772        let mut path = Vec::new();
773        while let Some(p) = pos {
774            path.push(p.node);
775            pos = index.forward(p);
776        }
777        self.path = TargetPath::Path(path);
778    }
779
780    /// Sets the given path as the target path.
781    pub fn set_target_path(&mut self, path: Vec<usize>) {
782        self.path = TargetPath::Path(path);
783    }
784
785    /// Sets the length of the target path.
786    pub fn set_target_path_len(&mut self, len: usize) {
787        self.path_len = len;
788    }
789
790    /// Returns an iterator over the alignment as a sequence of mappings.
791    ///
792    /// Returns [`None`] if a valid iterator cannot be built.
793    /// The iterator requires a difference string and an explicitly stored target path.
794    /// It may stop early if the alignment is invalid.
795    ///
796    /// The iterator needs a function that provides the sequence length for each node.
797    /// This function may be based on [`gbz::GBZ`], [`Subgraph`], or [`crate::ReadSet`].
798    pub fn iter<'a>(&'a self, sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>) -> Option<AlignmentIter<'a>> {
799        if self.difference.is_empty() && (!self.seq_interval.is_empty() || !self.path_interval.is_empty()) {
800            return None;
801        }
802        if !self.has_target_path() {
803            return None;
804        }
805        let target_path = self.target_path().unwrap();
806        if target_path.is_empty() && (self.path_interval.start != 0 || self.path_interval.end != 0) {
807            return None;
808        }
809
810        let mut iter = AlignmentIter {
811            parent: self,
812            sequence_len,
813            seq_offset: self.seq_interval.start,
814            path_offset: self.path_interval.start,
815            path_vec_offset: 0,
816            path_node_offset: self.path_interval.start,
817            diff_vec_offset: 0,
818            diff_op_offset: 0,
819        };
820        if iter.path_node_offset == 0 {
821            return Some(iter);
822        }
823
824        // In some edge cases, there may be unused nodes at the start of the target path.
825        let mut handle = *target_path.get(iter.path_vec_offset)?;
826        let mut node_len = (*iter.sequence_len)(handle)?;
827        while iter.path_node_offset >= node_len {
828            iter.path_node_offset -= node_len;
829            iter.path_vec_offset += 1;
830            handle = *target_path.get(iter.path_vec_offset)?;
831            node_len = (*iter.sequence_len)(handle)?;
832        }
833
834        Some(iter)
835    }
836
837    // Creates an empty fragment of this alignment.
838    fn empty_fragment(&self, fragment_index: usize) -> Alignment {
839        let mut aln = Alignment {
840            name: self.name.clone(),
841            seq_len: self.seq_len,
842            seq_interval: 0..0,
843            path: TargetPath::Path(Vec::new()),
844            path_len: 0,
845            path_interval: 0..0,
846            matches: self.matches,
847            edits: self.edits,
848            mapq: self.mapq,
849            score: self.score,
850            base_quality: self.base_quality.clone(),
851            difference: Vec::new(),
852            pair: self.pair.clone(),
853            optional: self.optional.clone(),
854        };
855        aln.optional.push(TypedField::Int([b'f', b'i'], fragment_index as isize));
856
857        aln
858    }
859
860    // Extends the given fragment with the next mapping of the same original alignment.
861    fn extend(&mut self, mapping: Mapping) -> Result<(), String> {
862        // Start by updating the difference string.
863        if self.seq_interval.is_empty() && self.path_interval.is_empty() {
864            // If we started with a gap, `is_unaligned()` would be true.
865            if !self.difference.is_empty() {
866                return Err(String::from("Cannot extend an unaligned fragment with a non-empty difference string"));
867            }
868            self.difference.push(mapping.edit().clone());
869        } else {
870            if self.difference.is_empty() {
871                return Err(String::from("Cannot extend an aligned fragment without a difference string"));
872            }
873            if !self.difference.last_mut().unwrap().try_merge(mapping.edit()) {
874                self.difference.push(mapping.edit().clone());
875            }
876        }
877
878        // Update the query sequence.
879        if mapping.seq_interval().end > self.seq_len {
880            return Err(String::from("Cannot extend a fragment beyond the query sequence length"));
881        }
882        if self.seq_interval.is_empty() {
883            // Either the first mapping or we only have deletions so far.
884            self.seq_interval = mapping.seq_interval().clone();
885        } else {
886            if mapping.seq_interval().start != self.seq_interval.end {
887                return Err(String::from("Cannot append a non-contiguous query interval"));
888            }
889            self.seq_interval.end = mapping.seq_interval().end;
890        }
891
892        // TODO: enum?
893        // Determine how we are extending the alignment.
894        let target_path = self.target_path().ok_or_else(||
895            String::from("Cannot extend a fragment without an explicit target path")
896        )?;
897        let last_node = target_path.last().copied();
898        let path_left = self.path_len.saturating_sub(self.path_interval.end);
899        let reverse_offset = mapping.node_len().saturating_sub(mapping.node_interval().start);
900        let first_mapping = last_node.is_none();
901        let continues_in_same_node = Some(mapping.handle()) == last_node && reverse_offset == path_left;
902        let starts_a_new_node = path_left == 0 && mapping.is_at_start();
903
904        // Update the target path.
905        if first_mapping {
906            if let TargetPath::Path(path) = &mut self.path {
907                path.push(mapping.handle());
908            } else {
909                unreachable!();
910            }
911            self.path_len += mapping.node_len();
912            self.path_interval = mapping.node_interval().clone();
913        } else if starts_a_new_node {
914            if let TargetPath::Path(path) = &mut self.path {
915                path.push(mapping.handle());
916            } else {
917                unreachable!();
918            }
919            self.path_len += mapping.node_len();
920            self.path_interval.end += mapping.target_len();
921        } else if continues_in_same_node {
922            self.path_interval.end += mapping.target_len();
923        } else {
924            return Err(String::from("Cannot append a non-contiguous target interval"));
925        }
926
927        Ok(())
928    }
929
930    /// Clips the alignment into fragments that are fully contained in the given subgraph.
931    ///
932    /// Returns an empty vector if the read is unaligned or lacks an explicit non-empty target path or a difference string.
933    /// There will be one alignment fragment for every maximal subpath in the subgraph.
934    /// The fragments have the same name, pair, optional fields, and statistics as the original alignment.
935    /// Only the aligned intervals and difference strings depend on the fragment.
936    ///
937    /// Fragments of the same alignment are identified by a fragment index stored as an optional field `fi:i`.
938    /// The indexes start from 1.
939    /// Fragment index is omitted if the entire alignment is in the subgraph.
940    ///
941    /// Clipping requires a function that returns the sequence length for the node with the given handle.
942    /// This function may be based on [`gbz::GBZ`], [`Subgraph`], or [`crate::ReadSet`].
943    ///
944    /// # Errors
945    ///
946    /// Returns an error if an [`AlignmentIter`] cannot be created.
947    /// May return an error if the alignment is invalid and the iterator returns non-consecutive mappings.
948    ///
949    /// # Examples
950    ///
951    /// ```
952    /// use gbz::{GBZ, Orientation};
953    /// use gbz::support;
954    /// use gbz_base::{Alignment, Difference, Subgraph};
955    /// use gbz_base::utils;
956    /// use simple_sds::serialize;
957    /// use std::sync::Arc;
958    ///
959    /// let gbz_filename = utils::get_test_data("micb-kir3dl1.gbz");
960    /// let graph: GBZ = serialize::load_from(&gbz_filename).unwrap();
961    ///
962    /// // Create a perfect alignment with node lengths of 68 bp, 1 bp, and 6 bp.
963    /// let mut aln = Alignment::new();
964    /// aln.seq_len = 50;
965    /// aln.seq_interval = 0..50;
966    /// let target_path = vec![
967    ///     support::encode_node(13, Orientation::Forward),
968    ///     support::encode_node(14, Orientation::Forward),
969    ///     support::encode_node(16, Orientation::Forward),
970    /// ];
971    /// aln.set_target_path(target_path.clone());
972    /// aln.path_len = 75;
973    /// aln.path_interval = 20..70;
974    /// aln.matches = 50;
975    /// aln.difference.push(Difference::Match(50));
976    ///
977    /// // Create a subgraph without the middle node.
978    /// let mut subgraph = Subgraph::new();
979    /// let _ = subgraph.add_node_from_gbz(&graph, 13);
980    /// let _ = subgraph.add_node_from_gbz(&graph, 16);
981    ///
982    /// // Clip the alignment to the subgraph.
983    /// let sequence_len = Arc::new(|handle| {
984    ///     let (node_id, _) = support::decode_node(handle);
985    ///     graph.sequence_len(node_id)
986    /// });
987    /// let result = aln.clip(&subgraph, sequence_len.clone());
988    /// assert!(result.is_ok());
989    /// let clipped = result.unwrap();
990    ///
991    /// // We should have two fragments.
992    /// assert_eq!(clipped.len(), 2);
993    /// assert_eq!(clipped[0].seq_interval, 0..48);
994    /// assert_eq!(clipped[0].target_path().unwrap(), &target_path[0..1]);
995    /// assert_eq!(clipped[0].path_interval, 20..68);
996    /// assert_eq!(clipped[0].difference, vec![Difference::Match(48)]);
997    /// assert_eq!(clipped[1].seq_interval, 49..50);
998    /// assert_eq!(clipped[1].target_path().unwrap(), &target_path[2..3]);
999    /// assert_eq!(clipped[1].path_interval, 0..1);
1000    /// assert_eq!(clipped[1].difference, vec![Difference::Match(1)]);
1001    /// ```
1002    pub fn clip<'a>(&self, subgraph: &Subgraph, sequence_len: Arc<impl Fn(usize) -> Option<usize> + 'a>) -> Result<Vec<Alignment>, String> {
1003        let mut result = Vec::new();
1004        if self.is_unaligned() || !self.has_non_empty_target_path() || self.difference.is_empty() {
1005            return Ok(result);
1006        }
1007
1008        let mut aln: Option<Alignment> = None; // The alignment we are currently building.
1009        let iter = self.iter(sequence_len);
1010        if iter.is_none() {
1011            eprintln!("Warning: Cannot build an alignment iterator for {}", self.name);
1012            return Ok(result);
1013        }
1014        let iter = iter.unwrap();
1015        for mapping in iter {
1016            if !subgraph.has_handle(mapping.handle()) {
1017                if let Some(prev) = aln {
1018                    result.push(prev);
1019                    aln = None;
1020                }
1021                continue;
1022            }
1023
1024            if let Some(aln) = &mut aln {
1025                aln.extend(mapping)?;
1026            } else {
1027                let mut curr = self.empty_fragment(result.len() + 1);
1028                curr.extend(mapping)?;
1029                aln = Some(curr);
1030            }
1031        }
1032        if let Some(aln) = aln {
1033            result.push(aln);
1034        }
1035
1036        // Clear the fragment index if we have the entire alignment as a single fragment.
1037        if result.len() == 1 && result[0].seq_interval == self.seq_interval {
1038            result[0].optional.retain(|field| {
1039                !matches!(field, TypedField::Int([b'f', b'i'], _))
1040            });
1041        }
1042
1043        Ok(result)
1044    }
1045}
1046
1047//-----------------------------------------------------------------------------
1048
1049/// Target path in [`Alignment`].
1050///
1051/// The path is stored either explicitly as a sequence of oriented nodes or as its starting position in the GBWT index.
1052/// This corresponds to table `Nodes` in [`crate::GAFBase`].
1053#[derive(Clone, Debug, PartialEq, Eq)]
1054pub enum TargetPath {
1055    /// Path as a sequence of oriented nodes (GBWT node identifiers; GAF-base node handles).
1056    Path(Vec<usize>),
1057    /// Starting position in the GBWT.
1058    StartPosition(Pos),
1059}
1060
1061/// Information about the paired alignment.
1062#[derive(Clone, Debug, PartialEq, Eq)]
1063pub struct PairedRead {
1064    /// Name of the pair.
1065    pub name: String,
1066    /// Is the pair the next the next fragment?
1067    pub is_next: bool,
1068    /// Are the alignments properly paired?
1069    pub is_proper: bool,
1070}
1071
1072//-----------------------------------------------------------------------------
1073
1074/// An iterator over an [`Alignment`] as a sequence of [`Mapping`] objects.
1075///
1076/// This iterator assumes that the alignment is valid, and it may stop early if it is not.
1077/// It requires that the alignment stores the target path explicitly and has a difference string.
1078/// Insertions at node boundaries are assigned to the left.
1079/// If the difference string is inconsistent with the sequence / path intervals, the difference string takes precedence.
1080///
1081/// # Examples
1082///
1083/// ```
1084/// use gbz_base::{Alignment, Mapping, Difference};
1085/// use std::sync::Arc;
1086///
1087/// // Construct an alignment.
1088/// let mut aln = Alignment::new();
1089/// aln.seq_len = 15;
1090/// aln.seq_interval = 0..15;
1091/// aln.set_target_path(vec![1, 2, 3]);
1092/// aln.path_len = 15;
1093/// aln.path_interval = 0..15;
1094/// aln.matches = 14;
1095/// aln.edits = 1;
1096/// aln.difference = vec![
1097///     Difference::Match(6),
1098///     Difference::Mismatch(b'A'),
1099///     Difference::Match(8),
1100/// ];
1101///
1102/// // We use this in place of a graph for node lengths.
1103/// let node_lengths = vec![0, 4, 6, 5];
1104/// let sequence_len = Arc::new(|handle| node_lengths.get(handle).copied());
1105///
1106/// // This is what we expect from the iterator.
1107/// let truth = vec![
1108///     Mapping::new(0, 1, node_lengths[1], 0, Difference::Match(4)),
1109///     Mapping::new(4, 2, node_lengths[2], 0, Difference::Match(2)),
1110///     Mapping::new(6, 2, node_lengths[2], 2, Difference::Mismatch(b'A')),
1111///     Mapping::new(7, 2, node_lengths[2], 3, Difference::Match(3)),
1112///     Mapping::new(10, 3, node_lengths[3], 0, Difference::Match(5)),
1113/// ];
1114///
1115/// // Iterator creation may fail if the alignment is invalid.
1116/// let iter = aln.iter(sequence_len);
1117/// assert!(iter.is_some());
1118/// let iter = iter.unwrap();
1119/// assert!(iter.eq(truth.iter().cloned()));
1120/// ```
1121#[derive(Clone)]
1122pub struct AlignmentIter<'a> {
1123    parent: &'a Alignment,
1124    sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>,
1125    // Position in the query sequence.
1126    seq_offset: usize,
1127    // Position in the target sequence.
1128    path_offset: usize,
1129    // Position in the target path.
1130    path_vec_offset: usize,
1131    // Position within the current node in the target path.
1132    path_node_offset: usize,
1133    // Position in the difference string.
1134    diff_vec_offset: usize,
1135    // Target position in the current difference operation.
1136    diff_op_offset: usize,
1137}
1138
1139impl<'a> Iterator for AlignmentIter<'a> {
1140    type Item = Mapping;
1141
1142    fn next(&mut self) -> Option<Self::Item> {
1143        if self.diff_vec_offset >= self.parent.difference.len() {
1144            return None;
1145        }
1146
1147        // First find the edit that forms the basis of the mapping.
1148        let full_edit = self.parent.difference.get(self.diff_vec_offset)?;
1149        let edit_left = full_edit.target_len() - self.diff_op_offset;
1150
1151        // We may have unaligned insertions past the end of an empty target path.
1152        let target_path = self.parent.target_path()?;
1153        let handle = target_path.get(self.path_vec_offset);
1154        if handle.is_none() {
1155            match full_edit {
1156                Difference::Insertion(_) => {
1157                    let seq_offset = self.seq_offset;
1158                    self.seq_offset += full_edit.query_len();
1159                    self.diff_vec_offset += 1;
1160                    self.diff_op_offset = 0;
1161                    return Some(Mapping::unaligned(seq_offset, full_edit.clone()));
1162                },
1163                _ => {
1164                    return None;
1165                },
1166            }
1167        }
1168
1169        // Find the target node. We may need to advance to the next node we are
1170        // at the end of the node and we have an edit with non-zero target length.
1171        // We prefer mapping insertions to the end of a node rather than to the start
1172        // of the next node, as the next node may not exist.
1173        let mut handle = *handle.unwrap();
1174        let mut node_len = (*self.sequence_len)(handle)?;
1175        if self.path_node_offset >= node_len && edit_left > 0 {
1176            self.path_vec_offset += 1;
1177            self.path_node_offset = 0;
1178            handle = *target_path.get(self.path_vec_offset)?;
1179            node_len = (*self.sequence_len)(handle)?;
1180        }
1181        let node_left = node_len - self.path_node_offset;
1182
1183        // Take the offsets for the mapping before we update them.
1184        let seq_offset = self.seq_offset;
1185        let node_offset = self.path_node_offset;
1186
1187        // Now determine the actual edit.
1188        let edit_len = cmp::min(edit_left, node_left);
1189        let edit = match full_edit {
1190            Difference::Match(_) => Difference::Match(edit_len),
1191            Difference::Mismatch(base) => Difference::Mismatch(*base),
1192            Difference::Insertion(seq) => {
1193                // We can always take the full insertion within the current node.
1194                Difference::Insertion(seq.clone())
1195            },
1196            Difference::Deletion(_) => Difference::Deletion(edit_len),
1197            Difference::End => {
1198                self.diff_vec_offset = self.parent.difference.len();
1199                return None;
1200            },
1201        };
1202
1203        // And then advance the iterator.
1204        self.seq_offset += edit.query_len();
1205        self.path_offset += edit.target_len();
1206        // We do not advance the node, as the next edit may be an insertion.
1207        self.path_node_offset += edit.target_len();
1208        self.diff_op_offset += edit.target_len();
1209        if self.diff_op_offset >= full_edit.target_len() {
1210            self.diff_vec_offset += 1;
1211            self.diff_op_offset = 0;
1212        }
1213
1214        Some(Mapping::new(seq_offset, handle, node_len, node_offset, edit))
1215    }
1216}
1217
1218//-----------------------------------------------------------------------------
1219
1220/// An ad hoc bitvector for flags in [`AlignmentBlock`].
1221#[derive(Debug, Clone)]
1222pub struct Flags {
1223    bits: Vec<u8>,
1224}
1225
1226impl Flags {
1227    /// Number of flags per alignment.
1228    pub const NUM_FLAGS: usize = 6;
1229
1230    // Flags expressed as a bit offset.
1231    // TODO: As enum?
1232
1233    /// Flag: Pair is the next fragment in the read pair.
1234    pub const FLAG_PAIR_IS_NEXT: usize = 0;
1235
1236    /// Flag: The the alignment is properly paired.
1237    pub const FLAG_PAIR_IS_PROPER: usize = 1;
1238
1239    /// Flag: A mapping quality score is present.
1240    pub const FLAG_HAS_MAPQ: usize = 2;
1241
1242    /// Flag: An alignment score is present.
1243    pub const FLAG_HAS_SCORE: usize = 3;
1244
1245    /// Flag: This is an alignment of the entire query sequence.
1246    ///
1247    /// We do not encode the lengths of the unaligned flanks.
1248    pub const FLAG_FULL_ALIGNMENT: usize = 4;
1249
1250    /// Flag: This is an exact alignment.
1251    ///
1252    /// We do not encode the difference string, matches, or edits.
1253    pub const FLAG_EXACT_ALIGNMENT: usize = 5;
1254
1255    /// Creates a new flags object with the given number of alignments.
1256    /// The flags are initialized to zero (false).
1257    pub fn new(num_alignments: usize) -> Self {
1258        let bits = vec![0; (num_alignments * Self::NUM_FLAGS).div_ceil(8)];
1259        Self { bits }
1260    }
1261
1262    /// Sets the given flag.
1263    ///
1264    /// # Arguments
1265    ///
1266    /// * `index`: The index of the alignment.
1267    /// * `flag`: The flag to set.
1268    /// * `value`: The value to set the flag to.
1269    pub fn set(&mut self, index: usize, flag: usize, value: bool) {
1270        let bit = index * Self::NUM_FLAGS + flag;
1271        if value {
1272            self.bits[bit / 8] |= 1 << (bit % 8);
1273        } else {
1274            self.bits[bit / 8] &= !(1 << (bit % 8));
1275        }
1276    }
1277
1278    /// Gets the value of the given flag.
1279    ///
1280    /// # Arguments
1281    ///
1282    /// * `index`: The index of the alignment.
1283    /// * `flag`: The flag to get.
1284    pub fn get(&self, index: usize, flag: usize) -> bool {
1285        let bit = index * Self::NUM_FLAGS + flag;
1286        (self.bits[bit / 8] >> (bit % 8)) & 0x01 != 0
1287    }
1288
1289    /// Returns the number of bytes used to store the flags.
1290    pub fn bytes(&self) -> usize {
1291        self.bits.len()
1292    }
1293}
1294
1295impl From<Vec<u8>> for Flags {
1296    fn from(bits: Vec<u8>) -> Self {
1297        Self { bits }
1298    }
1299}
1300
1301impl AsRef<[u8]> for Flags {
1302    fn as_ref(&self) -> &[u8] {
1303        &self.bits
1304    }
1305}
1306
1307//-----------------------------------------------------------------------------
1308
1309// TODO: expected alignment score for perfect alignments? Or just a scoring model?
1310/// An encoded block of [`Alignment`] objects.
1311///
1312/// This is a compressed representation of multiple sequences aligned to the same graph.
1313/// With short reads, a properly chosen block size enables both column-based compression and random access to the alignments.
1314/// Long reads do not benefit from column-based compression, as the amount of metadata is insignificant.
1315/// Reasonable block sizes could be 1000 alignments for short reads and 10 for long reads.
1316///
1317/// For best results, node identifiers should approximate a topological order in the graph.
1318/// The range of node identifiers in a path is typically proportional to the length of the path.
1319/// If the alignments are sorted by (min id, max id), a block will then consist of alignments that are close to each other in the graph.
1320///
1321/// # Notes
1322///
1323/// * Target paths must be stored separately (e.g. using a GBWT-based encoding).
1324/// * A block must contain either aligned reads or unaligned reads, but not both.
1325///
1326/// # Examples
1327///
1328/// ```
1329/// use gbz_base::{Alignment, AlignmentBlock};
1330/// use gbz_base::utils::PathStartSource;
1331/// use gbz_base::{formats, utils};
1332/// use gbz::GBWT;
1333/// use gbz::support;
1334/// use simple_sds::serialize;
1335///
1336/// // We assume that the target paths are stored separately in a GBWT index.
1337/// let gbwt_filename = utils::get_test_data("micb-kir3dl1_HG003.gbwt");
1338/// let index: GBWT = serialize::load_from(&gbwt_filename).unwrap();
1339///
1340/// // We use the GBWT index for determining the starting position for each path.
1341/// // But we could also compute it on the fly with `PathStartSource::new()`.
1342/// let mut source = PathStartSource::from(&index);
1343///
1344/// // Open a GAF file and skip the header.
1345/// let gaf_filename = utils::get_test_data("micb-kir3dl1_HG003.gaf");
1346/// let mut gaf_file = utils::open_file(&gaf_filename).unwrap();
1347/// while formats::peek_gaf_header_line(&mut gaf_file).unwrap() {
1348///    let mut buf: Vec<u8> = Vec::new();
1349///    let _ = gaf_file.read_until(b'\n', &mut buf).unwrap();
1350/// }
1351///
1352/// // Read some alignments from the GAF file.
1353/// let mut alignments = Vec::new();
1354/// for _ in 0..10 {
1355///     let mut buf: Vec<u8> = Vec::new();
1356///     let _ = gaf_file.read_until(b'\n', &mut buf).unwrap();
1357///     let aln = Alignment::from_gaf(&buf).unwrap();
1358///     // A block cannot have a mix of aligned and unaligned reads.
1359///     assert!(!aln.is_unaligned());
1360///     alignments.push(aln);
1361/// }
1362///
1363/// // Compress the block.
1364/// let mut first_id = 0;
1365/// let block = AlignmentBlock::new(&alignments, &mut source, first_id).unwrap();
1366/// assert_eq!(block.len(), alignments.len());
1367/// first_id += alignments.len(); // Next block would start there.
1368///
1369/// // Decompress the block and extract the paths from the GBWT.
1370/// let mut decompressed = block.decode().unwrap();
1371/// assert_eq!(decompressed.len(), alignments.len());
1372/// for (i, aln) in decompressed.iter_mut().enumerate() {
1373///     aln.extract_target_path(&index);
1374///     // NOTE: We need the reference graph to determine the true target path length.
1375///     aln.path_len = alignments[i].path_len;
1376///     assert_eq!(*aln, alignments[i]);
1377/// }
1378/// ```
1379#[derive(Debug, Clone)]
1380pub struct AlignmentBlock {
1381    /// Minimum GBWT node identifier in the target paths, or [`None`] if this is a block of unaligned reads.
1382    pub min_handle: Option<usize>,
1383    /// Maximum GBWT node identifier in the target paths, or [`None`] if this is a block of unaligned reads.
1384    pub max_handle: Option<usize>,
1385    /// Number of alignments in the block.
1386    pub alignments: usize,
1387    /// Expected read length in the block, or [`None`] if the lengths vary.
1388    pub read_length: Option<usize>,
1389    /// GBWT starting positions for the target paths.
1390    pub gbwt_starts: Vec<u8>,
1391    /// Read and pair names.
1392    pub names: Vec<u8>,
1393    /// Quality strings.
1394    pub quality_strings: Vec<u8>,
1395    /// Difference strings.
1396    pub difference_strings: Vec<u8>,
1397    /// Binary flags for each alignment.
1398    pub flags: Flags,
1399    /// Encoded numerical information that cannot be derived from the other fields.
1400    pub numbers: Vec<u8>,
1401    /// Optional typed fields that have not been interpreted.
1402    pub optional: Vec<u8>,
1403}
1404
1405impl AlignmentBlock {
1406    // TODO: Make this a parameter?
1407    /// Compression level for Zstandard.
1408    pub const COMPRESSION_LEVEL: i32 = 7;
1409
1410    fn expected_read_length(alignments: &[Alignment]) -> Option<usize> {
1411        let mut read_length = None;
1412        for aln in alignments.iter() {
1413            if let Some(len) = read_length {
1414                if aln.seq_len != len {
1415                    read_length = None;
1416                    break;
1417                }
1418            } else {
1419                if aln.seq_len == 0 {
1420                    break;
1421                }
1422                read_length = Some(aln.seq_len);
1423            }
1424        }
1425        read_length
1426    }
1427
1428    fn compress_gbwt_starts(
1429        alignments: &[Alignment],
1430        source: &mut PathStartSource,
1431        first_id: usize, min_handle: Option<usize>
1432    ) -> Result<Vec<u8>, String> {
1433        let base_node = min_handle.unwrap_or(0);
1434        let mut encoder = ByteCode::new();
1435        for (i, aln) in alignments.iter().enumerate() {
1436            let node_id = if let Some(path) = aln.target_path() {
1437                path.first().copied().unwrap_or(ENDMARKER)
1438            } else {
1439                return Err(format!("Line {}: Cannot compute GBWT start for an alignment without an explicit target path", first_id + i));
1440            };
1441
1442            // GBWT start as (node - min_handle, offset).
1443            if let Some(start) = source.next(node_id) {
1444                encoder.write(start.node - base_node);
1445                encoder.write(start.offset);
1446            } else if !encoder.is_empty() {
1447                return Err(format!("Line {}: Unaligned read in a block with aligned reads", first_id + i));
1448            }
1449        }
1450        Ok(Vec::from(encoder))
1451    }
1452
1453    // TODO: Somewhere else?
1454    fn zstd_compress(data: &[u8]) -> Result<Vec<u8>, String> {
1455        let mut encoder = ZstdEncoder::new(Vec::new(), Self::COMPRESSION_LEVEL).unwrap();
1456        encoder.write_all(data).map_err(|err| format!("Zstd compression error: {}", err))?;
1457        encoder.finish().map_err(|err| format!("Zstd compression error: {}", err))
1458    }
1459
1460    fn compress_names_pairs(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1461        let mut names: Vec<u8> = Vec::new();
1462        for aln in alignments.iter() {
1463            // Read name as a 0-terminated string.
1464            names.extend_from_slice(aln.name.as_bytes());
1465            names.push(0);
1466            // Pair name as a 0-terminated string, if present.
1467            if let Some(pair) = &aln.pair {
1468                names.extend_from_slice(pair.name.as_bytes());
1469            }
1470            names.push(0);
1471        }
1472        Self::zstd_compress(&names)
1473    }
1474
1475    fn compress_quality_strings(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1476        let mut quality_strings: Vec<u8> = Vec::new();
1477        for aln in alignments.iter() {
1478            // Quality strings as 0-terminated strings.
1479            if !aln.base_quality.is_empty() {
1480                quality_strings.extend_from_slice(&aln.base_quality);
1481            }
1482            quality_strings.push(0);
1483        }
1484        if quality_strings.len() <= alignments.len() {
1485            // If we have no quality strings, we can store an empty blob.
1486            return Ok(Vec::new());
1487        }
1488        Self::zstd_compress(&quality_strings)
1489    }
1490
1491    // TODO: Instead of TAG:TYPE:VALUE, we could drop the separators.
1492    fn compress_optional_fields(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1493        let mut optional: Vec<u8> = Vec::new();
1494        for aln in alignments.iter() {
1495            // Optional fields as a sequence of typed fields.
1496            for (i, field) in aln.optional.iter().enumerate() {
1497                field.append_to(&mut optional, i > 0);
1498            }
1499            optional.push(0); // Terminator for the optional fields of each alignment.
1500        }
1501        if optional.len() <= alignments.len() {
1502            // If we have no optional fields, we can store an empty blob.
1503            return Ok(Vec::new());
1504        }
1505        Self::zstd_compress(&optional)
1506    }
1507
1508    /// Creates a new alignment block from the given read alignments and GBWT index.
1509    ///
1510    /// If the reads are aligned, they correspond to paths `first_id` to `first_id + alignments.len() - 1` in the GBWT index.
1511    /// The GBWT index may be bidirectional or unidirectional.
1512    ///
1513    /// # Arguments
1514    ///
1515    /// * `alignments`: The alignments to include in the block.
1516    /// * `source`: A source for GBWT starting positions of the target paths.
1517    /// * `first_id`: Path identifier of the first alignment in the block.
1518    ///
1519    /// # Errors
1520    ///
1521    /// Returns an error, if:
1522    ///
1523    /// * The block contains a mix of aligned and unaligned reads.
1524    /// * The source computes GBWT starts on the fly, but an alignment does not store the target path explicitly.
1525    /// * Compression fails.
1526    pub fn new(alignments: &[Alignment], source: &mut PathStartSource, first_id: usize) -> Result<Self, String> {
1527        let min_handle = alignments.iter().map(|aln| aln.min_handle()).min().flatten();
1528        let max_handle = alignments.iter().map(|aln| aln.max_handle()).max().flatten();
1529        let read_length = Self::expected_read_length(alignments);
1530        let gbwt_starts = Self::compress_gbwt_starts(alignments, source, first_id, min_handle)?;
1531        let names = Self::compress_names_pairs(alignments)?;
1532        let quality_strings = Self::compress_quality_strings(alignments)?;
1533        let optional = Self::compress_optional_fields(alignments)?;
1534
1535        // We encode the remaining fields together, as they depend on each other.
1536        let mut difference_strings = RLE::with_sigma(Difference::NUM_TYPES);
1537        let mut flags = Flags::new(alignments.len());
1538        let mut numbers = ByteCode::new();
1539
1540        for (i, aln) in alignments.iter().enumerate() {
1541            let full_alignment = aln.is_full();
1542            let exact_alignment = aln.is_exact();
1543            if !exact_alignment {
1544                // If the alignment is exact, we can reconstruct the difference string from
1545                // the length of the aligned interval.
1546                aln.encode_difference_into(&mut difference_strings);
1547            }
1548
1549            if let Some(pair) = &aln.pair {
1550                if pair.is_next {
1551                    flags.set(i, Flags::FLAG_PAIR_IS_NEXT, true);
1552                }
1553                if pair.is_proper {
1554                    flags.set(i, Flags::FLAG_PAIR_IS_PROPER, true);
1555                }
1556            }
1557            if aln.mapq.is_some() {
1558                flags.set(i, Flags::FLAG_HAS_MAPQ, true);
1559            }
1560            if aln.score.is_some() {
1561                flags.set(i, Flags::FLAG_HAS_SCORE, true);
1562            }
1563            flags.set(i, Flags::FLAG_FULL_ALIGNMENT, full_alignment);
1564            flags.set(i, Flags::FLAG_EXACT_ALIGNMENT, exact_alignment);
1565
1566            aln.encode_numbers_into(&mut numbers, read_length.is_some());
1567        }
1568
1569        // TODO: Also consider using an empty blob for no difference strings, as we do for quality strings.
1570
1571        Ok(Self {
1572            min_handle,
1573            max_handle,
1574            alignments: alignments.len(),
1575            read_length,
1576            gbwt_starts,
1577            names,
1578            quality_strings,
1579            difference_strings: Vec::from(difference_strings),
1580            flags,
1581            numbers: Vec::from(numbers),
1582            optional,
1583        })
1584    }
1585
1586    /// Returns the number of alignments in the block.
1587    pub fn len(&self) -> usize {
1588        self.alignments
1589    }
1590
1591    /// Returns `true` if the block contains no alignments.
1592    pub fn is_empty(&self) -> bool {
1593        self.alignments == 0
1594    }
1595
1596    fn decompress_gbwt_starts(&self, result: &mut [Alignment]) -> Result<(), String> {
1597        if self.min_handle.is_none() {
1598            return Ok(());
1599        }
1600        let base_node = self.min_handle.unwrap();
1601        let mut decoder: ByteCodeIter<'_> = ByteCodeIter::new(&self.gbwt_starts[..]);
1602        for (i, aln) in result.iter_mut().enumerate() {
1603            let start = decoder.next().ok_or_else(||
1604                format!("Missing GBWT start for alignment {}", i)
1605            )?;
1606            let offset = decoder.next().ok_or_else(||
1607                format!("Missing GBWT offset for alignment {}", i)
1608            )?;
1609            aln.path = TargetPath::StartPosition(Pos::new(start + base_node, offset));
1610        }
1611        Ok(())
1612    }
1613
1614    // TODO: Somewhere else?
1615    fn zstd_decompress(data: &[u8]) -> Result<Vec<u8>, String> {
1616        let mut decoder = ZstdDecoder::new(data).map_err(|err| format!("Zstd decompression error: {}", err))?;
1617        let mut buffer = Vec::new();
1618        decoder.read_to_end(&mut buffer).map_err(|err| format!("Zstd decompression error: {}", err))?;
1619        Ok(buffer)
1620    }
1621
1622    fn decompress_names_pairs(&self, result: &mut [Alignment]) -> Result<(), String> {
1623        let buffer = Self::zstd_decompress(&self.names[..])?;
1624        let mut iter = buffer.split(|&c| c == 0);
1625        for (i, aln) in result.iter_mut().enumerate() {
1626            let name = iter.next().ok_or_else(|| format!("Missing name for alignment {}", i))?;
1627            aln.name = String::from_utf8_lossy(name).to_string();
1628            let pair_name = iter.next().ok_or_else(|| format!("Missing pair name for alignment {}", i))?;
1629            if !pair_name.is_empty() {
1630                aln.pair = Some(PairedRead {
1631                    name: String::from_utf8_lossy(pair_name).to_string(),
1632                    is_next: self.flags.get(i, Flags::FLAG_PAIR_IS_NEXT),
1633                    is_proper: self.flags.get(i, Flags::FLAG_PAIR_IS_PROPER),
1634                });
1635            }
1636        }
1637        // There should be an empty slice at the end, as the buffer is 0-terminated.
1638        // TODO: Should we check this?
1639        Ok(())
1640    }
1641
1642    fn decompress_quality_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
1643        if self.quality_strings.is_empty() {
1644            // An empty blob indicates no quality strings.
1645            return Ok(());
1646        }
1647
1648        let buffer = Self::zstd_decompress(&self.quality_strings[..])?;
1649        let mut iter = buffer.split(|&c| c == 0);
1650        for (i, aln) in result.iter_mut().enumerate() {
1651            let quality = iter.next().ok_or_else(|| format!("Missing quality string for alignment {}", i))?;
1652            aln.base_quality = quality.to_vec();
1653        }
1654        // There should be an empty slice at the end, as the buffer is 0-terminated.
1655        // TODO: Should we check this?
1656        Ok(())
1657    }
1658
1659    fn decompress_difference_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
1660        // TODO: Also consider using an empty blob for no difference strings, as we do for quality strings.
1661
1662        let mut decoder = RLEIter::with_sigma(&self.difference_strings[..], Difference::NUM_TYPES);
1663        for (i, aln) in result.iter_mut().enumerate() {
1664            if !self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
1665                let res = Alignment::decode_difference_from(&self.difference_strings, &mut decoder);
1666                aln.difference = res.map_err(|err| format!("Missing difference string for alignment {}: {}", i, err))?;
1667            }
1668            // If the alignment is exact, we will reconstruct the difference string once we know
1669            // the length of the aligned interval.
1670        }
1671        Ok(())
1672    }
1673
1674    fn decompress_numbers(&self, result: &mut [Alignment]) -> Result<(), String> {
1675        let mut decoder = ByteCodeIter::new(&self.numbers[..]);
1676        for (i, aln) in result.iter_mut().enumerate() {
1677            let full_alignment = self.flags.get(i, Flags::FLAG_FULL_ALIGNMENT);
1678            let exact_alignment = self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT);
1679            // We did not store the difference string for an exact alignment.
1680            let stored_difference_string = aln.has_difference_string();
1681
1682            // Derive the numbers from the difference string first.
1683            // If the difference string is empty, we get zeros and update them later.
1684            let (mut query_len, mut target_len, num_matches, num_edits) = Difference::stats(&aln.difference);
1685            aln.matches = num_matches;
1686            aln.edits = num_edits;
1687
1688            // Query sequence coordinates.
1689            if let Some(len) = self.read_length {
1690                query_len = len;
1691            } else if !stored_difference_string {
1692                query_len = decoder.next().ok_or_else(|| format!("Missing query sequence aligned length for alignment {}", i))?;
1693            }
1694            let (query_left, query_right) = if full_alignment {
1695                (0, 0)
1696            } else {
1697                let query_left = decoder.next().ok_or_else(|| format!("Missing query sequence left flank for alignment {}", i))?;
1698                let query_right = decoder.next().ok_or_else(|| format!("Missing query sequence right flank for alignment {}", i))?;
1699                (query_left, query_right)
1700            };
1701            if self.read_length.is_some() {
1702                // The known length was for the entire query sequence, including the flanks.
1703                query_len -= query_left + query_right;
1704            }
1705            aln.seq_interval = query_left..(query_left + query_len);
1706            aln.seq_len = query_len + query_left + query_right;
1707
1708            // Target path coordinates.
1709            if exact_alignment {
1710                target_len = query_len;
1711            } else if !stored_difference_string {
1712                target_len = decoder.next().ok_or_else(|| format!("Missing target path aligned length for alignment {}", i))?;
1713            }
1714            let target_left = decoder.next().ok_or_else(|| format!("Missing target path left flank for alignment {}", i))?;
1715            let target_right = 0; // We can determine the length of the right flank later.
1716            aln.path_interval = target_left..(target_left + target_len);
1717            aln.path_len = target_len + target_left + target_right;
1718
1719            // Alignment statistics.
1720            if exact_alignment {
1721                aln.matches = query_len;
1722                aln.edits = 0;
1723            } else if !stored_difference_string {
1724                aln.matches = decoder.next().ok_or_else(|| format!("Missing number of matches for alignment {}", i))?;
1725                aln.edits = decoder.next().ok_or_else(|| format!("Missing number of edits for alignment {}", i))?;
1726            }
1727            if self.flags.get(i, Flags::FLAG_HAS_MAPQ) {
1728                aln.mapq = Some(decoder.next().ok_or_else(|| format!("Missing mapping quality for alignment {}", i))?);
1729            }
1730            if self.flags.get(i, Flags::FLAG_HAS_SCORE) {
1731                aln.score = Some(Alignment::decode_signed(&mut decoder).ok_or_else(||
1732                    format!("Missing alignment score for alignment {}", i)
1733                )?);
1734            }
1735        }
1736
1737        Ok(())
1738    }
1739
1740    fn decompress_optional_fields(&self, result: &mut [Alignment]) -> Result<(), String> {
1741        if self.optional.is_empty() {
1742            // An empty blob indicates no optional fields.
1743            return Ok(());
1744        }
1745
1746        let buffer = Self::zstd_decompress(&self.optional[..])?;
1747        let mut iter = buffer.split(|&c| c == 0);
1748        for (i, aln) in result.iter_mut().enumerate() {
1749            let optional = iter.next().ok_or_else(|| format!("Missing optional fields for alignment {}", i))?;
1750            let fields = optional.split(|&c| c == b'\t');
1751            for field in fields {
1752                let parsed = TypedField::parse(field)?;
1753                aln.optional.push(parsed);
1754            }
1755        }
1756        // There should be an empty slice at the end, as the buffer is 0-terminated.
1757        // TODO: Should we check this?
1758        Ok(())
1759    }
1760
1761    /// Decompresses the block into a vector of alignments.
1762    ///
1763    /// Aligned query sequences have target paths represented as GBWT starting positions.
1764    /// The path can be set with [`Alignment::set_target_path`] or extracted from a GBWT index with [`Alignment::extract_target_path`].
1765    /// Unaligned query sequences have empty target paths.
1766    ///
1767    /// The true length of the target path cannot be determined from the alignment block alone.
1768    /// It can be set later using [`Alignment::set_target_path_len`].
1769    ///
1770    /// # Errors
1771    ///
1772    /// Returns an error if decompression fails or if data required for decoding the block is missing.
1773    pub fn decode(&self) -> Result<Vec<Alignment>, String> {
1774        let mut result = vec![Alignment::default(); self.len()];
1775
1776        // In a block of aligned reads, we have a GBWT start for every alignment.
1777        // In a block of unaligned reads, we have no GBWT starts.
1778        self.decompress_gbwt_starts(&mut result)?;
1779
1780        // Missing query / pair names are encoded as empty strings.
1781        self.decompress_names_pairs(&mut result)?;
1782
1783        // This could be an empty blob if we have no quality strings.
1784        self.decompress_quality_strings(&mut result)?;
1785
1786        // The encoding includes empty / missing difference strings but no difference strings
1787        // for exact alignments.
1788        self.decompress_difference_strings(&mut result)?;
1789
1790        // We need the flags and the difference strings (for non-exact alignments) to decode the numbers.
1791        self.decompress_numbers(&mut result)?;
1792
1793        // This could be an empty blob if we have no optional fields.
1794        self.decompress_optional_fields(&mut result)?;
1795
1796        // Reconstruct the difference strings for exact alignments as the last step.
1797        // We may not know the aligned interval length until we have decompressed the numbers.
1798        for (i, aln) in result.iter_mut().enumerate() {
1799            if self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
1800                aln.difference = vec![Difference::Match(aln.seq_interval.len())];
1801            }
1802        }
1803
1804        Ok(result)
1805    }
1806}
1807
1808//-----------------------------------------------------------------------------