gbz_base/alignment.rs
1//! Structures for representing sequence to graph alignments.
2//!
3//! An [`Alignment`] object represents the alignment of a query sequence to a target path in a graph.
4//! It corresponds to a single line in a GAF file.
5//!
6//! An [`AlignmentBlock`] object represents an ordered collection of alignments, using column-based compression.
7//! It corresponds to a row in table `Alignments` in a [`crate::GAFBase`].
8//!
9//! The GAF format is a text-based format for representing sequence alignments to a graph.
10//! See [the specification](https://github.com/lh3/gfatools/blob/master/doc/rGFA.md) for an overview.
11//! Some details are better documented in the [minimap2 man page](https://lh3.github.io/minimap2/minimap2.html#10).
12//!
13//! # Assumptions
14//!
15//! Some details of the GAF format are not well documented, and this implementation makes some assumptions.
16//!
17//! * Softclips are included either in both the query interval and the difference string or in neither.
18//! * If a difference string is present, it overrides query/path interval ends, the number of matches, and alignment block length.
19//! * Alignment block length is the sum of matches, mismatches, insertions, and deletions.
20//!
21//! # Supported optional fields
22//!
23//! * `AS:i`: Alignment score.
24//! * `bq:Z`: Base quality values for the query sequence.
25//! * `cs:Z`: Difference string for the alignment.
26//! * `pd:b`: Properly paired flag.
27//! * `fn:Z`: Name of the next read in the pair.
28//! * `fp:Z`: Name of the previous read in the pair.
29//!
30//! Only matches, mismatches, insertions, and deletions are supported in the difference string.
31//! Matches are represented as a match length rather than as the matching sequence.
32//!
33//! The order of optional fields is not preserved.
34
35use crate::{Subgraph, Mapping, Difference};
36use crate::formats::{self, TypedField};
37use crate::utils::{self, PathStartSource};
38
39use std::io::{Read, Write};
40use std::ops::Range;
41use std::sync::Arc;
42use std::{cmp, str};
43
44use zstd::stream::Encoder as ZstdEncoder;
45use zstd::stream::Decoder as ZstdDecoder;
46
47use gbz::{GBWT, Orientation, Pos, ENDMARKER};
48use gbz::support::{self, ByteCode, ByteCodeIter, RLE, Run, RLEIter};
49
50#[cfg(test)]
51mod tests;
52
53pub mod mapping;
54
55//-----------------------------------------------------------------------------
56
57// TODO: add validate() against a graph, a subgraph, or a read set
58/// An alignment between a query sequence and a target path in a graph.
59///
60/// This object corresponds either to a line in a GAF file or to a row in table `Alignments` in [`crate::GAFBase`].
61/// When the alignment is built from a GAF line, the target path is stored explicitly.
62/// For alignments stored in a database, only the GBWT starting position is stored.
63/// See [`TargetPath`] for details.
64///
65/// A GAF line can be converted to an `Alignment` object with [`Alignment::from_gaf`].
66/// An `Alignment` object can be serialized as a GAF line with [`Alignment::to_gaf`].
67/// The conversion can be lossy (see [`crate::alignment`] for details).
68///
69/// # Examples
70///
71/// ```
72/// use gbz_base::Alignment;
73/// use gbz::Orientation;
74/// use gbz::support;
75///
76/// // Unnamed empty sequence.
77/// let empty = Alignment::new();
78/// assert!(empty.name.is_empty());
79/// assert_eq!(empty.seq_len, 0);
80/// assert!(empty.is_unaligned());
81///
82/// // Construct a GAF line.
83/// let name = "query";
84/// let seq_len = 7;
85/// let seq_interval = 0..6;
86/// let path = ">1<2>3";
87/// let path_len = 8;
88/// let path_interval = 2..8;
89/// let matches = 5;
90/// let edits = 1;
91/// let mapq = 60;
92/// let line = format!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
93/// name,
94/// seq_len, seq_interval.start, seq_interval.end,
95/// '+', path,
96/// path_len, path_interval.start, path_interval.end,
97/// matches, matches + edits, mapq
98/// );
99///
100/// // Create an alignment from the line.
101/// let aln = Alignment::from_gaf(line.as_bytes()).unwrap();
102/// assert_eq!(aln.name, name);
103/// assert_eq!(aln.seq_len, seq_len);
104/// assert_eq!(aln.seq_interval, seq_interval);
105/// assert!(aln.has_target_path());
106/// let path = vec!(
107/// support::encode_node(1, Orientation::Forward),
108/// support::encode_node(2, Orientation::Reverse),
109/// support::encode_node(3, Orientation::Forward)
110/// );
111/// assert_eq!(aln.target_path(), Some(path.as_ref()));
112/// assert_eq!(aln.path_len, path_len);
113/// assert_eq!(aln.path_interval, path_interval);
114/// assert_eq!(aln.matches, matches);
115/// assert_eq!(aln.edits, edits);
116/// assert_eq!(aln.mapq, Some(mapq));
117///
118/// // All optional fields are missing.
119/// assert!(aln.score.is_none());
120/// assert!(aln.base_quality.is_empty());
121/// assert!(aln.difference.is_empty());
122/// assert!(aln.pair.is_none());
123/// assert!(aln.optional.is_empty());
124///
125/// // Convert back to GAF.
126/// let _query_sequence = b"GATTACA"; // Not used yet.
127/// let target_sequence = b"GAGATCAC".to_vec();
128/// let from_aln = aln.to_gaf(&target_sequence);
129/// assert_eq!(&from_aln, line.as_bytes());
130/// ```
131#[derive(Clone, Debug, PartialEq)]
132pub struct Alignment {
133 /// Name of the query sequence.
134 pub name: String,
135 /// Length of the query sequence.
136 pub seq_len: usize,
137 /// Aligned interval of the query sequence.
138 pub seq_interval: Range<usize>,
139 /// Target path in the orientation of the query sequence.
140 pub path: TargetPath,
141 /// Length of the target path.
142 pub path_len: usize,
143 /// Aligned interval of the target path.
144 pub path_interval: Range<usize>,
145 /// Number of matches in the alignment.
146 pub matches: usize,
147 /// Number of mismatches and gaps in the alignment.
148 pub edits: usize,
149 /// Mapping quality.
150 pub mapq: Option<usize>,
151 /// Alignment score.
152 pub score: Option<isize>,
153 /// Base quality values for the query sequence.
154 pub base_quality: Vec<u8>,
155 /// Difference string, or an empty vector if not present.
156 pub difference: Vec<Difference>,
157 /// Information about the paired alignment.
158 pub pair: Option<PairedRead>,
159 /// Optional typed fields that have not been interpreted.
160 pub optional: Vec<TypedField>,
161}
162
163impl Default for Alignment {
164 fn default() -> Self {
165 Alignment {
166 name: String::new(),
167 seq_len: 0,
168 seq_interval: 0..0,
169 path: TargetPath::Path(Vec::new()),
170 path_len: 0,
171 path_interval: 0..0,
172 matches: 0,
173 edits: 0,
174 mapq: None,
175 score: None,
176 base_quality: Vec::new(),
177 difference: Vec::new(),
178 pair: None,
179 optional: Vec::new(),
180 }
181 }
182}
183
184//-----------------------------------------------------------------------------
185
186/// Construction; conversions between GAF lines and `Alignment` objects.
187impl Alignment {
188 // Number of mandatory fields in a GAF line.
189 const MANDATORY_FIELDS: usize = 12;
190
191 // Placeholder value for a missing mapping quality.
192 const MISSING_MAPQ: usize = 255;
193
194 // The field is empty and the value is missing; typically used with unaligned sequences.
195 const MISSING_VALUE: [u8; 1] = [b'*'];
196
197 /// Creates an empty alignment.
198 pub fn new() -> Self {
199 Alignment::default()
200 }
201
202 // Parses a string field from a GAF field.
203 fn parse_string(field: &[u8], field_name: &str) -> Result<String, String> {
204 String::from_utf8(field.to_vec()).map_err(|err| {
205 format!("Invalid {}: {}", field_name, err)
206 })
207 }
208
209 // Parses an unsigned integer from a GAF field.
210 // Returns `0` if the value is missing.
211 fn parse_usize(field: &[u8], field_name: &str) -> Result<usize, String> {
212 if field == Self::MISSING_VALUE {
213 return Ok(0);
214 }
215 let number = str::from_utf8(field).map_err(|err| {
216 format!("Invalid {}: {}", field_name, err)
217 })?;
218 number.parse().map_err(|err| {
219 format!("Invalid {}: {}", field_name, err)
220 })
221 }
222
223 // Parses an interval from two GAF fields.
224 fn parse_interval(start: &[u8], end: &[u8]) -> Result<Range<usize>, String> {
225 let start = Self::parse_usize(start, "interval start")?;
226 let end = Self::parse_usize(end, "interval end")?;
227 Ok(start..end)
228 }
229
230 // Parses an orientation from a GAF field.
231 // Returns [`Orientation::Forward`] if the value is missing.
232 fn parse_orientation(field: &[u8], field_name: &str) -> Result<Orientation, String> {
233 if field == Self::MISSING_VALUE {
234 return Ok(Orientation::Forward);
235 }
236 if field.len() != 1 {
237 return Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field)));
238 }
239 match field[0] {
240 b'+' => Ok(Orientation::Forward),
241 b'-' => Ok(Orientation::Reverse),
242 _ => Err(format!("Invalid {}: {}", field_name, String::from_utf8_lossy(field))),
243 }
244 }
245
246 // Parses an oriented path from a GAF field.
247 // Returns an empty path if the value is missing.
248 fn parse_path(field: &[u8]) -> Result<Vec<usize>, String> {
249 let mut result = Vec::new();
250 if field == Self::MISSING_VALUE {
251 return Ok(result);
252 }
253
254 let mut start = 0;
255 while start < field.len() {
256 let orientation = match field[start] {
257 b'>' => Orientation::Forward,
258 b'<' => Orientation::Reverse,
259 _ => return Err(format!("Invalid segment orientation: {}", String::from_utf8_lossy(field))),
260 };
261 start += 1;
262 let end = field[start..].iter().position(|&c| c == b'>' || c == b'<').map_or(field.len(), |x| start + x);
263 let node = str::from_utf8(&field[start..end]).map_err(|err| {
264 format!("Invalid segment name: {}", err)
265 })?.parse().map_err(|_| {
266 String::from("Only numerical segment names are supported")
267 })?;
268 result.push(support::encode_node(node, orientation));
269 start = end;
270 }
271 Ok(result)
272 }
273
274 // Parses a pair name from the value of a typed field.
275 fn parse_pair(value: Vec<u8>, is_next: bool, output: &mut Option<PairedRead>) -> Result<(), String> {
276 if output.is_some() {
277 return Err(String::from("Multiple pair fields"));
278 }
279 let name = String::from_utf8(value).map_err(|err| {
280 format!("Invalid pair name: {}", err)
281 })?;
282 *output = Some(PairedRead {
283 name,
284 is_next,
285 is_proper: false,
286 });
287 Ok(())
288 }
289
290 /// Parses an alignment from a GAF line.
291 ///
292 /// Returns an error if the line cannot be parsed.
293 /// The line may end with up to one endline character, which is ignored.
294 /// The returned alignment stores the target path explicitly.
295 /// Parsing is based on bytes rather than characters to avoid unnecessary UTF-8 validation.
296 ///
297 /// If a difference string is present, some numerical fields will be recalculated from it.
298 /// These include interval ends on both the query and the target, as well as the number of matches and edits.
299 /// This behavior is justified, because some aligners may not calculate these values correctly.
300 pub fn from_gaf(line: &[u8]) -> Result<Self, String> {
301 // Check for an endline character which may be present.
302 let line = if line.last() == Some(&b'\n') {
303 &line[..line.len() - 1]
304 } else {
305 line
306 };
307
308 // Split the line into fields.
309 let fields = line.split(|&c| c == b'\t').collect::<Vec<_>>();
310 if fields.len() < Self::MANDATORY_FIELDS {
311 let line = String::from_utf8_lossy(line);
312 let message = format!("GAF line with fewer than {} fields: {}", Self::MANDATORY_FIELDS, line);
313 return Err(message);
314 }
315
316 // Query sequence.
317 let name = Self::parse_string(fields[0], "query sequence name")?;
318 let seq_len = Self::parse_usize(fields[1], "query sequence length")?;
319 let mut seq_interval = Self::parse_interval(fields[2], fields[3])?;
320
321 // Target path.
322 let orientation = Self::parse_orientation(fields[4], "target orientation")?;
323 let mut path = Self::parse_path(fields[5]).map_err(|err| {
324 format!("Invalid target path: {}", err)
325 })?;
326 if orientation == Orientation::Reverse {
327 support::reverse_path_in_place(&mut path);
328 }
329 let path = TargetPath::Path(path);
330 let path_len = Self::parse_usize(fields[6], "target path length")?;
331 // TODO: This is a hack. VG sometimes gets the target interval end wrong.
332 // If we can't parse the interval, we will later try to parse the start and infer the length from the difference string.
333 let path_interval = Self::parse_interval(fields[7], fields[8]);
334
335 // Alignment statistics.
336 let mut matches = Self::parse_usize(fields[9], "matches")?;
337 let alignment_len = Self::parse_usize(fields[10], "alignment length")?;
338 let mut edits = alignment_len.saturating_sub(matches);
339 let mapq = Self::parse_usize(fields[11], "mapping quality")?;
340 let mapq = if mapq == Self::MISSING_MAPQ { None } else { Some(mapq) }; // TODO: Too large values?
341
342 // Optional fields.
343 let mut score = None;
344 let mut base_quality = Vec::new();
345 let mut difference = Vec::new();
346 let mut pair = None;
347 let mut properly_paired = None;
348 let mut optional = Vec::new();
349 for field in fields[Self::MANDATORY_FIELDS..].iter() {
350 let parsed = TypedField::parse(field)?;
351 match parsed {
352 TypedField::Int([b'A', b'S'], value) => {
353 if score.is_some() {
354 return Err(String::from("Multiple alignment score fields"));
355 }
356 score = Some(value);
357 },
358 TypedField::String([b'b', b'q'], value) => {
359 if !base_quality.is_empty() {
360 return Err(String::from("Multiple base quality fields"));
361 }
362 base_quality = value;
363 },
364 TypedField::String([b'c', b's'], value) => {
365 if !difference.is_empty() {
366 return Err(String::from("Multiple difference fields"));
367 }
368 difference = Difference::parse_normalized(&value)?;
369 },
370 TypedField::Bool([b'p', b'd'], value) => {
371 if properly_paired.is_some() {
372 return Err(String::from("Multiple properly paired fields"));
373 }
374 properly_paired = Some(value);
375 },
376 TypedField::String([b'f', b'n'], value) => {
377 Self::parse_pair(value, true, &mut pair)?;
378 },
379 TypedField::String([b'f', b'p'], value) => {
380 Self::parse_pair(value, false, &mut pair)?;
381 },
382 _ => { optional.push(parsed); },
383 }
384 }
385 if let (Some(pair), Some(properly_paired)) = (pair.as_mut(), properly_paired) {
386 pair.is_proper = properly_paired;
387 }
388
389 // TODO: Part of the interval end hack.
390 let mut path_interval = match path_interval {
391 Ok(interval) => interval,
392 Err(_) => {
393 if !difference.is_empty() {
394 let target_start = Self::parse_usize(fields[7], "target interval start")?;
395 target_start..target_start
396 } else {
397 return Err(String::from("Target interval end cannot be parsed or inferred from the difference string"));
398 }
399 },
400 };
401
402 // If we have a difference string, recalculate the redundant numerical fields.
403 if !difference.is_empty() {
404 let (query_len, target_len, num_matches, num_edits) = Difference::stats(&difference);
405 seq_interval.end = seq_interval.start + query_len;
406 path_interval.end = path_interval.start + target_len;
407 matches = num_matches;
408 edits = num_edits;
409 }
410
411 // Now we have the final path interval. Flip its orientation if necessary.
412 if orientation == Orientation::Reverse {
413 let start = path_len.saturating_sub(path_interval.end);
414 let end = path_len.saturating_sub(path_interval.start);
415 path_interval = start..end;
416 }
417
418 Ok(Alignment {
419 name, seq_len, seq_interval,
420 path, path_len, path_interval,
421 matches, edits, mapq, score,
422 base_quality, difference, pair,
423 optional,
424 })
425 }
426
427 /// Converts the alignment to a GAF line.
428 ///
429 /// If the target path is stored as a GBWT starting position, it will be missing (`*`).
430 /// The path can be set with [`Alignment::set_target_path`] or extracted from a GBWT index with [`Alignment::extract_target_path`].
431 /// The returned line does not end with an endline character.
432 ///
433 /// A target sequence is necessary for reconstructing the difference string, if present.
434 /// It must correspond to the entire target path.
435 pub fn to_gaf(&self, target_sequence: &[u8]) -> Vec<u8> {
436 let mut result = Vec::new();
437
438 result.extend_from_slice(self.name.as_bytes());
439
440 // Coordinates in the query sequence.
441 result.push(b'\t');
442 utils::append_usize(&mut result, self.seq_len);
443 result.push(b'\t');
444 utils::append_usize(&mut result, self.seq_interval.start);
445 result.push(b'\t');
446 utils::append_usize(&mut result, self.seq_interval.end);
447
448 // Target path; always in the forward orientation.
449 result.push(b'\t');
450 result.push(b'+');
451 result.push(b'\t');
452 match &self.path {
453 TargetPath::Path(path) => {
454 if path.is_empty() {
455 result.extend_from_slice(&Self::MISSING_VALUE);
456 } else {
457 formats::append_walk(&mut result, path);
458 }
459 },
460 TargetPath::StartPosition(_) => {
461 result.extend_from_slice(&Self::MISSING_VALUE);
462 },
463 }
464
465 // Coordinates in the target path.
466 result.push(b'\t');
467 utils::append_usize(&mut result, self.path_len);
468 result.push(b'\t');
469 utils::append_usize(&mut result, self.path_interval.start);
470 result.push(b'\t');
471 utils::append_usize(&mut result, self.path_interval.end);
472
473 // Alignment statistics.
474 result.push(b'\t');
475 utils::append_usize(&mut result, self.matches);
476 result.push(b'\t');
477 utils::append_usize(&mut result, self.matches + self.edits);
478 result.push(b'\t');
479 let mapq = self.mapq.unwrap_or(Self::MISSING_MAPQ);
480 utils::append_usize(&mut result, mapq);
481
482 // Known optional fields.
483 if let Some(score) = self.score {
484 let field = TypedField::Int([b'A', b'S'], score);
485 field.append_to(&mut result, true);
486 }
487 if !self.base_quality.is_empty() {
488 TypedField::append_string(&mut result, [b'b', b'q'], &self.base_quality, true);
489 }
490 if !self.difference.is_empty() {
491 // TODO: Can we write the difference string directly?
492 let target_sequence = &target_sequence[self.path_interval.clone()];
493 let field = TypedField::String([b'c', b's'], Difference::to_bytes(&self.difference, target_sequence));
494 field.append_to(&mut result, true);
495 }
496 if let Some(pair) = &self.pair {
497 if pair.is_next {
498 TypedField::append_string(&mut result, [b'f', b'n'], pair.name.as_bytes(), true);
499 } else {
500 TypedField::append_string(&mut result, [b'f', b'p'], pair.name.as_bytes(), true);
501 }
502 let field = TypedField::Bool([b'p', b'd'], pair.is_proper);
503 field.append_to(&mut result, true);
504 }
505
506 // Other optional fields.
507 for field in self.optional.iter() {
508 field.append_to(&mut result, true);
509 }
510
511 result
512 }
513
514 /// Validates the internal consistency of the alignment.
515 ///
516 /// Returns an error message if the alignment is inconsistent.
517 pub fn validate(&self) -> Result<(), String> {
518 // Check that intervals are valid.
519 if self.seq_interval.start > self.seq_interval.end || self.seq_interval.end > self.seq_len {
520 return Err(format!("Query sequence interval {}..{} for a sequence of length {}", self.seq_interval.start, self.seq_interval.end, self.seq_len));
521 }
522 if self.path_interval.start > self.path_interval.end || self.path_interval.end > self.path_len {
523 return Err(format!("Target path interval {}..{} for a path of length {}", self.path_interval.start, self.path_interval.end, self.path_len));
524 }
525
526 // Check that the target path is consistent with the path length and interval.
527 if let Some(path) = self.target_path() {
528 // This check also guarantees that the path interval is empty if the path is empty.
529 if path.is_empty() && self.path_len > 0 {
530 return Err(String::from("Empty target path with a non-empty path length"));
531 }
532 // TODO: We could check that the path length matches the sum of node lengths if we had access to a graph here.
533 }
534
535 // Check that interval lengths and alignment statistics match the difference string, if present.
536 if !self.difference.is_empty() {
537 let (query_len, target_len, num_matches, num_edits) = Difference::stats(&self.difference);
538 if query_len != self.seq_interval.len() {
539 return Err(format!("Query interval length {} according to the difference string, but {} in the alignment", query_len, self.seq_interval.len()));
540 }
541 if target_len != self.path_interval.len() {
542 return Err(format!("Target path interval length {} according to the difference string, but {} in the alignment", target_len, self.path_interval.len()));
543 }
544 if num_matches != self.matches {
545 return Err(format!("Number of matches {} according to the difference string, but {} in the alignment", num_matches, self.matches));
546 }
547 if num_edits != self.edits {
548 return Err(format!("Number of edits {} according to the difference string, but {} in the alignment", num_edits, self.edits));
549 }
550 }
551
552 Ok(())
553 }
554}
555
556//-----------------------------------------------------------------------------
557
558/// Encoding / decoding the alignment in the format used in GAF-base.
559impl Alignment {
560 // Normalizes the interval so that `start <= end <= len` and returns it as (left flank, length, right flank).
561 fn normalize_coordinates(interval: Range<usize>, len: usize) -> (usize, usize, usize) {
562 let start = if interval.start <= interval.end { interval.start } else { interval.end };
563 let end = if interval.end <= len { interval.end } else { len };
564 let left_flank = start;
565 let length = end - start;
566 let right_flank = len - end;
567 (left_flank, length, right_flank)
568 }
569
570 // Encodes a signed integer. Small absolute values are represented as small numbers.
571 fn encode_signed(value: isize, encoder: &mut ByteCode) {
572 let value = if value < 0 { (-2 * value - 1) as usize } else { 2 * value as usize };
573 encoder.write(value);
574 }
575
576 // Encodes numerical fields as a blob.
577 //
578 // This includes the following fields:
579 //
580 // * Query sequence: `seq_len`, `seq_interval.start`, `seq_interval.end`.
581 // * Target path: `path_len`, `path_interval.start`, `path_interval.end`.
582 // * Alignment statistics: `matches`, `edits`, `mapq`, `score`.
583 //
584 // The coordinates are normalized so that `start <= end <= len`.
585 // If `known_length` is set, the length of the query sequence is assumed to be stored externally.
586 // We also assume that the decoder knows whether this is a full and/or exact alignment, or whether a difference string is present.
587 // Numbers that can be derived from these properties are not stored to save space.
588 //
589 // Mapping quality and alignment score are stored if they are present.
590 // Their presence should be stored separately.
591 fn encode_numbers_into(&self, encoder: &mut ByteCode, known_length: bool) {
592 let full_alignment = self.is_full();
593 let exact_alignment = self.is_exact();
594 let stored_difference_string = self.has_difference_string() & !exact_alignment;
595
596 // Query sequence coordinates. We write the aligned length first to simplify the logic.
597 let (left_flank, length, right_flank) = Self::normalize_coordinates(self.seq_interval.clone(), self.seq_len);
598 if !(known_length || stored_difference_string) {
599 encoder.write(length);
600 }
601 if !full_alignment {
602 encoder.write(left_flank);
603 encoder.write(right_flank);
604 }
605
606 // Target path coordinates. We again write the aligned length first.
607 // We do not store the right flank, because it can be derived from the path itself.
608 let (left_flank, length, _) = Self::normalize_coordinates(self.path_interval.clone(), self.path_len);
609 if !(exact_alignment || stored_difference_string) {
610 encoder.write(length);
611 }
612 encoder.write(left_flank);
613
614 // Alignment statistics.
615 if !(exact_alignment || stored_difference_string) {
616 encoder.write(self.matches);
617 encoder.write(self.edits);
618 }
619 if let Some(mapq) = self.mapq {
620 encoder.write(mapq);
621 }
622 if let Some(score) = self.score {
623 Self::encode_signed(score, encoder);
624 };
625 }
626
627 // Encodes the difference string into the given encoder.
628 //
629 // Also encodes empty / missing difference strings.
630 // [`Difference::End`] values are not included in the encoding, but one is always added at the end.
631 fn encode_difference_into(&self, encoder: &mut RLE) {
632 for diff in self.difference.iter() {
633 match diff {
634 Difference::Match(len) => encoder.write(Run::new(0, *len)),
635 Difference::Mismatch(base) => encoder.write(Run::new(1, utils::encode_base(*base))),
636 Difference::Insertion(seq) => {
637 let len = utils::encoded_length(seq.len());
638 encoder.write(Run::new(2, len));
639 let encoded = utils::encode_sequence(seq);
640 for byte in encoded {
641 encoder.write_byte(byte);
642 }
643 },
644 Difference::Deletion(len) => encoder.write(Run::new(3, *len)),
645 Difference::End => {},
646 }
647 }
648 encoder.write(Run::new(4, 1));
649 }
650
651 // TODO: encode optional fields
652
653 // Decodes a signed integer from the iterator.
654 fn decode_signed(iter: &mut ByteCodeIter) -> Option<isize> {
655 let value = iter.next()?;
656 if value % 2 == 0 {
657 Some((value / 2) as isize)
658 } else {
659 Some(-(value as isize + 1) / 2)
660 }
661 }
662
663 // Decodes a difference string from the iterator until an End value.
664 // Also needs the underlying slice for decoding insertions.
665 // Returns an error if the if there is no End value.
666 fn decode_difference_from(encoded: &[u8], decoder: &mut RLEIter) -> Result<Vec<Difference>, String> {
667 let mut result: Vec<Difference> = Vec::new();
668 while let Some(run) = decoder.next() {
669 match run.value {
670 0 => result.push(Difference::Match(run.len)),
671 1 => result.push(Difference::Mismatch(utils::decode_base(run.len))),
672 2 => {
673 let offset = decoder.offset();
674 for _ in 0..run.len {
675 let _ = decoder.byte().ok_or_else(|| String::from("Missing insertion base"))?;
676 }
677 let encoded = &encoded[offset..offset + run.len];
678 let seq = utils::decode_sequence(encoded);
679 result.push(Difference::Insertion(seq));
680 },
681 3 => result.push(Difference::Deletion(run.len)),
682 4 => {
683 return Ok(result);
684 },
685 _ => return Err(format!("Invalid difference string operation: {}", run.value)),
686 }
687 }
688 Err(String::from("Encoded difference string ended without an End value"))
689 }
690}
691
692//-----------------------------------------------------------------------------
693
694// TODO: Add an operation for reconstructing the query sequence.
695/// Operations on the Alignment object.
696impl Alignment {
697 /// Returns `true` if the query sequence is unaligned.
698 ///
699 /// An aligned sequence has a non-empty query interval aligned to a non-empty target interval.
700 /// NOTE: An empty sequence is by definition unaligned.
701 pub fn is_unaligned(&self) -> bool {
702 self.seq_interval.is_empty() || self.path_interval.is_empty()
703 }
704
705 /// Returns `true` if this is an alignment of the entire query sequence.
706 ///
707 /// NOTE: An empty sequence is by definition unaligned.
708 pub fn is_full(&self) -> bool {
709 self.seq_len > 0 &&
710 self.seq_interval.start == 0 &&
711 self.seq_interval.end == self.seq_len
712 }
713
714 /// Returns `true` if this is an exact alignment.
715 ///
716 /// NOTE: An empty sequence is by definition unaligned.
717 pub fn is_exact(&self) -> bool {
718 self.seq_len > 0 &&
719 !self.seq_interval.is_empty() &&
720 self.edits == 0
721 }
722
723 /// Returns `true` if there is a non-empty difference string for the alignment.
724 pub fn has_difference_string(&self) -> bool {
725 !self.difference.is_empty()
726 }
727
728 /// Returns the minimum GBWT node identifier in the target path, or [`None`] if there is no path.
729 pub fn min_handle(&self) -> Option<usize> {
730 match self.path {
731 TargetPath::Path(ref path) => path.iter().copied().min(),
732 TargetPath::StartPosition(_) => None,
733 }
734 }
735
736 /// Returns the maximum GBWT node identifier in the target path, or [`None`] if there is no path.
737 pub fn max_handle(&self) -> Option<usize> {
738 match self.path {
739 TargetPath::Path(ref path) => path.iter().copied().max(),
740 TargetPath::StartPosition(_) => None,
741 }
742 }
743
744 /// Returns `true` if the target path is stored explicitly.
745 pub fn has_target_path(&self) -> bool {
746 matches!(self.path, TargetPath::Path(_))
747 }
748
749 /// Returns `true` if the target path is stored explicitly and is non-empty.
750 pub fn has_non_empty_target_path(&self) -> bool {
751 match &self.path {
752 TargetPath::Path(path) => !path.is_empty(),
753 TargetPath::StartPosition(_) => false,
754 }
755 }
756
757 /// Returns the target path if it is stored explicitly.
758 pub fn target_path(&self) -> Option<&[usize]> {
759 match &self.path {
760 TargetPath::Path(path) => Some(path),
761 TargetPath::StartPosition(_) => None,
762 }
763 }
764
765 // TODO: Should this update an existing target path?
766 /// Sets the target path from the GBWT index if it is not already present.
767 pub fn extract_target_path(&mut self, index: &GBWT) {
768 let mut pos = match self.path {
769 TargetPath::Path(_) => return,
770 TargetPath::StartPosition(pos) => Some(pos),
771 };
772 let mut path = Vec::new();
773 while let Some(p) = pos {
774 path.push(p.node);
775 pos = index.forward(p);
776 }
777 self.path = TargetPath::Path(path);
778 }
779
780 /// Sets the given path as the target path.
781 pub fn set_target_path(&mut self, path: Vec<usize>) {
782 self.path = TargetPath::Path(path);
783 }
784
785 /// Sets the length of the target path.
786 pub fn set_target_path_len(&mut self, len: usize) {
787 self.path_len = len;
788 }
789
790 /// Returns an iterator over the alignment as a sequence of mappings.
791 ///
792 /// Returns [`None`] if a valid iterator cannot be built.
793 /// The iterator requires a difference string and an explicitly stored target path.
794 /// It may stop early if the alignment is invalid.
795 ///
796 /// The iterator needs a function that provides the sequence length for each node.
797 /// This function may be based on [`gbz::GBZ`], [`Subgraph`], or [`crate::ReadSet`].
798 pub fn iter<'a>(&'a self, sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>) -> Option<AlignmentIter<'a>> {
799 if self.difference.is_empty() && (!self.seq_interval.is_empty() || !self.path_interval.is_empty()) {
800 return None;
801 }
802 if !self.has_target_path() {
803 return None;
804 }
805 let target_path = self.target_path().unwrap();
806 if target_path.is_empty() && (self.path_interval.start != 0 || self.path_interval.end != 0) {
807 return None;
808 }
809
810 let mut iter = AlignmentIter {
811 parent: self,
812 sequence_len,
813 seq_offset: self.seq_interval.start,
814 path_offset: self.path_interval.start,
815 path_vec_offset: 0,
816 path_node_offset: self.path_interval.start,
817 diff_vec_offset: 0,
818 diff_op_offset: 0,
819 };
820 if iter.path_node_offset == 0 {
821 return Some(iter);
822 }
823
824 // In some edge cases, there may be unused nodes at the start of the target path.
825 let mut handle = *target_path.get(iter.path_vec_offset)?;
826 let mut node_len = (*iter.sequence_len)(handle)?;
827 while iter.path_node_offset >= node_len {
828 iter.path_node_offset -= node_len;
829 iter.path_vec_offset += 1;
830 handle = *target_path.get(iter.path_vec_offset)?;
831 node_len = (*iter.sequence_len)(handle)?;
832 }
833
834 Some(iter)
835 }
836
837 // Creates an empty fragment of this alignment.
838 fn empty_fragment(&self, fragment_index: usize) -> Alignment {
839 let mut aln = Alignment {
840 name: self.name.clone(),
841 seq_len: self.seq_len,
842 seq_interval: 0..0,
843 path: TargetPath::Path(Vec::new()),
844 path_len: 0,
845 path_interval: 0..0,
846 matches: self.matches,
847 edits: self.edits,
848 mapq: self.mapq,
849 score: self.score,
850 base_quality: self.base_quality.clone(),
851 difference: Vec::new(),
852 pair: self.pair.clone(),
853 optional: self.optional.clone(),
854 };
855 aln.optional.push(TypedField::Int([b'f', b'i'], fragment_index as isize));
856
857 aln
858 }
859
860 // Extends the given fragment with the next mapping of the same original alignment.
861 fn extend(&mut self, mapping: Mapping) -> Result<(), String> {
862 // Start by updating the difference string.
863 if self.seq_interval.is_empty() && self.path_interval.is_empty() {
864 // If we started with a gap, `is_unaligned()` would be true.
865 if !self.difference.is_empty() {
866 return Err(String::from("Cannot extend an unaligned fragment with a non-empty difference string"));
867 }
868 self.difference.push(mapping.edit().clone());
869 } else {
870 if self.difference.is_empty() {
871 return Err(String::from("Cannot extend an aligned fragment without a difference string"));
872 }
873 if !self.difference.last_mut().unwrap().try_merge(mapping.edit()) {
874 self.difference.push(mapping.edit().clone());
875 }
876 }
877
878 // Update the query sequence.
879 if mapping.seq_interval().end > self.seq_len {
880 return Err(String::from("Cannot extend a fragment beyond the query sequence length"));
881 }
882 if self.seq_interval.is_empty() {
883 // Either the first mapping or we only have deletions so far.
884 self.seq_interval = mapping.seq_interval().clone();
885 } else {
886 if mapping.seq_interval().start != self.seq_interval.end {
887 return Err(String::from("Cannot append a non-contiguous query interval"));
888 }
889 self.seq_interval.end = mapping.seq_interval().end;
890 }
891
892 // TODO: enum?
893 // Determine how we are extending the alignment.
894 let target_path = self.target_path().ok_or_else(||
895 String::from("Cannot extend a fragment without an explicit target path")
896 )?;
897 let last_node = target_path.last().copied();
898 let path_left = self.path_len.saturating_sub(self.path_interval.end);
899 let reverse_offset = mapping.node_len().saturating_sub(mapping.node_interval().start);
900 let first_mapping = last_node.is_none();
901 let continues_in_same_node = Some(mapping.handle()) == last_node && reverse_offset == path_left;
902 let starts_a_new_node = path_left == 0 && mapping.is_at_start();
903
904 // Update the target path.
905 if first_mapping {
906 if let TargetPath::Path(path) = &mut self.path {
907 path.push(mapping.handle());
908 } else {
909 unreachable!();
910 }
911 self.path_len += mapping.node_len();
912 self.path_interval = mapping.node_interval().clone();
913 } else if starts_a_new_node {
914 if let TargetPath::Path(path) = &mut self.path {
915 path.push(mapping.handle());
916 } else {
917 unreachable!();
918 }
919 self.path_len += mapping.node_len();
920 self.path_interval.end += mapping.target_len();
921 } else if continues_in_same_node {
922 self.path_interval.end += mapping.target_len();
923 } else {
924 return Err(String::from("Cannot append a non-contiguous target interval"));
925 }
926
927 Ok(())
928 }
929
930 /// Clips the alignment into fragments that are fully contained in the given subgraph.
931 ///
932 /// Returns an empty vector if the read is unaligned or lacks an explicit non-empty target path or a difference string.
933 /// There will be one alignment fragment for every maximal subpath in the subgraph.
934 /// The fragments have the same name, pair, optional fields, and statistics as the original alignment.
935 /// Only the aligned intervals and difference strings depend on the fragment.
936 ///
937 /// Fragments of the same alignment are identified by a fragment index stored as an optional field `fi:i`.
938 /// The indexes start from 1.
939 /// Fragment index is omitted if the entire alignment is in the subgraph.
940 ///
941 /// Clipping requires a function that returns the sequence length for the node with the given handle.
942 /// This function may be based on [`gbz::GBZ`], [`Subgraph`], or [`crate::ReadSet`].
943 ///
944 /// # Errors
945 ///
946 /// Returns an error if an [`AlignmentIter`] cannot be created.
947 /// May return an error if the alignment is invalid and the iterator returns non-consecutive mappings.
948 ///
949 /// # Examples
950 ///
951 /// ```
952 /// use gbz::{GBZ, Orientation};
953 /// use gbz::support;
954 /// use gbz_base::{Alignment, Difference, Subgraph};
955 /// use gbz_base::utils;
956 /// use simple_sds::serialize;
957 /// use std::sync::Arc;
958 ///
959 /// let gbz_filename = utils::get_test_data("micb-kir3dl1.gbz");
960 /// let graph: GBZ = serialize::load_from(&gbz_filename).unwrap();
961 ///
962 /// // Create a perfect alignment with node lengths of 68 bp, 1 bp, and 6 bp.
963 /// let mut aln = Alignment::new();
964 /// aln.seq_len = 50;
965 /// aln.seq_interval = 0..50;
966 /// let target_path = vec![
967 /// support::encode_node(13, Orientation::Forward),
968 /// support::encode_node(14, Orientation::Forward),
969 /// support::encode_node(16, Orientation::Forward),
970 /// ];
971 /// aln.set_target_path(target_path.clone());
972 /// aln.path_len = 75;
973 /// aln.path_interval = 20..70;
974 /// aln.matches = 50;
975 /// aln.difference.push(Difference::Match(50));
976 ///
977 /// // Create a subgraph without the middle node.
978 /// let mut subgraph = Subgraph::new();
979 /// let _ = subgraph.add_node_from_gbz(&graph, 13);
980 /// let _ = subgraph.add_node_from_gbz(&graph, 16);
981 ///
982 /// // Clip the alignment to the subgraph.
983 /// let sequence_len = Arc::new(|handle| {
984 /// let (node_id, _) = support::decode_node(handle);
985 /// graph.sequence_len(node_id)
986 /// });
987 /// let result = aln.clip(&subgraph, sequence_len.clone());
988 /// assert!(result.is_ok());
989 /// let clipped = result.unwrap();
990 ///
991 /// // We should have two fragments.
992 /// assert_eq!(clipped.len(), 2);
993 /// assert_eq!(clipped[0].seq_interval, 0..48);
994 /// assert_eq!(clipped[0].target_path().unwrap(), &target_path[0..1]);
995 /// assert_eq!(clipped[0].path_interval, 20..68);
996 /// assert_eq!(clipped[0].difference, vec![Difference::Match(48)]);
997 /// assert_eq!(clipped[1].seq_interval, 49..50);
998 /// assert_eq!(clipped[1].target_path().unwrap(), &target_path[2..3]);
999 /// assert_eq!(clipped[1].path_interval, 0..1);
1000 /// assert_eq!(clipped[1].difference, vec![Difference::Match(1)]);
1001 /// ```
1002 pub fn clip<'a>(&self, subgraph: &Subgraph, sequence_len: Arc<impl Fn(usize) -> Option<usize> + 'a>) -> Result<Vec<Alignment>, String> {
1003 let mut result = Vec::new();
1004 if self.is_unaligned() || !self.has_non_empty_target_path() || self.difference.is_empty() {
1005 return Ok(result);
1006 }
1007
1008 let mut aln: Option<Alignment> = None; // The alignment we are currently building.
1009 let iter = self.iter(sequence_len);
1010 if iter.is_none() {
1011 eprintln!("Warning: Cannot build an alignment iterator for {}", self.name);
1012 return Ok(result);
1013 }
1014 let iter = iter.unwrap();
1015 for mapping in iter {
1016 if !subgraph.has_handle(mapping.handle()) {
1017 if let Some(prev) = aln {
1018 result.push(prev);
1019 aln = None;
1020 }
1021 continue;
1022 }
1023
1024 if let Some(aln) = &mut aln {
1025 aln.extend(mapping)?;
1026 } else {
1027 let mut curr = self.empty_fragment(result.len() + 1);
1028 curr.extend(mapping)?;
1029 aln = Some(curr);
1030 }
1031 }
1032 if let Some(aln) = aln {
1033 result.push(aln);
1034 }
1035
1036 // Clear the fragment index if we have the entire alignment as a single fragment.
1037 if result.len() == 1 && result[0].seq_interval == self.seq_interval {
1038 result[0].optional.retain(|field| {
1039 !matches!(field, TypedField::Int([b'f', b'i'], _))
1040 });
1041 }
1042
1043 Ok(result)
1044 }
1045}
1046
1047//-----------------------------------------------------------------------------
1048
1049/// Target path in [`Alignment`].
1050///
1051/// The path is stored either explicitly as a sequence of oriented nodes or as its starting position in the GBWT index.
1052/// This corresponds to table `Nodes` in [`crate::GAFBase`].
1053#[derive(Clone, Debug, PartialEq, Eq)]
1054pub enum TargetPath {
1055 /// Path as a sequence of oriented nodes (GBWT node identifiers; GAF-base node handles).
1056 Path(Vec<usize>),
1057 /// Starting position in the GBWT.
1058 StartPosition(Pos),
1059}
1060
1061/// Information about the paired alignment.
1062#[derive(Clone, Debug, PartialEq, Eq)]
1063pub struct PairedRead {
1064 /// Name of the pair.
1065 pub name: String,
1066 /// Is the pair the next the next fragment?
1067 pub is_next: bool,
1068 /// Are the alignments properly paired?
1069 pub is_proper: bool,
1070}
1071
1072//-----------------------------------------------------------------------------
1073
1074/// An iterator over an [`Alignment`] as a sequence of [`Mapping`] objects.
1075///
1076/// This iterator assumes that the alignment is valid, and it may stop early if it is not.
1077/// It requires that the alignment stores the target path explicitly and has a difference string.
1078/// Insertions at node boundaries are assigned to the left.
1079/// If the difference string is inconsistent with the sequence / path intervals, the difference string takes precedence.
1080///
1081/// # Examples
1082///
1083/// ```
1084/// use gbz_base::{Alignment, Mapping, Difference};
1085/// use std::sync::Arc;
1086///
1087/// // Construct an alignment.
1088/// let mut aln = Alignment::new();
1089/// aln.seq_len = 15;
1090/// aln.seq_interval = 0..15;
1091/// aln.set_target_path(vec![1, 2, 3]);
1092/// aln.path_len = 15;
1093/// aln.path_interval = 0..15;
1094/// aln.matches = 14;
1095/// aln.edits = 1;
1096/// aln.difference = vec![
1097/// Difference::Match(6),
1098/// Difference::Mismatch(b'A'),
1099/// Difference::Match(8),
1100/// ];
1101///
1102/// // We use this in place of a graph for node lengths.
1103/// let node_lengths = vec![0, 4, 6, 5];
1104/// let sequence_len = Arc::new(|handle| node_lengths.get(handle).copied());
1105///
1106/// // This is what we expect from the iterator.
1107/// let truth = vec![
1108/// Mapping::new(0, 1, node_lengths[1], 0, Difference::Match(4)),
1109/// Mapping::new(4, 2, node_lengths[2], 0, Difference::Match(2)),
1110/// Mapping::new(6, 2, node_lengths[2], 2, Difference::Mismatch(b'A')),
1111/// Mapping::new(7, 2, node_lengths[2], 3, Difference::Match(3)),
1112/// Mapping::new(10, 3, node_lengths[3], 0, Difference::Match(5)),
1113/// ];
1114///
1115/// // Iterator creation may fail if the alignment is invalid.
1116/// let iter = aln.iter(sequence_len);
1117/// assert!(iter.is_some());
1118/// let iter = iter.unwrap();
1119/// assert!(iter.eq(truth.iter().cloned()));
1120/// ```
1121#[derive(Clone)]
1122pub struct AlignmentIter<'a> {
1123 parent: &'a Alignment,
1124 sequence_len: Arc<dyn Fn(usize) -> Option<usize> + 'a>,
1125 // Position in the query sequence.
1126 seq_offset: usize,
1127 // Position in the target sequence.
1128 path_offset: usize,
1129 // Position in the target path.
1130 path_vec_offset: usize,
1131 // Position within the current node in the target path.
1132 path_node_offset: usize,
1133 // Position in the difference string.
1134 diff_vec_offset: usize,
1135 // Target position in the current difference operation.
1136 diff_op_offset: usize,
1137}
1138
1139impl<'a> Iterator for AlignmentIter<'a> {
1140 type Item = Mapping;
1141
1142 fn next(&mut self) -> Option<Self::Item> {
1143 if self.diff_vec_offset >= self.parent.difference.len() {
1144 return None;
1145 }
1146
1147 // First find the edit that forms the basis of the mapping.
1148 let full_edit = self.parent.difference.get(self.diff_vec_offset)?;
1149 let edit_left = full_edit.target_len() - self.diff_op_offset;
1150
1151 // We may have unaligned insertions past the end of an empty target path.
1152 let target_path = self.parent.target_path()?;
1153 let handle = target_path.get(self.path_vec_offset);
1154 if handle.is_none() {
1155 match full_edit {
1156 Difference::Insertion(_) => {
1157 let seq_offset = self.seq_offset;
1158 self.seq_offset += full_edit.query_len();
1159 self.diff_vec_offset += 1;
1160 self.diff_op_offset = 0;
1161 return Some(Mapping::unaligned(seq_offset, full_edit.clone()));
1162 },
1163 _ => {
1164 return None;
1165 },
1166 }
1167 }
1168
1169 // Find the target node. We may need to advance to the next node we are
1170 // at the end of the node and we have an edit with non-zero target length.
1171 // We prefer mapping insertions to the end of a node rather than to the start
1172 // of the next node, as the next node may not exist.
1173 let mut handle = *handle.unwrap();
1174 let mut node_len = (*self.sequence_len)(handle)?;
1175 if self.path_node_offset >= node_len && edit_left > 0 {
1176 self.path_vec_offset += 1;
1177 self.path_node_offset = 0;
1178 handle = *target_path.get(self.path_vec_offset)?;
1179 node_len = (*self.sequence_len)(handle)?;
1180 }
1181 let node_left = node_len - self.path_node_offset;
1182
1183 // Take the offsets for the mapping before we update them.
1184 let seq_offset = self.seq_offset;
1185 let node_offset = self.path_node_offset;
1186
1187 // Now determine the actual edit.
1188 let edit_len = cmp::min(edit_left, node_left);
1189 let edit = match full_edit {
1190 Difference::Match(_) => Difference::Match(edit_len),
1191 Difference::Mismatch(base) => Difference::Mismatch(*base),
1192 Difference::Insertion(seq) => {
1193 // We can always take the full insertion within the current node.
1194 Difference::Insertion(seq.clone())
1195 },
1196 Difference::Deletion(_) => Difference::Deletion(edit_len),
1197 Difference::End => {
1198 self.diff_vec_offset = self.parent.difference.len();
1199 return None;
1200 },
1201 };
1202
1203 // And then advance the iterator.
1204 self.seq_offset += edit.query_len();
1205 self.path_offset += edit.target_len();
1206 // We do not advance the node, as the next edit may be an insertion.
1207 self.path_node_offset += edit.target_len();
1208 self.diff_op_offset += edit.target_len();
1209 if self.diff_op_offset >= full_edit.target_len() {
1210 self.diff_vec_offset += 1;
1211 self.diff_op_offset = 0;
1212 }
1213
1214 Some(Mapping::new(seq_offset, handle, node_len, node_offset, edit))
1215 }
1216}
1217
1218//-----------------------------------------------------------------------------
1219
1220/// An ad hoc bitvector for flags in [`AlignmentBlock`].
1221#[derive(Debug, Clone)]
1222pub struct Flags {
1223 bits: Vec<u8>,
1224}
1225
1226impl Flags {
1227 /// Number of flags per alignment.
1228 pub const NUM_FLAGS: usize = 6;
1229
1230 // Flags expressed as a bit offset.
1231 // TODO: As enum?
1232
1233 /// Flag: Pair is the next fragment in the read pair.
1234 pub const FLAG_PAIR_IS_NEXT: usize = 0;
1235
1236 /// Flag: The the alignment is properly paired.
1237 pub const FLAG_PAIR_IS_PROPER: usize = 1;
1238
1239 /// Flag: A mapping quality score is present.
1240 pub const FLAG_HAS_MAPQ: usize = 2;
1241
1242 /// Flag: An alignment score is present.
1243 pub const FLAG_HAS_SCORE: usize = 3;
1244
1245 /// Flag: This is an alignment of the entire query sequence.
1246 ///
1247 /// We do not encode the lengths of the unaligned flanks.
1248 pub const FLAG_FULL_ALIGNMENT: usize = 4;
1249
1250 /// Flag: This is an exact alignment.
1251 ///
1252 /// We do not encode the difference string, matches, or edits.
1253 pub const FLAG_EXACT_ALIGNMENT: usize = 5;
1254
1255 /// Creates a new flags object with the given number of alignments.
1256 /// The flags are initialized to zero (false).
1257 pub fn new(num_alignments: usize) -> Self {
1258 let bits = vec![0; (num_alignments * Self::NUM_FLAGS).div_ceil(8)];
1259 Self { bits }
1260 }
1261
1262 /// Sets the given flag.
1263 ///
1264 /// # Arguments
1265 ///
1266 /// * `index`: The index of the alignment.
1267 /// * `flag`: The flag to set.
1268 /// * `value`: The value to set the flag to.
1269 pub fn set(&mut self, index: usize, flag: usize, value: bool) {
1270 let bit = index * Self::NUM_FLAGS + flag;
1271 if value {
1272 self.bits[bit / 8] |= 1 << (bit % 8);
1273 } else {
1274 self.bits[bit / 8] &= !(1 << (bit % 8));
1275 }
1276 }
1277
1278 /// Gets the value of the given flag.
1279 ///
1280 /// # Arguments
1281 ///
1282 /// * `index`: The index of the alignment.
1283 /// * `flag`: The flag to get.
1284 pub fn get(&self, index: usize, flag: usize) -> bool {
1285 let bit = index * Self::NUM_FLAGS + flag;
1286 (self.bits[bit / 8] >> (bit % 8)) & 0x01 != 0
1287 }
1288
1289 /// Returns the number of bytes used to store the flags.
1290 pub fn bytes(&self) -> usize {
1291 self.bits.len()
1292 }
1293}
1294
1295impl From<Vec<u8>> for Flags {
1296 fn from(bits: Vec<u8>) -> Self {
1297 Self { bits }
1298 }
1299}
1300
1301impl AsRef<[u8]> for Flags {
1302 fn as_ref(&self) -> &[u8] {
1303 &self.bits
1304 }
1305}
1306
1307//-----------------------------------------------------------------------------
1308
1309// TODO: expected alignment score for perfect alignments? Or just a scoring model?
1310/// An encoded block of [`Alignment`] objects.
1311///
1312/// This is a compressed representation of multiple sequences aligned to the same graph.
1313/// With short reads, a properly chosen block size enables both column-based compression and random access to the alignments.
1314/// Long reads do not benefit from column-based compression, as the amount of metadata is insignificant.
1315/// Reasonable block sizes could be 1000 alignments for short reads and 10 for long reads.
1316///
1317/// For best results, node identifiers should approximate a topological order in the graph.
1318/// The range of node identifiers in a path is typically proportional to the length of the path.
1319/// If the alignments are sorted by (min id, max id), a block will then consist of alignments that are close to each other in the graph.
1320///
1321/// # Notes
1322///
1323/// * Target paths must be stored separately (e.g. using a GBWT-based encoding).
1324/// * A block must contain either aligned reads or unaligned reads, but not both.
1325///
1326/// # Examples
1327///
1328/// ```
1329/// use gbz_base::{Alignment, AlignmentBlock};
1330/// use gbz_base::utils::PathStartSource;
1331/// use gbz_base::{formats, utils};
1332/// use gbz::GBWT;
1333/// use gbz::support;
1334/// use simple_sds::serialize;
1335///
1336/// // We assume that the target paths are stored separately in a GBWT index.
1337/// let gbwt_filename = utils::get_test_data("micb-kir3dl1_HG003.gbwt");
1338/// let index: GBWT = serialize::load_from(&gbwt_filename).unwrap();
1339///
1340/// // We use the GBWT index for determining the starting position for each path.
1341/// // But we could also compute it on the fly with `PathStartSource::new()`.
1342/// let mut source = PathStartSource::from(&index);
1343///
1344/// // Open a GAF file and skip the header.
1345/// let gaf_filename = utils::get_test_data("micb-kir3dl1_HG003.gaf");
1346/// let mut gaf_file = utils::open_file(&gaf_filename).unwrap();
1347/// while formats::peek_gaf_header_line(&mut gaf_file).unwrap() {
1348/// let mut buf: Vec<u8> = Vec::new();
1349/// let _ = gaf_file.read_until(b'\n', &mut buf).unwrap();
1350/// }
1351///
1352/// // Read some alignments from the GAF file.
1353/// let mut alignments = Vec::new();
1354/// for _ in 0..10 {
1355/// let mut buf: Vec<u8> = Vec::new();
1356/// let _ = gaf_file.read_until(b'\n', &mut buf).unwrap();
1357/// let aln = Alignment::from_gaf(&buf).unwrap();
1358/// // A block cannot have a mix of aligned and unaligned reads.
1359/// assert!(!aln.is_unaligned());
1360/// alignments.push(aln);
1361/// }
1362///
1363/// // Compress the block.
1364/// let mut first_id = 0;
1365/// let block = AlignmentBlock::new(&alignments, &mut source, first_id).unwrap();
1366/// assert_eq!(block.len(), alignments.len());
1367/// first_id += alignments.len(); // Next block would start there.
1368///
1369/// // Decompress the block and extract the paths from the GBWT.
1370/// let mut decompressed = block.decode().unwrap();
1371/// assert_eq!(decompressed.len(), alignments.len());
1372/// for (i, aln) in decompressed.iter_mut().enumerate() {
1373/// aln.extract_target_path(&index);
1374/// // NOTE: We need the reference graph to determine the true target path length.
1375/// aln.path_len = alignments[i].path_len;
1376/// assert_eq!(*aln, alignments[i]);
1377/// }
1378/// ```
1379#[derive(Debug, Clone)]
1380pub struct AlignmentBlock {
1381 /// Minimum GBWT node identifier in the target paths, or [`None`] if this is a block of unaligned reads.
1382 pub min_handle: Option<usize>,
1383 /// Maximum GBWT node identifier in the target paths, or [`None`] if this is a block of unaligned reads.
1384 pub max_handle: Option<usize>,
1385 /// Number of alignments in the block.
1386 pub alignments: usize,
1387 /// Expected read length in the block, or [`None`] if the lengths vary.
1388 pub read_length: Option<usize>,
1389 /// GBWT starting positions for the target paths.
1390 pub gbwt_starts: Vec<u8>,
1391 /// Read and pair names.
1392 pub names: Vec<u8>,
1393 /// Quality strings.
1394 pub quality_strings: Vec<u8>,
1395 /// Difference strings.
1396 pub difference_strings: Vec<u8>,
1397 /// Binary flags for each alignment.
1398 pub flags: Flags,
1399 /// Encoded numerical information that cannot be derived from the other fields.
1400 pub numbers: Vec<u8>,
1401 /// Optional typed fields that have not been interpreted.
1402 pub optional: Vec<u8>,
1403}
1404
1405impl AlignmentBlock {
1406 // TODO: Make this a parameter?
1407 /// Compression level for Zstandard.
1408 pub const COMPRESSION_LEVEL: i32 = 7;
1409
1410 fn expected_read_length(alignments: &[Alignment]) -> Option<usize> {
1411 let mut read_length = None;
1412 for aln in alignments.iter() {
1413 if let Some(len) = read_length {
1414 if aln.seq_len != len {
1415 read_length = None;
1416 break;
1417 }
1418 } else {
1419 if aln.seq_len == 0 {
1420 break;
1421 }
1422 read_length = Some(aln.seq_len);
1423 }
1424 }
1425 read_length
1426 }
1427
1428 fn compress_gbwt_starts(
1429 alignments: &[Alignment],
1430 source: &mut PathStartSource,
1431 first_id: usize, min_handle: Option<usize>
1432 ) -> Result<Vec<u8>, String> {
1433 let base_node = min_handle.unwrap_or(0);
1434 let mut encoder = ByteCode::new();
1435 for (i, aln) in alignments.iter().enumerate() {
1436 let node_id = if let Some(path) = aln.target_path() {
1437 path.first().copied().unwrap_or(ENDMARKER)
1438 } else {
1439 return Err(format!("Line {}: Cannot compute GBWT start for an alignment without an explicit target path", first_id + i));
1440 };
1441
1442 // GBWT start as (node - min_handle, offset).
1443 if let Some(start) = source.next(node_id) {
1444 encoder.write(start.node - base_node);
1445 encoder.write(start.offset);
1446 } else if !encoder.is_empty() {
1447 return Err(format!("Line {}: Unaligned read in a block with aligned reads", first_id + i));
1448 }
1449 }
1450 Ok(Vec::from(encoder))
1451 }
1452
1453 // TODO: Somewhere else?
1454 fn zstd_compress(data: &[u8]) -> Result<Vec<u8>, String> {
1455 let mut encoder = ZstdEncoder::new(Vec::new(), Self::COMPRESSION_LEVEL).unwrap();
1456 encoder.write_all(data).map_err(|err| format!("Zstd compression error: {}", err))?;
1457 encoder.finish().map_err(|err| format!("Zstd compression error: {}", err))
1458 }
1459
1460 fn compress_names_pairs(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1461 let mut names: Vec<u8> = Vec::new();
1462 for aln in alignments.iter() {
1463 // Read name as a 0-terminated string.
1464 names.extend_from_slice(aln.name.as_bytes());
1465 names.push(0);
1466 // Pair name as a 0-terminated string, if present.
1467 if let Some(pair) = &aln.pair {
1468 names.extend_from_slice(pair.name.as_bytes());
1469 }
1470 names.push(0);
1471 }
1472 Self::zstd_compress(&names)
1473 }
1474
1475 fn compress_quality_strings(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1476 let mut quality_strings: Vec<u8> = Vec::new();
1477 for aln in alignments.iter() {
1478 // Quality strings as 0-terminated strings.
1479 if !aln.base_quality.is_empty() {
1480 quality_strings.extend_from_slice(&aln.base_quality);
1481 }
1482 quality_strings.push(0);
1483 }
1484 if quality_strings.len() <= alignments.len() {
1485 // If we have no quality strings, we can store an empty blob.
1486 return Ok(Vec::new());
1487 }
1488 Self::zstd_compress(&quality_strings)
1489 }
1490
1491 // TODO: Instead of TAG:TYPE:VALUE, we could drop the separators.
1492 fn compress_optional_fields(alignments: &[Alignment]) -> Result<Vec<u8>, String> {
1493 let mut optional: Vec<u8> = Vec::new();
1494 for aln in alignments.iter() {
1495 // Optional fields as a sequence of typed fields.
1496 for (i, field) in aln.optional.iter().enumerate() {
1497 field.append_to(&mut optional, i > 0);
1498 }
1499 optional.push(0); // Terminator for the optional fields of each alignment.
1500 }
1501 if optional.len() <= alignments.len() {
1502 // If we have no optional fields, we can store an empty blob.
1503 return Ok(Vec::new());
1504 }
1505 Self::zstd_compress(&optional)
1506 }
1507
1508 /// Creates a new alignment block from the given read alignments and GBWT index.
1509 ///
1510 /// If the reads are aligned, they correspond to paths `first_id` to `first_id + alignments.len() - 1` in the GBWT index.
1511 /// The GBWT index may be bidirectional or unidirectional.
1512 ///
1513 /// # Arguments
1514 ///
1515 /// * `alignments`: The alignments to include in the block.
1516 /// * `source`: A source for GBWT starting positions of the target paths.
1517 /// * `first_id`: Path identifier of the first alignment in the block.
1518 ///
1519 /// # Errors
1520 ///
1521 /// Returns an error, if:
1522 ///
1523 /// * The block contains a mix of aligned and unaligned reads.
1524 /// * The source computes GBWT starts on the fly, but an alignment does not store the target path explicitly.
1525 /// * Compression fails.
1526 pub fn new(alignments: &[Alignment], source: &mut PathStartSource, first_id: usize) -> Result<Self, String> {
1527 let min_handle = alignments.iter().map(|aln| aln.min_handle()).min().flatten();
1528 let max_handle = alignments.iter().map(|aln| aln.max_handle()).max().flatten();
1529 let read_length = Self::expected_read_length(alignments);
1530 let gbwt_starts = Self::compress_gbwt_starts(alignments, source, first_id, min_handle)?;
1531 let names = Self::compress_names_pairs(alignments)?;
1532 let quality_strings = Self::compress_quality_strings(alignments)?;
1533 let optional = Self::compress_optional_fields(alignments)?;
1534
1535 // We encode the remaining fields together, as they depend on each other.
1536 let mut difference_strings = RLE::with_sigma(Difference::NUM_TYPES);
1537 let mut flags = Flags::new(alignments.len());
1538 let mut numbers = ByteCode::new();
1539
1540 for (i, aln) in alignments.iter().enumerate() {
1541 let full_alignment = aln.is_full();
1542 let exact_alignment = aln.is_exact();
1543 if !exact_alignment {
1544 // If the alignment is exact, we can reconstruct the difference string from
1545 // the length of the aligned interval.
1546 aln.encode_difference_into(&mut difference_strings);
1547 }
1548
1549 if let Some(pair) = &aln.pair {
1550 if pair.is_next {
1551 flags.set(i, Flags::FLAG_PAIR_IS_NEXT, true);
1552 }
1553 if pair.is_proper {
1554 flags.set(i, Flags::FLAG_PAIR_IS_PROPER, true);
1555 }
1556 }
1557 if aln.mapq.is_some() {
1558 flags.set(i, Flags::FLAG_HAS_MAPQ, true);
1559 }
1560 if aln.score.is_some() {
1561 flags.set(i, Flags::FLAG_HAS_SCORE, true);
1562 }
1563 flags.set(i, Flags::FLAG_FULL_ALIGNMENT, full_alignment);
1564 flags.set(i, Flags::FLAG_EXACT_ALIGNMENT, exact_alignment);
1565
1566 aln.encode_numbers_into(&mut numbers, read_length.is_some());
1567 }
1568
1569 // TODO: Also consider using an empty blob for no difference strings, as we do for quality strings.
1570
1571 Ok(Self {
1572 min_handle,
1573 max_handle,
1574 alignments: alignments.len(),
1575 read_length,
1576 gbwt_starts,
1577 names,
1578 quality_strings,
1579 difference_strings: Vec::from(difference_strings),
1580 flags,
1581 numbers: Vec::from(numbers),
1582 optional,
1583 })
1584 }
1585
1586 /// Returns the number of alignments in the block.
1587 pub fn len(&self) -> usize {
1588 self.alignments
1589 }
1590
1591 /// Returns `true` if the block contains no alignments.
1592 pub fn is_empty(&self) -> bool {
1593 self.alignments == 0
1594 }
1595
1596 fn decompress_gbwt_starts(&self, result: &mut [Alignment]) -> Result<(), String> {
1597 if self.min_handle.is_none() {
1598 return Ok(());
1599 }
1600 let base_node = self.min_handle.unwrap();
1601 let mut decoder: ByteCodeIter<'_> = ByteCodeIter::new(&self.gbwt_starts[..]);
1602 for (i, aln) in result.iter_mut().enumerate() {
1603 let start = decoder.next().ok_or_else(||
1604 format!("Missing GBWT start for alignment {}", i)
1605 )?;
1606 let offset = decoder.next().ok_or_else(||
1607 format!("Missing GBWT offset for alignment {}", i)
1608 )?;
1609 aln.path = TargetPath::StartPosition(Pos::new(start + base_node, offset));
1610 }
1611 Ok(())
1612 }
1613
1614 // TODO: Somewhere else?
1615 fn zstd_decompress(data: &[u8]) -> Result<Vec<u8>, String> {
1616 let mut decoder = ZstdDecoder::new(data).map_err(|err| format!("Zstd decompression error: {}", err))?;
1617 let mut buffer = Vec::new();
1618 decoder.read_to_end(&mut buffer).map_err(|err| format!("Zstd decompression error: {}", err))?;
1619 Ok(buffer)
1620 }
1621
1622 fn decompress_names_pairs(&self, result: &mut [Alignment]) -> Result<(), String> {
1623 let buffer = Self::zstd_decompress(&self.names[..])?;
1624 let mut iter = buffer.split(|&c| c == 0);
1625 for (i, aln) in result.iter_mut().enumerate() {
1626 let name = iter.next().ok_or_else(|| format!("Missing name for alignment {}", i))?;
1627 aln.name = String::from_utf8_lossy(name).to_string();
1628 let pair_name = iter.next().ok_or_else(|| format!("Missing pair name for alignment {}", i))?;
1629 if !pair_name.is_empty() {
1630 aln.pair = Some(PairedRead {
1631 name: String::from_utf8_lossy(pair_name).to_string(),
1632 is_next: self.flags.get(i, Flags::FLAG_PAIR_IS_NEXT),
1633 is_proper: self.flags.get(i, Flags::FLAG_PAIR_IS_PROPER),
1634 });
1635 }
1636 }
1637 // There should be an empty slice at the end, as the buffer is 0-terminated.
1638 // TODO: Should we check this?
1639 Ok(())
1640 }
1641
1642 fn decompress_quality_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
1643 if self.quality_strings.is_empty() {
1644 // An empty blob indicates no quality strings.
1645 return Ok(());
1646 }
1647
1648 let buffer = Self::zstd_decompress(&self.quality_strings[..])?;
1649 let mut iter = buffer.split(|&c| c == 0);
1650 for (i, aln) in result.iter_mut().enumerate() {
1651 let quality = iter.next().ok_or_else(|| format!("Missing quality string for alignment {}", i))?;
1652 aln.base_quality = quality.to_vec();
1653 }
1654 // There should be an empty slice at the end, as the buffer is 0-terminated.
1655 // TODO: Should we check this?
1656 Ok(())
1657 }
1658
1659 fn decompress_difference_strings(&self, result: &mut [Alignment]) -> Result<(), String> {
1660 // TODO: Also consider using an empty blob for no difference strings, as we do for quality strings.
1661
1662 let mut decoder = RLEIter::with_sigma(&self.difference_strings[..], Difference::NUM_TYPES);
1663 for (i, aln) in result.iter_mut().enumerate() {
1664 if !self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
1665 let res = Alignment::decode_difference_from(&self.difference_strings, &mut decoder);
1666 aln.difference = res.map_err(|err| format!("Missing difference string for alignment {}: {}", i, err))?;
1667 }
1668 // If the alignment is exact, we will reconstruct the difference string once we know
1669 // the length of the aligned interval.
1670 }
1671 Ok(())
1672 }
1673
1674 fn decompress_numbers(&self, result: &mut [Alignment]) -> Result<(), String> {
1675 let mut decoder = ByteCodeIter::new(&self.numbers[..]);
1676 for (i, aln) in result.iter_mut().enumerate() {
1677 let full_alignment = self.flags.get(i, Flags::FLAG_FULL_ALIGNMENT);
1678 let exact_alignment = self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT);
1679 // We did not store the difference string for an exact alignment.
1680 let stored_difference_string = aln.has_difference_string();
1681
1682 // Derive the numbers from the difference string first.
1683 // If the difference string is empty, we get zeros and update them later.
1684 let (mut query_len, mut target_len, num_matches, num_edits) = Difference::stats(&aln.difference);
1685 aln.matches = num_matches;
1686 aln.edits = num_edits;
1687
1688 // Query sequence coordinates.
1689 if let Some(len) = self.read_length {
1690 query_len = len;
1691 } else if !stored_difference_string {
1692 query_len = decoder.next().ok_or_else(|| format!("Missing query sequence aligned length for alignment {}", i))?;
1693 }
1694 let (query_left, query_right) = if full_alignment {
1695 (0, 0)
1696 } else {
1697 let query_left = decoder.next().ok_or_else(|| format!("Missing query sequence left flank for alignment {}", i))?;
1698 let query_right = decoder.next().ok_or_else(|| format!("Missing query sequence right flank for alignment {}", i))?;
1699 (query_left, query_right)
1700 };
1701 if self.read_length.is_some() {
1702 // The known length was for the entire query sequence, including the flanks.
1703 query_len -= query_left + query_right;
1704 }
1705 aln.seq_interval = query_left..(query_left + query_len);
1706 aln.seq_len = query_len + query_left + query_right;
1707
1708 // Target path coordinates.
1709 if exact_alignment {
1710 target_len = query_len;
1711 } else if !stored_difference_string {
1712 target_len = decoder.next().ok_or_else(|| format!("Missing target path aligned length for alignment {}", i))?;
1713 }
1714 let target_left = decoder.next().ok_or_else(|| format!("Missing target path left flank for alignment {}", i))?;
1715 let target_right = 0; // We can determine the length of the right flank later.
1716 aln.path_interval = target_left..(target_left + target_len);
1717 aln.path_len = target_len + target_left + target_right;
1718
1719 // Alignment statistics.
1720 if exact_alignment {
1721 aln.matches = query_len;
1722 aln.edits = 0;
1723 } else if !stored_difference_string {
1724 aln.matches = decoder.next().ok_or_else(|| format!("Missing number of matches for alignment {}", i))?;
1725 aln.edits = decoder.next().ok_or_else(|| format!("Missing number of edits for alignment {}", i))?;
1726 }
1727 if self.flags.get(i, Flags::FLAG_HAS_MAPQ) {
1728 aln.mapq = Some(decoder.next().ok_or_else(|| format!("Missing mapping quality for alignment {}", i))?);
1729 }
1730 if self.flags.get(i, Flags::FLAG_HAS_SCORE) {
1731 aln.score = Some(Alignment::decode_signed(&mut decoder).ok_or_else(||
1732 format!("Missing alignment score for alignment {}", i)
1733 )?);
1734 }
1735 }
1736
1737 Ok(())
1738 }
1739
1740 fn decompress_optional_fields(&self, result: &mut [Alignment]) -> Result<(), String> {
1741 if self.optional.is_empty() {
1742 // An empty blob indicates no optional fields.
1743 return Ok(());
1744 }
1745
1746 let buffer = Self::zstd_decompress(&self.optional[..])?;
1747 let mut iter = buffer.split(|&c| c == 0);
1748 for (i, aln) in result.iter_mut().enumerate() {
1749 let optional = iter.next().ok_or_else(|| format!("Missing optional fields for alignment {}", i))?;
1750 let fields = optional.split(|&c| c == b'\t');
1751 for field in fields {
1752 let parsed = TypedField::parse(field)?;
1753 aln.optional.push(parsed);
1754 }
1755 }
1756 // There should be an empty slice at the end, as the buffer is 0-terminated.
1757 // TODO: Should we check this?
1758 Ok(())
1759 }
1760
1761 /// Decompresses the block into a vector of alignments.
1762 ///
1763 /// Aligned query sequences have target paths represented as GBWT starting positions.
1764 /// The path can be set with [`Alignment::set_target_path`] or extracted from a GBWT index with [`Alignment::extract_target_path`].
1765 /// Unaligned query sequences have empty target paths.
1766 ///
1767 /// The true length of the target path cannot be determined from the alignment block alone.
1768 /// It can be set later using [`Alignment::set_target_path_len`].
1769 ///
1770 /// # Errors
1771 ///
1772 /// Returns an error if decompression fails or if data required for decoding the block is missing.
1773 pub fn decode(&self) -> Result<Vec<Alignment>, String> {
1774 let mut result = vec![Alignment::default(); self.len()];
1775
1776 // In a block of aligned reads, we have a GBWT start for every alignment.
1777 // In a block of unaligned reads, we have no GBWT starts.
1778 self.decompress_gbwt_starts(&mut result)?;
1779
1780 // Missing query / pair names are encoded as empty strings.
1781 self.decompress_names_pairs(&mut result)?;
1782
1783 // This could be an empty blob if we have no quality strings.
1784 self.decompress_quality_strings(&mut result)?;
1785
1786 // The encoding includes empty / missing difference strings but no difference strings
1787 // for exact alignments.
1788 self.decompress_difference_strings(&mut result)?;
1789
1790 // We need the flags and the difference strings (for non-exact alignments) to decode the numbers.
1791 self.decompress_numbers(&mut result)?;
1792
1793 // This could be an empty blob if we have no optional fields.
1794 self.decompress_optional_fields(&mut result)?;
1795
1796 // Reconstruct the difference strings for exact alignments as the last step.
1797 // We may not know the aligned interval length until we have decompressed the numbers.
1798 for (i, aln) in result.iter_mut().enumerate() {
1799 if self.flags.get(i, Flags::FLAG_EXACT_ALIGNMENT) {
1800 aln.difference = vec![Difference::Match(aln.seq_interval.len())];
1801 }
1802 }
1803
1804 Ok(result)
1805 }
1806}
1807
1808//-----------------------------------------------------------------------------