Skip to main content

rosalind/
lib.rs

1//! # Rosalind — a deterministic, low-memory genomics engine
2//!
3//! Call variants across a whole genome on a laptop, with memory you can **predict
4//! and verify**, and results that are **byte-for-byte reproducible**. Rosalind
5//! treats memory as a *contract*: you declare a RAM budget, `rosalind plan` tells
6//! you up front whether the job fits, the run honors it (fits-or-refuses cleanly —
7//! never a silent OOM-kill), and `rosalind verify` re-checks a BLAKE3 receipt
8//! proving the realized peak landed inside your budget.
9//!
10//! The kernel is a streaming, CIGAR-aware **pileup column stream** bounded by local
11//! coverage, not input size — a substrate you can compute arbitrary per-locus
12//! analytics on. Variant calling is the first consumer, not the whole product.
13//!
14//! ```
15//! use std::sync::Arc;
16//! use rosalind::{PileupEngine, PileupParams, SliceSource};
17//! use rosalind::core::{AlignedRead, CigarOp, CigarOpKind, Position, SamFlags};
18//!
19//! // One 4bp read "ACGT" aligned at chr0:0 over the reference "ACGT".
20//! let read = AlignedRead {
21//!     contig: 0,
22//!     pos: Position(0),
23//!     mapq: 60,
24//!     flags: SamFlags(0),
25//!     cigar: vec![CigarOp::new(CigarOpKind::Match, 4)],
26//!     seq: Arc::from(b"ACGT".to_vec().into_boxed_slice()),
27//!     qual: Arc::from(vec![40u8; 4].into_boxed_slice()),
28//! };
29//! let reference: Arc<[u8]> = Arc::from(b"ACGT".to_vec().into_boxed_slice());
30//!
31//! // The bounded pileup substrate: one PileupColumn per covered position.
32//! let mut engine =
33//!     PileupEngine::new(SliceSource::new(vec![read]), reference, 0, 0..4, PileupParams::default());
34//! let first = engine.next().unwrap().unwrap();
35//! assert_eq!(first.depth(), 1);
36//! ```
37//!
38//! ## Research direction (Phase D)
39//!
40//! Rosalind is also a research vehicle for **space-bounded genomics** — sublinear-space
41//! index *construction* along a `~√t` space/time curve, extending the memory contract to the
42//! index build itself (today's build is `O(reference)`). That is a direction, not yet shipped;
43//! it is tracked in `docs/OPEN_PROBLEMS.md`.
44
45#![warn(missing_docs, missing_debug_implementations)] // warn when public items lack docs or public types lack `Debug`
46#![allow(clippy::new_without_default)] // NOTE(review): crate-wide allow with no stated rationale — confirm it is still wanted
47
48// Each module is a layer of the genomics engine.
49/// The calling layer: probabilistically grounded, abstention-aware variant calls from pileup columns.
50pub mod call;
51/// Core types: the lingua franca shared by every layer (io, index, align, pileup, call).
52pub mod core;
53/// Genomics primitives: the FM-index, persisted memory-mapped index, alignment, sort, eval.
54pub mod genomics;
55/// I/O layer: spec-valid VCF writer and streaming FASTA/FASTQ/BAM readers.
56pub mod io;
57/// The streaming pileup kernel: one CIGAR-aware, filtered, bounded-memory engine.
58pub mod pileup;
59/// Reproducibility receipts: canonical-JSON BLAKE3 manifests for every run.
60/// Extracted to the `rosalind-receipt` leaf crate (no htslib — wasm-friendly) and
61/// re-exported here, so `rosalind::provenance::*` is unchanged.
62pub use rosalind_receipt as provenance;
63/// Third-party byte re-derivation from a receipt (the `reproduce` verb).
64pub mod reproduce;
65/// Helper utilities: read-only mmap and peak-RSS measurement.
66pub mod util;
67
68// ── Genomics product surface — what builders compose on ───────────────────────
69// The bounded streaming substrate:
70pub use io::bam::StreamingBamSource;
71pub use pileup::{Obs, PileupColumn, PileupEngine, PileupParams, ReadSource, SliceSource};
72// The bounded whole-genome germline drive + calls:
73pub use call::{
74    call_germline_region_streaming, call_germline_whole_genome, GermlineCall, GermlineParams,
75};
76// ColumnKit: implement one trait, inherit the bounded contract (SDK front door).
77pub use call::{run_bounded_whole_genome, ColumnAnalyzer, FeatureAnalyzer};
78// The memory contract (declare → plan → honor → verify), incl. fleet packing:
79pub use call::{
80    estimate_variants_working_set, first_fit_decreasing, predicted_peak_rss_bytes, PackJob,
81    PackOutcome,
82};
83pub use core::{MemoryBudget, WorkingSet};
84// Build-once → mmap index + the reproducibility receipt:
85pub use genomics::{GenomeIndex, IndexReader, ReferenceView};
86pub use provenance::{verify_receipt, CommandCapture, RunManifest, VerifyOpts, VerifyReport};