Skip to main content

cyanea_seq/
bwt.rs

1//! Burrows-Wheeler Transform (BWT) for arbitrary byte sequences.
2//!
3//! Provides forward BWT construction via suffix array and inverse BWT
4//! via LF-mapping reconstruction.
5
6/// Burrows-Wheeler Transform of a byte string.
7///
8/// Constructed by appending a sentinel `$` to the input, building the suffix
9/// array, and reading off the last column of the sorted rotation matrix.
10#[derive(Debug, Clone)]
11pub struct Bwt {
12    /// The BWT string (same length as text + sentinel).
13    bwt: Vec<u8>,
14    /// Position of the sentinel character in the BWT.
15    primary_index: usize,
16}
17
18impl Bwt {
19    /// Build the BWT of `text`.
20    ///
21    /// A sentinel `$` (0x00) is appended internally and is lexicographically
22    /// smaller than every input byte.
23    ///
24    /// # Example
25    ///
26    /// ```
27    /// use cyanea_seq::bwt::Bwt;
28    ///
29    /// let bwt = Bwt::build(b"banana");
30    /// assert_eq!(bwt.len(), 7); // 6 + sentinel
31    /// ```
32    pub fn build(text: &[u8]) -> Self {
33        if text.is_empty() {
34            return Self {
35                bwt: vec![0u8], // just the sentinel
36                primary_index: 0,
37            };
38        }
39
40        // Append sentinel (0x00, sorts before everything)
41        let mut augmented = Vec::with_capacity(text.len() + 1);
42        augmented.extend_from_slice(text);
43        augmented.push(0u8);
44
45        let n = augmented.len();
46
47        // Build suffix array via sort
48        let mut sa: Vec<usize> = (0..n).collect();
49        sa.sort_by(|&a, &b| augmented[a..].cmp(&augmented[b..]));
50
51        // BWT: last column of sorted rotations = text[(sa[i] - 1) % n]
52        let mut bwt = Vec::with_capacity(n);
53        let mut primary_index = 0;
54        for (i, &pos) in sa.iter().enumerate() {
55            if pos == 0 {
56                bwt.push(augmented[n - 1]);
57                primary_index = i;
58            } else {
59                bwt.push(augmented[pos - 1]);
60            }
61        }
62
63        Self { bwt, primary_index }
64    }
65
66    /// The BWT string as a byte slice.
67    pub fn as_bytes(&self) -> &[u8] {
68        &self.bwt
69    }
70
71    /// Position of the sentinel character (`$`) in the BWT.
72    pub fn primary_index(&self) -> usize {
73        self.primary_index
74    }
75
76    /// Length of the BWT (including sentinel).
77    pub fn len(&self) -> usize {
78        self.bwt.len()
79    }
80
81    /// Whether the BWT is empty (only possible for empty input + sentinel).
82    pub fn is_empty(&self) -> bool {
83        self.bwt.len() <= 1
84    }
85
86    /// Reconstruct the original text (without sentinel) from the BWT.
87    ///
88    /// Uses the LF-mapping: build C table (cumulative counts of characters
89    /// smaller than c) and occurrence counts, then walk from the row
90    /// containing the sentinel to reconstruct the text in reverse.
91    ///
92    /// # Example
93    ///
94    /// ```
95    /// use cyanea_seq::bwt::Bwt;
96    ///
97    /// let text = b"banana";
98    /// let bwt = Bwt::build(text);
99    /// let recovered = bwt.invert();
100    /// assert_eq!(recovered, text);
101    /// ```
102    pub fn invert(&self) -> Vec<u8> {
103        let n = self.bwt.len();
104        if n <= 1 {
105            return vec![];
106        }
107
108        // Count occurrences of each byte value in the BWT
109        let mut counts = [0usize; 256];
110        for &b in &self.bwt {
111            counts[b as usize] += 1;
112        }
113
114        // Build C table: c_table[c] = number of characters in BWT that are < c
115        let mut c_table = [0usize; 256];
116        let mut cumulative = 0;
117        for i in 0..256 {
118            c_table[i] = cumulative;
119            cumulative += counts[i];
120        }
121
122        // Build occurrence array: occ[i] = number of occurrences of bwt[i]
123        // in bwt[0..i] (exclusive of i, so occ[i] is the rank of bwt[i] among
124        // its equal characters up to position i)
125        let mut running = [0usize; 256];
126        let mut occ = Vec::with_capacity(n);
127        for &b in &self.bwt {
128            occ.push(running[b as usize]);
129            running[b as usize] += 1;
130        }
131
132        // LF-mapping: for row i, lf(i) = c_table[bwt[i]] + occ[i]
133        // Walk from the sentinel row (primary_index) to reconstruct text
134        let text_len = n - 1; // exclude sentinel
135        let mut result = vec![0u8; text_len];
136        let mut idx = self.primary_index;
137        for i in (0..text_len).rev() {
138            idx = c_table[self.bwt[idx] as usize] + occ[idx];
139            result[i] = self.bwt[idx];
140        }
141
142        result
143    }
144}
145
146#[cfg(test)]
147mod tests {
148    use super::*;
149
150    #[test]
151    fn roundtrip_banana() {
152        let text = b"banana";
153        let bwt = Bwt::build(text);
154        assert_eq!(bwt.invert(), text);
155    }
156
157    #[test]
158    fn roundtrip_empty() {
159        let bwt = Bwt::build(b"");
160        assert!(bwt.is_empty());
161        assert_eq!(bwt.len(), 1); // just sentinel
162        assert_eq!(bwt.invert(), b"");
163    }
164
165    #[test]
166    fn roundtrip_single_char() {
167        let text = b"A";
168        let bwt = Bwt::build(text);
169        assert_eq!(bwt.len(), 2);
170        assert_eq!(bwt.invert(), text);
171    }
172
173    #[test]
174    fn roundtrip_repeated_chars() {
175        let text = b"aaaa";
176        let bwt = Bwt::build(text);
177        assert_eq!(bwt.invert(), text);
178    }
179
180    #[test]
181    fn roundtrip_dna() {
182        let text = b"ACGTACGT";
183        let bwt = Bwt::build(text);
184        assert_eq!(bwt.invert(), text);
185    }
186
187    #[test]
188    fn roundtrip_mississippi() {
189        let text = b"mississippi";
190        let bwt = Bwt::build(text);
191        assert_eq!(bwt.invert(), text);
192    }
193
194    #[test]
195    fn roundtrip_all_different() {
196        let text = b"abcdefgh";
197        let bwt = Bwt::build(text);
198        assert_eq!(bwt.invert(), text);
199    }
200
201    #[test]
202    fn primary_index_is_valid() {
203        let text = b"banana";
204        let bwt = Bwt::build(text);
205        // The sentinel position should be within bounds
206        assert!(bwt.primary_index() < bwt.len());
207        // The character at primary_index in the BWT is the last char of the original text
208        // (wrapping: the char before the sentinel in the rotation)
209    }
210
211    #[test]
212    fn bwt_length_matches() {
213        let text = b"ACGTACGT";
214        let bwt = Bwt::build(text);
215        assert_eq!(bwt.len(), text.len() + 1);
216        assert_eq!(bwt.as_bytes().len(), text.len() + 1);
217    }
218
219    #[test]
220    fn roundtrip_long_sequence() {
221        let text = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
222        let bwt = Bwt::build(text);
223        assert_eq!(bwt.invert(), text);
224    }
225}