cyanea_seq/bwt.rs
1//! Burrows-Wheeler Transform (BWT) for arbitrary byte sequences.
2//!
3//! Provides forward BWT construction via suffix array and inverse BWT
4//! via LF-mapping reconstruction.
5
6/// Burrows-Wheeler Transform of a byte string.
7///
8/// Constructed by appending a sentinel `$` to the input, building the suffix
9/// array, and reading off the last column of the sorted rotation matrix.
10#[derive(Debug, Clone)]
11pub struct Bwt {
12 /// The BWT string (same length as text + sentinel).
13 bwt: Vec<u8>,
14 /// Position of the sentinel character in the BWT.
15 primary_index: usize,
16}
17
18impl Bwt {
19 /// Build the BWT of `text`.
20 ///
21 /// A sentinel `$` (0x00) is appended internally and is lexicographically
22 /// smaller than every input byte.
23 ///
24 /// # Example
25 ///
26 /// ```
27 /// use cyanea_seq::bwt::Bwt;
28 ///
29 /// let bwt = Bwt::build(b"banana");
30 /// assert_eq!(bwt.len(), 7); // 6 + sentinel
31 /// ```
32 pub fn build(text: &[u8]) -> Self {
33 if text.is_empty() {
34 return Self {
35 bwt: vec![0u8], // just the sentinel
36 primary_index: 0,
37 };
38 }
39
40 // Append sentinel (0x00, sorts before everything)
41 let mut augmented = Vec::with_capacity(text.len() + 1);
42 augmented.extend_from_slice(text);
43 augmented.push(0u8);
44
45 let n = augmented.len();
46
47 // Build suffix array via sort
48 let mut sa: Vec<usize> = (0..n).collect();
49 sa.sort_by(|&a, &b| augmented[a..].cmp(&augmented[b..]));
50
51 // BWT: last column of sorted rotations = text[(sa[i] - 1) % n]
52 let mut bwt = Vec::with_capacity(n);
53 let mut primary_index = 0;
54 for (i, &pos) in sa.iter().enumerate() {
55 if pos == 0 {
56 bwt.push(augmented[n - 1]);
57 primary_index = i;
58 } else {
59 bwt.push(augmented[pos - 1]);
60 }
61 }
62
63 Self { bwt, primary_index }
64 }
65
66 /// The BWT string as a byte slice.
67 pub fn as_bytes(&self) -> &[u8] {
68 &self.bwt
69 }
70
71 /// Position of the sentinel character (`$`) in the BWT.
72 pub fn primary_index(&self) -> usize {
73 self.primary_index
74 }
75
76 /// Length of the BWT (including sentinel).
77 pub fn len(&self) -> usize {
78 self.bwt.len()
79 }
80
81 /// Whether the BWT is empty (only possible for empty input + sentinel).
82 pub fn is_empty(&self) -> bool {
83 self.bwt.len() <= 1
84 }
85
86 /// Reconstruct the original text (without sentinel) from the BWT.
87 ///
88 /// Uses the LF-mapping: build C table (cumulative counts of characters
89 /// smaller than c) and occurrence counts, then walk from the row
90 /// containing the sentinel to reconstruct the text in reverse.
91 ///
92 /// # Example
93 ///
94 /// ```
95 /// use cyanea_seq::bwt::Bwt;
96 ///
97 /// let text = b"banana";
98 /// let bwt = Bwt::build(text);
99 /// let recovered = bwt.invert();
100 /// assert_eq!(recovered, text);
101 /// ```
102 pub fn invert(&self) -> Vec<u8> {
103 let n = self.bwt.len();
104 if n <= 1 {
105 return vec![];
106 }
107
108 // Count occurrences of each byte value in the BWT
109 let mut counts = [0usize; 256];
110 for &b in &self.bwt {
111 counts[b as usize] += 1;
112 }
113
114 // Build C table: c_table[c] = number of characters in BWT that are < c
115 let mut c_table = [0usize; 256];
116 let mut cumulative = 0;
117 for i in 0..256 {
118 c_table[i] = cumulative;
119 cumulative += counts[i];
120 }
121
122 // Build occurrence array: occ[i] = number of occurrences of bwt[i]
123 // in bwt[0..i] (exclusive of i, so occ[i] is the rank of bwt[i] among
124 // its equal characters up to position i)
125 let mut running = [0usize; 256];
126 let mut occ = Vec::with_capacity(n);
127 for &b in &self.bwt {
128 occ.push(running[b as usize]);
129 running[b as usize] += 1;
130 }
131
132 // LF-mapping: for row i, lf(i) = c_table[bwt[i]] + occ[i]
133 // Walk from the sentinel row (primary_index) to reconstruct text
134 let text_len = n - 1; // exclude sentinel
135 let mut result = vec![0u8; text_len];
136 let mut idx = self.primary_index;
137 for i in (0..text_len).rev() {
138 idx = c_table[self.bwt[idx] as usize] + occ[idx];
139 result[i] = self.bwt[idx];
140 }
141
142 result
143 }
144}
145
146#[cfg(test)]
147mod tests {
148 use super::*;
149
150 #[test]
151 fn roundtrip_banana() {
152 let text = b"banana";
153 let bwt = Bwt::build(text);
154 assert_eq!(bwt.invert(), text);
155 }
156
157 #[test]
158 fn roundtrip_empty() {
159 let bwt = Bwt::build(b"");
160 assert!(bwt.is_empty());
161 assert_eq!(bwt.len(), 1); // just sentinel
162 assert_eq!(bwt.invert(), b"");
163 }
164
165 #[test]
166 fn roundtrip_single_char() {
167 let text = b"A";
168 let bwt = Bwt::build(text);
169 assert_eq!(bwt.len(), 2);
170 assert_eq!(bwt.invert(), text);
171 }
172
173 #[test]
174 fn roundtrip_repeated_chars() {
175 let text = b"aaaa";
176 let bwt = Bwt::build(text);
177 assert_eq!(bwt.invert(), text);
178 }
179
180 #[test]
181 fn roundtrip_dna() {
182 let text = b"ACGTACGT";
183 let bwt = Bwt::build(text);
184 assert_eq!(bwt.invert(), text);
185 }
186
187 #[test]
188 fn roundtrip_mississippi() {
189 let text = b"mississippi";
190 let bwt = Bwt::build(text);
191 assert_eq!(bwt.invert(), text);
192 }
193
194 #[test]
195 fn roundtrip_all_different() {
196 let text = b"abcdefgh";
197 let bwt = Bwt::build(text);
198 assert_eq!(bwt.invert(), text);
199 }
200
201 #[test]
202 fn primary_index_is_valid() {
203 let text = b"banana";
204 let bwt = Bwt::build(text);
205 // The sentinel position should be within bounds
206 assert!(bwt.primary_index() < bwt.len());
207 // The character at primary_index in the BWT is the last char of the original text
208 // (wrapping: the char before the sentinel in the rotation)
209 }
210
211 #[test]
212 fn bwt_length_matches() {
213 let text = b"ACGTACGT";
214 let bwt = Bwt::build(text);
215 assert_eq!(bwt.len(), text.len() + 1);
216 assert_eq!(bwt.as_bytes().len(), text.len() + 1);
217 }
218
219 #[test]
220 fn roundtrip_long_sequence() {
221 let text = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
222 let bwt = Bwt::build(text);
223 assert_eq!(bwt.invert(), text);
224 }
225}