bzip2_os/bwt_algorithms/sais_fallback.rs
1//! The fallback bwt_sort algorithm for the Rust version of the standard BZIP2 library.
2//!
3//! This is a simple SA-IS implemenation of a sorting algorithm, using sentinels. SA-IS is particularly suited
4//! to repetative data such as is found in genetic sequencing.
5//!
6//! SA-IS does not lend itself to multi-threading when the data sets are smaller (except for frequency counting). The BZIP2 algorithm has a maximum
7//! block size of 900kb, consequently no attempt at multithreading within the sorting portion of the algorithm is made. (Multiple blocks
8//! will be processed in parallel, though.)
9//!
10//! In order to determine whether the data is more likely to be better sorted using SA-IS, the lms_complexity function
11//! can test a sample of the data to determine whether SA-IS is suited to the data.
12//!
13//! NOTE 1: This implementation effectively uses compressed LMS data in order to reduce cache misses.
14//!
15//! NOTE 2: It is difficult to determine the best sorting alogithm based on only a sample of the data. The approach
16//! used in this version is based on LMS complexity (my term). This technique has proven
17//! to be the most reliable method I have discovered to return a decent result based on analysis of a small sample of the larger data set.
18//!
19//!
20
21const S: u32 = 1;
22const LMS: u32 = 1;
23const L: u32 = 0;
24
25#[allow(clippy::upper_case_acronyms)]
26/// LMS struct holds commpressed L, S, and LMS values, plus counters used for validity checks.
27struct LMS {
28 /// Bit oriented vec of LMS type element indecies
29 lms: Vec<u32>,
30 /// Bit oriented vec of L and S type element indecies
31 ls: Vec<u32>,
32 // The following are primarily used in debugging - to simplify getting counts
33 /// Position of the last element in the input data and LS/LMS vecs
34 last: usize,
35 /// Count of LMS type elements
36 lms_count: usize,
37 /// Count of L type elements
38 l_count: usize,
39 /// Count of S type elements
40 s_count: usize,
41}
42
43/// Create empty LMS struct
44impl LMS {
45 fn new() -> Self {
46 Self {
47 // Initialize ls and lms vecs
48 ls: Vec::new(),
49 lms: Vec::new(),
50 // The following are primarily used in debugging - to simplify getting counts
51 last: 0_usize,
52 lms_count: 0_usize,
53 l_count: 0_usize,
54 s_count: 0_usize,
55 }
56 }
57
58 /// Initialize an LMS struct based on data (which will be either u8 or u32).
59 fn init<T>(&mut self, data: &[T])
60 where
61 T: TryInto<usize> + Copy + std::cmp::Ord,
62 T::Error: std::fmt::Debug,
63 {
64 /*
65 SA-IS builds a suffix tree, but BZIP2 wraps around the end of the string in the comparison.
66
67 The (currently magic) solution is to use a Duval rotation of the data to prevent elements from
68 miss-sorting.
69
70 To store LMS info (s-type, l-type and lms-type elements) I will use a binary system where a set
71 bit indicates an s-type and a a zero bit indicates a l-type in the ls vector. In the lms vector,
72 a set bit indicates an lms element. Since 2^5 = 32, we can index to the correct u32 in the vecs by
73 shifting the index right 5 bits. We can get the right element within that vec by using %32.
74
75 This means that the temporary internal ls/lms info for 64 bytes can be stored in two u32 words (8 bytes).
76
77 The use of sentinels also means that we must store ls/lms info at idx+1 (one position beyond the actual
78 location in the data). We will be putting the sentinel into location 0.
79 */
80
81 // Initialize data end and search starting position
82 self.last = data.len();
83
84 // Initialize ls and lms vecs to all L (0 - no S or LMS found yet)
85 self.lms = vec![L; data.len() / 32 + 1];
86 self.ls = vec![L; data.len() / 32 + 1];
87 // Initialize the final sentinel to S type as well as LMS type
88 self.ls[self.last >> 5] |= S << (self.last % 32);
89 self.lms[self.last >> 5] |= LMS << (self.last % 32);
90 //self.debug();
91
92 // Iterate backwards through the data determining L S and LMS elements. The sentinel
93 // at the end is an S, so the last elmenet by definition must be an L type. We can
94 // start iterating left from here.
95 let mut current = L;
96 let mut prev = data[data.len() - 1];
97 for (idx, &el) in data.iter().enumerate().take(data.len() - 1).rev() {
98 // Compare the current element with the previous element. If less or equal, this is an S
99 match el.cmp(&prev) {
100 std::cmp::Ordering::Less => {
101 self.ls[idx >> 5] |= S << (idx % 32);
102 //self.debug();
103 // Record that we are now working with a S type element
104 current = S;
105 }
106 // If the prev element is equal to this one, we need to compare to whether we are currently
107 // working with S or L
108 std::cmp::Ordering::Equal => {
109 if current == S {
110 // We are in a run of S, so we need to set this one to S. (L's are the unmarked varient)
111 self.ls[idx >> 5] |= S << (idx % 32);
112 //self.debug();
113 }
114 }
115 // If we found an L and we were in a run of S type elements, then the previous element must be an LMS
116 std::cmp::Ordering::Greater => {
117 if current == S {
118 // Mark previous element as lms
119 self.lms[(idx + 1) >> 5] |= LMS << ((idx + 1) % 32);
120 current = L;
121 // self.debug();
122 }
123 }
124 }
125 prev = el;
126 }
127 // Before we leave, take a couple nanoseconds to record the counts
128 self.lms_count = self.lms.iter().map(|el| el.count_ones()).sum::<u32>() as usize;
129 self.s_count = self.ls.iter().map(|el| el.count_ones()).sum::<u32>() as usize;
130 self.l_count = data.len() - self.s_count;
131 }
132
133 /// Checks if element at index is set (is an LMS element)
134 fn is_lms(&self, idx: usize) -> bool {
135 if self.lms[idx >> 5] & (LMS << (idx % 32)) > 0 {
136 return true;
137 };
138 false
139 }
140
141 /// data element at idx is not set (is an L)
142 pub fn is_l(&self, idx: usize) -> bool {
143 if self.ls[idx >> 5] & (S << (idx % 32)) == 0 {
144 return true;
145 };
146 false
147 }
148
149 /// data element at idx is set (is an S)
150 fn is_s(&self, idx: usize) -> bool {
151 if self.ls[idx >> 5] & (S << (idx % 32)) > 0 {
152 return true;
153 };
154 false
155 }
156
157 /// Test if LMS elements at data index a and b are NOT equal. Assumes a and be are lms elements.
158 fn is_unequal_lms<T: std::cmp::PartialOrd + std::fmt::Display>(
159 &self,
160 data: &[T],
161 a: usize,
162 b: usize,
163 ) -> bool {
164 // If either is a sentinel, they are unequal
165 if a == self.last || b == self.last {
166 return true;
167 }
168 // Put smaller of a or b into first, and calculate difference between them
169 let mut i = if a > b { b + 1 } else { a + 1 };
170 let diff = if a > b { a - b } else { b - a };
171
172 // Iterate through the data relative to both a and b checking for equality starting at the element past a
173 while i != self.last - diff {
174 let b = i + diff;
175 // If both a and b are LMS elements, then we are past the segments and the segments were not unequal
176 if self.is_lms(i) && self.is_lms(b) {
177 return false;
178 }
179
180 // If only one was an LMS elements, then we are done and the segments were unequal
181 if self.is_lms(i) || self.is_lms(b) {
182 return true;
183 }
184 // If the next bytes are unequal, then the segments were unequal
185 if data[i] != data[b] {
186 return true;
187 }
188 i += 1;
189 }
190 // We worked through all the data, so the segments were not equal
191 true
192 }
193
194 /*
195 /// Test if LMS elements at data index a and b are NOT equal. Assumes a and be are lms elements.
196 /// This is faster for larger blocks, but oddly slower for smaller blocks.
197 fn is_unequal_lms_b<T: std::cmp::PartialOrd + std::fmt::Display>(
198 &self,
199 data: &[T],
200 a: usize,
201 b: usize,
202 ) -> bool {
203 // If either a or b is a sentinel, the elements are unequal
204 if a == self.last || b == self.last {
205 return true;
206 }
207 // Make sure we start at the element that is smaller
208 let (c, diff) = if a > b {
209 (b, (a - b).min(self.last - a))
210 } else {
211 (a, (b - a).min(self.last - b))
212 };
213
214 let mut length = 1;
215 while !self.is_lms(c + length) && length < diff {
216 length += 1;
217 }
218
219 if data[a..a + length] == data[b..b + length] {
220 return false;
221 }
222 true
223 }
224 */
225 fn debug(&self) {
226 // Print a "count line" to aid counting to stderr
227 eprint!(" --");
228 for i in 0..=self.last {
229 eprint!("{}", i % 10);
230 }
231 eprintln!("-- ");
232
233 // Print LS data
234 eprint!(" LS: ");
235 for i in 0..=self.last {
236 eprint!("{}", (self.ls[i >> 5] & 1_u32 << (i % 32) > 0) as u32);
237 }
238 eprintln!(" ({} S & {} L elements)", self.s_count, self.l_count);
239
240 // Print LMS data
241 eprint!(" LMS: ");
242 for i in 0..=self.last {
243 eprint!("{}", (self.lms[i >> 5] & 1_u32 << (i % 32) > 0) as u32);
244 }
245 eprintln!(" ({} LMS elements)", self.lms_count);
246 }
247}
248
249#[cfg(test)]
250mod test_lms {
251 use super::*;
252 #[test]
253 pub fn lms_test() {
254 let data = "caabage".as_bytes();
255 let mut lms = LMS::new();
256 lms.init(data);
257 //cabbage - LSLLSLLS
258 assert!(lms.is_l(0));
259 assert!(lms.is_s(1));
260 assert!(lms.is_s(2));
261 assert!(lms.is_l(3));
262 assert!(lms.is_s(4));
263 assert!(lms.is_l(5));
264 assert!(lms.is_l(6));
265 assert!(lms.is_s(7));
266
267 assert_eq!(lms.is_lms(0), false);
268 assert_eq!(lms.is_lms(1), true);
269 assert_eq!(lms.is_lms(2), false);
270 assert_eq!(lms.is_lms(3), false);
271 assert_eq!(lms.is_lms(4), true);
272 assert_eq!(lms.is_lms(5), false);
273 assert_eq!(lms.is_lms(6), false);
274 assert_eq!(lms.is_lms(7), true);
275 }
276}
277//--- Done with LMS struct ------------------------------------------------------------------------------------
278
279use log::{error, debug};
280//-- Counts for Bucket Sorting --------------------------------------------------------------------------------
281use rayon::prelude::*;
282
283/// Return frequency count of elements in the input vec. Size is the value of the largest element in the input. (Vec based because we don't know the number
284/// of elements we will be counting.)
285fn bucket_sizes<T>(data: &[T], size: usize) -> Vec<u32>
286where
287 T: TryInto<usize> + Copy,
288 T::Error: std::fmt::Debug,
289 T: Sync,
290{
291 // Use parallel method if more than 64k elements in the data
292 if data.len() > 64_000 {
293 // 16k is pretty much the sweet spot for chunk size.
294 data.par_chunks(16_000)
295 .fold(
296 || vec![0_u32; size],
297 |mut freqs, chunk| {
298 chunk
299 .iter()
300 .for_each(|&el| freqs[el.try_into().unwrap_or_default()] += 1);
301 freqs
302 },
303 )
304 .reduce(
305 || vec![0_u32; size],
306 |s, f| s.iter().zip(&f).map(|(a, b)| a + b).collect::<Vec<u32>>(),
307 )
308 } else {
309 let mut freqs = vec![0_u32; size];
310 data.iter()
311 .for_each(|&el| freqs[el.try_into().unwrap_or_default()] += 1);
312 freqs
313 }
314}
315
316/// Returns index to top positions of buckets for bucket sorting.
317fn bucket_heads(buckets: &[u32]) -> Vec<u32> {
318 buckets
319 .iter()
320 .enumerate()
321 .fold(
322 (vec![0_u32; buckets.len()], 1_u32),
323 |(mut head, mut idx), (i, &count)| {
324 head[i] = idx;
325 idx += count;
326 (head, idx)
327 },
328 )
329 .0
330}
331/// Returns index to bottom positions of buckets for bucket sorting.
332fn bucket_tails(buckets: &[u32]) -> Vec<u32> {
333 buckets
334 .iter()
335 .enumerate()
336 .fold(
337 (vec![0_u32; buckets.len()], 1_u32),
338 |(mut tail, mut idx), (i, &count)| {
339 idx += count;
340 tail[i] = idx - 1;
341 (tail, idx)
342 },
343 )
344 .0
345}
346
347#[cfg(test)]
348mod test_bucket_prep {
349 use super::*;
350 #[test]
351 pub fn freq_count_test() {
352 let data = [2, 0, 1, 1, 0, 6, 4];
353 let frq = bucket_sizes(&data, 7);
354 assert_eq!(frq[0..7], vec![2, 2, 1, 0, 1, 0, 1]);
355 }
356 #[test]
357 pub fn freq_head_test() {
358 let data = [2, 0, 1, 1, 0, 6, 4];
359 let freq = bucket_sizes(&data, 7);
360 let heads = bucket_heads(&freq);
361 assert_eq!(heads[0..7], vec![1, 3, 5, 6, 6, 7, 7]);
362 }
363 #[test]
364 pub fn freq_tail_test() {
365 let data = [2, 0, 1, 1, 0, 6, 4];
366 let freq = bucket_sizes(&data, 7);
367 let tails = bucket_tails(&freq);
368 assert_eq!(tails[0..7], vec![2, 4, 5, 5, 6, 6, 7]);
369 }
370}
371//-- End Frequency Counts for Bucket Sorting -------------------------------------------------------------------
372
373//-- Bucket Sorting --------------------------------------------------------------------------------------------
374/// Initial bucket sort of the LMS elements in the buckets vec.
375fn initial_buckets_sort<T>(data: &[T], bkt_sizes: &[u32], lms: &LMS) -> Vec<Option<u32>>
376where
377 T: TryInto<usize> + Copy,
378 T::Error: std::fmt::Debug,
379{
380 // Get the bucket tails info
381 let mut tails = bucket_tails(bkt_sizes);
382
383 // Initialize output vec to contain 1 more element that the input data
384 let mut buckets = vec![None; data.len() + 1];
385 buckets[0] = Some(lms.last as u32);
386
387 // Find the LMS elements
388 for idx in (0..lms.last).rev() {
389 if lms.is_lms(idx) {
390 buckets[tails[data[idx].try_into().unwrap_or_default()] as usize] = Some(idx as u32);
391 tails[data[idx].try_into().unwrap_or_default()] -= 1;
392 }
393 }
394 // Add the sentinel to the front
395 buckets[0] = Some(data.len() as u32);
396 buckets
397}
398
399/// Induce L type elements into the sort array after initial allocation of LMS elements
400fn induced_sort_l<T>(data: &[T], buckets: &mut [Option<u32>], bkt_sizes: &[u32], lms: &LMS)
401where
402 T: TryInto<usize> + Copy,
403 T::Error: std::fmt::Debug,
404{
405 // Get the bucket heads info
406 let mut heads = bucket_heads(bkt_sizes);
407
408 // Find L type elements that are left of the index and insert them into the buckets.
409 // We can start at 0 and walk to the end with L type elements.
410 for idx in 0..lms.last {
411 // Only do buckets with a valid index
412 if buckets[idx].is_some() {
413 // Check if the element left of the element indexed by this bucket also an L type.
414 let prev = if buckets[idx] == Some(0) {
415 lms.last
416 } else {
417 buckets[idx].unwrap() as usize - 1
418 };
419 if lms.is_l(prev) {
420 // If so, insert that l-type into the next free top spot in that bucket
421 buckets[heads[data[prev].try_into().unwrap_or_default()] as usize] =
422 Some(prev as u32);
423 // And adjust the location available for the next L-type in this bucket (if any)
424 heads[data[prev].try_into().unwrap_or_default()] += 1;
425 //DEBUG
426 //lms.debug();
427 }
428 }
429 }
430}
431
432/// Induce S type elements into the sort array after induced L sort
433fn induced_sort_s<T>(data: &[T], buckets: &mut [Option<u32>], bkt_sizes: &[u32], lms: &LMS)
434where
435 T: TryInto<usize> + Copy + std::cmp::Ord,
436 T::Error: std::fmt::Debug,
437{
438 // Get the bucket tails info
439 let mut tails = bucket_tails(bkt_sizes);
440
441 // Start at the right most known element and then iterate down.
442 let mut idx = lms.last;
443 // Prepare to loop back to here
444 while idx > 0 {
445 // Check if the element left of the element indexed by this bucket is an S type.
446 if buckets[idx].is_none() {
447 error!("This should never happen. Idx is {}", idx);
448 lms.debug(); // Pause here
449 }
450 // As long as we are not referencing the start of the data
451 if buckets[idx] != Some(0) {
452 // Get the previous element index
453 let prev = buckets[idx].unwrap() as usize - 1;
454 // If it is an S type
455 if lms.is_s(prev) {
456 // Insert/update that element into the next free bottom spot in the appropriate bucket
457 let bkt_index = data[prev].try_into().unwrap_or_default();
458 buckets[tails[bkt_index] as usize] = Some(prev as u32);
459 // And adjust the location available for the next S type in this bucket (if any)
460 tails[data[prev].try_into().unwrap_or_default()] -= 1;
461 //DEBUG
462 //lms.debug();
463 }
464 };
465 idx -= 1;
466 }
467}
468
469fn sa_is<T>(data: &[T], alphabet_size: usize) -> Vec<u32>
470where
471 T: TryInto<usize> + Copy + std::cmp::Ord + std::fmt::Display + std::marker::Sync,
472 T::Error: std::fmt::Debug,
473{
474 // Don't attemp to process empty data.
475 if data.is_empty() {
476 return vec![];
477 }
478
479 // STEP 1: Build LMS info
480 let mut lms = LMS::new();
481 lms.init(data);
482 //DEBUG
483 //lms.debug();
484
485 // STEP 2: Calculate buckets for bucket sorting
486 let bkt_sizes = bucket_sizes(data, alphabet_size);
487
488 // STEP 3: Do initial bucket sorting of LMS elements
489 let mut buckets = initial_buckets_sort(data, &bkt_sizes, &lms);
490 // Validity test for development
491 if lms.lms_count != buckets.iter().filter(|&b| b.is_some()).count() {
492 error!(
493 "Didn't initialize buckets properly. Missed {}.",
494 lms.lms_count - buckets.iter().filter(|&b| b.is_some()).count()
495 );
496 debug_buckets('i', &buckets);
497 }
498
499 // STEP 4: Do induced L sort
500 induced_sort_l(data, &mut buckets, &bkt_sizes, &lms);
501 // Validity test for development
502 if lms.s_count - lms.lms_count != buckets.iter().filter(|&b| b.is_none()).count() {
503 error!(
504 "Expected to have {} empty buckets after first induced_sort_l. Instead...",
505 lms.s_count - lms.lms_count,
506 );
507 debug_nones(&buckets);
508 debug_buckets('l', &buckets);
509 }
510
511 // STEP 5: Do induced S sort
512 induced_sort_s(data, &mut buckets, &bkt_sizes, &lms);
513 // Validity test for development
514 if 0 != buckets.iter().filter(|&b| b.is_none()).count() {
515 error!(
516 "Didn't complete s-induced sort properly. Missed {}.",
517 buckets.iter().filter(|&b| b.is_none()).count()
518 );
519 debug_nones(&buckets);
520 //debug_buckets('s', &buckets);
521 }
522
523 // STEP 6: Create Summary Suffix list from correctly sorted LMS elements
524 // Create summary of unique lms elements.
525 let (summary, offsets, summary_size) = make_summary(data, &mut buckets, &lms);
526 // Create unique ordered elements, recursing if needed to ensure only unique lms elements exist.
527 let summary_suffix_vec = make_summary_suffix_vec(summary_size, &lms, summary);
528
529 //STEP 7: Do final bucket sort based on the summary. First clear the buckets
530 (0..buckets.len()).for_each(|i| buckets[i] = None);
531
532 // Get the bucket tails info
533 let mut tails = bucket_tails(&bkt_sizes);
534
535 // Place the LMS elements in the summary_suffix_vec
536 for el in summary_suffix_vec.iter().skip(2).rev() {
537 let data_index = offsets[*el as usize] as usize;
538 let bucket_index = data[data_index].try_into().unwrap_or_default() as usize;
539 buckets[tails[bucket_index] as usize] = Some(data_index as u32);
540 tails[bucket_index] -= 1;
541 }
542
543 // Add the sentinel to the front
544 buckets[0] = Some(data.len() as u32);
545
546 // Validity test for development
547 if lms.lms_count != buckets.iter().filter(|&b| b.is_some()).count() {
548 error!("Didn't initialize buckets for FINAL sort properly.");
549 debug_buckets('F', &buckets);
550 }
551
552 // STEP 9: Do induced L sort
553 induced_sort_l(data, &mut buckets, &bkt_sizes, &lms);
554 // Validity test for development
555 if lms.s_count - lms.lms_count != buckets.iter().filter(|&b| b.is_none()).count() {
556 error!(
557 "Expected to have {} empty buckets during first induced_sort_l. Instead...",
558 lms.s_count - lms.lms_count,
559 );
560 debug_nones(&buckets);
561 //debug_buckets('L', &buckets);
562 }
563
564 // STEP 10: Do induced S sort
565 induced_sort_s(data, &mut buckets, &bkt_sizes, &lms);
566 // Validity test for development
567 if 0 != buckets.iter().filter(|&b| b.is_none()).count() {
568 error!(
569 "Didn't complete s-induced sort properly. Missed {}.",
570 buckets.iter().filter(|&b| b.is_none()).count()
571 );
572 debug_nones(&buckets);
573 //debug_buckets('S', &buckets);
574 }
575
576 // Convert Option<u32> to u32 and return that vec
577 buckets.iter().skip(1).map(|&el| el.unwrap()).collect()
578}
579
580/// Entry point for Simple SA-IS sort for Burrow-Wheeler Transform. Takes u8 slice and returns
581/// u32 key and u8 vector in BWT format.
582pub fn sais_entry(data: &[u8]) -> (u32, Vec<u8>) {
583 /*
584 SA-IS doesn't work in our context unless it is "duval rotated". We must have a lexicographically minimal
585 rotation of the data or the conversion from the index to the BWT vec will be off.
586
587 The duval rotation finds the "lexically minimal" point and splits and reorders the data around that point.
588
589 Cudos to https://github.com/torfmaster/ribzip2, where I initially saw this concept in use.
590 */
591
592 // Do the rotation and return the reorganized data and offset to the original start of the data.
593 let (data, offset) = rotate_duval(data);
594
595 // Go do the sa-is sort, returning the index to the BWT.
596 let index = sa_is(&data, 256);
597
598 // Initialize the key. We find the actutal value in the loop below.
599 let mut key = 0_u32;
600 // Initialize the final vec
601 let mut bwt = vec![0_u8; data.len()];
602 // Get the offset to the original start of the data
603 let duval_zero_position = (index.len() - offset) as u32;
604
605 // Create the final BWT vec and find the actual key value
606 for i in 0..index.len() {
607 // Watch for the key location
608 if index[i] == duval_zero_position {
609 key = i as u32
610 }
611 // BWT is build from the data at the previous index location. Wrap around if the index is at 0.
612 if index[i] == 0 {
613 bwt[i] = data[data.len() - 1] as u8;
614 } else {
615 bwt[i] = data[index[i] as usize - 1];
616 }
617 }
618 // Return the key and BWT data
619 (key, bwt)
620}
621
622/// Create summary of LMS elements. Return vec of LMS names, vec of offsets and count of unique LMS names.
623fn make_summary<T>(
624 data: &[T],
625 buckets: &mut [Option<u32>],
626 lms: &LMS,
627) -> (Vec<u32>, Vec<u32>, usize)
628where
629 T: TryInto<usize> + Copy + std::cmp::Ord + std::fmt::Display,
630 T::Error: std::fmt::Debug,
631{
632 // Initialize temporary names vec
633 let mut names: Vec<Option<u32>> = vec![None; buckets.len()];
634 // Initialize temporary offset(pointer) list
635 let mut offsets = vec![None; buckets.len()];
636 // Initialize current name
637 let mut current_name = 0_u32;
638 // Initialize sentinel name
639 names[buckets[0].unwrap() as usize] = Some(current_name);
640 // Initialize sentinel pointer
641 offsets[buckets[0].unwrap() as usize] = Some(lms.last as u32);
642 // Initialize previous LMS to sentinel LMS value
643 let mut prev_lms = buckets[0];
644
645 // Iterate through the buckets looking for sequences of identical lms groups, skipping the sentinel
646 for &ptr in buckets[1..].iter() {
647 // Unwrap and convert to usize once - to make it easier to read
648 if lms.is_lms(ptr.unwrap() as usize) {
649 let curr_lms = ptr.unwrap() as usize;
650 if lms.is_unequal_lms(data, prev_lms.unwrap() as usize, curr_lms) {
651 prev_lms = Some(curr_lms as u32);
652 current_name += 1;
653 }
654 // Store the results based on the pointer in the buckets
655 names[curr_lms] = Some(current_name);
656 offsets[curr_lms] = Some(curr_lms as u32);
657 }
658 }
659 // Filter out non-lms elements and return names, offsets and count of unique LMS names
660 (
661 names.into_iter().flatten().collect::<Vec<u32>>(),
662 offsets.into_iter().flatten().collect::<Vec<u32>>(),
663 current_name as usize + 1,
664 )
665}
666
667/// Create vec that contains only unique LMS elements, recursing if needed to ensure only unique LMS elements exist.
668fn make_summary_suffix_vec(summary_size: usize, lms: &LMS, mut summary: Vec<u32>) -> Vec<u32> {
669 // Recurse if we had any identical LMS groups when we made the summary (make_summary returned summary_size,
670 // the count of unique LMS elements)
671 if summary_size != lms.lms_count {
672 // Recurse
673 summary = sa_is(&summary, summary_size);
674 // The above recurses until there are no more duplicate LMS elements.
675 // Now return the summary the recursion finally produced.
676 let mut summary_suffix_vec = vec![summary.len() as u32; summary.len() + 1];
677 summary_suffix_vec[1..(summary.len() + 1)].copy_from_slice(&summary[..]);
678 summary_suffix_vec
679 } else {
680 // Make a summary suffix vec from summary with a new sentinel at the beginning.
681 let mut summary_suffix_vec = vec![summary_size as u32; summary.len() + 1];
682 // Fill in the rest of the vec from summary
683 summary.iter().enumerate().for_each(|(idx, el)| {
684 summary_suffix_vec[*el as usize + 1] = idx as u32;
685 });
686 summary_suffix_vec
687 }
688}
689
690/// Debug function for the buckets.
691fn debug_buckets(note: char, buckets: &[Option<u32>]) {
692 debug!(
693 "{} Buckets: {:?}",
694 note,
695 (0..32.min(buckets.len())).fold(Vec::new(), |mut vec, i| {
696 vec.push(buckets[i]);
697 vec
698 })
699 );
700}
701
702/// Debug function to show which bucket elemements were missing.
703fn debug_nones(buckets: &[Option<u32>]) {
704 let mut indecies = (0..buckets.len() as u32).collect::<Vec<u32>>();
705 buckets.iter().for_each(|b| {
706 if b.is_some() {
707 indecies.remove(b.unwrap() as usize);
708 }
709 });
710 indecies.sort();
711 debug!(" Didn't fill: {:?} ", indecies);
712}
713
714/*
715/// Compute the Lexicographically Minimal String Rotation. Passes tests but results in failed compression.
716fn duval_ds(input: &[u8]) -> usize {
717 let n = input.len();
718 if n < 2 {
719 return 0;
720 }
721 let mut smallest = (input[0], 0);
722
723 let mut this = 0;
724 let mut next = 1;
725 // Find the smallest run
726 while this < n - 1 {
727 if next >= n {
728 next -= n
729 };
730 // If ever the next byte is smaller than the smallest, we have a new smallest run
731 if input[next] < smallest.0 {
732 smallest = (input[next], next);
733 next += 1;
734 continue;
735 };
736 // If the next byte is equal to this current byte, increment both and check the next
737 if input[this] == input[next] {
738 next += 1;
739 this += 1;
740 continue;
741 }
742 // If the next byte is smaller than this byte, we could have a new smallest run starting at next_start
743 if input[next] < input[this] {
744 // See if the next byte is different from the smallest
745 if input[next] != smallest.0 {
746 // If it is different, we can't have a new run. Go to the next byte
747 next += 1;
748 this += 1;
749 continue;
750 } else {
751 // check which run is longer - the one starting at smallest or the one starting at next
752 let (mut a, mut b) = (smallest.1, next);
753 // Advance past the equal bytes
754 while input[a] == input[b % n] {
755 a += 1;
756 b += 1;
757 if a == n {
758 break;
759 }
760 }
761 // If a is greater than b, run b is longer
762 if input[a % n] > input[b % n] {
763 smallest = (input[next], next);
764 }
765 this = b;
766 next = b + 1;
767 continue;
768 }
769 }
770 // If the next byte is larger than this one, start looking again at the next byte
771 if input[next] > input[this] {
772 next += 1;
773 this += 1;
774 continue;
775 }
776 }
777 smallest.1
778} */
779
780/// Compute the Lexicographically Minimal String Rotation. Fails tests, but is verbatim from ribzip2 and seems to work.
781fn duval(input: &[u8]) -> usize {
782 let mut final_start = 0;
783 let n = input.len();
784 let mut i = 0;
785
786 while i < n {
787 let mut j = i + 1;
788 let mut k = i;
789 while j < n && input[k] <= input[j] {
790 if input[k] < input[j] {
791 k = i;
792 } else {
793 k += 1;
794 }
795 j += 1;
796 }
797 while i <= k {
798 final_start = i;
799
800 i += j - k;
801 }
802 }
803 final_start
804}
805
806/// Compute lexicographically minimal rotation using the duval algorithm.
807/// Returns the rotation and the offset.
808fn rotate_duval(input: &[u8]) -> (Vec<u8>, usize) {
809 let offset = duval(input);
810 //let offset = duval(input);
811 let mut buf = vec![];
812 let (head, tail) = input.split_at(offset);
813 buf.append(&mut tail.to_vec());
814 buf.append(&mut head.to_vec());
815 (buf, offset)
816}
817
818/// Given a sample slice of the data (5k suggested), compute the data complexity.
819/// A complexity of less than 0.3 indicates that SA-IS is the better algorithm
820/// than the native sort_unstable algorithm.
821pub fn lms_complexity(data: &[u8]) -> f64 {
822 // STEP 1: Build LMS info
823 let mut lms = LMS::new();
824 lms.init(data);
825
826 // STEP 2: Compute and return the LMS complexity
827 debug!("Lms complexity is {}", lms.lms_count as f64 / data.len() as f64);
828 lms.lms_count as f64 / data.len() as f64
829}
830
831#[cfg(test)]
832mod tests {
833 use super::*;
834 #[test]
835 fn duval_test_a() {
836 let data = "a".as_bytes();
837 assert_eq!(duval(data), 0);
838 }
839 #[test]
840 fn duval_test_ba() {
841 let data = "ba".as_bytes();
842 assert_eq!(duval(data), 1);
843 }
844 #[test]
845 fn duval_test_aaaaaa() {
846 let data = "aaaaaa".as_bytes();
847 assert_eq!(duval(data), 0);
848 //assert_eq!(duval_original(data), 0);
849 }
850 #[test]
851 fn duval_test_aaaaab() {
852 let data = "aaaaab".as_bytes();
853 assert_eq!(duval(data), 0);
854 }
855 #[test]
856 fn duval_test_aaaaba() {
857 let data = "aaaaba".as_bytes();
858 assert_eq!(duval(data), 5);
859 }
860 #[test]
861 fn duval_test_aaabaa() {
862 let data = "aaabaa".as_bytes();
863 assert_eq!(duval(data), 4);
864 }
865 #[test]
866 fn duval_test_aabaaa() {
867 let data = "aabaaa".as_bytes();
868 assert_eq!(duval(data), 3);
869 }
870 #[test]
871 fn duval_test_abaaaa() {
872 let data = "abaaaa".as_bytes();
873 assert_eq!(duval(data), 2);
874 }
875 #[test]
876 fn duval_test_baaaaa() {
877 let data = "baaaaa".as_bytes();
878 assert_eq!(duval(data), 1);
879 }
880 #[test]
881 fn duval_test_baaaab() {
882 let data = "baaaab".as_bytes();
883 assert_eq!(duval(data), 1);
884 }
885 #[test]
886 fn duval_test_abbbba() {
887 let data = "abbbba".as_bytes();
888 assert_eq!(duval(data), 5);
889 }
890 #[test]
891 fn duval_test_baabaa() {
892 let data = "baabaa".as_bytes();
893 assert_eq!(duval(data), 1);
894 }
895 #[test]
896 fn duval_test_abaabaaabaababaaabaaababaab() {
897 let data = "abaabaaabaababaaabaaababaab".as_bytes();
898 assert_eq!(duval(data), 14);
899 }
900}