bzip2_os/bwt_algorithms/
sais_fallback.rs

1//! The fallback bwt_sort algorithm for the Rust version of the standard BZIP2 library.
2//!
3//! This is a simple SA-IS implemenation of a sorting algorithm, using sentinels. SA-IS is particularly suited
4//! to repetative data such as is found in genetic sequencing.
5//! 
6//! SA-IS does not lend itself to multi-threading when the data sets are smaller (except for frequency counting). The BZIP2 algorithm has a maximum
7//! block size of 900kb, consequently no attempt at multithreading within the sorting portion of the algorithm is made. (Multiple blocks
8//! will be processed in parallel, though.)
9//! 
10//! In order to determine whether the data is more likely to be better sorted using SA-IS, the lms_complexity function
11//! can test a sample of the data to determine whether SA-IS is suited to the data.
12//! 
13//! NOTE 1: This implementation effectively uses compressed LMS data in order to reduce cache misses.
14//! 
15//! NOTE 2: It is difficult to determine the best sorting alogithm based on only a sample of the data. The approach 
16//! used in this version is based on LMS complexity (my term). This technique has proven
17//! to be the most reliable method I have discovered to return a decent result based on analysis of a small sample of the larger data set.
18//! 
19//! 
20
21const S: u32 = 1;
22const LMS: u32 = 1;
23const L: u32 = 0;
24
25#[allow(clippy::upper_case_acronyms)]
26/// LMS struct holds commpressed L, S, and LMS values, plus counters used for validity checks.
27struct LMS {
28    /// Bit oriented vec of LMS type element indecies
29    lms: Vec<u32>,
30    /// Bit oriented vec of L and S type element indecies
31    ls: Vec<u32>,
32    // The following are primarily used in debugging - to simplify getting counts
33    /// Position of the last element in the input data and LS/LMS vecs
34    last: usize,
35    /// Count of LMS type elements
36    lms_count: usize,
37    /// Count of L type elements
38    l_count: usize,
39    /// Count of S type elements
40    s_count: usize,
41}
42
43/// Create empty LMS struct
44impl LMS {
45    fn new() -> Self {
46        Self {
47            // Initialize ls and lms vecs
48            ls: Vec::new(),
49            lms: Vec::new(),
50            // The following are primarily used in debugging - to simplify getting counts
51            last: 0_usize,
52            lms_count: 0_usize,
53            l_count: 0_usize,
54            s_count: 0_usize,
55        }
56    }
57
58    /// Initialize an LMS struct based on data (which will be either u8 or u32).
59    fn init<T>(&mut self, data: &[T])
60    where
61        T: TryInto<usize> + Copy + std::cmp::Ord,
62        T::Error: std::fmt::Debug,
63    {
64        /*
65        SA-IS builds a suffix tree, but BZIP2 wraps around the end of the string in the comparison.
66
67        The (currently magic) solution is to use a Duval rotation of the data to prevent elements from
68        miss-sorting.
69
70        To store LMS info (s-type, l-type and lms-type elements) I will use a binary system where a set
71        bit indicates an s-type and a a zero bit indicates a l-type in the ls vector. In the lms vector,
72        a set bit indicates an lms element. Since 2^5 = 32, we can index to the correct u32 in the vecs by
73        shifting the index right 5 bits. We can get the right element within that vec by using %32.
74
75        This means that the temporary internal ls/lms info for 64 bytes can be stored in two u32 words (8 bytes).
76
77        The use of sentinels also means that we must store ls/lms info at idx+1 (one position beyond the actual
78        location in the data). We will be putting the sentinel into location 0.
79        */
80
81        // Initialize data end and search starting position
82        self.last = data.len();
83
84        // Initialize ls and lms vecs to all L (0 - no S or LMS found yet)
85        self.lms = vec![L; data.len() / 32 + 1];
86        self.ls = vec![L; data.len() / 32 + 1];
87        // Initialize the final sentinel to S type as well as LMS type
88        self.ls[self.last >> 5] |= S << (self.last % 32);
89        self.lms[self.last >> 5] |= LMS << (self.last % 32);
90        //self.debug();
91
92        // Iterate backwards through the data determining L S and LMS elements. The sentinel
93        // at the end is an S, so the last elmenet by definition must be an L type. We can
94        // start iterating left from here.
95        let mut current = L;
96        let mut prev = data[data.len() - 1];
97        for (idx, &el) in data.iter().enumerate().take(data.len() - 1).rev() {
98            // Compare the current element with the previous element. If less or equal, this is an S
99            match el.cmp(&prev) {
100                std::cmp::Ordering::Less => {
101                    self.ls[idx >> 5] |= S << (idx % 32);
102                    //self.debug();
103                    // Record that we are now working with a S type element
104                    current = S;
105                }
106                // If the prev element is equal to this one, we need to compare to whether we are currently
107                // working with S or L
108                std::cmp::Ordering::Equal => {
109                    if current == S {
110                        // We are in a run of S, so we need to set this one to S. (L's are the unmarked varient)
111                        self.ls[idx >> 5] |= S << (idx % 32);
112                        //self.debug();
113                    }
114                }
115                // If we found an L and we were in a run of S type elements, then the previous element must be an LMS
116                std::cmp::Ordering::Greater => {
117                    if current == S {
118                        // Mark previous element as lms
119                        self.lms[(idx + 1) >> 5] |= LMS << ((idx + 1) % 32);
120                        current = L;
121                        // self.debug();
122                    }
123                }
124            }
125            prev = el;
126        }
127        // Before we leave, take a couple nanoseconds to record the counts
128        self.lms_count = self.lms.iter().map(|el| el.count_ones()).sum::<u32>() as usize;
129        self.s_count = self.ls.iter().map(|el| el.count_ones()).sum::<u32>() as usize;
130        self.l_count = data.len() - self.s_count;
131    }
132
133    /// Checks if element at index is set (is an LMS element)
134    fn is_lms(&self, idx: usize) -> bool {
135        if self.lms[idx >> 5] & (LMS << (idx % 32)) > 0 {
136            return true;
137        };
138        false
139    }
140
141    /// data element at idx is not set (is an L)
142    pub fn is_l(&self, idx: usize) -> bool {
143        if self.ls[idx >> 5] & (S << (idx % 32)) == 0 {
144            return true;
145        };
146        false
147    }
148
149    /// data element at idx is set (is an S)
150    fn is_s(&self, idx: usize) -> bool {
151        if self.ls[idx >> 5] & (S << (idx % 32)) > 0 {
152            return true;
153        };
154        false
155    }
156
157    /// Test if LMS elements at data index a and b are NOT equal. Assumes a and be are lms elements.
158    fn is_unequal_lms<T: std::cmp::PartialOrd + std::fmt::Display>(
159        &self,
160        data: &[T],
161        a: usize,
162        b: usize,
163    ) -> bool {
164        // If either is a sentinel, they are unequal
165        if a == self.last || b == self.last {
166            return true;
167        }
168        // Put smaller of a or b into first, and calculate difference between them
169        let mut i = if a > b { b + 1 } else { a + 1 };
170        let diff = if a > b { a - b } else { b - a };
171
172        // Iterate through the data relative to both a and b checking for equality starting at the element past a
173        while i != self.last - diff {
174            let b = i + diff;
175            // If both a and b are LMS elements, then we are past the segments and the segments were not unequal
176            if self.is_lms(i) && self.is_lms(b) {
177                return false;
178            }
179
180            // If only one was an LMS elements, then we are done and the segments were unequal
181            if self.is_lms(i) || self.is_lms(b) {
182                return true;
183            }
184            // If the next bytes are unequal, then the segments were unequal
185            if data[i] != data[b] {
186                return true;
187            }
188            i += 1;
189        }
190        // We worked through all the data, so the segments were not equal
191        true
192    }
193
194    /*
195       /// Test if LMS elements at data index a and b are NOT equal. Assumes a and be are lms elements.
196       /// This is faster for larger blocks, but oddly slower for smaller blocks.
197       fn is_unequal_lms_b<T: std::cmp::PartialOrd + std::fmt::Display>(
198           &self,
199           data: &[T],
200           a: usize,
201           b: usize,
202       ) -> bool {
203           // If either a or b is a sentinel, the elements are unequal
204           if a == self.last || b == self.last {
205               return true;
206           }
207           // Make sure we start at the element that is smaller
208           let (c, diff) = if a > b {
209               (b, (a - b).min(self.last - a))
210           } else {
211               (a, (b - a).min(self.last - b))
212           };
213
214           let mut length = 1;
215           while !self.is_lms(c + length) && length < diff {
216               length += 1;
217           }
218
219           if data[a..a + length] == data[b..b + length] {
220               return false;
221           }
222           true
223       }
224    */
225    fn debug(&self) {
226        // Print a "count line" to aid counting to stderr
227        eprint!("    --");
228        for i in 0..=self.last {
229            eprint!("{}", i % 10);
230        }
231        eprintln!("--  ");
232
233        // Print LS data
234        eprint!("  LS: ");
235        for i in 0..=self.last {
236            eprint!("{}", (self.ls[i >> 5] & 1_u32 << (i % 32) > 0) as u32);
237        }
238        eprintln!("  ({} S & {} L elements)", self.s_count, self.l_count);
239
240        // Print LMS data
241        eprint!(" LMS: ");
242        for i in 0..=self.last {
243            eprint!("{}", (self.lms[i >> 5] & 1_u32 << (i % 32) > 0) as u32);
244        }
245        eprintln!("  ({} LMS elements)", self.lms_count);
246    }
247}
248
249#[cfg(test)]
250mod test_lms {
251    use super::*;
252    #[test]
253    pub fn lms_test() {
254        let data = "caabage".as_bytes();
255        let mut lms = LMS::new();
256        lms.init(data);
257        //cabbage - LSLLSLLS
258        assert!(lms.is_l(0));
259        assert!(lms.is_s(1));
260        assert!(lms.is_s(2));
261        assert!(lms.is_l(3));
262        assert!(lms.is_s(4));
263        assert!(lms.is_l(5));
264        assert!(lms.is_l(6));
265        assert!(lms.is_s(7));
266
267        assert_eq!(lms.is_lms(0), false);
268        assert_eq!(lms.is_lms(1), true);
269        assert_eq!(lms.is_lms(2), false);
270        assert_eq!(lms.is_lms(3), false);
271        assert_eq!(lms.is_lms(4), true);
272        assert_eq!(lms.is_lms(5), false);
273        assert_eq!(lms.is_lms(6), false);
274        assert_eq!(lms.is_lms(7), true);
275    }
276}
277//--- Done with LMS struct ------------------------------------------------------------------------------------
278
279use log::{error, debug};
280//-- Counts for Bucket Sorting --------------------------------------------------------------------------------
281use rayon::prelude::*;
282
283/// Return frequency count of elements in the input vec. Size is the value of the largest element in the input. (Vec based because we don't know the number
284/// of elements we will be counting.)
285fn bucket_sizes<T>(data: &[T], size: usize) -> Vec<u32>
286where
287    T: TryInto<usize> + Copy,
288    T::Error: std::fmt::Debug,
289    T: Sync,
290{
291    // Use parallel method if more than 64k elements in the data
292    if data.len() > 64_000 {
293        // 16k is pretty much the sweet spot for chunk size.
294        data.par_chunks(16_000)
295            .fold(
296                || vec![0_u32; size],
297                |mut freqs, chunk| {
298                    chunk
299                        .iter()
300                        .for_each(|&el| freqs[el.try_into().unwrap_or_default()] += 1);
301                    freqs
302                },
303            )
304            .reduce(
305                || vec![0_u32; size],
306                |s, f| s.iter().zip(&f).map(|(a, b)| a + b).collect::<Vec<u32>>(),
307            )
308    } else {
309        let mut freqs = vec![0_u32; size];
310        data.iter()
311            .for_each(|&el| freqs[el.try_into().unwrap_or_default()] += 1);
312        freqs
313    }
314}
315
316/// Returns index to top positions of buckets for bucket sorting.
317fn bucket_heads(buckets: &[u32]) -> Vec<u32> {
318    buckets
319        .iter()
320        .enumerate()
321        .fold(
322            (vec![0_u32; buckets.len()], 1_u32),
323            |(mut head, mut idx), (i, &count)| {
324                head[i] = idx;
325                idx += count;
326                (head, idx)
327            },
328        )
329        .0
330}
331/// Returns index to bottom positions of buckets for bucket sorting.
332fn bucket_tails(buckets: &[u32]) -> Vec<u32> {
333    buckets
334        .iter()
335        .enumerate()
336        .fold(
337            (vec![0_u32; buckets.len()], 1_u32),
338            |(mut tail, mut idx), (i, &count)| {
339                idx += count;
340                tail[i] = idx - 1;
341                (tail, idx)
342            },
343        )
344        .0
345}
346
347#[cfg(test)]
348mod test_bucket_prep {
349    use super::*;
350    #[test]
351    pub fn freq_count_test() {
352        let data = [2, 0, 1, 1, 0, 6, 4];
353        let frq = bucket_sizes(&data, 7);
354        assert_eq!(frq[0..7], vec![2, 2, 1, 0, 1, 0, 1]);
355    }
356    #[test]
357    pub fn freq_head_test() {
358        let data = [2, 0, 1, 1, 0, 6, 4];
359        let freq = bucket_sizes(&data, 7);
360        let heads = bucket_heads(&freq);
361        assert_eq!(heads[0..7], vec![1, 3, 5, 6, 6, 7, 7]);
362    }
363    #[test]
364    pub fn freq_tail_test() {
365        let data = [2, 0, 1, 1, 0, 6, 4];
366        let freq = bucket_sizes(&data, 7);
367        let tails = bucket_tails(&freq);
368        assert_eq!(tails[0..7], vec![2, 4, 5, 5, 6, 6, 7]);
369    }
370}
371//-- End Frequency Counts for Bucket Sorting -------------------------------------------------------------------
372
373//-- Bucket Sorting --------------------------------------------------------------------------------------------
374/// Initial bucket sort of the LMS elements in the buckets vec.
375fn initial_buckets_sort<T>(data: &[T], bkt_sizes: &[u32], lms: &LMS) -> Vec<Option<u32>>
376where
377    T: TryInto<usize> + Copy,
378    T::Error: std::fmt::Debug,
379{
380    // Get the bucket tails info
381    let mut tails = bucket_tails(bkt_sizes);
382
383    // Initialize output vec to contain 1 more element that the input data
384    let mut buckets = vec![None; data.len() + 1];
385    buckets[0] = Some(lms.last as u32);
386
387    // Find the LMS elements
388    for idx in (0..lms.last).rev() {
389        if lms.is_lms(idx) {
390            buckets[tails[data[idx].try_into().unwrap_or_default()] as usize] = Some(idx as u32);
391            tails[data[idx].try_into().unwrap_or_default()] -= 1;
392        }
393    }
394    // Add the sentinel to the front
395    buckets[0] = Some(data.len() as u32);
396    buckets
397}
398
399/// Induce L type elements into the sort array after initial allocation of LMS elements
400fn induced_sort_l<T>(data: &[T], buckets: &mut [Option<u32>], bkt_sizes: &[u32], lms: &LMS)
401where
402    T: TryInto<usize> + Copy,
403    T::Error: std::fmt::Debug,
404{
405    // Get the bucket heads info
406    let mut heads = bucket_heads(bkt_sizes);
407
408    // Find L type elements that are left of the index and insert them into the buckets.
409    // We can start at 0 and walk to the end with L type elements.
410    for idx in 0..lms.last {
411        // Only do buckets with a valid index
412        if buckets[idx].is_some() {
413            // Check if the element left of the element indexed by this bucket also an L type.
414            let prev = if buckets[idx] == Some(0) {
415                lms.last
416            } else {
417                buckets[idx].unwrap() as usize - 1
418            };
419            if lms.is_l(prev) {
420                // If so, insert that l-type into the next free top spot in that bucket
421                buckets[heads[data[prev].try_into().unwrap_or_default()] as usize] =
422                    Some(prev as u32);
423                // And adjust the location available for the next L-type in this bucket (if any)
424                heads[data[prev].try_into().unwrap_or_default()] += 1;
425                //DEBUG
426                //lms.debug();
427            }
428        }
429    }
430}
431
432/// Induce S type elements into the sort array after induced L sort
433fn induced_sort_s<T>(data: &[T], buckets: &mut [Option<u32>], bkt_sizes: &[u32], lms: &LMS)
434where
435    T: TryInto<usize> + Copy + std::cmp::Ord,
436    T::Error: std::fmt::Debug,
437{
438    // Get the bucket tails info
439    let mut tails = bucket_tails(bkt_sizes);
440
441    // Start at the right most known element and then iterate down.
442    let mut idx = lms.last;
443    // Prepare to loop back to here
444    while idx > 0 {
445        // Check if the element left of the element indexed by this bucket is an S type.
446        if buckets[idx].is_none() {
447            error!("This should never happen. Idx is {}", idx);
448            lms.debug(); // Pause here
449        }
450        // As long as we are not referencing the start of the data
451        if buckets[idx] != Some(0) {
452            // Get the previous element index
453            let prev = buckets[idx].unwrap() as usize - 1;
454            // If it is an S type
455            if lms.is_s(prev) {
456                // Insert/update that element into the next free bottom spot in the appropriate bucket
457                let bkt_index = data[prev].try_into().unwrap_or_default();
458                buckets[tails[bkt_index] as usize] = Some(prev as u32);
459                // And adjust the location available for the next S type in this bucket (if any)
460                tails[data[prev].try_into().unwrap_or_default()] -= 1;
461                //DEBUG
462                //lms.debug();
463            }
464        };
465        idx -= 1;
466    }
467}
468
469fn sa_is<T>(data: &[T], alphabet_size: usize) -> Vec<u32>
470where
471    T: TryInto<usize> + Copy + std::cmp::Ord + std::fmt::Display + std::marker::Sync,
472    T::Error: std::fmt::Debug,
473{
474    // Don't attemp to process empty data.
475    if data.is_empty() {
476        return vec![];
477    }
478
479    // STEP 1: Build LMS info
480    let mut lms = LMS::new();
481    lms.init(data);
482    //DEBUG
483    //lms.debug();
484
485    // STEP 2: Calculate buckets for bucket sorting
486    let bkt_sizes = bucket_sizes(data, alphabet_size);
487
488    // STEP 3: Do initial bucket sorting of LMS elements
489    let mut buckets = initial_buckets_sort(data, &bkt_sizes, &lms);
490    // Validity test for development
491    if lms.lms_count != buckets.iter().filter(|&b| b.is_some()).count() {
492        error!(
493            "Didn't initialize buckets properly. Missed {}.",
494            lms.lms_count - buckets.iter().filter(|&b| b.is_some()).count()
495        );
496        debug_buckets('i', &buckets);
497    }
498
499    // STEP 4: Do induced L sort
500    induced_sort_l(data, &mut buckets, &bkt_sizes, &lms);
501    // Validity test for development
502    if lms.s_count - lms.lms_count != buckets.iter().filter(|&b| b.is_none()).count() {
503        error!(
504            "Expected to have {} empty buckets after first induced_sort_l. Instead...",
505            lms.s_count - lms.lms_count,
506        );
507        debug_nones(&buckets);
508        debug_buckets('l', &buckets);
509    }
510
511    // STEP 5: Do induced S sort
512    induced_sort_s(data, &mut buckets, &bkt_sizes, &lms);
513    // Validity test for development
514    if 0 != buckets.iter().filter(|&b| b.is_none()).count() {
515        error!(
516            "Didn't complete s-induced sort properly. Missed {}.",
517            buckets.iter().filter(|&b| b.is_none()).count()
518        );
519        debug_nones(&buckets);
520        //debug_buckets('s', &buckets);
521    }
522
523    // STEP 6: Create Summary Suffix list from correctly sorted LMS elements
524    // Create summary of unique lms elements.
525    let (summary, offsets, summary_size) = make_summary(data, &mut buckets, &lms);
526    // Create unique ordered elements, recursing if needed to ensure only unique lms elements exist.
527    let summary_suffix_vec = make_summary_suffix_vec(summary_size, &lms, summary);
528
529    //STEP 7: Do final bucket sort based on the summary. First clear the buckets
530    (0..buckets.len()).for_each(|i| buckets[i] = None);
531
532    // Get the bucket tails info
533    let mut tails = bucket_tails(&bkt_sizes);
534
535    // Place the LMS elements in the summary_suffix_vec
536    for el in summary_suffix_vec.iter().skip(2).rev() {
537        let data_index = offsets[*el as usize] as usize;
538        let bucket_index = data[data_index].try_into().unwrap_or_default() as usize;
539        buckets[tails[bucket_index] as usize] = Some(data_index as u32);
540        tails[bucket_index] -= 1;
541    }
542
543    // Add the sentinel to the front
544    buckets[0] = Some(data.len() as u32);
545
546    // Validity test for development
547    if lms.lms_count != buckets.iter().filter(|&b| b.is_some()).count() {
548        error!("Didn't initialize buckets for FINAL sort properly.");
549        debug_buckets('F', &buckets);
550    }
551
552    // STEP 9: Do induced L sort
553    induced_sort_l(data, &mut buckets, &bkt_sizes, &lms);
554    // Validity test for development
555    if lms.s_count - lms.lms_count != buckets.iter().filter(|&b| b.is_none()).count() {
556        error!(
557            "Expected to have {} empty buckets during first induced_sort_l. Instead...",
558            lms.s_count - lms.lms_count,
559        );
560        debug_nones(&buckets);
561        //debug_buckets('L', &buckets);
562    }
563
564    // STEP 10: Do induced S sort
565    induced_sort_s(data, &mut buckets, &bkt_sizes, &lms);
566    // Validity test for development
567    if 0 != buckets.iter().filter(|&b| b.is_none()).count() {
568        error!(
569            "Didn't complete s-induced sort properly. Missed {}.",
570            buckets.iter().filter(|&b| b.is_none()).count()
571        );
572        debug_nones(&buckets);
573        //debug_buckets('S', &buckets);
574    }
575
576    // Convert Option<u32> to u32 and return that vec
577    buckets.iter().skip(1).map(|&el| el.unwrap()).collect()
578}
579
580/// Entry point for Simple SA-IS sort for Burrow-Wheeler Transform. Takes u8 slice and returns
581/// u32 key and u8 vector in BWT format.
582pub fn sais_entry(data: &[u8]) -> (u32, Vec<u8>) {
583    /*
584    SA-IS doesn't work in our context unless it is "duval rotated". We must have a lexicographically minimal
585    rotation of the data or the conversion from the index to the BWT vec will be off.
586
587    The duval rotation finds the "lexically minimal" point and splits and reorders the data around that point.
588
589    Cudos to https://github.com/torfmaster/ribzip2, where I initially saw this concept in use.
590    */
591
592    // Do the rotation and return the reorganized data and offset to the original start of the data.
593    let (data, offset) = rotate_duval(data);
594
595    // Go do the sa-is sort, returning the index to the BWT.
596    let index = sa_is(&data, 256);
597
598    // Initialize the key. We find the actutal value in the loop below.
599    let mut key = 0_u32;
600    // Initialize the final vec
601    let mut bwt = vec![0_u8; data.len()];
602    // Get the offset to the original start of the data
603    let duval_zero_position = (index.len() - offset) as u32;
604
605    // Create the final BWT vec and find the actual key value
606    for i in 0..index.len() {
607        // Watch for the key location
608        if index[i] == duval_zero_position {
609            key = i as u32
610        }
611        // BWT is build from the data at the previous index location. Wrap around if the index is at 0.
612        if index[i] == 0 {
613            bwt[i] = data[data.len() - 1] as u8;
614        } else {
615            bwt[i] = data[index[i] as usize - 1];
616        }
617    }
618    // Return the key and BWT data
619    (key, bwt)
620}
621
622/// Create summary of LMS elements. Return vec of LMS names, vec of offsets and count of unique LMS names.
623fn make_summary<T>(
624    data: &[T],
625    buckets: &mut [Option<u32>],
626    lms: &LMS,
627) -> (Vec<u32>, Vec<u32>, usize)
628where
629    T: TryInto<usize> + Copy + std::cmp::Ord + std::fmt::Display,
630    T::Error: std::fmt::Debug,
631{
632    // Initialize temporary names vec
633    let mut names: Vec<Option<u32>> = vec![None; buckets.len()];
634    // Initialize temporary offset(pointer) list
635    let mut offsets = vec![None; buckets.len()];
636    // Initialize current name
637    let mut current_name = 0_u32;
638    // Initialize sentinel name
639    names[buckets[0].unwrap() as usize] = Some(current_name);
640    // Initialize sentinel pointer
641    offsets[buckets[0].unwrap() as usize] = Some(lms.last as u32);
642    // Initialize previous LMS to sentinel LMS value
643    let mut prev_lms = buckets[0];
644
645    // Iterate through the buckets looking for sequences of identical lms groups, skipping the sentinel
646    for &ptr in buckets[1..].iter() {
647        // Unwrap and convert to usize once - to make it easier to read
648        if lms.is_lms(ptr.unwrap() as usize) {
649            let curr_lms = ptr.unwrap() as usize;
650            if lms.is_unequal_lms(data, prev_lms.unwrap() as usize, curr_lms) {
651                prev_lms = Some(curr_lms as u32);
652                current_name += 1;
653            }
654            // Store the results based on the pointer in the buckets
655            names[curr_lms] = Some(current_name);
656            offsets[curr_lms] = Some(curr_lms as u32);
657        }
658    }
659    // Filter out non-lms elements and return names, offsets and count of unique LMS names
660    (
661        names.into_iter().flatten().collect::<Vec<u32>>(),
662        offsets.into_iter().flatten().collect::<Vec<u32>>(),
663        current_name as usize + 1,
664    )
665}
666
667/// Create vec that contains only unique LMS elements, recursing if needed to ensure only unique LMS elements exist.
668fn make_summary_suffix_vec(summary_size: usize, lms: &LMS, mut summary: Vec<u32>) -> Vec<u32> {
669    // Recurse if we had any identical LMS groups when we made the summary (make_summary returned summary_size,
670    //  the count of unique LMS elements)
671    if summary_size != lms.lms_count {
672        // Recurse
673        summary = sa_is(&summary, summary_size);
674        // The above recurses until there are no more duplicate LMS elements.
675        // Now return the summary the recursion finally produced.
676        let mut summary_suffix_vec = vec![summary.len() as u32; summary.len() + 1];
677        summary_suffix_vec[1..(summary.len() + 1)].copy_from_slice(&summary[..]);
678        summary_suffix_vec
679    } else {
680        // Make a summary suffix vec from summary with a new sentinel at the beginning.
681        let mut summary_suffix_vec = vec![summary_size as u32; summary.len() + 1];
682        // Fill in the rest of the vec from summary
683        summary.iter().enumerate().for_each(|(idx, el)| {
684            summary_suffix_vec[*el as usize + 1] = idx as u32;
685        });
686        summary_suffix_vec
687    }
688}
689
690/// Debug function for the buckets.
691fn debug_buckets(note: char, buckets: &[Option<u32>]) {
692    debug!(
693        "{} Buckets: {:?}",
694        note,
695        (0..32.min(buckets.len())).fold(Vec::new(), |mut vec, i| {
696            vec.push(buckets[i]);
697            vec
698        })
699    );
700}
701
702/// Debug function to show which bucket elemements were missing.
703fn debug_nones(buckets: &[Option<u32>]) {
704    let mut indecies = (0..buckets.len() as u32).collect::<Vec<u32>>();
705    buckets.iter().for_each(|b| {
706        if b.is_some() {
707            indecies.remove(b.unwrap() as usize);
708        }
709    });
710    indecies.sort();
711    debug!(" Didn't fill: {:?} ", indecies);
712}
713
714/*
715/// Compute the Lexicographically Minimal String Rotation. Passes tests but results in failed compression.
716fn duval_ds(input: &[u8]) -> usize {
717    let n = input.len();
718    if n < 2 {
719        return 0;
720    }
721    let mut smallest = (input[0], 0);
722
723    let mut this = 0;
724    let mut next = 1;
725    // Find the smallest run
726    while this < n - 1 {
727        if next >= n {
728            next -= n
729        };
730        // If ever the next byte is smaller than the smallest, we have a new smallest run
731        if input[next] < smallest.0 {
732            smallest = (input[next], next);
733            next += 1;
734            continue;
735        };
736        // If the next byte is equal to this current byte, increment both and check the next
737        if input[this] == input[next] {
738            next += 1;
739            this += 1;
740            continue;
741        }
742        // If the next byte is smaller than this byte, we could have a new smallest run starting at next_start
743        if input[next] < input[this] {
744            // See if the next byte is different from the smallest
745            if input[next] != smallest.0 {
746                // If it is different, we can't have a new run. Go to the next byte
747                next += 1;
748                this += 1;
749                continue;
750            } else {
751                // check which run is longer - the one starting at smallest or the one starting at next
752                let (mut a, mut b) = (smallest.1, next);
753                // Advance past the equal bytes
754                while input[a] == input[b % n] {
755                    a += 1;
756                    b += 1;
757                    if a == n {
758                        break;
759                    }
760                }
761                // If a is greater than b, run b is longer
762                if input[a % n] > input[b % n] {
763                    smallest = (input[next], next);
764                }
765                this = b;
766                next = b + 1;
767                continue;
768            }
769        }
770        // If the next byte is larger than this one, start looking again at the next byte
771        if input[next] > input[this] {
772            next += 1;
773            this += 1;
774            continue;
775        }
776    }
777    smallest.1
778} */
779
780/// Compute the Lexicographically Minimal String Rotation. Fails tests, but is verbatim from ribzip2 and seems to work.
781fn duval(input: &[u8]) -> usize {
782    let mut final_start = 0;
783    let n = input.len();
784    let mut i = 0;
785
786    while i < n {
787        let mut j = i + 1;
788        let mut k = i;
789        while j < n && input[k] <= input[j] {
790            if input[k] < input[j] {
791                k = i;
792            } else {
793                k += 1;
794            }
795            j += 1;
796        }
797        while i <= k {
798            final_start = i;
799
800            i += j - k;
801        }
802    }
803    final_start
804}
805
806/// Compute lexicographically minimal rotation using the duval algorithm.
807/// Returns the rotation and the offset.
808fn rotate_duval(input: &[u8]) -> (Vec<u8>, usize) {
809    let offset = duval(input);
810    //let offset = duval(input);
811    let mut buf = vec![];
812    let (head, tail) = input.split_at(offset);
813    buf.append(&mut tail.to_vec());
814    buf.append(&mut head.to_vec());
815    (buf, offset)
816}
817
818/// Given a sample slice of the data (5k suggested), compute the data complexity.
819/// A complexity of less than 0.3 indicates that SA-IS is the better algorithm
820/// than the native sort_unstable algorithm.
821pub fn lms_complexity(data: &[u8]) -> f64 {
822    // STEP 1: Build LMS info
823    let mut lms = LMS::new();
824    lms.init(data);
825
826    // STEP 2: Compute and return the LMS complexity
827    debug!("Lms complexity is {}", lms.lms_count as f64 / data.len() as f64);
828    lms.lms_count as f64 / data.len() as f64
829}
830
831#[cfg(test)]
832mod tests {
833    use super::*;
834    #[test]
835    fn duval_test_a() {
836        let data = "a".as_bytes();
837        assert_eq!(duval(data), 0);
838    }
839    #[test]
840    fn duval_test_ba() {
841        let data = "ba".as_bytes();
842        assert_eq!(duval(data), 1);
843    }
844    #[test]
845    fn duval_test_aaaaaa() {
846        let data = "aaaaaa".as_bytes();
847        assert_eq!(duval(data), 0);
848        //assert_eq!(duval_original(data), 0);
849    }
850    #[test]
851    fn duval_test_aaaaab() {
852        let data = "aaaaab".as_bytes();
853        assert_eq!(duval(data), 0);
854    }
855    #[test]
856    fn duval_test_aaaaba() {
857        let data = "aaaaba".as_bytes();
858        assert_eq!(duval(data), 5);
859    }
860    #[test]
861    fn duval_test_aaabaa() {
862        let data = "aaabaa".as_bytes();
863        assert_eq!(duval(data), 4);
864    }
865    #[test]
866    fn duval_test_aabaaa() {
867        let data = "aabaaa".as_bytes();
868        assert_eq!(duval(data), 3);
869    }
870    #[test]
871    fn duval_test_abaaaa() {
872        let data = "abaaaa".as_bytes();
873        assert_eq!(duval(data), 2);
874    }
875    #[test]
876    fn duval_test_baaaaa() {
877        let data = "baaaaa".as_bytes();
878        assert_eq!(duval(data), 1);
879    }
880    #[test]
881    fn duval_test_baaaab() {
882        let data = "baaaab".as_bytes();
883        assert_eq!(duval(data), 1);
884    }
885    #[test]
886    fn duval_test_abbbba() {
887        let data = "abbbba".as_bytes();
888        assert_eq!(duval(data), 5);
889    }
890    #[test]
891    fn duval_test_baabaa() {
892        let data = "baabaa".as_bytes();
893        assert_eq!(duval(data), 1);
894    }
895    #[test]
896    fn duval_test_abaabaaabaababaaabaaababaab() {
897        let data = "abaabaaabaababaaabaaababaab".as_bytes();
898        assert_eq!(duval(data), 14);
899    }
900}
bzip2_os/bwt_algorithms/sais_fallback.rs

bzip2_os/bwt_algorithms/
sais_fallback.rs