bzip2_os/bwt_algorithms/
bwt_sort.rs

1//! This is the entry point for the main bwt_sort algorithm and contains the main sorting algorithm.
2//!
3//! The main sorting algorithm is currently based on the standard Rust sort_unstable algorithm. When data is
4//! larger than 5k bytes, we use a multi-threaded approach based on Rayon's par_sort_unstable algorithm.
5//!
6//! Since different sorting algorithms are better suited for different kinds of data, this module contains a test
7//! to determine whether the data would be better suited to the main algorithm or the fallback algorithm.
8//! 
9//! NOTE: 
10//! * During the earlier development and testing phases, I ported Julian Seward's sort algorithm into Rust. However 
11//! the built-in sort_unstable algorithm performed equally well as my port. Using the built-in algorithm avoids a lot of complexity. 
12//! (I do welcome suggestions for improved sorting algorithms.)
13//! * I have tried multiple different algorithms to test data entropy in order to choose when to select the fallback algorithm. The final
14//! method I finally implemented is based on the "left most smaller" (LMS) concept in suffix array algorithms.
15//!
16use super::sais_fallback::sais_entry;
17use crate::bwt_algorithms::sais_fallback::lms_complexity;
18use log::info;
19use rayon::prelude::*;
20/*
21I tried a varient that used a double length block to avoid the nested equality checks
22in block_compare, but it was barely faster.
23*/
24
25/// Encode data using the Burrows-Wheeler-Transform. Requires a u8 slice of data to be sorted.
26/// This returns a u32 key and a u8 vec of the BWT data.
27pub fn bwt_encode(rle1_data: &[u8]) -> (u32, Vec<u8>) {
28    // Test data longer than 5k bytes to help select the best algorithm
29    if rle1_data.len() > 5_000 && lms_complexity(&rle1_data[0..5_000.min(rle1_data.len())]) < 0.3 {
30        info!("Using SA-IS algorithm.");
31        return sais_entry(rle1_data);
32    }
33
34    info!("Using native algorithm.");
35    // Create index into block. Index is u32, which should be more than enough
36    let mut index = (0_u32..rle1_data.len() as u32).collect::<Vec<u32>>();
37
38    // Sort index
39    if rle1_data.len() > 40000 {
40        index[..].par_sort_unstable_by(|a, b| block_compare(*a as usize, *b as usize, rle1_data));
41    } else {
42        index[..].sort_unstable_by(|a, b| block_compare(*a as usize, *b as usize, rle1_data));
43    }
44    // Get key and BWT output
45    let mut key = 0_u32;
46    let mut bwt = vec![0; rle1_data.len()];
47    for i in 0..bwt.len() {
48        if index[i] == 0 {
49            key = i as u32;
50        }
51        if index[i] == 0 {
52            bwt[i] = rle1_data[rle1_data.len() - 1];
53        } else {
54            bwt[i] = rle1_data[(index[i] as usize) - 1];
55        }
56    }
57    (key, bwt)
58}
59
60/// compare the next two chunks of the original data to decide which sorts first
61fn block_compare(a: usize, b: usize, block: &[u8]) -> std::cmp::Ordering {
62    let min = std::cmp::min(block[a..].len(), block[b..].len());
63
64    // Lexicographical comparison
65    let mut result = block[a..a + min].cmp(&block[b..b + min]);
66
67    // Implement wraparound if needed
68    if result == std::cmp::Ordering::Equal {
69        if a < b {
70            let to_end = block.len() - a - min;
71            result = block[(a + min)..].cmp(&block[..to_end]);
72            if result == std::cmp::Ordering::Equal {
73                let rest_of_block = block.len() - to_end - min;
74                return block[..rest_of_block].cmp(&block[to_end..(to_end + rest_of_block)]);
75            }
76        } else {
77            let to_end = block.len() - b - min;
78            result = block[..to_end].cmp(&block[(b + min)..]);
79            if result == std::cmp::Ordering::Equal {
80                let rest_of_block = block.len() - to_end - min;
81                return block[to_end..(to_end + rest_of_block)].cmp(&block[..rest_of_block]);
82            }
83        }
84    }
85    result
86}
87
88
89/// Decode a Burrows-Wheeler-Transform. Requires a key, a u8 slice containing the BWT data, and an array of the u8 frequencies
90/// found in the data. Returns the decoded data as a u8 vec.
91pub fn bwt_decode(mut key: u32, bwt_in: &[u8], freq_in: &[u32]) -> Vec<u8> {
92    /* LOGIC:
93    Encode the current byte and the index to the next byte in each element of the transformation vector.
94    The leftmost byte of each element is the current byte value. The rightmost three bytes are the index to the next byte.
95    */
96
97    // Calculate end once.
98    let end = bwt_in.len();
99
100    // Convert frequency count to a cumulative sum of frequencies
101    let mut freq = [0_u32; 256];
102
103    {
104        for i in 0..255 {
105            freq[i + 1] = freq[i] + freq_in[i];
106        }
107    }
108
109    //Build the transformation vector to find the next character in the original data
110    let mut t_vec = vec![0_u32; end];
111    for (i, &s) in bwt_in.iter().enumerate() {
112        t_vec[i] |= (s as u32) << 24;
113        t_vec[freq[s as usize] as usize] |= i as u32;
114        freq[s as usize] += 1
115    }
116
117    // Transform the data
118    // First put they last element into the end, thus avoiding a test on each iteration
119    let mut rle1_data = vec![0_u8; end];
120    let mut el = t_vec[key as usize];
121    key = el & 0xFFFFFF;
122    rle1_data[end - 1] = (el >> 24) as u8;
123
124    for i in 1..t_vec.len() {
125        el = t_vec[key as usize];
126        key = el & 0xFFFFFF;
127        rle1_data[i - 1] = (el >> 24) as u8;
128    }
129    rle1_data
130}
131