bzip2_os/bwt_algorithms/bwt_sort.rs
1//! This is the entry point for the main bwt_sort algorithm and contains the main sorting algorithm.
2//!
3//! The main sorting algorithm is currently based on the standard Rust sort_unstable algorithm. When data is
4//! larger than 5k bytes, we use a multi-threaded approach based on Rayon's par_sort_unstable algorithm.
5//!
6//! Since different sorting algorithms are better suited for different kinds of data, this module contains a test
7//! to determine whether the data would be better suited to the main algorithm or the fallback algorithm.
8//!
9//! NOTE:
10//! * During the earlier development and testing phases, I ported Julian Seward's sort algorithm into Rust. However
11//! the built-in sort_unstable algorithm performed equally well as my port. Using the built-in algorithm avoids a lot of complexity.
12//! (I do welcome suggestions for improved sorting algorithms.)
13//! * I have tried multiple different algorithms to test data entropy in order to choose when to select the fallback algorithm. The final
14//! method I finally implemented is based on the "left most smaller" (LMS) concept in suffix array algorithms.
15//!
16use super::sais_fallback::sais_entry;
17use crate::bwt_algorithms::sais_fallback::lms_complexity;
18use log::info;
19use rayon::prelude::*;
20/*
21I tried a varient that used a double length block to avoid the nested equality checks
22in block_compare, but it was barely faster.
23*/
24
25/// Encode data using the Burrows-Wheeler-Transform. Requires a u8 slice of data to be sorted.
26/// This returns a u32 key and a u8 vec of the BWT data.
27pub fn bwt_encode(rle1_data: &[u8]) -> (u32, Vec<u8>) {
28 // Test data longer than 5k bytes to help select the best algorithm
29 if rle1_data.len() > 5_000 && lms_complexity(&rle1_data[0..5_000.min(rle1_data.len())]) < 0.3 {
30 info!("Using SA-IS algorithm.");
31 return sais_entry(rle1_data);
32 }
33
34 info!("Using native algorithm.");
35 // Create index into block. Index is u32, which should be more than enough
36 let mut index = (0_u32..rle1_data.len() as u32).collect::<Vec<u32>>();
37
38 // Sort index
39 if rle1_data.len() > 40000 {
40 index[..].par_sort_unstable_by(|a, b| block_compare(*a as usize, *b as usize, rle1_data));
41 } else {
42 index[..].sort_unstable_by(|a, b| block_compare(*a as usize, *b as usize, rle1_data));
43 }
44 // Get key and BWT output
45 let mut key = 0_u32;
46 let mut bwt = vec![0; rle1_data.len()];
47 for i in 0..bwt.len() {
48 if index[i] == 0 {
49 key = i as u32;
50 }
51 if index[i] == 0 {
52 bwt[i] = rle1_data[rle1_data.len() - 1];
53 } else {
54 bwt[i] = rle1_data[(index[i] as usize) - 1];
55 }
56 }
57 (key, bwt)
58}
59
60/// compare the next two chunks of the original data to decide which sorts first
61fn block_compare(a: usize, b: usize, block: &[u8]) -> std::cmp::Ordering {
62 let min = std::cmp::min(block[a..].len(), block[b..].len());
63
64 // Lexicographical comparison
65 let mut result = block[a..a + min].cmp(&block[b..b + min]);
66
67 // Implement wraparound if needed
68 if result == std::cmp::Ordering::Equal {
69 if a < b {
70 let to_end = block.len() - a - min;
71 result = block[(a + min)..].cmp(&block[..to_end]);
72 if result == std::cmp::Ordering::Equal {
73 let rest_of_block = block.len() - to_end - min;
74 return block[..rest_of_block].cmp(&block[to_end..(to_end + rest_of_block)]);
75 }
76 } else {
77 let to_end = block.len() - b - min;
78 result = block[..to_end].cmp(&block[(b + min)..]);
79 if result == std::cmp::Ordering::Equal {
80 let rest_of_block = block.len() - to_end - min;
81 return block[to_end..(to_end + rest_of_block)].cmp(&block[..rest_of_block]);
82 }
83 }
84 }
85 result
86}
87
88
89/// Decode a Burrows-Wheeler-Transform. Requires a key, a u8 slice containing the BWT data, and an array of the u8 frequencies
90/// found in the data. Returns the decoded data as a u8 vec.
91pub fn bwt_decode(mut key: u32, bwt_in: &[u8], freq_in: &[u32]) -> Vec<u8> {
92 /* LOGIC:
93 Encode the current byte and the index to the next byte in each element of the transformation vector.
94 The leftmost byte of each element is the current byte value. The rightmost three bytes are the index to the next byte.
95 */
96
97 // Calculate end once.
98 let end = bwt_in.len();
99
100 // Convert frequency count to a cumulative sum of frequencies
101 let mut freq = [0_u32; 256];
102
103 {
104 for i in 0..255 {
105 freq[i + 1] = freq[i] + freq_in[i];
106 }
107 }
108
109 //Build the transformation vector to find the next character in the original data
110 let mut t_vec = vec![0_u32; end];
111 for (i, &s) in bwt_in.iter().enumerate() {
112 t_vec[i] |= (s as u32) << 24;
113 t_vec[freq[s as usize] as usize] |= i as u32;
114 freq[s as usize] += 1
115 }
116
117 // Transform the data
118 // First put they last element into the end, thus avoiding a test on each iteration
119 let mut rle1_data = vec![0_u8; end];
120 let mut el = t_vec[key as usize];
121 key = el & 0xFFFFFF;
122 rle1_data[end - 1] = (el >> 24) as u8;
123
124 for i in 1..t_vec.len() {
125 el = t_vec[key as usize];
126 key = el & 0xFFFFFF;
127 rle1_data[i - 1] = (el >> 24) as u8;
128 }
129 rle1_data
130}
131