extsort_iter/
lib.rs

1// struct SequentialSorter {}
2
3//! This crate implements an external sort for arbitrary
4//! iterators of any size, assuming they fit on disk.
5//! ```
6//! # #[cfg(not(miri))]
7//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
8//! use extsort_iter::*;
9//!
10//! let sequence = [3,21,42,9,5];
11//!
12//! // the default configuration will sort with up to 10M in buffered in Memory
13//! // and place the files under /tmp
14//! //
15//! // you will most likely want to change at least the location.
16//! let config = ExtsortConfig::default();
17//!
18//! let data = sequence
19//!     .iter()
20//!     .cloned()
21//!     .external_sort(config)?
22//!     .collect::<Vec<_>>();
23//! assert_eq!(&data, &[3,5,9,21,42]);
24//!
25//! // you if you enable the compression_lz4_flex feature, the runs will be transparently compressed
26//! // using lz4 before being written to disk. This can help if you have a slow/small disk.
27//! # #[cfg(compression_lz4_flex)]
28//! let config = ExtsortConfig::default().compress_lz4_flex();
29//! # #[cfg(not(compression_lz4_flex))]
30//! # let config = ExtsortConfig::default();
31//!
32//! let data = sequence
33//!     .iter()
34//!     .cloned()
35//!     .external_sort(config)?
36//!     .collect::<Vec<_>>();
37//! assert_eq!(&data, &[3,5,9,21,42]);
38//!
39//! # Ok(())
40//! # }
41//! # #[cfg(miri)]
42//! # fn main() {}
43//! ```
44//!
45//! ## When not to use this crate
46//!
47//! When your source iterator is big because each item owns large amounts of heap memory.
48//! That means the following case will result in memory exhaustion:
49//! ```no_run
50//! # use extsort_iter::*;
51//! let data = "somestring".to_owned();
52//! let iterator = std::iter::from_fn(|| Some(data.clone())).take(1_000_000);
53//! let sorted  = iterator.external_sort(ExtsortConfig::default());
54//! ```
55//!
56//! The reason for that is that we are not dropping the values from the source iterator until they are
57//! yielded by the result iterator.
58//!
59//! You can think of it as buffering the entire input iterator, with the values
60//! themselves living on disk but all memory the values point to still living on the heap.
61
62#[cfg(windows)]
63extern crate winapi;
64
65#[warn(clippy::all, clippy::pedantic)]
66#[allow(clippy::module_name_repetitions)]
67pub mod extension_trait;
68mod merge;
69mod orderer;
70mod run;
71mod sorter;
72mod tape;
73
74pub use extension_trait::*;
75pub use sorter::ExtsortConfig;
76
77#[cfg(not(miri))]
78#[cfg(test)]
79mod tests {
80    use crate::{extension_trait::ExtSortOrdExtension, sorter::ExtsortConfig, ExtSortByExtension};
81
82    const TEST_SEQUENCE: [i32; 100] = [
83        2, 82, 29, 86, 100, 67, 44, 19, 25, 10, 84, 47, 65, 42, 11, 24, 53, 92, 69, 49, 70, 36, 8,
84        48, 16, 91, 62, 58, 55, 18, 27, 79, 76, 40, 22, 95, 99, 28, 17, 7, 59, 30, 97, 80, 34, 33,
85        54, 45, 31, 52, 56, 1, 57, 38, 61, 6, 23, 94, 85, 51, 35, 68, 41, 15, 90, 14, 74, 75, 32,
86        73, 83, 64, 77, 89, 4, 72, 71, 21, 63, 5, 39, 12, 20, 3, 43, 88, 26, 78, 93, 60, 50, 13,
87        37, 87, 46, 96, 66, 98, 81, 9,
88    ];
89
90    #[test]
91    fn integration() {
92        let data = TEST_SEQUENCE
93            .iter()
94            .external_sort(ExtsortConfig::with_buffer_size(32))
95            .unwrap();
96
97        let is_sorted = data.into_iter().zip(1..).all(|(l, r)| *l == r);
98
99        assert!(is_sorted);
100    }
101
102    #[test]
103    fn sort_zst() {
104        let data = [(), (), ()];
105        let sorted = data
106            .into_iter()
107            .external_sort_by_key(ExtsortConfig::new(), |_a| 1)
108            .unwrap()
109            .collect::<Vec<_>>();
110        assert_eq!(3, sorted.len())
111    }
112
113    #[test]
114    fn integration_sortby() {
115        let data = TEST_SEQUENCE
116            .iter()
117            .external_sort_by(ExtsortConfig::with_buffer_size(4096), |a, b| a.cmp(b))
118            .unwrap();
119        let data = data.collect::<Vec<_>>();
120
121        let is_sorted = data.into_iter().zip(1..).all(|(l, r)| *l == r);
122
123        assert!(is_sorted);
124    }
125
126    #[test]
127    fn integration_sortby_key() {
128        let data = TEST_SEQUENCE
129            .iter()
130            .external_sort_by_key(ExtsortConfig::new(), |a| *a)
131            .unwrap();
132        let data = data.collect::<Vec<_>>();
133
134        let is_sorted = data.into_iter().zip(1..).all(|(l, r)| *l == r);
135
136        assert!(is_sorted);
137    }
138
139    fn roundtrip_sequence(sequence: Vec<i32>, buffer_size: usize) {
140        let roundtripped = sequence
141            .iter()
142            .cloned()
143            .external_sort(
144                ExtsortConfig::with_buffer_size(buffer_size).temp_file_folder("/dev/shm"),
145            )
146            .unwrap()
147            .collect::<Vec<_>>();
148
149        assert_eq!(sequence, roundtripped);
150    }
151
152    #[test]
153    fn test_single_run() {
154        let sequence = (1..100).collect();
155        roundtrip_sequence(sequence, 80000);
156    }
157
158    #[test]
159    fn test_many_runs() {
160        let sequence = (0..1000).collect();
161        roundtrip_sequence(sequence, 16);
162    }
163
164    #[test]
165    fn test_tiny_buffer() {
166        let sequence = (0..1000).collect();
167        roundtrip_sequence(sequence, 1);
168    }
169
170    #[test]
171    fn test_sort_in_mem() {
172        let sequence = (0..10).collect();
173        roundtrip_sequence(sequence, 4096);
174    }
175
176    #[test]
177    fn test_remaining_len() {
178        let data = (0..500).collect::<Vec<_>>();
179        let mut sorted = data
180            .into_iter()
181            .external_sort(ExtsortConfig::with_buffer_size(8))
182            .unwrap();
183        let mut result = vec![];
184        assert_eq!(500, sorted.len());
185        while let Some(next) = sorted.next() {
186            result.push(next);
187            assert_eq!(500 - result.len(), sorted.len());
188        }
189    }
190}