pub struct Sort { /* private fields */ }
Expand description
Sort a text file with record-like lines
§Examples
use std::path::PathBuf;
use text_file_sort::sort::Sort;
// parallel record sort
// Sorts the lines of `input` into `output`, using `tmp` for intermediate files, and returns
// the result of `Sort::sort`.
fn sort_records(input: PathBuf, output: PathBuf, tmp: PathBuf) -> Result<(), anyhow::Error> {
    // `input` and `output` are owned and not used afterwards, so they are moved into the
    // sort definition rather than cloned.
    let mut text_file_sort = Sort::new(vec![input], output);
    // Set the number of CPU cores the sort will attempt to use. When given a number that
    // exceeds the number of available CPU cores, the work will be split among the available
    // cores with somewhat degraded performance.
    text_file_sort.with_tasks(2);
    // Set the directory for intermediate results. The default is the system temp dir,
    // std::env::temp_dir(); however, for large files it is recommended to provide a dedicated
    // directory for intermediate files, preferably on the same file system as the output.
    text_file_sort.with_tmp_dir(tmp);
    text_file_sort.sort()
}
Implementations§
Source§impl Sort
impl Sort
Sourcepub fn new(input_files: Vec<PathBuf>, output: PathBuf) -> Sort
pub fn new(input_files: Vec<PathBuf>, output: PathBuf) -> Sort
Create a default Sort definition.
A default Sort definition will use the system temporary directory as defined by std::env::temp_dir().
- The default field separator is a TAB (‘\t’)
- The complete line will be considered as a single String field
- empty lines will be sorted lexicographically
- lines starting with ‘#’ will be ignored
- max intermediate files is set to 1024.
- input is read in chunks of 10 MB
- default Order is Asc
- prefix and suffix are empty
- default line ending is ‘\n’
The Sort implementation will increase the file descriptor rlimit to accommodate configured open files
Sourcepub fn with_tmp_dir(&mut self, tmp: PathBuf)
pub fn with_tmp_dir(&mut self, tmp: PathBuf)
Set the directory for intermediate files. By default, std::env::temp_dir() is used. For large files it is recommended to create a dedicated directory for intermediate files on the same file system as the output target.
Sourcepub fn with_tasks(&mut self, tasks: usize)
pub fn with_tasks(&mut self, tasks: usize)
Set the number of tasks. The default is zero, which will result in using all system cores.
Sourcepub fn with_field_separator(&mut self, field_separator: char)
pub fn with_field_separator(&mut self, field_separator: char)
Set the field separator. The default is ‘\t’
Sourcepub fn with_concurrent_merge(&mut self, concurrent_merge: bool)
pub fn with_concurrent_merge(&mut self, concurrent_merge: bool)
Merge sorted files concurrently to reduce the number of files before the final merge
Sourcepub fn with_chunk_size_bytes(&mut self, chunk_size_bytes: u64)
pub fn with_chunk_size_bytes(&mut self, chunk_size_bytes: u64)
The input will be read in chunks of ‘chunk_size_bytes’ respecting line boundaries
Sourcepub fn with_chunk_size_mb(&mut self, chunk_size_mb: u64)
pub fn with_chunk_size_mb(&mut self, chunk_size_mb: u64)
The input will be read in chunks of ‘chunk_size_mb’ MB respecting line boundaries
Sourcepub fn with_intermediate_files(&mut self, files: usize)
pub fn with_intermediate_files(&mut self, files: usize)
Set the number of intermediate files. The default is 1024.
Sourcepub fn with_ignore_empty(&mut self)
pub fn with_ignore_empty(&mut self)
Direct the algorithm to ignore empty lines. The default is false
Sourcepub fn with_ignore_lines(&mut self, r: Regex)
pub fn with_ignore_lines(&mut self, r: Regex)
Specify which lines to ignore. Each line matching the regex will be ignored and will not appear in the output.
Sourcepub fn add_field(&mut self, field: Field)
pub fn add_field(&mut self, field: Field)
Add field specification. The default is to treat the complete line as a single String field in the record
Sourcepub fn with_fields(&mut self, fields: Vec<Field>)
pub fn with_fields(&mut self, fields: Vec<Field>)
Replace all fields with the `fields` value.
Sourcepub fn with_order(&mut self, order: Order)
pub fn with_order(&mut self, order: Order)
Set Order
Sourcepub fn add_prefix_line(&mut self, prefix_line: String)
pub fn add_prefix_line(&mut self, prefix_line: String)
Add file prefix. The provided prefix will be inserted at the beginning of the sorted file
Sourcepub fn with_prefix_lines(&mut self, prefix_lines: Vec<String>)
pub fn with_prefix_lines(&mut self, prefix_lines: Vec<String>)
Set prefix lines
Sourcepub fn add_suffix_line(&mut self, suffix_line: String)
pub fn add_suffix_line(&mut self, suffix_line: String)
Add file suffix. The provided suffix will be inserted at the end of the sorted file
Sourcepub fn with_suffix_lines(&mut self, suffix_lines: Vec<String>)
pub fn with_suffix_lines(&mut self, suffix_lines: Vec<String>)
Set suffix lines