Struct Sort

Source
pub struct Sort { /* private fields */ }
Expand description

Sort a text file with record-like lines.

§Examples

use std::path::PathBuf;
use text_file_sort::sort::Sort;

// parallel record sort
/// Sorts the records of `input` into `output`, using `tmp` for intermediate files.
///
/// The sort is configured to use two tasks. The result of `Sort::sort` is returned unchanged.
fn sort_records(input: PathBuf, output: PathBuf, tmp: PathBuf) -> Result<(), anyhow::Error> {
    // `input` and `output` are owned and not used again, so they are moved instead of cloned.
    let mut text_file_sort = Sort::new(vec![input], output);
    // Set the number of CPU cores the sort will attempt to use. When the given number exceeds
    // the number of available CPU cores, the work is split among the available cores with
    // somewhat degraded performance.
    text_file_sort.with_tasks(2);
    // Set the directory for intermediate results. The default is the system temp dir,
    // std::env::temp_dir(). For large files it is recommended to provide a dedicated
    // directory for intermediate files, preferably on the same file system as the output result.
    text_file_sort.with_tmp_dir(tmp);
    text_file_sort.sort()
}

Implementations§

Source§

impl Sort

Source

pub fn new(input_files: Vec<PathBuf>, output: PathBuf) -> Sort

Create a default Sort definition.

A default Sort definition will use the system temporary directory as defined by std::env::temp_dir().

  • The default field separator is a TAB (‘\t’)
  • The complete line will be considered as a single String field
  • empty lines will be sorted lexicographically
  • lines starting with ‘#’ will be ignored
  • max intermediate files is set to 1024.
  • input is read in chunks of 10 MB
  • default Order is Asc
  • prefix and suffix are empty
  • the default line ending is ‘\n’

The Sort implementation will increase the file descriptor rlimit to accommodate configured open files

Source

pub fn with_tmp_dir(&mut self, tmp: PathBuf)

Set the directory for intermediate files. The default is std::env::temp_dir(). For large files it is recommended to create a dedicated directory for intermediate files on the same file system as the output target.

Source

pub fn with_tasks(&mut self, tasks: usize)

Set the number of tasks. The default is zero, which results in using all system cores.

Source

pub fn with_field_separator(&mut self, field_separator: char)

Set the field separator. The default is ‘\t’

Source

pub fn with_concurrent_merge(&mut self, concurrent_merge: bool)

Merge sorted files concurrently to reduce the number of files before the final merge

Source

pub fn with_chunk_size_bytes(&mut self, chunk_size_bytes: u64)

The input will be read in chunks of ‘chunk_size_bytes’ respecting line boundaries

Source

pub fn with_chunk_size_mb(&mut self, chunk_size_mb: u64)

The input will be read in chunks of ‘chunk_size_mb’ MB respecting line boundaries

Source

pub fn with_intermediate_files(&mut self, files: usize)

Set the number of intermediate files. The default is 1024.

Source

pub fn with_ignore_empty(&mut self)

Direct the algorithm to ignore empty lines. By default, empty lines are not ignored.

Source

pub fn with_ignore_lines(&mut self, r: Regex)

Specify which lines to ignore. Each line matching the regex will be ignored and will not appear in the output.

Source

pub fn add_field(&mut self, field: Field)

Add field specification. The default is to treat the complete line as a single String field in the record

Source

pub fn with_fields(&mut self, fields: Vec<Field>)

Replace all fields with the fields value.

Source

pub fn with_order(&mut self, order: Order)

Set Order

Source

pub fn add_prefix_line(&mut self, prefix_line: String)

Add file prefix. The provided prefix will be inserted at the beginning of the sorted file

Source

pub fn with_prefix_lines(&mut self, prefix_lines: Vec<String>)

Set prefix lines

Source

pub fn add_suffix_line(&mut self, suffix_line: String)

Add file suffix. The provided suffix will be inserted at the end of the sorted file

Source

pub fn with_suffix_lines(&mut self, suffix_lines: Vec<String>)

Set suffix lines

Source

pub fn with_endl(&mut self, endl: char)

Set the line-ending character. CRLF is not supported.

Source

pub fn sort(&self) -> Result<(), Error>

Sort input files or STDIN

Source

pub fn check(&self) -> Result<bool, Error>

Source

pub fn merge(&self) -> Result<(), Error>

Auto Trait Implementations§

§

impl Freeze for Sort

§

impl RefUnwindSafe for Sort

§

impl Send for Sort

§

impl Sync for Sort

§

impl Unpin for Sort

§

impl UnwindSafe for Sort

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> Pointable for T

Source§

const ALIGN: usize

The alignment of pointer.
Source§

type Init = T

The type for initializers.
Source§

unsafe fn init(init: <T as Pointable>::Init) -> usize

Initializes an object with the given initializer. Read more
Source§

unsafe fn deref<'a>(ptr: usize) -> &'a T

Dereferences the given pointer. Read more
Source§

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

Mutably dereferences the given pointer. Read more
Source§

unsafe fn drop(ptr: usize)

Drops the object pointed to by the given pointer. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

Source§

fn vzip(self) -> V