text_file_sort/
lib.rs

1//! This crate implements a sort algorithm for text files composed of lines or line records, for example
2//! CSV or TSV files.
3//!
4//! A data file composed of lines or line records, that is lines that are composed of fields separated
5//! by a delimiter, can be sorted using this crate. Examples of such files are
6//! [pg_dump](https://www.postgresql.org/docs/current/app-pgdump.html),
7//! [CSV](https://www.rfc-editor.org/rfc/rfc4180) and [GTFS](https://gtfs.org/schedule/reference/) data files.
8//! The motivation for writing this module was the need to sort pg_dump files of the [OpenStreetMap](https://www.openstreetmap.org/)
9//! database containing billions of lines by the primary key of each table before converting the data
10//! to PBF format.
11//!
12//! This implementation can be used to sort very large files, taking advantage of multiple CPU
13//! cores and providing memory usage control.
14//!
15//! # Issues
16//! Issues are welcome and appreciated. Please submit them at <https://github.com/navigatorsguild/text-file-sort/issues>.
17//!
18//! # Benchmarks
19//! [Benchmarks](https://github.com/navigatorsguild/text-file-sort/wiki/Benchmarks) generated by
20//! [benchmark-rs](https://crates.io/crates/benchmark-rs)
21//!
22//! ![link](https://github.com/navigatorsguild/text-file-sort/assets/122003456/cce3b27a-1557-4a2a-942b-36d2cfa0959e)
23//!
24//! # Examples
25//! ```
26//! use std::path::PathBuf;
27//! use text_file_sort::sort::Sort;
28//!
29//! // optimized for use with Jemalloc
30//! use tikv_jemallocator::Jemalloc;
31//! #[global_allocator]
32//! static GLOBAL: Jemalloc = Jemalloc;
33//!
34//! // parallel record sort
35//! fn sort_records(input: PathBuf, output: PathBuf, tmp: PathBuf) -> Result<(), anyhow::Error> {
36//!     let mut text_file_sort = Sort::new(vec![input.clone()], output.clone());
37//!
38//!     // Set the number of CPU cores the sort will attempt to use. When given a number that exceeds
39//!     // the number of available CPU cores, the work will be split among the available cores with
40//!     // somewhat degraded performance. The default is to use all available cores.
41//!     text_file_sort.with_tasks(2);
42//!
43//!     // Set the directory for intermediate results. The default is the system temp dir,
44//!     // std::env::temp_dir(). For large files, however, it is recommended to provide a dedicated
45//!     // directory for intermediate files, preferably on the same file system as the output result.
46//!     text_file_sort.with_tmp_dir(tmp);
47//!
48//!     text_file_sort.sort()
49//! }
50//! ```
51//!
52
// Crate-internal modules: `pub(crate)` visibility keeps them out of the public API.
53pub(crate) mod sort_command;
54pub(crate) mod line_record;
55pub(crate) mod key;
56pub(crate) mod sorted_chunk_file;
57pub(crate) mod unmerged_chunk_file;
58pub(crate) mod config;
59pub(crate) mod chunk_iterator;
60
// Public modules forming the crate's API. `sort` hosts `Sort`, the type used in the
// crate-level example above (`text_file_sort::sort::Sort`).
61pub mod sort;
62pub mod field;
63pub mod field_type;
64pub mod order;