use std::fs;
use std::io;
use std::num::NonZeroUsize;
use std::path::Path;
use crate::config::{Config, Delimiter};
use crate::util::{self, FilenameTemplate};
use crate::CliResult;
static USAGE: &str = "
Splits the given CSV data into smaller files having a fixed number of
rows given to -S, --size.
Target file can also be split into a given number of -c/--chunks.
Files will be written in current working directory by default or in any directory
given to -O/--out-dir (that will be created for your if necessary).
Usage:
xan split [options] [<input>]
xan split --help
split options:
-O, --out-dir <dir> Where to write the chunks. Defaults to current working
directory.
-S, --size <arg> The number of records to write into each chunk.
[default: 4096]
-c, --chunks <n> Divide the file into at most <n> chunks having
roughly the same number of records. Target file must be
seekable (e.g. this will not work with stdin nor gzipped
files).
--segments When used with -c/--chunks, output the byte offsets of
found segments instead.
-f, --filename <filename> A filename template to use when constructing
the names of the output files. The string '{}'
will be replaced either by the index in original file of
first row emitted when using -S/--size or by the chunk
index when using -c/--chunks.
[default: {}.csv]
Common options:
-h, --help Display this message
-n, --no-headers When set, the first row will NOT be interpreted
as column names. Otherwise, the first row will
appear in all chunks as the header row.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character.
";
#[derive(Clone, Deserialize)]
struct Args {
arg_input: Option<String>,
flag_out_dir: Option<String>,
flag_size: NonZeroUsize,
flag_chunks: Option<NonZeroUsize>,
flag_segments: bool,
flag_filename: FilenameTemplate,
flag_no_headers: bool,
flag_delimiter: Option<Delimiter>,
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
if args.flag_chunks.is_some() {
if args.flag_segments {
args.segments()
} else {
args.split_by_segments()
}
} else {
args.split_by_size()
}
}
impl Args {
fn split_by_size(&self) -> CliResult<()> {
if let Some(out_dir) = &self.flag_out_dir {
fs::create_dir_all(out_dir)?;
}
let rconfig = self.rconfig();
let mut splitter = rconfig.simd_splitter()?;
let headers = splitter.byte_headers()?.to_vec();
let mut wtr = self.new_writer(&headers, 0)?;
let mut i = 0;
while let Some(record) = splitter.split_record()? {
if i > 0 && i % self.flag_size == 0 {
wtr.flush()?;
wtr = self.new_writer(&headers, i)?;
}
wtr.write_splitted_record(record)?;
i += 1;
}
Ok(wtr.flush()?)
}
fn split_by_segments(&self) -> CliResult<()> {
if let Some(out_dir) = &self.flag_out_dir {
fs::create_dir_all(out_dir)?;
}
let rconfig = self.rconfig();
let mut seeker = rconfig
.simd_seeker()?
.ok_or("Could not sample the file to build segments!")?;
let initial_pos = seeker.initial_position();
let segments = seeker.segments(self.flag_chunks.unwrap().get())?;
let mut splitter = seeker.into_splitter()?;
let headers = splitter.byte_headers()?.to_vec();
let mut writer = self.new_writer(&headers, 0)?;
let mut chunk: usize = 0;
while let Some((pos, record)) = splitter.split_record_with_position()? {
if initial_pos + pos >= segments[chunk].1 {
writer.flush()?;
chunk += 1;
writer = self.new_writer(&headers, chunk)?;
}
writer.write_splitted_record(record)?;
}
Ok(())
}
fn segments(&self) -> CliResult<()> {
let rconfig = self.rconfig();
let mut seeker = rconfig
.simd_seeker()?
.ok_or("Could not sample the file to build segments!")?;
let segments = seeker.segments(self.flag_chunks.unwrap().get())?;
let mut wtr = Config::new(&None).simd_writer()?;
let mut record = simd_csv::ByteRecord::new();
record.push_field(b"from");
record.push_field(b"to");
wtr.write_byte_record(&record)?;
for (f, t) in segments {
record.clear();
record.push_field(f.to_string().as_bytes());
record.push_field(t.to_string().as_bytes());
wtr.write_byte_record(&record)?;
}
wtr.flush()?;
Ok(())
}
fn new_writer(
&self,
headers: &[u8],
id: usize,
) -> CliResult<simd_csv::Writer<Box<dyn io::Write + Send + 'static>>> {
let dir = match &self.flag_out_dir {
Some(out_dir) => Path::new(out_dir),
None => Path::new(""),
};
let path = dir.join(self.flag_filename.filename(&format!("{}", id)));
let spath = Some(path.display().to_string());
let mut wtr = Config::new(&spath).simd_writer()?;
if !self.rconfig().no_headers {
wtr.write_splitted_record(headers)?;
}
Ok(wtr)
}
fn rconfig(&self) -> Config {
Config::new(&self.arg_input)
.delimiter(self.flag_delimiter)
.no_headers(self.flag_no_headers)
}
}