use std::fs;
use pariter::IteratorExt;
use crate::config::{Config, Delimiter};
use crate::moonblade::SelectionProgram;
use crate::util;
use crate::CliResult;
/// Help text and command-line grammar of the `map` command; `run` hands it to
/// `util::get_args` to parse the invocation into `Args`.
static USAGE: &str = r#"
The map command evaluates an expression for each row of the given CSV file and
outputs the same row with added columns containing the results of the aforementioned
expression.
For instance, given the following CSV file:
a,b
1,4
5,2
The following command:
$ xan map 'a + b as c' file.csv > result.csv
Will produce the following result:
a,b,c
1,4,5
5,2,7
You can also create multiple columns at once:
$ xan map 'a + b as c, a * b as d' file.csv > result.csv
Will produce the following result:
a,b,c,d
1,4,5,4
5,2,7,10
Expression clauses can also return more than one item at once to avoid repeating
computations, for instance:
Splitting a full name:
$ xan map 'full_name.split(" ") as (first_name, last_name)' file.csv > result.csv
Extracting data from a JSON cell:
$ xan map 'data.parse_json() | [_.name, _.meta[2].age] as (name, age)' file.csv > result.csv
You can also use the -O/--overwrite flag to overwrite already existing columns:
$ xan map -O 'b * 10 as b, a * b as c' file.csv > result.csv
Will produce:
a,b,c
1,40,4
5,20,10
The expression can optionally be read from a file using the -f/--evaluate-file flag:
$ xan map -f expr.moonblade file.csv > result.csv
For a quick review of the capabilities of the expression language,
check out the `xan help cheatsheet` command.
For a list of available functions, use `xan help functions`.
Miscellaneous tricks:
1. Copying a column:
$ xan map 'column_name as copy_name' file.csv > result.csv
2. Create a column containing a constant value:
$ xan map '"john" as from' file.csv > result.csv
Usage:
xan map [options] <expression> [<input>]
xan map --help
map options:
-f, --evaluate-file Read evaluation expression from a file instead.
-O, --overwrite If set, expressions named with a column already existing
in the file will be overwritten with the result of the
expression instead of adding a new column at the end.
This means you can both transform and add columns at the
same time.
-F, --filter If given, will not write rows in the output if all results
of evaluated expression are falsey.
-p, --parallel Whether to use parallelization to speed up computations.
Will automatically select a suitable number of threads to use
based on your number of cores. Use -t, --threads if you want to
indicate the number of threads yourself.
-t, --threads <threads> Parallelize computations using this many threads. Use -p, --parallel
if you want the number of threads to be automatically chosen instead.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be evaled
as headers.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character.
"#;
/// Command-line arguments of the `map` command, produced in `run` by
/// `util::get_args(USAGE, argv)`.
// NOTE(review): the `arg_`/`flag_` prefixes presumably map each field onto the
// positional arguments and options listed in `USAGE` — confirm against
// `util::get_args`, which is defined outside this file.
#[derive(Deserialize)]
struct Args {
/// Expression to evaluate. When `flag_evaluate_file` is set this initially
/// holds a file path, which `Args::resolve` replaces with the file's contents.
arg_expression: String,
/// Input handed to `Config::new` to build the reader configuration.
arg_input: Option<String>,
/// Output handed to `Config::new` to build the writer.
flag_output: Option<String>,
/// Forwarded to the reader configuration's `no_headers`; when the resulting
/// configuration has `no_headers` set, `run` does not write a header row.
flag_no_headers: bool,
/// Forwarded to the reader configuration's `delimiter`.
flag_delimiter: Option<Delimiter>,
/// When set, `run` only writes rows whose evaluation was reported truthy.
flag_filter: bool,
/// Requests parallel evaluation; the thread count comes from
/// `util::default_num_cpus` unless `flag_threads` is given.
flag_parallel: bool,
/// Explicit thread count; giving it enables parallel evaluation even when
/// `flag_parallel` is not set.
flag_threads: Option<usize>,
/// When set, `arg_expression` is treated as a path to read the expression from.
flag_evaluate_file: bool,
/// Requests overwrite mode; `run` rejects it if the program has any plural
/// expression and only applies it if the program has something to overwrite.
flag_overwrite: bool,
}
impl Args {
    /// Finalizes the arguments after parsing.
    ///
    /// When `flag_evaluate_file` is set, `arg_expression` is interpreted as a
    /// file path and replaced by the contents of that file. Otherwise nothing
    /// is changed.
    ///
    /// # Errors
    ///
    /// Propagates the I/O error raised when the file cannot be read.
    fn resolve(&mut self) -> CliResult<()> {
        if !self.flag_evaluate_file {
            return Ok(());
        }

        let expression_source = fs::read_to_string(&self.arg_expression)?;
        self.arg_expression = expression_source;

        Ok(())
    }
}
/// Entry point of the `map` command.
///
/// Parses `argv` against `USAGE`, compiles the expression into a
/// `SelectionProgram` using the input headers, then evaluates it for every
/// record of the input and writes the resulting records to the output.
/// Rows are processed either sequentially or through `parallel_map_custom`,
/// depending on the `--parallel` / `--threads` flags. With `--filter`, rows
/// whose evaluation is not truthy are not written.
///
/// # Errors
///
/// Returns an error when argument parsing, reading the expression file,
/// opening the reader or writer, parsing the expression, evaluating a row or
/// writing/flushing the output fails, and when `--overwrite` is combined with
/// a clause yielding multiple columns.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let mut args: Args = util::get_args(USAGE, argv)?;
    args.resolve()?;

    let reader_config = Config::new(&args.arg_input)
        .no_headers(args.flag_no_headers)
        .delimiter(args.flag_delimiter);

    // Outer `Option`: whether to parallelize at all.
    // Inner `Option`: explicit thread count, `None` meaning "pick a default".
    let thread_setting: Option<Option<usize>> = if args.flag_threads.is_some() {
        Some(args.flag_threads)
    } else if args.flag_parallel {
        Some(None)
    } else {
        None
    };

    let mut writer = Config::new(&args.flag_output).simd_writer()?;
    let mut reader = reader_config.simd_reader()?;

    let headers = reader.byte_headers()?.clone();
    let program = SelectionProgram::parse(&args.arg_expression, &headers)?;

    if args.flag_overwrite && program.has_any_plural_expr() {
        return Err(
            "-O/--overwrite does not work with clauses yielding multiple columns yet!".into(),
        );
    }

    // Overwrite mode is only used when the flag is set AND the program
    // reports that it has something to overwrite.
    let overwrite_in_place = args.flag_overwrite && program.has_something_to_overwrite();

    if !reader_config.no_headers {
        if overwrite_in_place {
            writer.write_record(headers.iter().chain(program.new_headers()))?;
        } else {
            writer.write_record(headers.iter().chain(program.headers()))?;
        }
    }

    // Without --filter every row is written regardless of truthiness.
    let keep_all = !args.flag_filter;

    match thread_setting {
        Some(requested_threads) => {
            let evaluated_rows = reader.into_byte_records().enumerate().parallel_map_custom(
                |options| {
                    options.threads(requested_threads.unwrap_or_else(crate::util::default_num_cpus))
                },
                move |(index, row)| -> CliResult<(bool, simd_csv::ByteRecord)> {
                    let mut row = row?;

                    if overwrite_in_place {
                        Ok(program.overwrite(index, &mut row)?)
                    } else {
                        let is_truthy = program.extend(index, &mut row)?;
                        Ok((is_truthy, row))
                    }
                },
            );

            for outcome in evaluated_rows {
                let (is_truthy, row) = outcome?;

                if keep_all || is_truthy {
                    writer.write_byte_record(&row)?;
                }
            }
        }
        None => {
            let mut row = simd_csv::ByteRecord::new();
            let mut index: usize = 0;

            while reader.read_byte_record(&mut row)? {
                let is_truthy = if overwrite_in_place {
                    // Overwrite mode hands back a record that replaces the
                    // one that was just read.
                    let (truthy, replacement) = program.overwrite(index, &mut row)?;
                    row = replacement;
                    truthy
                } else {
                    program.extend(index, &mut row)?
                };

                if keep_all || is_truthy {
                    writer.write_byte_record(&row)?;
                }

                index += 1;
            }
        }
    }

    writer.flush()?;

    Ok(())
}