use std::cmp::Ordering;
use bstr::ByteSlice;
use rayon::slice::ParallelSliceMut;
use crate::config::{Config, Delimiter};
use crate::scales::LinearScale;
use crate::select::SelectedColumns;
use crate::util;
use crate::CliResult;
static USAGE: &str = "
Discretize selection of columns containing continuous data into bins.
The resulting bins table will be formatted thusly:
field - Name of the column
value - Bin's label (depends on what was given to -l/--label)
lower_bound - Lower bound of the bin
upper_bound - Upper bound of the bin
count - Number of rows falling into this bin
The number of bins can be chosen with the -b/--bins flag. Note that,
by default, this number is an approximate goal since the command
attempts to find readble boundaries for the bins and this make it
hard to respect a precise number of bins. Use the -e/--exact flag
if you want to force the command to respect -b/--bins exactly.
Combined with `xan hist`, this command can be very useful to visualize
distributions of continous columns:
$ xan bins -s count data.csv | xan hist
Using a log scale:
$ xan bins -s count data.csv | xan hist --scale log
Usage:
xan bins [options] [<input>]
xan bins --help
bins options:
-s, --select <arg> Select a subset of columns to compute bins for. See
'xan select --help' for more detail.
-b, --bins <number> Number of bins to generate. Note that without -e/--exact,
this number should be considered as an approximate goal.
The command by default attempts to find nice & readable boundaries
for the bins and this means a precise number of bins is not
always achievable.
[default: 10]
-H, --heuristic <name> Heuristic to use to automatically find an adequate number
of bins. Must be one of `freedman-diaconis`, `sqrt` or `sturges`.
--max-bins <number> Maximum number of bins to generate. Only useful when using
the -H/--heuristic flag.
-e, --exact Whether to make sure to return the exact number of bins
provided to -b/--bins, which means the readability of the
bins boundaries might suffer.
-l, --label <mode> Label to choose for the bins (that will be placed in the
`value` column). Mostly useful to tweak representation when
piping to `xan hist`. Can be one of \"full\", \"lower\" or \"upper\".
[default: full]
-m, --min <min> Override min value. Values lower that this min will be counted
as out of bounds.
-M, --max <max> Override max value. Values greater that this max will be counted
as out of bounds.
-N, --no-extra Don't include, empty cells, nans and out of bounds counts.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the file will be considered as having no
headers.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character.
";
#[derive(Deserialize)]
struct Args {
arg_input: Option<String>,
flag_select: SelectedColumns,
flag_no_headers: bool,
flag_delimiter: Option<Delimiter>,
flag_output: Option<String>,
flag_no_extra: bool,
flag_bins: usize,
flag_max_bins: Option<usize>,
flag_heuristic: Option<Heuristic>,
flag_label: LabelOption,
flag_exact: bool,
flag_min: Option<f64>,
flag_max: Option<f64>,
}
impl Args {
fn bins_count(&self) -> BinsCount {
match self.flag_heuristic {
Some(heuristic) => BinsCount::WithHeuristic(heuristic),
None => BinsCount::Some(self.flag_bins),
}
}
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let conf = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers)
.select(args.flag_select.clone());
let mut rdr = conf.simd_reader()?;
let mut wtr = Config::new(&args.flag_output).simd_writer()?;
let headers = rdr.byte_headers()?.clone();
let sel = conf.selection(&headers)?;
let mut all_series: Vec<Series> = sel.iter().map(|i| Series::new(*i)).collect();
let mut record = simd_csv::ByteRecord::new();
while rdr.read_byte_record(&mut record)? {
for (cell, series) in sel.select(&record).zip(all_series.iter_mut()) {
series.add(cell, &args.flag_min, &args.flag_max);
}
}
wtr.write_record(["field", "value", "lower_bound", "upper_bound", "count"])?;
for series in all_series.iter_mut() {
match series.bins(
args.bins_count(),
args.flag_max_bins,
args.flag_min,
args.flag_max,
args.flag_exact,
) {
None => continue,
Some(bins) => {
let max_lower_bound_width = bins
.iter()
.map(|bin| util::format_number(bin.lower_bound).len())
.max()
.unwrap();
let max_upper_bound_width = bins
.iter()
.map(|bin| util::format_number(bin.upper_bound).len())
.max()
.unwrap();
let mut bins_iter = bins.iter().peekable();
while let Some(bin) = bins_iter.next() {
let (lower_bound, upper_bound) = match series.data_type {
DataType::Float => (bin.lower_bound, bin.upper_bound),
DataType::Integer => (bin.lower_bound.ceil(), bin.upper_bound.ceil()),
};
let lower_bound = util::format_number(lower_bound);
let upper_bound = util::format_number(upper_bound);
let label_format = if bin.is_constant() {
lower_bound
} else {
match args.flag_label {
LabelOption::Full => match bins_iter.peek() {
None => format!(
">= {:lower_width$} <= {:upper_width$}",
lower_bound,
upper_bound,
lower_width = max_lower_bound_width,
upper_width = max_upper_bound_width
),
Some(_) => format!(
">= {:lower_width$} < {:upper_width$}",
lower_bound,
upper_bound,
lower_width = max_lower_bound_width,
upper_width = max_upper_bound_width
),
},
LabelOption::Upper => upper_bound,
LabelOption::Lower => lower_bound,
}
};
wtr.write_record([
&headers[series.column],
label_format.as_bytes(),
bin.lower_bound.to_string().as_bytes(),
bin.upper_bound.to_string().as_bytes(),
bin.count.to_string().as_bytes(),
])?;
}
}
}
if !args.flag_no_extra && series.nans > 0 {
wtr.write_record([
&headers[series.column],
b"<NaN>",
b"",
b"",
series.nans.to_string().as_bytes(),
])?;
}
if !args.flag_no_extra && series.nulls > 0 {
wtr.write_record([
&headers[series.column],
b"<empty>",
b"",
b"",
series.nulls.to_string().as_bytes(),
])?;
}
if !args.flag_no_extra && series.out_of_bounds > 0 {
wtr.write_record([
&headers[series.column],
b"<rest>",
b"",
b"",
series.out_of_bounds.to_string().as_bytes(),
])?;
}
}
Ok(wtr.flush()?)
}
fn compute_rectified_iqr(numbers: &[f64], stats: &SeriesStats) -> Option<f64> {
if numbers.len() < 4 {
None
} else {
let q1 = (numbers.len() as f64 * 0.25).floor() as usize;
let q3 = (numbers.len() as f64 * 0.75).floor() as usize;
let mut q1 = numbers[q1];
let mut q3 = numbers[q3];
let offset = stats.min().unwrap() + 1.0;
q1 += offset;
q3 += offset;
let iqr = q3 - q1;
Some(iqr)
}
}
#[derive(Deserialize, Clone, Copy)]
enum LabelOption {
Full,
Lower,
Upper,
}
#[derive(Deserialize, Clone, Copy, Debug)]
enum Heuristic {
#[serde(rename = "freedman-diaconis")]
FreedmanDiaconis,
Sturges,
Sqrt,
}
#[derive(Clone, Copy, Debug)]
enum BinsCount {
Some(usize),
WithHeuristic(Heuristic),
}
#[derive(Debug)]
struct SeriesStats {
extent: Option<(f64, f64)>,
}
impl SeriesStats {
pub fn min(&self) -> Option<f64> {
self.extent.map(|extent| extent.0)
}
pub fn max(&self) -> Option<f64> {
self.extent.map(|extent| extent.1)
}
}
#[derive(Debug)]
struct Bin {
lower_bound: f64,
upper_bound: f64,
count: usize,
}
impl Bin {
fn is_constant(&self) -> bool {
self.lower_bound == self.upper_bound
}
}
#[derive(Debug)]
enum DataType {
Integer,
Float,
}
#[derive(Debug)]
struct Series {
column: usize,
numbers: Vec<f64>,
count: usize,
nans: usize,
nulls: usize,
out_of_bounds: usize,
data_type: DataType,
}
impl Series {
pub fn new(column: usize) -> Self {
Series {
column,
numbers: Vec::new(),
count: 0,
nans: 0,
nulls: 0,
out_of_bounds: 0,
data_type: DataType::Integer,
}
}
pub fn add(&mut self, cell: &[u8], min: &Option<f64>, max: &Option<f64>) {
self.count += 1;
let cell = cell.trim();
if cell.is_empty() {
self.nulls += 1;
return;
}
match fast_float::parse::<f64, &[u8]>(cell) {
Ok(float) => {
if let Some(m) = min {
if float < *m {
self.out_of_bounds += 1;
return;
}
} else if let Some(m) = max {
if float > *m {
self.out_of_bounds += 1;
return;
}
}
if float.fract() != 0.0 {
self.data_type = DataType::Float;
}
self.numbers.push(float);
}
Err(_) => {
self.nans += 1;
}
}
}
pub fn len(&self) -> usize {
self.numbers.len()
}
pub fn stats(&self) -> SeriesStats {
let mut extent: Option<(f64, f64)> = None;
for n in self.numbers.iter() {
let n = *n;
extent = match extent {
None => Some((n, n)),
Some(m) => Some((f64::min(n, m.0), f64::max(n, m.1))),
};
}
SeriesStats { extent }
}
pub fn naive_optimal_bin_count(&self) -> usize {
(self.len() as f64).sqrt().ceil() as usize
}
pub fn freedman_diaconis(&mut self, width: f64, stats: &SeriesStats) -> Option<usize> {
self.numbers.par_sort_unstable_by(|a, b| a.total_cmp(b));
compute_rectified_iqr(&self.numbers, stats).and_then(|iqr| {
if iqr == 0.0 {
return None;
}
let bin_width = 2.0 * (iqr / (self.numbers.len() as f64).cbrt());
Some((width / bin_width).ceil() as usize)
})
}
pub fn sturges(&self) -> usize {
1 + (self.numbers.len() as f64).log2().ceil() as usize
}
pub fn solve_bins_count(
&mut self,
bins_count: BinsCount,
max_opt: Option<usize>,
width: f64,
stats: &SeriesStats,
) -> usize {
let mut expected = match bins_count {
BinsCount::Some(count) => count,
BinsCount::WithHeuristic(heuristic) => match heuristic {
Heuristic::FreedmanDiaconis => self
.freedman_diaconis(width, stats)
.unwrap_or_else(|| self.naive_optimal_bin_count()),
Heuristic::Sqrt => self.naive_optimal_bin_count(),
Heuristic::Sturges => self.sturges(),
},
};
expected = expected.max(2);
if let Some(max) = max_opt {
expected = expected.min(max);
}
expected
}
pub fn bins(
&mut self,
count: BinsCount,
max_bins: Option<usize>,
min: Option<f64>,
max: Option<f64>,
exact: bool,
) -> Option<Vec<Bin>> {
if self.len() < 1 {
return None;
}
let stats = self.stats();
let min = min.unwrap_or_else(|| stats.min().unwrap());
let max = max.unwrap_or_else(|| stats.max().unwrap());
if min == max {
return Some(vec![Bin {
lower_bound: min,
upper_bound: max,
count: self.len(),
}]);
}
let width = max - min;
let count = self.solve_bins_count(count, max_bins, width, &stats);
let bins = if !exact {
let scale = LinearScale::nice((min, max), (0.0, 1.0), count);
let mut ticks = scale.ticks(count);
if ticks.is_empty() {
return Some(vec![]);
}
if ticks.len() >= 2 {
ticks[0] = min;
*ticks.last_mut().unwrap() = max;
}
let mut bins: Vec<Bin> = Vec::with_capacity(ticks.len());
for i in 0..(ticks.len() - 1) {
bins.push(Bin {
lower_bound: ticks[i],
upper_bound: ticks[i + 1],
count: 0,
});
}
for n in self.numbers.iter() {
let bin_index = bins
.binary_search_by(|bin| match bin.upper_bound.partial_cmp(n).unwrap() {
Ordering::Equal => Ordering::Less,
ord => ord,
})
.unwrap_err()
.min(bins.len().saturating_sub(1));
bins[bin_index].count += 1;
}
bins
} else {
let mut bins: Vec<Bin> = Vec::with_capacity(count);
let cell_width = width / count as f64;
let mut lower_bound = min;
for _ in 0..count {
let upper_bound = f64::min(lower_bound + cell_width, max);
bins.push(Bin {
lower_bound,
upper_bound,
count: 0,
});
lower_bound = upper_bound;
}
for n in self.numbers.iter() {
let mut bin_index = ((n - min) / cell_width).floor() as usize;
if bin_index == bins.len() {
bin_index -= 1;
}
bins[bin_index].count += 1;
}
bins
};
Some(bins)
}
}