nu_command/charting/
histogram.rs

1use super::hashable_value::HashableValue;
2use itertools::Itertools;
3use nu_engine::command_prelude::*;
4
5use std::collections::HashMap;
6
/// Stateless command type backing the `histogram` command.
///
/// It carries no data; all behaviour lives in its `Command` implementation.
#[derive(Clone)]
pub struct Histogram;
9
/// Chooses the denominator used to turn a per-value count into a quantile.
enum PercentageCalcMethod {
    /// Divide each count by the total number of counted values.
    Normalize,
    /// Divide each count by the largest count among all values.
    Relative,
}
14
15impl Command for Histogram {
16    fn name(&self) -> &str {
17        "histogram"
18    }
19
20    fn signature(&self) -> Signature {
21        Signature::build("histogram")
22            .input_output_types(vec![(Type::List(Box::new(Type::Any)), Type::table()),])
23            .optional("column-name", SyntaxShape::String, "Column name to calc frequency, no need to provide if input is a list.")
24            .optional("frequency-column-name", SyntaxShape::String, "Histogram's frequency column, default to be frequency column output.")
25            .named("percentage-type", SyntaxShape::String, "percentage calculate method, can be 'normalize' or 'relative', in 'normalize', defaults to be 'normalize'", Some('t'))
26            .category(Category::Chart)
27    }
28
29    fn description(&self) -> &str {
30        "Creates a new table with a histogram based on the column name passed in."
31    }
32
33    fn examples(&self) -> Vec<Example> {
34        vec![
35            Example {
36                description: "Compute a histogram of file types",
37                example: "ls | histogram type",
38                result: None,
39            },
40            Example {
41                description: "Compute a histogram for the types of files, with frequency column named freq",
42                example: "ls | histogram type freq",
43                result: None,
44            },
45            Example {
46                description: "Compute a histogram for a list of numbers",
47                example: "[1 2 1] | histogram",
48                result: Some(Value::test_list(vec![
49                    Value::test_record(record! {
50                        "value" =>      Value::test_int(1),
51                        "count" =>      Value::test_int(2),
52                        "quantile" =>   Value::test_float(0.6666666666666666),
53                        "percentage" => Value::test_string("66.67%"),
54                        "frequency" =>  Value::test_string("******************************************************************"),
55                    }),
56                    Value::test_record(record! {
57                        "value" =>      Value::test_int(2),
58                        "count" =>      Value::test_int(1),
59                        "quantile" =>   Value::test_float(0.3333333333333333),
60                        "percentage" => Value::test_string("33.33%"),
61                        "frequency" =>  Value::test_string("*********************************"),
62                    }),
63                ])),
64            },
65            Example {
66                description: "Compute a histogram for a list of numbers, and percentage is based on the maximum value",
67                example: "[1 2 3 1 1 1 2 2 1 1] | histogram --percentage-type relative",
68                result: None,
69            },
70        ]
71    }
72
73    fn run(
74        &self,
75        engine_state: &EngineState,
76        stack: &mut Stack,
77        call: &Call,
78        input: PipelineData,
79    ) -> Result<PipelineData, ShellError> {
80        // input check.
81        let column_name: Option<Spanned<String>> = call.opt(engine_state, stack, 0)?;
82        let frequency_name_arg = call.opt::<Spanned<String>>(engine_state, stack, 1)?;
83        let frequency_column_name = match frequency_name_arg {
84            Some(inner) => {
85                let forbidden_column_names = ["value", "count", "quantile", "percentage"];
86                if forbidden_column_names.contains(&inner.item.as_str()) {
87                    return Err(ShellError::TypeMismatch {
88                        err_message: format!(
89                            "frequency-column-name can't be {}",
90                            forbidden_column_names
91                                .iter()
92                                .map(|val| format!("'{val}'"))
93                                .collect::<Vec<_>>()
94                                .join(", ")
95                        ),
96                        span: inner.span,
97                    });
98                }
99                inner.item
100            }
101            None => "frequency".to_string(),
102        };
103
104        let calc_method: Option<Spanned<String>> =
105            call.get_flag(engine_state, stack, "percentage-type")?;
106        let calc_method = match calc_method {
107            None => PercentageCalcMethod::Normalize,
108            Some(inner) => match inner.item.as_str() {
109                "normalize" => PercentageCalcMethod::Normalize,
110                "relative" => PercentageCalcMethod::Relative,
111                _ => {
112                    return Err(ShellError::TypeMismatch {
113                        err_message: "calc method can only be 'normalize' or 'relative'"
114                            .to_string(),
115                        span: inner.span,
116                    });
117                }
118            },
119        };
120
121        let span = call.head;
122        let data_as_value = input.into_value(span)?;
123        let value_span = data_as_value.span();
124        // `input` is not a list, here we can return an error.
125        run_histogram(
126            data_as_value.into_list()?,
127            column_name,
128            frequency_column_name,
129            calc_method,
130            span,
131            // Note that as_list() filters out Value::Error here.
132            value_span,
133        )
134    }
135}
136
137fn run_histogram(
138    values: Vec<Value>,
139    column_name: Option<Spanned<String>>,
140    freq_column: String,
141    calc_method: PercentageCalcMethod,
142    head_span: Span,
143    list_span: Span,
144) -> Result<PipelineData, ShellError> {
145    let mut inputs = vec![];
146    // convert from inputs to hashable values.
147    match column_name {
148        None => {
149            // some invalid input scenario needs to handle:
150            // Expect input is a list of hashable value, if one value is not hashable, throw out error.
151            for v in values {
152                match v {
153                    // Propagate existing errors.
154                    Value::Error { error, .. } => return Err(*error),
155                    _ => {
156                        let t = v.get_type();
157                        let span = v.span();
158                        inputs.push(HashableValue::from_value(v, head_span).map_err(|_| {
159                        ShellError::UnsupportedInput { msg: "Since column-name was not provided, only lists of hashable values are supported.".to_string(), input: format!(
160                                "input type: {t:?}"
161                            ), msg_span: head_span, input_span: span }
162                    })?)
163                    }
164                }
165            }
166        }
167        Some(ref col) => {
168            // some invalid input scenario needs to handle:
169            // * item in `input` is not a record, just skip it.
170            // * a record doesn't contain specific column, just skip it.
171            // * all records don't contain specific column, throw out error, indicate at least one row should contains specific column.
172            // * a record contain a value which can't be hashed, skip it.
173            let col_name = &col.item;
174            for v in values {
175                match v {
176                    // parse record, and fill valid value to actual input.
177                    Value::Record { val, .. } => {
178                        if let Some(v) = val.get(col_name) {
179                            if let Ok(v) = HashableValue::from_value(v.clone(), head_span) {
180                                inputs.push(v);
181                            }
182                        }
183                    }
184                    // Propagate existing errors.
185                    Value::Error { error, .. } => return Err(*error),
186                    _ => continue,
187                }
188            }
189
190            if inputs.is_empty() {
191                return Err(ShellError::CantFindColumn {
192                    col_name: col_name.clone(),
193                    span: Some(head_span),
194                    src_span: list_span,
195                });
196            }
197        }
198    }
199
200    let value_column_name = column_name
201        .map(|x| x.item)
202        .unwrap_or_else(|| "value".to_string());
203    Ok(histogram_impl(
204        inputs,
205        &value_column_name,
206        calc_method,
207        &freq_column,
208        head_span,
209    ))
210}
211
212fn histogram_impl(
213    inputs: Vec<HashableValue>,
214    value_column_name: &str,
215    calc_method: PercentageCalcMethod,
216    freq_column: &str,
217    span: Span,
218) -> PipelineData {
219    // here we can make sure that inputs is not empty, and every elements
220    // is a simple val and ok to make count.
221    let mut counter = HashMap::new();
222    let mut max_cnt = 0;
223    let total_cnt = inputs.len();
224    for i in inputs {
225        let new_cnt = *counter.get(&i).unwrap_or(&0) + 1;
226        counter.insert(i, new_cnt);
227        if new_cnt > max_cnt {
228            max_cnt = new_cnt;
229        }
230    }
231
232    let mut result = vec![];
233    const MAX_FREQ_COUNT: f64 = 100.0;
234    for (val, count) in counter.into_iter().sorted() {
235        let quantile = match calc_method {
236            PercentageCalcMethod::Normalize => count as f64 / total_cnt as f64,
237            PercentageCalcMethod::Relative => count as f64 / max_cnt as f64,
238        };
239
240        let percentage = format!("{:.2}%", quantile * 100_f64);
241        let freq = "*".repeat((MAX_FREQ_COUNT * quantile).floor() as usize);
242
243        result.push((
244            count, // attach count first for easily sorting.
245            Value::record(
246                record! {
247                    value_column_name => val.into_value(),
248                    "count" => Value::int(count, span),
249                    "quantile" => Value::float(quantile, span),
250                    "percentage" => Value::string(percentage, span),
251                    freq_column => Value::string(freq, span),
252                },
253                span,
254            ),
255        ));
256    }
257    result.sort_by(|a, b| b.0.cmp(&a.0));
258    Value::list(result.into_iter().map(|x| x.1).collect(), span).into_pipeline_data()
259}
260
#[cfg(test)]
mod tests {
    use super::Histogram;

    /// Runs the examples declared by `Histogram` through the crate's example
    /// test helper.
    #[test]
    fn test_examples() {
        // Called by full path: importing the helper at module level would
        // clash with this function's own name.
        crate::test_examples(Histogram)
    }
}