Skip to main content

oner_quantize/quantize/
mod.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5use super::interval::merge_neighbours_with_same_class;
6use crate::Interval;
7use ord_subset::OrdSubset;
8use ord_subset::OrdSubsetSliceExt;
9use std::fmt::Debug;
10use std::hash::Hash;
11
12mod splits;
13use splits::{intervals_from_splits, trim_splits};
14
15/// Quantize the given `attribute` (aka feature, column) into an ordered list of `Intervals`.
16///
17/// # Arguments
18///
19/// * `attribute` - a single attribute, typically numeric, to be quantized.
20/// * `classes` - the corresponsing class for each attribute value.
21/// * `small` -  the small disjunct threshold, such as 3. There has to be at least one class in an interval with more than `small` values in the interval.
22///
23/// # Examples
24/// ```
25/// use oner_quantize::find_intervals;
26/// use oner_quantize::Interval;
27/// use oner_quantize::Interval::{Lower, Range, Upper};
28///
29/// // Fake data that has three clear splits:
30/// let attribute = vec![  1, 10,   3,   1,  20,  30,  100];
31/// let classes   = vec!["a", "b", "a", "a", "b", "b", "c"];
32///
33/// let intervals =
34///    find_intervals(&attribute, &classes, 2);
35///
36/// assert_eq!(intervals, vec![
37///   Lower { below: 10, class: "a" },
38///   Range { from: 10, below: 100, class: "b" },
39///   Upper { from: 100, class: "c" }
40/// ]);
41/// ```
42pub fn find_intervals<A, C>(attribute: &[A], classes: &[C], small: usize) -> Vec<Interval<A, C>>
43where
44    A: OrdSubset + Copy + Debug,
45    C: Eq + Hash + Copy + Debug,
46{
47    // 1. Get the attribute values (plus associated class) in attribute sorted order:
48    let mut sorted: Vec<(&A, &C)> = Vec::new();
49    for (v, c) in attribute.iter().zip(classes.iter()) {
50        sorted.push((v, c));
51    }
52    sorted.ord_subset_sort_by_key(|pair| pair.0);
53
54    // 2. Create a (tentative) split each time the attribute value changes.
55
56    // `split_index` contains indicies into `sorted` where we might split the attribute into an interval boundary.
57    // That is, a value of 1 in `split_index` means the attribute value at sorted[0] differs from sorted[1].
58    // The split happens between index 0 and 1 in that example.
59    let mut split_index = Vec::new();
60    for (prev_index, ((cur_value, _cur_class), (prev_value, _prev_class))) in
61        sorted.iter().skip(1).zip(sorted.iter()).enumerate()
62    {
63        if cur_value > prev_value {
64            split_index.push(prev_index + 1);
65        }
66    }
67
68    // 3. Remove splits that are too small:
69    let split_index_trimmed = trim_splits(split_index, small, &sorted);
70
71    // 4. Generate distinct intervals from the splits:
72    let intervals: Vec<Interval<A, C>> = intervals_from_splits(split_index_trimmed, &sorted);
73
74    // 5. Remove redundant intervals:
75    merge_neighbours_with_same_class(&intervals)
76}
77
78#[cfg(test)]
79mod tests {
80    use super::find_intervals;
81    use super::Interval;
82    #[test]
83    fn test_golf_example() {
84        // This example (inputs, and boundary points) comes from:
85        // Nevill-Manning, Holmes & Witten (1995)  _The Development of Holte's 1R Classifier_, p. 2
86
87        let attrbibute = vec![64, 65, 68, 69, 70, 71, 72, 72, 75, 75, 80, 81, 83, 85];
88
89        let classes = vec!["p", "d", "p", "p", "p", "d", "p", "d", "p", "p", "d", "p", "p", "d"];
90
91        let actual = find_intervals(&attrbibute, &classes, 3);
92
93        let expected = vec![Interval::lower(85, "p"), Interval::upper(85, "d")];
94
95        assert_eq!(expected, actual);
96    }
97}
98
99/// Find which interval applies to a given attribute value.
100///
101/// # Examples
102///
103/// ```
104/// use oner_quantize::Interval;
105/// use oner_quantize::quantize;
106///
107/// let intervals = vec![
108///     Interval::lower(15, "x"),
109///     Interval::range(15, 20, "y"),
110///     Interval::upper(20, "z")
111/// ];
112///
113/// // Quantize into an interval, and extract the corresponding class:
114/// let class = |attribute_value| quantize(&intervals, attribute_value)
115///   .map(|interval| interval.class());
116///
117/// assert_eq!(class(10), Some(&"x"));
118/// assert_eq!(class(15), Some(&"y"));
119/// assert_eq!(class(99), Some(&"z"));
120/// ```
121pub fn quantize<A, C>(intervals: &[Interval<A, C>], attribute_value: A) -> Option<&Interval<A, C>>
122where
123    A: PartialOrd + Copy,
124    C: Copy,
125{
126    intervals.iter().find(|interval| interval.matches(attribute_value))
127}