oner_quantize/quantize/mod.rs
1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5use super::interval::merge_neighbours_with_same_class;
6use crate::Interval;
7use ord_subset::OrdSubset;
8use ord_subset::OrdSubsetSliceExt;
9use std::fmt::Debug;
10use std::hash::Hash;
11
12mod splits;
13use splits::{intervals_from_splits, trim_splits};
14
15/// Quantize the given `attribute` (aka feature, column) into an ordered list of `Intervals`.
16///
17/// # Arguments
18///
19/// * `attribute` - a single attribute, typically numeric, to be quantized.
20/// * `classes` - the corresponsing class for each attribute value.
21/// * `small` - the small disjunct threshold, such as 3. There has to be at least one class in an interval with more than `small` values in the interval.
22///
23/// # Examples
24/// ```
25/// use oner_quantize::find_intervals;
26/// use oner_quantize::Interval;
27/// use oner_quantize::Interval::{Lower, Range, Upper};
28///
29/// // Fake data that has three clear splits:
30/// let attribute = vec![ 1, 10, 3, 1, 20, 30, 100];
31/// let classes = vec!["a", "b", "a", "a", "b", "b", "c"];
32///
33/// let intervals =
34/// find_intervals(&attribute, &classes, 2);
35///
36/// assert_eq!(intervals, vec![
37/// Lower { below: 10, class: "a" },
38/// Range { from: 10, below: 100, class: "b" },
39/// Upper { from: 100, class: "c" }
40/// ]);
41/// ```
42pub fn find_intervals<A, C>(attribute: &[A], classes: &[C], small: usize) -> Vec<Interval<A, C>>
43where
44 A: OrdSubset + Copy + Debug,
45 C: Eq + Hash + Copy + Debug,
46{
47 // 1. Get the attribute values (plus associated class) in attribute sorted order:
48 let mut sorted: Vec<(&A, &C)> = Vec::new();
49 for (v, c) in attribute.iter().zip(classes.iter()) {
50 sorted.push((v, c));
51 }
52 sorted.ord_subset_sort_by_key(|pair| pair.0);
53
54 // 2. Create a (tentative) split each time the attribute value changes.
55
56 // `split_index` contains indicies into `sorted` where we might split the attribute into an interval boundary.
57 // That is, a value of 1 in `split_index` means the attribute value at sorted[0] differs from sorted[1].
58 // The split happens between index 0 and 1 in that example.
59 let mut split_index = Vec::new();
60 for (prev_index, ((cur_value, _cur_class), (prev_value, _prev_class))) in
61 sorted.iter().skip(1).zip(sorted.iter()).enumerate()
62 {
63 if cur_value > prev_value {
64 split_index.push(prev_index + 1);
65 }
66 }
67
68 // 3. Remove splits that are too small:
69 let split_index_trimmed = trim_splits(split_index, small, &sorted);
70
71 // 4. Generate distinct intervals from the splits:
72 let intervals: Vec<Interval<A, C>> = intervals_from_splits(split_index_trimmed, &sorted);
73
74 // 5. Remove redundant intervals:
75 merge_neighbours_with_same_class(&intervals)
76}
77
78#[cfg(test)]
79mod tests {
80 use super::find_intervals;
81 use super::Interval;
82 #[test]
83 fn test_golf_example() {
84 // This example (inputs, and boundary points) comes from:
85 // Nevill-Manning, Holmes & Witten (1995) _The Development of Holte's 1R Classifier_, p. 2
86
87 let attrbibute = vec![64, 65, 68, 69, 70, 71, 72, 72, 75, 75, 80, 81, 83, 85];
88
89 let classes = vec!["p", "d", "p", "p", "p", "d", "p", "d", "p", "p", "d", "p", "p", "d"];
90
91 let actual = find_intervals(&attrbibute, &classes, 3);
92
93 let expected = vec![Interval::lower(85, "p"), Interval::upper(85, "d")];
94
95 assert_eq!(expected, actual);
96 }
97}
98
99/// Find which interval applies to a given attribute value.
100///
101/// # Examples
102///
103/// ```
104/// use oner_quantize::Interval;
105/// use oner_quantize::quantize;
106///
107/// let intervals = vec![
108/// Interval::lower(15, "x"),
109/// Interval::range(15, 20, "y"),
110/// Interval::upper(20, "z")
111/// ];
112///
113/// // Quantize into an interval, and extract the corresponding class:
114/// let class = |attribute_value| quantize(&intervals, attribute_value)
115/// .map(|interval| interval.class());
116///
117/// assert_eq!(class(10), Some(&"x"));
118/// assert_eq!(class(15), Some(&"y"));
119/// assert_eq!(class(99), Some(&"z"));
120/// ```
121pub fn quantize<A, C>(intervals: &[Interval<A, C>], attribute_value: A) -> Option<&Interval<A, C>>
122where
123 A: PartialOrd + Copy,
124 C: Copy,
125{
126 intervals.iter().find(|interval| interval.matches(attribute_value))
127}