1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
//! This crate contains an implementation of the HOT SAX algorithm, and
//! the brute force algorithm, as proposed by [Keogh et al.](http://www.cse.cuhk.edu.hk/~adafu/Pub/icdm05time.pdf).
//! It also includes the [HS-Squeezer](https://dl.acm.org/doi/abs/10.1145/3287921.3287929) algorithm,
//! since it offers useful optimizations, while still being heavily based on the HOT SAX algorithm.
//! However, from testing, the performance of this implementation is similar or worse than HOT SAX.
//! If you find any possible optimizations, please open an issue.
//!
//! During the implementation some other functions had to be made, such as `paa`, `znorm`, and
//! `gaussian`. These functions are exposed, due to their utility apart from being used in HOT SAX.
//!
//! The code is well commented in order to explain the implementation, in the case that people want
//! to learn how the HOT SAX algorithm works by looking at an implementation. If a part is vaguely
//! commented, feel free to leave an issue.
//!
//! Note that only `Float` vectors are supported. If your data is made up of integers, you need to
//! convert it to float first.
//!
//! ## Example of use
//! ```ignore
//! use std::error::Error;
//! use plotly::{Plot, Scatter};
//!
//! # fn main() -> Result<(), Box<dyn Error>> {
//! // Parses the CSV file from the dataset.
//! let mut rdr = csv::ReaderBuilder::new()
//!     .trim(csv::Trim::All)
//!     .from_path("data/TEK16.CSV")?;
//!
//! // Deserialize CSV data into a vector of floats.
//! let mut data : Vec<f64> = Vec::new();
//! for result in rdr.deserialize() {
//!     data.push(result?);
//! }
//!
//! // Prepare a plot
//! let mut plot = Plot::new();
//!
//! // Retrieve the largest discord. This should approx. match the one found in the paper.
//! // It uses the same settings: a discord size of 256 and a=3.
//! // word_size was assumed to be 3.
//! let discord_size = 256;
//! let discord = hotsax::Anomaly::with(&data, discord_size)
//!     .use_slice(1000..)      // Skips the beginning due to an abnormality.
//!     .find_largest_discord() // Finds the largest discord in the subslice.
//!     .unwrap().1;            // Only gets the location.
//!
//! // Plot the entire dataset as a blue color.
//! let trace1 = Scatter::new((1..=data.len()).collect(), data.clone())
//!     .line(plotly::common::Line::new().color(plotly::NamedColor::Blue))
//!     .name("Data");
//!
//! // Plot the discord itself as a red color.
//! let trace2 = Scatter::new((discord+1..discord+discord_size+1).collect(), data[discord..discord+128].to_vec())
//!     .line(plotly::common::Line::new().color(plotly::NamedColor::Red))
//!     .name("Discord");
//!
//! // Add traces to the plot.
//! plot.add_trace(trace1);
//! plot.add_trace(trace2);
//!
//! // Shows the plot to verify.
//! plot.show();
//! # Ok(())
//! # }
//! ```

/// Implements anomaly detection algorithms, including the brute force and
/// HOT SAX algorithms as specified by Keogh's paper, found
/// [here](http://www.cse.cuhk.edu.hk/~adafu/Pub/icdm05time.pdf).
pub mod anomaly;
pub use anomaly::Anomaly;

/// Dimensionality reduction techniques.
///
/// Used in the implementation of `HOTSAX`, but can be used externally as well.
pub mod dim_reduction;
pub use dim_reduction::{paa, sax};

/// Miscellaneous utility functions.
pub mod util;
pub use util::{gaussian, znorm, mean, std_dev};

/// Clustering functions and squeezer impl
pub mod squeezer;
pub use squeezer::squeezer;
pub use anomaly::Algorithm;

pub(crate) mod trie;

#[cfg(test)]
mod test {
    use plotly::{Plot, Scatter, Layout};

    static DISCORD_SIZE: usize = 128;
    static DISCORD_AMNT: usize = 1;
    static MIN_DIST: f64 = 2.00;

    #[test]
    fn multi_discord() -> Result<(), Box<dyn std::error::Error>> {

        // Initialises the CSV reader.
        let mut rdr = csv::ReaderBuilder::new()
            .trim(csv::Trim::All)
            .from_path("data/TEK17.csv")?;

        // Preparing Y axis...
        let mut data: Vec<f64> = Vec::new();

        // Retrieve all data.
        for record in rdr.deserialize() {
            data.push(record?);
        }

        // let data = trailing_moving_average(&data, 0);

        // Retrieve all discords.
        let discords = crate::Anomaly::with(&data, DISCORD_SIZE)
            .use_algo(crate::Algorithm::Squeezer(0.85))
            // .sax_word_length(5)
            // .dim_reduce(800)
            // .use_slice(4000..)
            // .find_discords_min_dist(MIN_DIST);
            .find_n_largest_discords(DISCORD_AMNT);

        println!("{} DISCORDS FOUND!", discords.len());

        let indexes : Vec<_> = (1..=5000).collect();

        // Initialise plot with original data.
        let mut plot = Plot::new();
        let temps = Scatter::new(indexes.clone(), data.clone()).name("temps");
        plot.add_trace(temps);

        // Plots all discords.
        plot_discords(&mut plot, discords, &indexes, &data);
        // Attaches a title to the plot.
        attach_title(&mut plot, "Temperature 2019, Buchan Sydney");

        // Shows the plot.
        plot.show();

        Ok(())
    }

    // Plots all discords onto the passed in plot.
    fn plot_discords(plot: &mut Plot, discords: Vec<(f64, usize)>, x_axis: &Vec<u32>, data: &Vec<f64>) {
        for i in 0..discords.len() {
            let discord = discords[i];
            let loc = discord.1;
            let dist = discord.0;

            plot.add_trace(
                Scatter::new(
                    x_axis[loc..loc+DISCORD_SIZE].into(),
                    data[loc..loc+DISCORD_SIZE].into()
                ).name(&format!("Discord #{} ({:.2})", i, dist))
            );
        }
    }

    fn attach_title(plot: &mut Plot, title: &str) {
        plot.set_layout(
            Layout::new()
                .title(plotly::common::Title::new(title))
        );
    }

    /// Averages the data using a centered moving average.
    ///
    /// Data[i] = Mean(Data[i-n..i+n]);
    fn centered_moving_average(data: &Vec<f64>, n: usize) -> Vec<f64> {
        if n == 0 { return data.clone() }

        let mut out = Vec::new();
        let len = data.len();

        for i in 0..n {
            out.push(crate::mean(&&data[0..i]));
        }

        for i in n..len-n {
            out.push(crate::mean(&&data[i-n..i+n]));
        }

        for i in len-n..len {
            out.push(crate::mean(&&data[i-n..i]));
        }

        out
    }

    /// Averages the data using a trailing moving average.
    ///
    /// Data[i] = Mean(Data[i-n..i]);
    fn trailing_moving_average(data: &Vec<f64>, n: usize) -> Vec<f64> {
        if n == 0 { return data.clone() }

        let mut out = Vec::new();
        let len = data.len();

        for i in 0..n {
            out.push(crate::mean(&&data[0..i]));
        }

        for i in n..len {
            out.push(crate::mean(&&data[i-n..i]));
        }

        out
    }
}