asap_rs/smoothing/
adaptive_smoothing.rs

1//! Adaptive smoothing implementation for time series data.
2//!
3//! This module provides the core implementation of the ASAP algorithm that automatically
4//! determines the optimal window size for smoothing time series data to highlight important
5//! patterns while reducing noise.
6//!
7//! ## Algorithm Details
8//!
9//! The core ASAP algorithm works as follows:
10//!
11//! 1. For time series with strong periodicity, use autocorrelation to identify candidate
12//!    window sizes aligned with the periods in the data
13//!
14//! 2. For each candidate window size, apply a simple moving average and evaluate:
15//!    - The roughness of the result (standard deviation of differences)
16//!    - Whether kurtosis is preserved (to maintain the shape of the distribution)
17//!
18//! 3. Select the window size that minimizes roughness while preserving kurtosis
19//!
20//! 4. For data without strong periodicity, use binary search to find the optimal window size
21//!
22//! ## Pattern-Specific Handling
23//!
24//! This implementation includes specialized detection and handling for different types of patterns:
25//!
26//! ### Zigzag Patterns
27//!
28//! Zigzag patterns (rapidly alternating high-low values) present a special challenge for smoothing
29//! algorithms because:
30//!
31//! - They have extremely high roughness due to constant direction changes
32//! - Any smoothing dramatically alters their kurtosis
33//! - The strict kurtosis preservation constraint would prevent any effective smoothing
34//!
35//! The implementation detects zigzag patterns by analyzing:
36//!
37//! - The roughness relative to the data range
38//! - The percentage of points that alternate direction
39//!
40//! For detected zigzag patterns, the algorithm:
41//!
42//! - Uses a relaxed kurtosis constraint (allowing up to 40% reduction instead of 10%)
43//! - Prioritizes a window size of 2, which is mathematically optimal for alternating patterns
44//! - Ensures the resulting smoothed data effectively reduces the zigzag while preserving the mean
45//!
46//! ### Small Datasets
47//!
48//! For small datasets, the implementation uses a simplified approach with direct testing of
49//! a few window sizes rather than the full autocorrelation analysis, to avoid over-smoothing.
50
51
52use crate::statistics::Metrics;
53use crate::utils::ACF;
54use super::sma::sma;
55
56pub fn smooth(data: &[f64], resolution: usize) -> Vec<f64> {
57    // Return empty result for empty input
58    if data.is_empty() {
59        return Vec::new();
60    }
61    
62    let mut data = data.to_vec();
63    
64    // Ignore the last value if it's NaN
65    if data.last().map_or(false, |&x| x.is_nan()) {
66        data.pop();
67    }
68    
69    // If data is empty after removing NaN, return empty result
70    if data.is_empty() {
71        return Vec::new();
72    }
73
74    // For very small datasets relative to resolution, avoid over-smoothing
75    if data.len() <= resolution * 2 {
76        return data;
77    }
78
79    // Only perform preaggregation if we have significantly more points than resolution
80    let target_min_points = resolution.max(10);
81    
82    if data.len() > target_min_points * 10 {
83        // Calculate window size to achieve roughly target_min_points * 2
84        // This ensures we don't overly compress the data initially
85        let window = (data.len() / (target_min_points * 2)).max(1);
86        let slide = window; // Non-overlapping windows
87        data = sma(&data, window, slide);
88    }
89
90    // If we're down to very few points after preaggregation, just return
91    if data.len() <= 2 {
92        return data;
93    }
94
95    // Calculate metrics for original (potentially preaggregated) data
96    let metrics = Metrics::new(data.clone());
97    let original_kurt = metrics.kurtosis();
98    let original_roughness = metrics.roughness();
99    
100    // NEW: Check for zigzag pattern
101    let is_zigzag_pattern = detect_zigzag_pattern(&data, original_roughness);
102    
103    // Adjust kurtosis constraint based on data characteristics
104    // For zigzag patterns, we'll be more aggressive in smoothing
105    let kurtosis_threshold = if is_zigzag_pattern {
106        // Allow up to 40% reduction in kurtosis for zigzag patterns
107        original_kurt * 0.6
108    } else {
109        // Regular case - allow up to 10% reduction
110        original_kurt * 0.9
111    };
112
113    // For small datasets, use a simple approach
114    if data.len() < 20 {
115        // Try a few simple window sizes and pick the best one
116        let candidates = [2, 3, 5, 7];
117        let mut best_window = 1;
118        let mut min_roughness = original_roughness;
119        
120        for &w in &candidates {
121            if w >= data.len() {
122                continue;
123            }
124            
125            let smoothed = sma(&data, w, 1);
126            if smoothed.is_empty() {
127                continue;
128            }
129            
130            let metrics = Metrics::new(smoothed);
131            let roughness = metrics.roughness();
132            
133            // Only consider if kurtosis constraint is satisfied
134            if metrics.kurtosis() >= kurtosis_threshold && roughness < min_roughness {
135                min_roughness = roughness;
136                best_window = w;
137            }
138        }
139        
140        // If we found a better window, use it; otherwise return original
141        if best_window > 1 {
142            return sma(&data, best_window, 1);
143        }
144        
145        // Special case for zigzag patterns - if nothing worked above but it's a zigzag,
146        // force a window size of 2 to smooth it out
147        if is_zigzag_pattern {
148            return sma(&data, 2, 1);
149        }
150        
151        return data;
152    }
153
154    // ==== For larger datasets, use the ASAP algorithm with improvements ====
155    
156    // Calculate ACF with a safe max_lag (avoid empty data issues)
157    let max_lag = (data.len() / 10).max(1).min(50); // Cap at 50 to avoid excessive computation
158    let mut acf = ACF::new(data.clone(), max_lag);
159    let peaks = acf.find_peaks();
160    
161    // If no peaks found, use a reasonable default window
162    if peaks.is_empty() {
163        // ==== FIXED: More balanced default window size ====
164        let window_size = (data.len() / 10).max(2).min(data.len() / 2);
165        let smoothed = sma(&data, window_size, 1);
166        
167        // Check if this improves roughness while preserving kurtosis
168        let metrics = Metrics::new(smoothed.clone());
169        if metrics.kurtosis() >= kurtosis_threshold && metrics.roughness() < original_roughness {
170            return smoothed;
171        }
172        
173        // If zigzag, try with window size 2 (which specifically handles alternating patterns)
174        if is_zigzag_pattern {
175            let smoothed = sma(&data, 2, 1);
176            let metrics = Metrics::new(smoothed.clone());
177            if metrics.roughness() < original_roughness {
178                return smoothed;
179            }
180        }
181        
182        // If not, return original data
183        return data;
184    }
185    
186    // Initialize search variables
187    let mut min_obj = original_roughness;
188    let mut window_size = 1; // Start with original data (no smoothing)
189    let mut lb = 1;
190    let mut largest_feasible = usize::MAX;
191    let mut tail = (data.len() / 10).min(peaks[peaks.len() - 1]);
192
193    // Search peaks from largest to smallest with better constraints
194    for (i, &w) in peaks.iter().rev().enumerate() {
195        // Skip if window size is too large (prevents over-smoothing)
196        if w > data.len() / 2 {
197            continue;
198        }
199        
200        if w < lb || w == 1 {
201            break;
202        } else if (1.0 - acf.correlations[w]).sqrt() * window_size as f64 >
203                  (1.0 - acf.correlations[window_size]).sqrt() * w as f64 {
204            continue;
205        }
206
207        let smoothed = sma(&data, w, 1);
208        if smoothed.len() < 2 { // Skip if smoothed result is too small
209            continue;
210        }
211        
212        let metrics = Metrics::new(smoothed);
213        let roughness = metrics.roughness();
214
215        // Use the adjusted kurtosis threshold
216        if metrics.kurtosis() >= kurtosis_threshold {
217            if roughness < min_obj {
218                min_obj = roughness;
219                window_size = w;
220            }
221            // Update lower bound for future searches
222            let acf_ratio = ((acf.max_acf - 1.0) / (acf.correlations[w] - 1.0)).sqrt();
223            if acf_ratio.is_finite() && !acf_ratio.is_nan() {
224                lb = ((w as f64 * acf_ratio).round() as usize).max(lb);
225            }
226            if largest_feasible == usize::MAX {
227                largest_feasible = i;
228            }
229        }
230    }
231
232    // For zigzag patterns, if we haven't found a good window yet, force a window of 2
233    if window_size == 1 && is_zigzag_pattern {
234        window_size = 2;
235    }
236
237    // If we found a feasible window, refine with binary search
238    if window_size > 1 {
239        if largest_feasible != usize::MAX && largest_feasible < peaks.len().saturating_sub(2) {
240            tail = peaks[largest_feasible + 1];
241        }
242        lb = lb.max(if largest_feasible != usize::MAX { peaks[largest_feasible] + 1 } else { 1 });
243        
244        // Ensure binary search range is reasonable
245        if lb < tail && lb < data.len() / 2 {
246            window_size = binary_search(lb, tail, &data, min_obj, kurtosis_threshold, window_size, is_zigzag_pattern);
247        }
248    } else {
249        // If no feasible window found in peak search, try a conservative default
250        let default_window = (data.len() / 20).max(2).min(data.len() / 3);
251        let smoothed = sma(&data, default_window, 1);
252        
253        if !smoothed.is_empty() {
254            let metrics = Metrics::new(smoothed.clone());
255            if metrics.kurtosis() >= kurtosis_threshold && metrics.roughness() < original_roughness {
256                return smoothed;
257            }
258        }
259        
260        // If still no improvement but it's a zigzag pattern, try window size 2
261        if is_zigzag_pattern {
262            let smoothed = sma(&data, 2, 1);
263            if !smoothed.is_empty() {
264                let metrics = Metrics::new(smoothed.clone());
265                if metrics.roughness() < original_roughness {
266                    return smoothed;
267                }
268            }
269        }
270        
271        // If still no improvement, return original data
272        return data;
273    }
274
275    // Ensure window_size is reasonable and won't over-smooth
276    window_size = window_size.max(2).min(data.len() / 3);
277    
278    // Final smoothing with selected window
279    let smoothed = sma(&data, window_size, 1);
280
281    // Calculate metrics for original and smoothed data to compare roughness
282    let original_roughness = Metrics::new(data.clone()).roughness();
283    let smoothed_roughness = Metrics::new(smoothed.clone()).roughness();
284
285    // If smoothing made roughness worse or result is too small, return original data
286    if smoothed.len() < 2 || smoothed_roughness >= original_roughness * 1.1 {
287        return data;
288    }
289
290    smoothed
291}
292
293// Helper function to detect zigzag patterns
294fn detect_zigzag_pattern(data: &[f64], roughness: f64) -> bool {
295    // If data is too short, it's not meaningful to check
296    if data.len() < 4 {
297        return false;
298    }
299    
300    // Zigzag patterns have very high roughness relative to the range of values
301    let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
302    let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
303    let data_range = max_val - min_val;
304    
305    // If range is effectively zero, it's not a zigzag
306    if data_range < 1e-10 {
307        return false;
308    }
309    
310    // Check for alternating pattern (values go up, down, up, down...)
311    let mut alternating_count = 0;
312    for i in 1..data.len()-1 {
313        let prev_diff = data[i] - data[i-1];
314        let next_diff = data[i+1] - data[i];
315        if prev_diff * next_diff < 0.0 {  // Different signs
316            alternating_count += 1;
317        }
318    }
319    
320    let alternating_ratio = alternating_count as f64 / (data.len() - 2) as f64;
321    
322    // Zigzag patterns have:
323    // 1. High roughness relative to data range
324    // 2. High percentage of alternating points
325    roughness > data_range * 0.5 && alternating_ratio > 0.7
326}
327
328fn binary_search(mut head: usize, mut tail: usize, data: &[f64], mut min_obj: f64, kurtosis_threshold: f64, mut window_size: usize, is_zigzag: bool) -> usize {
329    // Handle edge cases
330    if data.is_empty() || head > data.len() || tail > data.len() {
331        return window_size;
332    }
333    
334    head = head.min(data.len() / 2); // Prevent window sizes larger than half the data
335    tail = tail.min(data.len() / 2);
336    
337    // Skip search if head > tail
338    if head > tail {
339        return window_size;
340    }
341
342    // For zigzag patterns, prefer window size 2 if it works
343    if is_zigzag && head <= 2 && 2 <= tail {
344        let smoothed = sma(data, 2, 1);
345        if !smoothed.is_empty() {
346            let metrics = Metrics::new(smoothed);
347            if metrics.roughness() < min_obj {
348                return 2;
349            }
350        }
351    }
352    
353    while head <= tail {
354        let w = (head + tail) / 2;
355        let smoothed = sma(data, w, 1);
356        
357        // Skip if smoothed is empty or too small
358        if smoothed.len() < 2 {
359            break;
360        }
361        
362        let metrics = Metrics::new(smoothed);
363        // Use the adjusted kurtosis threshold
364        if metrics.kurtosis() >= kurtosis_threshold {
365            // Search second half if feasible
366            let roughness = metrics.roughness();
367            if roughness < min_obj {
368                window_size = w;
369                min_obj = roughness;
370            }
371            head = w + 1;
372        } else {
373            // Search first half
374            tail = w.saturating_sub(1);
375        }
376    }
377    
378    window_size
379}