asap_rs/smoothing/adaptive_smoothing.rs
1//! Adaptive smoothing implementation for time series data.
2//!
3//! This module provides the core implementation of the ASAP algorithm that automatically
4//! determines the optimal window size for smoothing time series data to highlight important
5//! patterns while reducing noise.
6//!
7//! ## Algorithm Details
8//!
9//! The core ASAP algorithm works as follows:
10//!
11//! 1. For time series with strong periodicity, use autocorrelation to identify candidate
12//! window sizes aligned with the periods in the data
13//!
14//! 2. For each candidate window size, apply a simple moving average and evaluate:
15//! - The roughness of the result (standard deviation of differences)
16//! - Whether kurtosis is preserved (to maintain the shape of the distribution)
17//!
18//! 3. Select the window size that minimizes roughness while preserving kurtosis
19//!
20//! 4. For data without strong periodicity, use binary search to find the optimal window size
21//!
22//! ## Pattern-Specific Handling
23//!
24//! This implementation includes specialized detection and handling for different types of patterns:
25//!
26//! ### Zigzag Patterns
27//!
28//! Zigzag patterns (rapidly alternating high-low values) present a special challenge for smoothing
29//! algorithms because:
30//!
31//! - They have extremely high roughness due to constant direction changes
32//! - Any smoothing dramatically alters their kurtosis
33//! - The strict kurtosis preservation constraint would prevent any effective smoothing
34//!
35//! The implementation detects zigzag patterns by analyzing:
36//!
37//! - The roughness relative to the data range
38//! - The percentage of points that alternate direction
39//!
40//! For detected zigzag patterns, the algorithm:
41//!
42//! - Uses a relaxed kurtosis constraint (allowing up to 40% reduction instead of 10%)
43//! - Prioritizes a window size of 2, which is mathematically optimal for alternating patterns
44//! - Ensures the resulting smoothed data effectively reduces the zigzag while preserving the mean
45//!
46//! ### Small Datasets
47//!
48//! For small datasets, the implementation uses a simplified approach with direct testing of
49//! a few window sizes rather than the full autocorrelation analysis, to avoid over-smoothing.
50
51
52use crate::statistics::Metrics;
53use crate::utils::ACF;
54use super::sma::sma;
55
56pub fn smooth(data: &[f64], resolution: usize) -> Vec<f64> {
57 // Return empty result for empty input
58 if data.is_empty() {
59 return Vec::new();
60 }
61
62 let mut data = data.to_vec();
63
64 // Ignore the last value if it's NaN
65 if data.last().map_or(false, |&x| x.is_nan()) {
66 data.pop();
67 }
68
69 // If data is empty after removing NaN, return empty result
70 if data.is_empty() {
71 return Vec::new();
72 }
73
74 // For very small datasets relative to resolution, avoid over-smoothing
75 if data.len() <= resolution * 2 {
76 return data;
77 }
78
79 // Only perform preaggregation if we have significantly more points than resolution
80 let target_min_points = resolution.max(10);
81
82 if data.len() > target_min_points * 10 {
83 // Calculate window size to achieve roughly target_min_points * 2
84 // This ensures we don't overly compress the data initially
85 let window = (data.len() / (target_min_points * 2)).max(1);
86 let slide = window; // Non-overlapping windows
87 data = sma(&data, window, slide);
88 }
89
90 // If we're down to very few points after preaggregation, just return
91 if data.len() <= 2 {
92 return data;
93 }
94
95 // Calculate metrics for original (potentially preaggregated) data
96 let metrics = Metrics::new(data.clone());
97 let original_kurt = metrics.kurtosis();
98 let original_roughness = metrics.roughness();
99
100 // NEW: Check for zigzag pattern
101 let is_zigzag_pattern = detect_zigzag_pattern(&data, original_roughness);
102
103 // Adjust kurtosis constraint based on data characteristics
104 // For zigzag patterns, we'll be more aggressive in smoothing
105 let kurtosis_threshold = if is_zigzag_pattern {
106 // Allow up to 40% reduction in kurtosis for zigzag patterns
107 original_kurt * 0.6
108 } else {
109 // Regular case - allow up to 10% reduction
110 original_kurt * 0.9
111 };
112
113 // For small datasets, use a simple approach
114 if data.len() < 20 {
115 // Try a few simple window sizes and pick the best one
116 let candidates = [2, 3, 5, 7];
117 let mut best_window = 1;
118 let mut min_roughness = original_roughness;
119
120 for &w in &candidates {
121 if w >= data.len() {
122 continue;
123 }
124
125 let smoothed = sma(&data, w, 1);
126 if smoothed.is_empty() {
127 continue;
128 }
129
130 let metrics = Metrics::new(smoothed);
131 let roughness = metrics.roughness();
132
133 // Only consider if kurtosis constraint is satisfied
134 if metrics.kurtosis() >= kurtosis_threshold && roughness < min_roughness {
135 min_roughness = roughness;
136 best_window = w;
137 }
138 }
139
140 // If we found a better window, use it; otherwise return original
141 if best_window > 1 {
142 return sma(&data, best_window, 1);
143 }
144
145 // Special case for zigzag patterns - if nothing worked above but it's a zigzag,
146 // force a window size of 2 to smooth it out
147 if is_zigzag_pattern {
148 return sma(&data, 2, 1);
149 }
150
151 return data;
152 }
153
154 // ==== For larger datasets, use the ASAP algorithm with improvements ====
155
156 // Calculate ACF with a safe max_lag (avoid empty data issues)
157 let max_lag = (data.len() / 10).max(1).min(50); // Cap at 50 to avoid excessive computation
158 let mut acf = ACF::new(data.clone(), max_lag);
159 let peaks = acf.find_peaks();
160
161 // If no peaks found, use a reasonable default window
162 if peaks.is_empty() {
163 // ==== FIXED: More balanced default window size ====
164 let window_size = (data.len() / 10).max(2).min(data.len() / 2);
165 let smoothed = sma(&data, window_size, 1);
166
167 // Check if this improves roughness while preserving kurtosis
168 let metrics = Metrics::new(smoothed.clone());
169 if metrics.kurtosis() >= kurtosis_threshold && metrics.roughness() < original_roughness {
170 return smoothed;
171 }
172
173 // If zigzag, try with window size 2 (which specifically handles alternating patterns)
174 if is_zigzag_pattern {
175 let smoothed = sma(&data, 2, 1);
176 let metrics = Metrics::new(smoothed.clone());
177 if metrics.roughness() < original_roughness {
178 return smoothed;
179 }
180 }
181
182 // If not, return original data
183 return data;
184 }
185
186 // Initialize search variables
187 let mut min_obj = original_roughness;
188 let mut window_size = 1; // Start with original data (no smoothing)
189 let mut lb = 1;
190 let mut largest_feasible = usize::MAX;
191 let mut tail = (data.len() / 10).min(peaks[peaks.len() - 1]);
192
193 // Search peaks from largest to smallest with better constraints
194 for (i, &w) in peaks.iter().rev().enumerate() {
195 // Skip if window size is too large (prevents over-smoothing)
196 if w > data.len() / 2 {
197 continue;
198 }
199
200 if w < lb || w == 1 {
201 break;
202 } else if (1.0 - acf.correlations[w]).sqrt() * window_size as f64 >
203 (1.0 - acf.correlations[window_size]).sqrt() * w as f64 {
204 continue;
205 }
206
207 let smoothed = sma(&data, w, 1);
208 if smoothed.len() < 2 { // Skip if smoothed result is too small
209 continue;
210 }
211
212 let metrics = Metrics::new(smoothed);
213 let roughness = metrics.roughness();
214
215 // Use the adjusted kurtosis threshold
216 if metrics.kurtosis() >= kurtosis_threshold {
217 if roughness < min_obj {
218 min_obj = roughness;
219 window_size = w;
220 }
221 // Update lower bound for future searches
222 let acf_ratio = ((acf.max_acf - 1.0) / (acf.correlations[w] - 1.0)).sqrt();
223 if acf_ratio.is_finite() && !acf_ratio.is_nan() {
224 lb = ((w as f64 * acf_ratio).round() as usize).max(lb);
225 }
226 if largest_feasible == usize::MAX {
227 largest_feasible = i;
228 }
229 }
230 }
231
232 // For zigzag patterns, if we haven't found a good window yet, force a window of 2
233 if window_size == 1 && is_zigzag_pattern {
234 window_size = 2;
235 }
236
237 // If we found a feasible window, refine with binary search
238 if window_size > 1 {
239 if largest_feasible != usize::MAX && largest_feasible < peaks.len().saturating_sub(2) {
240 tail = peaks[largest_feasible + 1];
241 }
242 lb = lb.max(if largest_feasible != usize::MAX { peaks[largest_feasible] + 1 } else { 1 });
243
244 // Ensure binary search range is reasonable
245 if lb < tail && lb < data.len() / 2 {
246 window_size = binary_search(lb, tail, &data, min_obj, kurtosis_threshold, window_size, is_zigzag_pattern);
247 }
248 } else {
249 // If no feasible window found in peak search, try a conservative default
250 let default_window = (data.len() / 20).max(2).min(data.len() / 3);
251 let smoothed = sma(&data, default_window, 1);
252
253 if !smoothed.is_empty() {
254 let metrics = Metrics::new(smoothed.clone());
255 if metrics.kurtosis() >= kurtosis_threshold && metrics.roughness() < original_roughness {
256 return smoothed;
257 }
258 }
259
260 // If still no improvement but it's a zigzag pattern, try window size 2
261 if is_zigzag_pattern {
262 let smoothed = sma(&data, 2, 1);
263 if !smoothed.is_empty() {
264 let metrics = Metrics::new(smoothed.clone());
265 if metrics.roughness() < original_roughness {
266 return smoothed;
267 }
268 }
269 }
270
271 // If still no improvement, return original data
272 return data;
273 }
274
275 // Ensure window_size is reasonable and won't over-smooth
276 window_size = window_size.max(2).min(data.len() / 3);
277
278 // Final smoothing with selected window
279 let smoothed = sma(&data, window_size, 1);
280
281 // Calculate metrics for original and smoothed data to compare roughness
282 let original_roughness = Metrics::new(data.clone()).roughness();
283 let smoothed_roughness = Metrics::new(smoothed.clone()).roughness();
284
285 // If smoothing made roughness worse or result is too small, return original data
286 if smoothed.len() < 2 || smoothed_roughness >= original_roughness * 1.1 {
287 return data;
288 }
289
290 smoothed
291}
292
293// Helper function to detect zigzag patterns
294fn detect_zigzag_pattern(data: &[f64], roughness: f64) -> bool {
295 // If data is too short, it's not meaningful to check
296 if data.len() < 4 {
297 return false;
298 }
299
300 // Zigzag patterns have very high roughness relative to the range of values
301 let max_val = data.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
302 let min_val = data.iter().fold(f64::INFINITY, |a, &b| a.min(b));
303 let data_range = max_val - min_val;
304
305 // If range is effectively zero, it's not a zigzag
306 if data_range < 1e-10 {
307 return false;
308 }
309
310 // Check for alternating pattern (values go up, down, up, down...)
311 let mut alternating_count = 0;
312 for i in 1..data.len()-1 {
313 let prev_diff = data[i] - data[i-1];
314 let next_diff = data[i+1] - data[i];
315 if prev_diff * next_diff < 0.0 { // Different signs
316 alternating_count += 1;
317 }
318 }
319
320 let alternating_ratio = alternating_count as f64 / (data.len() - 2) as f64;
321
322 // Zigzag patterns have:
323 // 1. High roughness relative to data range
324 // 2. High percentage of alternating points
325 roughness > data_range * 0.5 && alternating_ratio > 0.7
326}
327
328fn binary_search(mut head: usize, mut tail: usize, data: &[f64], mut min_obj: f64, kurtosis_threshold: f64, mut window_size: usize, is_zigzag: bool) -> usize {
329 // Handle edge cases
330 if data.is_empty() || head > data.len() || tail > data.len() {
331 return window_size;
332 }
333
334 head = head.min(data.len() / 2); // Prevent window sizes larger than half the data
335 tail = tail.min(data.len() / 2);
336
337 // Skip search if head > tail
338 if head > tail {
339 return window_size;
340 }
341
342 // For zigzag patterns, prefer window size 2 if it works
343 if is_zigzag && head <= 2 && 2 <= tail {
344 let smoothed = sma(data, 2, 1);
345 if !smoothed.is_empty() {
346 let metrics = Metrics::new(smoothed);
347 if metrics.roughness() < min_obj {
348 return 2;
349 }
350 }
351 }
352
353 while head <= tail {
354 let w = (head + tail) / 2;
355 let smoothed = sma(data, w, 1);
356
357 // Skip if smoothed is empty or too small
358 if smoothed.len() < 2 {
359 break;
360 }
361
362 let metrics = Metrics::new(smoothed);
363 // Use the adjusted kurtosis threshold
364 if metrics.kurtosis() >= kurtosis_threshold {
365 // Search second half if feasible
366 let roughness = metrics.roughness();
367 if roughness < min_obj {
368 window_size = w;
369 min_obj = roughness;
370 }
371 head = w + 1;
372 } else {
373 // Search first half
374 tail = w.saturating_sub(1);
375 }
376 }
377
378 window_size
379}