fekan/kan_layer.rs
1// #![allow(dead_code)]
2pub(crate) mod edge;
3
4use crate::layer_errors::LayerError;
5use edge::{linspace, Edge};
6use log::{debug, trace};
7use rand::distributions::Distribution;
8use rand::thread_rng;
9use serde::{Deserialize, Serialize};
10use statrs::distribution::Normal; // apparently the statrs distributions use the rand Distribution trait
11
12use std::{
13 collections::VecDeque,
14 sync::{Arc, Mutex},
15 thread::{self, ScopedJoinHandle},
16 vec,
17};
18
/// A layer in a Kolmogorov-Arnold neural Network (KAN)
///
/// A KAN layer consists of a number of nodes equal to the output dimension of the layer.
/// Each node has a number of incoming edges equal to the input dimension of the layer, and each edge holds a B-spline that operates on the value travelling down the edge

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct KanLayer {
    // I think it will make sense to have each KanLayer be a vector of splines, plus the input and output dimension.
    // the first `out_dim` splines will read from the first input, the second `out_dim` splines will read from the second input, etc., with `in_dim` such chunks
    // to calculate the output of the layer, the first element is the sum of the output of splines 0, out_dim, 2*out_dim, etc., the second element is the sum of splines 1, out_dim+1, 2*out_dim+1, etc.
    /// the splines in this layer. The first `output_dimension` splines read from the first "input node", the second `output_dimension` splines read from the second "input node", etc.
    /// /-1--X /4---X
    /// O - 2 O /
    /// \ \-X / /-X
    /// O \ O ---5-
    /// 3--X \--6---X
    pub(crate) splines: Vec<Edge>,
    /// width of a single preactivation vector this layer accepts
    input_dimension: usize,
    /// number of nodes in this layer; width of a single activation vector it produces
    output_dimension: usize,
    /// a vector of previous inputs to the layer, used to update the knot vectors for each incoming edge.
    ///
    /// dim0 = number of samples
    ///
    /// dim1 = input_dimension
    #[serde(skip)] // part of the layer's operating state, not part of the model
    samples: Vec<Vec<f64>>,

    /// mean of the per-edge L1 norms, cached by the most recent forward pass and
    /// consumed (reset to `None`) by the backward pass
    #[serde(skip)] // part of the layer's operating state, not part of the model
    layer_l1: Option<f64>,
}
49
/// Hyperparameters for a KanLayer
///
/// # Examples
/// see [`KanLayer::new`]
#[derive(Debug, Copy, Clone)]
#[allow(missing_docs)]
pub struct KanLayerOptions {
    /// number of values in each incoming preactivation vector
    pub input_dimension: usize,
    /// number of nodes in the layer / length of each output vector
    pub output_dimension: usize,
    /// degree of the B-spline on every edge
    pub degree: usize,
    /// number of control points (coefficients) per B-spline
    pub coef_size: usize,
}
62
63impl KanLayer {
64 /// create a new layer with `output_dimension` nodes in this layer that each expect an `input_dimension`-long preactivation vector.
65 ///
66 /// All incoming edges will be created with a degree `degree` B-spline and `coef_size` control points.
67 ///
68 /// All B-splines areinitialized with coefficients drawn from astandard normal distribution, and with
69 /// `degree + coef_size + 1` knots evenly spaced between -1.0 and 1.0. Because knots are always initialized to span the range [-1, 1], make sure you call [`KanLayer::update_knots_from_samples`] regularly during training, or at least after a good portion of the training data has been passed through the model, to ensure that the layer's supported input range covers the range spanned by the training data.
70 /// # Warning
71 /// If you plan on ever calling [`KanLayer::update_knots_from_samples`] on your layer, make sure coef_size >= 2 * degree + 1. [`KanLayer::update_knots_from_samples`] reserves the first and last `degree` knots as "padding", and you will get NaNs when you call [`KanLayer::forward`] after updating knots if there aren't enough "non-padding" knots
72 ///
73 /// If you don't plan on calling [`KanLayer::update_knots_from_samples`], any coef_size >= degree + 1 should be fine
74 /// # Examples
75 /// ```
76 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
77 /// let input_dimension = 3;
78 /// let output_dimension = 4;
79 /// let layer_options = KanLayerOptions {
80 /// input_dimension,
81 /// output_dimension,
82 /// degree: 3,
83 /// coef_size: 6,
84 /// };
85 /// let my_layer = KanLayer::new(&layer_options);
86 /// assert_eq!(my_layer.total_edges(), output_dimension * input_dimension);
87 /// ```
88 pub fn new(options: &KanLayerOptions) -> Self {
89 let num_edges = options.input_dimension * options.output_dimension;
90 let num_knots = options.coef_size + options.degree + 1;
91 let normal_dist = Normal::new(0.0, 0.1).expect("unable to create normal distribution");
92 let mut randomness = thread_rng();
93 let splines = (0..num_edges)
94 .map(|_| {
95 let coefficients: Vec<f64> = (0..options.coef_size)
96 .map(|_| normal_dist.sample(&mut randomness) as f64)
97 .collect();
98 Edge::new(options.degree, coefficients, linspace(-1.0, 1.0, num_knots))
99 .expect("spline creation error")
100 })
101 .collect();
102
103 KanLayer {
104 splines,
105 input_dimension: options.input_dimension,
106 output_dimension: options.output_dimension,
107 samples: Vec::new(),
108 layer_l1: None,
109 }
110 }
111
112 // pub fn len(&self) -> usize {
113 // self.nodes.len()
114 // }
115
116 // pub fn total_edges(&self) -> usize {
117 // self.nodes.len() * self.nodes[0].0.len()
118 // }
119
120 /// calculate the activations of the nodes in this layer given the preactivations.
121 /// This operation mutates internal state, which will be read in [`KanLayer::backward()`] and [`KanLayer::update_knots_from_samples()`]
122 ///
123 /// each vector in `preactivations` should be of length `input_dimension`, and each vector in the output will have length `output_dimension`
124 /// # Errors
125 /// Returns an [`LayerError`] if
126 /// * the length of any `preactivation` in `preactivations` is not equal to the input_dimension this layer
127 /// * the output would contain NaNs.
128 ///
129 /// See [`LayerError`] for more information
130 ///
131 /// # Examples
132 /// ```
133 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
134 /// let input_dimension = 3;
135 /// let output_dimension = 4;
136 /// let layer_options = KanLayerOptions {
137 /// input_dimension,
138 /// output_dimension,
139 /// degree: 5,
140 /// coef_size: 6,
141 /// };
142 /// let mut my_layer = KanLayer::new(&layer_options);
143 /// let preacts = vec![vec![0.0; input_dimension], vec![0.5; input_dimension]];
144 /// let acts = my_layer.forward(preacts)?;
145 /// assert_eq!(acts.len(), 2);
146 /// assert_eq!(acts[0].len(), output_dimension);
147 /// # Ok::<(), fekan::layer_errors::LayerError>(())
148 /// ```
149 pub fn forward(&mut self, preactivations: Vec<Vec<f64>>) -> Result<Vec<Vec<f64>>, LayerError> {
150 let num_inputs = preactivations.len(); // grab this value, since we're about to move the preactivations into the internal cache
151 self.forward_preamble(preactivations)?;
152 // preactivations dim0 = number of samples, dim1 = input_dimension
153
154 // clone the last `num_inputs` preactivations from the internal cache (since we just moved them in), then transpose them so that dim0 = input_dimension, dim1 = number of samples
155 let mut transposed_preacts = vec![vec![0.0; num_inputs]; self.input_dimension];
156 for i in 0..num_inputs {
157 for j in 0..self.input_dimension {
158 transposed_preacts[j][i] = self.samples[i][j];
159 }
160 }
161 // go output-node-by-output-node so we can sum as we go and reduce memory usage
162 // dim0 = num_inputs, dim1 = output_dimension
163 let mut activations = vec![vec![0.0; self.output_dimension]; num_inputs];
164
165 // not the cleanest implementation maybe, but it'll work
166 for edge_index in 0..self.splines.len() {
167 trace!("Calculating activations for edge {}", edge_index);
168 let in_node_idx = edge_index / self.output_dimension;
169 let out_node_idx = edge_index % self.output_dimension;
170 let sample_wise_outputs =
171 self.splines[edge_index].forward(&transposed_preacts[in_node_idx]);
172 if sample_wise_outputs.iter().any(|v| v.is_nan()) {
173 return Err(LayerError::nans_in_activations(
174 edge_index,
175 sample_wise_outputs,
176 self.splines[edge_index].clone(),
177 ));
178 }
179 for sample_idx in 0..num_inputs {
180 activations[sample_idx][out_node_idx] += sample_wise_outputs[sample_idx];
181 }
182 }
183 self.layer_l1 = Some(
184 self.splines
185 .iter()
186 .map(|s| {
187 s.l1_norm()
188 .expect("edges should have L1 norm stored after forward pass")
189 })
190 .sum::<f64>()
191 / self.splines.len() as f64,
192 );
193 trace!("Activations: {:?}", activations);
194 Ok(activations)
195 }
196
197 /// As [`KanLayer::forward`], but divides the work among the passed number of threads
198 pub fn forward_multithreaded(
199 &mut self,
200 preactivations: Vec<Vec<f64>>,
201 num_threads: usize,
202 ) -> Result<Vec<Vec<f64>>, LayerError> {
203 let num_samples = preactivations.len(); // grab this value, since we're about to move the preactivations into the internal cache
204 self.forward_preamble(preactivations)?;
205 // clone the last `num_inputs` preactivations from the internal cache (since we just moved them in), then transpose them so that dim0 = input_dimension, dim1 = number of samples
206 let mut transposed_preacts = vec![vec![0.0; num_samples]; self.input_dimension];
207 for i in 0..num_samples {
208 for j in 0..self.input_dimension {
209 transposed_preacts[j][i] = self.samples[i][j];
210 }
211 }
212 // dim0 = num_samples, dim1 = output_dimension
213 let activations = Arc::new(Mutex::new(vec![
214 vec![0.0; self.output_dimension];
215 num_samples
216 ]));
217
218 /* spawn new threads, give those threads ownership of chunks of splines (so they don't have to worry about locking spline chaches),
219 * a reference to the transposed preacts, and the activations vector behnd a mutex.
220 *
221 * The threads will need to return ownership of the edges; as long as we join the threads in the same order we spawned them,
222 * we should be able to reassemble the splines in the correct order without issue
223 */
224 let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
225 let output_dimension = self.output_dimension;
226 let threaded_result: Result<(), LayerError> = thread::scope(|s| {
227 // since the threads will be adding to the activations vector themselves, they don't need to return anything but the edges they took ownership of
228 let handles: Vec<ScopedJoinHandle<Result<Vec<Edge>, LayerError>>> = (0..num_threads)
229 .map(|thread_idx| {
230 let edges_for_thread: Vec<Edge> = self
231 .splines
232 .drain(0..edges_per_thread.min(self.splines.len()))
233 .collect();
234 let transposed_preacts = &transposed_preacts;
235 let threaded_activations = Arc::clone(&activations);
236 let thread_result = s.spawn(move || {
237 let mut thread_edges = edges_for_thread; // just to explicitly move the edges into the thread
238 for edge_index in 0..thread_edges.len() {
239 let this_edge: &mut Edge = &mut thread_edges[edge_index];
240 let true_edge_index = thread_idx * edges_per_thread + edge_index;
241 let in_node_idx = true_edge_index / output_dimension;
242 let out_node_idx = true_edge_index % output_dimension;
243 let sample_wise_outputs =
244 this_edge.forward(&transposed_preacts[in_node_idx]);
245
246 if sample_wise_outputs.iter().any(|v| v.is_nan()) {
247 return Err(LayerError::nans_in_activations(
248 edge_index,
249 sample_wise_outputs,
250 this_edge.clone(),
251 ));
252 }
253
254 let mut mutex_acquired_activations =
255 threaded_activations.lock().unwrap();
256 for sample_idx in 0..num_samples {
257 let sample_activations =
258 &mut mutex_acquired_activations[sample_idx];
259 let out_node = &mut sample_activations[out_node_idx];
260 let edge_output = sample_wise_outputs[sample_idx];
261 *out_node += edge_output;
262 }
263 }
264 Ok(thread_edges) // give the edges back once we're done
265 });
266 thread_result
267 })
268 .collect();
269 // reassmble the splines in the correct order
270 for handle in handles {
271 let thread_result = handle.join().unwrap()?;
272 self.splines.extend(thread_result);
273 }
274 Ok(())
275 });
276 threaded_result?;
277 self.layer_l1 = Some(
278 self.splines
279 .iter()
280 .map(|s| {
281 s.l1_norm()
282 .expect("edges should have L1 norm stored after forward pass")
283 })
284 .sum::<f64>()
285 / self.splines.len() as f64,
286 );
287 let result = activations.lock().unwrap().clone();
288 Ok(result)
289 }
290
291 /// check the length of the preactivation vector and save the inputs to the internal cache
292 fn forward_preamble(&mut self, preactivations: Vec<Vec<f64>>) -> Result<(), LayerError> {
293 for preact in preactivations.iter() {
294 if preact.len() != self.input_dimension {
295 return Err(LayerError::missized_preacts(
296 preact.len(),
297 self.input_dimension,
298 ));
299 }
300 }
301 // the preactivations to the internal cache. We'll work from this cache during forward and backward passes
302 let mut preactivations = preactivations;
303 self.samples.append(&mut preactivations);
304 Ok(())
305 }
306
307 /// as [KanLayer::forward], but does not accumulate any internal state
308 ///
309 /// This method should be used when the model is not being trained, for example during inference or validation: when you won't be backpropogating, this method is faster uses less memory than [`KanLayer::forward`]
310 ///
311 /// # Errors
312 /// Returns a [`LayerError`] if...
313 /// * the length of `preactivation` is not equal to the input_dimension this layer
314 /// * the output would contain NaNs.
315
316 pub fn infer(&self, preactivations: &[Vec<f64>]) -> Result<Vec<Vec<f64>>, LayerError> {
317 for preactivation in preactivations.iter() {
318 if preactivation.len() != self.input_dimension {
319 return Err(LayerError::missized_preacts(
320 preactivation.len(),
321 self.input_dimension,
322 ));
323 }
324 }
325 let num_inputs = preactivations.len();
326 let mut transposed_preacts = vec![vec![0.0; num_inputs]; self.input_dimension];
327 for i in 0..num_inputs {
328 for j in 0..self.input_dimension {
329 transposed_preacts[j][i] = preactivations[i][j];
330 }
331 }
332 let mut activations = vec![vec![0.0; self.output_dimension]; num_inputs];
333 for edge_index in 0..self.splines.len() {
334 let in_node_idx = edge_index / self.output_dimension;
335 let out_node_idx = edge_index % self.output_dimension;
336 let sample_wise_outputs =
337 self.splines[edge_index].infer(&transposed_preacts[in_node_idx]);
338 for sample_idx in 0..num_inputs {
339 activations[sample_idx][out_node_idx] += sample_wise_outputs[sample_idx];
340 }
341 }
342
343 Ok(activations)
344 }
345
346 /// as [`KanLayer::infer`], but divides the work among the passed number of threads
347 pub fn infer_multithreaded(
348 &self,
349 preactivations: &[Vec<f64>],
350 num_threads: usize,
351 ) -> Result<Vec<Vec<f64>>, LayerError> {
352 for preactivation in preactivations.iter() {
353 if preactivation.len() != self.input_dimension {
354 return Err(LayerError::missized_preacts(
355 preactivation.len(),
356 self.input_dimension,
357 ));
358 }
359 }
360 let num_inputs = preactivations.len();
361 let mut transposed_preacts = vec![vec![0.0; num_inputs]; self.input_dimension];
362 for i in 0..num_inputs {
363 for j in 0..self.input_dimension {
364 transposed_preacts[j][i] = preactivations[i][j];
365 }
366 }
367
368 let activations = Arc::new(Mutex::new(vec![
369 vec![0.0; self.output_dimension];
370 num_inputs
371 ]));
372
373 let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
374 let output_dimension = self.output_dimension;
375 let num_samples = preactivations.len();
376 let threaded_result: Result<(), LayerError> = thread::scope(|s| {
377 // since the threads will be adding to the activations vector themselves, they don't need to return anything but the edges they took ownership of
378 let handles: Vec<ScopedJoinHandle<Result<(), LayerError>>> = (0..num_threads)
379 .map(|thread_idx| {
380 let edges_for_thread: &[Edge] = &self.splines
381 [thread_idx * edges_per_thread..(thread_idx + 1) * edges_per_thread];
382 let transposed_preacts = &transposed_preacts;
383 let threaded_activations = Arc::clone(&activations);
384 let thread_result = s.spawn(move || {
385 let thread_edges = edges_for_thread; // just to explicitly move the edges into the thread
386 for edge_index in 0..thread_edges.len() {
387 let this_edge: &Edge = &thread_edges[edge_index];
388 let true_edge_index = thread_idx * edges_per_thread + edge_index;
389 let in_node_idx = true_edge_index / output_dimension;
390 let out_node_idx = true_edge_index % output_dimension;
391 let sample_wise_outputs =
392 this_edge.infer(&transposed_preacts[in_node_idx]);
393
394 if sample_wise_outputs.iter().any(|v| v.is_nan()) {
395 return Err(LayerError::nans_in_activations(
396 edge_index,
397 sample_wise_outputs,
398 this_edge.clone(),
399 ));
400 }
401
402 let mut mutex_acquired_activations =
403 threaded_activations.lock().unwrap();
404 for sample_idx in 0..num_samples {
405 let sample_activations =
406 &mut mutex_acquired_activations[sample_idx];
407 let out_node = &mut sample_activations[out_node_idx];
408 let edge_output = sample_wise_outputs[sample_idx];
409 *out_node += edge_output;
410 }
411 }
412 Ok(()) // give the edges back once we're done
413 });
414 thread_result
415 })
416 .collect();
417 // reassmble the splines in the correct order
418 for handle in handles {
419 handle.join().unwrap()?;
420 }
421 Ok(())
422 });
423 threaded_result?;
424
425 let result = activations.lock().unwrap().clone();
426 Ok(result)
427 }
428 /// Using samples memoized by [`KanLayer::forward`], update the knot vectors for each incoming edge in this layer.
429 ///
430 /// When `knot_adaptivity` is 0, the new knot vectors will be uniformly distributed over the range spanned by the samples;
431 /// when `knot_adaptivity` is 1, the new knots will be placed at the quantiles of the samples. 0 < `knot_adaptivity` < 1 will interpolate between these two extremes.
432 ///
433 /// ## Warning
434 /// calling this function with `knot_adaptivity = 1` can result in a large number of knots being placed at the same value, which can cause [`KanLayer::forward`] to output NaNs. In practice, `knot_adaptivity` should be set to something like 0.1, but anything < 1.0 should be fine
435 ///
436 /// calling this function with fewer samples than the number of knots in a spline AND `knot_adaptivity` > 0 results in undefined behavior
437 ///
438 /// # Errors
439 /// Returns an error if the layer has no memoized samples, which most likely means that [`KanLayer::forward`] has not been called since initialization or the last call to [`KanLayer::clear_samples`]
440 ///
441 /// # Examples
442 ///
443 /// Update the knots of a model every few samples during training, to make sure that the supported input range of a given layer covers the output range of the previous layer.
444 /// ```
445 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
446 ///
447 /// # let input_size = 5;
448 /// # let output_size = 3;
449 /// # let layer_options = KanLayerOptions {input_dimension: 5,output_dimension: 4,degree: 3, coef_size: 6};
450 /// # fn calculate_gradient(output: Vec<Vec<f64>>, label: Vec<f64>) -> Vec<Vec<f64>> {vec![vec![0.0; output[0].len()]; output.len()]}
451 /// # let training_data = vec![(vec![0.1, 0.2, 0.3, 0.4, 0.5], 1.0f64), (vec![0.2, 0.3, 0.4, 0.5, 0.6], 0.0f64), (vec![0.3, 0.4, 0.5, 0.6, 0.7], 1.0f64)];
452 /// let mut my_layer = KanLayer::new(&layer_options);
453 /// # let batch_size = 1;
454 /// for batch_data in training_data.chunks(batch_size) {
455 /// let batch_features = batch_data.iter().map(|(f, _)| f.clone()).collect::<Vec<Vec<f64>>>();
456 /// let batch_output: Vec<Vec<f64>> = my_layer.forward(batch_features)?;
457 /// let batch_labels: Vec<f64> = batch_data.iter().map(|(_, l)| *l).collect();
458 /// let batch_gradients: Vec<Vec<f64>> = calculate_gradient(batch_output, batch_labels);
459 /// let _ = my_layer.backward(&batch_gradients)?;
460 /// my_layer.update(0.1, 1.0, 1.0); // updating the model's parameters changes the output range of the b-splines that make up the model
461 /// my_layer.update_knots_from_samples(0.1)?; // updating the knots adjusts the input range of the b-splines to match the output range of the previous layer
462 /// }
463 /// # Ok::<(), fekan::layer_errors::LayerError>(())
464 ///```
465 /// Note on the above example: even in this example, where range of input to the layer is the range of values in the training data and does not change during training, it's still important to update the knots at least once, after a good portion of the training data has been passed through, to ensure that the layer's supported input range covers the range spanned by the training data.
466 /// KanLayer knots are initialized to span the range [-1, 1], so if the training data is outside that range, the activations will be 0.0 until the knots are updated.
467 ///
468 ///
469 /// The below example shows why regularly updating the knots is important - especially early in training, before the model starts to converge when its parameters are changing rapidly
470 /// ```
471 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
472 /// # let some_layer_options = KanLayerOptions {input_dimension: 2,output_dimension: 4,degree: 5, coef_size: 6};
473 /// let mut my_layer = KanLayer::new(&some_layer_options);
474 /// let sample1 = vec![vec![100f64, -100f64]];
475 /// let sample2 = vec![vec![-100f64, 100f64]];
476 ///
477 /// let acts = my_layer.forward(sample1.clone()).unwrap();
478 /// assert!(acts[0].iter().all(|x| *x == 0.0)); // the preacts were all outside the initial knot range, so the activations should all be 0
479 /// let acts = my_layer.forward(sample2).unwrap();
480 /// assert!(acts[0].iter().all(|x| *x == 0.0)); // the preacts were all outside the initial knot range, so the activations should all be 0
481 /// my_layer.update_knots_from_samples(0.0).unwrap(); // we don't have enough samples to calculate quantiles, so we have to keep the knots uniformly distributed. In practice, this function should be called every few hundred forward passes or so
482 /// let new_acts = my_layer.forward(sample1).unwrap();
483 /// assert!(new_acts[0].iter().all(|x| *x != 0.0)); // the knot range now covers the samples, so the activations should be non-zero
484 /// # Ok::<(), fekan::layer_errors::LayerError>(())
485 /// ```
486 pub fn update_knots_from_samples(&mut self, knot_adaptivity: f64) -> Result<(), LayerError> {
487 trace!("Updating knots from {} samples", self.samples.len()); // trace since this happens every batch
488 if self.samples.is_empty() {
489 return Err(LayerError::no_samples());
490 }
491 // lets construct a sorted vector of the samples for each incoming value
492 // first we transpose the samples, so that dim0 = input_dimension, dim1 = number of samples
493 let mut sorted_samples: Vec<Vec<f64>> =
494 vec![Vec::with_capacity(self.samples.len()); self.input_dimension];
495 for i in 0..self.samples.len() {
496 for j in 0..self.input_dimension {
497 sorted_samples[j].push(self.samples[i][j]); // remember, push is just an indexed insert that checks capacity first. As long as capacity isn't exceeded, push is O(1)
498 }
499 }
500
501 // now we sort along dim1
502 for j in 0..self.input_dimension {
503 sorted_samples[j].sort_by(|a, b| a.partial_cmp(b).unwrap());
504 }
505 // TODO: it might be worth checking if the above operation would be faster if I changed the order of the loops and sorted inside the outer loop. Maybe something to do with cache performance?
506
507 for (idx, spline) in self.splines.iter_mut().enumerate() {
508 let sample_idx = idx % self.input_dimension; // the first `input_dimension` splines belong to the first "node", so every `input_dimension` splines, we move to the next node and reset which inner sample vector we're looking at
509 let sample = &sorted_samples[sample_idx];
510 trace!("Updating knots for edge {} from samples", idx);
511 spline.update_knots_from_samples(sample, knot_adaptivity);
512 }
513 if log::log_enabled!(log::Level::Trace) {
514 let mut ranges = vec![(0.0, 0.0); self.splines.len()];
515 for (idx, spline) in self.splines.iter().enumerate() {
516 ranges[idx] = spline.get_full_input_range();
517 }
518 trace!("Supported input ranges after knot update: {:#?}", ranges);
519 }
520
521 Ok(())
522 }
523
    /// as [`KanLayer::update_knots_from_samples`], but divides the work among the passed number of threads
    pub fn update_knots_from_samples_multithreaded(
        &mut self,
        knot_adaptivity: f64,
        num_threads: usize,
    ) -> Result<(), LayerError> {
        // no samples means forward() hasn't run since init or the last clear_samples()
        if self.samples.is_empty() {
            return Err(LayerError::no_samples());
        }

        // transpose the cached samples so that dim0 = input_dimension, dim1 = number of samples
        let mut sorted_samples: Vec<Vec<f64>> =
            vec![Vec::with_capacity(self.samples.len()); self.input_dimension];
        for i in 0..self.samples.len() {
            for j in 0..self.input_dimension {
                sorted_samples[j].push(self.samples[i][j]); // remember, push is just an indexed insert that checks capacity first. As long as capacity isn't exceeded, push is O(1)
            }
        }

        // dim0 = input_dimension, dim1 = number of samples
        // now we sort along dim1
        for j in 0..self.input_dimension {
            sorted_samples[j].sort_by(|a, b| a.partial_cmp(b).unwrap());
        }
        // TODO: it might be worth checking if the above operation would be faster if I changed the order of the loops and sorted inside the outer loop. Maybe something to do with cache performance?
        let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
        let output_dimension = self.output_dimension;
        thread::scope(|s| {
            let handles: Vec<ScopedJoinHandle<Vec<Edge>>> = (0..num_threads)
                .map(|thread_idx| {
                    // move a chunk of edges into each thread so it can mutate them without
                    // locks; draining from the front keeps chunk order aligned with
                    // thread index, and `.min()` keeps the final (short) chunk in bounds
                    let edges_for_thread: Vec<Edge> = self
                        .splines
                        .drain(0..edges_per_thread.min(self.splines.len()))
                        .collect();
                    let sorted_samples = &sorted_samples;
                    s.spawn(move || {
                        let mut thread_edges = edges_for_thread; // just to explicitly move the edges into the thread
                        for edge_idx in 0..thread_edges.len() {
                            let this_edge = &mut thread_edges[edge_idx];
                            // recover the layer-global edge index from the thread-local one
                            let true_edge_idx = thread_idx * edges_per_thread + edge_idx;
                            // edges are stored in `output_dimension`-sized chunks per input node
                            let input_idx = true_edge_idx / output_dimension;
                            let samples = &sorted_samples[input_idx];
                            this_edge.update_knots_from_samples(samples, knot_adaptivity);
                        }
                        thread_edges // make sure to give the edges back once we're done
                    })
                })
                .collect();
            // joining in spawn order re-extends `splines` with each chunk in its
            // original position, restoring the edge ordering
            for handle in handles {
                let thread_result: Vec<Edge> = handle.join().unwrap();
                self.splines.extend(thread_result);
            }
        });
        Ok(())
    }
578
    /// wipe the internal state that tracks the samples used to update the knot vectors
    ///
    /// # Examples
    ///
    /// ```
    /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
    /// # let some_layer_options = KanLayerOptions {input_dimension: 2,output_dimension: 4,degree: 5, coef_size: 6};
    /// let mut my_layer = KanLayer::new(&some_layer_options);
    /// /* After several forward passes... */
    /// # let samples = vec![vec![100f64, -100f64],vec![-100f64, 100f64]];
    /// # let _acts = my_layer.forward(samples)?;
    /// let update_result = my_layer.update_knots_from_samples(0.0);
    /// assert!(update_result.is_ok());
    /// my_layer.clear_samples();
    /// let update_result = my_layer.update_knots_from_samples(0.0);
    /// assert!(update_result.is_err()); // we've cleared the samples, so we can't update the knot vectors
    /// # Ok::<(), fekan::layer_errors::LayerError>(())
    /// ```
    pub fn clear_samples(&mut self) {
        self.samples.clear();
    }
599
600 /// Given a vector of gradient values for the nodes in this layer, backpropogate the error through the layer, updating the internal gradients for the incoming edges
601 /// and return the error for the previous layer.
602 ///
603 /// This function relies on mutated inner state and should be called after [`KanLayer::forward`].
604 ///
605 /// Calculated gradients are stored internally, and only applied during [`KanLayer::update`].
606 ///
607 /// # Errors
608 /// Returns a [`LayerError`] if...
609 /// * the length of `gradient` is not equal to the number of nodes in this layer (i.e this layer's output dimension)
610 /// * this method is called before [`KanLayer::forward`]
611 ///
612 /// # Examples
613 /// Backpropgate the error through a two-layer network, and update the gradients
614 /// ```
615 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
616 /// let first_layer_options = KanLayerOptions { input_dimension: 2, output_dimension: 4, degree: 5, coef_size: 6 };
617 /// let second_layer_options = KanLayerOptions { input_dimension: 4, output_dimension: 3, degree: 5, coef_size: 6 };
618 /// let mut first_layer = KanLayer::new(&first_layer_options);
619 /// let mut second_layer = KanLayer::new(&second_layer_options);
620 /// /* forward pass */
621 /// let preacts = vec![vec![0.0, 0.5]];
622 /// let acts = first_layer.forward(preacts).unwrap();
623 /// let output = second_layer.forward(acts).unwrap();
624 /// /* calculate error */
625 /// # let error = vec![vec![1.0, 0.5, 0.5]];
626 /// assert_eq!(error[0].len(), second_layer_options.output_dimension);
627 /// let first_layer_error = second_layer.backward(&error).unwrap();
628 /// assert_eq!(first_layer_error[0].len(), first_layer_options.output_dimension);
629 /// let input_error = first_layer.backward(&first_layer_error).unwrap();
630 /// assert_eq!(input_error[0].len(), first_layer_options.input_dimension);
631 ///
632 /// // apply the gradients
633 /// let learning_rate = 0.1;
634 /// first_layer.update(learning_rate, 1.0, 1.0);
635 /// second_layer.update(learning_rate, 1.0, 1.0);
636 /// // reset the gradients
637 /// first_layer.zero_gradients();
638 /// second_layer.zero_gradients();
639 /// /* continue training */
640 /// ```
641 pub fn backward(&mut self, gradients: &[Vec<f64>]) -> Result<Vec<Vec<f64>>, LayerError> {
642 for gradient in gradients.iter() {
643 if gradient.len() != self.output_dimension {
644 return Err(LayerError::missized_gradient(
645 gradient.len(),
646 self.output_dimension,
647 ));
648 }
649
650 if gradient.iter().any(|f| f.is_nan()) {
651 return Err(LayerError::nans_in_gradient());
652 }
653 }
654 let layer_l1 = self
655 .layer_l1
656 .ok_or(LayerError::backward_before_forward(None, 0))?;
657 let layer_entropy = self.layer_entropy();
658
659 let num_gradients = gradients.len();
660 let mut transposed_gradients = vec![vec![0.0; num_gradients]; self.output_dimension];
661 for i in 0..num_gradients {
662 for j in 0..self.output_dimension {
663 let to_move_gradient = gradients[i][j]; // separate the lines for easier debugging
664 transposed_gradients[j][i] = to_move_gradient;
665 }
666 }
667 let mut backpropped_gradients = vec![vec![0.0; self.input_dimension]; num_gradients];
668 let mut sibling_entropy_terms: VecDeque<f64> = self
669 .splines
670 .iter()
671 .map(|s| {
672 let edge_l1 = s.l1_norm().expect("edge should have an L1");
673 if edge_l1 == 0.0 {
674 0.0
675 } else {
676 edge_l1 * ((edge_l1 / layer_l1).ln() + 1.0)
677 }
678 })
679 .collect();
680 for edge_index in 0..self.splines.len() {
681 trace!("Backpropping gradients for edge {}", edge_index);
682 let in_node_idx = edge_index / self.output_dimension;
683 let out_node_idx = edge_index % self.output_dimension;
684 // let sibling_l1s: &[f64] = &sibling_entropy_terms.as_slices().0[1..];
685 let sample_wise_outputs = self.splines[edge_index]
686 .backward(&transposed_gradients[out_node_idx], layer_l1, layer_entropy)
687 .map_err(|e| LayerError::backward_before_forward(Some(e), edge_index))?; // TODO incorporate sparsity losses
688 trace!(
689 "Backpropped gradients for edge {}: {:?}",
690 edge_index,
691 sample_wise_outputs
692 );
693 for sample_idx in 0..num_gradients {
694 backpropped_gradients[sample_idx][in_node_idx] += sample_wise_outputs[sample_idx];
695 }
696
697 // rotate the sibling L1s so that the next edge gets the correct values
698 sibling_entropy_terms.rotate_left(1);
699 sibling_entropy_terms.make_contiguous(); // necessary for .as_slices() above to work properly
700 }
701 trace!("Backpropped gradients: {:?}", backpropped_gradients);
702 self.layer_l1 = None; // The L1 should be re-set after the next forward pass, and backward should not be called before forward, so this serves as a (redundant) check
703 Ok(backpropped_gradients)
704 }
705
    /// as [`KanLayer::backward`], but divides the work among the passed number of threads
    ///
    /// # Errors
    /// Returns a [`LayerError`] if any gradient row is missized or contains NaN, or if
    /// backward is called before a forward pass (the layer L1 is unset)
    pub fn backward_multithreaded(
        &mut self,
        gradients: &[Vec<f64>],
        num_threads: usize,
    ) -> Result<Vec<Vec<f64>>, LayerError> {
        // validate every incoming gradient row before touching any state
        for gradient in gradients.iter() {
            if gradient.len() != self.output_dimension {
                return Err(LayerError::missized_gradient(
                    gradient.len(),
                    self.output_dimension,
                ));
            }

            if gradient.iter().any(|f| f.is_nan()) {
                return Err(LayerError::nans_in_gradient());
            }
        }
        // layer_l1 is populated by the forward pass; None here means backward-before-forward
        let layer_l1 = self
            .layer_l1
            .ok_or(LayerError::backward_before_forward(None, 0))?;
        let layer_entropy = self.layer_entropy();

        let num_gradients = gradients.len();
        // transpose from (sample, output_node) to (output_node, sample) so each edge can be
        // handed the full gradient series for its output node as one contiguous slice
        let mut transposed_gradients = vec![vec![0.0; num_gradients]; self.output_dimension];
        for i in 0..num_gradients {
            for j in 0..self.output_dimension {
                let to_move_gradient = gradients[i][j]; // separate the lines for easier debugging
                transposed_gradients[j][i] = to_move_gradient;
            }
        }
        // dim0 = num_gradients, dim1 = input_dimension
        // shared accumulator: every thread adds its edges' contributions under the mutex
        let backpropped_gradients = Arc::new(Mutex::new(vec![
            vec![0.0; self.input_dimension];
            num_gradients
        ]));
        let backprop_result = thread::scope(|s| {
            // ceil-divide so the first threads take `edges_per_thread` edges and the last
            // thread takes whatever remains
            let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
            let output_dimension = self.output_dimension;
            // let all_edge_l1s = self
            //     .splines
            //     .iter()
            //     .map(|s| s.l1_norm().expect("edge should have an L1"))
            //     .collect::<Vec<f64>>();
            let handles: Vec<ScopedJoinHandle<Result<Vec<Edge>, LayerError>>> = (0..num_threads)
                .map(|thread_idx| {
                    // drain this thread's chunk out of self.splines so the thread can own it;
                    // the chunks are re-appended in thread order after the joins below, which
                    // restores the original edge ordering
                    let edges_for_thread: Vec<Edge> = self
                        .splines
                        .drain(0..edges_per_thread.min(self.splines.len()))
                        .collect();
                    // let edge_l1s = all_edge_l1s.clone();
                    let transposed_gradients = &transposed_gradients;
                    let threaded_gradients = Arc::clone(&backpropped_gradients);
                    let thread_result = s.spawn(move || {
                        let mut thread_edges = edges_for_thread; // explicitly move the edges into the thread because it makes me feel good
                        for edge_index in 0..thread_edges.len() {
                            let this_edge: &mut Edge = &mut thread_edges[edge_index];
                            // recover this edge's index in the original (undrained) spline vec
                            let true_edge_index = thread_idx * edges_per_thread + edge_index;
                            // let sibling_l1s: Vec<f64> = edge_l1s
                            //     .iter()
                            //     .enumerate()
                            //     .filter(|(idx, _l1)| *idx != true_edge_index)
                            //     .map(|(_idx, l1)| *l1)
                            //     .collect();
                            let in_node_idx = true_edge_index / output_dimension;
                            let out_node_idx = true_edge_index % output_dimension;
                            let sample_wise_outputs = this_edge
                                .backward(
                                    &transposed_gradients[out_node_idx],
                                    layer_l1,
                                    layer_entropy,
                                )
                                .map_err(|e| {
                                    LayerError::backward_before_forward(Some(e), true_edge_index)
                                })?;
                            // fold this edge's per-sample outputs into the shared accumulator
                            let mut mutex_acquired_gradients = threaded_gradients.lock().unwrap();
                            for sample_idx in 0..num_gradients {
                                let sample_gradients = &mut mutex_acquired_gradients[sample_idx];
                                let in_node = &mut sample_gradients[in_node_idx];
                                let edge_output = sample_wise_outputs[sample_idx];
                                *in_node += edge_output;
                            }
                        }
                        Ok(thread_edges)
                    });
                    thread_result
                })
                .collect();
            // join in spawn order so extend() rebuilds self.splines in the original order
            for handle in handles {
                let thread_result: Vec<Edge> = handle.join().unwrap()?;
                self.splines.extend(thread_result);
            }
            Ok(())
        });
        backprop_result?;
        let result = backpropped_gradients.lock().unwrap().clone();
        Ok(result)
    }
804
805 fn layer_entropy(&self) -> f64 {
806 let layer_l1 = self
807 .layer_l1
808 .expect("Layer L1 should be set to calculate entropy");
809 self.splines
810 .iter()
811 .map(|s| {
812 s.l1_norm()
813 .expect("Edge should have an L1 to calculate entropy")
814 })
815 .map(|edge_l1| {
816 if edge_l1 == 0.0 {
817 0.0
818 } else {
819 edge_l1 / layer_l1 * (edge_l1 / layer_l1).ln()
820 }
821 })
822 .sum::<f64>()
823 / self.splines.len() as f64
824 }
825
826 // /// as [KanLayer::backward], but divides the work among the passed thread pool
827 // pub fn backward_concurrent(
828 // &mut self,
829 // error: &[f64],
830 // thread_pool: &ThreadPool,
831 // ) -> Result<Vec<f64>, BackwardLayerError> {
832 // if error.len() != self.output_dimension {
833 // return Err(BackwardLayerError::MissizedGradientError {
834 // actual: error.len(),
835 // expected: self.output_dimension,
836 // });
837 // }
838 // // if error.iter().any(|f| f.is_nan()) {
839 // // return Err(BackwardLayerError::ReceivedNanError);
840 // // }
841
842 // let backprop_result: (Vec<f64>, Vec<BackwardSplineError>) = thread_pool.install(|| {
843 // let mut input_gradient = vec![0.0; self.input_dimension];
844 // let mut spline_errors = Vec::with_capacity(self.splines.len());
845 // for i in 0..self.splines.len() {
846 // // every `input_dimension` splines belong to the same node, and thus will use the same error value.
847 // // "Distribute" the error at a given node among all incoming edges
848 // let error_at_edge_output =
849 // error[i / self.input_dimension] / self.input_dimension as f64;
850 // match self.splines[i].backward(error_at_edge_output) {
851 // Ok(error_at_edge_input) => {
852 // input_gradient[i % self.input_dimension] += error_at_edge_input
853 // }
854 // Err(e) => spline_errors.push(e),
855 // }
856 // }
857 // (input_gradient, spline_errors)
858 // });
859 // if !backprop_result.1.is_empty() {
860 // return Err(backprop_result.1[0].into());
861 // }
862 // Ok(backprop_result.0)
863 // }
864
865 /// set the length of the knot vectors for each incoming edge in this layer
866 ///
867 /// Generally used multiple times throughout training to increase the number of knots in the spline to increase fidelity of the curve
868 /// # Notes
869 /// * The number of control points is set to `knot_length - degree - 1`, and the control points are calculated using lstsq to approximate the previous curve
870 /// # Errors
871 /// * Returns an error if any of the splines' updated control points include NaNs. This should never happen, but if it does, it's a bug
872 /// # Examples
873 /// Extend the knot vectors during training to increase the fidelity of the splines
874 /// ```
875 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
876 /// let input_dimension = 2;
877 /// let output_dimension = 4;
878 /// let degree = 5;
879 /// let coef_size = 6;
880 /// let layer_options = KanLayerOptions {
881 /// input_dimension,
882 /// output_dimension,
883 /// degree,
884 /// coef_size
885 /// };
886 /// let mut my_layer = KanLayer::new(&layer_options);
887 ///
888 /// let num_splines = input_dimension * output_dimension;
889 /// let expected_knots_per_spline = coef_size + degree + 1;
890 /// assert_eq!(my_layer.knot_length(), expected_knots_per_spline, "starting knots per edge");
891 /// assert_eq!(my_layer.parameter_count(), num_splines * (coef_size + my_layer.knot_length()), "starting parameter count");
892 ///
893 /// /* train the layer a bit to start shaping the splines */
894 ///
895 /// let new_knot_length = my_layer.knot_length() * 2;
896 /// let update_result = my_layer.set_knot_length(new_knot_length);
897 /// assert!(update_result.is_ok(), "update knots");
898 ///
899 /// assert_eq!(my_layer.knot_length(), new_knot_length, "ending knots per edge");
900 /// let new_coef_size = new_knot_length - degree - 1;
901 /// assert_eq!(my_layer.parameter_count(), num_splines * (new_coef_size + new_knot_length), "ending parameter count");
902 ///
903 /// /* continue training layer, now with increased fidelity in the spline */
904 /// ```
905 /// # Panics
906 /// Panics if the Singular Value Decomposition (SVD) used to calculate the control points fails. This should never happen, but if it does, it's a bug
907 pub fn set_knot_length(&mut self, knot_length: usize) -> Result<(), LayerError> {
908 for i in 0..self.splines.len() {
909 self.splines[i]
910 .set_knot_length(knot_length)
911 .map_err(|e| LayerError::set_knot_length(i, e))?;
912 }
913 Ok(())
914 }
915
    /// as [`KanLayer::set_knot_length`], but divides the work among the passed number of threads
    ///
    /// # Errors
    /// Returns a [`LayerError`] if any edge fails to update its knot vector
    pub fn set_knot_length_multithreaded(
        &mut self,
        knot_length: usize,
        num_threads: usize,
    ) -> Result<(), LayerError> {
        // ceil-divide so the last thread takes any remainder
        let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
        let threaded_result: Result<(), LayerError> = thread::scope(|s| {
            let handles: Vec<ScopedJoinHandle<Result<Vec<Edge>, LayerError>>> = (0..num_threads)
                .map(|thread_idx| {
                    // drain this thread's chunk out of self.splines so the thread can own it;
                    // the chunks are re-appended in thread order after the joins below
                    let edges_for_thread: Vec<Edge> = self
                        .splines
                        .drain(0..edges_per_thread.min(self.splines.len()))
                        .collect();
                    let thread_result = s.spawn(move || {
                        let mut thread_edges = edges_for_thread; // just to explicitly move the edges into the thread
                        for edge_index in 0..thread_edges.len() {
                            let this_edge: &mut Edge = &mut thread_edges[edge_index];
                            this_edge.set_knot_length(knot_length).map_err(|e| {
                                // report the edge's index in the original (undrained) spline vec
                                LayerError::set_knot_length(
                                    edge_index + thread_idx * edges_per_thread,
                                    e,
                                )
                            })?;
                        }
                        Ok(thread_edges) // give the edges back once we're done
                    });
                    thread_result
                })
                .collect();
            // join in spawn order so extend() rebuilds self.splines in the original order
            for handle in handles {
                let thread_result = handle.join().unwrap()?;
                self.splines.extend(thread_result);
            }
            Ok(())
        });
        threaded_result?;
        Ok(())
    }
955
956 /// return the length of the knot vectors for each incoming edge in this layer
957 /// # Examples
958 /// ```
959 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
960 /// let layer_options = KanLayerOptions {
961 /// input_dimension: 2,
962 /// output_dimension: 4,
963 /// degree: 5,
964 /// coef_size: 6
965 /// };
966 /// let mut my_layer = KanLayer::new(&layer_options);
967 /// assert_eq!(my_layer.knot_length(), 6 + 5 + 1);
968 pub fn knot_length(&self) -> usize {
969 for spline in self.splines.iter() {
970 if spline.knots().len() > 0 {
971 return spline.knots().len();
972 }
973 }
974 0
975 }
976
977 /// update the control points for each incoming edge in this layer given the learning rate
978 ///
979 /// this function relies on internally stored gradients calculated during [`KanLayer::backward()`]
980 ///
981 /// # Examples
982 /// see [`KanLayer::backward`]
983 pub fn update(&mut self, learning_rate: f64, l1_penalty: f64, entropy_penalty: f64) {
984 for spline in self.splines.iter_mut() {
985 spline.update_control_points(learning_rate, l1_penalty, entropy_penalty);
986 }
987 }
988
    /// as [`KanLayer::update`], but divides the work among the passed number of threads
    pub fn update_multithreaded(
        &mut self,
        learning_rate: f64,
        l1_penalty: f64,
        entropy_penalty: f64,
        num_threads: usize,
    ) {
        // ceil-divide so the last thread takes any remainder
        let edges_per_thread = (self.splines.len() as f64 / num_threads as f64).ceil() as usize;
        thread::scope(|s| {
            let handles: Vec<ScopedJoinHandle<Vec<Edge>>> = (0..num_threads)
                .map(|_| {
                    // drain this thread's chunk out of self.splines so the thread can own it;
                    // the chunks are re-appended in thread order after the joins below
                    let edges_for_thread: Vec<Edge> = self
                        .splines
                        .drain(0..edges_per_thread.min(self.splines.len()))
                        .collect();
                    let thread_result = s.spawn(move || {
                        let mut thread_edges = edges_for_thread; // just to explicitly move the edges into the thread
                        for edge_index in 0..thread_edges.len() {
                            let this_edge: &mut Edge = &mut thread_edges[edge_index];
                            this_edge.update_control_points(
                                learning_rate,
                                l1_penalty,
                                entropy_penalty,
                            );
                        }
                        thread_edges // make sure to give the edges back once we're done
                    });
                    thread_result
                })
                .collect();
            // join in spawn order so extend() rebuilds self.splines in the original order
            for handle in handles {
                let thread_result: Vec<Edge> = handle.join().unwrap();
                self.splines.extend(thread_result);
            }
        });
    }
1026
1027 /// clear gradients for each incoming edge in this layer
1028 ///
1029 /// # Examples
1030 /// see [`KanLayer::backward`]
1031 pub fn zero_gradients(&mut self) {
1032 for spline in self.splines.iter_mut() {
1033 spline.zero_gradients();
1034 }
1035 }
1036
1037 /// return the total number of parameters in this layer.
1038 /// A layer has `input_dimension * output_dimension` splines, each with `degree + coef_size + 1` knots and `coef_size` control points
1039 ///
1040 /// #Examples
1041 /// ```
1042 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
1043 /// let layer_options = KanLayerOptions {
1044 /// input_dimension: 2,
1045 /// output_dimension: 4,
1046 /// degree: 5,
1047 /// coef_size: 6
1048 /// };
1049 /// let my_layer = KanLayer::new(&layer_options);
1050 /// assert_eq!(my_layer.parameter_count(), 2 * 4 * (6 + (5 + 6 + 1)));
1051 ///```
1052 pub fn parameter_count(&self) -> usize {
1053 self.input_dimension * self.output_dimension * self.splines[0].parameter_count()
1054 }
1055
1056 /// returns the total number of trainable parameters in this layer.
1057 /// A layer has `input_dimension * output_dimension` splines, each with coef_size` control points, which are the trainable parameter in a KAN layer
1058 /// # Examples
1059 /// ```
1060 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
1061 /// let layer_options = KanLayerOptions {
1062 /// input_dimension: 2,
1063 /// output_dimension: 4,
1064 /// degree: 5,
1065 /// coef_size: 6
1066 /// };
1067 /// let my_layer = KanLayer::new(&layer_options);
1068 /// assert_eq!(my_layer.trainable_parameter_count(), 2 * 4 * 6);
1069 ///```
1070 pub fn trainable_parameter_count(&self) -> usize {
1071 self.input_dimension * self.output_dimension * self.splines[0].trainable_parameter_count()
1072 }
1073
1074 /// return the number of incoming edges to nodes in this layer
1075 pub fn total_edges(&self) -> usize {
1076 self.input_dimension * self.output_dimension
1077 }
1078
1079 /// Create a new KanLayer by merging the splines of multiple KanLayers. Splines are merged by averaging their knots and control points.
1080 /// `new_layer.splines[0] = spline_merge([layer1.splines[0], layer2.splines[0], ...])`, etc. The output of the merged layer is not necessarily the average of the outputs of the input layers.
1081 /// # Errors
1082 /// Returns a [`LayerError`] if...
1083 /// * `kan_layers` is empty
1084 /// * the input dimensions of the layers in `kan_layers` are not all equal
1085 /// * the output dimensions of the layers in `kan_layers` are not all equal
1086 /// * there is an error merging the splines of the layers, caused by the splines having different: degrees, number of control points, number or knots from each other
1087 /// # Examples
1088 /// Train a layer using multiple threads, then merge the results
1089 /// ```
1090 /// use fekan::kan_layer::{KanLayer, KanLayerOptions};
1091 /// use std::thread;
1092 /// # use fekan::Sample;
1093 /// # let layer_options = KanLayerOptions {
1094 /// # input_dimension: 2,
1095 /// # output_dimension: 4,
1096 /// # degree: 5,
1097 /// # coef_size: 6
1098 /// # };
1099 /// # let num_training_threads = 1;
1100 /// # let training_data = vec![Sample::new_regression_sample(vec![], 0.0)];
1101 /// # fn train_layer(layer: KanLayer, data: &[Sample]) -> KanLayer {layer}
1102 /// let my_layer = KanLayer::new(&layer_options);
1103 /// let partially_trained_layers: Vec<KanLayer> = thread::scope(|s|{
1104 /// let chunk_size = f32::ceil(training_data.len() as f32 / num_training_threads as f32) as usize; // round up, since .chunks() gives up-to chunk_size chunks. This way to don't leave any data on the cutting room floor
1105 /// let handles: Vec<_> = training_data.chunks(chunk_size).map(|training_data_chunk|{
1106 /// let clone_layer = my_layer.clone();
1107 /// s.spawn(move ||{
1108 /// train_layer(clone_layer, training_data_chunk) // `train_layer` is a stand-in for whatever function you're using to train the layer - not actually defined in this crate
1109 /// })
1110 /// }).collect();
1111 /// handles.into_iter().map(|handle| handle.join().unwrap()).collect()
1112 /// });
1113 /// let fully_trained_layer = KanLayer::merge_layers(partially_trained_layers)?;
1114 /// # Ok::<(), fekan::layer_errors::LayerError>(())
1115 /// ```
1116 pub fn merge_layers(kan_layers: Vec<KanLayer>) -> Result<KanLayer, LayerError> {
1117 if kan_layers.is_empty() {
1118 return Err(LayerError::merge_no_layers());
1119 }
1120 let expected_input_dimension = kan_layers[0].input_dimension;
1121 let expected_output_dimension = kan_layers[0].output_dimension;
1122 // check that all layers have the same input and output dimensions
1123 for i in 1..kan_layers.len() {
1124 if kan_layers[i].input_dimension != expected_input_dimension {
1125 return Err(LayerError::merge_mismatched_input_dimension(
1126 i,
1127 expected_input_dimension,
1128 kan_layers[i].input_dimension,
1129 ));
1130 }
1131 if kan_layers[i].output_dimension != expected_output_dimension {
1132 return Err(LayerError::merge_mismatched_output_dimension(
1133 i,
1134 expected_output_dimension,
1135 kan_layers[i].output_dimension,
1136 ));
1137 }
1138 }
1139 let edge_count = expected_input_dimension * expected_output_dimension;
1140 // // now build a row-major matrix of splines where each column is the splines in a given layer, and the rows are the ith spline in each layer
1141 // // splines_to_merge = [[L0_S0, L1_S0, ... LJ_S0],
1142 // // [L0_S1, L1_S1, ... LJ_S1],
1143 // // ...
1144 // // [L0_SN, L1_SN, ... LJ_SN]]
1145 // let num_splines = expected_input_dimension * expected_output_dimension;
1146 // let mut splines_to_merge: VecDeque<Vec<Edge>> = vec![vec![]; num_splines].into();
1147 // //populated in column-major order
1148 // for j in 0..kan_layers.len() {
1149 // for i in 0..num_splines {
1150 // splines_to_merge[i].push(kan_layers[j].splines.remove(i));
1151 // }
1152 // }
1153 // let mut merged_splines = Vec::with_capacity(num_splines);
1154 // let mut i = 0;
1155 // while let Some(splines) = splines_to_merge.pop_front() {
1156 // let merge_result =
1157 // Edge::merge_edges(splines).map_err(|e| LayerError::spline_merge(i, e))?;
1158 // i += 1;
1159 // merged_splines.push(merge_result);
1160 // }
1161 let mut all_edges: Vec<VecDeque<Edge>> = kan_layers
1162 .into_iter()
1163 .map(|layer| layer.splines.into())
1164 .collect();
1165 let mut merged_edges =
1166 Vec::with_capacity(expected_input_dimension * expected_output_dimension);
1167 for i in 0..edge_count {
1168 let edges_to_merge: Vec<Edge> = all_edges
1169 .iter_mut()
1170 .map(|layer_dequeue| {
1171 layer_dequeue
1172 .pop_front()
1173 .expect("iterated past end of dequeue while merging layers")
1174 })
1175 .collect();
1176 merged_edges.push(
1177 Edge::merge_edges(edges_to_merge).map_err(|e| LayerError::spline_merge(i, e))?,
1178 );
1179 }
1180
1181 Ok(KanLayer {
1182 splines: merged_edges,
1183 input_dimension: expected_input_dimension,
1184 output_dimension: expected_output_dimension,
1185 samples: vec![],
1186 layer_l1: None,
1187 })
1188 }
1189
1190 /// does no useful work at the moment - only here for benchmarking
1191 pub fn bench_suggest_symbolic(&self) {
1192 for (_idx, spline) in self.splines.iter().enumerate() {
1193 spline.suggest_symbolic(1);
1194 }
1195 }
1196
1197 /// test each spline in the layer for similarity to a symbolic function (e.g x^2, sin(x), etc.). If the R^2 value of the best fit is greater than `r2_threshold`, replace the spline with the symbolic function
1198 ///
1199 /// Useful at the end of training to enhance interpretability of the model
1200 pub fn test_and_set_symbolic(&mut self, r2_threshold: f64) -> Vec<usize> {
1201 debug!(
1202 "Testing and setting symbolic functions with R2 >= {}",
1203 r2_threshold
1204 );
1205 let mut clamped_edges = Vec::new();
1206 for i in 0..self.splines.len() {
1207 trace!("Testing edge {}", i);
1208 let mut suggestions = self.splines[i].suggest_symbolic(1);
1209 if suggestions.is_empty() {
1210 // pruned or already-symbolified edges will return an empty vector
1211 continue;
1212 }
1213 let (possible_symbol, r2) = suggestions.remove(0);
1214 if r2 >= r2_threshold {
1215 self.splines[i] = possible_symbol;
1216 clamped_edges.push(i);
1217 }
1218 }
1219 debug!("Symbolified layer:\n{}", self);
1220 clamped_edges
1221 }
1222
1223 /// Tests all edges using the samples provided, and 'prunes' any edges in this layer with an average absolute output value less than `threshold` - that is, they will be replaced with a constant edge that outputs 0.0
1224 /// # Returns
1225 /// A vector of the indices of the pruned edges
1226 pub fn prune(&mut self, samples: &[Vec<f64>], threshold: f64) -> Vec<usize> {
1227 assert!(threshold >= 0.0, "Pruning threhsold must be >= 0.0");
1228 let transposed_samples = transpose(samples);
1229 let mut pruned_indices = Vec::new();
1230 for i in 0..self.splines.len() {
1231 trace!("Pruning edge {}", i);
1232 let in_node_idx = i / self.output_dimension;
1233 if self.splines[i].prune(&transposed_samples[in_node_idx], threshold) {
1234 pruned_indices.push(i);
1235 }
1236 }
1237 pruned_indices
1238 }
1239
1240 /// return the input dimension of this layer
1241 pub fn input_dimension(&self) -> usize {
1242 self.input_dimension
1243 }
1244
1245 /// return the output dimension of this layer
1246 pub fn output_dimension(&self) -> usize {
1247 self.output_dimension
1248 }
1249
1250 /// wipe the edge's activations. Useful for debugging and benchmarking
1251 pub fn wipe_activations(&mut self) {
1252 for edge in self.splines.iter_mut() {
1253 edge.wipe_activations();
1254 }
1255 }
1256}
1257
/// Transpose a row-major matrix: `transposed[j][i] = matrix[i][j]`.
///
/// Returns an empty vector when `matrix` is empty (the original indexed `matrix[0]`
/// unconditionally and would panic). Assumes every row is at least as long as the first;
/// shorter rows cause a panic, as before.
fn transpose(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    if matrix.is_empty() {
        return Vec::new();
    }
    let num_rows = matrix.len();
    let num_cols = matrix[0].len(); // column count is taken from the first row
    let mut transposed = vec![vec![0.0; num_rows]; num_cols];
    for i in 0..num_rows {
        for j in 0..num_cols {
            transposed[j][i] = matrix[i][j];
        }
    }
    transposed
}
1267
1268impl PartialEq for KanLayer {
1269 // only in a VERY contrived case would two layers have equal splines but different input/output dimensions
1270 // but it's technically possible, so we've got to check it
1271 fn eq(&self, other: &Self) -> bool {
1272 self.splines == other.splines
1273 && self.input_dimension == other.input_dimension
1274 && self.output_dimension == other.output_dimension
1275 }
1276}
1277
1278impl std::fmt::Display for KanLayer {
1279 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1280 let edge_string = self
1281 .splines
1282 .iter()
1283 .map(|e| "- ".to_string() + &e.to_string())
1284 .collect::<Vec<String>>()
1285 .join("\n");
1286 write!(
1287 f,
1288 "KanLayer: input_dimension: {}, output_dimension: {}, edges:\n {}",
1289 self.input_dimension, self.output_dimension, edge_string
1290 )
1291 }
1292}
1293
#[cfg(test)]
mod test {

    use edge::Edge;
    use test_log::test;

    use super::*;

    /// returns a new layer with input and output dimension = 2, k = 3, and coef_size = 4
    ///
    /// Edges 0 and 3 have all-positive control points, edges 1 and 2 the mirrored negative
    /// ones, so the two output nodes produce values of opposite sign for the same input.
    fn build_test_layer() -> KanLayer {
        let k = 3;
        let coef_size = 4;
        let knot_size = coef_size + k + 1;
        let knots = linspace(-1.0, 1.0, knot_size);
        let spline1 = Edge::new(k, vec![1.0; coef_size], knots.clone()).unwrap();
        let spline2 = Edge::new(k, vec![-1.0; coef_size], knots.clone()).unwrap();
        KanLayer {
            splines: vec![spline1.clone(), spline2.clone(), spline2, spline1],
            samples: vec![],
            input_dimension: 2,
            output_dimension: 2,
            layer_l1: None,
        }
    }

    #[test]
    fn test_new() {
        let input_dimension = 3;
        let output_dimension = 4;
        let k = 5;
        let coef_size = 6;
        let my_layer = KanLayer::new(&KanLayerOptions {
            input_dimension,
            output_dimension,
            degree: k,
            coef_size,
        });
        assert_eq!(my_layer.output_dimension, output_dimension);
        assert_eq!(my_layer.input_dimension, input_dimension);
        assert_eq!(my_layer.splines.len(), input_dimension * output_dimension);
    }

    #[test]
    fn test_forward() {
        // to properly test layer forward, I need a layer with output and input dim = 2, which means 4 total edges
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5]];
        let acts = layer.forward(preacts).unwrap();
        // expected values are hand-computed from the test layer's splines, rounded to 4 places
        let expected_activations = vec![0.3177, -0.3177];
        let rounded_activations: Vec<f64> = acts[0]
            .iter()
            .map(|x| (x * 10000.0).round() / 10000.0)
            .collect();
        assert_eq!(rounded_activations, expected_activations);
    }

    #[test]
    fn test_forward_bad_activations() {
        // a 3-element preactivation into a 2-input layer must be rejected
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5, 0.5]];
        let acts = layer.forward(preacts);
        assert!(acts.is_err());
        let error = acts.err().unwrap();
        assert_eq!(error, LayerError::missized_preacts(3, 2));
        println!("{:?}", error); // make sure we can build the error message
    }

    #[test]
    fn test_forward_then_backward() {
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5]];
        let acts = layer.forward(preacts).unwrap();
        let expected_activations = vec![0.3177, -0.3177];
        let rounded_activations: Vec<f64> = acts[0]
            .iter()
            .map(|x| (x * 10000.0).round() / 10000.0)
            .collect();
        assert_eq!(rounded_activations, expected_activations, "forward failed");

        let error = vec![vec![1.0, 0.5]];
        let input_error = layer.backward(&error).unwrap();
        // expected values are hand-computed, rounded to 5 places
        let expected_input_error = vec![0.0, 1.20313];
        let rounded_input_error: Vec<f64> = input_error[0]
            .iter()
            .map(|f| (f * 100000.0).round() / 100000.0)
            .collect();
        assert_eq!(rounded_input_error, expected_input_error, "backward failed");
    }

    #[test]
    fn test_forward_multithreaded_activations() {
        // the multithreaded forward must match the single-threaded expected values
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5]];
        let acts = layer.forward_multithreaded(preacts, 4).unwrap();
        let expected_activations = vec![0.3177, -0.3177];
        let rounded_activations: Vec<f64> = acts[0]
            .iter()
            .map(|x| (x * 10000.0).round() / 10000.0)
            .collect();
        assert_eq!(rounded_activations, expected_activations);
    }

    #[test]
    fn test_forward_multhreaded_reassemble() {
        // the drain/extend in the multithreaded path must leave the edges in their original order
        let mut layer = build_test_layer();
        let reference_layer = layer.clone();
        let preacts = vec![vec![0.0, 0.5]];
        let _ = layer.forward_multithreaded(preacts, 4).unwrap();
        assert_eq!(layer, reference_layer);
    }

    #[test]
    fn test_forward_then_backward_multithreaded_result() {
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5]];
        let acts = layer.forward_multithreaded(preacts, 4).unwrap();
        let expected_activations = vec![0.3177, -0.3177];
        let rounded_activations: Vec<f64> = acts[0]
            .iter()
            .map(|x| (x * 10000.0).round() / 10000.0)
            .collect();
        assert_eq!(rounded_activations, expected_activations, "forward failed");

        let error = vec![vec![1.0, 0.5]];
        let input_error = layer.backward_multithreaded(&error, 4).unwrap();
        let expected_input_error = vec![0.0, 1.20313];
        let rounded_input_error: Vec<f64> = input_error[0]
            .iter()
            .map(|f| (f * 100000.0).round() / 100000.0)
            .collect();
        assert_eq!(rounded_input_error, expected_input_error, "backward failed");
    }

    #[test]
    fn test_forward_then_backward_multithreaded_reassemble() {
        let mut layer = build_test_layer();
        let reference_layer = layer.clone();
        let preacts = vec![vec![0.0, 0.5]];
        let acts = layer.forward_multithreaded(preacts, 4).unwrap();
        let expected_activations = vec![0.3177, -0.3177];
        let rounded_activations: Vec<f64> = acts[0]
            .iter()
            .map(|x| (x * 10000.0).round() / 10000.0)
            .collect();
        assert_eq!(rounded_activations, expected_activations, "forward failed");

        let error = vec![vec![1.0, 0.5]];
        let _ = layer.backward_multithreaded(&error, 4).unwrap();
        assert_eq!(layer, reference_layer, "edges not reassembled correctly");
    }

    // #[test]
    // fn test_forward_then_backward_concurrent() {
    //     let thread_pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
    //     let mut layer = build_test_layer();
    //     let preacts = vec![0.0, 0.5];
    //     let acts = layer.forward(&preacts).unwrap();
    //     let expected_activations = vec![0.3177, -0.3177];
    //     let rounded_activations: Vec<f64> = acts
    //         .iter()
    //         .map(|x| (x * 10000.0).round() / 10000.0)
    //         .collect();
    //     assert_eq!(rounded_activations, expected_activations, "forward failed");

    //     let error = vec![1.0, 0.5];
    //     let input_error = layer.backward_concurrent(&error, &thread_pool).unwrap();
    //     let expected_input_error = vec![0.0, 0.60156];
    //     let rounded_input_error: Vec<f64> = input_error
    //         .iter()
    //         .map(|f| (f * 100000.0).round() / 100000.0)
    //         .collect();
    //     assert_eq!(rounded_input_error, expected_input_error, "backward failed");
    // }

    // #[test]
    // fn test_forward_concurrent_then_backward_concurrent() {
    //     let thread_pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
    //     let mut layer = build_test_layer();
    //     let preacts = vec![0.0, 0.5];
    //     let acts = layer.forward_concurrent(&preacts, &thread_pool).unwrap();
    //     let expected_activations = vec![0.3177, -0.3177];
    //     let rounded_activations: Vec<f64> = acts
    //         .iter()
    //         .map(|x| (x * 10000.0).round() / 10000.0)
    //         .collect();
    //     assert_eq!(rounded_activations, expected_activations, "forward failed");

    //     let error = vec![1.0, 0.5];
    //     let input_error = layer.backward_concurrent(&error, &thread_pool).unwrap();
    //     let expected_input_error = vec![0.0, 0.60156];
    //     let rounded_input_error: Vec<f64> = input_error
    //         .iter()
    //         .map(|f| (f * 100000.0).round() / 100000.0)
    //         .collect();
    //     assert_eq!(rounded_input_error, expected_input_error, "backward failed");
    // }

    #[test]
    fn test_backward_before_forward() {
        // backward without a prior forward pass must error (layer L1 is unset)
        let mut layer = build_test_layer();
        let error = vec![vec![1.0, 0.5]];
        let input_error = layer.backward(&error);
        assert!(input_error.is_err());
    }

    // #[test]
    // fn test_backward_concurrent_before_forward() {
    //     let thread_pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
    //     let mut layer = build_test_layer();
    //     let error = vec![1.0, 0.5];
    //     let input_error = layer.backward_concurrent(&error, &thread_pool);
    //     assert!(input_error.is_err());
    // }

    #[test]
    fn test_backward_bad_error_length() {
        // a 3-element gradient into a 2-output layer must be rejected
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 0.5]];
        let _ = layer.forward(preacts).unwrap();
        let error = vec![vec![1.0, 0.5, 0.5]];
        let input_error = layer.backward(&error);
        assert!(input_error.is_err());
    }

    #[test]
    fn test_update_knots_from_samples_multithreaded_results_and_reassemble() {
        let mut layer = build_test_layer();
        let preacts = vec![vec![0.0, 1.0], vec![0.5, 2.0]];
        let _ = layer.forward_multithreaded(preacts, 4).unwrap();
        layer
            .update_knots_from_samples_multithreaded(0.0, 4)
            .unwrap();
        let expected_knots_1 = vec![-0.1875, -0.125, -0.0625, 0.0, 0.5, 0.5625, 0.625, 0.6875]; // accounts for the padding added in Edge::update_knots_from_samples
        let expected_knots_2 = vec![0.625, 0.75, 0.875, 1.0, 2.0, 2.125, 2.25, 2.375]; // accounts for the padding added in Edge::update_knots_from_samples
        // edges 0 and 1 read input node 0; edges 2 and 3 read input node 1
        assert_eq!(layer.splines[0].knots(), expected_knots_1, "edge 0");
        assert_eq!(layer.splines[1].knots(), expected_knots_1, "edge 1");
        assert_eq!(layer.splines[2].knots(), expected_knots_2, "edge 2");
        assert_eq!(layer.splines[3].knots(), expected_knots_2, "edge 3");
    }

    #[test]
    // we checked the proper averaging of knots and control points in the spline tests, so we just need to check that the layer merge isn't messing up the order of the splines
    fn test_merge_identical_layers_yield_identical_output() {
        let layer1 = build_test_layer();
        let layer2 = layer1.clone();
        let input = vec![vec![0.0, 0.5]];
        let acts1 = layer1.infer(&input).unwrap();
        let acts2 = layer2.infer(&input).unwrap();
        assert_eq!(acts1, acts2);
        let merged_layer = KanLayer::merge_layers(vec![layer1, layer2]).unwrap();
        let acts3 = merged_layer.infer(&input).unwrap();
        assert_eq!(acts1, acts3);
    }

    #[test]
    fn test_layer_send() {
        // compile-time check that KanLayer can cross thread boundaries
        fn assert_send<T: Send>() {}
        assert_send::<KanLayer>();
    }

    #[test]
    fn test_layer_sync() {
        // compile-time check that KanLayer can be shared between threads
        fn assert_sync<T: Sync>() {}
        assert_sync::<KanLayer>();
    }
}
1559}