// simple_ml/lib.rs

use math::round;
use rand::*;

// use crate::lib_matrix::*;
5
6/*
7SOURCE
8------
9Activation from : https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html
10Neuron : nnfs
11
12DESCRIPTION
13-----------------------------------------
14STRUCTS
15-------
161. LayerDetails : To create a layer of n_neurons and n_inputs
    > 1. create_weights : To randomly generate n_neurons weights between -1 and 1
    > 2. create_bias : A constant number (can be modified if required) vector of n_neurons as bias
    > 3. output_of_layer : activation_function((inputs*weights)+bias)
20
21FUNCTIONS
22---------
1. activation_leaky_relu :
    > 1. &Vec<T> to be used as input to function
    > 2. alpha to control the function's "leaky" nature
    = 1. Modified Vec<T>
2. activation_relu :
    > 1. &Vec<T> to be used as input to function
    = 1. Modified Vec<T>
3. activation_sigmoid :
    > 1. &Vec<T> to be used as input to function
    = 1. Modified Vec<T>
4. activation_tanh :
    > 1. &Vec<T> to be used as input to function
    = 1. Modified Vec<T>
36*/
/// Describes one fully-connected layer of a neural network.
pub struct LayerDetails {
    /*
    To create layers of a neural network
    n_inputs : number of inputs to the layer
    n_neurons : number of neurons in the layer
    NOTE(review): n_inputs is usize while n_neurons is i32 — inconsistent
    integer types (n_neurons gets cast to usize where a count is needed).
    Unifying them would change the public fields, so callers must be
    checked first.
    */
    pub n_inputs: usize,
    pub n_neurons: i32,
}
impl LayerDetails {
    /// Builds the layer's weight matrix: n_inputs rows x n_neurons columns
    /// (the transposed layout — see the inline note), each entry drawn
    /// uniformly from (-1, 1) and rounded up to 3 decimal places.
    pub fn create_weights(&self) -> Vec<Vec<f64>> {
        /*
        random weights between -1 and 1, for optimization, assigned to each neuron and input
        */
        let mut rng = rand::thread_rng();
        let mut weight: Vec<Vec<f64>> = vec![];
        // this gives transposed weights
        for _ in 0..self.n_inputs {
            weight.push(
                (0..self.n_neurons)
                    // round::ceil keeps 3 decimal places, rounding toward +infinity
                    .map(|_| round::ceil(rng.gen_range(-1., 1.), 3))
                    .collect(),
            );
        }
        weight
    }
    /// Returns a bias vector of length n_neurons with every entry set to `value`.
    pub fn create_bias(&self, value: f64) -> Vec<f64> {
        /*
        Initialize a constant value vector of value passed
        Which acts as bias introduced to each neuron of the layer
        */
        let bias = vec![value; self.n_neurons as usize];
        bias
    }
    /// Forward pass: activation(F((input x weights) + bias)), applied row-wise
    /// after transposing the product.
    ///
    /// `f` selects the activation — "relu", "leaky relu", "sigmoid" or "tanh";
    /// any other string panics. `alpha` is consumed only by "leaky relu".
    /// NOTE(review): `bias` is taken by &mut and forwarded to vector_addition,
    /// whose definition is not visible here — confirm whether it mutates its
    /// arguments before restructuring this method.
    pub fn output_of_layer(
        &self,
        input: &Vec<Vec<f64>>,
        weights: &Vec<Vec<f64>>,
        bias: &mut Vec<f64>,
        f: &str,
        alpha: f64,
    ) -> Vec<Vec<f64>> {
        /*
        The inputs are :
        INPUT : [NxM]
        WEIGHTS : [MxN]
        BIAS : [N]
        F: "relu" or "leaky relu" or "sigmoid" or "tanh"
        ALPHA : only if leaky relu is used, else it will be ignored

        The output is [NxN] : F((INPUT*WEIGHTS)+BIAS)
        NOTE(review): the [NxN] shape claim above follows from the [NxM]x[MxN]
        product; for non-square layers verify against matrix_multiplication's
        actual convention.
         */
        // multiply, then transpose so each row lines up with the bias vector
        let mut mat_mul = transpose(&matrix_multiplication(&input, &weights));
        // println!("input * weights = {:?}", mat_mul);
        let mut output: Vec<Vec<f64>> = vec![];
        for i in &mut mat_mul {
            // println!("i*w {:?}, bias {:?}", &i, &bias);
            output.push(vector_addition(i, bias));
        }
        // println!("Before activation it was {:?}", &output[0]);
        // println!("After activation it was {:?}", activation_relu(&output[0]));
        // apply the selected activation to every row
        let mut activated_output = vec![];
        match f {
            "relu" => {
                println!("Alpha is for 'leaky relu' only, it is not taken into account here");
                for i in output.clone() {
                    activated_output.push(activation_relu(&i));
                }
            }
            "leaky relu" => {
                for i in output.clone() {
                    activated_output.push(activation_leaky_relu(&i, alpha));
                }
            }
            "sigmoid" => {
                println!("Alpha is for 'leaky relu' only, it is not taken into account here");
                for i in output.clone() {
                    activated_output.push(activation_sigmoid(&i));
                }
            }
            "tanh" => {
                println!("Alpha is for 'leaky relu' only, it is not taken into account here");
                for i in output.clone() {
                    activated_output.push(activation_tanh(&i));
                }
            }
            _ => panic!("Select from either 'tanh','sigmoid','relu','leaky relu'"),
        }
        // transpose(&activated_output)
        activated_output
    }
}
129
/// ReLU for neurons: element-wise max(x, 0).
///
/// `zero` is obtained by parsing "0" because T carries no numeric-zero bound;
/// this panics for types whose FromStr cannot parse "0" (same as the original).
/// Fixes: the original produced zero as `*x - *x` despite having already
/// parsed `zero`, which required a spurious `Sub` bound (now dropped —
/// a strictly weaker, backward-compatible constraint) and mapped NaN to NaN;
/// NaN now maps to 0 like every other non-positive value.
pub fn activation_relu<T>(input: &Vec<T>) -> Vec<T>
where
    T: Copy + std::cmp::PartialOrd + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    If greater than 0 then x is passed through, else 0
    where x is the values of (input*weights)+bias
    */
    let zero = "0".parse::<T>().unwrap();
    input
        .iter()
        .map(|&x| if x > zero { x } else { zero })
        .collect()
}
146
/// Leaky ReLU for neurons: x if x > 0, otherwise alpha * x, so negative
/// inputs keep a small gradient instead of collapsing to 0 as in plain ReLU.
///
/// `alpha` is converted into T by formatting it and re-parsing, since T has
/// no From<f64> bound. NOTE(review): this conversion panics for integer T
/// with a fractional alpha — confirm intended usage.
pub fn activation_leaky_relu<T>(input: &Vec<T>, alpha: f64) -> Vec<T>
where
    T: Copy + std::cmp::PartialOrd + std::ops::Mul<Output = T> + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    // zero and the slope are materialized once, outside the loop
    let zero = "0".parse::<T>().unwrap();
    let slope = format!("{}", alpha).parse::<T>().unwrap();
    let mut result = Vec::with_capacity(input.len());
    for &x in input.iter() {
        if x > zero {
            result.push(x);
        } else {
            result.push(slope * x);
        }
    }
    result
}
165
/// Sigmoid for neurons: 1 / (1 + e^(-x)), mapping any real x into (0, 1)
/// with sigmoid(0) = 0.5 and sigmoid(x) -> 1 as x -> +inf.
///
/// Fixes: the original computed 1 / (1 + e^(x)), which is sigmoid(-x)
/// (equivalently 1 - sigmoid(x)) — large positive activations were mapped
/// toward 0 instead of 1, the opposite of the standard sigmoid this library's
/// header documents. Each value is now also parsed only once.
/// NOTE(review): T is converted to f64 by round-tripping through its Debug
/// formatting (T has no Into<f64> bound); this assumes Debug prints a plain
/// numeric literal — true for the primitive numeric types.
pub fn activation_sigmoid<T>(input: &Vec<T>) -> Vec<f64>
where
    T: std::str::FromStr + std::fmt::Debug,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    1/(1+(e^-x))
    where x is the values of (input*weights)+bias
    */
    input
        .iter()
        .map(|x| {
            let v = format!("{:?}", x).parse::<f64>().unwrap();
            1. / (1. + (-v).exp())
        })
        .collect()
}
181
/// TanH for neurons: (e^x - e^-x) / (e^x + e^-x), i.e. `f64::tanh`.
///
/// Fixes: the original re-formatted and re-parsed the same element four
/// times to evaluate the ratio by hand; each value is now converted once
/// and passed to the standard-library `tanh`, which computes the identical
/// function.
/// NOTE(review): T is converted to f64 by round-tripping through its Debug
/// formatting (T has no Into<f64> bound); this assumes Debug prints a plain
/// numeric literal — true for the primitive numeric types.
pub fn activation_tanh<T>(input: &Vec<T>) -> Vec<f64>
where
    T: std::str::FromStr + std::fmt::Debug,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    input
        .iter()
        .map(|x| {
            let v = format!("{:?}", x).parse::<f64>().unwrap();
            v.tanh()
        })
        .collect()
}
198
199/*
200DESCRIPTION
201-----------------------------------------
202STRUCTS
203-------
2041. OLS : file_path: String, target: usize, // target column number , pub test_size: f64
205    > fit
206
2072. BLR : file_path: String, test_size: f64, target_column: usize, learning_rate: f64, iter_count: u32, binary_threshold: f64,
208    > fit
209    > sigmoid
210    > log_loss
211    > gradient_descent
212    > change_in_loss
213    > predict
214
2153. KNN : file_path: String, test_size: f64, target_column: usize, k: usize, method: &'a str
216    > fit
217    x predict
218
2194. Distance : row1: Vec<f64>, row2: Vec<f64>
220    > distance_euclidean
221    > distance_manhattan
222    > distance_cosine
223    > distance_chebyshev
224
2255. Kmeans : file_path: String, k: usize, iterations: u32
226    > fit
227
2286. SSVM : file_path: String,  drop_column_number: Vec<usize>,test_size: f64,  learning_rate: f64,  iter_count: i32,  reg_strength: f64
229    > fit
230    x sgd
231    x compute_cost
232    x calculate_cost_gradient
233    x predict
234
235
236FUNCTIONS
237---------
2381. coefficient : To find slope(b1) and intercept(b0) of a line
239> 1. list1 : A &Vec<T>
240> 2. list2 : A &Vec<T>
241= 1. b0
242= 2. b1
243
2. convert_and_impute : To convert type and replace missing values with a constant input
> 1. list : A &Vec<String> to be converted to a different type
> 2. to : A value which provides the type(U) to be converted to
> 3. impute_with : A value(U) to be swapped with missing elements of the same type as "to"
= 1. Result with Vec<U> and Error propagated
= 2. A Vec<usize> to show the list of indexes where values were missing
250
2513. covariance :
252> 1. list1 : A &Vec<T>
253> 2. list2 : A &Vec<T>
254= 1. f64
255
4. impute_string :
> 1. list : A &mut Vec<String> to be imputed
> 2. impute_with : A value(U) to be swapped with missing elements of the same type as "to"
= 1. A Vec<&str> with missing values replaced
260
2615. mean :
262> 1. list : A &Vec<T>
263= 1. f64
264
2656. read_csv :
266> 1. path : A String for file path
267> 2. columns : number of columns to be converted to
268= 1. HashMap<String,Vec<String>) as a table with headers and its values in vector
269
2707. root_mean_square :
271> 1. list1 : A &Vec<T>
272> 2. list2 : A &Vec<T>
273= 1. f64
274
2758. simple_linear_regression_prediction : // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
276> 1. train : A &Vec<(T,T)>
277> 2. test : A &Vec<(T,T)>
278    = 1. Vec<T>
279
2809. variance :
281    > 1. list : A &Vec<T>
282    = 1. f64
283
28410. convert_string_categorical :
285    > 1. list : A &Vec<T>
286    > 2. extra_class : bool if true more than 10 classes else less
287    = Vec<usize>
288
28911. min_max_scaler : between [0.,1.]
290    > 1. list: A &Vec<f64>
291    = Vec<f64>
292
29312. logistic_function_f : sigmoid function
294    > 1. matrix: A &Vec<Vec<f64>>
295    > 2. beta: A &Vec<Vec<f64>>
296    = Vec<Vec<f64>>
297
29813. log_gradient_f :  logistic gradient function
299    > 1. matrix1: A &Vec<Vec<f64>>
300    > 2. beta: A &Vec<Vec<f64>> // same shape as matrix1
301    > 3. matrix2: A &Vec<f64> // target
302    = Vec<Vec<f64>>
303
30414. logistic_predict :
305    1. > matrix1: &Vec<Vec<f64>>
306    2. > beta: &Vec<Vec<f64>>
307    = Vec<Vec<f64>>
308
30915. randomize_vector :
310    1. > rows : &Vec<T>
311    = Vec<T>
312
31316. randomize :
314    1. > rows : &Vec<Vec<T>>
315    = Vec<Vec<T>>
316
31717. train_test_split_vector_f :
318    1. > input: &Vec<f64>
319    2. > percentage: f64
320    = Vec<f64>
321    = Vec<f64>
322
32318. train_test_split_f :
324    1. > input: &Vec<Vec<f64>>
325    2. > percentage: f64
326    = Vec<Vec<f64>>
327    = Vec<Vec<f64>>
328
32919. correlation :
330    1. > list1: &Vec<T>
331    2. > list2: &Vec<T>
332    3. > name: &str // 's' spearman, 'p': pearson
333    = f64
334
33520. std_dev :
336    1. > list1: &Vec<T>
337    = f64
338
33921. spearman_rank : Spearman ranking
340    1. > list1: &Vec<T>
341    = Vec<(T, f64)>
342
34322. how_many_and_where_vector :
344    1. > list: &Vec<T>
345    2. > number: T  // to be searched
346    = Vec<usize>
347
34823. how_many_and_where :
349    1. > list: &Vec<Vec<T>>
350    2. > number: T  // to be searched
351    = Vec<(usize,usize)>
352
35324. z_score :
354    1. > list: &Vec<T>
355    2. > number: T
356    = f64
357
35825. one_hot_encoding :
359    1. > column: &Vec<&str>
360     = Vec<Vec<u8>>
361
36226. shape : shows #rowsx#columns
363    1. m: &Vec<Vec<f64>>
364    = ()
365
36627. rmse
367    1. test_data: &Vec<Vec<f64>>
368    2. predicted: &Vec<f64>)
369    = f64
370
37128. mse
372    1. test_data: &Vec<Vec<f64>>
373    2. predicted: &Vec<f64>
374    = f64
375
37629. mae
377    1. test_data: &Vec<Vec<f64>>
378    2. predicted: &Vec<f64>
379    = f64
380
38130. r_square
382    1. predicted: &Vec<f64>
383    2. actual: &Vec<f64>, features: usize
384    = (f64, f64)
385
38631. mape
387    1. test_data: &Vec<Vec<f64>>
388    2. predicted: &Vec<f64>
389    = f64
390
39132. drop_column
392    1. matrix: &Vec<Vec<f64>>
393    2. column_number
394    = Vec<Vec<f64>>
395
39633. preprocess_train_test_split
397    1. matrix: &Vec<Vec<f64>>,
398    2. test_percentage: f64,
399    3 .target_column: usize,
400    4 .preprocess: &str : "s","m"
401     = (Vec<Vec<f64>>, Vec<f64>, Vec<Vec<f64>>, Vec<f64>) : x_train, y_train, x_test, y_test
402
40334. standardize_vector_f
404    1. list: &Vec<f64>
405     = Vec<f64>
406
40735. min_max_scaler
408    1. list: &Vec<f64>
409    = Vec<f64>
410
41136. float_randomize
412    1. matrix: &Vec<Vec<String>>
413    = Vec<Vec<f64>>
414
41537. confuse_me
416    1. predicted: &Vec<f64>
417    2. actual: &Vec<f64>
418     = ()
419
42038. cv
421    1. data : &Vec<Vec<T>>
422    2. k : usize
423    = (Vec<Vec<T>>,Vec<Vec<T>>)
424
42539. z_outlier_f
426    1. list : &Vec<f64>
427    = Vec<f64>
428
42940. percentile_f
430    1. list:&Vec<f64>
431    2. percentile:u32)
432    = f64
433
43441. quartile_f
435    1. list:&Vec<f64>
436
437*/
438
439// use crate::lib_matrix;
440// use lib_matrix::*;
441
/// Ordinary Least Squares linear regression, driven by a csv file.
/// `fit` reads the file, solves the normal equation and prints the
/// coefficients plus error metrics on the held-out test split.
pub struct OLS {
    pub file_path: String,
    pub target: usize, // target column number (1-based — see the indexing in fit)
    pub test_size: f64, // fraction of rows held out for testing, e.g. 0.3
}

impl OLS {
    /// Fits beta = inv(X'X)(X'y) on a random train split and prints
    /// coefficients, RMSE, MSE, MAE, MAPE and (adjusted) R2 on the test
    /// split. Prints only; returns nothing.
    pub fn fit(&self) {
        /*
        Source:
        Video: https://www.youtube.com/watch?v=K_EH2abOp00
        Book: Trevor Hastie,  Robert Tibshirani, Jerome Friedman - The Elements of  Statistical Learning_  Data Mining, Inference, and Pred
        Article: https://towardsdatascience.com/regression-an-explanation-of-regression-metrics-and-what-can-go-wrong-a39a9793d914#:~:text=Root%20Mean%20Squared%20Error%3A%20RMSE,value%20predicted%20by%20the%20model.&text=Mean%20Absolute%20Error%3A%20MAE%20is,value%20predicted%20by%20the%20model.
        Library:

        TODO:
        * Whats the role of gradient descent in this?
        * rules of regression
        * p-value
        * Colinearity
        */

        // read a csv file
        let (columns, values) = read_csv(self.file_path.clone()); // output is row wise
                                                                  // assuming the last column has the value to be predicted
        // NOTE(review): columns is indexed with target-1 (1-based) here, but
        // drop_column below receives self.target unadjusted — confirm
        // drop_column's indexing convention matches.
        println!(
            "The target here is header named: {:?}",
            columns[self.target - 1]
        );

        // // converting vector of string to vector of f64s
        // NOTE(review): filtering out empty strings can leave ragged rows if
        // only some cells of a row are empty.
        let random_data = randomize(&values)
            .iter()
            .map(|a| {
                a.iter()
                    .filter(|b| **b != "".to_string())
                    .map(|b| b.parse::<f64>().unwrap())
                    .collect::<Vec<f64>>()
            })
            .collect::<Vec<Vec<f64>>>();
        // splitting it into train and test as per test percentage passed as parameter to get scores
        let (train_data, test_data) = train_test_split_f(&random_data, self.test_size);
        shape("Training data", &train_data);
        shape("Testing data", &test_data);

        // converting rows to vector of columns of f64s

        // println!("{:?}",train_data );
        shape("Training data", &train_data);
        let actual_train = row_to_columns_conversion(&train_data);
        // println!(">>>>>");
        let x = drop_column(&actual_train, self.target);
        // // the read columns are in transposed form already, so creating vector of features X and adding 1 in front of it for b0
        let b0_vec: Vec<Vec<f64>> = vec![vec![1.; x[0].len()]]; //[1,1,1...1,1,1]
        let X = [&b0_vec[..], &x[..]].concat(); // [1,1,1...,1,1,1]+X
                                                // shape(&X);
        let xt = MatrixF { matrix: X };

        // and vector of targets y
        let y = vec![actual_train[self.target - 1].to_vec()];
        // print_a_matrix(
        //     "Features",
        //     &xt.matrix.iter().map(|a| a[..6].to_vec()).collect(),
        // );
        // print_a_matrix("Target", &y);

        /*
        beta = np.linalg.inv(X.T@X)@(X.T@y)
         */

        // (X.T@X)
        let xtx = MatrixF {
            matrix: matrix_multiplication(&xt.matrix, &transpose(&xt.matrix)),
        };
        // println!("{:?}", MatrixF::inverse_f(&xtx));
        let slopes = &matrix_multiplication(
            &MatrixF::inverse_f(&xtx), // np.linalg.inv(X.T@X)
            &transpose(&vec![matrix_vector_product_f(&xt.matrix, &y[0])]), //(X.T@y)
        )[0];

        // combining column names with coefficients
        // (slopes[0] is the intercept b0, so names pair with slopes[1..])
        let output: Vec<_> = columns[..columns.len() - 1]
            .iter()
            .zip(slopes[1..].iter())
            .collect();
        // println!("****************** Without Gradient Descent ******************");
        // NOTE(review): the percentage printed is self.test_size*100 — the
        // TEST share, although the model is fitted on the training share.
        println!(
        "\n\nThe coeficients of a columns as per simple linear regression on {:?}% of data is : \n{:?} and b0 is : {:?}",
        self.test_size * 100.,
        output,
        slopes[0]
    );

        // predicting the values for test features
        // multiplying each test feature row with corresponding slopes to predict the dependent variable
        // NOTE(review): each test row still contains the target column, yet it
        // is zipped against slopes[1..] — verify the target column is meant to
        // be included here, otherwise predictions are misaligned.
        let mut predicted_values = vec![];
        for i in test_data.iter() {
            predicted_values.push({
                let value = i
                    .iter()
                    .zip(slopes[1..].iter())
                    .map(|(a, b)| (a * b))
                    .collect::<Vec<f64>>();
                value.iter().fold(slopes[0], |a, b| a + b) // b0+b1x1+b2x2..+bnxn
            });
        }

        println!("RMSE : {:?}", rmse(&test_data, &predicted_values));
        println!("MSE : {:?}", mse(&test_data, &predicted_values)); // cost function
        println!("MAE : {:?}", mae(&test_data, &predicted_values));
        println!("MAPE : {:?}", mape(&test_data, &predicted_values));
        println!(
            "R2 and adjusted R2 : {:?}",
            // NOTE(review): columns.len() is passed as the feature count even
            // though one column is the target — confirm r_square's contract.
            r_square(
                &test_data
                    .iter()
                    .map(|a| a[test_data[0].len() - 1])
                    .collect(), // passing only the target values
                &predicted_values,
                columns.len(),
            )
        );

        println!();
        println!();
    }
}
569
/// Binary Logistic Regression trained by batch gradient steps on a csv file.
pub struct BLR {
    pub file_path: String,     // pointing to a csv or txt file
    pub test_size: f64,        // ex: .30 => random 30% of data become test
    pub target_column: usize,  // column index which has to be classified
    pub learning_rate: f64,    // gradient descent step size ex: 0.1, 0.05 etc
    pub iter_count: u32,       // how many epochs ex: 10000
    pub binary_threshold: f64, // at what probability will the class be determined ex: 0.6 => anything above 0.6 is 1
}
impl BLR {
    /// Reads the csv, splits train/test, runs iter_count gradient steps and
    /// prints a confusion matrix for the test split. Prints only; returns
    /// nothing.
    pub fn fit(&self) {
        /*
            Source:
            Video:
            Book: Trevor Hastie,  Robert Tibshirani, Jerome Friedman - The Elements of  Statistical Learning_  Data Mining, Inference, and Pred
            Article: https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
            Library:
        */

        // read a csv file
        let (_, values) = read_csv(self.file_path.clone()); // output is row wise

        // converting vector of string to vector of f64s
        let random_data = float_randomize(&values);

        // splitting it into train and test as per test percentage passed as parameter to get scores
        let (x_train, y_train, x_test, y_test) =
            preprocess_train_test_split(&random_data, self.test_size, self.target_column, "");

        shape("Training features", &x_train);
        shape("Test features", &x_test);
        println!("Training target: {:?}", &y_train.len());
        println!("Test target: {:?}", &y_test.len());

        // now to the main part
        // x_train is column-major: x_train.len() = features, x_train[0].len() = samples
        let length = x_train[0].len();
        let feature_count = x_train.len();
        // let class_count = (unique_values(&y_test).len() + unique_values(&y_test).len()) / 2;
        // prepend a row of 1s so coefficients[0] acts as the intercept
        let intercept = vec![vec![1.; length]];
        let new_x_train = [&intercept[..], &x_train[..]].concat();
        let mut coefficients = vec![0.; feature_count + 1];

        // NOTE(review): cost is accumulated every step but never read —
        // presumably intended for plotting/early stopping; verify.
        let mut cost = vec![];
        print!("Reducing loss ...");
        for _ in 0..self.iter_count {
            let s = BLR::sigmoid(&new_x_train, &coefficients);
            cost.push(BLR::log_loss(&s, &y_train));
            let gd = BLR::gradient_descent(&new_x_train, &s, &y_train);
            coefficients = BLR::change_in_loss(&coefficients, self.learning_rate, &gd);
        }
        // println!("The intercept is : {:?}", coefficients[0]);
        // println!(
        //     "The coefficients are : {:?}",
        //     columns
        //         .iter()
        //         .zip(coefficients[1..].to_vec())
        //         .collect::<Vec<(&String, f64)>>()
        // );
        let predicted = BLR::predict(&x_test, &coefficients, self.binary_threshold);
        confuse_me(&predicted, &y_test, -1., 1.);
    }

    /// Classifies each test sample: probability > threshold => 1., else 0.
    /// An intercept row of 1s is prepended to mirror the training layout.
    pub fn predict(test_features: &Vec<Vec<f64>>, weights: &Vec<f64>, threshold: f64) -> Vec<f64> {
        let length = test_features[0].len();
        let intercept = vec![vec![1.; length]];
        let new_x_test = [&intercept[..], &test_features[..]].concat();
        let pred = BLR::sigmoid(&new_x_test, weights);
        pred.iter()
            .map(|a| if *a > threshold { 1. } else { 0. })
            .collect()
    }

    /// One parameter update: coeff + lr * gd (element-wise).
    /// NOTE(review): this ADDS the scaled gradient (ascent); textbook descent
    /// subtracts. Combined with the sign convention in `sigmoid` below, the
    /// overall training direction should be verified against a known dataset
    /// before changing either in isolation.
    pub fn change_in_loss(coeff: &Vec<f64>, lr: f64, gd: &Vec<f64>) -> Vec<f64> {
        print!(".");
        if coeff.len() == gd.len() {
            element_wise_operation(coeff, &gd.iter().map(|a| a * lr).collect(), "add")
        } else {
            panic!("The dimensions do not match")
        }
    }

    /// Gradient of the log loss: X (s - y) / n, where s are the current
    /// predicted probabilities and n the sample count.
    pub fn gradient_descent(
        train: &Vec<Vec<f64>>,
        sigmoid: &Vec<f64>,
        y_train: &Vec<f64>,
    ) -> Vec<f64> {
        let part2 = element_wise_operation(sigmoid, y_train, "sub");
        let numerator = matrix_vector_product_f(train, &part2);
        numerator
            .iter()
            .map(|a| *a / (y_train.len() as f64))
            .collect()
    }

    /// Binary cross-entropy: mean(-y*ln(s) - (1-y)*ln(1-s)).
    /// `a.log(1.0_f64.exp())` is log base e, i.e. ln.
    pub fn log_loss(sigmoid: &Vec<f64>, y_train: &Vec<f64>) -> f64 {
        let part11 = sigmoid.iter().map(|a| a.log(1.0_f64.exp())).collect();
        let part12 = y_train.iter().map(|a| a * -1.).collect();
        let part21 = sigmoid
            .iter()
            .map(|a| (1. - a).log(1.0_f64.exp()))
            .collect();
        let part22 = y_train.iter().map(|a| 1. - a).collect();
        let part1 = element_wise_operation(&part11, &part12, "mul");
        let part2 = element_wise_operation(&part21, &part22, "mul");
        mean(&element_wise_operation(&part1, &part2, "sub"))
    }

    /// Per-sample score: 1 / (1 + e^(z)) with z = x·coeff.
    /// NOTE(review): this is sigmoid(-z); the canonical logistic function is
    /// 1 / (1 + e^(-z)). Flipping this sign interacts with the "add" update
    /// in change_in_loss — fix both together and retest, not one alone.
    pub fn sigmoid(train: &Vec<Vec<f64>>, coeff: &Vec<f64>) -> Vec<f64> {
        let z = matrix_vector_product_f(&transpose(train), coeff);
        z.iter().map(|a| 1. / (1. + a.exp())).collect()
    }
}
681
/// K-nearest-neighbour classifier driven by a csv file.
pub struct KNN<'a> {
    pub file_path: String,    // pointing to a csv or txt file
    pub test_size: f64,       // fraction of rows held out for testing
    pub target_column: usize, // column index holding the class label
    pub k: usize,             // number of neighbours consulted per prediction
    pub method: &'a str,      // distance metric: "e", "ma", "co" or "ch"
}
impl<'a> KNN<'a> {
    /// Reads the csv, splits train/test, predicts the test rows with
    /// `predict` and prints a confusion matrix. Prints only; returns nothing.
    pub fn fit(&self) {
        /*
        method : Euclidean or Chebyshev or cosine or Manhattan or Minkowski or Weighted
        (only the first four are implemented in predict below)
        */
        // read a csv file
        let (_, values) = read_csv(self.file_path.clone()); // output is row wise

        // converting vector of string to vector of f64s
        let random_data = float_randomize(&values); // blr needs to be removed

        // splitting it into train and test as per test percentage passed as parameter to get scores
        let (x_train, y_train, x_test, y_test) =
            preprocess_train_test_split(&random_data, self.test_size, self.target_column, ""); // blr needs to be removed

        // now to the main part
        // since it is row wise, conversion
        let train_rows = columns_to_rows_conversion(&x_train);
        let test_rows = columns_to_rows_conversion(&x_test);
        shape("train Rows:", &train_rows);
        shape("test Rows:", &test_rows);
        // println!("{:?}", y_train.len());

        // predicting values
        let predcited = KNN::predict(&train_rows, &y_train, &test_rows, self.method, self.k);
        println!("Metrics");
        confuse_me(
            &predcited.iter().map(|a| *a as f64).collect::<Vec<f64>>(),
            &y_test,
            -1.,
            1.,
        ); // blr needs to be removed
    }

    /// For each test row, computes its distance to every training row using
    /// `method`, keeps the k nearest and returns one class label per test row.
    /// Panics up front on an unknown method.
    /// NOTE(review): `class_found[..k]` panics if k exceeds the number of
    /// training rows — no guard exists.
    fn predict(
        train_rows: &Vec<Vec<f64>>,
        train_values: &Vec<f64>,
        test_rows: &Vec<Vec<f64>>,
        method: &str,
        k: usize,
    ) -> Vec<i32> {
        match method {
            "e" => println!("\n\nCalculating KNN using euclidean distance ..."),
            "ma" => println!("\n\nCalculating KNN using manhattan distance ..."),
            "co" => println!("\n\nCalculating KNN using cosine distance ..."),
            "ch" => println!("\n\nCalculating KNN using chebyshev distance ..."),
            _ => panic!("The method has to be either 'e' or 'ma' or 'co' or 'ch'"),
        };
        let mut predcited = vec![];
        for j in test_rows.iter() {
            // (distance, class) pairs for every training row
            let mut class_found = vec![];
            for (n, i) in train_rows.iter().enumerate() {
                // println!("{:?},{:?},{:?}", j, n, i);
                let dis = Distance {
                    row1: i.clone(),
                    row2: j.clone(),
                };
                match method {
                    "e" => class_found.push((dis.distance_euclidean(), train_values[n])),
                    "ma" => class_found.push((dis.distance_manhattan(), train_values[n])),
                    "co" => class_found.push((dis.distance_cosine(), train_values[n])),
                    "ch" => class_found.push((dis.distance_chebyshev(), train_values[n])),
                    _ => (), // cant happen as it would panic in the previous match
                };
            }
            // sorting ascending the vector by first value of tuple
            class_found.sort_by(|(a, _), (c, _)| (*a).partial_cmp(c).unwrap());
            let k_nearest = class_found[..k].to_vec();
            let knn: Vec<f64> = k_nearest.iter().map(|a| a.1).collect();
            // converting classes to int and classifying
            let nearness = value_counts(&knn.iter().map(|a| *a as i32).collect());
            // finding the closest
            // NOTE(review): next_back() takes the LAST entry of value_counts'
            // iteration order — confirm value_counts orders by count (majority
            // vote) and not merely by class key, or this picks the largest
            // class label rather than the most frequent one.
            predcited.push(*nearness.iter().next_back().unwrap().0)
        }
        predcited
    }
}
766
/// A pair of equal-length numeric rows between which distances are computed.
/// (All metrics zip the two rows, so a length mismatch silently truncates
/// to the shorter row — same as the original implementation.)
pub struct Distance {
    pub row1: Vec<f64>,
    pub row2: Vec<f64>,
}
impl Distance {
    /// Euclidean distance: sqrt(sum((row1 - row2)^2)).
    /// Fixes: the original collected an intermediate Vec and then folded it;
    /// the lazy iterator sum performs the same left-to-right addition without
    /// the allocation.
    pub fn distance_euclidean(&self) -> f64 {
        self.row1
            .iter()
            .zip(self.row2.iter())
            .map(|(a, b)| (a - b) * (a - b))
            .sum::<f64>()
            .sqrt()
    }

    /// Manhattan distance: sum(|row1 - row2|).
    pub fn distance_manhattan(&self) -> f64 {
        self.row1
            .iter()
            .zip(self.row2.iter())
            .map(|(a, b)| (a - b).abs())
            .sum()
    }

    /// Cosine distance: 1 - (row1·row2) / (|row1| * |row2|).
    /// Returns NaN when either row has zero norm (0/0), as before.
    pub fn distance_cosine(&self) -> f64 {
        let dot: f64 = self
            .row1
            .iter()
            .zip(self.row2.iter())
            .map(|(a, b)| a * b)
            .sum();
        let norm1 = self.row1.iter().map(|a| a * a).sum::<f64>().sqrt();
        let norm2 = self.row2.iter().map(|a| a * a).sum::<f64>().sqrt();
        1. - dot / (norm1 * norm2)
    }

    /// Chebyshev distance: max(|row1 - row2|).
    /// The fold is seeded with NaN (the original wrote it as `0. / 0.`):
    /// f64::max(NaN, x) returns x, so any element replaces the seed, and
    /// empty rows still yield NaN exactly as before.
    pub fn distance_chebyshev(&self) -> f64 {
        self.row1
            .iter()
            .zip(self.row2.iter())
            .map(|(a, b)| (a - b).abs())
            .fold(f64::NAN, f64::max)
    }
}
837
838pub struct Kmeans {
839    pub file_path: String,
840    pub k: usize,
841    pub iterations: u32,
842}
843impl Kmeans {
844    pub fn fit(&self) {
845        /*
846            Source:
847            Video:
848            Book: Trevor Hastie,  Robert Tibshirani, Jerome Friedman - The Elements of  Statistical Learning_  Data Mining, Inference, and Pred
849            Article: https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-k-means-clustering/
850            Library:
851
852            ABOUT:
853            * Assuming no duplicate rows exist
854            * Only features and no targets in the input data
855
856            Procedure:
857            1. Prepare data : remove target if any
858            2. Select K centroids
859            3. Find closest points (Eucledian distance)
860            4. Calcualte new mean
861            5. Repeat 3,4 till the same points ened up in the cluster
862
863            TODO:
864            * Add cost function to minimize
865        */
866
867        // read a csv file
868        let (_, values) = read_csv(self.file_path.clone()); // output is row wise
869
870        // converting vector of string to vector of f64s
871        let random_data: Vec<_> = float_randomize(&values);
872
873        // selecting first k points as centroid (already in random order)
874        let mut centroids = randomize(&random_data)[..self.k].to_vec();
875        print_a_matrix("Original means", &centroids);
876
877        let mut new_mean: Vec<Vec<f64>> = vec![];
878        for x in 0..self.iterations - 1 {
879            let mut updated_cluster = vec![];
880            let mut nearest_centroid_number = vec![];
881            for i in random_data.iter() {
882                let mut distance = vec![];
883                for (centroid_number, j) in centroids.iter().enumerate() {
884                    let dis = Distance {
885                        row1: i.clone(),
886                        row2: j.clone(),
887                    };
888                    distance.push((centroid_number, dis.distance_euclidean()))
889                }
890                distance.sort_by(|m, n| m.1.partial_cmp(&n.1).unwrap());
891                nearest_centroid_number.push(distance[0].0);
892            }
893
894            // combining cluster number and data
895            let clusters: Vec<(&usize, &Vec<f64>)> = nearest_centroid_number
896                .iter()
897                .zip(random_data.iter())
898                .collect();
899            // println!("{:?}", clusters);
900
901            // finding new centorid
902            new_mean = vec![];
903            for (m, _) in centroids.iter().enumerate() {
904                let mut group = vec![];
905                for i in clusters.iter() {
906                    if *i.0 == m {
907                        group.push(i.1.clone());
908                    }
909                }
910                new_mean.push(
911                    group
912                        .iter()
913                        .fold(vec![0.; self.k], |a, b| {
914                            element_wise_operation(&a, b, "add")
915                        })
916                        .iter()
917                        .map(|a| a / (group.len() as f64)) // the mean part in K-means
918                        .collect(),
919                );
920                updated_cluster = clusters.clone()
921            }
922            println!("Iteration {:?}", x);
923            if centroids == new_mean {
924                // show in a list of cluster number as per the order of row in original data
925                let mut rearranged_output = vec![];
926                for i in values
927                    .iter()
928                    .map(|a| a.iter().map(|b| b.parse().unwrap()).collect())
929                    .collect::<Vec<Vec<f64>>>()
930                    .iter()
931                {
932                    for (c, v) in updated_cluster.iter() {
933                        if i == *v {
934                            rearranged_output.push((c, v));
935                            break;
936                        }
937                    }
938                }
939                // displaying only the clusters assigned to  each row
940                println!(
941                    "CLUSTERS\n{:?}",
942                    rearranged_output
943                        .iter()
944                        .map(|a| **(a.0))
945                        .collect::<Vec<usize>>()
946                );
947                break;
948            } else {
949                centroids = new_mean.clone();
950            }
951        }
952        print_a_matrix("Final means", &centroids);
953    }
954}
955
pub struct SSVM {
    /*
    Configuration for a linear SVM trained by stochastic gradient descent
    (see the impl block: fit() reads the csv, normalizes, splits, trains, scores).
    The target column is expected last and must hold only the classes 1 and -1.
    */
    pub file_path: String,              // pointing to a csv or txt file
    pub drop_column_number: Vec<usize>, // if first and second column has id and are not required then vec![1,2] else if nothing then vec![]
    pub test_size: f64,                 // ex: .30 => random 30% of data become test
    pub learning_rate: f64,             // gradient descent step size ex: 0.1, 0.05 etc
    pub iter_count: i32,                // how many epochs ex: 10000
    pub reg_strength: f64, // regularization strength: scales the hinge-loss term in compute_cost and the gradient in calculate_cost_gradient (the previous "probability threshold" note did not match its actual use)
}
impl SSVM {
    // https://towardsdatascience.com/svm-implementation-from-scratch-python-2db2fc52e5c2
    // data: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data (changed M:1 and B:2)
    /*
        Assumes all the pre-processing like converting string columns to a number has been done
        Assuming the target column is placed in the end
        Assuming only two classes 1 and -1
    */

    /// Full training pipeline: read csv -> shuffle & parse to f64 -> drop the
    /// configured columns -> min-max scale the feature columns -> train/test
    /// split -> prepend an all-ones intercept feature -> SGD -> predict on the
    /// test split and print a confusion matrix via confuse_me.
    /// Returns the learned weights (intercept weight first, then the features).
    pub fn fit(&self) -> Vec<f64> {
        // read a csv file
        let (columns, values) = read_csv(self.file_path.clone()); // output is row wise

        // converting vector of string to vector of f64s (also shuffles rows;
        // panics on any non-numeric cell — see float_randomize below)
        // println!("___");
        let mut random_data = SSVM::float_randomize(&values);

        println!(
            "The columns are\n{:?}\n",
            columns
                .iter()
                .filter(|a| **a != "\r".to_string())
                .map(|a| a.replace("\"", ""))
                .collect::<Vec<String>>()
        );
        shape("Before dropping columns the dimensions are", &random_data);

        // drop column after converting it to column wise data
        // NOTE(review): the `*i - n` below assumes drop_column_number is sorted
        // ascending, since each earlier drop shifts later column indices left
        // — confirm; also the vec![1,2] example in the struct suggests 1-based
        // column numbers (drop_column's convention is defined elsewhere).
        random_data = row_to_columns_conversion(&random_data);
        if self.drop_column_number.len() > 0 {
            for (n, i) in self.drop_column_number.iter().enumerate() {
                if n == 0 {
                    println!("Dropping column #{}", i);
                    random_data = drop_column(&random_data, *i);
                } else {
                    println!("Dropping column #{}", i);
                    random_data = drop_column(&random_data, *i - n);
                }
            }
        }
        // converting it back to row wise
        random_data = columns_to_rows_conversion(&random_data);

        shape("After dropping columns the dimensions are", &random_data);
        println!();

        head(&random_data, 5);

        // normalizing features in thier columns wise format
        let mut normalized = row_to_columns_conversion(&random_data);

        random_data = row_to_columns_conversion(&random_data);
        for (n, i) in random_data.iter().enumerate() {
            print!(".");
            // the target column is left un-scaled.
            // NOTE(review): subtracting drop_column_number.len() again looks
            // suspicious — `normalized` was built AFTER the drops, so the last
            // column should already be at normalized.len() - 1. TODO confirm.
            if n != normalized.len() - 1 - self.drop_column_number.len() {
                normalized[n] = min_max_scaler(i);
            } else {
                normalized[n] = i.clone();
            }
        }
        println!("\nAfter normalizing:");

        // converting it back to row wise
        normalized = columns_to_rows_conversion(&normalized);
        head(&normalized, 5);
        println!();

        // splitting it into train and test as per test percentage passed as parameter to get scores
        let (mut x_train, y_train, mut x_test, y_test) =
            preprocess_train_test_split(&normalized, self.test_size, normalized[0].len(), "");

        // adding intercept column to feature (prepended as the first "column",
        // while the data is still column-major; fixed by the conversion below)
        let mut length = x_train[0].len();
        let intercept = vec![vec![1.; length]];
        x_train = [&intercept[..], &x_train[..]].concat();
        length = x_test[0].len();
        let intercept = vec![vec![1.; length]];
        x_test = [&intercept[..], &x_test[..]].concat();

        // converting into proper shape (row-major samples)
        x_train = columns_to_rows_conversion(&x_train);
        x_test = columns_to_rows_conversion(&x_test);

        // checking the shapes
        shape("Training features", &x_train);
        shape("Test features", &x_test);
        println!("Training target: {:?}", &y_train.len());
        println!("Test target: {:?}", &y_test.len());

        let weights = SSVM::sgd(&self, &x_train, &y_train);
        let predictions = SSVM::predict(&self, &x_test, &weights);
        confuse_me(&predictions, &y_test, -1., 1.);
        println!("Weights of intercept followed by features : {:?}", weights);
        weights
    }

    /// Stochastic gradient descent on the hinge-loss objective.
    /// Shuffles the sample order every epoch and takes one gradient step per
    /// sample. On every 2^nth epoch (and the last one) the full cost is
    /// evaluated; training stops early once the relative cost improvement
    /// drops below per_cost_threshold (1%).
    fn sgd(&self, features: &Vec<Vec<f64>>, output: &Vec<f64>) -> Vec<f64> {
        let max_epoch: i32 = self.iter_count;
        let mut weights = vec![0.; features[0].len()]; // start from all-zero weights
        let mut nth = 0.;
        let mut prev_cost = std::f64::INFINITY;
        let per_cost_threshold = 0.01;
        for epoch in 1..max_epoch {
            // shuffling inputs
            if epoch % 100 == 0 {
                print!(".."); // progress heartbeat
            }
            let order = randomize_vector(&(0..output.len()).map(|a| a).collect());
            let mut x = vec![];
            let mut y = vec![];
            for i in order.iter() {
                x.push(features[*i].clone());
                y.push(output[*i]);
            }

            // calculating cost: one descent step per (shuffled) sample
            for (n, i) in x.iter().enumerate() {
                let ascent = SSVM::calculate_cost_gradient(&self, &weights, i, y[n]);
                weights = element_wise_operation(
                    &weights,
                    &ascent.iter().map(|a| a * self.learning_rate).collect(),
                    "sub",
                );
            }
            // println!("Ascent {:?}", weights);

            // evaluate the objective only at epochs 1, 2, 4, 8, ... and the last
            if epoch == 2f64.powf(nth) as i32 || epoch == max_epoch - 1 {
                let cost = SSVM::compute_cost(&self, &weights, features, output);
                println!("{} Epoch, has cost {}", epoch, cost);
                // early stop when the relative improvement is below threshold
                if (prev_cost - cost).abs() < (per_cost_threshold * prev_cost) {
                    println!("{:?}", weights);
                    return weights;
                }
                prev_cost = cost;
                nth += 1.;
            }
        }
        // println!();
        weights
    }

    /// SVM objective: ||w||^2 / 2 + reg_strength * mean(hinge loss), where the
    /// per-sample hinge loss is max(0, 1 - y * (w . x)).
    fn compute_cost(&self, weight: &Vec<f64>, x: &Vec<Vec<f64>>, y: &Vec<f64>) -> f64 {
        // hinge loss
        let mut distance = element_wise_operation(&matrix_vector_product_f(x, weight), &y, "mul");
        // println!("{:?}", &matrix_vector_product_f(x, weight).len());
        // println!("Loss {:?}", distance);
        distance = distance.iter().map(|a| 1. - *a).collect();
        distance = distance
            .iter()
            .map(|a| if *a > 0. { *a } else { 0. })
            .collect();
        let hinge_loss =
            self.reg_strength * (distance.iter().fold(0., |a, b| a + b) / (x.len() as f64));
        (dot_product(&weight, &weight) / 2.) + hinge_loss
    }

    /// Per-sample gradient of the objective for one (x, y) pair.
    /// Margin satisfied (distance < 0): returns a zero vector.
    /// NOTE(review): the referenced python implementation returns `weight`
    /// itself in that branch (gradient of the ||w||^2/2 term) — confirm the
    /// zero vector here is intentional.
    /// Margin violated: returns w - reg_strength * y * x.
    fn calculate_cost_gradient(
        &self,
        weight: &Vec<f64>,
        x_batch: &Vec<f64>,
        y_batch: f64,
    ) -> Vec<f64> {
        let distance = 1. - (dot_product(&x_batch, &weight) * y_batch);
        // println!("Distance {:?}", distance);
        let mut dw = vec![0.; weight.len()];
        let di;
        if distance < 0. {
            di = dw.clone();
        } else {
            let second_half = x_batch
                .iter()
                .map(|a| a * self.reg_strength * y_batch)
                .collect();
            di = element_wise_operation(weight, &second_half, "sub");
        }
        // adding the zero vector leaves di unchanged
        dw = element_wise_operation(&di, &dw, "add");
        // println!("di : {:?}", dw);
        dw
    }

    /// Classifies each test row by the sign of w . x: > 0 -> 1., else -1.
    fn predict(&self, test_features: &Vec<Vec<f64>>, weights: &Vec<f64>) -> Vec<f64> {
        let mut output = vec![];
        for i in test_features.iter() {
            if dot_product(i, weights) > 0. {
                output.push(1.);
            } else {
                output.push(-1.);
            }
        }
        println!("Predications : {:?}", output);
        output
    }

    /// Strips '\r' and '\n' from every cell, parses everything to f64 (panics
    /// on a non-numeric cell) and returns the rows in shuffled order.
    fn float_randomize(matrix: &Vec<Vec<String>>) -> Vec<Vec<f64>> {
        randomize(
            &matrix
                .iter()
                .map(|a| {
                    a.iter()
                        .map(|b| {
                            (b).replace("\r", "")
                                .replace("\n", "")
                                .parse::<f64>()
                                .unwrap()
                        })
                        .collect::<Vec<f64>>()
                })
                .collect::<Vec<Vec<f64>>>(),
        )
    }
}
1174
pub fn mean<T>(list: &Vec<T>) -> f64
where
    T: std::iter::Sum<T>
        + std::ops::Div<Output = T>
        + Copy
        + std::string::ToString
        + std::ops::Add<T, Output = T>
        + std::fmt::Debug
        + std::fmt::Display
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Arithmetic mean of `list`, returned as f64.
    The sum and the division are performed in T, so for integer T the result is
    truncated BEFORE the conversion to f64 (e.g. mean of [1,2,3,4] as i32 is 2.0).
    Conversions go through string parsing to stay generic over T.
    Panics for integer T on an empty list (division by zero); for float T an
    empty list yields NaN.
    (Fix: the `std::str::FromStr` bound was previously listed twice.)
    */
    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
    let zero: T = "0".parse().unwrap();
    let length: T = list.len().to_string().parse().unwrap();
    (list.iter().fold(zero, |acc, x| acc + *x) / length)
        .to_string()
        .parse()
        .unwrap()
}
1197
1198pub fn variance<T>(list: &Vec<T>) -> f64
1199where
1200    T: std::iter::Sum<T>
1201        + std::ops::Div<Output = T>
1202        + std::marker::Copy
1203        + std::fmt::Display
1204        + std::ops::Sub<T, Output = T>
1205        + std::ops::Add<T, Output = T>
1206        + std::ops::Mul<T, Output = T>
1207        + std::fmt::Debug
1208        + std::string::ToString
1209        + std::str::FromStr,
1210    <T as std::str::FromStr>::Err: std::fmt::Debug,
1211{
1212    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
1213    let zero: T = "0".parse().unwrap();
1214    let mu = mean(list);
1215    let _len_str: T = list.len().to_string().parse().unwrap(); // is division is required
1216    let output: Vec<_> = list
1217        .iter()
1218        .map(|x| (*x - mu.to_string().parse().unwrap()) * (*x - mu.to_string().parse().unwrap()))
1219        .collect();
1220    // output
1221    let variance = output.iter().fold(zero, |a, b| a + *b); // / len_str;
1222    variance.to_string().parse().unwrap()
1223}
1224
1225pub fn covariance<T>(list1: &Vec<T>, list2: &Vec<T>) -> f64
1226where
1227    T: std::iter::Sum<T>
1228        + std::ops::Div<Output = T>
1229        + std::fmt::Debug
1230        + std::fmt::Display
1231        + std::ops::Add
1232        + std::marker::Copy
1233        + std::ops::Add<T, Output = T>
1234        + std::ops::Sub<T, Output = T>
1235        + std::ops::Mul<T, Output = T>
1236        + std::string::ToString
1237        + std::str::FromStr,
1238    <T as std::str::FromStr>::Err: std::fmt::Debug,
1239{
1240    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
1241    let mu1 = mean(list1);
1242    let mu2 = mean(list2);
1243    let zero: T = "0".parse().unwrap();
1244    let _len_str: f64 = list1.len().to_string().parse().unwrap(); // if division is required
1245    let tupled: Vec<_> = list1.iter().zip(list2).collect();
1246    let output = tupled.iter().fold(zero, |a, b| {
1247        a + ((*b.0 - mu1.to_string().parse().unwrap()) * (*b.1 - mu2.to_string().parse().unwrap()))
1248    });
1249    let numerator: f64 = output.to_string().parse().unwrap();
1250    numerator // / _len_str  // (this is not being divided by populaiton size)
1251}
1252
1253pub fn coefficient<T>(list1: &Vec<T>, list2: &Vec<T>) -> (f64, f64)
1254where
1255    T: std::iter::Sum<T>
1256        + std::ops::Div<Output = T>
1257        + std::fmt::Debug
1258        + std::fmt::Display
1259        + std::ops::Add
1260        + std::marker::Copy
1261        + std::ops::Add<T, Output = T>
1262        + std::ops::Sub<T, Output = T>
1263        + std::ops::Mul<T, Output = T>
1264        + std::str::FromStr,
1265    <T as std::str::FromStr>::Err: std::fmt::Debug,
1266{
1267    /*
1268    To find slope and intercept of a line
1269    */
1270    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
1271    let b1 = covariance(list1, list2) / variance(list1);
1272    let b0 = mean(list2) - (b1 * mean(list1));
1273    (b0.to_string().parse().unwrap(), b1)
1274}
1275
1276pub fn simple_linear_regression_prediction<T>(train: &Vec<(T, T)>, test: &Vec<(T, T)>) -> Vec<T>
1277where
1278    T: std::iter::Sum<T>
1279        + std::ops::Div<Output = T>
1280        + std::fmt::Debug
1281        + std::fmt::Display
1282        + std::ops::Add
1283        + std::marker::Copy
1284        + std::ops::Add<T, Output = T>
1285        + std::ops::Sub<T, Output = T>
1286        + std::ops::Mul<T, Output = T>
1287        + std::str::FromStr,
1288    <T as std::str::FromStr>::Err: std::fmt::Debug,
1289{
1290    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
1291    let train_features = &train.iter().map(|a| a.0).collect();
1292    let test_features = &test.iter().map(|a| a.1).collect();
1293    let (offset, slope) = coefficient(train_features, test_features);
1294    let b0: T = offset.to_string().parse().unwrap();
1295    let b1: T = slope.to_string().parse().unwrap();
1296    let predicted_output = test.iter().map(|a| b0 + b1 * a.0).collect();
1297    let original_output: Vec<_> = test.iter().map(|a| a.0).collect();
1298    println!("========================================================================================================================================================");
1299    println!("b0 = {:?} and b1= {:?}", b0, b1);
1300    println!(
1301        "RMSE: {:?}",
1302        root_mean_square(&predicted_output, &original_output)
1303    );
1304    predicted_output
1305}
1306
pub fn root_mean_square<T>(list1: &Vec<T>, list2: &Vec<T>) -> f64
where
    T: std::ops::Sub<T, Output = T>
        + Copy
        + std::ops::Mul<T, Output = T>
        + std::ops::Add<T, Output = T>
        + std::ops::Div<Output = T>
        + std::string::ToString
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Root mean squared error between paired entries of the two lists.
    The mean squared error is accumulated and divided in T (so integer T
    truncates), then parsed to f64 for the square root.
    */
    // https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/
    let zero: T = "0".parse().unwrap();
    let count: T = list1.len().to_string().parse().unwrap();
    let sum_of_squares = list1
        .iter()
        .zip(list2.iter())
        .fold(zero, |acc, (p, q)| acc + ((*q - *p) * (*q - *p)));
    let mse: f64 = (sum_of_squares / count).to_string().parse().unwrap();
    mse.powf(0.5)
}
1329
1330// reading in files for multi column operations
1331use std::collections::HashMap;
1332use std::fs;
pub fn read_csv(path: String) -> (Vec<String>, Vec<Vec<String>>) {
    /*
    Reads a CSV file and returns (headers, rows) with every cell kept as a String.
    The first non-empty line is the header row.
    Fixes: removed the unused lifetime parameter `<'a>`, and empty lines are now
    skipped — previously a trailing '\n' produced a phantom `vec![""]` data row,
    which made downstream `parse::<f64>().unwrap()` callers panic. The printed
    row count already excluded that phantom row; now count and data agree.
    Panics if the file cannot be read.
    */
    println!("Reading the file ...");
    let file = fs::read_to_string(&path).unwrap();
    // keep only non-empty lines ("\r"-only lines from CRLF files are kept, as before)
    let splitted: Vec<&str> = file.split('\n').filter(|line| !line.is_empty()).collect();
    let rows: i32 = (splitted.len() - 1) as i32; // data rows, header excluded
    println!("Number of rows = {}", rows);
    let table: Vec<Vec<_>> = splitted.iter().map(|a| a.split(',').collect()).collect();
    let columns: Vec<String> = table[0].iter().map(|a| a.to_string()).collect();
    let values = table[1..]
        .iter()
        .map(|a| a.iter().map(|b| b.to_string()).collect())
        .collect();
    (columns, values)
}
1351
1352use std::io::Error;
1353pub fn convert_and_impute<U>(
1354    list: &Vec<String>,
1355    to: U,
1356    impute_with: U,
1357) -> (Result<Vec<U>, Error>, Vec<usize>)
1358where
1359    U: std::cmp::PartialEq + Copy + std::marker::Copy + std::string::ToString + std::str::FromStr,
1360    <U as std::str::FromStr>::Err: std::fmt::Debug,
1361{
1362    /*
1363    Convert a vector to a type by passing a value of that type and pass a value to replace missing values
1364    */
1365    println!("========================================================================================================================================================");
1366    // takes string input and converts it to int or float
1367    let mut output: Vec<_> = vec![];
1368    let mut missing = vec![];
1369    match type_of(to) {
1370        "f64" => {
1371            for (n, i) in list.iter().enumerate() {
1372                if *i != "" {
1373                    let x = i.parse::<U>().unwrap();
1374                    output.push(x);
1375                } else {
1376                    output.push(impute_with);
1377                    missing.push(n);
1378                    println!("Error found in {}th position of the vector", n);
1379                }
1380            }
1381        }
1382        "i32" => {
1383            for (n, i) in list.iter().enumerate() {
1384                if *i != "" {
1385                    let string_splitted: Vec<_> = i.split(".").collect();
1386                    let ones_digit = string_splitted[0].parse::<U>().unwrap();
1387                    output.push(ones_digit);
1388                } else {
1389                    output.push(impute_with);
1390                    missing.push(n);
1391                    println!("Error found in {}th position of the vector", n);
1392                }
1393            }
1394        }
1395        _ => println!("This type conversion cant be done, choose either int or float type\n Incase of string conversion, use impute_string"),
1396    }
1397
1398    (Ok(output), missing)
1399}
1400
pub fn impute_string<'a>(list: &'a mut Vec<String>, impute_with: &'a str) -> Vec<&'a str> {
    /*
    Returns the list as &str slices, substituting `impute_with` for every
    empty string and logging the position of each substitution.
    */
    let mut filled: Vec<&'a str> = Vec::with_capacity(list.len());
    for (n, value) in list.iter().enumerate() {
        if value.is_empty() {
            println!("Missing value found in {}th position of the vector", n);
            filled.push(impute_with);
        } else {
            filled.push(&value[..]);
        }
    }
    filled
}
1418
1419// use std::collections::HashMap;
1420pub fn convert_string_categorical<T>(list: &Vec<T>, extra_class: bool) -> Vec<f64>
1421where
1422    T: std::cmp::PartialEq + std::cmp::Eq + std::hash::Hash + Copy,
1423{
1424    println!("========================================================================================================================================================");
1425    let values = unique_values(&list);
1426    if extra_class == true && values.len() > 10 {
1427        println!("The number of classes will be more than 10");
1428    } else {
1429        ();
1430    }
1431    let mut map: HashMap<&T, f64> = HashMap::new();
1432    for (n, i) in values.iter().enumerate() {
1433        map.insert(i, n as f64 + 1.);
1434    }
1435    list.iter().map(|a| map[a]).collect()
1436}
1437
1438pub fn min_max_scaler(list: &Vec<f64>) -> Vec<f64> {
1439    // println!("========================================================================================================================================================");
1440    let (minimum, maximum) = min_max_f(&list);
1441    let range: f64 = maximum - minimum;
1442    list.iter().map(|a| 1. - ((maximum - a) / range)).collect()
1443}
1444
1445pub fn logistic_function_f(matrix: &Vec<Vec<f64>>, beta: &Vec<Vec<f64>>) -> Vec<Vec<f64>> {
1446    println!("========================================================================================================================================================");
1447    //https://www.geeksforgeeks.org/understanding-logistic-regression/
1448    println!("logistic function");
1449    println!(
1450        "{:?}x{:?}\n{:?}x{:?}",
1451        matrix.len(),
1452        matrix[0].len(),
1453        beta.len(),
1454        beta[0].len()
1455    );
1456    matrix_multiplication(matrix, beta)
1457        .iter()
1458        .map(|a| a.iter().map(|b| 1. / (1. + ((b * -1.).exp()))).collect())
1459        .collect()
1460}
1461
pub fn log_gradient_f(
    matrix1: &Vec<Vec<f64>>,
    beta: &Vec<Vec<f64>>,
    matrix2: &Vec<f64>,
) -> Vec<Vec<f64>> {
    /*
    Logistic-regression gradient, mirroring the python reference:
    (sigmoid(X * beta) - y)^T . X, where matrix1 = X (samples x features) and
    matrix2 = y (one target per sample). The flat result is folded back into
    a matrix via shape_changer (see its definition for the argument order).
    */
    println!("========================================================================================================================================================");
    //https://www.geeksforgeeks.org/understanding-logistic-regression/
    println!("Log gradient_f");
    // PYTHON : // first_calc = logistic_func(beta, X) - y.reshape(X.shape[0], -1)
    // subtract the n-th target from every entry of the n-th prediction row
    let mut first_calc = vec![];
    for (n, i) in logistic_function_f(matrix1, beta).iter().enumerate() {
        let mut row = vec![];
        for j in i.iter() {
            row.push(j - matrix2[n]);
        }
        first_calc.push(row);
    }

    let first_calc_t = transpose(&first_calc);
    // manual transpose of matrix1 (X^T), built column by column
    let mut x = vec![];
    for j in 0..matrix1[0].len() {
        let mut row = vec![];
        for i in matrix1.iter() {
            row.push(i[j]);
        }
        x.push(row);
    }

    // PYTHON : // final_calc = np.dot(first_calc.T, x)
    // dot product of every row of first_calc_t with every row of x,
    // accumulated as a flat vector in row-major order
    let mut final_calc = vec![];
    for i in first_calc_t.iter() {
        for j in x.iter() {
            final_calc.push(dot_product(&i, &j))
        }
    }

    shape_changer(&final_calc, matrix1[0].len(), matrix1.len())
}
1506
1507pub fn logistic_predict(matrix1: &Vec<Vec<f64>>, beta: &Vec<Vec<f64>>) -> Vec<Vec<f64>> {
1508    // https://www.geeksforgeeks.org/understanding-logistic-regression/
1509    let prediction_probability = logistic_function_f(matrix1, beta);
1510    let output = prediction_probability
1511        .iter()
1512        .map(|a| a.iter().map(|b| if *b >= 0.5 { 1. } else { 0. }).collect())
1513        .collect();
1514    output
1515}
1516
1517pub fn randomize_vector<T: std::clone::Clone>(rows: &Vec<T>) -> Vec<T> {
1518    /*
1519    Shuffle values inside vector
1520    */
1521    use rand::seq::SliceRandom;
1522    // use rand::thread_rng;
1523    let mut order: Vec<usize> = (0..rows.len() as usize).collect();
1524    let slice: &mut [usize] = &mut order;
1525    let mut rng = thread_rng();
1526    slice.shuffle(&mut rng);
1527    // println!("{:?}", slice);
1528
1529    let mut output = vec![];
1530    for i in order.iter() {
1531        output.push(rows[*i].clone());
1532    }
1533    output
1534}
1535
1536pub fn randomize<T: std::clone::Clone>(rows: &Vec<Vec<T>>) -> Vec<Vec<T>> {
1537    /*
1538    Shuffle rows inside matrix
1539    */
1540    use rand::seq::SliceRandom;
1541    // use rand::thread_rng;
1542    let mut order: Vec<usize> = (0..rows.len() as usize).collect();
1543    let slice: &mut [usize] = &mut order;
1544    let mut rng = thread_rng();
1545    slice.shuffle(&mut rng);
1546    // println!("{:?}", slice);
1547
1548    let mut output = vec![];
1549    for i in order.iter() {
1550        output.push(rows[*i].clone());
1551    }
1552    output
1553}
1554
1555pub fn train_test_split_vector_f(input: &Vec<f64>, percentage: f64) -> (Vec<f64>, Vec<f64>) {
1556    /*
1557    Shuffle and split percentage of test for vector
1558    */
1559    // shuffle
1560    let data = randomize_vector(input);
1561    // println!("{:?}", data);
1562    // split
1563    let test_count = (data.len() as f64 * percentage) as usize;
1564    // println!("Test size is {:?}", test_count);
1565
1566    let test = data[0..test_count].to_vec();
1567    let train = data[test_count..].to_vec();
1568    (train, test)
1569}
1570
1571pub fn train_test_split_f(
1572    input: &Vec<Vec<f64>>,
1573    percentage: f64,
1574) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
1575    /*
1576    Shuffle and split percentage of test for matrix
1577    */
1578    // shuffle
1579    let data = randomize(input);
1580    // println!("{:?}", data);
1581    // split
1582    let test_count = (data.len() as f64 * percentage) as usize;
1583    // println!("Test size is {:?}", test_count);
1584
1585    let test = data[0..test_count].to_vec();
1586    let train = data[test_count..].to_vec();
1587    (train, test)
1588}
1589
pub fn correlation<T>(list1: &Vec<T>, list2: &Vec<T>, name: &str) -> f64
where
    T: std::iter::Sum<T>
        + std::ops::Div<Output = T>
        + std::fmt::Debug
        + std::fmt::Display
        + std::ops::Add
        + std::cmp::PartialOrd
        + std::marker::Copy
        + std::ops::Add<T, Output = T>
        + std::ops::Sub<T, Output = T>
        + std::ops::Mul<T, Output = T>
        + std::string::ToString
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Correlation
    "p" => pearson
    "s" => spearman's
    Panics on any other name. Prints a strength hint for |r| < 0.2 or > 0.6.
    Pearson note: covariance() returns an un-normalized sum here, so dividing
    by (std_dev * std_dev) and then by n recovers the usual r.
    */
    let cov = covariance(list1, list2);
    let output = match name {
        "p" => (cov / (std_dev(list1) * std_dev(list2))) / list1.len() as f64,
        "s" => {
            // https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
            //covariance(&rank(list1), &rank(list2))/(std_dev(&rank(list1))*std_dev(&rank(list2)))
            let ranked_list1 = spearman_rank(list1);
            let ranked_list2 = spearman_rank(list2);
            let len = list1.len() as f64;
            // sorting ranks back to original positions
            // NOTE(review): with duplicate values every matching (value, rank)
            // pair is pushed for each occurrence, so rl1/rl2 can grow longer
            // than the input and the zip below silently truncates — confirm
            // the intended tie handling.
            let mut rl1 = vec![];
            for k in list1.iter() {
                for (i, j) in ranked_list1.iter() {
                    if k == i {
                        rl1.push(j);
                    }
                }
            }
            let mut rl2 = vec![];
            for k in list2.iter() {
                for (i, j) in ranked_list2.iter() {
                    if k == i {
                        rl2.push(j);
                    }
                }
            }

            // Spearman's formula: 1 - 6 * sum(d^2) / (n * (n^2 - 1))
            let combined: Vec<_> = rl1.iter().zip(rl2.iter()).collect();
            let sum_of_square_of_difference = combined
                .iter()
                .map(|(a, b)| (***a - ***b) * (***a - ***b))
                .fold(0., |a, b| a + b);
            1. - ((6. * sum_of_square_of_difference) / (len * ((len * len) - 1.)))
            // 0.
        }
        _ => panic!("Either `p`: Pearson or `s`:Spearman has to be the name. Please retry!"),
    };
    match output {
        x if x < 0.2 && x > -0.2 => println!("There is a weak correlation between the two :"),
        x if x > 0.6 => println!("There is a strong positive correlation between the two :"),
        x if x < -0.6 => println!("There is a strong negative correlation between the two :"),
        _ => (),
    }
    output
}
1656
1657pub fn std_dev<T>(list1: &Vec<T>) -> f64
1658where
1659    T: std::iter::Sum<T>
1660        + std::ops::Div<Output = T>
1661        + std::fmt::Debug
1662        + std::fmt::Display
1663        + std::ops::Add
1664        + std::marker::Copy
1665        + std::ops::Add<T, Output = T>
1666        + std::ops::Sub<T, Output = T>
1667        + std::ops::Mul<T, Output = T>
1668        + std::string::ToString
1669        + std::str::FromStr,
1670    <T as std::str::FromStr>::Err: std::fmt::Debug,
1671{
1672    let mu: T = mean(list1).to_string().parse().unwrap();
1673    let square_of_difference = list1.iter().map(|a| (*a - mu) * (*a - mu)).collect();
1674    let var = mean(&square_of_difference);
1675    var.sqrt()
1676}
1677
1678pub fn spearman_rank<T>(list1: &Vec<T>) -> Vec<(T, f64)>
1679where
1680    T: std::iter::Sum<T>
1681        + std::ops::Div<Output = T>
1682        + std::fmt::Debug
1683        + std::fmt::Display
1684        + std::ops::Add
1685        + std::marker::Copy
1686        + std::cmp::PartialOrd
1687        + std::ops::Add<T, Output = T>
1688        + std::ops::Sub<T, Output = T>
1689        + std::ops::Mul<T, Output = T>
1690        + std::string::ToString
1691        + std::str::FromStr,
1692    <T as std::str::FromStr>::Err: std::fmt::Debug,
1693{
1694    /*
1695    Returns ranking of each value in ascending order with thier spearman rank in a vector of tuple
1696    */
1697    // https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
1698    let mut sorted = list1.clone();
1699    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
1700    let mut map: Vec<(_, _)> = vec![];
1701    for (n, i) in sorted.iter().enumerate() {
1702        map.push(((n + 1), *i));
1703    }
1704    // repeating values
1705    let mut repeats: Vec<_> = vec![];
1706    for (n, i) in sorted.iter().enumerate() {
1707        if how_many_and_where_vector(&sorted, *i).len() > 1 {
1708            repeats.push((*i, how_many_and_where_vector(&sorted, *i)));
1709        } else {
1710            repeats.push((*i, vec![n]));
1711        }
1712    }
1713    // calculating the rank
1714    let rank: Vec<_> = repeats
1715        .iter()
1716        .map(|(a, b)| {
1717            (a, b.iter().fold(0., |a, b| a + *b as f64) / b.len() as f64) // mean of each position vector
1718        })
1719        .collect();
1720    let output: Vec<_> = rank.iter().map(|(a, b)| (**a, b + 1.)).collect(); // 1. is fro index offset
1721    output
1722}
1723
pub fn how_many_and_where_vector<T>(list: &Vec<T>, number: T) -> Vec<usize>
where
    T: std::cmp::PartialEq + std::fmt::Debug + Copy,
{
    /*
    Returns every index at which `number` occurs in `list`
    (an empty vector when it never occurs).
    */
    let mut positions = Vec::new();
    for (index, value) in list.iter().enumerate() {
        if *value == number {
            positions.push(index);
        }
    }
    positions
}
1739
1740pub fn how_many_and_where<T>(matrix: &Vec<Vec<T>>, number: T) -> Vec<(usize, usize)>
1741where
1742    T: std::cmp::PartialEq + std::fmt::Debug + Copy,
1743{
1744    /*
1745    Returns the positions of the number to be found in a matrix
1746    */
1747    let mut output = vec![];
1748    for (n, i) in matrix.iter().enumerate() {
1749        for j in how_many_and_where_vector(&i, number) {
1750            output.push((n, j));
1751        }
1752    }
1753    output
1754}
1755
1756pub fn z_score<T>(list: &Vec<T>, number: T) -> f64
1757where
1758    T: std::iter::Sum<T>
1759        + std::ops::Div<Output = T>
1760        + Copy
1761        + std::str::FromStr
1762        + std::string::ToString
1763        + std::ops::Add<T, Output = T>
1764        + std::ops::Sub<T, Output = T>
1765        + std::ops::Mul<T, Output = T>
1766        + std::fmt::Debug
1767        + std::cmp::PartialEq
1768        + std::fmt::Display
1769        + std::str::FromStr,
1770    <T as std::str::FromStr>::Err: std::fmt::Debug,
1771{
1772    /*
1773    Returns z_score
1774    */
1775    let n: f64 = number.to_string().parse().unwrap();
1776    if list.contains(&number) {
1777        (n - mean(list)) / std_dev(list)
1778    } else {
1779        panic!("The number not found in vector passed, please check");
1780    }
1781}
1782
1783pub fn one_hot_encoding(column: &Vec<&str>) -> Vec<Vec<u8>> {
1784    /*
1785    Counts unique values
1786    creates those many new columns
1787    each column will have 1 for every occurance of a particular unique value
1788    Ex: ["A", "B", "C"] => [[1,0,0],[0,1,0],[0,0,1]]
1789    */
1790    let values = unique_values(&column.clone());
1791    // println!("{:?}", values);
1792    let mut output = vec![];
1793    for i in values.iter() {
1794        output.push(column.iter().map(|a| if a == i { 1 } else { 0 }).collect());
1795    }
1796    output
1797}
1798
pub fn shape(words: &str, m: &Vec<Vec<f64>>) {
    /*
    Prints the number of rows and columns of a matrix, prefixed by `words`.

    Robustness fix: an empty matrix is reported as 0 columns instead of
    panicking on `m[0]`.
    */
    let columns = m.first().map_or(0, |row| row.len());
    println!("{:?} : Rows: {:?}, Columns: {:?}", words, m.len(), columns);
}
1808
pub fn rmse(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    /*
    Root mean squared error: the square root of `mse` over the same inputs
    (the actual target is the last element of each row of `test_data`).
    */
    (mse(test_data, predicted)).sqrt()
}
1815
pub fn mse(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    /*
    Mean squared error: the actual target is the last element of each row
    of `test_data`; the mean is taken over the number of predictions.

    Panics if any row of `test_data` is empty.
    */
    let total_squared_error = test_data
        .iter()
        .enumerate()
        .map(|(n, row)| match row.last() {
            Some(actual) => (predicted[n] - actual) * (predicted[n] - actual),
            _ => panic!("Something wrong in passed test data"),
        })
        .fold(0., |acc, err| acc + err);
    total_squared_error / (predicted.len() as f64)
}
1832
pub fn mae(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    /*
    Mean absolute error: the actual target is the last element of each row
    of `test_data`; the mean is taken over the number of predictions.

    Panics if any row of `test_data` is empty.
    */
    let mut total = 0.;
    for (n, row) in test_data.iter().enumerate() {
        let actual = match row.last() {
            Some(x) => *x,
            _ => panic!("Something wrong in passed test data"),
        };
        total += (predicted[n] - actual).abs();
    }
    total / (predicted.len() as f64)
}
1849
pub fn r_square(predicted: &Vec<f64>, actual: &Vec<f64>, features: usize) -> (f64, f64) {
    // https://github.com/radialHuman/rust/blob/master/util/util_ml/src/lib_ml.rs
    /*
    Returns (R², adjusted R²).

    R² = 1 - SS_res / SS_tot, where
        SS_tot = Σ (y - ȳ)²   (total sum of squares)
        SS_res = Σ (y - ŷ)²   (residual sum of squares)
    Adjusted R² penalises R² for the number of `features` in the model.

    Bug fixes vs. the earlier version: SS_tot was computed as
    y - ȳ·(y - ȳ) due to misplaced parentheses, and SS_res was the
    unsquared sum of residuals.
    */
    // Mean of the actuals, computed once instead of once per element.
    let mean_actual = actual.iter().fold(0., |a, b| a + b) / (actual.len() as f64);
    let ss_tot: f64 = actual
        .iter()
        .map(|y| (y - mean_actual) * (y - mean_actual))
        .sum();
    let ss_res: f64 = predicted
        .iter()
        .zip(actual.iter())
        .map(|(y_hat, y)| (y_hat - y) * (y_hat - y))
        .sum();
    let r2 = 1. - (ss_res / ss_tot);
    let degrees_of_freedom = predicted.len() as f64 - 1. - features as f64;
    let adjusted_r2 = 1. - ((1. - r2) * ((predicted.len() as f64 - 1.) / degrees_of_freedom));
    (r2, adjusted_r2)
}
1872
pub fn mape(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    /*
    Mean absolute percentage error, in percent, with the difference taken
    relative to the *predicted* value. The actual target is the last
    element of each row of `test_data`.

    Panics if any row of `test_data` is empty.
    */
    let mut percentage_errors: Vec<f64> = Vec::with_capacity(test_data.len());
    for (n, row) in test_data.iter().enumerate() {
        match row.last() {
            Some(actual) => {
                percentage_errors.push((((predicted[n] - actual) / predicted[n]).abs()) * 100.)
            }
            _ => panic!("Something wrong in passed test data"),
        }
    }
    let total: f64 = percentage_errors.iter().fold(0., |a, b| a + b);
    total / (predicted.len() as f64)
}
1889
pub fn drop_column(matrix: &Vec<Vec<f64>>, column_number: usize) -> Vec<Vec<f64>> {
    /*
    Returns a copy of the column-wise `matrix` with the `column_number`-th
    column removed. `column_number` is 1-based.

    Panics with a clear message on an out-of-range column (the earlier
    version underflowed on 0).
    */
    assert!(
        column_number >= 1 && column_number <= matrix.len(),
        "column_number is 1-based and must be within the matrix"
    );
    // Concatenate the two slices around the dropped column directly,
    // without the intermediate `.to_vec()` copies of the old version.
    [&matrix[..column_number - 1], &matrix[column_number..]].concat()
}
1900
1901pub fn float_randomize(matrix: &Vec<Vec<String>>) -> Vec<Vec<f64>> {
1902    randomize(
1903        &matrix
1904            .iter()
1905            .map(|a| {
1906                a.iter()
1907                    .map(|b| (*b).replace("\r", "").parse::<f64>().unwrap())
1908                    .collect::<Vec<f64>>()
1909            })
1910            .collect::<Vec<Vec<f64>>>(),
1911    )
1912}
1913
pub fn preprocess_train_test_split(
    matrix: &Vec<Vec<f64>>,
    test_percentage: f64,
    target_column: usize,
    preprocess: &str,
) -> (Vec<Vec<f64>>, Vec<f64>, Vec<Vec<f64>>, Vec<f64>) {
    /*
    Splits a row-wise matrix into (train_x, train_y, test_x, test_y).

    preprocess : "s" : standardize, "m" : minmaxscaler, "_" : no change
    `target_column` is 1-based; it is removed from the x matrices and
    returned separately as y.

    NOTE(review): scaling is applied column-by-column AFTER the split,
    independently on train and test (so the two sets use different
    statistics), and it also rescales the target column itself — confirm
    this is intended. Also note `min_max_scaler` is commented out further
    down this file; presumably a definition exists elsewhere — verify.
    */

    let (train_data, test_data) = train_test_split_f(matrix, test_percentage);
    // println!("Training size: {:?}", train_data.len());
    // println!("Test size: {:?}", test_data.len());

    // converting rows to vector of columns of f64s
    let mut actual_train = row_to_columns_conversion(&train_data);
    let mut actual_test = row_to_columns_conversion(&test_data);

    match preprocess {
        "s" => {
            // standardize each column to zero mean / unit variance
            actual_train = actual_train
                .iter()
                .map(|a| standardize_vector_f(a))
                .collect::<Vec<Vec<f64>>>();
            actual_test = actual_test
                .iter()
                .map(|a| standardize_vector_f(a))
                .collect::<Vec<Vec<f64>>>();
        }
        "m" => {
            // scale each column into [0, 1]
            actual_train = actual_train
                .iter()
                .map(|a| min_max_scaler(a))
                .collect::<Vec<Vec<f64>>>();
            actual_test = actual_test
                .iter()
                .map(|a| min_max_scaler(a))
                .collect::<Vec<Vec<f64>>>();
        }

        _ => println!("Using the actual values without preprocessing unless 's' or 'm' is passed"),
    };

    // x matrices without the target column; y vectors are the (possibly
    // scaled) target column itself.
    (
        drop_column(&actual_train, target_column),
        actual_train[target_column - 1].clone(),
        drop_column(&actual_test, target_column),
        actual_test[target_column - 1].clone(),
    )
}
1964
1965pub fn standardize_vector_f(list: &Vec<f64>) -> Vec<f64> {
1966    /*
1967    Preserves the shape of the original distribution. Doesn't
1968    reduce the importance of outliers. Least disruptive to the
1969    information in the original data. Default range for
1970    MinMaxScaler is O to 1.
1971        */
1972    list.iter()
1973        .map(|a| (*a - mean(list)) / std_dev(list))
1974        .collect()
1975}
1976
1977// pub fn min_max_scaler(list: &Vec<f64>) -> Vec<f64> {
1978//     let (minimum, maximum) = min_max_f(&list);
1979//     let range: f64 = maximum - minimum;
1980//     list.iter().map(|a| 1. - ((maximum - a) / range)).collect()
1981// }
1982
pub fn confuse_me(predicted: &Vec<f64>, actual: &Vec<f64>, class0: f64, class1: f64) {
    /*
    Prints a confusion matrix and classification metrics (accuracy,
    precision, recall, specificity, F1) for a binary problem, treating
    `class0` as the positive class.

    Bug fix: F1 is the harmonic mean 2PR/(P+R); the earlier version
    divided by P*R, which always printed 2.
    */
    // https://medium.com/@MohammedS/performance-metrics-for-classification-problems-in-machine-learning-part-i-b085d432082b
    let mut tp = 0.; // class_one_is_class_one
    let mut fp = 0.; // class_one_is_class_two (Type 1)
    let mut fng = 0.; // class_two_is_class_one (Type 2)
    let mut tng = 0.; // class_two_is_class_two

    for (i, j) in actual
        .iter()
        .zip(predicted.iter())
        .collect::<Vec<(&f64, &f64)>>()
        .iter()
    {
        if **i == class0 && **j == class0 {
            tp += 1.;
        }
        if **i == class1 && **j == class1 {
            tng += 1.;
        }
        if **i == class0 && **j == class1 {
            fp += 1.;
        }
        if **i == class1 && **j == class0 {
            fng += 1.;
        }
    }
    println!("\n|------------------------|");
    println!("|  {:?}    |   {:?}", tp, fp);
    println!("|------------------------|");
    println!("|  {:?}    |   {:?}", fng, tng);
    println!("|------------------------|");
    println!("Accuracy : {:.3}", (tp + tng) / (tp + fp + fng + tng));
    println!("Precision : {:.3}", (tp) / (tp + fp));
    let precision: f64 = (tp) / (tp + fp);
    println!("Recall (sensitivity) : {:.3}", (tp) / (tp + fng));
    let recall: f64 = (tp) / (tp + fng);
    println!("Specificity: {:.3}", (tng) / (fp + tng));
    println!(
        "F1 : {:.3}\n\n",
        (2. * precision * recall) / (precision + recall)
    );
}
2025
2026pub fn cv<T: Copy>(data: &Vec<Vec<T>>, k: usize) -> (Vec<Vec<T>>, Vec<Vec<T>>) {
2027    /*
2028    K-fold Cross validation
2029    */
2030
2031    (
2032        randomize(&data.clone())[k..].to_vec(),
2033        randomize(&data.clone())[..k].to_vec(),
2034    )
2035}
2036
2037pub fn z_outlier_f(list: &Vec<f64>) -> Vec<f64> {
2038    /*
2039    Anything below -3 or beyond 3 std deviations is considered as an outlier
2040    */
2041
2042    let mut v_clone = list.clone();
2043    v_clone.sort_by(|a, b| a.partial_cmp(b).unwrap());
2044    let z_v: Vec<_> = v_clone
2045        .iter()
2046        .map(|a| (z_score(&v_clone, *a), *a))
2047        .collect();
2048    z_v.iter()
2049        .filter(|(a, _)| (*a > 3.) || (*a < -3.))
2050        .map(|a| a.1)
2051        .collect::<Vec<f64>>()
2052}
2053
pub fn percentile_f(list: &Vec<f64>, percentile: u32) -> f64 {
    /*
    Returns the value at the given percentile (nearest-rank method).
    */
    // https://en.wikipedia.org/wiki/Percentile
    // Bug fix: the earlier version sorted a temporary clone
    // (`list.clone().sort_by(..)`) that was immediately discarded, then
    // indexed the *unsorted* input.
    let mut sorted = list.clone();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
    // Ordinal rank via f64::round (equivalent to round_off_f(x, 0)),
    // clamped to at least 1 so small percentiles on short lists do not
    // underflow the index.
    let ordinal_rank = ((percentile as f64 / 100.) * (sorted.len() as f64)).round();
    let index = if ordinal_rank < 1. {
        0
    } else {
        ordinal_rank as usize - 1
    };
    sorted[index]
}
2063
2064pub fn quartile_f(list: &Vec<f64>) {
2065    /*
2066    Returns quartiles like in a boxplot
2067    */
2068    println!(
2069        "\tPercentile:\t10th :{:?}\t25th :{:?}\t50th :{:?}\t75th :{:?}\t90th :{:?}",
2070        percentile_f(list, 10),
2071        percentile_f(list, 25),
2072        percentile_f(list, 50),
2073        percentile_f(list, 75),
2074        percentile_f(list, 90)
2075    );
2076}
2077
2078/*
2079DESCRIPTION
2080-----------------------------------------
2081STRUCTS
2082-------
20831. MatrixF : matrix: Vec<Vec<f64>> // upto 100x100
2084    > determinant_f
2085    > inverse_f
2086    > is_square_matrix
2087    x round_off_f
2088
20892. DataFrame : string: Vec<Vec<&'a str>>, numbers: Vec<Vec<f64>>, boolean: Vec<Vec<bool>>,
2090    > groupby : string_column_number , operation: &str // "sum" or "mean"
2091    > describe
2092    > sort : col_type : "s" or "n" , col_number, ascending : true or false
2093
3. DataMap : string: HashMap<&'a str,Vec<&'a str>>, numbers:HashMap<&'a str,Vec<f64>>, boolean: HashMap<&'a str,Vec<bool>>,
2095    > groupby : string_column_number , operation: &str // "sum" or "mean"
2096    > describe
2097    > sort : col_type : "s" or "n" , col_name, ascending : true or false
2098
2099FUNCTIONS
2100---------
21011. dot_product :
2102    > 1. A &Vec<T>
2103    > 2. A &Vec<T>
2104    = 1. T
2105
21062. element_wise_operation : for vector
2107    > 1. A &mut Vec<T>
2108    > 2. A &mut Vec<T>
2109    > 3. operation &str ("add","sub","mul","div")
2110    = 1. Vec<T>
2111
21123. matrix_multiplication :
2113    > 1. A &Vec<Vec<T>>
2114    > 2. A &Vec<Vec<T>>
2115    = 1. Vec<Vec<T>>
2116
21174. pad_with_zero :
2118    > 1. A &mut Vec<T> to be modified
2119    > 2. usize of number of 0s to be added
2120    = 1. Vec<T>
2121
21225. print_a_matrix :
2123    > 1. A &str as parameter to describe the matrix
2124    > 2. To print &Vec<Vec<T>> line by line for better visual
2125    = 1. ()
2126
21276. shape_changer :
2128    > 1. A &Vec<T> to be converter into Vec<Vec<T>>
2129    > 2. number of columns to be converted to
2130    > 3. number of rows to be converted to
2131    = 1. Vec<Vec<T>>
2132
21337. transpose :
2134    > 1. A &Vec<Vec<T>> to be transposed
2135    = 1. Vec<Vec<T>>
2136
21378. vector_addition :
2138    > 1. A &Vec<T>
2139    > 2. A &Vec<T>
2140    = 1. Vec<T>
2141
21429. make_matrix_float :
2143    > 1. input: A &Vec<Vec<T>>
2144    = Vec<Vec<f64>>
2145
214610. make_vector_float :
2147    > 1. input: &Vec<T>
2148    = Vec<f64>
2149
215011. round_off_f :
2151    > 1. value: f64
2152    > 2. decimals: i32
2153    = f64
2154
215512. unique_values : of a Vector
2156    > 1. list : A &Vec<T>
2157    = 1. Vec<T>
2158
215913. value_counts :
2160    > 1. list : A &Vec<T>
2161    = HashMap<T, u32>
2162
216314. is_numerical :
2164    > 1. value: T
2165    = bool
2166
216715. min_max_f :
2168    > 1. list: A &Vec<f64>
2169    = (f64, f64)
2170
217116. type_of : To know the type of a variable
2172    > 1. _
2173    = &str
2174
217517. element_wise_matrix_operation : for matrices
2176    > 1. matrix1 : A &Vec<Vec<T>>
2177    > 2. matrix2 : A &Vec<Vec<T>>
    > 3. function : &str ("add","sub","mul","div")
2179    = A Vec<Vec<T>>
2180
218118. matrix_vector_product_f
2182    > 1. matrix: &Vec<Vec<f64>>
2183    > 2. vector: &Vec<f64>
2184    = Vec<f64>
2185
218619. split_vector
2187    > 1. vector: &Vec<T>
2188    > 2. parts: i32
2189     = Vec<Vec<T>>
2190
219120. split_vector_at
2192    > 1. vector: &Vec<T>
2193    > 2. at: T
2194     = Vec<Vec<T>>
2195
219621. join_matrix
2197    > 1. matrix1: &Vec<Vec<T>>
2198    > 2. matrix2: &Vec<Vec<T>>
2199    > 3. how: &str : "long" or "wide"
2200    = Vec<Vec<T>>
2201
220222. make_matrix_string_literal
2203    > 1. data: &'a Vec<Vec<String>>
2204    = Vec<Vec<&'a str>>
2205
220623. head
2207    > 1. data: &Vec<Vec<T>>
2208    > 2. rows: usize
2209    = Vec<Vec<T>>
2210
221124. tail
2212    > 1. data: &Vec<Vec<T>>
2213    > 2. rows: usize
2214    = Vec<Vec<T>>
2215
221625. row_to_columns_conversion
2217    > 1. data: &Vec<Vec<T>>
2218    = Vec<Vec<T>>
2219
222026. columns_to_rows_conversion
2221    > 1. data: &Vec<Vec<T>>
2222    = Vec<Vec<T>>
2223
222427. datamap_comparision
2225    > table1: &DataMap
2226    > table2: &DataMap
2227
222828. dataframe_comparision
2229    > table1: &DataFrame
2230    > table2: &DataFrame
2231
223229. compare_vectors
2233    > v1: &Vec<T>,
2234    > v2: &Vec<T>,
2235    = (usize, Vec<(usize, usize)>)
2236
2237
2238*/
2239
#[derive(Debug)] // to make it usable by print!
pub struct MatrixF {
    // Square f64 matrix; determinants supported up to 100x100.
    pub matrix: Vec<Vec<f64>>,
}

impl MatrixF {
    /// Determinant of the (square) matrix.
    ///
    /// Panics if the matrix is not square. Sizes above 100 are not
    /// supported and return 100 (kept from the earlier version, minus the
    /// odd `"100".parse()` detour).
    pub fn determinant_f(&self) -> f64 {
        // https://integratedmlai.com/find-the-determinant-of-a-matrix-with-pure-python-without-numpy-or-scipy/
        if MatrixF::is_square_matrix(&self.matrix) {
            println!("Calculating Determinant...");

            match self.matrix.len() {
                1 => self.matrix[0][0],
                2 => self.determinant_2(),
                3..=100 => self.determinant_3plus(),
                _ => {
                    println!("Cant find determinant for size more than {}", 100);
                    100.
                }
            }
        } else {
            panic!("The input should be a square matrix");
        }
    }

    /// 2x2 determinant: ad - bc.
    /// Bug fix: the earlier version computed a*d - c*c (it multiplied
    /// matrix[1][0] by itself instead of by matrix[0][1]).
    fn determinant_2(&self) -> f64 {
        (self.matrix[0][0] * self.matrix[1][1]) - (self.matrix[0][1] * self.matrix[1][0])
    }

    /// Determinant for 3x3..=100x100 via Gaussian elimination to an upper
    /// triangular matrix; the determinant is the product of the diagonal.
    fn determinant_3plus(&self) -> f64 {
        let length = self.matrix.len() - 1;
        let mut new_matrix = self.matrix.clone();

        // round every entry to 3 decimals before eliminating
        new_matrix = new_matrix
            .iter()
            .map(|a| a.iter().map(|a| MatrixF::round_off_f(*a, 3)).collect())
            .collect();

        for diagonal in 0..=length {
            for i in diagonal + 1..=length {
                // avoid division by an exactly-zero pivot
                // NOTE(review): substituting 0.001 perturbs the result;
                // row pivoting would be exact — kept for compatibility.
                if new_matrix[diagonal][diagonal] == 0.0 {
                    new_matrix[diagonal][diagonal] = 0.001;
                }
                let scalar = new_matrix[i][diagonal] / new_matrix[diagonal][diagonal];
                for j in 0..=length {
                    new_matrix[i][j] = new_matrix[i][j] - (scalar * new_matrix[diagonal][j]);
                }
            }
        }
        // product of the diagonal of the upper-triangular form
        let mut product = 1.;
        for i in 0..=length {
            product *= new_matrix[i][i]
        }
        product
    }

    /// True when the matrix has as many rows as its first row has columns.
    /// Panics on an empty matrix (as before).
    pub fn is_square_matrix<T>(matrix: &Vec<Vec<T>>) -> bool {
        matrix.len() == matrix[0].len()
    }

    /// Rounds `value` to `decimals` decimal places.
    fn round_off_f(value: f64, decimals: i32) -> f64 {
        ((value * 10.0f64.powi(decimals)).round()) / 10.0f64.powi(decimals)
    }

    /// Matrix inverse via Gauss-Jordan elimination against an identity
    /// matrix. No pivoting: a zero on the diagonal divides by zero.
    pub fn inverse_f(&self) -> Vec<Vec<f64>> {
        // https://integratedmlai.com/matrixinverse/
        let mut input = self.matrix.clone();
        let length = self.matrix.len();
        let mut identity = MatrixF::identity_matrix(length);

        let index: Vec<usize> = (0..length).collect();

        for diagonal in 0..length {
            // scale the pivot row so the pivot becomes 1
            let diagonal_scalar = 1. / (input[diagonal][diagonal]);
            for column_loop in 0..length {
                input[diagonal][column_loop] *= diagonal_scalar;
                identity[diagonal][column_loop] *= diagonal_scalar;
            }

            // eliminate the pivot column from every other row
            let except_diagonal: Vec<usize> = index[0..diagonal]
                .iter()
                .copied()
                .chain(index[diagonal + 1..].iter().copied())
                .collect();

            for i in except_diagonal {
                let row_scalar = input[i][diagonal];
                for j in 0..length {
                    input[i][j] = input[i][j] - (row_scalar * input[diagonal][j]);
                    identity[i][j] = identity[i][j] - (row_scalar * identity[diagonal][j])
                }
            }
        }

        identity
    }

    /// size x size identity matrix.
    fn identity_matrix(size: usize) -> Vec<Vec<f64>> {
        let mut output = MatrixF::zero_matrix(size);
        for i in 0..size {
            output[i][i] = 1.;
        }
        output
    }

    /// size x size matrix of zeros.
    fn zero_matrix(size: usize) -> Vec<Vec<f64>> {
        vec![vec![0.; size]; size]
    }
}
2370
pub struct DataFrame<'a> {
    // Stored column-wise: each inner Vec is one column of the table.
    pub string: Vec<Vec<&'a str>>,
    pub numerical: Vec<Vec<f64>>,
    pub boolean: Vec<Vec<bool>>,
}
impl<'a> DataFrame<'a> {
    // Prints a summary of every column: value counts for string and
    // boolean columns; count, min/max, mean, std-dev, quartiles and
    // z-score outliers for numerical columns.
    pub fn describe(&self) {
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        println!("      Details of the DataFrame",);
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (n, i) in self.string.iter().enumerate() {
            println!(
                "String column #{:?}  Values count : {:?}",
                n,
                value_counts(i)
            )
        }
        // NOTE(review): these are the boolean columns, but the printed
        // label below says "String column" — looks like a copy-paste slip.
        for (n, i) in self.boolean.iter().enumerate() {
            println!(
                "String column #{:?}  Values count : {:?}",
                n,
                value_counts(i)
            )
        }
        for (n, i) in self.numerical.iter().enumerate() {
            println!("Numerical column #{:?}\n\tCount :{:?}", n, i.len());
            println!(
                "\tMinimum :{:?}  Maximum : {:?}",
                min_max_f(i).0,
                min_max_f(i).1
            );
            println!("\tMean :{:?}  Std Deviation : {:?}", mean(i), std_dev(i));
            quartile_f(i);
            println!("\tOutliers :{:?}", z_outlier_f(i));
        }
    }
    // Groups rows by the values of string column `string_column_number`
    // and aggregates numerical data with `operation` ("sum" or "mean"),
    // then prints the result. Panics on any other operation string.
    pub fn groupby(&self, string_column_number: usize, operation: &str) {
        // removing other string columns and boolean columns as they dont play any role

        let reduced_dataframe_string = self.string[string_column_number].clone();
        let reduced_dataframe_float = self.numerical.clone();

        // finding unique string values and the index they occur in
        let unique_string = unique_values(&reduced_dataframe_string);
        let mut unique_string_index = vec![];
        for i in unique_string.iter() {
            let mut single_string = vec![];
            for (n, j) in reduced_dataframe_string.iter().enumerate() {
                if i == j {
                    single_string.push(n);
                }
            }
            unique_string_index.push(single_string);
        }

        // operating on numerical columns
        let mut output = vec![];
        for i in unique_string_index.iter() {
            let mut result = vec![];
            for j in reduced_dataframe_float.iter() {
                // rows of this numerical column that belong to the group
                let seperated = j
                    .iter()
                    .enumerate()
                    .filter(|(n, _)| i.contains(n))
                    .collect::<Vec<(usize, &f64)>>();
                match operation {
                    "sum" => {
                        result.push(seperated.iter().map(|a| a.1).fold(0., |a, b| a + b));
                    }
                    "mean" => {
                        result.push(
                            seperated.iter().map(|a| a.1).fold(0., |a, b| a + b)
                                / (seperated.len() as f64),
                        );
                    }
                    _ => panic!("Enter either 'sum' or 'mean'"),
                };
            }
            // NOTE(review): only result[0] — the first numerical column's
            // aggregate — is kept; the other columns' aggregates are
            // computed and then discarded. Confirm this is intended.
            output.push(result[0]);
        }
        println!(
            "Grouped on {:?} => {:?}",
            string_column_number,
            unique_string
                .iter()
                .zip(output.iter())
                .collect::<Vec<(&&str, &f64)>>()
        );
    }
    pub fn sort(&self, col_type: &str, col_number: usize, ascending: bool) -> DataFrame {
        /* returns a different DataFrame with rows sorted as per order passed
        col_type : "s": string,"n":numerical
        (all three column groups are reordered by the same row permutation)
        */
        let mut output = DataFrame {
            string: vec![],
            numerical: vec![],
            boolean: vec![],
        };
        let mut to_sort_by_string;
        let mut to_sort_by_numerical;
        let order: Vec<usize>;
        match col_type {
            "s" => {
                to_sort_by_string = self.string[col_number].clone();
                // finding the order of sorting
                order = DataFrame::find_order_of_sorting_string(&mut to_sort_by_string, ascending);
            }
            "n" => {
                to_sort_by_numerical = self.numerical[col_number].clone();
                // finding the order of sorting
                order = DataFrame::find_order_of_sorting_numerical(
                    &mut to_sort_by_numerical,
                    ascending,
                );
            }
            _ => panic!("Pass either `s` or `n`"),
        }

        println!("New order is : {:?}", order);
        // reordering the original DataFrame (String)
        for each_vector in self.string.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }
            output.string.push(new_vector);
        }

        // reordering the original DataFrame (Numerical)
        for each_vector in self.numerical.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }

            output.numerical.push(new_vector);
        }

        // reordering the original DataFrame (Boolean)
        for each_vector in self.boolean.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }
            output.boolean.push(new_vector);
        }

        output
    }
    // Returns the permutation of original row indices that puts the column
    // into the requested order; duplicate values are matched left to right
    // via the `contains` check (O(n^2) overall).
    fn find_order_of_sorting_string(data: &mut Vec<&str>, ascending: bool) -> Vec<usize> {
        use std::collections::BTreeMap;
        let mut input = data.clone();
        let mut order: BTreeMap<usize, &str> = BTreeMap::new();
        let mut output = vec![];

        // original order
        for (n, i) in data.iter().enumerate() {
            order.insert(n, i);
        }
        // println!("{:?}", order);
        match ascending {
            true => input.sort_unstable(),
            false => {
                input.sort_unstable();
                input.reverse();
            }
        };

        // new order
        for i in input.iter() {
            for (k, v) in order.iter() {
                if (*i == *v) & (output.contains(k) == false) {
                    output.push(*k);
                    break;
                }
            }
        }
        output
    }

    // Same as find_order_of_sorting_string but for f64 columns; uses
    // partial_cmp, so a NaN in the column will panic.
    fn find_order_of_sorting_numerical(data: &mut Vec<f64>, ascending: bool) -> Vec<usize> {
        use std::collections::BTreeMap;
        let mut input = data.clone();
        let mut order: BTreeMap<usize, &f64> = BTreeMap::new();
        let mut output = vec![];

        // original order
        for (n, i) in data.iter().enumerate() {
            order.insert(n, i);
        }
        // println!("{:?}", order);
        match ascending {
            true => input.sort_by(|a, b| a.partial_cmp(b).unwrap()),
            false => input.sort_by(|a, b| b.partial_cmp(a).unwrap()),
        };

        // new order
        for i in input.iter() {
            for (k, v) in order.iter() {
                if (i == *v) & (output.contains(k) == false) {
                    output.push(*k);
                    break;
                }
            }
        }
        output
    }
}
2580
pub struct DataMap<'a> {
    // Needs `use std::collections::HashMap;` in scope at module level.
    // Stored column-wise: each map entry is column-name -> column values.
    pub string: HashMap<&'a str, Vec<&'a str>>,
    pub numerical: HashMap<&'a str, Vec<f64>>,
    pub boolean: HashMap<&'a str, Vec<bool>>,
}
2588impl<'a> DataMap<'a> {
    pub fn describe(&self) {
        /*
        Prints a summary of every column: value counts for string and
        boolean columns; count, min/max, mean, std-dev, quartiles and
        z-score outliers for numerical columns. HashMap iteration order is
        unspecified, so column order varies between runs.
        */
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        println!("      Details of the DataMap",);
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (k, v) in self.string.iter() {
            println!(
                "String column :{:?}  Values count : {:?}",
                k,
                value_counts(v)
            )
        }
        for (k, v) in self.boolean.iter() {
            println!(
                "Boolean column :{:?}  Values count : {:?}",
                k,
                value_counts(v)
            )
        }
        for (k, v) in self.numerical.iter() {
            println!("Numerical column :{:?}\n\tCount :{:?}", k, v.len());
            println!(
                "\tMinimum :{:?}  Maximum : {:?}",
                min_max_f(v).0,
                min_max_f(v).1
            );
            println!("\tMean :{:?}  Std Deviation : {:?}", mean(v), std_dev(v));
            quartile_f(v);
            println!("\tOutliers :{:?}", z_outlier_f(v));
        }
    }
2619
2620    pub fn groupby(&self, string_column: &str, operation: &str) {
2621        // removing other string columns and boolean columns as they dont play any role
2622
2623        let reduced_dataframe_string = self.string[string_column].clone();
2624        let reduced_dataframe_float: Vec<&Vec<f64>> = self.numerical.values().clone().collect();
2625
2626        // finding unique string values and the index they occur in
2627        let unique_string = unique_values(&reduced_dataframe_string);
2628        let mut unique_string_index = vec![];
2629        for i in unique_string.iter() {
2630            let mut single_string = vec![];
2631            for (n, j) in reduced_dataframe_string.iter().enumerate() {
2632                if i == j {
2633                    single_string.push(n);
2634                }
2635            }
2636            unique_string_index.push(single_string);
2637        }
2638
2639        // operating on numerical columns
2640        let mut output = vec![];
2641        for i in unique_string_index.iter() {
2642            let mut result = vec![];
2643            for j in reduced_dataframe_float.iter() {
2644                let seperated = j
2645                    .iter()
2646                    .enumerate()
2647                    .filter(|(n, _)| i.contains(n))
2648                    .collect::<Vec<(usize, &f64)>>();
2649                match operation {
2650                    "sum" => {
2651                        result.push(seperated.iter().map(|a| a.1).fold(0., |a, b| a + b));
2652                    }
2653                    "mean" => {
2654                        result.push(
2655                            seperated.iter().map(|a| a.1).fold(0., |a, b| a + b)
2656                                / (seperated.len() as f64),
2657                        );
2658                    }
2659                    _ => panic!("Enter either 'sum' or 'mean'"),
2660                };
2661            }
2662            output.push(result[0]);
2663        }
2664        println!(
2665            "Grouped by {:?} => {:?}",
2666            string_column,
2667            unique_string
2668                .iter()
2669                .zip(output.iter())
2670                .collect::<Vec<(&&str, &f64)>>()
2671        );
2672    }
2673    pub fn sort(&self, col_type: &str, col_name: &str, ascending: bool) -> DataMap {
2674        /* returns a different DataFrame with rows sorted as per order passed
2675        col_type : "s": string,"n":numerical
2676        */
2677        let mut output = DataMap {
2678            string: HashMap::new(),
2679            numerical: HashMap::new(),
2680            boolean: HashMap::new(),
2681        };
2682        let mut to_sort_by_string;
2683        let mut to_sort_by_numerical;
2684        let order: Vec<usize>;
2685        match col_type {
2686            "s" => {
2687                to_sort_by_string = self.string[col_name].clone();
2688                // finding the order of sorting
2689                order = DataFrame::find_order_of_sorting_string(&mut to_sort_by_string, ascending);
2690            }
2691            "n" => {
2692                to_sort_by_numerical = self.numerical[col_name].clone();
2693                // finding the order of sorting
2694                order = DataFrame::find_order_of_sorting_numerical(
2695                    &mut to_sort_by_numerical,
2696                    ascending,
2697                );
2698            }
2699            _ => panic!("Pass either `s` or `n`"),
2700        }
2701
2702        println!("New order is : {:?}", order);
2703        // reordering the original DataFrame (String)
2704        for (key, value) in self.string.iter() {
2705            let mut new_vector = vec![];
2706            for o in order.iter() {
2707                new_vector.push(value[*o]);
2708            }
2709            output.string.insert(*key, new_vector);
2710        }
2711        // reordering the original DataFrame (Numerical)
2712        for (key, value) in self.numerical.iter() {
2713            let mut new_vector = vec![];
2714            for o in order.iter() {
2715                new_vector.push(value[*o]);
2716            }
2717            output.numerical.insert(*key, new_vector);
2718        }
2719        // reordering the original DataFrame (Numerical)
2720        for (key, value) in self.boolean.iter() {
2721            let mut new_vector = vec![];
2722            for o in order.iter() {
2723                new_vector.push(value[*o]);
2724            }
2725            output.boolean.insert(*key, new_vector);
2726        }
2727
2728        output
2729    }
2730}
2731
pub fn print_a_matrix<T: std::fmt::Debug>(string: &str, matrix: &Vec<Vec<T>>) {
    // Pretty-prints a matrix: a caption line, then one row per line,
    // followed by two blank lines as a separator.
    println!("{}", string);
    matrix.iter().for_each(|row| println!("{:?}", row));
    println!("\n");
}
2741
pub fn shape_changer<T>(list: &Vec<T>, columns: usize, rows: usize) -> Vec<Vec<T>>
where
    T: std::clone::Clone,
{
    /*
    Reshapes a flat vector into a `rows` x `columns` matrix (row-major).

    Panics if `columns * rows != list.len()`.

    Improvement: the previous version rebuilt (reallocated and copied) the
    remaining tail of the list once per row — accidentally O(n^2). Using
    `slice::chunks` walks the data exactly once.
    */
    if columns * rows != list.len() {
        panic!("!!! The shape transformation is not possible, check the values entered !!!");
    }
    if columns == 0 {
        // degenerate case: an empty list reshaped into `rows` empty rows
        return vec![vec![]; rows];
    }
    list.chunks(columns).map(|chunk| chunk.to_vec()).collect()
}
2762
pub fn transpose<T: std::clone::Clone + Copy>(matrix: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    // Returns the transpose of a rectangular matrix: element [i][j]
    // of the input becomes element [j][i] of the output.
    // (Builds each output row directly instead of flattening + reshaping.)
    let mut transposed = Vec::with_capacity(matrix[0].len());
    for column_index in 0..matrix[0].len() {
        let mut new_row = Vec::with_capacity(matrix.len());
        for original_row in matrix.iter() {
            new_row.push(original_row[column_index]);
        }
        transposed.push(new_row);
    }
    transposed
}
2774
pub fn vector_addition<T>(a: &mut Vec<T>, b: &mut Vec<T>) -> Vec<T>
where
    T: std::ops::Add<Output = T> + Copy + std::fmt::Debug + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Index-wise vector addition. When the lengths differ, the shorter vector
    is treated as zero-padded at the end, so the result always has
    `max(a.len(), b.len())` elements.

    BUG FIX: the previous version computed a padded copy but never used it —
    it always iterated `0..a.len()` over the *original* vectors, which
    panicked (index out of bounds) whenever `a` was longer than `b`, and
    silently truncated the result whenever `a` was shorter.

    Note: the &mut parameters are kept for interface compatibility; the
    inputs are not mutated.
    */
    // zero obtained via FromStr, matching pad_with_zero's convention elsewhere in this file
    let zero = "0".parse::<T>().unwrap();
    let result_len = a.len().max(b.len());
    let mut output = Vec::with_capacity(result_len);
    for i in 0..result_len {
        let x = if i < a.len() { a[i] } else { zero };
        let y = if i < b.len() { b[i] } else { zero };
        output.push(x + y);
    }
    output
}
2806
pub fn matrix_multiplication<T>(input: &Vec<Vec<T>>, weights: &Vec<Vec<T>>) -> Vec<Vec<T>>
where
    T: Copy + std::iter::Sum + std::ops::Mul<Output = T>,
{
    /*
    Standard matrix product: (n x k) * (k x m) -> (n x m).
    Panics with "Dimension mismatch" if the inner dimensions differ.

    BUG FIX: the previous version reshaped the flat result with
    `shape_changer(&output, input.len(), weights_t.len())` — rows and columns
    swapped — producing a wrongly shaped matrix whenever the result was not
    square. Output rows are now built directly.
    */
    println!(
        "Multiplication of {}x{} and {}x{}",
        input.len(),
        input[0].len(),
        weights.len(),
        weights[0].len()
    );
    println!("Output will be {}x{}", input.len(), weights[0].len());
    if input[0].len() != weights.len() {
        panic!("Dimension mismatch")
    }
    input
        .iter()
        .map(|row| {
            (0..weights[0].len())
                .map(|c| {
                    // dot product of this input row with column `c` of `weights`
                    row.iter().zip(weights.iter()).map(|(x, w)| *x * w[c]).sum()
                })
                .collect()
        })
        .collect()
}
2836
pub fn dot_product<T>(a: &Vec<T>, b: &Vec<T>) -> T
where
    T: std::ops::Mul<Output = T> + std::iter::Sum + Copy,
{
    // Sum of element-wise products; extra elements of the longer
    // vector are ignored (zip stops at the shorter one).
    a.iter().zip(b.iter()).map(|(&x, &y)| x * y).sum()
}
2844
2845pub fn element_wise_operation<T>(a: &Vec<T>, b: &Vec<T>, operation: &str) -> Vec<T>
2846where
2847    T: Copy
2848        + std::fmt::Debug
2849        + std::ops::Mul<Output = T>
2850        + std::ops::Add<Output = T>
2851        + std::ops::Sub<Output = T>
2852        + std::ops::Div<Output = T>
2853        + std::cmp::PartialEq
2854        + std::str::FromStr,
2855    <T as std::str::FromStr>::Err: std::fmt::Debug,
2856{
2857    /*
2858    operations between two vectors, by passing paramters: "mul","sub","div","add"
2859    */
2860    if a.len() == b.len() {
2861        a.iter().zip(b.iter()).map(|(x, y)| match operation {
2862                        "mul" => *x * *y,
2863                        "add" => *x + *y,
2864                        "sub" => *x - *y,
2865                        "div" => *x / *y,
2866                        _ => panic!("Operation unsuccessful!\nEnter any of the following(case sensitive):\n> Add\n> Sub\n> Mul\n> Div"),
2867                    })
2868                    .collect()
2869    } else {
2870        panic!("Dimension mismatch")
2871    }
2872}
2873
pub fn pad_with_zero<T>(vector: &mut Vec<T>, count: usize, position: &str) -> Vec<T>
where
    T: Copy + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Returns a copy of `vector` with `count` zeros appended ("post")
    or prepended ("pre"). Any other position panics.
    The input vector itself is left unchanged.
    */
    // zero is obtained by parsing "0", hence the FromStr bound
    let zero = "0".parse::<T>().unwrap();
    let padding = vec![zero; count];
    match position {
        "post" => [&vector[..], &padding[..]].concat(),
        "pre" => [&padding[..], &vector[..]].concat(),
        _ => panic!("Position can either be `post` or `pre`"),
    }
}
2899
2900pub fn make_matrix_float<T>(input: &Vec<Vec<T>>) -> Vec<Vec<f64>>
2901where
2902    T: std::fmt::Display + Copy,
2903{
2904    /*
2905    Convert each element of matrix into f64
2906    */
2907    // println!("========================================================================================================================================================");
2908    input
2909        .iter()
2910        .map(|a| {
2911            a.iter()
2912                .map(|b| {
2913                    if is_numerical(*b) {
2914                        format!("{}", b).parse().unwrap()
2915                    } else {
2916                        panic!("Non numerical value present in the intput");
2917                    }
2918                })
2919                .collect()
2920        })
2921        .collect()
2922}
2923
2924pub fn make_vector_float<T>(input: &Vec<T>) -> Vec<f64>
2925where
2926    T: std::fmt::Display + Copy,
2927{
2928    /*
2929    Convert each element of vector into f64
2930    */
2931    // println!("========================================================================================================================================================");
2932    input
2933        .iter()
2934        .map(|b| {
2935            if is_numerical(*b) {
2936                format!("{}", b).parse().unwrap()
2937            } else {
2938                panic!("Non numerical value present in the intput");
2939            }
2940        })
2941        .collect()
2942}
pub fn round_off_f(value: f64, decimals: i32) -> f64 {
    /*
    Rounds an f64 to the given number of decimal places
    (rounds half away from zero, like f64::round).
    */
    let scale = 10.0_f64.powi(decimals);
    (value * scale).round() / scale
}
2950
pub fn min_max_f(list: &Vec<f64>) -> (f64, f64) {
    /*
    Returns (minimum, maximum) of a vector of f64.

    Panics with "Empty vector found" on an empty input.

    BUG FIX: the previous version split the input into positive and negative
    halves and sorted each; for an all-negative input it returned
    (maximum, minimum) — the tuple reversed. It also carried a dead
    `type_of(...) == "f64"` check (always true given the signature) and paid
    two O(n log n) sorts. This version is a single O(n) pass and returns
    (min, max) for every non-empty input.

    Note: NaN elements are ignored by the comparisons (the old code panicked
    on NaN inside partial_cmp().unwrap()).
    */
    if list.is_empty() {
        panic!("Empty vector found")
    }
    let mut minimum = list[0];
    let mut maximum = list[0];
    for &v in list.iter().skip(1) {
        if v < minimum {
            minimum = v;
        }
        if v > maximum {
            maximum = v;
        }
    }
    (minimum, maximum)
}
2989
pub fn is_numerical<T>(value: T) -> bool {
    // Decides "is this a primitive numeric type?" by inspecting the compile-time
    // type name of `&T` — the same probe the original performed via
    // `type_of(&value)` — against a fixed table of reference type names.
    let _ = &value; // the value itself is never read; only its type matters
    const NUMERIC_TYPE_NAMES: [&str; 14] = [
        "&i8", "&i16", "&i32", "&i64", "&i128", "&isize",
        "&u8", "&u16", "&u32", "&u64", "&u128", "&usize",
        "&f32", "&f64",
    ];
    NUMERIC_TYPE_NAMES.contains(&std::any::type_name::<&T>())
}
3011
3012// use std::collections::BTreeMap;
pub fn value_counts<T: std::cmp::Ord>(list: &Vec<T>) -> BTreeMap<T, u32>
where
    T: std::cmp::PartialEq + std::cmp::Eq + std::hash::Hash + Copy,
{
    /*
    Returns a map of every unique value to its frequency count,
    ordered by key (BTreeMap).

    Improvement: the previous version did a contains_key + index + insert —
    three tree lookups per element; the entry API does one.
    */
    let mut count: BTreeMap<T, u32> = BTreeMap::new();
    for i in list {
        *count.entry(*i).or_insert(0) += 1;
    }
    count
}
3027
3028use std::any::type_name;
pub fn type_of<T>(_: T) -> &'static str {
    /*
    Returns the compile-time type name of whatever is passed
    (e.g. "i32", "&f64"); the value itself is discarded.
    Note: type_name's exact output is not guaranteed stable across
    compiler versions — callers compare it against literals.
    */
    type_name::<T>()
}
3035
pub fn unique_values<T>(list: &Vec<T>) -> Vec<T>
where
    T: std::cmp::PartialEq + Copy,
{
    /*
    Returns the distinct values of the vector, preserving first-occurrence
    order. (Linear scan per element — the PartialEq-only bound rules out a
    hash set.)
    */
    let mut seen: Vec<T> = Vec::new();
    for item in list.iter() {
        if !seen.contains(item) {
            seen.push(*item);
        }
    }
    seen
}
3052
pub fn element_wise_matrix_operation<T>(
    matrix1: &Vec<Vec<T>>,
    matrix2: &Vec<Vec<T>>,
    operation: &str,
) -> Vec<Vec<T>>
where
    T: Copy
        + std::fmt::Debug
        + std::ops::Mul<Output = T>
        + std::ops::Add<Output = T>
        + std::ops::Sub<Output = T>
        + std::ops::Div<Output = T>
        + std::cmp::PartialEq
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    /*
    Element-wise operation between two same-shaped matrices.
    operation : one of "mul", "sub", "div", "add" (lowercase).

    Panics on shape mismatch or an unknown operation.

    BUG FIX: the "unknown operation" panic message previously listed the
    operations as "Add/Sub/Mul/Div" while claiming to be case sensitive —
    the match arms only accept lowercase. The message now matches reality.
    */
    if matrix1.len() != matrix2.len() || matrix1[0].len() != matrix2[0].len() {
        panic!("Dimension mismatch")
    }
    matrix1
        .iter()
        .zip(matrix2.iter())
        .map(|(x, y)| {
            x.iter()
                .zip(y.iter())
                .map(|a| match operation {
                    "mul" => *a.0 * *a.1,
                    "add" => *a.0 + *a.1,
                    "sub" => *a.0 - *a.1,
                    "div" => *a.0 / *a.1,
                    _ => panic!("Operation unsuccessful!\nEnter any of the following(case sensitive):\n> add\n> sub\n> mul\n> div"),
                })
                .collect()
        })
        .collect()
}
3093
pub fn matrix_vector_product_f(matrix: &Vec<Vec<f64>>, vector: &Vec<f64>) -> Vec<f64> {
    /*
    Dot product of each row of the matrix with the vector.
    Panics when the row length and vector length differ.
    */
    if matrix[0].len() != vector.len() {
        panic!("The lengths do not match, please check");
    }
    matrix
        .iter()
        .map(|row| row.iter().zip(vector.iter()).map(|(x, y)| x * y).sum())
        .collect()
}
3108
pub fn split_vector<T: std::clone::Clone>(vector: &Vec<T>, parts: i32) -> Vec<Vec<T>> {
    /*
    Splits the vector into `parts` equal pieces; panics when the length
    is not evenly divisible by `parts`.
    */
    let parts = parts as usize;
    if vector.len() % parts != 0 {
        panic!("This partition is not possible, check the number of partitions passed")
    }
    let piece_size = vector.len() / parts;
    vector
        .chunks(piece_size)
        .map(|piece| piece.to_vec())
        .collect()
}
3128
pub fn split_vector_at<T>(vector: &Vec<T>, at: T) -> Vec<Vec<T>>
where
    T: std::cmp::PartialEq + Copy + std::clone::Clone,
{
    /*
    Splits the vector every time `at` occurs: the first piece is everything
    before the first occurrence, and each subsequent piece *starts with* an
    occurrence of `at`. Panics when `at` is not present at all.
    */
    if !vector.contains(&at) {
        panic!("The value is not in the vector, please check");
    }
    let mut pieces: Vec<Vec<T>> = Vec::new();
    let mut start = 0;
    for (idx, element) in vector.iter().enumerate() {
        if *element == at {
            pieces.push(vector[start..idx].to_vec());
            start = idx;
        }
    }
    pieces.push(vector[start..].to_vec());
    pieces
}
3152
pub fn join_matrix<T: Copy>(
    matrix1: &Vec<Vec<T>>,
    matrix2: &Vec<Vec<T>>,
    how: &str,
) -> Vec<Vec<T>> {
    /*
    Concatenates two matrices.
    "wide" : places matrix2's rows to the right of matrix1's rows
             [[1,2],[3,5]] + [[0,1],[5,7]] => [[1,2,0,1],[3,5,5,7]]
             (panics when the row counts differ)
    "long" : places matrix2 below matrix1
             [[1,2],[3,5]] + [[0,1],[5,7]] => [[1,2],[3,5],[0,1],[5,7]]
             (panics when the column counts differ)

    BUG FIX: the "long" branch previously iterated over `matrix2` while
    indexing `matrix1` (`a[n]`), which panicked whenever matrix2 had more
    rows than matrix1 and silently dropped rows whenever it had fewer. The
    "wide" branch was also accidentally O(rows^2); both are now linear.
    */
    match how {
        "wide" => {
            if matrix1.len() != matrix2.len() {
                panic!("Please check the dimensions, # of rows are different");
            }
            matrix1
                .iter()
                .zip(matrix2.iter())
                .map(|(left, right)| {
                    let mut row = left.clone();
                    row.extend(right.iter().copied());
                    row
                })
                .collect()
        }
        "long" => {
            if matrix1[0].len() != matrix2[0].len() {
                panic!("Please check the dimensions, # of columns are different");
            }
            let mut output = matrix1.clone();
            output.extend(matrix2.iter().cloned());
            output
        }
        _ => panic!("Select either long or wide"),
    }
}
3206
pub fn make_matrix_string_literal<'a>(data: &'a Vec<Vec<String>>) -> Vec<Vec<&'a str>> {
    /*
    Borrows each String cell as a &str (String is not Copy, so downstream
    code that requires Copy works on the borrowed form instead).
    */
    let result: Vec<Vec<&str>> = data
        .iter()
        .map(|row| row.iter().map(|cell| cell.as_str()).collect())
        .collect();
    println!("> String converted to &str");
    result
}
3218
3219pub fn head<T: std::clone::Clone + std::fmt::Debug>(data: &Vec<Vec<T>>, rows: usize) {
3220    /*
3221    Works on row wise data
3222    Shows first few rows of a matrix
3223    */
3224    if rows <= data.len() {
3225        let output = data[..rows].to_vec();
3226        print_a_matrix(&format!("First {} rows", rows), &output);
3227    } else {
3228        panic!("Data is nt that big, please check the numbers");
3229    }
3230}
3231
3232pub fn tail<T: std::clone::Clone + std::fmt::Debug>(data: &Vec<Vec<T>>, rows: usize) {
3233    /*
3234    Works on row wise data
3235    Shows first few rows of a matrix
3236    */
3237    if rows <= data.len() {
3238        let output = data[data.len() - rows..].to_vec();
3239        print_a_matrix(&format!("Last {} rows", rows), &output);
3240    } else {
3241        panic!("Data is nt that big, please check the numbers");
3242    }
3243}
3244
pub fn row_to_columns_conversion<T: std::fmt::Debug + Copy>(data: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    /*
    Converts row-wise data (as produced by e.g. read_csv) into column-wise
    data, for column aggregations:
    [[1,6,11],[2,7,12],[3,8,13]] => [[1,2,3],[6,7,8],[11,12,13]]
    */
    let mut columns: Vec<Vec<T>> = Vec::with_capacity(data[0].len());
    for j in 0..data[0].len() {
        let mut column = Vec::with_capacity(data.len());
        for row in data.iter() {
            column.push(row[j]);
        }
        columns.push(column);
    }
    columns
}
3259
pub fn columns_to_rows_conversion<T: std::fmt::Debug + Copy>(data: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    /*
    Opposite of row_to_columns_conversion — turns column-wise data back
    into row-wise data:
    [[1,2,3],[6,7,8],[11,12,13]] => [[1,6,11],[2,7,12],[3,8,13]]
    */
    (0..data[0].len())
        .map(|j| data.iter().map(|column| column[j]).collect())
        .collect()
}
3277
/// Prints a comparison report of two DataMaps to stdout, in three sections:
/// 1. column *count* per type (string / numerical / boolean),
/// 2. column *name* overlap per type (order-insensitive),
/// 3. cell *value* comparison for columns with the same name.
///
/// Purely diagnostic — nothing is returned or mutated.
/// Relies on the file-level helper `compare_vectors`, which appears to return
/// a tuple of (match indicator, list of (index, index) mismatch positions) —
/// NOTE(review): confirm its exact contract; it is also called twice per
/// matching column, doing the comparison work twice.
pub fn datamap_comparision(table1: &DataMap, table2: &DataMap) {
    /*
    Generates report on count, similarities and dissimilarities of 2 DataMaps
    */

    // comparing column count
    println!("\n********** Count comparision **********");
    let string_columns1 = table1.string.keys().collect::<Vec<&&str>>();
    let string_columns2 = table2.string.keys().collect::<Vec<&&str>>();

    if string_columns1.len() == string_columns2.len() {
        println!("Number of String columns match");
    } else {
        println!(
            "Mismatch in count of String columns : Table 1 has {} ; while Table 2 has {:?}",
            string_columns1.len(),
            string_columns2.len()
        );
    }

    let numerical_columns1 = table1.numerical.keys().collect::<Vec<&&str>>();
    let numerical_columns2 = table2.numerical.keys().collect::<Vec<&&str>>();

    if numerical_columns1.len() == numerical_columns2.len() {
        println!("Number of Numerical columns match");
    } else {
        println!(
            "Mismatch in count of Numerical columns : Table 1 has {} while Table 2 has {:?}",
            numerical_columns1.len(),
            numerical_columns2.len()
        );
    }

    let boolean_columns1 = table1.boolean.keys().collect::<Vec<&&str>>();
    let boolean_columns2 = table2.boolean.keys().collect::<Vec<&&str>>();

    if boolean_columns1.len() == boolean_columns2.len() {
        println!("Number of Boolean columns match");
    } else {
        println!(
            "Mismatch in count of Boolean columns : Table 1 has {} while Table 2 has {:?}",
            boolean_columns1.len(),
            boolean_columns2.len()
        );
    }

    println!("\n********** Column name comparision **********");
    // checking for duplicate headers is not required as the HashMap keys are unique by construction

    // c counts name hits in both directions; mis_string2 collects table1
    // columns absent from table2, mis_string1 the reverse
    let mut c = 0;
    let mut mis_string1 = vec![];
    let mut mis_string2 = vec![];
    for i in string_columns1.iter() {
        if string_columns2.contains(i) {
            c += 1;
        } else {
            mis_string2.push(i);
        }
    }

    for i in string_columns2.iter() {
        if string_columns1.contains(i) {
            c += 1;
        } else {
            mis_string1.push(i);
        }
    }
    // println!("{:?}", c);
    // every name matched in both directions => identical name sets
    if c == string_columns1.len() + string_columns2.len() {
        println!("String columns match (irrespective of order)");
    } else {
        if mis_string1.len() > 0 && mis_string2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in String ; Table 2 has {:?} missing in String",
                mis_string1, mis_string2
            );
        }
        if mis_string1.len() > 0 && mis_string2.len() == 0 {
            println!("Table 1 has {:?} missing in String", mis_string1);
        }
        if mis_string1.len() == 0 && mis_string2.len() > 0 {
            println!("Table 2 has {:?} missing in String", mis_string2);
        }
    }

    // same two-way membership scan for numerical column names
    c = 0;
    let mut mis_numerical1 = vec![];
    let mut mis_numerical2 = vec![];
    for i in numerical_columns1.iter() {
        if numerical_columns2.contains(i) {
            c += 1;
        } else {
            mis_numerical2.push(i);
        }
    }

    for i in numerical_columns2.iter() {
        if numerical_columns1.contains(i) {
            c += 1;
        } else {
            mis_numerical1.push(i);
        }
    }
    // println!("{:?}", c);
    if c == numerical_columns1.len() + numerical_columns2.len() {
        println!("Numerical columns match (irrespective of order)");
    } else {
        if mis_numerical1.len() > 0 && mis_numerical2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in Numerical ; Table 2 has {:?} missing in Numerical",
                mis_numerical1, mis_numerical2
            );
        }
        if mis_numerical1.len() > 0 && mis_numerical2.len() == 0 {
            println!("Table 1 has {:?} missing in Numerical", mis_numerical1);
        }
        if mis_numerical1.len() == 0 && mis_numerical2.len() > 0 {
            println!("Table 2 has {:?} missing in Numerical", mis_numerical2);
        }
    }

    // same two-way membership scan for boolean column names
    c = 0;
    let mut mis_boolean1 = vec![];
    let mut mis_boolean2 = vec![];
    for i in boolean_columns1.iter() {
        if boolean_columns2.contains(i) {
            c += 1;
        } else {
            mis_boolean2.push(i);
        }
    }

    for i in boolean_columns2.iter() {
        if boolean_columns1.contains(i) {
            c += 1;
        } else {
            mis_boolean1.push(i);
        }
    }
    // println!("{:?}", c);
    if c == boolean_columns1.len() + boolean_columns2.len() {
        println!("Boolean columns match (irrespective of order)");
    } else {
        if mis_boolean1.len() > 0 && mis_boolean2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in Boolean ; Table 2 has {:?} missing in Boolean",
                mis_boolean1, mis_boolean2
            );
        }
        if mis_boolean1.len() > 0 && mis_boolean2.len() == 0 {
            println!("Table 1 has {:?} missing in Boolean", mis_boolean1);
        }
        if mis_boolean1.len() == 0 && mis_boolean2.len() > 0 {
            println!("Table 2 has {:?} missing in Boolean", mis_boolean2);
        }
    }

    println!("\n********** Value comparision (for the common columns) **********");
    // string_similarity accumulates compare_vectors().0 over same-named columns;
    // comparing it to table1.string.len() assumes one unit per fully-matching
    // column — NOTE(review): verify against compare_vectors' return convention
    let mut string_similarity = 0;
    let mut dissimilarity = vec![];
    for (k1, v1) in table1.string.iter() {
        for (k2, v2) in table2.string.iter() {
            if k1 == k2 {
                string_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if string_similarity == table1.string.len() {
        println!("The string values matchs, if present");
    } else {
        println!("Dissimilar in String at (Table 1, Table 2): ");
        // NOTE(review): indexing keys() by dissimilarity position assumes the
        // two iterations visit columns in the same (arbitrary) HashMap order
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.string.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }

    // identical pattern for numerical columns
    let mut numerical_similarity = 0;
    dissimilarity = vec![];
    for (k1, v1) in table1.numerical.iter() {
        for (k2, v2) in table2.numerical.iter() {
            if k1 == k2 {
                numerical_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if numerical_similarity == table1.numerical.len() {
        println!("The numerical values matchs, if present");
    } else {
        println!("Dissimilar in Numerical at (Table 1, Table 2): ");
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.numerical.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }

    // identical pattern for boolean columns
    let mut boolean_similarity = 0;
    dissimilarity = vec![];
    for (k1, v1) in table1.boolean.iter() {
        for (k2, v2) in table2.boolean.iter() {
            if k1 == k2 {
                boolean_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if boolean_similarity == table1.boolean.len() {
        println!("The Boolean values matchs, if present");
    } else {
        println!("Dissimilar in boolean at (Table 1, Table 2): ");
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.boolean.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }
}
3520
3521pub fn dataframe_comparision(table1: &DataFrame, table2: &DataFrame) {
3522    /*
3523    Generates report on count, similarities and dissimilarities of 2 DataMaps
3524    */
3525    // comparing column count
3526    println!("\n********** Count comparision **********");
3527    if table1.string.len() == table2.string.len() {
3528        println!("String columns count : {:?}", table1.string.len(),);
3529    } else {
3530        println!(
3531            "String columns count are not the same {:?} and {:?}",
3532            table1.string.len(),
3533            table2.string.len()
3534        );
3535    }
3536
3537    if table1.numerical.len() == table2.numerical.len() {
3538        println!("Numerical columns count : {:?}", table1.numerical.len(),);
3539    } else {
3540        println!(
3541            "Numerical columns count are not the same {:?} and {:?}",
3542            table1.numerical.len(),
3543            table2.numerical.len(),
3544        );
3545    }
3546
3547    if table1.boolean.len() == table2.boolean.len() {
3548        println!("Boolean columns count : {:?}", table1.boolean.len(),);
3549    } else {
3550        println!(
3551            "Boolean columns count are not the same {:?} and {:?}",
3552            table1.boolean.len(),
3553            table2.boolean.len()
3554        );
3555    }
3556
3557    println!("\n********** Value comparision (for the common columns) **********");
3558    let mut string_similarity = 0;
3559    let mut dissimilarity = vec![];
3560    for (ni, i) in table1.string.iter().enumerate() {
3561        for (nj, j) in i.iter().enumerate() {
3562            for (nk, k) in table2.string.iter().enumerate() {
3563                for (nl, l) in k.iter().enumerate() {
3564                    if nj == nl && nk == ni {
3565                        if j == l {
3566                            string_similarity += 1;
3567                        } else {
3568                            dissimilarity.push(((ni, nj), (nk, nl)));
3569                        }
3570                    }
3571                }
3572            }
3573        }
3574    }
3575    if string_similarity == table1.string[0].len() * table1.string.len() {
3576        println!("The string values matchs, if present");
3577    } else {
3578        println!("Dissimilar in String at :");
3579        let _ = dissimilarity
3580            .iter()
3581            .enumerate()
3582            .map(|(n, a)| {
3583                println!("{:?} : {:?}", n, a);
3584                *a
3585            })
3586            .collect::<Vec<((usize, usize), (usize, usize))>>();
3587    }
3588
3589    let mut numerical_similarity = 0;
3590    let mut dissimilarity = vec![];
3591    for (ni, i) in table1.numerical.iter().enumerate() {
3592        for (nj, j) in i.iter().enumerate() {
3593            for (nk, k) in table2.numerical.iter().enumerate() {
3594                for (nl, l) in k.iter().enumerate() {
3595                    if nj == nl && nk == ni {
3596                        if j == l {
3597                            numerical_similarity += 1;
3598                        } else {
3599                            dissimilarity.push(((ni, nj), (nk, nl)));
3600                        }
3601                    }
3602                }
3603            }
3604        }
3605    }
3606    if numerical_similarity == table1.numerical[0].len() * table1.numerical.len() {
3607        println!("The numerical values matchs, if present");
3608    } else {
3609        println!("Dissimilar in Numerical at :");
3610        let _ = dissimilarity
3611            .iter()
3612            .enumerate()
3613            .map(|(n, a)| {
3614                println!("{:?} : {:?}", n, a);
3615                *a
3616            })
3617            .collect::<Vec<((usize, usize), (usize, usize))>>();
3618    }
3619
3620    let mut boolean_similarity = 0;
3621    let mut dissimilarity = vec![];
3622    for (ni, i) in table1.boolean.iter().enumerate() {
3623        for (nj, j) in i.iter().enumerate() {
3624            for (nk, k) in table2.boolean.iter().enumerate() {
3625                for (nl, l) in k.iter().enumerate() {
3626                    if nj == nl && nk == ni {
3627                        if j == l {
3628                            boolean_similarity += 1;
3629                        } else {
3630                            dissimilarity.push(((ni, nj), (nk, nl)));
3631                        }
3632                    }
3633                }
3634            }
3635        }
3636    }
3637    if boolean_similarity == table1.boolean[0].len() * table1.boolean.len() {
3638        println!("The boolean values matchs, if present");
3639    } else {
3640        println!("Dissimilar in Boolean at :");
3641        let _ = dissimilarity
3642            .iter()
3643            .enumerate()
3644            .map(|(n, a)| {
3645                println!("{:?} : {:?}", n, a);
3646                *a
3647            })
3648            .collect::<Vec<((usize, usize), (usize, usize))>>();
3649    }
3650}
3651
pub fn compare_vectors<T: std::cmp::PartialEq>(
    v1: &Vec<T>,
    v2: &Vec<T>,
) -> (usize, Vec<(usize, usize)>) {
    /*
    Compares two vectors element-wise up to the shorter length.
    Returns:
      .0 : number of positions where the elements are equal
      .1 : (index, index) pairs of the positions where they differ
    The old version scanned all n*m index pairs and kept only n == m;
    a single zip pass visits exactly those pairs in O(min(n, m)).
    */
    let mut similarity = 0;
    let mut dissimilarity = vec![];
    for (n, (a, b)) in v1.iter().zip(v2.iter()).enumerate() {
        if a == b {
            similarity += 1;
        } else {
            dissimilarity.push((n, n));
        }
    }
    (similarity, dissimilarity)
}
3671
3672/*
3673DESCRIPTION
3674-----------------------------------------
3675STRUCTS
3676-------
1. StringToMatch :
        > compare_percentage : comparison based on presence of characters and their position
            x calculate
        > clean_string : lowercase it and keep alphanumeric characters only
            x char_vector
        > compare_chars
        > compare_position
        > fuzzy_subset : scores based on chunks of the string
            x n_gram
        > split_alpha_numericals : separates numbers from the rest
        > char_count : Returns a dictionary of characters arranged in alphabetically increasing order with their frequency
        > frequent_char : Returns the most frequently occurring character in the string passed
3689        > char_replace : Finds a character, replaces it with a string at all positions or at just the first depending on operation argument
3690
3691FUNCTIONS
3692---------
1. extract_vowels_consonants : Returns a tuple of vectors of chars (original case preserved; non-ASCII-letter characters dropped)
    > 1. string : String
    = Vec<chars> (vowels)
    = Vec<chars> (consonants)
3697
36982. sentence_case
3699    > 1. string : String
3700    = String
3701
37023. remove_stop_words : Based on NLTK removing words that dont convey much from a string
3703    > 1. string : String
3704    = String
3705
37064. tokenize :
3707    > string: String
3708    > symbol: &Vec<&'a str>
3709     = Vec<String>
3710
3711*/
3712use std::collections::BTreeMap;
/// A pair of strings scored against each other by the similarity and
/// fuzzy-matching methods in the `impl` block below.
pub struct StringToMatch {
    /// First string of the pair to compare.
    pub string1: String,
    /// Second string of the pair to compare.
    pub string2: String,
}
3717
impl StringToMatch {
    pub fn compare_percentage(
        &self,
        weightage_for_position: f64,
        weightage_for_presence: f64,
    ) -> f64 {
        /*
            Scores by comparing characters and its position as per weightage passed
            Weightage passed as ratio
            ex: 2.,1. will give double weightage to position than presence
        */
        // NOTE(review): the sum is divided by a constant 2. rather than by
        // (weightage_for_position + weightage_for_presence), so weight ratios
        // other than 1:1 can push the score above 100 — confirm intended.
        ((StringToMatch::compare_chars(&self) * weightage_for_presence * 100.)
            + (StringToMatch::compare_position(&self) * weightage_for_position * 100.))
            / 2.
    }

    pub fn clean_string(s1: String) -> String {
        /*
            Lowercase and removes special characters
        */

        // case uniformity
        let this = s1.to_lowercase();

        // only alphanumericals + accents - bytes between 48-57, 97-122, 128-200
        // https://www.utf8-chartable.de/unicode-utf8-table.pl?number=1024&utf8=dec&unicodeinhtml=dec
        // NOTE(review): this filters individual UTF-8 *bytes*, not characters.
        // A multi-byte char whose lead byte falls outside 128-200 leaves its
        // continuation bytes behind, and the from_utf8 unwrap below will then
        // panic — confirm inputs are limited to ASCII/Latin-range text.
        let this_byte: Vec<_> = this
            .as_bytes()
            .iter()
            .filter(|a| {
                (**a > 47 && **a < 58) || (**a > 96 && **a < 123) || (**a > 127 && **a < 201)
            })
            .map(|a| *a)
            .collect();
        let new_this = std::str::from_utf8(&this_byte[..]).unwrap();
        new_this.to_string()
    }

    fn char_vector(string1: String) -> Vec<char> {
        /*
            String to vector of characters
            (cleaned first via clean_string: lowercased, specials removed)
        */
        let string1 = StringToMatch::clean_string(string1.clone());
        string1.chars().collect()
    }

    fn calculate(actual: f64, v1: &Vec<char>, v2: &Vec<char>) -> f64 {
        /*
            normalizes score by dividing it with the longest string's length
        */
        let larger = if v1.len() > v2.len() {
            v1.len()
        } else {
            v2.len()
        };
        (actual / larger as f64)
    }

    pub fn compare_chars(&self) -> f64 {
        /*
            Scores as per occurance of characters:
            each char of string1 present anywhere in string2 adds 1
            (duplicates in string1 count every time), then the total is
            normalized by the longer string's length.
        */
        let mut output = 0.;
        // println!("{:?} vs {:?}", self.string1, self.string2);
        let vec1 = StringToMatch::char_vector(self.string1.clone());
        let vec2 = StringToMatch::char_vector(self.string2.clone());

        for i in vec1.iter() {
            if vec2.contains(i) {
                output += 1.;
            }
        }
        StringToMatch::calculate(output, &vec1, &vec2)
    }
    pub fn compare_position(&self) -> f64 {
        /*
            Scores as per similar positioning of characters:
            counts the indices where both strings carry the same char
            (zip stops at the shorter string), normalized by the longer
            string's length.
        */
        let mut output = 0.;
        // println!("{:?} vs {:?}", self.string1, self.string2);
        let vec1 = StringToMatch::char_vector(self.string1.clone());
        let vec2 = StringToMatch::char_vector(self.string2.clone());

        let combined: Vec<_> = vec1.iter().zip(vec2.iter()).collect();

        for (i, j) in combined.iter() {
            if i == j {
                output += 1.;
            }
        }
        StringToMatch::calculate(output, &vec1, &vec2)
    }

    pub fn fuzzy_subset(&self, n_gram: usize) -> f64 {
        /*
            break into chuncks and compare if not a subset
            Returns 100. when the shorter (cleaned) string is a literal
            substring of the longer; otherwise the percentage of n-grams
            of the shorter string found among the longer string's n-grams.
        */
        let match_percentage;
        let vec1 = StringToMatch::clean_string(self.string1.clone());
        let vec2 = StringToMatch::clean_string(self.string2.clone());

        // finding the subset out of the two parameters
        let mut subset = vec2.clone();
        let mut superset = vec1.clone();
        if vec1.len() < vec2.len() {
            subset = vec1;
            superset = vec2;
        }

        let mut chunck_match_count = 0.;

        // whole string
        if superset.contains(&subset) {
            match_percentage = 100.
        } else {
            // breaking them into continous chuncks
            let superset_n = StringToMatch::n_gram(&superset, n_gram);
            let subset_n = StringToMatch::n_gram(&subset, n_gram);
            for i in subset_n.iter() {
                if superset_n.contains(i) {
                    chunck_match_count += 1.;
                }
            }
            // calculating match score, normalized by the smaller gram count
            let smaller = if superset_n.len() < subset_n.len() {
                superset_n.len()
            } else {
                subset_n.len()
            };
            match_percentage = (chunck_match_count / smaller as f64) * 100.
        }

        println!("{:?} in {:?}", subset, superset);
        match_percentage
    }

    // Unique n-grams of `string` taken at stride `window_size`.
    // NOTE(review): slicing is by *byte* offsets and length, so multi-byte
    // UTF-8 input can panic on a non-char boundary; also the
    // `< string.len() - 1` bound drops the final window(s). Relies on
    // unique_values(), defined elsewhere in this file.
    fn n_gram<'a>(string: &'a str, window_size: usize) -> Vec<&'a str> {
        let vector: Vec<_> = string.chars().collect();
        let mut output = vec![];
        for (mut n, _) in vector.iter().enumerate() {
            while n + window_size < string.len() - 1 {
                // println!("Working");
                output.push(&string[n..n + window_size]);
                n = n + window_size;
            }
        }
        unique_values(&output)
    }

    pub fn split_alpha_numericals(string: String) -> (String, String) {
        /*
        Splits a string into (digits, letters-and-spaces), dropping all
        other characters:
        "Something 123 else" => ("123","Something  else")
        */
        let bytes: Vec<_> = string.as_bytes().to_vec();
        let numbers: Vec<_> = bytes.iter().filter(|a| **a < 58 && **a > 47).collect();
        // debug print of the raw byte values
        println!("{:?}", bytes);
        let aplhabets: Vec<_> = bytes
            .iter()
            .filter(|a| {
                (**a > 64 && **a < 91) // A-Z
                    || (**a > 96 && **a < 123) // a-z
                    || (**a > 127 && **a < 201) // letters with accents
                    || (**a == 32) // spaces
            })
            .collect();

        (
            // to have output as concatenated string
            String::from_utf8(numbers.iter().map(|a| **a).collect()).unwrap(),
            String::from_utf8(aplhabets.iter().map(|a| **a).collect()).unwrap(),
        )
    }

    pub fn char_count(string: String) -> BTreeMap<char, u32> {
        /*
        Frequency of each character (after lowercasing), keyed in
        alphabetically increasing order:
        "SOmething Else" => {' ': 1, 'e': 3, 'g': 1, 'h': 1, 'i': 1, 'l': 1, 'm': 1, 'n': 1, 'o': 1, 's': 2, 't': 1}
         */
        let mut count: BTreeMap<char, Vec<i32>> = BTreeMap::new();
        let vector: Vec<_> = string.to_lowercase().chars().collect();

        // empty dictionary — one key per distinct character
        for i in vector.iter() {
            count.insert(*i, vec![]);
        }
        // dictionary with a 1 recorded per occurrence of the key
        let mut new_count: BTreeMap<char, Vec<i32>> = BTreeMap::new();
        for (k, _) in count.iter() {
            let mut values = vec![];
            for i in vector.iter() {
                if i == k {
                    values.push(1);
                }
            }
            new_count.insert(*k, values);
        }

        // dictionary with sum of 1s => final frequency per character
        let mut output = BTreeMap::new();
        for (k, v) in new_count.iter() {
            output.insert(*k, v.iter().fold(0, |a, b| a as u32 + *b as u32));
        }

        output
    }

    pub fn frequent_char(string: String) -> char {
        /*
            Most frequently occurring character (lowercased):
            "SOmething Else" => 'e'
            Ties keep the alphabetically smallest character, because the
            BTreeMap iterates keys in sorted order and only a strictly
            greater count replaces the current best. Returns '-' for an
            empty string.
        */
        let dict = StringToMatch::char_count(string);
        let mut value = 0;
        let mut key = '-';
        for (k, _) in dict.iter() {
            key = match dict.get_key_value(k) {
                Some((x, y)) => {
                    if *y > value {
                        value = *y;
                        *x
                    } else {
                        key
                    }
                }
                _ => panic!("Please check the input!!"),
            };
        }
        key
    }

    pub fn char_replace(string: String, find: char, replace: String, operation: &str) -> String {
        /*
        Replaces `find` with the string `replace`, either at every
        occurrence (operation == "all") or only the first (operation ==
        "first"); any other operation value panics, as does a `find`
        character that is absent from `string`.
        ALL : SOmething Else is now "SOmZthing ElsZ"
        First : SOmething Else is now "SOmZthing Else"
        */

        if string.contains(find) {
            // work in UTF-8 bytes; split_vector_at is defined elsewhere in
            // this file and splits so each later piece starts with `find`
            let string_utf8 = string.as_bytes().to_vec();
            let find_utf8 = find.to_string().as_bytes().to_vec();
            let replace_utf8 = replace.as_bytes().to_vec();
            let split = split_vector_at(&string_utf8, find_utf8[0]);
            let split_vec: Vec<_> = split
                .iter()
                .map(|a| String::from_utf8(a.to_vec()).unwrap())
                .collect();
            let mut new_string_vec = vec![];
            if operation == "all" {
                // every piece after the first starts with `find`; drop that
                // leading char and prepend the replacement
                for (n, _) in split_vec.iter().enumerate() {
                    if n > 0 {
                        let x = split_vec[n][1..].to_string();
                        new_string_vec.push(format!(
                            "{}{}",
                            String::from_utf8(replace_utf8.clone()).unwrap(),
                            x.clone()
                        ));
                    } else {
                        new_string_vec.push(split_vec[n].clone());
                    }
                }
            } else {
                if operation == "first" {
                    // only the piece at index 1 (first occurrence) is replaced
                    for (n, _) in split_vec.iter().enumerate() {
                        if n == 1 {
                            let x = split_vec[n][1..].to_string();

                            new_string_vec.push(format!(
                                "{}{}",
                                String::from_utf8(replace_utf8.clone()).unwrap(),
                                x.clone()
                            ));
                        } else {
                            new_string_vec.push(split_vec[n].clone());
                        }
                    }
                } else {
                    panic!("Either pass operation as `all` or `first`");
                }
            }
            new_string_vec.concat()
        } else {
            panic!("The character to replace does not exist in the string passed, please check!")
        }
    }
}
4001
pub fn extract_vowels_consonants(string: String) -> (Vec<char>, Vec<char>) {
    /*
    Splits the ASCII letters of `string` into two ordered lists,
    preserving the original case; anything that is not an ASCII letter
    (digits, spaces, punctuation, non-ASCII characters) is dropped.
    .0 : vowels
    .1 : consonants
    */
    let mut vowels: Vec<char> = Vec::new();
    let mut consonants: Vec<char> = Vec::new();
    for c in string.chars() {
        match c {
            'a' | 'e' | 'i' | 'o' | 'u' | 'A' | 'E' | 'I' | 'O' | 'U' => vowels.push(c),
            _ if c.is_ascii_alphabetic() => consonants.push(c),
            _ => {} // not an ASCII letter — discard
        }
    }
    (vowels, consonants)
}
4052
pub fn sentence_case(string: String) -> String {
    /*
    Lowercases the whole sentence, then capitalizes the first character of
    every space-separated word:
    "The quick brown dog jumps Over the lazy fox" => "The Quick Brown Dog Jumps Over The Lazy Fox"
    */
    // Fixes over the previous revision:
    // * no longer panics on empty words (consecutive spaces) — they pass through
    // * no longer panics on first chars needing more than 2 UTF-8 bytes
    //   (the old `[0; 2]` encode_utf8 buffer was too small)
    // * no longer corrupts words starting with a digit or punctuation
    //   (the old unconditional `byte - 32` arithmetic)
    string
        .to_lowercase()
        .split(' ')
        .map(|word| {
            let mut chars = word.chars();
            match chars.next() {
                // char::to_uppercase handles multi-char expansions correctly
                Some(first) => first.to_uppercase().chain(chars).collect::<String>(),
                None => String::new(),
            }
        })
        .collect::<Vec<String>>()
        .join(" ")
}
4072
pub fn remove_stop_words(string: String) -> String {
    /*
    Removes English stop words (NLTK's list plus Sentence-case variants)
    from a space-separated string. Matching is exact per whitespace token,
    so a word with attached punctuation (e.g. "is,") is NOT removed.

    "Rust is a multi-paradigm programming language focused on performance and safety, ..."
                                        |
                                        V
    "Rust multi-paradigm programming language focused performance safety, ..."
    */
    // stop word list source:
    // https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip (14/06/2020)
    // A HashSet makes each membership test O(1) instead of a linear scan
    // over ~370 entries per word.
    use std::collections::HashSet;
    let stop_words: HashSet<&str> = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've",
        "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
        "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been",
        "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
        "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during", "before", "after",
        "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
        "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
        "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m", "o",
        "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't",
        "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn",
        "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan",
        "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't",
        "wouldn", "wouldn't",
        // Sentence-case variants (the list is case-sensitive by design)
        "I", "Me", "My", "Myself", "We", "Our", "Ours", "Ourselves", "You", "You're", "You've",
        "You'll", "You'd", "Your", "Yours", "Yourself", "Yourselves", "He", "Him", "His",
        "Himself", "She", "She's", "Her", "Hers", "Herself", "It", "It's", "Its", "Itself",
        "They", "Them", "Their", "Theirs", "Themselves", "What", "Which", "Who", "Whom", "This",
        "That", "That'll", "These", "Those", "Am", "Is", "Are", "Was", "Were", "Be", "Been",
        "Being", "Have", "Has", "Had", "Having", "Do", "Does", "Did", "Doing", "A", "An", "The",
        "And", "But", "If", "Or", "Because", "As", "Until", "While", "Of", "At", "By", "For",
        "With", "About", "Against", "Between", "Into", "Through", "During", "Before", "After",
        "Above", "Below", "To", "From", "Up", "Down", "In", "Out", "On", "Off", "Over", "Under",
        "Again", "Further", "Then", "Once", "Here", "There", "When", "Where", "Why", "How",
        "All", "Any", "Both", "Each", "Few", "More", "Most", "Other", "Some", "Such", "No",
        "Nor", "Not", "Only", "Own", "Same", "So", "Than", "Too", "Very", "S", "T", "Can",
        "Will", "Just", "Don", "Don't", "Should", "Should've", "Now", "D", "Ll", "M", "O",
        "Re", "Ve", "Y", "Ain", "Aren", "Aren't", "Couldn", "Couldn't", "Didn", "Didn't",
        "Doesn", "Doesn't", "Hadn", "Hadn't", "Hasn", "Hasn't", "Haven", "Haven't", "Isn",
        "Isn't", "Ma", "Mightn", "Mightn't", "Mustn", "Mustn't", "Needn", "Needn't", "Shan",
        "Shan't", "Shouldn", "Shouldn't", "Wasn", "Wasn't", "Weren", "Weren't", "Won", "Won't",
        "Wouldn", "Wouldn't",
    ]
    .iter()
    .copied()
    .collect();

    string
        .split(' ')
        .filter(|word| !stop_words.contains(word))
        .collect::<Vec<&str>>()
        .join(" ")
}
4449
pub fn tokenize<'a>(string: String, symbol: &Vec<&'a str>) -> Vec<String> {
    /*
    Tokenizes by space and, additionally, by every symbol passed in the
    vector (pass an empty vector to split on spaces only).
    */
    // BUG FIX: the previous revision did `i.split(j).collect::<String>()`,
    // which *concatenated* the pieces back into a single String — i.e. it
    // deleted the symbol instead of splitting the token on it. We now emit
    // one token per piece, dropping the empty pieces that a leading,
    // trailing or doubled symbol would produce.
    let mut tokens: Vec<String> = string.split(' ').map(|a| a.to_string()).collect();
    for sym in symbol.iter() {
        let mut next_round: Vec<String> = vec![];
        for tok in tokens.iter() {
            if tok.contains(*sym) {
                next_round.extend(
                    tok.split(*sym)
                        .filter(|piece| !piece.is_empty())
                        .map(|piece| piece.to_string()),
                );
            } else {
                next_round.push(tok.clone());
            }
        }
        tokens = next_round;
    }
    tokens
}
4475
4476/*
4477DESCRIPTION
4478-----------------------------------------
4479STRUCTS
4480-------
4481
4482FUNCTIONS
4483---------
44841. autocorrelation : at a given lag
4485    > 1. ts : &Vec<f64>
4486    > 2. lag : usize
4487    = Result<f64, std::io::Error>
4488
44892. simple_ma
4490    > 1. ts : &Vec<f64>
4491    > 2. lag : usize
4492    = Vec<f64>
4493
44943. exp_ma
4495    > 1. ts : &Vec<f64>
4496    > 2. alpha : f64
4497    = Vec<f64>
4498
44994. best_fit_line : returns intercept and slope of the best fit line
4500    > x: &Vec<f64>
4501    > y: &Vec<f64>)
4502     = (f64, f64)
4503
4504*/
4505
pub fn acf(ts: &Vec<f64>, lag: usize) -> Result<f64, std::io::Error> {
    /*
    Sample autocorrelation of a time series at the given lag, used to check
    for randomness / dependence on past values:
        r(lag) = sum_{i=lag..N} (ts[i]-mean)(ts[i-lag]-mean)
                 / sum_{i=0..N} (ts[i]-mean)^2
    Returns Err when the denominator is 0 (constant series) or when
    lag >= ts.len() (the old code underflowed `ts.len() - lag` there).
    */
    // https://www.itl.nist.gov/div898/handbook/eda/section3/eda35c.htm
    // Fix vs. previous revision: the old loop ran i in 0..len-lag with
    // `i > lag`, skipping the first lag+1 products AND truncating the
    // denominator to the same subset; both now follow the NIST definition.
    if lag >= ts.len() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "Lag must be smaller than the series length!",
        ));
    }
    let mean = ts.iter().sum::<f64>() / ts.len() as f64;
    let mut numerator = 0.;
    let mut denominator = 0.;
    for i in 0..ts.len() {
        // denominator: full sum of squared deviations
        denominator += (ts[i] - mean) * (ts[i] - mean);
        // numerator: every available (i, i - lag) pair
        if i >= lag {
            numerator += (ts[i] - mean) * (ts[i - lag] - mean);
        }
    }
    match denominator {
        x if x != 0. => {
            if ((numerator / denominator).abs() > 0.5) && (lag != 0) {
                print!("At {:?} lag the series seems to be correlated\t", lag)
            }
            Ok(numerator / denominator)
        }
        _ => Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "Denominator is 0!",
        )),
    }
}
4534
4535pub fn simple_ma(ts: &Vec<f64>, lag: usize) -> Vec<f64> {
4536    let mut output = vec![];
4537    for i in 0..lag {
4538        if lag + i <= ts.len() {
4539            let sub_ts = ts[i..lag + i].to_vec();
4540            output.push(sub_ts.iter().fold(0., |a, b| a + b) / sub_ts.len() as f64);
4541        }
4542    }
4543    pad_with_zero(&mut output, lag, "pre")
4544}
4545
4546pub fn exp_ma(ts: &Vec<f64>, alpha: f64) -> Vec<f64> {
4547    // https://www.youtube.com/watch?v=k_HN0wOKDd0
4548    // assuming first forecast is == first actual value
4549    // new forecast  = aplha*actual old value+(1-alpha)*forecasted old value
4550    let mut output = vec![ts[0]];
4551    for (n, i) in ts[1..].to_vec().iter().enumerate() {
4552        output.push(alpha * i + (1. - alpha) * output[n]);
4553    }
4554    // removing the last value and adding 0 in front
4555    let exp_ma = pad_with_zero(&mut output[..ts.len() - 1].to_vec(), 1, "pre");
4556    let mse = mean(
4557        &ts[1..]
4558            .to_vec()
4559            .iter()
4560            .zip(output[..ts.len() - 1].to_vec().iter())
4561            .map(|(a, b)| (a - b) * (a - b))
4562            .collect(),
4563    );
4564    println!("Mean square error of this forecasting : {:?}", mse);
4565    exp_ma
4566}
4567
4568pub fn best_fit_line(x: &Vec<f64>, y: &Vec<f64>) -> (f64, f64) {
4569    // https://pythonprogramming.net/how-to-program-best-fit-line-machine-learning-tutorial/
4570    // intercept , slope
4571    let xy = x
4572        .iter()
4573        .zip(y.iter())
4574        .map(|a| a.0 * a.1)
4575        .collect::<Vec<f64>>();
4576    let xx = x
4577        .iter()
4578        .zip(x.iter())
4579        .map(|a| a.0 * a.1)
4580        .collect::<Vec<f64>>();
4581    let m = ((mean(x) * mean(y)) - mean(&xy)) / ((mean(x) * mean(x)) - mean(&xx));
4582
4583    let b = mean(y) - m * mean(x);
4584    (b, m)
4585}