1use math::round;
2use rand::*;
3
/// Configuration for a single dense layer: how many inputs feed it and
/// how many neurons it contains.
pub struct LayerDetails {
    /// Number of input features arriving at this layer.
    pub n_inputs: usize,
    /// Number of neurons in this layer (note the `i32`/`usize` mismatch
    /// with `n_inputs`; `create_bias` casts this with `as usize`).
    pub n_neurons: i32,
}
46impl LayerDetails {
47 pub fn create_weights(&self) -> Vec<Vec<f64>> {
48 let mut rng = rand::thread_rng();
52 let mut weight: Vec<Vec<f64>> = vec![];
53 for _ in 0..self.n_inputs {
55 weight.push(
56 (0..self.n_neurons)
57 .map(|_| round::ceil(rng.gen_range(-1., 1.), 3))
58 .collect(),
59 );
60 }
61 weight
62 }
63 pub fn create_bias(&self, value: f64) -> Vec<f64> {
64 let bias = vec![value; self.n_neurons as usize];
69 bias
70 }
71 pub fn output_of_layer(
72 &self,
73 input: &Vec<Vec<f64>>,
74 weights: &Vec<Vec<f64>>,
75 bias: &mut Vec<f64>,
76 f: &str,
77 alpha: f64,
78 ) -> Vec<Vec<f64>> {
79 let mut mat_mul = transpose(&matrix_multiplication(&input, &weights));
90 let mut output: Vec<Vec<f64>> = vec![];
92 for i in &mut mat_mul {
93 output.push(vector_addition(i, bias));
95 }
96 let mut activated_output = vec![];
99 match f {
100 "relu" => {
101 println!("Alpha is for 'leaky relu' only, it is not taken into account here");
102 for i in output.clone() {
103 activated_output.push(activation_relu(&i));
104 }
105 }
106 "leaky relu" => {
107 for i in output.clone() {
108 activated_output.push(activation_leaky_relu(&i, alpha));
109 }
110 }
111 "sigmoid" => {
112 println!("Alpha is for 'leaky relu' only, it is not taken into account here");
113 for i in output.clone() {
114 activated_output.push(activation_sigmoid(&i));
115 }
116 }
117 "tanh" => {
118 println!("Alpha is for 'leaky relu' only, it is not taken into account here");
119 for i in output.clone() {
120 activated_output.push(activation_tanh(&i));
121 }
122 }
123 _ => panic!("Select from either 'tanh','sigmoid','relu','leaky relu'"),
124 }
125 activated_output
127 }
128}
129
/// ReLU activation: returns `max(x, 0)` element-wise.
///
/// The additive identity is obtained by parsing `"0"` into `T`, matching
/// the conversion style used throughout this module.
pub fn activation_relu<T>(input: &Vec<T>) -> Vec<T>
where
    T: Copy + std::cmp::PartialOrd + std::ops::Sub<Output = T> + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let zero = "0".parse::<T>().unwrap();
    // BUG FIX: the negative branch previously computed `*x - *x`, which
    // produces NaN for -inf (inf - inf) and NaN inputs; returning `zero`
    // keeps ReLU's contract. The unused `Sub` bound is retained so the
    // signature stays backward-compatible.
    input
        .iter()
        .map(|x| if *x > zero { *x } else { zero })
        .collect()
}
146
/// Leaky ReLU activation: `x` for positive values, `alpha * x` otherwise.
///
/// Both constants are routed through string parsing so they take on
/// whatever numeric type `T` is; a fractional `alpha` with an integer `T`
/// will panic at the parse.
pub fn activation_leaky_relu<T>(input: &Vec<T>, alpha: f64) -> Vec<T>
where
    T: Copy + std::cmp::PartialOrd + std::ops::Mul<Output = T> + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let zero = "0".parse::<T>().unwrap();
    let slope = format!("{}", alpha).parse::<T>().unwrap();
    let mut scaled = Vec::with_capacity(input.len());
    for value in input.iter() {
        if *value > zero {
            scaled.push(*value);
        } else {
            scaled.push(slope * *value);
        }
    }
    scaled
}
165
/// Logistic sigmoid activation: `1 / (1 + e^(-x))` element-wise.
///
/// Values are converted to f64 through their `Debug` representation,
/// matching this module's other `T -> f64` conversions.
pub fn activation_sigmoid<T>(input: &Vec<T>) -> Vec<f64>
where
    T: std::str::FromStr + std::fmt::Debug,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    // BUG FIX: the exponent was previously +x, which computed sigmoid(-x)
    // (a decreasing curve) instead of the standard increasing sigmoid.
    input
        .iter()
        .map(|x| {
            let v = format!("{:?}", x).parse::<f64>().unwrap();
            1. / (1. + (-v).exp())
        })
        .collect()
}
181
/// Hyperbolic-tangent activation: `(e^x - e^-x) / (e^x + e^-x)` element-wise.
///
/// Values are converted to f64 through their `Debug` representation,
/// matching this module's other `T -> f64` conversions.
pub fn activation_tanh<T>(input: &Vec<T>) -> Vec<f64>
where
    T: std::str::FromStr + std::fmt::Debug,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    input
        .iter()
        .map(|x| {
            // Parse once and compute each exponential once — the old code
            // re-formatted and re-parsed the same element four times.
            let v = format!("{:?}", x).parse::<f64>().unwrap();
            let e_pos = v.exp();
            let e_neg = (-v).exp();
            (e_pos - e_neg) / (e_pos + e_neg)
        })
        .collect()
}
198
/// Ordinary-least-squares regression driver configured from a CSV file.
pub struct OLS {
    /// Path of the CSV file to read.
    pub file_path: String,
    /// 1-based index of the target column (fit reads `columns[target - 1]`).
    pub target: usize,
    /// Fraction of rows held out as the test set.
    pub test_size: f64,
}
447
impl OLS {
    /// Fits ordinary least squares on the CSV at `file_path` via the
    /// normal equation, prints the fitted coefficients, then reports
    /// RMSE/MSE/MAE/MAPE/R2 on the held-out test rows.
    pub fn fit(&self) {
        // Load the file and announce which header is the target.
        let (columns, values) = read_csv(self.file_path.clone()); println!(
            "The target here is header named: {:?}",
            columns[self.target - 1]
        );

        // Shuffle rows, then parse every non-empty cell to f64
        // (empty cells are silently dropped from their row).
        let random_data = randomize(&values)
            .iter()
            .map(|a| {
                a.iter()
                    .filter(|b| **b != "".to_string())
                    .map(|b| b.parse::<f64>().unwrap())
                    .collect::<Vec<f64>>()
            })
            .collect::<Vec<Vec<f64>>>();
        let (train_data, test_data) = train_test_split_f(&random_data, self.test_size);
        shape("Training data", &train_data);
        shape("Testing data", &test_data);

        // NOTE(review): duplicate of the shape() call above — looks like
        // an editing leftover.
        shape("Training data", &train_data);
        // Column-major view of the training rows.
        let actual_train = row_to_columns_conversion(&train_data);
        // Features without the target column, plus a leading row of 1s
        // for the intercept term.
        let x = drop_column(&actual_train, self.target);
        let b0_vec: Vec<Vec<f64>> = vec![vec![1.; x[0].len()]]; let X = [&b0_vec[..], &x[..]].concat(); let xt = MatrixF { matrix: X };

        let y = vec![actual_train[self.target - 1].to_vec()];
        // Normal equation: slopes = (X X^T)^-1 (X y).
        let xtx = MatrixF {
            matrix: matrix_multiplication(&xt.matrix, &transpose(&xt.matrix)),
        };
        let slopes = &matrix_multiplication(
            &MatrixF::inverse_f(&xtx), &transpose(&vec![matrix_vector_product_f(&xt.matrix, &y[0])]), )[0];

        // Pair each feature header with its slope (slopes[0] is the intercept).
        let output: Vec<_> = columns[..columns.len() - 1]
            .iter()
            .zip(slopes[1..].iter())
            .collect();
        // NOTE(review): prints test_size*100, but the model was fitted on
        // the (1 - test_size) training share — the message is misleading.
        println!(
            "\n\nThe coeficients of a columns as per simple linear regression on {:?}% of data is : \n{:?} and b0 is : {:?}",
            self.test_size * 100.,
            output,
            slopes[0]
        );

        // Predict each test row: intercept + dot(features, slopes).
        // NOTE(review): the test row still contains the target column
        // here and zip simply truncates — confirm the column layout
        // before trusting these predictions.
        let mut predicted_values = vec![];
        for i in test_data.iter() {
            predicted_values.push({
                let value = i
                    .iter()
                    .zip(slopes[1..].iter())
                    .map(|(a, b)| (a * b))
                    .collect::<Vec<f64>>();
                value.iter().fold(slopes[0], |a, b| a + b) });
        }

        println!("RMSE : {:?}", rmse(&test_data, &predicted_values));
        println!("MSE : {:?}", mse(&test_data, &predicted_values)); println!("MAE : {:?}", mae(&test_data, &predicted_values));
        println!("MAPE : {:?}", mape(&test_data, &predicted_values));
        println!(
            "R2 and adjusted R2 : {:?}",
            r_square(
                &test_data
                    .iter()
                    .map(|a| a[test_data[0].len() - 1])
                    .collect(), &predicted_values,
                columns.len(),
            )
        );

        println!();
        println!();
    }
}
569
/// Binary logistic regression trained with batch gradient descent.
pub struct BLR {
    /// Path of the CSV file to read.
    pub file_path: String,
    /// Fraction of rows held out as the test set.
    pub test_size: f64,
    /// Index of the target column passed to the splitter.
    pub target_column: usize,
    /// Step size for each gradient-descent update.
    pub learning_rate: f64,
    /// Number of gradient-descent iterations.
    pub iter_count: u32,
    /// Score cut-off used to map predictions to the classes 0/1.
    pub binary_threshold: f64,
}
impl BLR {
    /// Reads the CSV, trains logistic-regression coefficients with
    /// `iter_count` rounds of batch gradient descent, then prints a
    /// confusion matrix for the held-out test rows.
    pub fn fit(&self) {
        let (_, values) = read_csv(self.file_path.clone()); let random_data = float_randomize(&values);

        let (x_train, y_train, x_test, y_test) =
            preprocess_train_test_split(&random_data, self.test_size, self.target_column, "");

        shape("Training features", &x_train);
        shape("Test features", &x_test);
        println!("Training target: {:?}", &y_train.len());
        println!("Test target: {:?}", &y_test.len());

        // Features are column-major here: `length` is the sample count,
        // `feature_count` the number of feature columns.
        let length = x_train[0].len();
        let feature_count = x_train.len();
        // Row of 1s prepended so coefficients[0] acts as the intercept.
        let intercept = vec![vec![1.; length]];
        let new_x_train = [&intercept[..], &x_train[..]].concat();
        let mut coefficients = vec![0.; feature_count + 1];

        // `cost` records the loss per iteration but is only accumulated,
        // never printed or returned.
        let mut cost = vec![];
        print!("Reducing loss ...");
        for _ in 0..self.iter_count {
            let s = BLR::sigmoid(&new_x_train, &coefficients);
            cost.push(BLR::log_loss(&s, &y_train));
            let gd = BLR::gradient_descent(&new_x_train, &s, &y_train);
            coefficients = BLR::change_in_loss(&coefficients, self.learning_rate, &gd);
        }
        let predicted = BLR::predict(&x_test, &coefficients, self.binary_threshold);
        confuse_me(&predicted, &y_test, -1., 1.);
    }

    /// Adds an intercept row to the test features, scores them with
    /// `sigmoid` and thresholds: score > threshold -> 1., else 0.
    pub fn predict(test_features: &Vec<Vec<f64>>, weights: &Vec<f64>, threshold: f64) -> Vec<f64> {
        let length = test_features[0].len();
        let intercept = vec![vec![1.; length]];
        let new_x_test = [&intercept[..], &test_features[..]].concat();
        let pred = BLR::sigmoid(&new_x_test, weights);
        pred.iter()
            .map(|a| if *a > threshold { 1. } else { 0. })
            .collect()
    }

    /// One gradient-descent step: coeff + lr * gd, element-wise.
    /// Panics if the two vectors differ in length.
    pub fn change_in_loss(coeff: &Vec<f64>, lr: f64, gd: &Vec<f64>) -> Vec<f64> {
        print!(".");
        if coeff.len() == gd.len() {
            element_wise_operation(coeff, &gd.iter().map(|a| a * lr).collect(), "add")
        } else {
            panic!("The dimensions do not match")
        }
    }

    /// Batch gradient of the log loss: X . (sigmoid - y) / n.
    pub fn gradient_descent(
        train: &Vec<Vec<f64>>,
        sigmoid: &Vec<f64>,
        y_train: &Vec<f64>,
    ) -> Vec<f64> {
        let part2 = element_wise_operation(sigmoid, y_train, "sub");
        let numerator = matrix_vector_product_f(train, &part2);
        numerator
            .iter()
            .map(|a| *a / (y_train.len() as f64))
            .collect()
    }

    /// Mean binary cross-entropy: -mean(y*ln(s) + (1-y)*ln(1-s)),
    /// assembled from the four element-wise pieces below
    /// (ln is taken as log base e via `log(e^1)`).
    pub fn log_loss(sigmoid: &Vec<f64>, y_train: &Vec<f64>) -> f64 {
        let part11 = sigmoid.iter().map(|a| a.log(1.0_f64.exp())).collect();
        let part12 = y_train.iter().map(|a| a * -1.).collect();
        let part21 = sigmoid
            .iter()
            .map(|a| (1. - a).log(1.0_f64.exp()))
            .collect();
        let part22 = y_train.iter().map(|a| 1. - a).collect();
        let part1 = element_wise_operation(&part11, &part12, "mul");
        let part2 = element_wise_operation(&part21, &part22, "mul");
        mean(&element_wise_operation(&part1, &part2, "sub"))
    }

    /// Scores every sample as 1/(1+e^z) with z = X^T . coeff.
    ///
    /// NOTE(review): this is sigma(-z), not the standard logistic
    /// 1/(1+e^-z). Training and predict() both use this flipped form, so
    /// the classifier is internally consistent, but the learned
    /// coefficients are the negation of conventional logistic-regression
    /// weights — confirm before comparing against external tools.
    pub fn sigmoid(train: &Vec<Vec<f64>>, coeff: &Vec<f64>) -> Vec<f64> {
        let z = matrix_vector_product_f(&transpose(train), coeff);
        z.iter().map(|a| 1. / (1. + a.exp())).collect()
    }
}
681
/// k-nearest-neighbour classifier configured from a CSV file.
pub struct KNN<'a> {
    /// Path of the CSV file to read.
    pub file_path: String,
    /// Fraction of rows held out as the test set.
    pub test_size: f64,
    /// Index of the target column passed to the splitter.
    pub target_column: usize,
    /// Number of neighbours consulted per prediction.
    pub k: usize,
    /// Distance metric code: "e" (euclidean), "ma" (manhattan),
    /// "co" (cosine) or "ch" (chebyshev).
    pub method: &'a str,
}
689impl<'a> KNN<'a> {
690 pub fn fit(&self) {
691 let (_, values) = read_csv(self.file_path.clone()); let random_data = float_randomize(&values); let (x_train, y_train, x_test, y_test) =
702 preprocess_train_test_split(&random_data, self.test_size, self.target_column, ""); let train_rows = columns_to_rows_conversion(&x_train);
707 let test_rows = columns_to_rows_conversion(&x_test);
708 shape("train Rows:", &train_rows);
709 shape("test Rows:", &test_rows);
710 let predcited = KNN::predict(&train_rows, &y_train, &test_rows, self.method, self.k);
714 println!("Metrics");
715 confuse_me(
716 &predcited.iter().map(|a| *a as f64).collect::<Vec<f64>>(),
717 &y_test,
718 -1.,
719 1.,
720 ); }
722
723 fn predict(
724 train_rows: &Vec<Vec<f64>>,
725 train_values: &Vec<f64>,
726 test_rows: &Vec<Vec<f64>>,
727 method: &str,
728 k: usize,
729 ) -> Vec<i32> {
730 match method {
731 "e" => println!("\n\nCalculating KNN using euclidean distance ..."),
732 "ma" => println!("\n\nCalculating KNN using manhattan distance ..."),
733 "co" => println!("\n\nCalculating KNN using cosine distance ..."),
734 "ch" => println!("\n\nCalculating KNN using chebyshev distance ..."),
735 _ => panic!("The method has to be either 'e' or 'ma' or 'co' or 'ch'"),
736 };
737 let mut predcited = vec![];
738 for j in test_rows.iter() {
739 let mut class_found = vec![];
740 for (n, i) in train_rows.iter().enumerate() {
741 let dis = Distance {
743 row1: i.clone(),
744 row2: j.clone(),
745 };
746 match method {
747 "e" => class_found.push((dis.distance_euclidean(), train_values[n])),
748 "ma" => class_found.push((dis.distance_manhattan(), train_values[n])),
749 "co" => class_found.push((dis.distance_cosine(), train_values[n])),
750 "ch" => class_found.push((dis.distance_chebyshev(), train_values[n])),
751 _ => (), };
753 }
754 class_found.sort_by(|(a, _), (c, _)| (*a).partial_cmp(c).unwrap());
756 let k_nearest = class_found[..k].to_vec();
757 let knn: Vec<f64> = k_nearest.iter().map(|a| a.1).collect();
758 let nearness = value_counts(&knn.iter().map(|a| *a as i32).collect());
760 predcited.push(*nearness.iter().next_back().unwrap().0)
762 }
763 predcited
764 }
765}
766
/// A pair of equal-length rows between which pairwise distances are
/// computed by the methods on `impl Distance`.
pub struct Distance {
    /// First row of values.
    pub row1: Vec<f64>,
    /// Second row of values.
    pub row2: Vec<f64>,
}
771impl Distance {
772 pub fn distance_euclidean(&self) -> f64 {
773 let distance = self
776 .row1
777 .iter()
778 .zip(self.row2.iter())
779 .map(|(a, b)| (*a - *b) * (*a - *b))
780 .collect::<Vec<f64>>();
781 distance.iter().fold(0., |a, b| a + b).sqrt()
782 }
783
784 pub fn distance_manhattan(&self) -> f64 {
785 let distance = self
788 .row1
789 .iter()
790 .zip(self.row2.iter())
791 .map(|(a, b)| (*a - *b).abs())
792 .collect::<Vec<f64>>();
793 distance.iter().fold(0., |a, b| a + b)
794 }
795
796 pub fn distance_cosine(&self) -> f64 {
797 let numerator = self
800 .row1
801 .iter()
802 .zip(self.row2.iter())
803 .map(|(a, b)| (*a * *b))
804 .collect::<Vec<f64>>()
805 .iter()
806 .fold(0., |a, b| a + b);
807 let denominator = (self
808 .row1
809 .iter()
810 .map(|a| a * a)
811 .collect::<Vec<f64>>()
812 .iter()
813 .fold(0., |a, b| a + b)
814 .sqrt())
815 * (self
816 .row2
817 .iter()
818 .map(|a| a * a)
819 .collect::<Vec<f64>>()
820 .iter()
821 .fold(0., |a, b| a + b)
822 .sqrt());
823 1. - numerator / denominator
824 }
825
826 pub fn distance_chebyshev(&self) -> f64 {
827 let distance = self
829 .row1
830 .iter()
831 .zip(self.row2.iter())
832 .map(|(a, b)| (*a - *b).abs())
833 .collect::<Vec<f64>>();
834 distance.iter().cloned().fold(0. / 0., f64::max)
835 }
836}
837
/// K-means clustering driver configured from a CSV file.
pub struct Kmeans {
    /// Path of the CSV file to read.
    pub file_path: String,
    /// Number of clusters.
    pub k: usize,
    /// Upper bound on refinement iterations.
    pub iterations: u32,
}
843impl Kmeans {
844 pub fn fit(&self) {
845 let (_, values) = read_csv(self.file_path.clone()); let random_data: Vec<_> = float_randomize(&values);
872
873 let mut centroids = randomize(&random_data)[..self.k].to_vec();
875 print_a_matrix("Original means", ¢roids);
876
877 let mut new_mean: Vec<Vec<f64>> = vec![];
878 for x in 0..self.iterations - 1 {
879 let mut updated_cluster = vec![];
880 let mut nearest_centroid_number = vec![];
881 for i in random_data.iter() {
882 let mut distance = vec![];
883 for (centroid_number, j) in centroids.iter().enumerate() {
884 let dis = Distance {
885 row1: i.clone(),
886 row2: j.clone(),
887 };
888 distance.push((centroid_number, dis.distance_euclidean()))
889 }
890 distance.sort_by(|m, n| m.1.partial_cmp(&n.1).unwrap());
891 nearest_centroid_number.push(distance[0].0);
892 }
893
894 let clusters: Vec<(&usize, &Vec<f64>)> = nearest_centroid_number
896 .iter()
897 .zip(random_data.iter())
898 .collect();
899 new_mean = vec![];
903 for (m, _) in centroids.iter().enumerate() {
904 let mut group = vec![];
905 for i in clusters.iter() {
906 if *i.0 == m {
907 group.push(i.1.clone());
908 }
909 }
910 new_mean.push(
911 group
912 .iter()
913 .fold(vec![0.; self.k], |a, b| {
914 element_wise_operation(&a, b, "add")
915 })
916 .iter()
917 .map(|a| a / (group.len() as f64)) .collect(),
919 );
920 updated_cluster = clusters.clone()
921 }
922 println!("Iteration {:?}", x);
923 if centroids == new_mean {
924 let mut rearranged_output = vec![];
926 for i in values
927 .iter()
928 .map(|a| a.iter().map(|b| b.parse().unwrap()).collect())
929 .collect::<Vec<Vec<f64>>>()
930 .iter()
931 {
932 for (c, v) in updated_cluster.iter() {
933 if i == *v {
934 rearranged_output.push((c, v));
935 break;
936 }
937 }
938 }
939 println!(
941 "CLUSTERS\n{:?}",
942 rearranged_output
943 .iter()
944 .map(|a| **(a.0))
945 .collect::<Vec<usize>>()
946 );
947 break;
948 } else {
949 centroids = new_mean.clone();
950 }
951 }
952 print_a_matrix("Final means", ¢roids);
953 }
954}
955
/// Soft-margin linear SVM trained with stochastic gradient descent.
pub struct SSVM {
    /// Path of the CSV file to read.
    pub file_path: String,
    /// Column numbers to drop before training (may be empty).
    pub drop_column_number: Vec<usize>,
    /// Fraction of rows held out as the test set.
    pub test_size: f64,
    /// Step size for each SGD update.
    pub learning_rate: f64,
    /// Maximum number of SGD epochs.
    pub iter_count: i32,
    /// Regularization constant weighting the hinge loss.
    pub reg_strength: f64,
}
impl SSVM {
    /// Full pipeline: load the CSV, drop the configured columns, min-max
    /// normalize the features, split, train by SGD, print a confusion
    /// matrix and return the learned weights (intercept first).
    pub fn fit(&self) -> Vec<f64> {
        let (columns, values) = read_csv(self.file_path.clone()); let mut random_data = SSVM::float_randomize(&values);

        println!(
            "The columns are\n{:?}\n",
            columns
                .iter()
                .filter(|a| **a != "\r".to_string())
                .map(|a| a.replace("\"", ""))
                .collect::<Vec<String>>()
        );
        shape("Before dropping columns the dimensions are", &random_data);

        // Work column-major while dropping; the `- n` offset compensates
        // for columns already removed shifting later indices left.
        random_data = row_to_columns_conversion(&random_data);
        if self.drop_column_number.len() > 0 {
            for (n, i) in self.drop_column_number.iter().enumerate() {
                if n == 0 {
                    println!("Dropping column #{}", i);
                    random_data = drop_column(&random_data, *i);
                } else {
                    println!("Dropping column #{}", i);
                    random_data = drop_column(&random_data, *i - n);
                }
            }
        }
        random_data = columns_to_rows_conversion(&random_data);

        shape("After dropping columns the dimensions are", &random_data);
        println!();

        head(&random_data, 5);

        // Min-max scale every column except the one assumed to be the
        // target. NOTE(review): the target index is inferred as
        // `len - 1 - dropped_count` — confirm this holds for every input.
        let mut normalized = row_to_columns_conversion(&random_data);

        random_data = row_to_columns_conversion(&random_data);
        for (n, i) in random_data.iter().enumerate() {
            print!(".");
            if n != normalized.len() - 1 - self.drop_column_number.len() {
                normalized[n] = min_max_scaler(i);
            } else {
                normalized[n] = i.clone();
            }
        }
        println!("\nAfter normalizing:");

        normalized = columns_to_rows_conversion(&normalized);
        head(&normalized, 5);
        println!();

        let (mut x_train, y_train, mut x_test, y_test) =
            preprocess_train_test_split(&normalized, self.test_size, normalized[0].len(), "");

        // Prepend a row of 1s to train and test so weights[0] is the
        // intercept, then flip back to row-major for SGD.
        let mut length = x_train[0].len();
        let intercept = vec![vec![1.; length]];
        x_train = [&intercept[..], &x_train[..]].concat();
        length = x_test[0].len();
        let intercept = vec![vec![1.; length]];
        x_test = [&intercept[..], &x_test[..]].concat();

        x_train = columns_to_rows_conversion(&x_train);
        x_test = columns_to_rows_conversion(&x_test);

        shape("Training features", &x_train);
        shape("Test features", &x_test);
        println!("Training target: {:?}", &y_train.len());
        println!("Test target: {:?}", &y_test.len());

        let weights = SSVM::sgd(&self, &x_train, &y_train);
        let predictions = SSVM::predict(&self, &x_test, &weights);
        confuse_me(&predictions, &y_test, -1., 1.);
        println!("Weights of intercept followed by features : {:?}", weights);
        weights
    }
    /// Stochastic gradient descent over shuffled samples. The cost is
    /// checked only at epochs that are powers of two (and the last one);
    /// training stops early once the relative cost change drops below 1%.
    fn sgd(&self, features: &Vec<Vec<f64>>, output: &Vec<f64>) -> Vec<f64> {
        let max_epoch: i32 = self.iter_count;
        let mut weights = vec![0.; features[0].len()];
        // `nth` tracks which power of two the next cost check happens at.
        let mut nth = 0.;
        let mut prev_cost = std::f64::INFINITY;
        let per_cost_threshold = 0.01;
        for epoch in 1..max_epoch {
            if epoch % 100 == 0 {
                print!("..");
            }
            // Visit the samples in a fresh random order each epoch.
            let order = randomize_vector(&(0..output.len()).map(|a| a).collect());
            let mut x = vec![];
            let mut y = vec![];
            for i in order.iter() {
                x.push(features[*i].clone());
                y.push(output[*i]);
            }

            // Per-sample update: w <- w - lr * gradient.
            for (n, i) in x.iter().enumerate() {
                let ascent = SSVM::calculate_cost_gradient(&self, &weights, i, y[n]);
                weights = element_wise_operation(
                    &weights,
                    &ascent.iter().map(|a| a * self.learning_rate).collect(),
                    "sub",
                );
            }
            if epoch == 2f64.powf(nth) as i32 || epoch == max_epoch - 1 {
                let cost = SSVM::compute_cost(&self, &weights, features, output);
                println!("{} Epoch, has cost {}", epoch, cost);
                // Early exit on convergence of the cost.
                if (prev_cost - cost).abs() < (per_cost_threshold * prev_cost) {
                    println!("{:?}", weights);
                    return weights;
                }
                prev_cost = cost;
                nth += 1.;
            }
        }
        weights
    }

    /// Objective value: ||w||^2 / 2 + C * mean(hinge loss), where the
    /// hinge loss per sample is max(0, 1 - y * (x . w)).
    fn compute_cost(&self, weight: &Vec<f64>, x: &Vec<Vec<f64>>, y: &Vec<f64>) -> f64 {
        let mut distance = element_wise_operation(&matrix_vector_product_f(x, weight), &y, "mul");
        distance = distance.iter().map(|a| 1. - *a).collect();
        distance = distance
            .iter()
            .map(|a| if *a > 0. { *a } else { 0. })
            .collect();
        let hinge_loss =
            self.reg_strength * (distance.iter().fold(0., |a, b| a + b) / (x.len() as f64));
        (dot_product(&weight, &weight) / 2.) + hinge_loss
    }

    /// Sub-gradient for one sample: zero when the margin is satisfied
    /// (distance < 0), otherwise w - C * y * x.
    fn calculate_cost_gradient(
        &self,
        weight: &Vec<f64>,
        x_batch: &Vec<f64>,
        y_batch: f64,
    ) -> Vec<f64> {
        let distance = 1. - (dot_product(&x_batch, &weight) * y_batch);
        let mut dw = vec![0.; weight.len()];
        let di;
        if distance < 0. {
            di = dw.clone();
        } else {
            let second_half = x_batch
                .iter()
                .map(|a| a * self.reg_strength * y_batch)
                .collect();
            di = element_wise_operation(weight, &second_half, "sub");
        }
        // NOTE(review): adding `di` to the all-zero `dw` is a no-op —
        // this just copies `di` into `dw`.
        dw = element_wise_operation(&di, &dw, "add");
        dw
    }

    /// Signs each test row by the decision function: x . w > 0 -> 1.,
    /// otherwise -1. Also prints the predictions.
    fn predict(&self, test_features: &Vec<Vec<f64>>, weights: &Vec<f64>) -> Vec<f64> {
        let mut output = vec![];
        for i in test_features.iter() {
            if dot_product(i, weights) > 0. {
                output.push(1.);
            } else {
                output.push(-1.);
            }
        }
        println!("Predications : {:?}", output);
        output
    }

    /// Parses every cell to f64 (stripping stray CR/LF first) and returns
    /// the rows in shuffled order.
    fn float_randomize(matrix: &Vec<Vec<String>>) -> Vec<Vec<f64>> {
        randomize(
            &matrix
                .iter()
                .map(|a| {
                    a.iter()
                        .map(|b| {
                            (b).replace("\r", "")
                                .replace("\n", "")
                                .parse::<f64>()
                                .unwrap()
                        })
                        .collect::<Vec<f64>>()
                })
                .collect::<Vec<Vec<f64>>>(),
        )
    }
}
1174
/// Arithmetic mean of `list`, returned as f64.
///
/// The sum and division happen entirely in `T`, so an integer `T`
/// truncates before the final string round-trip conversion to f64.
pub fn mean<T>(list: &Vec<T>) -> f64
where
    T: std::iter::Sum<T>
        + std::ops::Div<Output = T>
        + Copy
        + std::str::FromStr
        + std::string::ToString
        + std::ops::Add<T, Output = T>
        + std::fmt::Debug
        + std::fmt::Display
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let zero: T = "0".parse().unwrap();
    let length: T = list.len().to_string().parse().unwrap();
    let total = list.iter().fold(zero, |acc, x| acc + *x);
    (total / length).to_string().parse().unwrap()
}
1197
1198pub fn variance<T>(list: &Vec<T>) -> f64
1199where
1200 T: std::iter::Sum<T>
1201 + std::ops::Div<Output = T>
1202 + std::marker::Copy
1203 + std::fmt::Display
1204 + std::ops::Sub<T, Output = T>
1205 + std::ops::Add<T, Output = T>
1206 + std::ops::Mul<T, Output = T>
1207 + std::fmt::Debug
1208 + std::string::ToString
1209 + std::str::FromStr,
1210 <T as std::str::FromStr>::Err: std::fmt::Debug,
1211{
1212 let zero: T = "0".parse().unwrap();
1214 let mu = mean(list);
1215 let _len_str: T = list.len().to_string().parse().unwrap(); let output: Vec<_> = list
1217 .iter()
1218 .map(|x| (*x - mu.to_string().parse().unwrap()) * (*x - mu.to_string().parse().unwrap()))
1219 .collect();
1220 let variance = output.iter().fold(zero, |a, b| a + *b); variance.to_string().parse().unwrap()
1223}
1224
1225pub fn covariance<T>(list1: &Vec<T>, list2: &Vec<T>) -> f64
1226where
1227 T: std::iter::Sum<T>
1228 + std::ops::Div<Output = T>
1229 + std::fmt::Debug
1230 + std::fmt::Display
1231 + std::ops::Add
1232 + std::marker::Copy
1233 + std::ops::Add<T, Output = T>
1234 + std::ops::Sub<T, Output = T>
1235 + std::ops::Mul<T, Output = T>
1236 + std::string::ToString
1237 + std::str::FromStr,
1238 <T as std::str::FromStr>::Err: std::fmt::Debug,
1239{
1240 let mu1 = mean(list1);
1242 let mu2 = mean(list2);
1243 let zero: T = "0".parse().unwrap();
1244 let _len_str: f64 = list1.len().to_string().parse().unwrap(); let tupled: Vec<_> = list1.iter().zip(list2).collect();
1246 let output = tupled.iter().fold(zero, |a, b| {
1247 a + ((*b.0 - mu1.to_string().parse().unwrap()) * (*b.1 - mu2.to_string().parse().unwrap()))
1248 });
1249 let numerator: f64 = output.to_string().parse().unwrap();
1250 numerator }
1252
1253pub fn coefficient<T>(list1: &Vec<T>, list2: &Vec<T>) -> (f64, f64)
1254where
1255 T: std::iter::Sum<T>
1256 + std::ops::Div<Output = T>
1257 + std::fmt::Debug
1258 + std::fmt::Display
1259 + std::ops::Add
1260 + std::marker::Copy
1261 + std::ops::Add<T, Output = T>
1262 + std::ops::Sub<T, Output = T>
1263 + std::ops::Mul<T, Output = T>
1264 + std::str::FromStr,
1265 <T as std::str::FromStr>::Err: std::fmt::Debug,
1266{
1267 let b1 = covariance(list1, list2) / variance(list1);
1272 let b0 = mean(list2) - (b1 * mean(list1));
1273 (b0.to_string().parse().unwrap(), b1)
1274}
1275
1276pub fn simple_linear_regression_prediction<T>(train: &Vec<(T, T)>, test: &Vec<(T, T)>) -> Vec<T>
1277where
1278 T: std::iter::Sum<T>
1279 + std::ops::Div<Output = T>
1280 + std::fmt::Debug
1281 + std::fmt::Display
1282 + std::ops::Add
1283 + std::marker::Copy
1284 + std::ops::Add<T, Output = T>
1285 + std::ops::Sub<T, Output = T>
1286 + std::ops::Mul<T, Output = T>
1287 + std::str::FromStr,
1288 <T as std::str::FromStr>::Err: std::fmt::Debug,
1289{
1290 let train_features = &train.iter().map(|a| a.0).collect();
1292 let test_features = &test.iter().map(|a| a.1).collect();
1293 let (offset, slope) = coefficient(train_features, test_features);
1294 let b0: T = offset.to_string().parse().unwrap();
1295 let b1: T = slope.to_string().parse().unwrap();
1296 let predicted_output = test.iter().map(|a| b0 + b1 * a.0).collect();
1297 let original_output: Vec<_> = test.iter().map(|a| a.0).collect();
1298 println!("========================================================================================================================================================");
1299 println!("b0 = {:?} and b1= {:?}", b0, b1);
1300 println!(
1301 "RMSE: {:?}",
1302 root_mean_square(&predicted_output, &original_output)
1303 );
1304 predicted_output
1305}
1306
/// Root-mean-square error between two positionally paired lists:
/// sqrt(mean((list2[i] - list1[i])^2)).
///
/// `zip` ignores the tail of the longer list; the mean is accumulated in
/// `T` and converted to f64 through its string form before the root.
pub fn root_mean_square<T>(list1: &Vec<T>, list2: &Vec<T>) -> f64
where
    T: std::ops::Sub<T, Output = T>
        + Copy
        + std::ops::Mul<T, Output = T>
        + std::ops::Add<T, Output = T>
        + std::ops::Div<Output = T>
        + std::string::ToString
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let zero: T = "0".parse().unwrap();
    let length: T = list1.len().to_string().parse().unwrap();
    let mut sum_sq = zero;
    for (a, b) in list1.iter().zip(list2) {
        let diff = *b - *a;
        sum_sq = sum_sq + diff * diff;
    }
    let mse: f64 = (sum_sq / length).to_string().parse().unwrap();
    mse.powf(0.5)
}
1329
1330use std::collections::HashMap;
1332use std::fs;
/// Loads a CSV file as (header row, data rows); every cell stays a String.
///
/// Panics if the file cannot be read. Rows are split on '\n' and cells on
/// ',' with no quoting support, so a trailing newline yields a final row
/// containing one empty cell.
pub fn read_csv<'a>(path: String) -> (Vec<String>, Vec<Vec<String>>) {
    println!("Reading the file ...");
    let contents = fs::read_to_string(&path).unwrap();
    let lines: Vec<&str> = contents.split("\n").collect();
    println!("Number of rows = {}", (lines.len() - 1) as i32);
    let table: Vec<Vec<_>> = lines.iter().map(|l| l.split(",").collect()).collect();
    let columns: Vec<String> = table[0].iter().map(|c| c.to_string()).collect();
    let values = table[1..]
        .iter()
        .map(|row| row.iter().map(|c| c.to_string()).collect())
        .collect();
    (columns, values)
}
1351
1352use std::io::Error;
1353pub fn convert_and_impute<U>(
1354 list: &Vec<String>,
1355 to: U,
1356 impute_with: U,
1357) -> (Result<Vec<U>, Error>, Vec<usize>)
1358where
1359 U: std::cmp::PartialEq + Copy + std::marker::Copy + std::string::ToString + std::str::FromStr,
1360 <U as std::str::FromStr>::Err: std::fmt::Debug,
1361{
1362 println!("========================================================================================================================================================");
1366 let mut output: Vec<_> = vec![];
1368 let mut missing = vec![];
1369 match type_of(to) {
1370 "f64" => {
1371 for (n, i) in list.iter().enumerate() {
1372 if *i != "" {
1373 let x = i.parse::<U>().unwrap();
1374 output.push(x);
1375 } else {
1376 output.push(impute_with);
1377 missing.push(n);
1378 println!("Error found in {}th position of the vector", n);
1379 }
1380 }
1381 }
1382 "i32" => {
1383 for (n, i) in list.iter().enumerate() {
1384 if *i != "" {
1385 let string_splitted: Vec<_> = i.split(".").collect();
1386 let ones_digit = string_splitted[0].parse::<U>().unwrap();
1387 output.push(ones_digit);
1388 } else {
1389 output.push(impute_with);
1390 missing.push(n);
1391 println!("Error found in {}th position of the vector", n);
1392 }
1393 }
1394 }
1395 _ => println!("This type conversion cant be done, choose either int or float type\n Incase of string conversion, use impute_string"),
1396 }
1397
1398 (Ok(output), missing)
1399}
1400
/// Replaces empty strings with `impute_with` (reporting their positions)
/// and borrows every other entry as a &str slice. The input vector is
/// not modified despite the &mut receiver.
pub fn impute_string<'a>(list: &'a mut Vec<String>, impute_with: &'a str) -> Vec<&'a str> {
    let mut patched = Vec::with_capacity(list.len());
    for (pos, value) in list.iter().enumerate() {
        if value.is_empty() {
            println!("Missing value found in {}th position of the vector", pos);
            patched.push(impute_with);
        } else {
            patched.push(&value[..]);
        }
    }
    patched
}
1418
1419pub fn convert_string_categorical<T>(list: &Vec<T>, extra_class: bool) -> Vec<f64>
1421where
1422 T: std::cmp::PartialEq + std::cmp::Eq + std::hash::Hash + Copy,
1423{
1424 println!("========================================================================================================================================================");
1425 let values = unique_values(&list);
1426 if extra_class == true && values.len() > 10 {
1427 println!("The number of classes will be more than 10");
1428 } else {
1429 ();
1430 }
1431 let mut map: HashMap<&T, f64> = HashMap::new();
1432 for (n, i) in values.iter().enumerate() {
1433 map.insert(i, n as f64 + 1.);
1434 }
1435 list.iter().map(|a| map[a]).collect()
1436}
1437
1438pub fn min_max_scaler(list: &Vec<f64>) -> Vec<f64> {
1439 let (minimum, maximum) = min_max_f(&list);
1441 let range: f64 = maximum - minimum;
1442 list.iter().map(|a| 1. - ((maximum - a) / range)).collect()
1443}
1444
1445pub fn logistic_function_f(matrix: &Vec<Vec<f64>>, beta: &Vec<Vec<f64>>) -> Vec<Vec<f64>> {
1446 println!("========================================================================================================================================================");
1447 println!("logistic function");
1449 println!(
1450 "{:?}x{:?}\n{:?}x{:?}",
1451 matrix.len(),
1452 matrix[0].len(),
1453 beta.len(),
1454 beta[0].len()
1455 );
1456 matrix_multiplication(matrix, beta)
1457 .iter()
1458 .map(|a| a.iter().map(|b| 1. / (1. + ((b * -1.).exp()))).collect())
1459 .collect()
1460}
1461
1462pub fn log_gradient_f(
1463 matrix1: &Vec<Vec<f64>>,
1464 beta: &Vec<Vec<f64>>,
1465 matrix2: &Vec<f64>,
1466) -> Vec<Vec<f64>> {
1467 println!("========================================================================================================================================================");
1468 println!("Log gradient_f");
1470 let mut first_calc = vec![];
1472 for (n, i) in logistic_function_f(matrix1, beta).iter().enumerate() {
1473 let mut row = vec![];
1474 for j in i.iter() {
1475 row.push(j - matrix2[n]);
1476 }
1477 first_calc.push(row);
1478 }
1479
1480 let first_calc_t = transpose(&first_calc);
1481 let mut x = vec![];
1482 for j in 0..matrix1[0].len() {
1483 let mut row = vec![];
1484 for i in matrix1.iter() {
1485 row.push(i[j]);
1486 }
1487 x.push(row);
1488 }
1489
1490 let mut final_calc = vec![];
1492 for i in first_calc_t.iter() {
1493 for j in x.iter() {
1494 final_calc.push(dot_product(&i, &j))
1495 }
1496 }
1497
1498 shape_changer(&final_calc, matrix1[0].len(), matrix1.len())
1505}
1506
1507pub fn logistic_predict(matrix1: &Vec<Vec<f64>>, beta: &Vec<Vec<f64>>) -> Vec<Vec<f64>> {
1508 let prediction_probability = logistic_function_f(matrix1, beta);
1510 let output = prediction_probability
1511 .iter()
1512 .map(|a| a.iter().map(|b| if *b >= 0.5 { 1. } else { 0. }).collect())
1513 .collect();
1514 output
1515}
1516
1517pub fn randomize_vector<T: std::clone::Clone>(rows: &Vec<T>) -> Vec<T> {
1518 use rand::seq::SliceRandom;
1522 let mut order: Vec<usize> = (0..rows.len() as usize).collect();
1524 let slice: &mut [usize] = &mut order;
1525 let mut rng = thread_rng();
1526 slice.shuffle(&mut rng);
1527 let mut output = vec![];
1530 for i in order.iter() {
1531 output.push(rows[*i].clone());
1532 }
1533 output
1534}
1535
1536pub fn randomize<T: std::clone::Clone>(rows: &Vec<Vec<T>>) -> Vec<Vec<T>> {
1537 use rand::seq::SliceRandom;
1541 let mut order: Vec<usize> = (0..rows.len() as usize).collect();
1543 let slice: &mut [usize] = &mut order;
1544 let mut rng = thread_rng();
1545 slice.shuffle(&mut rng);
1546 let mut output = vec![];
1549 for i in order.iter() {
1550 output.push(rows[*i].clone());
1551 }
1552 output
1553}
1554
1555pub fn train_test_split_vector_f(input: &Vec<f64>, percentage: f64) -> (Vec<f64>, Vec<f64>) {
1556 let data = randomize_vector(input);
1561 let test_count = (data.len() as f64 * percentage) as usize;
1564 let test = data[0..test_count].to_vec();
1567 let train = data[test_count..].to_vec();
1568 (train, test)
1569}
1570
1571pub fn train_test_split_f(
1572 input: &Vec<Vec<f64>>,
1573 percentage: f64,
1574) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
1575 let data = randomize(input);
1580 let test_count = (data.len() as f64 * percentage) as usize;
1583 let test = data[0..test_count].to_vec();
1586 let train = data[test_count..].to_vec();
1587 (train, test)
1588}
1589
/// Correlation coefficient between two equally-long lists.
///
/// `name` selects the method: `"p"` = Pearson, `"s"` = Spearman (rank based).
/// Prints a qualitative interpretation of the coefficient before returning it.
/// Panics on any other `name`.
pub fn correlation<T>(list1: &Vec<T>, list2: &Vec<T>, name: &str) -> f64
where
    T: std::iter::Sum<T>
        + std::ops::Div<Output = T>
        + std::fmt::Debug
        + std::fmt::Display
        + std::ops::Add
        + std::cmp::PartialOrd
        + std::marker::Copy
        + std::ops::Add<T, Output = T>
        + std::ops::Sub<T, Output = T>
        + std::ops::Mul<T, Output = T>
        + std::string::ToString
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let cov = covariance(list1, list2);
    let output = match name {
        // Pearson: cov / (sd1 * sd2), then divided by n.
        // NOTE(review): the extra division by len() assumes `covariance`
        // returns a SUM of products rather than their mean — confirm against
        // `covariance`'s definition.
        "p" => (cov / (std_dev(list1) * std_dev(list2))) / list1.len() as f64,
        "s" => {
            // Spearman: 1 - 6*sum(d^2) / (n(n^2-1)), d = rank difference.
            let ranked_list1 = spearman_rank(list1);
            let ranked_list2 = spearman_rank(list2);
            let len = list1.len() as f64;
            // Map each original value back to its rank (O(n^2) linear scan;
            // duplicates in list1 pick up one rank entry per occurrence).
            let mut rl1 = vec![];
            for k in list1.iter() {
                for (i, j) in ranked_list1.iter() {
                    if k == i {
                        rl1.push(j);
                    }
                }
            }
            let mut rl2 = vec![];
            for k in list2.iter() {
                for (i, j) in ranked_list2.iter() {
                    if k == i {
                        rl2.push(j);
                    }
                }
            }

            // Sum of squared rank differences.
            let combined: Vec<_> = rl1.iter().zip(rl2.iter()).collect();
            let sum_of_square_of_difference = combined
                .iter()
                .map(|(a, b)| (***a - ***b) * (***a - ***b))
                .fold(0., |a, b| a + b);
            1. - ((6. * sum_of_square_of_difference) / (len * ((len * len) - 1.)))
        }
        _ => panic!("Either `p`: Pearson or `s`:Spearman has to be the name. Please retry!"),
    };
    // Qualitative read-out; magnitudes in [0.2, 0.6] print nothing.
    match output {
        x if x < 0.2 && x > -0.2 => println!("There is a weak correlation between the two :"),
        x if x > 0.6 => println!("There is a strong positive correlation between the two :"),
        x if x < -0.6 => println!("There is a strong negative correlation between the two :"),
        _ => (),
    }
    output
}
1656
1657pub fn std_dev<T>(list1: &Vec<T>) -> f64
1658where
1659 T: std::iter::Sum<T>
1660 + std::ops::Div<Output = T>
1661 + std::fmt::Debug
1662 + std::fmt::Display
1663 + std::ops::Add
1664 + std::marker::Copy
1665 + std::ops::Add<T, Output = T>
1666 + std::ops::Sub<T, Output = T>
1667 + std::ops::Mul<T, Output = T>
1668 + std::string::ToString
1669 + std::str::FromStr,
1670 <T as std::str::FromStr>::Err: std::fmt::Debug,
1671{
1672 let mu: T = mean(list1).to_string().parse().unwrap();
1673 let square_of_difference = list1.iter().map(|a| (*a - mu) * (*a - mu)).collect();
1674 let var = mean(&square_of_difference);
1675 var.sqrt()
1676}
1677
1678pub fn spearman_rank<T>(list1: &Vec<T>) -> Vec<(T, f64)>
1679where
1680 T: std::iter::Sum<T>
1681 + std::ops::Div<Output = T>
1682 + std::fmt::Debug
1683 + std::fmt::Display
1684 + std::ops::Add
1685 + std::marker::Copy
1686 + std::cmp::PartialOrd
1687 + std::ops::Add<T, Output = T>
1688 + std::ops::Sub<T, Output = T>
1689 + std::ops::Mul<T, Output = T>
1690 + std::string::ToString
1691 + std::str::FromStr,
1692 <T as std::str::FromStr>::Err: std::fmt::Debug,
1693{
1694 let mut sorted = list1.clone();
1699 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
1700 let mut map: Vec<(_, _)> = vec![];
1701 for (n, i) in sorted.iter().enumerate() {
1702 map.push(((n + 1), *i));
1703 }
1704 let mut repeats: Vec<_> = vec![];
1706 for (n, i) in sorted.iter().enumerate() {
1707 if how_many_and_where_vector(&sorted, *i).len() > 1 {
1708 repeats.push((*i, how_many_and_where_vector(&sorted, *i)));
1709 } else {
1710 repeats.push((*i, vec![n]));
1711 }
1712 }
1713 let rank: Vec<_> = repeats
1715 .iter()
1716 .map(|(a, b)| {
1717 (a, b.iter().fold(0., |a, b| a + *b as f64) / b.len() as f64) })
1719 .collect();
1720 let output: Vec<_> = rank.iter().map(|(a, b)| (**a, b + 1.)).collect(); output
1722}
1723
pub fn how_many_and_where_vector<T>(list: &Vec<T>, number: T) -> Vec<usize>
where
    T: std::cmp::PartialEq + std::fmt::Debug + Copy,
{
    // Collect every index whose element equals `number` (empty when absent).
    let mut positions = Vec::new();
    for (idx, value) in list.iter().enumerate() {
        if *value == number {
            positions.push(idx);
        }
    }
    positions
}
1739
1740pub fn how_many_and_where<T>(matrix: &Vec<Vec<T>>, number: T) -> Vec<(usize, usize)>
1741where
1742 T: std::cmp::PartialEq + std::fmt::Debug + Copy,
1743{
1744 let mut output = vec![];
1748 for (n, i) in matrix.iter().enumerate() {
1749 for j in how_many_and_where_vector(&i, number) {
1750 output.push((n, j));
1751 }
1752 }
1753 output
1754}
1755
1756pub fn z_score<T>(list: &Vec<T>, number: T) -> f64
1757where
1758 T: std::iter::Sum<T>
1759 + std::ops::Div<Output = T>
1760 + Copy
1761 + std::str::FromStr
1762 + std::string::ToString
1763 + std::ops::Add<T, Output = T>
1764 + std::ops::Sub<T, Output = T>
1765 + std::ops::Mul<T, Output = T>
1766 + std::fmt::Debug
1767 + std::cmp::PartialEq
1768 + std::fmt::Display
1769 + std::str::FromStr,
1770 <T as std::str::FromStr>::Err: std::fmt::Debug,
1771{
1772 let n: f64 = number.to_string().parse().unwrap();
1776 if list.contains(&number) {
1777 (n - mean(list)) / std_dev(list)
1778 } else {
1779 panic!("The number not found in vector passed, please check");
1780 }
1781}
1782
1783pub fn one_hot_encoding(column: &Vec<&str>) -> Vec<Vec<u8>> {
1784 let values = unique_values(&column.clone());
1791 let mut output = vec![];
1793 for i in values.iter() {
1794 output.push(column.iter().map(|a| if a == i { 1 } else { 0 }).collect());
1795 }
1796 output
1797}
1798
/// Prints `words` followed by the row and column counts of `m`.
///
/// An empty matrix now reports 0 columns instead of panicking on `m[0]`, which
/// is what the original did.
pub fn shape(words: &str, m: &Vec<Vec<f64>>) {
    println!(
        "{:?} : Rows: {:?}, Columns: {:?}",
        words,
        m.len(),
        // Column count of the first row; 0 when there are no rows.
        m.first().map_or(0, |row| row.len())
    );
}
1808
1809pub fn rmse(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
1810 (mse(test_data, predicted)).sqrt()
1814}
1815
pub fn mse(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    // Mean squared error: average squared gap between each prediction and the
    // last column (the target) of the corresponding test row.
    let mut total = 0.;
    for (n, row) in test_data.iter().enumerate() {
        match row.last() {
            Some(target) => {
                let diff = predicted[n] - target;
                total += diff * diff;
            }
            _ => panic!("Something wrong in passed test data"),
        }
    }
    total / (predicted.len() as f64)
}
1832
pub fn mae(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    // Mean absolute error against the last column (target) of each test row.
    let mut total = 0.;
    for (n, row) in test_data.iter().enumerate() {
        match row.last() {
            Some(target) => total += (predicted[n] - target).abs(),
            _ => panic!("Something wrong in passed test data"),
        }
    }
    total / (predicted.len() as f64)
}
1849
/// Coefficient of determination and adjusted R² for a regression.
///
/// Returns `(r2, adjusted_r2)` given predictions, actual targets, and the
/// number of features used by the model.
///
/// Bug fixes relative to the original:
/// - SST had misplaced parentheses and computed `a - mean*(a - mean)` instead
///   of `(a - mean)^2`.
/// - SSR summed raw residuals `(p - a)` (positive and negative errors cancel)
///   instead of squared residuals.
pub fn r_square(predicted: &Vec<f64>, actual: &Vec<f64>, features: usize) -> (f64, f64) {
    // Mean of the actual values, computed once (the original recomputed it
    // twice per element).
    let mean_actual = actual.iter().sum::<f64>() / actual.len() as f64;
    // Total sum of squares.
    let sst: f64 = actual
        .iter()
        .map(|a| (a - mean_actual) * (a - mean_actual))
        .sum();
    // Residual sum of squares.
    let ssr: f64 = predicted
        .iter()
        .zip(actual.iter())
        .map(|(p, a)| (p - a) * (p - a))
        .sum();
    let r2 = 1. - (ssr / sst);
    // Adjusted R² penalizes the feature count.
    let degree_of_freedom = predicted.len() as f64 - 1. - features as f64;
    let ar2 = 1. - ((1. - r2) * ((predicted.len() as f64 - 1.) / degree_of_freedom));
    (r2, ar2)
}
1872
pub fn mape(test_data: &Vec<Vec<f64>>, predicted: &Vec<f64>) -> f64 {
    // Mean absolute percentage error. Note the denominator is the PREDICTED
    // value (as in the original), not the actual target.
    let mut total = 0.;
    for (n, row) in test_data.iter().enumerate() {
        match row.last() {
            Some(target) => {
                total += (((predicted[n] - target) / predicted[n]).abs()) * 100.;
            }
            _ => panic!("Something wrong in passed test data"),
        }
    }
    total / (predicted.len() as f64)
}
1889
pub fn drop_column(matrix: &Vec<Vec<f64>>, column_number: usize) -> Vec<Vec<f64>> {
    // Removes the 1-indexed outer entry by keeping everything before and after
    // it. (Callers pass column-major data, so each outer entry is a "column".)
    let mut output = matrix[..column_number - 1].to_vec();
    output.extend_from_slice(&matrix[column_number..]);
    output
}
1900
1901pub fn float_randomize(matrix: &Vec<Vec<String>>) -> Vec<Vec<f64>> {
1902 randomize(
1903 &matrix
1904 .iter()
1905 .map(|a| {
1906 a.iter()
1907 .map(|b| (*b).replace("\r", "").parse::<f64>().unwrap())
1908 .collect::<Vec<f64>>()
1909 })
1910 .collect::<Vec<Vec<f64>>>(),
1911 )
1912}
1913
/// Splits `matrix` into train/test, converts both to column-major layout,
/// optionally scales each column, and separates out the target column.
///
/// `target_column` is 1-indexed (matching `drop_column`'s convention).
/// `preprocess`: `"s"` = standardize, `"m"` = min-max scale, anything else
/// leaves values untouched (with a console note).
///
/// Returns `(train_features, train_target, test_features, test_target)`.
///
/// NOTE(review): scaling is applied per split independently — the test set is
/// scaled with its own statistics, not the train set's; confirm this is the
/// intended behavior.
pub fn preprocess_train_test_split(
    matrix: &Vec<Vec<f64>>,
    test_percentage: f64,
    target_column: usize,
    preprocess: &str,
) -> (Vec<Vec<f64>>, Vec<f64>, Vec<Vec<f64>>, Vec<f64>) {
    let (train_data, test_data) = train_test_split_f(matrix, test_percentage);
    // Convert rows to columns so each outer Vec is one feature column.
    let mut actual_train = row_to_columns_conversion(&train_data);
    let mut actual_test = row_to_columns_conversion(&test_data);

    match preprocess {
        "s" => {
            // Z-score standardization, column by column.
            actual_train = actual_train
                .iter()
                .map(|a| standardize_vector_f(a))
                .collect::<Vec<Vec<f64>>>();
            actual_test = actual_test
                .iter()
                .map(|a| standardize_vector_f(a))
                .collect::<Vec<Vec<f64>>>();
        }
        "m" => {
            // Min-max scaling, column by column.
            actual_train = actual_train
                .iter()
                .map(|a| min_max_scaler(a))
                .collect::<Vec<Vec<f64>>>();
            actual_test = actual_test
                .iter()
                .map(|a| min_max_scaler(a))
                .collect::<Vec<Vec<f64>>>();
        }

        _ => println!("Using the actual values without preprocessing unless 's' or 'm' is passed"),
    };

    // Features (target column dropped) and targets (target column itself).
    (
        drop_column(&actual_train, target_column),
        actual_train[target_column - 1].clone(),
        drop_column(&actual_test, target_column),
        actual_test[target_column - 1].clone(),
    )
}
1964
1965pub fn standardize_vector_f(list: &Vec<f64>) -> Vec<f64> {
1966 list.iter()
1973 .map(|a| (*a - mean(list)) / std_dev(list))
1974 .collect()
1975}
1976
/// Prints a 2x2 confusion matrix and derived metrics for a binary classifier,
/// treating `class0` as the positive class.
///
/// Bug fix: the F1 score is the harmonic mean `2PR / (P + R)`; the original
/// divided by `P * R`, which always evaluated to exactly 2.
pub fn confuse_me(predicted: &Vec<f64>, actual: &Vec<f64>, class0: f64, class1: f64) {
    let mut tp = 0.;
    let mut fp = 0.;
    let mut fng = 0.;
    let mut tng = 0.;
    // Tally each (actual, predicted) pair into the four confusion cells.
    for (i, j) in actual.iter().zip(predicted.iter()) {
        if *i == class0 && *j == class0 {
            tp += 1.;
        }
        if *i == class1 && *j == class1 {
            tng += 1.;
        }
        if *i == class0 && *j == class1 {
            fp += 1.;
        }
        if *i == class1 && *j == class0 {
            fng += 1.;
        }
    }
    println!("\n|------------------------|");
    println!("| {:?} | {:?}", tp, fp);
    println!("|------------------------|");
    println!("| {:?} | {:?}", fng, tng);
    println!("|------------------------|");
    println!("Accuracy : {:.3}", (tp + tng) / (tp + fp + fng + tng));
    let precision: f64 = tp / (tp + fp);
    println!("Precision : {:.3}", precision);
    let recall: f64 = tp / (tp + fng);
    println!("Recall (sensitivity) : {:.3}", recall);
    println!("Specificity: {:.3}", tng / (fp + tng));
    println!(
        "F1 : {:.3}\n\n",
        (2. * precision * recall) / (precision + recall)
    );
}
2025
2026pub fn cv<T: Copy>(data: &Vec<Vec<T>>, k: usize) -> (Vec<Vec<T>>, Vec<Vec<T>>) {
2027 (
2032 randomize(&data.clone())[k..].to_vec(),
2033 randomize(&data.clone())[..k].to_vec(),
2034 )
2035}
2036
2037pub fn z_outlier_f(list: &Vec<f64>) -> Vec<f64> {
2038 let mut v_clone = list.clone();
2043 v_clone.sort_by(|a, b| a.partial_cmp(b).unwrap());
2044 let z_v: Vec<_> = v_clone
2045 .iter()
2046 .map(|a| (z_score(&v_clone, *a), *a))
2047 .collect();
2048 z_v.iter()
2049 .filter(|(a, _)| (*a > 3.) || (*a < -3.))
2050 .map(|a| a.1)
2051 .collect::<Vec<f64>>()
2052}
2053
/// Nearest-rank percentile: sorts a copy of `list` ascending and returns the
/// value at ordinal rank `round(percentile/100 * n)` (1-indexed).
///
/// Bug fix: the original called `.sort_by` on `list.clone()` as a temporary,
/// which sorted a copy that was immediately dropped — the lookup then indexed
/// the UNSORTED input. The sorted copy is now kept and indexed.
/// (`.round()` reproduces the original `round_off_f(value, 0)`.)
pub fn percentile_f(list: &Vec<f64>, percentile: u32) -> f64 {
    let mut sorted = list.clone();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let ordinal_rank = ((percentile as f64 / 100.) * (sorted.len() as f64)).round();
    sorted[ordinal_rank as usize - 1]
}
2063
2064pub fn quartile_f(list: &Vec<f64>) {
2065 println!(
2069 "\tPercentile:\t10th :{:?}\t25th :{:?}\t50th :{:?}\t75th :{:?}\t90th :{:?}",
2070 percentile_f(list, 10),
2071 percentile_f(list, 25),
2072 percentile_f(list, 50),
2073 percentile_f(list, 75),
2074 percentile_f(list, 90)
2075 );
2076}
2077
/// Wrapper around a `Vec<Vec<f64>>` providing determinant and inverse
/// routines for square matrices.
#[derive(Debug)]
pub struct MatrixF {
    pub matrix: Vec<Vec<f64>>,
}

impl MatrixF {
    /// Determinant of the stored matrix. Panics when the matrix is not
    /// square; sizes above 100 are refused (returns 100.0 with a message, as
    /// in the original).
    pub fn determinant_f(&self) -> f64 {
        if MatrixF::is_square_matrix(&self.matrix) {
            println!("Calculating Determinant...");

            match self.matrix.len() {
                1 => self.matrix[0][0],
                2 => MatrixF::determinant_2(&self),
                3..=100 => MatrixF::determinant_3plus(&self),
                _ => {
                    println!("Cant find determinant for size more than {}", 100);
                    "100".parse().unwrap()
                }
            }
        } else {
            panic!("The input should be a square matrix");
        }
    }

    /// 2x2 determinant: `ad - bc`.
    /// Bug fix: the original computed `ad - cc` (it multiplied
    /// `matrix[1][0]` by itself instead of `matrix[0][1] * matrix[1][0]`).
    fn determinant_2(&self) -> f64 {
        (self.matrix[0][0] * self.matrix[1][1]) - (self.matrix[0][1] * self.matrix[1][0])
    }

    /// Determinant via Gaussian elimination: reduce to upper-triangular form,
    /// then multiply the diagonal. Values are rounded to 3 decimals first,
    /// and zero pivots are nudged to 0.001 to avoid division by zero (both
    /// behaviors preserved from the original — the result is approximate).
    fn determinant_3plus(&self) -> f64 {
        let length = self.matrix.len() - 1;
        let mut new_matrix = self.matrix.clone();

        new_matrix = new_matrix
            .iter()
            .map(|a| a.iter().map(|a| MatrixF::round_off_f(*a, 3)).collect())
            .collect();

        for diagonal in 0..=length {
            for i in diagonal + 1..=length {
                // Nudge zero pivots so the scalar below is finite.
                if new_matrix[diagonal][diagonal] == 0.0 {
                    new_matrix[diagonal][diagonal] = 0.001;
                }
                let scalar = new_matrix[i][diagonal] / new_matrix[diagonal][diagonal];
                for j in 0..=length {
                    new_matrix[i][j] = new_matrix[i][j] - (scalar * new_matrix[diagonal][j]);
                }
            }
        }
        // Product of the diagonal of the triangular form.
        let mut product = 1.;
        for i in 0..=length {
            product *= new_matrix[i][i]
        }
        product
    }

    /// True when the row count equals the column count of the first row.
    pub fn is_square_matrix<T>(matrix: &Vec<Vec<T>>) -> bool {
        matrix.len() == matrix[0].len()
    }

    /// Round `value` to `decimals` decimal places.
    fn round_off_f(value: f64, decimals: i32) -> f64 {
        ((value * 10.0f64.powi(decimals)).round()) / 10.0f64.powi(decimals)
    }

    /// Matrix inverse via Gauss-Jordan elimination on an augmented identity
    /// matrix. No singularity check: a zero diagonal yields inf/NaN entries.
    pub fn inverse_f(&self) -> Vec<Vec<f64>> {
        let mut input = self.matrix.clone();
        let length = self.matrix.len();
        let mut identity = MatrixF::identity_matrix(length);

        let index: Vec<usize> = (0..length).collect();
        for diagonal in 0..length {
            // Scale the pivot row so the pivot becomes 1.
            let diagonal_scalar = 1. / (input[diagonal][diagonal]);
            for column_loop in 0..length {
                input[diagonal][column_loop] *= diagonal_scalar;
                identity[diagonal][column_loop] *= diagonal_scalar;
            }

            // Eliminate the pivot column from every other row.
            let except_diagonal: Vec<usize> = index[0..diagonal]
                .iter()
                .copied()
                .chain(index[diagonal + 1..].iter().copied())
                .collect();
            for i in except_diagonal {
                let row_scalar = input[i as usize][diagonal].clone();
                for j in 0..length {
                    input[i][j] = input[i][j] - (row_scalar * input[diagonal][j]);
                    identity[i][j] = identity[i][j] - (row_scalar * identity[diagonal][j])
                }
            }
        }

        identity
    }

    /// `size` x `size` identity matrix.
    fn identity_matrix(size: usize) -> Vec<Vec<f64>> {
        let mut output: Vec<Vec<f64>> = MatrixF::zero_matrix(size);
        for i in 0..=(size - 1) {
            for j in 0..=(size - 1) {
                if i == j {
                    output[i][j] = 1.;
                } else {
                    output[i][j] = 0.;
                }
            }
        }
        output
    }

    /// `size` x `size` matrix of zeros.
    fn zero_matrix(size: usize) -> Vec<Vec<f64>> {
        let mut output: Vec<Vec<f64>> = vec![];
        for _ in 0..=(size - 1) {
            output.push(vec![0.; size]);
        }
        output
    }
}
2370
/// Column-oriented table split by type: each inner `Vec` is one column, and
/// all columns of a type are expected to share the same row count.
pub struct DataFrame<'a> {
    // String columns (borrowed &str cells).
    pub string: Vec<Vec<&'a str>>,
    // Numerical (f64) columns.
    pub numerical: Vec<Vec<f64>>,
    // Boolean columns.
    pub boolean: Vec<Vec<bool>>,
}
impl<'a> DataFrame<'a> {
    /// Prints a summary of every column: value counts for string/boolean
    /// columns, and count/min/max/mean/std-dev/percentiles/outliers for
    /// numerical columns.
    pub fn describe(&self) {
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        println!(" Details of the DataFrame",);
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (n, i) in self.string.iter().enumerate() {
            println!(
                "String column #{:?} Values count : {:?}",
                n,
                value_counts(i)
            )
        }
        // NOTE(review): this loop walks the BOOLEAN columns but the label
        // still says "String column" — looks like a copy-paste slip.
        for (n, i) in self.boolean.iter().enumerate() {
            println!(
                "String column #{:?} Values count : {:?}",
                n,
                value_counts(i)
            )
        }
        for (n, i) in self.numerical.iter().enumerate() {
            println!("Numerical column #{:?}\n\tCount :{:?}", n, i.len());
            println!(
                "\tMinimum :{:?} Maximum : {:?}",
                min_max_f(i).0,
                min_max_f(i).1
            );
            println!("\tMean :{:?} Std Deviation : {:?}", mean(i), std_dev(i));
            quartile_f(i);
            println!("\tOutliers :{:?}", z_outlier_f(i));
        }
    }
    /// Groups rows by the values of one string column and prints the "sum" or
    /// "mean" aggregate per group.
    pub fn groupby(&self, string_column_number: usize, operation: &str) {
        let reduced_dataframe_string = self.string[string_column_number].clone();
        let reduced_dataframe_float = self.numerical.clone();

        // Row indices belonging to each distinct key, in first-appearance order.
        let unique_string = unique_values(&reduced_dataframe_string);
        let mut unique_string_index = vec![];
        for i in unique_string.iter() {
            let mut single_string = vec![];
            for (n, j) in reduced_dataframe_string.iter().enumerate() {
                if i == j {
                    single_string.push(n);
                }
            }
            unique_string_index.push(single_string);
        }

        let mut output = vec![];
        for i in unique_string_index.iter() {
            let mut result = vec![];
            for j in reduced_dataframe_float.iter() {
                // Rows of this numerical column that belong to the group.
                let seperated = j
                    .iter()
                    .enumerate()
                    .filter(|(n, _)| i.contains(n))
                    .collect::<Vec<(usize, &f64)>>();
                match operation {
                    "sum" => {
                        result.push(seperated.iter().map(|a| a.1).fold(0., |a, b| a + b));
                    }
                    "mean" => {
                        result.push(
                            seperated.iter().map(|a| a.1).fold(0., |a, b| a + b)
                                / (seperated.len() as f64),
                        );
                    }
                    _ => panic!("Enter either 'sum' or 'mean'"),
                };
            }
            // NOTE(review): only result[0] is kept, i.e. the aggregate of the
            // FIRST numerical column; the others are computed then discarded.
            output.push(result[0]);
        }
        println!(
            "Grouped on {:?} => {:?}",
            string_column_number,
            unique_string
                .iter()
                .zip(output.iter())
                .collect::<Vec<(&&str, &f64)>>()
        );
    }
    /// Returns a new DataFrame with ALL columns reordered by sorting one
    /// column: `col_type` is "s" (string) or "n" (numerical), `col_number`
    /// indexes into that column group.
    pub fn sort(&self, col_type: &str, col_number: usize, ascending: bool) -> DataFrame {
        let mut output = DataFrame {
            string: vec![],
            numerical: vec![],
            boolean: vec![],
        };
        let mut to_sort_by_string;
        let mut to_sort_by_numerical;
        // Row permutation derived from sorting the chosen column.
        let order: Vec<usize>;
        match col_type {
            "s" => {
                to_sort_by_string = self.string[col_number].clone();
                order = DataFrame::find_order_of_sorting_string(&mut to_sort_by_string, ascending);
            }
            "n" => {
                to_sort_by_numerical = self.numerical[col_number].clone();
                order = DataFrame::find_order_of_sorting_numerical(
                    &mut to_sort_by_numerical,
                    ascending,
                );
            }
            _ => panic!("Pass either `s` or `n`"),
        }

        println!("New order is : {:?}", order);
        // Apply the same permutation to every column of every type.
        for each_vector in self.string.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }
            output.string.push(new_vector);
        }

        for each_vector in self.numerical.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }

            output.numerical.push(new_vector);
        }

        for each_vector in self.boolean.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(each_vector[*o]);
            }
            output.boolean.push(new_vector);
        }

        output
    }
    /// Sorts a copy of `data` and maps each sorted value back to its original
    /// index, producing the permutation that `sort` applies. Duplicate values
    /// resolve to their earliest unused original index.
    fn find_order_of_sorting_string(data: &mut Vec<&str>, ascending: bool) -> Vec<usize> {
        use std::collections::BTreeMap;
        let mut input = data.clone();
        let mut order: BTreeMap<usize, &str> = BTreeMap::new();
        let mut output = vec![];

        // Remember each value's original position.
        for (n, i) in data.iter().enumerate() {
            order.insert(n, i);
        }
        match ascending {
            true => input.sort_unstable(),
            false => {
                input.sort_unstable();
                input.reverse();
            }
        };

        // For each sorted value, emit the first original index not yet used.
        for i in input.iter() {
            for (k, v) in order.iter() {
                if (*i == *v) & (output.contains(k) == false) {
                    output.push(*k);
                    break;
                }
            }
        }
        output
    }

    /// Numerical counterpart of `find_order_of_sorting_string`; panics on NaN
    /// (partial_cmp().unwrap()).
    fn find_order_of_sorting_numerical(data: &mut Vec<f64>, ascending: bool) -> Vec<usize> {
        use std::collections::BTreeMap;
        let mut input = data.clone();
        let mut order: BTreeMap<usize, &f64> = BTreeMap::new();
        let mut output = vec![];

        for (n, i) in data.iter().enumerate() {
            order.insert(n, i);
        }
        match ascending {
            true => input.sort_by(|a, b| a.partial_cmp(b).unwrap()),
            false => input.sort_by(|a, b| b.partial_cmp(a).unwrap()),
        };

        for i in input.iter() {
            for (k, v) in order.iter() {
                if (i == *v) & (output.contains(k) == false) {
                    output.push(*k);
                    break;
                }
            }
        }
        output
    }
}
2580
/// Column-oriented table like `DataFrame`, but columns are addressed by name
/// (HashMap key) instead of position.
pub struct DataMap<'a> {
    // Named string columns.
    pub string: HashMap<&'a str, Vec<&'a str>>,
    // Named numerical (f64) columns.
    pub numerical: HashMap<&'a str, Vec<f64>>,
    // Named boolean columns.
    pub boolean: HashMap<&'a str, Vec<bool>>,
}
impl<'a> DataMap<'a> {
    /// Prints a summary of every column: value counts for string/boolean
    /// columns, and count/min/max/mean/std-dev/percentiles/outliers for
    /// numerical columns. Iteration order follows HashMap order (arbitrary).
    pub fn describe(&self) {
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        println!(" Details of the DataMap",);
        println!(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (k, v) in self.string.iter() {
            println!(
                "String column :{:?} Values count : {:?}",
                k,
                value_counts(v)
            )
        }
        for (k, v) in self.boolean.iter() {
            println!(
                "Boolean column :{:?} Values count : {:?}",
                k,
                value_counts(v)
            )
        }
        for (k, v) in self.numerical.iter() {
            println!("Numerical column :{:?}\n\tCount :{:?}", k, v.len());
            println!(
                "\tMinimum :{:?} Maximum : {:?}",
                min_max_f(v).0,
                min_max_f(v).1
            );
            println!("\tMean :{:?} Std Deviation : {:?}", mean(v), std_dev(v));
            quartile_f(v);
            println!("\tOutliers :{:?}", z_outlier_f(v));
        }
    }

    /// Groups rows by the values of the named string column and prints the
    /// "sum" or "mean" aggregate per group.
    pub fn groupby(&self, string_column: &str, operation: &str) {
        let reduced_dataframe_string = self.string[string_column].clone();
        // HashMap value iteration order is arbitrary, so "first numerical
        // column" below is not deterministic across runs.
        let reduced_dataframe_float: Vec<&Vec<f64>> = self.numerical.values().clone().collect();

        // Row indices belonging to each distinct key, in first-appearance order.
        let unique_string = unique_values(&reduced_dataframe_string);
        let mut unique_string_index = vec![];
        for i in unique_string.iter() {
            let mut single_string = vec![];
            for (n, j) in reduced_dataframe_string.iter().enumerate() {
                if i == j {
                    single_string.push(n);
                }
            }
            unique_string_index.push(single_string);
        }

        let mut output = vec![];
        for i in unique_string_index.iter() {
            let mut result = vec![];
            for j in reduced_dataframe_float.iter() {
                // Rows of this numerical column that belong to the group.
                let seperated = j
                    .iter()
                    .enumerate()
                    .filter(|(n, _)| i.contains(n))
                    .collect::<Vec<(usize, &f64)>>();
                match operation {
                    "sum" => {
                        result.push(seperated.iter().map(|a| a.1).fold(0., |a, b| a + b));
                    }
                    "mean" => {
                        result.push(
                            seperated.iter().map(|a| a.1).fold(0., |a, b| a + b)
                                / (seperated.len() as f64),
                        );
                    }
                    _ => panic!("Enter either 'sum' or 'mean'"),
                };
            }
            // NOTE(review): only result[0] is kept — the aggregate of one
            // arbitrary numerical column; the rest are computed then dropped.
            output.push(result[0]);
        }
        println!(
            "Grouped by {:?} => {:?}",
            string_column,
            unique_string
                .iter()
                .zip(output.iter())
                .collect::<Vec<(&&str, &f64)>>()
        );
    }
    /// Returns a new DataMap with ALL columns reordered by sorting one named
    /// column: `col_type` is "s" (string) or "n" (numerical). Reuses
    /// DataFrame's order-finding helpers.
    pub fn sort(&self, col_type: &str, col_name: &str, ascending: bool) -> DataMap {
        let mut output = DataMap {
            string: HashMap::new(),
            numerical: HashMap::new(),
            boolean: HashMap::new(),
        };
        let mut to_sort_by_string;
        let mut to_sort_by_numerical;
        // Row permutation derived from sorting the chosen column.
        let order: Vec<usize>;
        match col_type {
            "s" => {
                to_sort_by_string = self.string[col_name].clone();
                order = DataFrame::find_order_of_sorting_string(&mut to_sort_by_string, ascending);
            }
            "n" => {
                to_sort_by_numerical = self.numerical[col_name].clone();
                order = DataFrame::find_order_of_sorting_numerical(
                    &mut to_sort_by_numerical,
                    ascending,
                );
            }
            _ => panic!("Pass either `s` or `n`"),
        }

        println!("New order is : {:?}", order);
        // Apply the same permutation to every column of every type.
        for (key, value) in self.string.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(value[*o]);
            }
            output.string.insert(*key, new_vector);
        }
        for (key, value) in self.numerical.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(value[*o]);
            }
            output.numerical.insert(*key, new_vector);
        }
        for (key, value) in self.boolean.iter() {
            let mut new_vector = vec![];
            for o in order.iter() {
                new_vector.push(value[*o]);
            }
            output.boolean.insert(*key, new_vector);
        }

        output
    }
}
2731
pub fn print_a_matrix<T: std::fmt::Debug>(string: &str, matrix: &Vec<Vec<T>>) {
    // Header line, one debug-printed row per line, then two blank lines.
    println!("{}", string);
    matrix.iter().for_each(|row| println!("{:?}", row));
    println!("");
    println!("");
}
2741
/// Reshapes a flat list into a `rows` x `columns` matrix, row-major.
/// Panics when `columns * rows != list.len()`.
///
/// Uses `slice::chunks` instead of the original loop, which re-sliced and
/// cloned the remaining tail of the list on every row (O(n²) copying).
pub fn shape_changer<T>(list: &Vec<T>, columns: usize, rows: usize) -> Vec<Vec<T>>
where
    T: std::clone::Clone,
{
    if columns * rows != list.len() {
        panic!("!!! The shape transformation is not possible, check the values entered !!!");
    }
    if columns == 0 {
        // chunks(0) would panic; the original produced `rows` empty rows here.
        return vec![Vec::new(); rows];
    }
    list.chunks(columns).map(|chunk| chunk.to_vec()).collect()
}
2762
/// Transposes a matrix: output row `j` is input column `j`.
/// Assumes a rectangular matrix with at least one row (indexes `matrix[0]`).
///
/// Builds each output row directly instead of flattening every element into
/// one long vector and reshaping it back through `shape_changer`, which did a
/// second full pass of cloning.
pub fn transpose<T: std::clone::Clone + Copy>(matrix: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    (0..matrix[0].len())
        .map(|j| matrix.iter().map(|row| row[j]).collect())
        .collect()
}
2774
/// Element-wise sum of two vectors. When the lengths differ, the shorter one
/// is zero-padded at the end (zero obtained by parsing "0" as T), so the
/// result always has `max(a.len(), b.len())` elements.
///
/// Bug fix: the original computed the padded copy via `pad_with_zero` but then
/// ignored it and indexed the short vectors directly — truncating the result
/// when `a` was shorter, and panicking (index out of bounds) when `b` was.
pub fn vector_addition<T>(a: &mut Vec<T>, b: &mut Vec<T>) -> Vec<T>
where
    T: std::ops::Add<Output = T> + Copy + std::fmt::Debug + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    let zero = "0".parse::<T>().unwrap();
    let mut lhs = a.clone();
    let mut rhs = b.clone();
    if lhs.len() < rhs.len() {
        lhs.resize(rhs.len(), zero);
        println!("The changed vector is {:?}", lhs);
    } else if rhs.len() < lhs.len() {
        rhs.resize(lhs.len(), zero);
        println!("The changed vector is {:?}", rhs);
    }
    lhs.iter().zip(rhs.iter()).map(|(x, y)| *x + *y).collect()
}
2806
/// Matrix product of `input` (m x k) and `weights` (k x n); panics on inner
/// dimension mismatch. Prints the operand and result shapes before computing.
///
/// NOTE(review): the final `shape_changer(&output, input.len(), weights_t.len())`
/// passes `input.len()` as the COLUMN count and `weights_t.len()` as the ROW
/// count, although the push order produces rows of length `weights_t.len()`.
/// That only reshapes correctly when the result is square; the caller
/// `output_of_layer` transposes the result afterwards, which may be
/// compensating — verify before changing.
pub fn matrix_multiplication<T>(input: &Vec<Vec<T>>, weights: &Vec<Vec<T>>) -> Vec<Vec<T>>
where
    T: Copy + std::iter::Sum + std::ops::Mul<Output = T>,
{
    println!(
        "Multiplication of {}x{} and {}x{}",
        input.len(),
        input[0].len(),
        weights.len(),
        weights[0].len()
    );
    println!("Output will be {}x{}", input.len(), weights[0].len());
    // Transposing the weights lets each output cell be a row-row dot product.
    let weights_t = transpose(&weights);
    let mut output: Vec<T> = vec![];
    if input[0].len() == weights.len() {
        for i in input.iter() {
            for j in weights_t.iter() {
                output.push(dot_product(&i, &j));
            }
        }
        // Reshape the flat result back into a matrix (see NOTE above).
        shape_changer(&output, input.len(), weights_t.len())
    } else {
        panic!("Dimension mismatch")
    }
}
2836
pub fn dot_product<T>(a: &Vec<T>, b: &Vec<T>) -> T
where
    T: std::ops::Mul<Output = T> + std::iter::Sum + Copy,
{
    // Sum of pairwise products; extra elements of the longer vector are
    // ignored (zip stops at the shorter one).
    let mut terms = Vec::with_capacity(a.len().min(b.len()));
    for (x, y) in a.iter().zip(b.iter()) {
        terms.push(*x * *y);
    }
    terms.into_iter().sum()
}
2844
/// Combines two equal-length vectors element by element.
/// `operation` is one of `"add"`, `"sub"`, `"mul"`, `"div"` (lowercase).
/// Panics on length mismatch or an unknown operation.
///
/// Bug fix: the panic message advertised capitalized names ("Add", "Sub", …)
/// that the lowercase-only match could never accept; it now lists the tokens
/// that actually work.
pub fn element_wise_operation<T>(a: &Vec<T>, b: &Vec<T>, operation: &str) -> Vec<T>
where
    T: Copy
        + std::fmt::Debug
        + std::ops::Mul<Output = T>
        + std::ops::Add<Output = T>
        + std::ops::Sub<Output = T>
        + std::ops::Div<Output = T>
        + std::cmp::PartialEq
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    if a.len() != b.len() {
        panic!("Dimension mismatch")
    }
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| match operation {
            "mul" => *x * *y,
            "add" => *x + *y,
            "sub" => *x - *y,
            "div" => *x / *y,
            _ => panic!("Operation unsuccessful!\nEnter any of the following(case sensitive):\n> add\n> sub\n> mul\n> div"),
        })
        .collect()
}
2873
pub fn pad_with_zero<T>(vector: &mut Vec<T>, count: usize, position: &str) -> Vec<T>
where
    T: Copy + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    // Returns a copy of `vector` with `count` zeros appended ("post") or
    // prepended ("pre"); the zero value comes from parsing "0" as T.
    let zero = "0".parse::<T>().unwrap();
    let padding = vec![zero; count];
    match position {
        "post" => [&vector[..], &padding[..]].concat(),
        "pre" => [&padding[..], &vector[..]].concat(),
        _ => panic!("Position can either be `post` or `pre`"),
    }
}
2899
2900pub fn make_matrix_float<T>(input: &Vec<Vec<T>>) -> Vec<Vec<f64>>
2901where
2902 T: std::fmt::Display + Copy,
2903{
2904 input
2909 .iter()
2910 .map(|a| {
2911 a.iter()
2912 .map(|b| {
2913 if is_numerical(*b) {
2914 format!("{}", b).parse().unwrap()
2915 } else {
2916 panic!("Non numerical value present in the intput");
2917 }
2918 })
2919 .collect()
2920 })
2921 .collect()
2922}
2923
2924pub fn make_vector_float<T>(input: &Vec<T>) -> Vec<f64>
2925where
2926 T: std::fmt::Display + Copy,
2927{
2928 input
2933 .iter()
2934 .map(|b| {
2935 if is_numerical(*b) {
2936 format!("{}", b).parse().unwrap()
2937 } else {
2938 panic!("Non numerical value present in the intput");
2939 }
2940 })
2941 .collect()
2942}
pub fn round_off_f(value: f64, decimals: i32) -> f64 {
    // Round to `decimals` places by scaling up, rounding, and scaling back.
    let scale = 10.0f64.powi(decimals);
    (value * scale).round() / scale
}
2950
/// Returns `(minimum, maximum)` of a float vector; panics when it is empty.
///
/// Rewritten as a single fold. The original split the list into positive and
/// negative halves, sorted both, and — for an all-negative input — returned
/// the pair REVERSED as (max, min), which contradicts the (min, max) order of
/// the mixed and all-positive branches and of the `describe()` callers that
/// print `.0` as "Minimum". Its runtime `type_of(list[0]) == "f64"` check
/// could also never fail for a `&Vec<f64>` argument and has been dropped.
pub fn min_max_f(list: &Vec<f64>) -> (f64, f64) {
    if list.is_empty() {
        panic!("Empty vector found")
    }
    list.iter().fold((f64::MAX, f64::MIN), |(lo, hi), &v| {
        (lo.min(v), hi.max(v))
    })
}
2989
2990pub fn is_numerical<T>(value: T) -> bool {
2991 if type_of(&value) == "&i32"
2992 || type_of(&value) == "&i8"
2993 || type_of(&value) == "&i16"
2994 || type_of(&value) == "&i64"
2995 || type_of(&value) == "&i128"
2996 || type_of(&value) == "&f64"
2997 || type_of(&value) == "&f32"
2998 || type_of(&value) == "&u32"
2999 || type_of(&value) == "&u8"
3000 || type_of(&value) == "&u16"
3001 || type_of(&value) == "&u64"
3002 || type_of(&value) == "&u128"
3003 || type_of(&value) == "&usize"
3004 || type_of(&value) == "&isize"
3005 {
3006 true
3007 } else {
3008 false
3009 }
3010}
3011
/// Frequency table of the elements, keyed and ordered by value (BTreeMap).
///
/// Uses the entry API: one map lookup per element instead of the original's
/// `contains_key` + index + `insert` (up to three lookups each).
pub fn value_counts<T: std::cmp::Ord>(list: &Vec<T>) -> BTreeMap<T, u32>
where
    T: std::cmp::PartialEq + std::cmp::Eq + std::hash::Hash + Copy,
{
    let mut count: BTreeMap<T, u32> = BTreeMap::new();
    for i in list {
        *count.entry(*i).or_insert(0) += 1;
    }
    count
}
3027
3028use std::any::type_name;
/// Compile-time type name of the argument, e.g. "i32" or "&f64".
/// (The exact string is compiler-dependent; used here for numeric-type checks.)
pub fn type_of<T>(_: T) -> &'static str {
    type_name::<T>()
}
3035
pub fn unique_values<T>(list: &Vec<T>) -> Vec<T>
where
    T: std::cmp::PartialEq + Copy,
{
    // Distinct elements in first-appearance order. O(n^2), since T is only
    // PartialEq and a hash set would require changing the bounds.
    let mut seen: Vec<T> = Vec::new();
    for item in list.iter() {
        if !seen.contains(item) {
            seen.push(*item);
        }
    }
    seen
}
3052
pub fn element_wise_matrix_operation<T>(
    matrix1: &Vec<Vec<T>>,
    matrix2: &Vec<Vec<T>>,
    operation: &str,
) -> Vec<Vec<T>>
where
    T: Copy
        + std::fmt::Debug
        + std::ops::Mul<Output = T>
        + std::ops::Add<Output = T>
        + std::ops::Sub<Output = T>
        + std::ops::Div<Output = T>
        + std::cmp::PartialEq
        + std::str::FromStr,
    <T as std::str::FromStr>::Err: std::fmt::Debug,
{
    // Cell-by-cell "mul" | "add" | "sub" | "div" on two same-shape matrices.
    if matrix1.len() != matrix2.len() || matrix1[0].len() != matrix2[0].len() {
        panic!("Dimension mismatch")
    }
    let mut result = Vec::with_capacity(matrix1.len());
    for (row1, row2) in matrix1.iter().zip(matrix2.iter()) {
        let combined: Vec<T> = row1
            .iter()
            .zip(row2.iter())
            .map(|(x, y)| match operation {
                "mul" => *x * *y,
                "add" => *x + *y,
                "sub" => *x - *y,
                "div" => *x / *y,
                _ => panic!("Operation unsuccessful!\nEnter any of the following(case sensitive):\n> Add\n> Sub\n> Mul\n> Div"),
            })
            .collect();
        result.push(combined);
    }
    result
}
3093
3094pub fn matrix_vector_product_f(matrix: &Vec<Vec<f64>>, vector: &Vec<f64>) -> Vec<f64> {
3095 let mut output: Vec<_> = vec![];
3099 if matrix[0].len() == vector.len() {
3100 for i in matrix.iter() {
3101 output.push(dot_product(i, vector));
3102 }
3103 } else {
3104 panic!("The lengths do not match, please check");
3105 }
3106 output
3107}
3108
/// Splits `vector` into `parts` equally sized chunks.
///
/// An empty input returns an empty result (the previous version looped
/// forever on it: the computed chunk size was 0 so the cursor never
/// advanced; `parts == 0` also divided by zero).
///
/// # Panics
/// Panics when `parts` is not positive or does not divide the length evenly.
pub fn split_vector<T: std::clone::Clone>(vector: &Vec<T>, parts: i32) -> Vec<Vec<T>> {
    if vector.is_empty() {
        return vec![];
    }
    if parts > 0 && vector.len() % parts as usize == 0 {
        let size = vector.len() / parts as usize;
        // `chunks` yields exactly `parts` slices of `size` elements each
        vector.chunks(size).map(|chunk| chunk.to_vec()).collect()
    } else {
        panic!("This partition is not possible, check the number of partitions passed")
    }
}
3128
/// Splits `vector` into segments at every occurrence of `at`; each
/// occurrence starts a new segment (so every segment after the first
/// begins with `at`). If the vector starts with `at`, the first segment
/// is empty.
///
/// # Panics
/// Panics when `at` does not occur in `vector`.
pub fn split_vector_at<T>(vector: &Vec<T>, at: T) -> Vec<Vec<T>>
where
    T: std::cmp::PartialEq + Copy + std::clone::Clone,
{
    if !vector.contains(&at) {
        panic!("The value is not in the vector, please check");
    }
    let mut segments = vec![];
    let mut start = 0;
    for (idx, item) in vector.iter().enumerate() {
        if *item == at {
            // close the current segment just before the separator
            segments.push(vector[start..idx].to_vec());
            start = idx;
        }
    }
    // trailing segment (from the last separator to the end)
    segments.push(vector[start..].to_vec());
    segments
}
3152
/// Concatenates two matrices.
///
/// * `"wide"` — appends each row of `matrix2` to the matching row of
///   `matrix1` (requires equal row counts).
/// * `"long"` — stacks all rows of `matrix1` above all rows of `matrix2`
///   (requires equal column counts, checked against the first rows).
///
/// Fixes a defect in the previous "long" branch, which iterated over
/// `matrix2`'s row count while indexing `matrix1` — dropping rows when
/// `matrix1` was longer and panicking when it was shorter.
///
/// # Panics
/// Panics on a dimension mismatch or an unknown `how`.
pub fn join_matrix<T: Copy>(
    matrix1: &Vec<Vec<T>>,
    matrix2: &Vec<Vec<T>>,
    how: &str,
) -> Vec<Vec<T>> {
    match how {
        "wide" => {
            if matrix1.len() == matrix2.len() {
                // single zip pass instead of the previous O(rows^2 * cols)
                // nested scan with an index-equality check
                matrix1
                    .iter()
                    .zip(matrix2.iter())
                    .map(|(left, right)| {
                        let mut row = left.clone();
                        row.extend(right.iter().copied());
                        row
                    })
                    .collect()
            } else {
                panic!("Please check the dimensions, # of rows are different");
            }
        }
        "long" => {
            if matrix1[0].len() == matrix2[0].len() {
                let mut output = matrix1.to_vec();
                output.extend(matrix2.iter().cloned());
                output
            } else {
                panic!("Please check the dimensions, # of columns are different");
            }
        }
        _ => panic!("Select either long or wide"),
    }
}
3206
/// Borrows every `String` cell of `data` as a `&str`, preserving shape.
/// The returned slices live as long as `data`.
pub fn make_matrix_string_literal<'a>(data: &'a Vec<Vec<String>>) -> Vec<Vec<&'a str>> {
    let borrowed: Vec<Vec<&str>> = data
        .iter()
        .map(|row| row.iter().map(|cell| cell.as_str()).collect())
        .collect();
    println!("> String converted to &str");
    borrowed
}
3218
3219pub fn head<T: std::clone::Clone + std::fmt::Debug>(data: &Vec<Vec<T>>, rows: usize) {
3220 if rows <= data.len() {
3225 let output = data[..rows].to_vec();
3226 print_a_matrix(&format!("First {} rows", rows), &output);
3227 } else {
3228 panic!("Data is nt that big, please check the numbers");
3229 }
3230}
3231
3232pub fn tail<T: std::clone::Clone + std::fmt::Debug>(data: &Vec<Vec<T>>, rows: usize) {
3233 if rows <= data.len() {
3238 let output = data[data.len() - rows..].to_vec();
3239 print_a_matrix(&format!("Last {} rows", rows), &output);
3240 } else {
3241 panic!("Data is nt that big, please check the numbers");
3242 }
3243}
3244
/// Transposes a row-major matrix into column-major order: output column `j`
/// collects element `j` of every input row.
///
/// Assumes every row has at least `data[0].len()` elements (panics
/// otherwise). An empty input returns an empty result (previously this
/// indexed `data[0]` and panicked).
pub fn row_to_columns_conversion<T: std::fmt::Debug + Copy>(data: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    if data.is_empty() {
        return vec![];
    }
    (0..data[0].len())
        .map(|j| data.iter().map(|row| row[j]).collect())
        .collect()
}
3259
/// Transposes `data`: element (i, j) of the input becomes element (j, i)
/// of the output. The column count is taken from the first row; rows are
/// assumed at least that long (panics on shorter rows or empty input).
pub fn columns_to_rows_conversion<T: std::fmt::Debug + Copy>(data: &Vec<Vec<T>>) -> Vec<Vec<T>> {
    (0..data[0].len())
        .map(|col| data.iter().map(|row| row[col]).collect())
        .collect()
}
3277
/// Prints a three-part comparison of two `DataMap`s to stdout:
/// (1) column counts per type, (2) column-name overlap per type,
/// (3) element-wise value comparison (via `compare_vectors`) for columns
/// present in both tables. Purely diagnostic — returns nothing and
/// mutates nothing.
pub fn datamap_comparision(table1: &DataMap, table2: &DataMap) {
    println!("\n********** Count comparision **********");
    // Column-name lists per type; DataMap keeps three keyed collections
    // (string / numerical / boolean) whose keys are &str.
    let string_columns1 = table1.string.keys().collect::<Vec<&&str>>();
    let string_columns2 = table2.string.keys().collect::<Vec<&&str>>();

    if string_columns1.len() == string_columns2.len() {
        println!("Number of String columns match");
    } else {
        println!(
            "Mismatch in count of String columns : Table 1 has {} ; while Table 2 has {:?}",
            string_columns1.len(),
            string_columns2.len()
        );
    }

    let numerical_columns1 = table1.numerical.keys().collect::<Vec<&&str>>();
    let numerical_columns2 = table2.numerical.keys().collect::<Vec<&&str>>();

    if numerical_columns1.len() == numerical_columns2.len() {
        println!("Number of Numerical columns match");
    } else {
        println!(
            "Mismatch in count of Numerical columns : Table 1 has {} while Table 2 has {:?}",
            numerical_columns1.len(),
            numerical_columns2.len()
        );
    }

    let boolean_columns1 = table1.boolean.keys().collect::<Vec<&&str>>();
    let boolean_columns2 = table2.boolean.keys().collect::<Vec<&&str>>();

    if boolean_columns1.len() == boolean_columns2.len() {
        println!("Number of Boolean columns match");
    } else {
        println!(
            "Mismatch in count of Boolean columns : Table 1 has {} while Table 2 has {:?}",
            boolean_columns1.len(),
            boolean_columns2.len()
        );
    }

    println!("\n********** Column name comparision **********");
    // `c` counts key hits in both directions; when it equals the combined
    // key count the two key sets are identical (order ignored).
    // NOTE(review): `mis_string1` holds table-2 keys absent from table 1
    // and vice versa — the numbering is the opposite of what the names
    // suggest, but the printed sentences come out right.
    let mut c = 0;
    let mut mis_string1 = vec![];
    let mut mis_string2 = vec![];
    for i in string_columns1.iter() {
        if string_columns2.contains(i) {
            c += 1;
        } else {
            mis_string2.push(i);
        }
    }

    for i in string_columns2.iter() {
        if string_columns1.contains(i) {
            c += 1;
        } else {
            mis_string1.push(i);
        }
    }
    if c == string_columns1.len() + string_columns2.len() {
        println!("String columns match (irrespective of order)");
    } else {
        if mis_string1.len() > 0 && mis_string2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in String ; Table 2 has {:?} missing in String",
                mis_string1, mis_string2
            );
        }
        if mis_string1.len() > 0 && mis_string2.len() == 0 {
            println!("Table 1 has {:?} missing in String", mis_string1);
        }
        if mis_string1.len() == 0 && mis_string2.len() > 0 {
            println!("Table 2 has {:?} missing in String", mis_string2);
        }
    }

    // same two-way scan for the numerical keys
    c = 0;
    let mut mis_numerical1 = vec![];
    let mut mis_numerical2 = vec![];
    for i in numerical_columns1.iter() {
        if numerical_columns2.contains(i) {
            c += 1;
        } else {
            mis_numerical2.push(i);
        }
    }

    for i in numerical_columns2.iter() {
        if numerical_columns1.contains(i) {
            c += 1;
        } else {
            mis_numerical1.push(i);
        }
    }
    if c == numerical_columns1.len() + numerical_columns2.len() {
        println!("Numerical columns match (irrespective of order)");
    } else {
        if mis_numerical1.len() > 0 && mis_numerical2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in Numerical ; Table 2 has {:?} missing in Numerical",
                mis_numerical1, mis_numerical2
            );
        }
        if mis_numerical1.len() > 0 && mis_numerical2.len() == 0 {
            println!("Table 1 has {:?} missing in Numerical", mis_numerical1);
        }
        if mis_numerical1.len() == 0 && mis_numerical2.len() > 0 {
            println!("Table 2 has {:?} missing in Numerical", mis_numerical2);
        }
    }

    // same two-way scan for the boolean keys
    c = 0;
    let mut mis_boolean1 = vec![];
    let mut mis_boolean2 = vec![];
    for i in boolean_columns1.iter() {
        if boolean_columns2.contains(i) {
            c += 1;
        } else {
            mis_boolean2.push(i);
        }
    }

    for i in boolean_columns2.iter() {
        if boolean_columns1.contains(i) {
            c += 1;
        } else {
            mis_boolean1.push(i);
        }
    }
    if c == boolean_columns1.len() + boolean_columns2.len() {
        println!("Boolean columns match (irrespective of order)");
    } else {
        if mis_boolean1.len() > 0 && mis_boolean2.len() > 0 {
            println!(
                "Table 1 has {:?} missing in Boolean ; Table 2 has {:?} missing in Boolean",
                mis_boolean1, mis_boolean2
            );
        }
        if mis_boolean1.len() > 0 && mis_boolean2.len() == 0 {
            println!("Table 1 has {:?} missing in Boolean", mis_boolean1);
        }
        if mis_boolean1.len() == 0 && mis_boolean2.len() > 0 {
            println!("Table 2 has {:?} missing in Boolean", mis_boolean2);
        }
    }

    println!("\n********** Value comparision (for the common columns) **********");
    // NOTE(review): `compare_vectors` is called twice per matching column
    // (once per tuple field) — the result could be computed once.
    // NOTE(review): `string_similarity` accumulates per-ELEMENT matches but
    // is compared against the COLUMN count below; this only reports "match"
    // for columns of length 1 per key — confirm intended semantics.
    let mut string_similarity = 0;
    let mut dissimilarity = vec![];
    for (k1, v1) in table1.string.iter() {
        for (k2, v2) in table2.string.iter() {
            if k1 == k2 {
                string_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if string_similarity == table1.string.len() {
        println!("The string values matchs, if present");
    } else {
        println!("Dissimilar in String at (Table 1, Table 2): ");
        // NOTE(review): `dissimilarity[n]` is matched against the n-th key
        // of table 1's full key list — this only lines up when every
        // table-1 column found a partner in table 2; verify.
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.string.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }

    // same pairwise scan over the numerical columns
    let mut numerical_similarity = 0;
    dissimilarity = vec![];
    for (k1, v1) in table1.numerical.iter() {
        for (k2, v2) in table2.numerical.iter() {
            if k1 == k2 {
                numerical_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if numerical_similarity == table1.numerical.len() {
        println!("The numerical values matchs, if present");
    } else {
        println!("Dissimilar in Numerical at (Table 1, Table 2): ");
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.numerical.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }

    // same pairwise scan over the boolean columns
    let mut boolean_similarity = 0;
    dissimilarity = vec![];
    for (k1, v1) in table1.boolean.iter() {
        for (k2, v2) in table2.boolean.iter() {
            if k1 == k2 {
                boolean_similarity += compare_vectors(&v1, &v2).0;
                dissimilarity.push(compare_vectors(&v1, &v2).1);
            }
        }
    }
    if boolean_similarity == table1.boolean.len() {
        println!("The Boolean values matchs, if present");
    } else {
        println!("Dissimilar in boolean at (Table 1, Table 2): ");
        let _ = dissimilarity
            .iter()
            .enumerate()
            .map(|(n, a)| {
                println!(
                    "{:?} : {:?}",
                    table1.boolean.keys().collect::<Vec<&&str>>()[n],
                    a
                );
                a.clone()
            })
            .collect::<Vec<Vec<(usize, usize)>>>();
    }
}
3520
3521pub fn dataframe_comparision(table1: &DataFrame, table2: &DataFrame) {
3522 println!("\n********** Count comparision **********");
3527 if table1.string.len() == table2.string.len() {
3528 println!("String columns count : {:?}", table1.string.len(),);
3529 } else {
3530 println!(
3531 "String columns count are not the same {:?} and {:?}",
3532 table1.string.len(),
3533 table2.string.len()
3534 );
3535 }
3536
3537 if table1.numerical.len() == table2.numerical.len() {
3538 println!("Numerical columns count : {:?}", table1.numerical.len(),);
3539 } else {
3540 println!(
3541 "Numerical columns count are not the same {:?} and {:?}",
3542 table1.numerical.len(),
3543 table2.numerical.len(),
3544 );
3545 }
3546
3547 if table1.boolean.len() == table2.boolean.len() {
3548 println!("Boolean columns count : {:?}", table1.boolean.len(),);
3549 } else {
3550 println!(
3551 "Boolean columns count are not the same {:?} and {:?}",
3552 table1.boolean.len(),
3553 table2.boolean.len()
3554 );
3555 }
3556
3557 println!("\n********** Value comparision (for the common columns) **********");
3558 let mut string_similarity = 0;
3559 let mut dissimilarity = vec![];
3560 for (ni, i) in table1.string.iter().enumerate() {
3561 for (nj, j) in i.iter().enumerate() {
3562 for (nk, k) in table2.string.iter().enumerate() {
3563 for (nl, l) in k.iter().enumerate() {
3564 if nj == nl && nk == ni {
3565 if j == l {
3566 string_similarity += 1;
3567 } else {
3568 dissimilarity.push(((ni, nj), (nk, nl)));
3569 }
3570 }
3571 }
3572 }
3573 }
3574 }
3575 if string_similarity == table1.string[0].len() * table1.string.len() {
3576 println!("The string values matchs, if present");
3577 } else {
3578 println!("Dissimilar in String at :");
3579 let _ = dissimilarity
3580 .iter()
3581 .enumerate()
3582 .map(|(n, a)| {
3583 println!("{:?} : {:?}", n, a);
3584 *a
3585 })
3586 .collect::<Vec<((usize, usize), (usize, usize))>>();
3587 }
3588
3589 let mut numerical_similarity = 0;
3590 let mut dissimilarity = vec![];
3591 for (ni, i) in table1.numerical.iter().enumerate() {
3592 for (nj, j) in i.iter().enumerate() {
3593 for (nk, k) in table2.numerical.iter().enumerate() {
3594 for (nl, l) in k.iter().enumerate() {
3595 if nj == nl && nk == ni {
3596 if j == l {
3597 numerical_similarity += 1;
3598 } else {
3599 dissimilarity.push(((ni, nj), (nk, nl)));
3600 }
3601 }
3602 }
3603 }
3604 }
3605 }
3606 if numerical_similarity == table1.numerical[0].len() * table1.numerical.len() {
3607 println!("The numerical values matchs, if present");
3608 } else {
3609 println!("Dissimilar in Numerical at :");
3610 let _ = dissimilarity
3611 .iter()
3612 .enumerate()
3613 .map(|(n, a)| {
3614 println!("{:?} : {:?}", n, a);
3615 *a
3616 })
3617 .collect::<Vec<((usize, usize), (usize, usize))>>();
3618 }
3619
3620 let mut boolean_similarity = 0;
3621 let mut dissimilarity = vec![];
3622 for (ni, i) in table1.boolean.iter().enumerate() {
3623 for (nj, j) in i.iter().enumerate() {
3624 for (nk, k) in table2.boolean.iter().enumerate() {
3625 for (nl, l) in k.iter().enumerate() {
3626 if nj == nl && nk == ni {
3627 if j == l {
3628 boolean_similarity += 1;
3629 } else {
3630 dissimilarity.push(((ni, nj), (nk, nl)));
3631 }
3632 }
3633 }
3634 }
3635 }
3636 }
3637 if boolean_similarity == table1.boolean[0].len() * table1.boolean.len() {
3638 println!("The boolean values matchs, if present");
3639 } else {
3640 println!("Dissimilar in Boolean at :");
3641 let _ = dissimilarity
3642 .iter()
3643 .enumerate()
3644 .map(|(n, a)| {
3645 println!("{:?} : {:?}", n, a);
3646 *a
3647 })
3648 .collect::<Vec<((usize, usize), (usize, usize))>>();
3649 }
3650}
3651
/// Element-wise comparison of two vectors at matching indices.
///
/// Returns `(matching_position_count, differing_positions)`; each differing
/// position is reported as an `(index, index)` pair with both members equal,
/// mirroring the original output shape. Positions beyond the shorter vector
/// are ignored. Single zip pass — O(n) instead of the previous O(n^2)
/// nested scan that filtered on index equality.
pub fn compare_vectors<T: std::cmp::PartialEq>(
    v1: &Vec<T>,
    v2: &Vec<T>,
) -> (usize, Vec<(usize, usize)>) {
    let mut similarity = 0;
    let mut dissimilarity = vec![];
    for (n, (a, b)) in v1.iter().zip(v2.iter()).enumerate() {
        if a == b {
            similarity += 1;
        } else {
            dissimilarity.push((n, n));
        }
    }
    (similarity, dissimilarity)
}
3671
3672use std::collections::BTreeMap;
/// A pair of strings to be compared by the fuzzy-matching helpers in the
/// accompanying `impl` (character overlap, positional overlap, n-gram
/// subset scoring).
pub struct StringToMatch {
    pub string1: String, // first string of the pair
    pub string2: String, // second string of the pair
}
3717
impl StringToMatch {
    /// Weighted similarity score: averages the character-presence score
    /// (weighted by `weightage_for_presence`) and the positional score
    /// (weighted by `weightage_for_position`), each scaled to 0-100,
    /// then divides the sum by 2.
    pub fn compare_percentage(
        &self,
        weightage_for_position: f64,
        weightage_for_presence: f64,
    ) -> f64 {
        ((StringToMatch::compare_chars(&self) * weightage_for_presence * 100.)
            + (StringToMatch::compare_position(&self) * weightage_for_position * 100.))
            / 2.
    }

    /// Lowercases `s1` and keeps only bytes that are ASCII digits (48-57),
    /// ASCII lowercase letters (97-122), or in the range 128-200 (parts of
    /// multi-byte UTF-8 sequences).
    // NOTE(review): the byte filter can split a multi-byte character, in
    // which case `from_utf8(...).unwrap()` panics — confirm inputs are
    // effectively ASCII/latin.
    pub fn clean_string(s1: String) -> String {
        let this = s1.to_lowercase();

        let this_byte: Vec<_> = this
            .as_bytes()
            .iter()
            .filter(|a| {
                (**a > 47 && **a < 58) || (**a > 96 && **a < 123) || (**a > 127 && **a < 201)
            })
            .map(|a| *a)
            .collect();
        let new_this = std::str::from_utf8(&this_byte[..]).unwrap();
        new_this.to_string()
    }

    // Cleans the string (see `clean_string`) and returns its characters.
    fn char_vector(string1: String) -> Vec<char> {
        let string1 = StringToMatch::clean_string(string1.clone());
        string1.chars().collect()
    }

    // Normalizes a raw match count by the length of the LONGER of the two
    // character vectors, yielding a 0..=1 ratio.
    fn calculate(actual: f64, v1: &Vec<char>, v2: &Vec<char>) -> f64 {
        let larger = if v1.len() > v2.len() {
            v1.len()
        } else {
            v2.len()
        };
        (actual / larger as f64)
    }

    /// Fraction (0..=1) of `string1`'s cleaned characters that occur
    /// anywhere in `string2` (duplicates in `string1` count repeatedly).
    pub fn compare_chars(&self) -> f64 {
        let mut output = 0.;
        let vec1 = StringToMatch::char_vector(self.string1.clone());
        let vec2 = StringToMatch::char_vector(self.string2.clone());

        for i in vec1.iter() {
            if vec2.contains(i) {
                output += 1.;
            }
        }
        StringToMatch::calculate(output, &vec1, &vec2)
    }
    /// Fraction (0..=1) of positions (up to the shorter cleaned string)
    /// where both strings hold the same character, normalized by the
    /// LONGER string's length.
    pub fn compare_position(&self) -> f64 {
        let mut output = 0.;
        let vec1 = StringToMatch::char_vector(self.string1.clone());
        let vec2 = StringToMatch::char_vector(self.string2.clone());

        let combined: Vec<_> = vec1.iter().zip(vec2.iter()).collect();

        for (i, j) in combined.iter() {
            if i == j {
                output += 1.;
            }
        }
        StringToMatch::calculate(output, &vec1, &vec2)
    }

    /// Scores (0..=100) how well the shorter cleaned string is contained
    /// in the longer one: 100 for an exact substring, otherwise the
    /// fraction of shared `n_gram`-sized chunks relative to the smaller
    /// chunk set. Prints which string was checked inside which.
    pub fn fuzzy_subset(&self, n_gram: usize) -> f64 {
        let match_percentage;
        let vec1 = StringToMatch::clean_string(self.string1.clone());
        let vec2 = StringToMatch::clean_string(self.string2.clone());

        // shorter cleaned string becomes the subset, longer the superset
        let mut subset = vec2.clone();
        let mut superset = vec1.clone();
        if vec1.len() < vec2.len() {
            subset = vec1;
            superset = vec2;
        }

        let mut chunck_match_count = 0.;

        if superset.contains(&subset) {
            match_percentage = 100.
        } else {
            let superset_n = StringToMatch::n_gram(&superset, n_gram);
            let subset_n = StringToMatch::n_gram(&subset, n_gram);
            for i in subset_n.iter() {
                if superset_n.contains(i) {
                    chunck_match_count += 1.;
                }
            }
            let smaller = if superset_n.len() < subset_n.len() {
                superset_n.len()
            } else {
                subset_n.len()
            };
            match_percentage = (chunck_match_count / smaller as f64) * 100.
        }

        println!("{:?} in {:?}", subset, superset);
        match_percentage
    }

    // Returns the distinct `window_size`-byte slices of `string`, stepping
    // a full window at a time from each start index.
    // NOTE(review): mixes char positions with byte slicing/`string.len()`
    // (bytes) — only safe for ASCII; also `string.len() - 1` underflows on
    // an empty string, and the `< len - 1` bound drops trailing grams.
    // Confirm before use on arbitrary input.
    fn n_gram<'a>(string: &'a str, window_size: usize) -> Vec<&'a str> {
        let vector: Vec<_> = string.chars().collect();
        let mut output = vec![];
        for (mut n, _) in vector.iter().enumerate() {
            while n + window_size < string.len() - 1 {
                output.push(&string[n..n + window_size]);
                n = n + window_size;
            }
        }
        unique_values(&output)
    }

    /// Splits a string into (digits, alphabetics-and-spaces) by byte value;
    /// also echoes the raw byte vector to stdout.
    // NOTE(review): bytes 128-200 are kept in the "alphabets" part — for
    // multi-byte UTF-8 input the `from_utf8(...).unwrap()` below may panic
    // if a sequence is split; confirm inputs.
    pub fn split_alpha_numericals(string: String) -> (String, String) {
        let bytes: Vec<_> = string.as_bytes().to_vec();
        let numbers: Vec<_> = bytes.iter().filter(|a| **a < 58 && **a > 47).collect();
        println!("{:?}", bytes);
        let aplhabets: Vec<_> = bytes
            .iter()
            .filter(|a| {
                (**a > 64 && **a < 91) || (**a > 96 && **a < 123) || (**a > 127 && **a < 201) || (**a == 32) })
            .collect();

        (
            String::from_utf8(numbers.iter().map(|a| **a).collect()).unwrap(),
            String::from_utf8(aplhabets.iter().map(|a| **a).collect()).unwrap(),
        )
    }

    /// Case-insensitive character frequency of `string` as an ordered map
    /// (char -> count). Internally builds per-char vectors of 1s and sums
    /// them.
    pub fn char_count(string: String) -> BTreeMap<char, u32> {
        let mut count: BTreeMap<char, Vec<i32>> = BTreeMap::new();
        let vector: Vec<_> = string.to_lowercase().chars().collect();

        // seed one (empty) bucket per distinct character
        for i in vector.iter() {
            count.insert(*i, vec![]);
        }
        // push a 1 into the bucket for every occurrence
        let mut new_count: BTreeMap<char, Vec<i32>> = BTreeMap::new();
        for (k, _) in count.iter() {
            let mut values = vec![];
            for i in vector.iter() {
                if i == k {
                    values.push(1);
                }
            }
            new_count.insert(*k, values);
        }

        // collapse each bucket into its length
        let mut output = BTreeMap::new();
        for (k, v) in new_count.iter() {
            output.insert(*k, v.iter().fold(0, |a, b| a as u32 + *b as u32));
        }

        output
    }

    /// Most frequent character of `string` (case-insensitive). Ties are
    /// resolved in favor of the earliest character in `BTreeMap` order;
    /// returns '-' for an empty string.
    pub fn frequent_char(string: String) -> char {
        let dict = StringToMatch::char_count(string);
        let mut value = 0;
        let mut key = '-';
        for (k, _) in dict.iter() {
            key = match dict.get_key_value(k) {
                Some((x, y)) => {
                    if *y > value {
                        value = *y;
                        *x
                    } else {
                        key
                    }
                }
                _ => panic!("Please check the input!!"),
            };
        }
        key
    }

    /// Replaces occurrences of `find` in `string` with `replace`.
    /// `operation` is "all" (every occurrence) or "first" (first only).
    // Works at the byte level: the string is split at the first byte of
    // `find` (via `split_vector_at`), each later segment starts with the
    // found character, and that leading byte is swapped for `replace`.
    // NOTE(review): assumes `find` is a single-byte (ASCII) char — the
    // `[1..]` slice drops exactly one byte.
    ///
    /// # Panics
    /// Panics when `find` is absent or `operation` is unknown.
    pub fn char_replace(string: String, find: char, replace: String, operation: &str) -> String {
        if string.contains(find) {
            let string_utf8 = string.as_bytes().to_vec();
            let find_utf8 = find.to_string().as_bytes().to_vec();
            let replace_utf8 = replace.as_bytes().to_vec();
            let split = split_vector_at(&string_utf8, find_utf8[0]);
            let split_vec: Vec<_> = split
                .iter()
                .map(|a| String::from_utf8(a.to_vec()).unwrap())
                .collect();
            let mut new_string_vec = vec![];
            if operation == "all" {
                for (n, _) in split_vec.iter().enumerate() {
                    if n > 0 {
                        // drop the leading `find` byte, prepend `replace`
                        let x = split_vec[n][1..].to_string();
                        new_string_vec.push(format!(
                            "{}{}",
                            String::from_utf8(replace_utf8.clone()).unwrap(),
                            x.clone()
                        ));
                    } else {
                        new_string_vec.push(split_vec[n].clone());
                    }
                }
            } else {
                if operation == "first" {
                    for (n, _) in split_vec.iter().enumerate() {
                        if n == 1 {
                            let x = split_vec[n][1..].to_string();

                            new_string_vec.push(format!(
                                "{}{}",
                                String::from_utf8(replace_utf8.clone()).unwrap(),
                                x.clone()
                            ));
                        } else {
                            new_string_vec.push(split_vec[n].clone());
                        }
                    }
                } else {
                    panic!("Either pass operation as `all` or `first`");
                }
            }
            new_string_vec.concat()
        } else {
            panic!("The character to replace does not exist in the string passed, please check!")
        }
    }
}
4001
/// Splits `string` into its ASCII vowels and ASCII consonants (both cases),
/// preserving order; every other character is dropped.
///
/// Rewritten from raw byte-value comparisons (97, 101, 105, ...) to
/// readable char-based filters — same output: non-ASCII characters were
/// excluded by the old byte ranges and are excluded here too.
pub fn extract_vowels_consonants(string: String) -> (Vec<char>, Vec<char>) {
    const VOWELS: &str = "aeiouAEIOU";
    let vowels: Vec<char> = string.chars().filter(|c| VOWELS.contains(*c)).collect();
    let consonants: Vec<char> = string
        .chars()
        .filter(|c| c.is_ascii_alphabetic() && !VOWELS.contains(*c))
        .collect();
    (vowels, consonants)
}
4052
/// Lowercases `string`, then capitalizes the first character of every
/// space-separated word ("hello world" -> "Hello World").
///
/// Fixes the previous byte arithmetic (`b[0] - 32`), which corrupted words
/// starting with a digit or symbol, panicked on consecutive spaces (empty
/// word), and broke multi-byte first characters. `char::to_uppercase` is a
/// no-op on non-letters, and empty words pass through unchanged.
pub fn sentence_case(string: String) -> String {
    string
        .to_lowercase()
        .split(' ')
        .map(|word| {
            let mut chars = word.chars();
            match chars.next() {
                Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
                None => String::new(),
            }
        })
        .collect::<Vec<String>>()
        .join(" ")
}
4072
/// Removes common English stop words (NLTK-style list, in both lowercase
/// and Capitalized variants) from a space-separated sentence and returns
/// the remaining words joined by single spaces.
///
/// Matching is exact and case-sensitive — e.g. "THE" is NOT removed.
/// The list is collected into a `HashSet` so each word costs one O(1)
/// lookup instead of a linear scan over the ~360-entry list.
pub fn remove_stop_words(string: String) -> String {
    let stop_words: std::collections::HashSet<&str> = vec![
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've",
        "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "she's", "her", "hers", "herself", "it", "it's", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
        "that", "that'll", "these", "those", "am", "is", "are", "was", "were", "be", "been",
        "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
        "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during", "before", "after",
        "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
        "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
        "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m", "o",
        "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't",
        "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn",
        "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan",
        "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't",
        "wouldn", "wouldn't", "I", "Me", "My", "Myself", "We", "Our", "Ours", "Ourselves",
        "You", "You're", "You've", "You'll", "You'd", "Your", "Yours", "Yourself", "Yourselves",
        "He", "Him", "His", "Himself", "She", "She's", "Her", "Hers", "Herself", "It", "It's",
        "Its", "Itself", "They", "Them", "Their", "Theirs", "Themselves", "What", "Which",
        "Who", "Whom", "This", "That", "That'll", "These", "Those", "Am", "Is", "Are", "Was",
        "Were", "Be", "Been", "Being", "Have", "Has", "Had", "Having", "Do", "Does", "Did",
        "Doing", "A", "An", "The", "And", "But", "If", "Or", "Because", "As", "Until", "While",
        "Of", "At", "By", "For", "With", "About", "Against", "Between", "Into", "Through",
        "During", "Before", "After", "Above", "Below", "To", "From", "Up", "Down", "In", "Out",
        "On", "Off", "Over", "Under", "Again", "Further", "Then", "Once", "Here", "There",
        "When", "Where", "Why", "How", "All", "Any", "Both", "Each", "Few", "More", "Most",
        "Other", "Some", "Such", "No", "Nor", "Not", "Only", "Own", "Same", "So", "Than",
        "Too", "Very", "S", "T", "Can", "Will", "Just", "Don", "Don't", "Should", "Should've",
        "Now", "D", "Ll", "M", "O", "Re", "Ve", "Y", "Ain", "Aren", "Aren't", "Couldn",
        "Couldn't", "Didn", "Didn't", "Doesn", "Doesn't", "Hadn", "Hadn't", "Hasn", "Hasn't",
        "Haven", "Haven't", "Isn", "Isn't", "Ma", "Mightn", "Mightn't", "Mustn", "Mustn't",
        "Needn", "Needn't", "Shan", "Shan't", "Shouldn", "Shouldn't", "Wasn", "Wasn't",
        "Weren", "Weren't", "Won", "Won't", "Wouldn", "Wouldn't",
    ]
    .into_iter()
    .collect();
    string
        .split(' ')
        .filter(|word| !stop_words.contains(word))
        .map(String::from)
        .collect::<Vec<String>>()
        .join(" ")
}
4449
/// Splits `string` into whitespace-separated words, then further splits
/// each token on every separator in `symbol`; empty fragments (leading,
/// trailing, or doubled separators) are dropped.
///
/// Fixes a defect in the previous version: `split(j).collect::<String>()`
/// concatenated the fragments back into ONE token (merely deleting the
/// separator) instead of producing one token per fragment.
pub fn tokenize<'a>(string: String, symbol: &Vec<&'a str>) -> Vec<String> {
    let mut tokens: Vec<String> = string.split(' ').map(|w| w.to_string()).collect();
    for sym in symbol.iter() {
        let mut next_pass = vec![];
        for tok in tokens.iter() {
            if tok.contains(*sym) {
                // one token per non-empty fragment
                next_pass.extend(
                    tok.split(*sym)
                        .filter(|part| !part.is_empty())
                        .map(|part| part.to_string()),
                );
            } else {
                next_pass.push(tok.clone());
            }
        }
        tokens = next_pass;
    }
    tokens
}
4475
/// Autocorrelation of `ts` at the given `lag`.
///
/// Prints a hint when |acf| > 0.5 for a non-zero lag.
///
/// # Errors
/// Returns `Err` when the accumulated denominator is exactly 0.
pub fn acf(ts: &Vec<f64>, lag: usize) -> Result<f64, std::io::Error> {
    let mean = mean(ts);
    let mut numerator = 0.;
    let mut denominator = 0.;
    // NOTE(review): `ts.len() - lag` underflows (panics) when lag > ts.len().
    // The `i > lag` guard also skips the i == lag term while the loop stops
    // `lag` elements early, and the denominator is summed over the same
    // truncated window — textbook ACF sums i in [lag, n) and divides by the
    // full-series variance. Confirm this windowing is intentional before
    // relying on the values.
    for i in 0..ts.len() - lag {
        if i > lag {
            numerator += (ts[i] - mean) * (ts[i - lag] - mean);
            denominator += (ts[i] - mean) * (ts[i] - mean);
        }
    }
    match denominator {
        // exact float comparison: any non-zero denominator takes this arm
        x if x != 0. => {
            if ((numerator / denominator).abs() > 0.5) && (lag != 0) {
                print!("At {:?} lag the series seems to be correlated\t", lag)
            }
            Ok(numerator / denominator)
        }
        _ => Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "Denominator is 0!",
        )),
    }
}
4534
/// Simple moving average of `ts` with window size `lag`, front-padded
/// (via `pad_with_zero`) with `lag` zeros.
pub fn simple_ma(ts: &Vec<f64>, lag: usize) -> Vec<f64> {
    let mut output = vec![];
    // NOTE(review): only the first `lag` windows are produced because the
    // loop runs 0..lag rather than over the whole series — for
    // ts.len() > 2*lag the tail of the series is ignored. Confirm intent.
    for i in 0..lag {
        if lag + i <= ts.len() {
            let sub_ts = ts[i..lag + i].to_vec();
            // arithmetic mean of the current window
            output.push(sub_ts.iter().fold(0., |a, b| a + b) / sub_ts.len() as f64);
        }
    }
    pad_with_zero(&mut output, lag, "pre")
}
4545
/// Exponential moving average of `ts` with smoothing factor `alpha`.
/// Prints the in-sample mean squared error between the series (from index
/// 1) and the smoothed values, and returns the truncated smoothed series
/// passed through `pad_with_zero` (presumably front-padded with one zero
/// so entries line up as one-step forecasts — confirm against
/// `pad_with_zero`). Panics on an empty input (`ts[0]`).
pub fn exp_ma(ts: &Vec<f64>, alpha: f64) -> Vec<f64> {
    let mut output = vec![ts[0]];
    // classic EMA recurrence: out[n+1] = alpha * ts[n+1] + (1 - alpha) * out[n]
    for (n, i) in ts[1..].to_vec().iter().enumerate() {
        output.push(alpha * i + (1. - alpha) * output[n]);
    }
    // drop the last smoothed value, then pad one element at the front
    let exp_ma = pad_with_zero(&mut output[..ts.len() - 1].to_vec(), 1, "pre");
    // MSE between actuals (from index 1) and the aligned smoothed values
    let mse = mean(
        &ts[1..]
            .to_vec()
            .iter()
            .zip(output[..ts.len() - 1].to_vec().iter())
            .map(|(a, b)| (a - b) * (a - b))
            .collect(),
    );
    println!("Mean square error of this forecasting : {:?}", mse);
    exp_ma
}
4567
4568pub fn best_fit_line(x: &Vec<f64>, y: &Vec<f64>) -> (f64, f64) {
4569 let xy = x
4572 .iter()
4573 .zip(y.iter())
4574 .map(|a| a.0 * a.1)
4575 .collect::<Vec<f64>>();
4576 let xx = x
4577 .iter()
4578 .zip(x.iter())
4579 .map(|a| a.0 * a.1)
4580 .collect::<Vec<f64>>();
4581 let m = ((mean(x) * mean(y)) - mean(&xy)) / ((mean(x) * mean(x)) - mean(&xx));
4582
4583 let b = mean(y) - m * mean(x);
4584 (b, m)
4585}