1use std::{collections::HashMap, fmt::Display, path::Path};
38
39use jenks::get_jenks_classification;
40use text_processing::Processing;
41use utilities::Classification;
42use vectorization::Vectorization;
43
44mod jenks;
45pub mod text_processing;
46mod utilities;
47pub mod vectorization;
48
/// Returns the most frequent value in `v` together with its number of occurrences.
///
/// Yields `None` when `v` is empty. When several values share the highest
/// count, the smallest of them is returned, so the result does not depend on
/// the (randomized) iteration order of the internal `HashMap`.
pub fn mode(v: &Vec<u64>) -> Option<(u64, i32)> {
    let mut frequencies: HashMap<u64, i32> = HashMap::new();
    for &value in v {
        *frequencies.entry(value).or_insert(0) += 1;
    }

    frequencies
        .into_iter()
        // Highest count wins; on equal counts the value comparison is reversed
        // so that the smallest value is treated as the maximum.
        .max_by(|a, b| a.1.cmp(&b.1).then_with(|| b.0.cmp(&a.0)))
}
/// Returns the arithmetic mean of `v`, or `None` when `v` is empty.
///
/// The sum is accumulated in `u128`, so large inputs cannot overflow `u64`
/// before the conversion to `f64`.
pub fn mean(v: &Vec<u64>) -> Option<f64> {
    if v.is_empty() {
        return None;
    }

    let sum: u128 = v.iter().map(|&value| u128::from(value)).sum();
    Some(sum as f64 / v.len() as f64)
}
74pub fn std_deviation(v: &Vec<u64>) -> Option<f64> {
76 match (mean(v), v.len()) {
77 (Some(data_mean), count) if count > 0 => {
78 let variance = v
79 .iter()
80 .map(|value| {
81 let diff = data_mean - (*value as f64);
82
83 diff * diff
84 })
85 .sum::<f64>()
86 / count as f64;
87 Some(variance.sqrt())
88 }
89 _ => None,
90 }
91}
92fn discrete_coefficient(v: &Vec<u64>) -> Option<f64> {
94 match (std_deviation(v), mean(v)) {
95 (Some(dev), Some(m)) => Some(dev / m),
96 _ => None,
97 }
98}
99pub struct CFDTCP<'a> {
100 v: Vectorization<'a>,
101 p: Processing,
102}
103
104impl<'a> CFDTCP<'a> {
105 pub fn new() -> CFDTCP<'a> {
106 let mut P = text_processing::Processing::new_file(Path::new(".stop_word.txt"));
107 P.set_ac();
108 let mut V = Vectorization::new();
109 Self { v: V, p: P }
110 }
111 pub fn centor(&mut self, c: String) -> &mut Self {
112 self.v.centor(self.p.parse(c));
113 self
114 }
115
116 pub fn list(&mut self, list: Vec<&'a str>) -> &mut Self {
117 self.v.list(list);
118 self
119 }
120
121 pub fn distribution(&mut self) -> Option<f64> {
122 discrete_coefficient(&self.v.get_dt())
123 }
124
125 pub fn mode(&mut self) -> Option<(u64, i32)> {
126 mode(&self.v.get_dt())
127 }
128
129 pub fn jenks_classify(&mut self, num_bins: usize) -> (usize, Classification) {
130 let mut c = Classification::new();
131 let mut n: usize = 0;
132 let mut min = f64::MAX;
133 let data = self.v.get_dt();
134 for num_bin in 1..num_bins {
135 let class = get_jenks_classification(num_bin, data);
136 let max_dev = CFDTCP::one_jenks_max_std_deviation(data, &class);
137 if max_dev < min {
139 min = max_dev;
140 n = num_bin;
141 c = class;
142 }
143 }
144 (n + 1, c)
145 }
146 fn one_jenks_max_std_deviation(dt: &Vec<u64>, res: &Classification) -> f64 {
147 let mut max = 0.0;
148 let start = 0;
149 for bin in res {
150 match std_deviation(&dt[start as usize..start + bin.count as usize].to_vec()) {
151 Some(v) => {
152 if v > max {
153 max = v
154 }
155 }
156 None => {}
157 }
158 }
159 max
160 }
161
162 pub fn get_dt(&mut self) -> &Vec<u64> {
163 self.v.get_dt()
164 }
165
166 pub fn clear(&mut self) {}
167}
168
169pub struct MyClassification(pub Classification);
170
171impl Display for MyClassification {
172 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173 let mut vec = Vec::new();
174 for bin in &self.0 {
175 vec.push(format!(
176 "start:{} end:{} count:{}",
177 bin.bin_start, bin.bin_end, bin.count
178 ));
179 }
180 write!(f, "{:?}", vec)
181 }
182}