scouter_profile/profile/
string_profiler.rs1use crate::error::DataProfileError;
2use crate::profile::types::DataProfile;
3use crate::profile::types::{CharStats, Distinct, FeatureProfile, StringStats, WordStats};
4use crate::stats::compute_feature_correlations;
5use chrono::Utc;
6use ndarray::Array2;
7use rayon::prelude::*;
8use scouter_types::create_feature_map;
9use std::collections::BTreeMap;
10use std::collections::HashMap;
11
/// Stateless profiler for string (categorical/text) feature columns.
///
/// The type carries no data; every method derives its result purely from
/// its arguments.
pub struct StringProfiler;
13
14impl StringProfiler {
15 pub fn new() -> Self {
16 StringProfiler {}
17 }
18
19 pub fn process_string_array<F>(
20 &self,
21 string_array: Vec<Vec<String>>,
22 string_features: Vec<String>,
23 compute_correlations: bool,
24 ) -> Result<DataProfile, DataProfileError> {
25 let profiles = self.create_string_profile(&string_array, &string_features)?;
26
27 let correlations: Option<HashMap<String, HashMap<String, f32>>> = if compute_correlations {
28 let converted_array =
29 self.convert_string_vec_to_num_array(&string_array, &string_features)?;
30
31 let correlations =
32 compute_feature_correlations(&converted_array.view(), &string_features);
33 Some(correlations)
34 } else {
35 None
36 };
37
38 let features: BTreeMap<String, FeatureProfile> = profiles
39 .iter()
40 .map(|profile| {
41 let mut profile = profile.clone();
42
43 if let Some(correlations) = correlations.as_ref() {
44 let correlation = correlations.get(&profile.id);
45 if let Some(correlation) = correlation {
46 profile.add_correlations(correlation.clone());
47 }
48 }
49
50 (profile.id.clone(), profile)
51 })
52 .collect();
53
54 Ok(DataProfile { features })
55 }
56
57 pub fn convert_string_vec_to_num_array(
58 &self,
59 string_array: &[Vec<String>],
60 string_features: &[String],
61 ) -> Result<Array2<f32>, DataProfileError> {
62 let feature_map = create_feature_map(string_features, string_array)?;
63
64 let arrays = string_array
66 .par_iter()
67 .enumerate()
68 .map(|(i, col)| {
69 let map = feature_map.features.get(&string_features[i]).unwrap();
70
71 let col = col
73 .iter()
74 .map(|x| *map.get(x).unwrap_or(map.get("missing").unwrap()) as f32)
75 .collect::<Vec<_>>();
76 Array2::from_shape_vec((col.len(), 1), col).unwrap()
77 })
78 .collect::<Vec<_>>();
79
80 let num_array = ndarray::concatenate(
81 ndarray::Axis(1),
82 &arrays.iter().map(|a| a.view()).collect::<Vec<_>>(),
83 )?;
84
85 Ok(num_array)
86 }
87
88 pub fn create_string_profile(
98 &self,
99 string_array: &[Vec<String>],
100 string_features: &[String],
101 ) -> Result<Vec<FeatureProfile>, DataProfileError> {
102 let string_profiler = StringProfiler::new();
103 let string_profile = string_profiler.compute_2d_stats(string_array, string_features)?;
104
105 Ok(string_profile)
106 }
107
108 pub fn compute_stats(&self, array: &Vec<String>) -> Result<StringStats, DataProfileError> {
118 let mut unique = HashMap::new();
119
120 let count = array.len();
121 let mut lengths = Vec::new();
122
123 for item in array {
124 *unique.entry(item).or_insert(0) += 1;
125 lengths.push(item.chars().count());
126 }
127
128 let unique_count = unique.len();
130
131 lengths.sort();
133 let median = lengths[lengths.len() / 2];
134
135 let char_stats = CharStats {
136 min_length: lengths[0],
137 max_length: lengths[lengths.len() - 1],
138 mean_length: lengths.iter().sum::<usize>() as f64 / count as f64,
139 median_length: median,
140 };
141
142 let mut word_stats = HashMap::new();
144 for (key, value) in unique.iter() {
145 word_stats.insert(
146 key.to_string(),
147 Distinct {
148 count: *value,
149 percent: *value as f64 / count as f64,
150 },
151 );
152 }
153
154 let string_stats = StringStats {
155 distinct: Distinct {
156 count: unique_count,
157 percent: (unique_count as f64 / count as f64) * 100.0,
158 },
159 char_stats,
160 word_stats: WordStats { words: word_stats },
161 };
162
163 Ok(string_stats)
164 }
165
166 pub fn compute_2d_stats(
167 &self,
168 array: &[Vec<String>],
169 string_features: &[String],
170 ) -> Result<Vec<FeatureProfile>, DataProfileError> {
171 let map_vec = array
174 .par_iter()
175 .enumerate()
176 .map(|(i, col)| {
177 let feature = &string_features[i];
178 let stats = self.compute_stats(col)?;
179
180 Ok(FeatureProfile {
181 id: feature.to_string(),
182 string_stats: Some(stats),
183 numeric_stats: None,
184 timestamp: Utc::now(),
185 correlations: None,
186 })
187 })
188 .collect::<Result<Vec<FeatureProfile>, DataProfileError>>()?;
189
190 Ok(map_vec)
191 }
192}
193
194impl Default for StringProfiler {
195 fn default() -> Self {
196 StringProfiler::new()
197 }
198}
199
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a list of string literals into an owned column.
    fn column(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    /// Two input columns with two feature names must yield two profiles.
    #[test]
    fn test_compute_stats() {
        let columns = vec![
            column(&["hello", "world", "world"]),
            column(&["blah", "foo", "world"]),
        ];
        let feature_names = ["feature1".to_string(), "feature2".to_string()];

        let profiles = StringProfiler::new()
            .compute_2d_stats(&columns, &feature_names)
            .unwrap();

        assert_eq!(profiles.len(), 2);
    }
}