scouter_profile/profile/
string_profiler.rs

1use crate::error::DataProfileError;
2use crate::profile::types::DataProfile;
3use crate::profile::types::{CharStats, Distinct, FeatureProfile, StringStats, WordStats};
4use crate::stats::compute_feature_correlations;
5use chrono::Utc;
6use ndarray::Array2;
7use rayon::prelude::*;
8use scouter_types::create_feature_map;
9use std::collections::BTreeMap;
10use std::collections::HashMap;
11
/// Stateless profiler for string-valued feature columns.
///
/// The struct carries no fields; all inputs are passed to its methods.
#[derive(Debug, Clone)]
pub struct StringProfiler {}
13
14impl StringProfiler {
15    pub fn new() -> Self {
16        StringProfiler {}
17    }
18
19    pub fn process_string_array<F>(
20        &self,
21        string_array: Vec<Vec<String>>,
22        string_features: Vec<String>,
23        compute_correlations: bool,
24    ) -> Result<DataProfile, DataProfileError> {
25        let profiles = self.create_string_profile(&string_array, &string_features)?;
26
27        let correlations: Option<HashMap<String, HashMap<String, f32>>> = if compute_correlations {
28            let converted_array =
29                self.convert_string_vec_to_num_array(&string_array, &string_features)?;
30
31            let correlations =
32                compute_feature_correlations(&converted_array.view(), &string_features);
33            Some(correlations)
34        } else {
35            None
36        };
37
38        let features: BTreeMap<String, FeatureProfile> = profiles
39            .iter()
40            .map(|profile| {
41                let mut profile = profile.clone();
42
43                if let Some(correlations) = correlations.as_ref() {
44                    let correlation = correlations.get(&profile.id);
45                    if let Some(correlation) = correlation {
46                        profile.add_correlations(correlation.clone());
47                    }
48                }
49
50                (profile.id.clone(), profile)
51            })
52            .collect();
53
54        Ok(DataProfile { features })
55    }
56
57    pub fn convert_string_vec_to_num_array(
58        &self,
59        string_array: &[Vec<String>],
60        string_features: &[String],
61    ) -> Result<Array2<f32>, DataProfileError> {
62        let feature_map = create_feature_map(string_features, string_array)?;
63
64        // zip and map string_array and string_features
65        let arrays = string_array
66            .par_iter()
67            .enumerate()
68            .map(|(i, col)| {
69                let map = feature_map.features.get(&string_features[i]).unwrap();
70
71                // attempt to set feature. If not found, set to missing
72                let col = col
73                    .iter()
74                    .map(|x| *map.get(x).unwrap_or(map.get("missing").unwrap()) as f32)
75                    .collect::<Vec<_>>();
76                Array2::from_shape_vec((col.len(), 1), col).unwrap()
77            })
78            .collect::<Vec<_>>();
79
80        let num_array = ndarray::concatenate(
81            ndarray::Axis(1),
82            &arrays.iter().map(|a| a.view()).collect::<Vec<_>>(),
83        )?;
84
85        Ok(num_array)
86    }
87
88    // Create a string profile for a 2D array of strings
89    //
90    // # Arguments
91    //
92    // * `string_array` - A 2D array of strings
93    // * `string_features` - A vector of feature names
94    // # Returns
95    //
96    // * `Vec<FeatureProfile>` - A vector of feature profiles
97    pub fn create_string_profile(
98        &self,
99        string_array: &[Vec<String>],
100        string_features: &[String],
101    ) -> Result<Vec<FeatureProfile>, DataProfileError> {
102        let string_profiler = StringProfiler::new();
103        let string_profile = string_profiler.compute_2d_stats(string_array, string_features)?;
104
105        Ok(string_profile)
106    }
107
108    // Compute the statistics for a string array
109    //
110    // # Arguments
111    //
112    // * `array` - A vector of strings
113    //
114    // # Returns
115    //
116    // * `StringStats` - A struct containing the statistics for the string array
117    pub fn compute_stats(&self, array: &Vec<String>) -> Result<StringStats, DataProfileError> {
118        let mut unique = HashMap::new();
119
120        let count = array.len();
121        let mut lengths = Vec::new();
122
123        for item in array {
124            *unique.entry(item).or_insert(0) += 1;
125            lengths.push(item.chars().count());
126        }
127
128        // unique count
129        let unique_count = unique.len();
130
131        // median
132        lengths.sort();
133        let median = lengths[lengths.len() / 2];
134
135        let char_stats = CharStats {
136            min_length: lengths[0],
137            max_length: lengths[lengths.len() - 1],
138            mean_length: lengths.iter().sum::<usize>() as f64 / count as f64,
139            median_length: median,
140        };
141
142        // need to get distinct for each word
143        let mut word_stats = HashMap::new();
144        for (key, value) in unique.iter() {
145            word_stats.insert(
146                key.to_string(),
147                Distinct {
148                    count: *value,
149                    percent: *value as f64 / count as f64,
150                },
151            );
152        }
153
154        let string_stats = StringStats {
155            distinct: Distinct {
156                count: unique_count,
157                percent: (unique_count as f64 / count as f64) * 100.0,
158            },
159            char_stats,
160            word_stats: WordStats { words: word_stats },
161        };
162
163        Ok(string_stats)
164    }
165
166    pub fn compute_2d_stats(
167        &self,
168        array: &[Vec<String>],
169        string_features: &[String],
170    ) -> Result<Vec<FeatureProfile>, DataProfileError> {
171        // zip the string features with the array
172
173        let map_vec = array
174            .par_iter()
175            .enumerate()
176            .map(|(i, col)| {
177                let feature = &string_features[i];
178                let stats = self.compute_stats(col)?;
179
180                Ok(FeatureProfile {
181                    id: feature.to_string(),
182                    string_stats: Some(stats),
183                    numeric_stats: None,
184                    timestamp: Utc::now(),
185                    correlations: None,
186                })
187            })
188            .collect::<Result<Vec<FeatureProfile>, DataProfileError>>()?;
189
190        Ok(map_vec)
191    }
192}
193
194impl Default for StringProfiler {
195    fn default() -> Self {
196        StringProfiler::new()
197    }
198}
199
#[cfg(test)]
mod tests {

    use super::*;

    /// Profiles two small columns and checks the ids, distinct counts,
    /// character-length statistics and a per-word count of the result.
    #[test]
    fn test_compute_stats() {
        let string_profiler = StringProfiler::new();
        let array = vec![
            vec![
                "hello".to_string(),
                "world".to_string(),
                "world".to_string(),
            ],
            vec!["blah".to_string(), "foo".to_string(), "world".to_string()],
        ];

        let stats = string_profiler
            .compute_2d_stats(&array, &["feature1".to_string(), "feature2".to_string()])
            .unwrap();

        assert_eq!(stats.len(), 2);

        // Profiles come back in column order, named by position.
        assert_eq!(stats[0].id, "feature1");
        assert_eq!(stats[1].id, "feature2");

        // feature1: "hello", "world", "world" -> two distinct values, all of length 5.
        let first = stats[0]
            .string_stats
            .as_ref()
            .expect("string profiles always carry string stats");
        assert_eq!(first.distinct.count, 2);
        assert_eq!(first.char_stats.min_length, 5);
        assert_eq!(first.char_stats.max_length, 5);
        assert_eq!(first.char_stats.median_length, 5);
        assert_eq!(
            first.word_stats.words.get("world").map(|word| word.count),
            Some(2)
        );

        // feature2: "blah", "foo", "world" -> three distinct values, lengths 3, 4, 5.
        let second = stats[1]
            .string_stats
            .as_ref()
            .expect("string profiles always carry string stats");
        assert_eq!(second.distinct.count, 3);
        assert_eq!(second.char_stats.min_length, 3);
        assert_eq!(second.char_stats.max_length, 5);
        assert_eq!(second.char_stats.median_length, 4);
    }
}