1#![allow(clippy::useless_conversion)]
2use crate::data_utils::{convert_array_type, DataConverterEnum};
3use ndarray_stats::MaybeNan;
4use num_traits::{Float, FromPrimitive, Num};
5use numpy::ndarray::ArrayView2;
6use numpy::ndarray::{concatenate, Axis};
7use numpy::PyReadonlyArray2;
8use pyo3::prelude::*;
9use scouter_profile::error::DataProfileError;
10use scouter_profile::{
11 compute_feature_correlations, DataProfile, FeatureProfile, NumProfiler, StringProfiler,
12};
13use scouter_types::DataType;
14use std::collections::BTreeMap;
15use std::collections::HashMap;
16use tracing::{debug, error, instrument};
17
/// Python-exposed profiler that holds one numeric and one string profiler.
///
/// The numeric profiler is used for numeric arrays and the string profiler for
/// string columns; when both kinds of data are present the two are combined
/// into a single `DataProfile`.
#[pyclass]
pub struct DataProfiler {
    /// Profiler used for numeric feature arrays.
    num_profiler: NumProfiler,
    /// Profiler used for string feature columns.
    string_profiler: StringProfiler,
}
23
24#[pymethods]
25#[allow(clippy::new_without_default)]
26impl DataProfiler {
27 #[new]
28 pub fn new() -> Self {
29 Self {
30 num_profiler: NumProfiler::default(),
31 string_profiler: StringProfiler::default(),
32 }
33 }
34
35 #[pyo3(signature = (data, data_type=None, bin_size=20, compute_correlations=false))]
36 #[instrument(skip_all)]
37 pub fn create_data_profile<'py>(
38 &mut self,
39 py: Python<'py>,
40 data: &Bound<'py, PyAny>,
41 data_type: Option<&DataType>,
42 bin_size: Option<usize>,
43 compute_correlations: Option<bool>,
44 ) -> Result<DataProfile, DataProfileError> {
45 debug!("Creating data profile");
46
47 let bin_size = bin_size.unwrap_or(20);
48 let compute_correlations = compute_correlations.unwrap_or(false);
49
50 let data_type = match data_type {
52 Some(data_type) => data_type,
53 None => {
54 let class = data.getattr("__class__")?;
55 let module = class.getattr("__module__")?.str()?.to_string();
56 let name = class.getattr("__name__")?.str()?.to_string();
57 let full_class_name = format!("{module}.{name}");
58
59 &DataType::from_module_name(&full_class_name)?
60 }
61 };
62
63 debug!("Converting data with type: {:?}", data_type);
64 let (num_features, num_array, dtype, string_features, string_vec) =
65 DataConverterEnum::convert_data(py, data_type, data)?;
66
67 if let Some(dtype) = dtype {
69 debug!("Data type detected for numeric data: {:?}", dtype);
70 if dtype == "float64" {
71 let read_array =
72 convert_array_type::<f64>(num_array.unwrap(), &dtype).map_err(|e| {
73 error!("Failed to convert numeric array: {}", e);
74 e
75 })?;
76
77 return self.create_data_profile_f64(
78 compute_correlations,
79 bin_size,
80 num_features,
81 Some(read_array),
82 string_features,
83 string_vec,
84 );
85 } else {
86 let read_array =
87 convert_array_type::<f32>(num_array.unwrap(), &dtype).map_err(|e| {
88 error!("Failed to convert numeric array: {}", e);
89 e
90 })?;
91 return self.create_data_profile_f32(
92 compute_correlations,
93 bin_size,
94 num_features,
95 Some(read_array),
96 string_features,
97 string_vec,
98 );
99 }
100 }
101
102 self.create_data_profile_f32(
103 compute_correlations,
104 bin_size,
105 num_features,
106 None,
107 string_features,
108 string_vec,
109 )
110 }
111}
112
113impl DataProfiler {
114 pub fn create_data_profile_f32(
115 &mut self,
116 compute_correlations: bool,
117 bin_size: usize,
118 numeric_features: Vec<String>,
119 numeric_array: Option<PyReadonlyArray2<f32>>,
120 string_features: Vec<String>,
121 string_array: Option<Vec<Vec<String>>>,
122 ) -> Result<DataProfile, DataProfileError> {
123 if !string_features.is_empty() && string_array.is_some() && numeric_array.is_none() {
124 let profile = self.string_profiler.process_string_array::<f32>(
125 string_array.unwrap(),
126 string_features,
127 compute_correlations,
128 )?;
129 Ok(profile)
130 } else if string_array.is_none() && numeric_array.is_some() && !numeric_features.is_empty()
131 {
132 let profile = self.num_profiler.process_num_array(
133 compute_correlations,
134 &numeric_array.unwrap().as_array(),
135 numeric_features,
136 bin_size,
137 )?;
138
139 Ok(profile)
140 } else {
141 let profile = self.process_string_and_num_array(
142 compute_correlations,
143 numeric_array.unwrap().as_array(),
144 string_array.unwrap(),
145 numeric_features,
146 string_features,
147 bin_size,
148 )?;
149
150 Ok(profile)
151 }
152 }
153
154 pub fn create_data_profile_f64(
155 &mut self,
156 compute_correlations: bool,
157 bin_size: usize,
158 numeric_features: Vec<String>,
159 numeric_array: Option<PyReadonlyArray2<f64>>,
160 string_features: Vec<String>,
161 string_array: Option<Vec<Vec<String>>>,
162 ) -> Result<DataProfile, DataProfileError> {
163 if !string_features.is_empty() && string_array.is_some() && numeric_array.is_none() {
164 let profile = self.string_profiler.process_string_array::<f32>(
165 string_array.unwrap(),
166 string_features,
167 compute_correlations,
168 )?;
169 Ok(profile)
170 } else if string_array.is_none() && numeric_array.is_some() && !numeric_features.is_empty()
171 {
172 let profile = self.num_profiler.process_num_array(
173 compute_correlations,
174 &numeric_array.unwrap().as_array(),
175 numeric_features,
176 bin_size,
177 )?;
178
179 Ok(profile)
180 } else {
181 debug!("Processing both string and numeric arrays");
182 let profile = self.process_string_and_num_array(
183 compute_correlations,
184 numeric_array.unwrap().as_array(),
185 string_array.unwrap(),
186 numeric_features,
187 string_features,
188 bin_size,
189 )?;
190
191 Ok(profile)
192 }
193 }
194
195 fn compute_correlations<F>(
196 &mut self,
197 numeric_array: ArrayView2<F>,
198 string_array: Vec<Vec<String>>,
199 numeric_features: Vec<String>,
200 string_features: Vec<String>,
201 ) -> Result<HashMap<String, HashMap<String, f32>>, DataProfileError>
202 where
203 F: Float
204 + MaybeNan
205 + FromPrimitive
206 + std::fmt::Display
207 + Sync
208 + Send
209 + Num
210 + Clone
211 + std::fmt::Debug
212 + 'static
213 + std::convert::Into<f64>,
214 <F as MaybeNan>::NotNan: Ord,
215 f64: From<F>,
216 <F as MaybeNan>::NotNan: Clone,
217 {
218 debug!("Creating Numeric Profile: Computing correlations");
219 let converted_array = self
220 .string_profiler
221 .convert_string_vec_to_num_array(&string_array, &string_features)?;
222
223 let converted_array = converted_array.mapv(|x| F::from(x).unwrap());
225
226 let concatenated_array = {
228 let numeric_array_view = numeric_array.view();
229 let converted_array_view = converted_array.view();
230 concatenate(Axis(1), &[numeric_array_view, converted_array_view])?
231 };
232
233 let mut features = numeric_features.clone();
235 features.append(&mut string_features.clone());
236
237 let correlations = compute_feature_correlations(&concatenated_array.view(), &features);
238 Ok(correlations)
239 }
240
241 #[instrument(skip_all)]
242 fn process_string_and_num_array<F>(
243 &mut self,
244 compute_correlations: bool,
245 numeric_array: ArrayView2<F>,
246 string_array: Vec<Vec<String>>,
247 numeric_features: Vec<String>,
248 string_features: Vec<String>,
249 bin_size: usize,
250 ) -> Result<DataProfile, DataProfileError>
251 where
252 F: Float
253 + MaybeNan
254 + FromPrimitive
255 + std::fmt::Display
256 + Sync
257 + Send
258 + Num
259 + Clone
260 + std::fmt::Debug
261 + 'static
262 + std::convert::Into<f64>,
263 <F as MaybeNan>::NotNan: Ord,
264 f64: From<F>,
265 <F as MaybeNan>::NotNan: Clone,
266 {
267 debug!("Creating String Profile");
268 let string_profiles = self
269 .string_profiler
270 .create_string_profile(&string_array, &string_features)?;
271
272 debug!("Creating Numeric Profile: Computing stats");
273 let num_profiles =
274 self.num_profiler
275 .compute_stats(&numeric_features, &numeric_array, &bin_size)?;
276
277 let correlations: Option<HashMap<String, HashMap<String, f32>>> = if compute_correlations {
278 match self.compute_correlations(
279 numeric_array,
280 string_array,
281 numeric_features.clone(),
282 string_features.clone(),
283 ) {
284 Ok(correlations) => Some(correlations),
285 Err(e) => {
286 error!("Failed to compute correlations: {}", e);
287 None
288 }
289 }
290 } else {
291 debug!("Creating Numeric Profile: Skipping correlations");
292 None
293 };
294
295 let mut features: BTreeMap<String, FeatureProfile> = string_profiles
296 .iter()
297 .map(|profile| {
298 let mut profile = profile.clone();
299
300 if let Some(correlations) = correlations.as_ref() {
301 let correlation = correlations.get(&profile.id);
302 if let Some(correlation) = correlation {
303 profile.add_correlations(correlation.clone());
304 }
305 }
306
307 (profile.id.clone(), profile)
308 })
309 .collect();
310
311 let num_features: BTreeMap<String, FeatureProfile> = num_profiles
312 .iter()
313 .map(|profile| {
314 let mut profile = profile.clone();
315
316 if let Some(correlations) = correlations.as_ref() {
317 let correlation = correlations.get(&profile.id);
318 if let Some(correlation) = correlation {
319 profile.add_correlations(correlation.clone());
320 }
321 }
322
323 (profile.id.clone(), profile)
324 })
325 .collect();
326
327 features.extend(num_features);
328
329 Ok(DataProfile { features })
330 }
331}