1#![allow(clippy::useless_conversion)]
2use crate::data_utils::{convert_array_type, DataConverterEnum};
3use ndarray_stats::MaybeNan;
4use num_traits::{Float, FromPrimitive, Num};
5use numpy::ndarray::ArrayView2;
6use numpy::ndarray::{concatenate, Axis};
7use numpy::PyReadonlyArray2;
8use pyo3::prelude::*;
9use scouter_profile::error::DataProfileError;
10use scouter_profile::{
11    compute_feature_correlations, DataProfile, FeatureProfile, NumProfiler, StringProfiler,
12};
13use scouter_types::DataType;
14use std::collections::BTreeMap;
15use std::collections::HashMap;
16use tracing::{debug, error, instrument};
17
/// Python-exposed profiler that owns one numeric profiler and one string profiler
/// and delegates to them when building a `DataProfile`.
#[pyclass]
pub struct DataProfiler {
    // Used for numeric arrays (`process_num_array`, `compute_stats`).
    num_profiler: NumProfiler,
    // Used for string columns (`process_string_array`, `create_string_profile`,
    // `convert_string_vec_to_num_array`).
    string_profiler: StringProfiler,
}
23
24#[pymethods]
25#[allow(clippy::new_without_default)]
26impl DataProfiler {
27    #[new]
28    pub fn new() -> Self {
29        Self {
30            num_profiler: NumProfiler::default(),
31            string_profiler: StringProfiler::default(),
32        }
33    }
34
35    #[pyo3(signature = (data, data_type=None, bin_size=20, compute_correlations=false))]
36    #[instrument(skip_all)]
37    pub fn create_data_profile<'py>(
38        &mut self,
39        py: Python<'py>,
40        data: &Bound<'py, PyAny>,
41        data_type: Option<&DataType>,
42        bin_size: Option<usize>,
43        compute_correlations: Option<bool>,
44    ) -> Result<DataProfile, DataProfileError> {
45        debug!("Creating data profile");
46
47        let bin_size = bin_size.unwrap_or(20);
48        let compute_correlations = compute_correlations.unwrap_or(false);
49
50        let data_type = match data_type {
52            Some(data_type) => data_type,
53            None => {
54                let class = data.getattr("__class__")?;
55                let module = class.getattr("__module__")?.str()?.to_string();
56                let name = class.getattr("__name__")?.str()?.to_string();
57                let full_class_name = format!("{module}.{name}");
58
59                &DataType::from_module_name(&full_class_name)?
60            }
61        };
62
63        debug!("Converting data with type: {:?}", data_type);
64        let (num_features, num_array, dtype, string_features, string_vec) =
65            DataConverterEnum::convert_data(py, data_type, data)?;
66
67        if let Some(dtype) = dtype {
69            debug!("Data type detected for numeric data: {:?}", dtype);
70            if dtype == "float64" {
71                let read_array =
72                    convert_array_type::<f64>(num_array.unwrap(), &dtype).map_err(|e| {
73                        error!("Failed to convert numeric array: {}", e);
74                        e
75                    })?;
76
77                return self.create_data_profile_f64(
78                    compute_correlations,
79                    bin_size,
80                    num_features,
81                    Some(read_array),
82                    string_features,
83                    string_vec,
84                );
85            } else {
86                let read_array =
87                    convert_array_type::<f32>(num_array.unwrap(), &dtype).map_err(|e| {
88                        error!("Failed to convert numeric array: {}", e);
89                        e
90                    })?;
91                return self.create_data_profile_f32(
92                    compute_correlations,
93                    bin_size,
94                    num_features,
95                    Some(read_array),
96                    string_features,
97                    string_vec,
98                );
99            }
100        }
101
102        self.create_data_profile_f32(
103            compute_correlations,
104            bin_size,
105            num_features,
106            None,
107            string_features,
108            string_vec,
109        )
110    }
111}
112
113impl DataProfiler {
114    pub fn create_data_profile_f32(
115        &mut self,
116        compute_correlations: bool,
117        bin_size: usize,
118        numeric_features: Vec<String>,
119        numeric_array: Option<PyReadonlyArray2<f32>>,
120        string_features: Vec<String>,
121        string_array: Option<Vec<Vec<String>>>,
122    ) -> Result<DataProfile, DataProfileError> {
123        if !string_features.is_empty() && string_array.is_some() && numeric_array.is_none() {
124            let profile = self.string_profiler.process_string_array::<f32>(
125                string_array.unwrap(),
126                string_features,
127                compute_correlations,
128            )?;
129            Ok(profile)
130        } else if string_array.is_none() && numeric_array.is_some() && !numeric_features.is_empty()
131        {
132            let profile = self.num_profiler.process_num_array(
133                compute_correlations,
134                &numeric_array.unwrap().as_array(),
135                numeric_features,
136                bin_size,
137            )?;
138
139            Ok(profile)
140        } else {
141            let profile = self.process_string_and_num_array(
142                compute_correlations,
143                numeric_array.unwrap().as_array(),
144                string_array.unwrap(),
145                numeric_features,
146                string_features,
147                bin_size,
148            )?;
149
150            Ok(profile)
151        }
152    }
153
154    pub fn create_data_profile_f64(
155        &mut self,
156        compute_correlations: bool,
157        bin_size: usize,
158        numeric_features: Vec<String>,
159        numeric_array: Option<PyReadonlyArray2<f64>>,
160        string_features: Vec<String>,
161        string_array: Option<Vec<Vec<String>>>,
162    ) -> Result<DataProfile, DataProfileError> {
163        if !string_features.is_empty() && string_array.is_some() && numeric_array.is_none() {
164            let profile = self.string_profiler.process_string_array::<f32>(
165                string_array.unwrap(),
166                string_features,
167                compute_correlations,
168            )?;
169            Ok(profile)
170        } else if string_array.is_none() && numeric_array.is_some() && !numeric_features.is_empty()
171        {
172            let profile = self.num_profiler.process_num_array(
173                compute_correlations,
174                &numeric_array.unwrap().as_array(),
175                numeric_features,
176                bin_size,
177            )?;
178
179            Ok(profile)
180        } else {
181            debug!("Processing both string and numeric arrays");
182            let profile = self.process_string_and_num_array(
183                compute_correlations,
184                numeric_array.unwrap().as_array(),
185                string_array.unwrap(),
186                numeric_features,
187                string_features,
188                bin_size,
189            )?;
190
191            Ok(profile)
192        }
193    }
194
195    fn compute_correlations<F>(
196        &mut self,
197        numeric_array: ArrayView2<F>,
198        string_array: Vec<Vec<String>>,
199        numeric_features: Vec<String>,
200        string_features: Vec<String>,
201    ) -> Result<HashMap<String, HashMap<String, f32>>, DataProfileError>
202    where
203        F: Float
204            + MaybeNan
205            + FromPrimitive
206            + std::fmt::Display
207            + Sync
208            + Send
209            + Num
210            + Clone
211            + std::fmt::Debug
212            + 'static
213            + std::convert::Into<f64>,
214        <F as MaybeNan>::NotNan: Ord,
215        f64: From<F>,
216        <F as MaybeNan>::NotNan: Clone,
217    {
218        debug!("Creating Numeric Profile: Computing correlations");
219        let converted_array = self
220            .string_profiler
221            .convert_string_vec_to_num_array(&string_array, &string_features)?;
222
223        let converted_array = converted_array.mapv(|x| F::from(x).unwrap());
225
226        let concatenated_array = {
228            let numeric_array_view = numeric_array.view();
229            let converted_array_view = converted_array.view();
230            concatenate(Axis(1), &[numeric_array_view, converted_array_view])?
231        };
232
233        let mut features = numeric_features.clone();
235        features.append(&mut string_features.clone());
236
237        let correlations = compute_feature_correlations(&concatenated_array.view(), &features);
238        Ok(correlations)
239    }
240
241    #[instrument(skip_all)]
242    fn process_string_and_num_array<F>(
243        &mut self,
244        compute_correlations: bool,
245        numeric_array: ArrayView2<F>,
246        string_array: Vec<Vec<String>>,
247        numeric_features: Vec<String>,
248        string_features: Vec<String>,
249        bin_size: usize,
250    ) -> Result<DataProfile, DataProfileError>
251    where
252        F: Float
253            + MaybeNan
254            + FromPrimitive
255            + std::fmt::Display
256            + Sync
257            + Send
258            + Num
259            + Clone
260            + std::fmt::Debug
261            + 'static
262            + std::convert::Into<f64>,
263        <F as MaybeNan>::NotNan: Ord,
264        f64: From<F>,
265        <F as MaybeNan>::NotNan: Clone,
266    {
267        debug!("Creating String Profile");
268        let string_profiles = self
269            .string_profiler
270            .create_string_profile(&string_array, &string_features)?;
271
272        debug!("Creating Numeric Profile: Computing stats");
273        let num_profiles =
274            self.num_profiler
275                .compute_stats(&numeric_features, &numeric_array, &bin_size)?;
276
277        let correlations: Option<HashMap<String, HashMap<String, f32>>> = if compute_correlations {
278            match self.compute_correlations(
279                numeric_array,
280                string_array,
281                numeric_features.clone(),
282                string_features.clone(),
283            ) {
284                Ok(correlations) => Some(correlations),
285                Err(e) => {
286                    error!("Failed to compute correlations: {}", e);
287                    None
288                }
289            }
290        } else {
291            debug!("Creating Numeric Profile: Skipping correlations");
292            None
293        };
294
295        let mut features: BTreeMap<String, FeatureProfile> = string_profiles
296            .iter()
297            .map(|profile| {
298                let mut profile = profile.clone();
299
300                if let Some(correlations) = correlations.as_ref() {
301                    let correlation = correlations.get(&profile.id);
302                    if let Some(correlation) = correlation {
303                        profile.add_correlations(correlation.clone());
304                    }
305                }
306
307                (profile.id.clone(), profile)
308            })
309            .collect();
310
311        let num_features: BTreeMap<String, FeatureProfile> = num_profiles
312            .iter()
313            .map(|profile| {
314                let mut profile = profile.clone();
315
316                if let Some(correlations) = correlations.as_ref() {
317                    let correlation = correlations.get(&profile.id);
318                    if let Some(correlation) = correlation {
319                        profile.add_correlations(correlation.clone());
320                    }
321                }
322
323                (profile.id.clone(), profile)
324            })
325            .collect();
326
327        features.extend(num_features);
328
329        Ok(DataProfile { features })
330    }
331}