scouter_client/data_utils/
pandas.rs

1use crate::data_utils::types::DataTypes;
2use crate::data_utils::{ConvertedData, DataConverter};
3use crate::error::DataError;
4use pyo3::prelude::*;
5use tracing::{debug, instrument};
6
/// Stateless converter whose [`DataConverter`] implementation handles pandas `DataFrame` inputs.
7pub struct PandasDataConverter;
8
9impl DataConverter for PandasDataConverter {
10    #[instrument(skip_all)]
11    fn categorize_features<'py>(
12        _py: Python<'py>,
13        data: &Bound<'py, PyAny>,
14    ) -> Result<DataTypes, DataError> {
15        let column_name_dtype = data
16            .getattr("columns")?
17            .getattr("dtype")?
18            .str()?
19            .to_string();
20
21        if !column_name_dtype.contains("object") {
22            return Err(DataError::ColumnNamesMustBeStrings);
23        }
24
25        let all_columns = data.getattr("columns")?.extract::<Vec<String>>()?;
26
27        // get integer and float columns
28        let integer_columns = data
29            .call_method1("select_dtypes", ("integer",))?
30            .getattr("columns")?
31            .extract::<Vec<String>>()?;
32
33        let float_columns = data
34            .call_method1("select_dtypes", ("float",))?
35            .getattr("columns")?
36            .extract::<Vec<String>>()?;
37
38        let non_numeric_columns: Vec<String> = all_columns
39            .iter()
40            .filter(|col| !float_columns.contains(col) && !integer_columns.contains(col))
41            .cloned()
42            .collect();
43
44        debug!("Non-numeric columns: {:?}", non_numeric_columns);
45
46        // Introducing specific numeric types because we may want to handle them differently at a later point
47        Ok(DataTypes::new(
48            integer_columns,
49            float_columns,
50            non_numeric_columns,
51        ))
52    }
53
54    fn process_numeric_features<'py>(
55        data: &Bound<'py, PyAny>,
56        data_types: &DataTypes,
57    ) -> Result<(Option<Bound<'py, PyAny>>, Option<String>), DataError> {
58        if data_types.numeric_features.is_empty() {
59            return Ok((None, None));
60        }
61
62        // if mixed type is true, it assumes we are at least dealing with float and integer types. we will need to convert all to float64
63        let array = if data_types.has_mixed_types() {
64            data.get_item(&data_types.numeric_features)?
65                .call_method1("astype", ("float64",))?
66                .call_method0("to_numpy")?
67        } else {
68            data.get_item(&data_types.numeric_features)?
69                .call_method0("to_numpy")?
70        };
71
72        let dtype = Some(array.getattr("dtype")?.str()?.to_string());
73
74        //
75
76        Ok((Some(array), dtype))
77    }
78
79    #[allow(clippy::needless_lifetimes)]
80    fn process_string_features<'py>(
81        data: &Bound<'py, PyAny>,
82        features: &[String],
83    ) -> Result<Option<Vec<Vec<String>>>, DataError> {
84        if features.is_empty() {
85            return Ok(None);
86        }
87
88        let string_cols = data
89            .get_item(features)?
90            .call_method1("astype", ("str",))?
91            .getattr("values")?
92            .getattr("T")?
93            .call_method0("tolist")?;
94        let string_array = string_cols.extract::<Vec<Vec<String>>>()?;
95
96        Ok(Some(string_array))
97    }
98
99    #[instrument(skip_all)]
100    fn prepare_data<'py>(
101        py: Python<'py>,
102        data: &Bound<'py, PyAny>,
103    ) -> Result<ConvertedData<'py>, DataError> {
104        let data_types = PandasDataConverter::categorize_features(py, data)?;
105
106        let (numeric_array, dtype) =
107            PandasDataConverter::process_numeric_features(data, &data_types)?;
108
109        let string_array =
110            PandasDataConverter::process_string_features(data, &data_types.string_features)?;
111
112        Ok((
113            data_types.numeric_features,
114            numeric_array,
115            dtype,
116            data_types.string_features,
117            string_array,
118        ))
119    }
120}