scouter_client/data_utils/
pandas.rs1use crate::data_utils::types::DataTypes;
2use crate::data_utils::{ConvertedData, DataConverter};
3use crate::error::DataError;
4use pyo3::prelude::*;
5use tracing::{debug, instrument};
6
7pub struct PandasDataConverter;
8
9impl DataConverter for PandasDataConverter {
10 #[instrument(skip_all)]
11 fn categorize_features<'py>(
12 _py: Python<'py>,
13 data: &Bound<'py, PyAny>,
14 ) -> Result<DataTypes, DataError> {
15 let column_name_dtype = data
16 .getattr("columns")?
17 .getattr("dtype")?
18 .str()?
19 .to_string();
20
21 if !column_name_dtype.contains("object") {
22 return Err(DataError::ColumnNamesMustBeStrings);
23 }
24
25 let all_columns = data.getattr("columns")?.extract::<Vec<String>>()?;
26
27 let integer_columns = data
29 .call_method1("select_dtypes", ("integer",))?
30 .getattr("columns")?
31 .extract::<Vec<String>>()?;
32
33 let float_columns = data
34 .call_method1("select_dtypes", ("float",))?
35 .getattr("columns")?
36 .extract::<Vec<String>>()?;
37
38 let non_numeric_columns: Vec<String> = all_columns
39 .iter()
40 .filter(|col| !float_columns.contains(col) && !integer_columns.contains(col))
41 .cloned()
42 .collect();
43
44 debug!("Non-numeric columns: {:?}", non_numeric_columns);
45
46 Ok(DataTypes::new(
48 integer_columns,
49 float_columns,
50 non_numeric_columns,
51 ))
52 }
53
54 fn process_numeric_features<'py>(
55 data: &Bound<'py, PyAny>,
56 data_types: &DataTypes,
57 ) -> Result<(Option<Bound<'py, PyAny>>, Option<String>), DataError> {
58 if data_types.numeric_features.is_empty() {
59 return Ok((None, None));
60 }
61
62 let array = if data_types.has_mixed_types() {
64 data.get_item(&data_types.numeric_features)?
65 .call_method1("astype", ("float64",))?
66 .call_method0("to_numpy")?
67 } else {
68 data.get_item(&data_types.numeric_features)?
69 .call_method0("to_numpy")?
70 };
71
72 let dtype = Some(array.getattr("dtype")?.str()?.to_string());
73
74 Ok((Some(array), dtype))
77 }
78
79 #[allow(clippy::needless_lifetimes)]
80 fn process_string_features<'py>(
81 data: &Bound<'py, PyAny>,
82 features: &[String],
83 ) -> Result<Option<Vec<Vec<String>>>, DataError> {
84 if features.is_empty() {
85 return Ok(None);
86 }
87
88 let string_cols = data
89 .get_item(features)?
90 .call_method1("astype", ("str",))?
91 .getattr("values")?
92 .getattr("T")?
93 .call_method0("tolist")?;
94 let string_array = string_cols.extract::<Vec<Vec<String>>>()?;
95
96 Ok(Some(string_array))
97 }
98
99 #[instrument(skip_all)]
100 fn prepare_data<'py>(
101 py: Python<'py>,
102 data: &Bound<'py, PyAny>,
103 ) -> Result<ConvertedData<'py>, DataError> {
104 let data_types = PandasDataConverter::categorize_features(py, data)?;
105
106 let (numeric_array, dtype) =
107 PandasDataConverter::process_numeric_features(data, &data_types)?;
108
109 let string_array =
110 PandasDataConverter::process_string_features(data, &data_types.string_features)?;
111
112 Ok((
113 data_types.numeric_features,
114 numeric_array,
115 dtype,
116 data_types.string_features,
117 string_array,
118 ))
119 }
120}