dendritic_datasets/
student_performance.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
use std::fs::File;

use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::errors::{ParquetError, Result};

use crate::utils::*;
use dendritic_ndarray::ndarray::NDArray;
use dendritic_preprocessing::standard_scalar::*;

/// Load schema for student performance data
pub fn load_student_schema() -> Schema {
    Schema::new(vec![
        Field::new("student_id", DataType::Float64, false),
        Field::new("age", DataType::Float64, false),
        Field::new("gender", DataType::Float64, false),
        Field::new("ethnicity", DataType::Float64, false),
        Field::new("parental_education", DataType::Float64, false),
        Field::new("study_time_weekly", DataType::Float64, false),
        Field::new("absences", DataType::Float64, false),
        Field::new("tutoring", DataType::Float64, false),
        Field::new("parental_support", DataType::Float64, false),
        Field::new("extra_cirricular", DataType::Float64, false),
        Field::new("sports", DataType::Float64, false),
        Field::new("music", DataType::Float64, false),
        Field::new("volunteering", DataType::Float64, false),
        Field::new("gpa", DataType::Float64, false),
        Field::new("grade_class", DataType::Float64, false),
    ])
}

/// Convert student performance data to parquet
/// Convert the student performance CSV file into parquet format.
///
/// Reads `data/student_performance.csv` and writes
/// `data/student_performance.parquet`, using the dataset schema from
/// [`load_student_schema`] to type the columns.
pub fn convert_student_csv_to_parquet() {
    csv_to_parquet(
        load_student_schema(),
        "data/student_performance.csv",
        "data/student_performance.parquet",
    );
}

/// Load student performance data from path
pub fn load_student_data() -> Result<(NDArray<f64>, NDArray<f64>)> {
    
    /* switch to datasets/data directory */
    let path = "data/student_performance.parquet";
    let file = File::open(path).unwrap();
    let mut reader = ParquetRecordBatchReaderBuilder::try_new(file)?
        .build()?;

    let batch = reader.next().unwrap().unwrap();
    let (input, y_train) = select_features(
        batch.clone(),
        vec![
            "age",
            "gender",
            "ethnicity",
            "parental_education",
            "study_time_weekly", 
            "absences",
            "tutoring",
            "parental_support", 
            "extra_cirricular",
            "sports",
            "music",
            "volunteering",
            "gpa",
        ],
        "grade_class"
    );

    let x_train = min_max_scalar(input).unwrap();
    Ok((x_train, y_train))

}