Function gbdt::input::load

pub fn load(file_name: &str, input_format: InputFormat) -> Result<DataVec>

Load a data file using the given input format.

§Example

§LibSVM format

use gbdt::input::{self, InputFormat};

// LibSVM-style text file: one sample per line, space-delimited.
let test_file = "data/xgb_binary_logistic/agaricus.txt.test";
let mut fmt = InputFormat::txt_format();
fmt.set_feature_size(126);
fmt.set_delimeter(' ');
let test_data = input::load(test_file, fmt);

§CSV format

use gbdt::input::{self, InputFormat};

// Comma-separated file with 34 feature columns.
let test_file = "data/xgb_multi_softmax/dermatology.data.test";
let mut fmt = InputFormat::csv_format();
fmt.set_feature_size(34);
let test_data = input::load(test_file, fmt);
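
To tell the loader which column holds the label, use `set_label_index`, as the repository example examples/test-xgb-multi-softmax.rs below does for this same file. A minimal sketch:

let mut fmt = InputFormat::csv_format();
// As in examples/test-xgb-multi-softmax.rs: the label is in the 35th column
// (0-based index 34) of dermatology.data.test.
fmt.set_label_index(34);
let test_data = input::load(test_file, fmt);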

§Error

Returns an error if the file cannot be opened or parsed correctly.
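
A minimal sketch of handling the returned `Result` explicitly instead of calling `expect`, reusing the LibSVM test file from the example above:

use gbdt::input::{self, InputFormat};

let mut fmt = InputFormat::txt_format();
fmt.set_feature_size(126);
fmt.set_delimeter(' ');

// `load` returns a Result, so a missing or unreadable file can be handled
// without panicking.
match input::load("data/xgb_binary_logistic/agaricus.txt.test", fmt) {
    Ok(data) => println!("loaded {} samples", data.len()),
    Err(e) => eprintln!("failed to load data: {:?}", e),
}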

Examples found in repository
examples/test-iris.rs (line 27)
fn main() {
    let mut cfg = Config::new();
    cfg.set_feature_size(4);
    cfg.set_max_depth(4);
    cfg.set_iterations(100);
    cfg.set_shrinkage(0.1);
    cfg.set_loss("LAD");
    cfg.set_debug(true);
    cfg.set_training_optimization_level(2);

    // load data
    let train_file = "dataset/iris/train.txt";
    let test_file = "dataset/iris/test.txt";

    let mut input_format = InputFormat::csv_format();
    input_format.set_feature_size(4);
    input_format.set_label_index(4);
    let mut train_dv: DataVec =
        load(train_file, input_format).expect("failed to load training data");
    let test_dv: DataVec = load(test_file, input_format).expect("failed to load test data");

    // train and save the model
    let mut gbdt = GBDT::new(&cfg);
    gbdt.fit(&mut train_dv);
    gbdt.save_model("gbdt.model")
        .expect("failed to save the model");

    // load the model and do inference
    let model = GBDT::load_model("gbdt.model").expect("failed to load the model");
    let predicted: PredVec = model.predict(&test_dv);

    assert_eq!(predicted.len(), test_dv.len());
    let mut correct = 0;
    let mut wrong = 0;
    for i in 0..predicted.len() {
        if almost_equal_thrs(test_dv[i].label, predicted[i], 0.0001) {
            correct += 1;
        } else {
            wrong += 1;
        };
        println!("[{}]  {}  {}", i, test_dv[i].label, predicted[i]);
    }

    println!("correct: {}", correct);
    println!("wrong:   {}", wrong);

    assert!(wrong <= 2);
}
More examples
examples/test-multithreads.rs (line 24)
fn main() {
    let thread_num = 12;
    let feature_size = 36;
    let model_path = "xgb-data/xgb_reg_linear/gbdt.model";
    let test_file = "xgb-data/xgb_reg_linear/machine.txt.test";

    // load model
    let gbdt = GBDT::from_xgboost_dump(model_path, "reg:linear").expect("failed to load model");

    // load test data
    let mut fmt = input::InputFormat::txt_format();
    fmt.set_feature_size(feature_size);
    fmt.set_delimeter(' ');
    let mut test_data = input::load(test_file, fmt).unwrap();

    // split test data to `thread_num` vectors.
    let t1 = Instant::now();
    let mut handles = vec![];
    let mut test_data_vec = vec![];
    let data_size = test_data.len();
    let batch_size = (data_size - 1) / thread_num + 1;
    for one_batch in test_data.chunks(batch_size) {
        test_data_vec.push(one_batch.to_vec())
    }

    test_data.clear();
    test_data.shrink_to_fit();
    let t2 = Instant::now();
    println!("split data: {:?}", t2 - t1);

    // Create `thread_num` threads. Call gbdt::predict in parallel
    let t1 = Instant::now();
    let gbdt_arc = Arc::new(gbdt);
    for data in test_data_vec.into_iter() {
        let gbdt_clone = Arc::clone(&gbdt_arc);
        let handle = thread::spawn(move || gbdt_clone.predict(&data));
        handles.push(handle)
    }

    // collect results
    let mut preds = Vec::with_capacity(data_size);
    for handle in handles {
        preds.append(&mut handle.join().unwrap());
    }

    let t2 = Instant::now();
    println!("predict data: {:?}", t2 - t1);
    assert_eq!(preds.len(), data_size);
}
examples/test-xgb-rank-pairwise.rs (line 21)
fn main() {
    // Call this command to convert xgboost model:
    // python examples/convert_xgboost.py xgb-data/xgb_rank_pairwise/xgb.model "rank:pairwise" xgb-data/xgb_rank_pairwise/gbdt.model
    // load model
    let gbdt = GBDT::from_xgboost_dump("xgb-data/xgb_rank_pairwise/gbdt.model", "rank:pairwise")
        .expect("failed to load model");

    // load test data
    let test_file = "xgb-data/xgb_rank_pairwise/mq2008.test";
    let mut input_format = input::InputFormat::txt_format();
    input_format.set_feature_size(47);
    input_format.set_delimeter(' ');
    let test_data = input::load(test_file, input_format).expect("failed to load test data");

    // inference
    println!("start prediction");
    let predicted: PredVec = gbdt.predict(&test_data);
    assert_eq!(predicted.len(), test_data.len());

    // compare to xgboost prediction results
    let predict_result = "xgb-data/xgb_rank_pairwise/pred.csv";

    let mut xgb_results = Vec::new();
    let file = File::open(predict_result).expect("failed to load pred.csv");
    let reader = BufReader::new(file);
    for line in reader.lines() {
        let text = line.expect("failed to read data from pred.csv");
        let value: ValueType = text.parse().expect("failed to parse data from pred.csv");
        xgb_results.push(value);
    }

    let mut max_diff: ValueType = -1.0;
    for (value1, value2) in predicted.iter().zip(xgb_results.iter()) {
        println!("{} {}", value1, value2);
        let diff = (value1 - value2).abs();
        if diff > max_diff {
            max_diff = diff;
        }
    }

    println!(
        "Compared to results from xgboost, max error is: {:.10}",
        max_diff
    );
    assert!(max_diff < 0.01);
}
examples/test-xgb-reg-linear.rs (line 22)
fn main() {
    // Use xg.py in xgb-data/xgb_reg_linear to generate a model and get prediction results from xgboost.
    // Call this command to convert xgboost model:
    // python examples/convert_xgboost.py xgb-data/xgb_reg_linear/xgb.model "reg:linear" xgb-data/xgb_reg_linear/gbdt.model
    // load model
    let gbdt = GBDT::from_xgboost_dump("xgb-data/xgb_reg_linear/gbdt.model", "reg:linear")
        .expect("failed to load model");

    // load test data
    let test_file = "xgb-data/xgb_reg_linear/machine.txt.test";
    let mut input_format = input::InputFormat::txt_format();
    input_format.set_feature_size(36);
    input_format.set_delimeter(' ');
    let test_data = input::load(test_file, input_format).expect("failed to load test data");

    // inference
    println!("start prediction");
    let predicted: PredVec = gbdt.predict(&test_data);
    assert_eq!(predicted.len(), test_data.len());

    // compare to xgboost prediction results
    let predict_result = "xgb-data/xgb_reg_linear/pred.csv";

    let mut xgb_results = Vec::new();
    let file = File::open(predict_result).expect("failed to load pred.csv");
    let reader = BufReader::new(file);
    for line in reader.lines() {
        let text = line.expect("failed to read data from pred.csv");
        let value: ValueType = text.parse().expect("failed to parse data from pred.csv");
        xgb_results.push(value);
    }

    let mut max_diff: ValueType = -1.0;
    for (value1, value2) in predicted.iter().zip(xgb_results.iter()) {
        println!("{} {}", value1, value2);
        let diff = (value1 - value2).abs();
        if diff > max_diff {
            max_diff = diff;
        }
    }

    println!(
        "Compared to results from xgboost, max error is: {:.10}",
        max_diff
    );
    assert!(max_diff < 0.01);
}
examples/test-agaricus-lepiota.rs (line 29)
fn main() {
    let mut cfg = Config::new();
    cfg.set_feature_size(22);
    cfg.set_max_depth(3);
    cfg.set_iterations(50);
    cfg.set_shrinkage(0.1);
    cfg.set_loss("LogLikelyhood");
    cfg.set_debug(true);
    //cfg.set_data_sample_ratio(0.8);
    //cfg.set_feature_sample_ratio(0.5);
    cfg.set_training_optimization_level(2);

    // load data
    let train_file = "dataset/agaricus-lepiota/train.txt";
    let test_file = "dataset/agaricus-lepiota/test.txt";

    let mut input_format = InputFormat::csv_format();
    input_format.set_feature_size(22);
    input_format.set_label_index(22);
    let mut train_dv: DataVec =
        load(train_file, input_format).expect("failed to load training data");
    let test_dv: DataVec = load(test_file, input_format).expect("failed to load test data");

    // train and save model
    let mut gbdt = GBDT::new(&cfg);
    gbdt.fit(&mut train_dv);
    gbdt.save_model("gbdt.model")
        .expect("failed to save the model");

    // load model and do inference
    let model = GBDT::load_model("gbdt.model").expect("failed to load the model");
    let predicted: PredVec = model.predict(&test_dv);

    assert_eq!(predicted.len(), test_dv.len());
    let mut correct = 0;
    let mut wrong = 0;
    for i in 0..predicted.len() {
        let label = if predicted[i] > 0.5 { 1.0 } else { -1.0 };
        if (test_dv[i].label - label).abs() < 0.0001 {
            correct += 1;
        } else {
            wrong += 1;
        };
        //println!("[{}]  {}  {}", i, test_dv[i].label, predicted[i]);
    }

    println!("correct: {}", correct);
    println!("wrong:   {}", wrong);

    let auc = AUC(&test_dv, &predicted, test_dv.len());
    println!("AUC: {}", auc);

    use gbdt::fitness::almost_equal;
    assert_eq!(wrong, 0);
    assert!(almost_equal(auc, 1.0));
}
examples/test-xgb-multi-softmax.rs (line 21)
fn main() {
    // Use xg.py in xgb-data/xgb_multi_softmax to generate a model and get prediction results from xgboost.
    // Call this command to convert xgboost model:
    // python examples/convert_xgboost.py xgb-data/xgb_multi_softmax/xgb.model "multi:softmax" xgb-data/xgb_multi_softmax/gbdt.model
    // load model
    let gbdt = GBDT::from_xgboost_dump("xgb-data/xgb_multi_softmax/gbdt.model", "multi:softmax")
        .expect("failed to load model");

    // load test data
    let test_file = "xgb-data/xgb_multi_softmax/dermatology.data.test";
    let mut input_format = input::InputFormat::csv_format();
    input_format.set_label_index(34);
    let test_data = input::load(test_file, input_format).expect("failed to load test data");

    // inference
    println!("start prediction");
    let (labels, _probs) = gbdt.predict_multiclass(&test_data, 6);
    assert_eq!(labels.len(), test_data.len());

    // compare to xgboost prediction results
    let predict_result = "xgb-data/xgb_multi_softmax/pred.csv";

    let mut xgb_results = Vec::new();
    let file = File::open(predict_result).expect("failed to load pred.csv");
    let reader = BufReader::new(file);
    for line in reader.lines() {
        let text = line.expect("failed to read data from pred.csv");
        let value: ValueType = text.parse().expect("failed to parse data from pred.csv");
        xgb_results.push(value);
    }

    let mut max_diff: ValueType = -1.0;
    for (value1, value2) in labels.iter().zip(xgb_results.iter()) {
        println!("{} {}", value1, value2);
        let diff = (*value1 as ValueType - *value2).abs();
        if diff > max_diff {
            max_diff = diff;
        }
    }

    println!(
        "Compared to results from xgboost, max error is: {:.10}",
        max_diff
    );
    assert!(max_diff < 0.01);
}