Crate bodkin

Crate bodkin 

Source
Expand description

Bodkin is a library that provides a derive macro to generate Arrow integration code for Rust structs.

For example, given the following struct:

/// A plain user-defined struct with a single `u32` field.
pub struct Example {
 pub id: u32,
}

The derive macro will generate code similar to the following:

 
// The user writes this code.
/// A plain struct with a single `u32` field named `id`.
pub struct Example {
  pub id: u32,
}
 
// The derive macro generates the `ExampleArrow` struct and associated methods.
 
// Arrow works with arrays, so the generated struct holds one array for each field of the original struct.
/// Generated, array-based counterpart of `Example`.
pub struct ExampleArrow {
    /// Array of `u32` values; the generated methods read it from, and write it to, the
    /// record batch column named "id".
    pub ids: arrow_array::UInt32Array,
}
 
impl ExampleArrow {
    /// Convert an Arrow `RecordBatch` into an `ExampleArrow` struct, in a fallible way.
    ///
    /// # Errors
    ///
    /// Returns an error built with `bodkin::BodkinError::new` carrying the message
    /// "missing column 'id'" when the batch has no column named "id", or
    /// "invalid column 'id'" when that column cannot be downcast to
    /// `arrow_array::UInt32Array`.
    pub fn try_from_record_batch(
        batch: &arrow_array::RecordBatch,
    ) -> bodkin::Result<Self> {
        let ids = batch
            // Columns are looked up by the name of the original struct field.
            .column_by_name("id")
            .ok_or_else(|| bodkin::BodkinError::new("missing column 'id'".into()))?
            // The column is type-erased; recover the concrete array type expected for `id`.
            .as_any()
            .downcast_ref::<arrow_array::UInt32Array>()
            .ok_or_else(|| bodkin::BodkinError::new("invalid column 'id'".into()))?;
        // `ids` is only a reference at this point; clone it so the returned struct owns its array.
        bodkin::Result::Ok(ExampleArrow { ids: ids.clone() })
    }
}
impl ExampleArrow {
    /// Generate the Arrow schema for `Example`: a single `UInt32` field named "id".
    ///
    /// This is useful for creating Parquet or LanceDB tables.
    pub fn arrow_schema() -> arrow::datatypes::Schema {
        let fields = vec![
                arrow::datatypes::Field::new(
                    "id",
                    arrow::datatypes::DataType::UInt32,
                    // Third argument of `Field::new` is the nullable flag: `id` is declared non-nullable.
                    false,
                ),
            ];
        arrow::datatypes::Schema::new(fields)
    }
}
 
impl ExampleArrow {
    /// Convert a slice of `Example` values to an Arrow `RecordBatch`, in a fallible way.
    ///
    /// The batch is built against the schema returned by `Self::arrow_schema()` and
    /// contains one `UInt32` column holding the `id` of every item, in slice order.
    ///
    /// # Errors
    ///
    /// Propagates (via `?`) any error returned by `arrow_array::RecordBatch::try_new`.
    pub fn to_record_batch(
        items: &[Example],
    ) -> bodkin::Result<arrow_array::RecordBatch> {
        // NOTE(review): no `Array` trait method appears to be called in this particular
        // expansion; presumably the import is needed for other field types — confirm.
        use arrow_array::Array;
        let schema = Self::arrow_schema();
        // Gather the `id` of each item into a single column array.
        let ids = arrow_array::UInt32Array::from(
            items.iter().map(|item| item.id.clone()).collect::<Vec<_>>(),
        );
        // The single column corresponds to the single "id" field of the schema.
        let out = arrow_array::RecordBatch::try_new(
            std::sync::Arc::new(schema),
            vec![std::sync::Arc::new(ids)],
        )?;
        bodkin::Result::Ok(out)
    }
}

The following example shows user code that derives `ArrowIntegration` and calls the generated methods:

use bodkin::ArrowIntegration;
use std::slice;
 
/// Deriving `ArrowIntegration` generates the `ExampleArrow` companion type used in `main`.
#[derive(ArrowIntegration)]
pub struct Example {
  pub id: u32,
}
 
/// Round-trips one `Example` through the generated code: prints the schema, encodes the
/// value into a record batch, decodes the batch again, and checks that the id survived.
fn main() {
    // Show the schema derived from the struct definition.
    let schema = ExampleArrow::arrow_schema();
    println!("Generated schema: {:#?}", schema);

    // Encode a single value; `slice::from_ref` views it as a one-element slice without copying.
    let original = Example { id: 1 };
    let single_row = slice::from_ref(&original);
    let batch = ExampleArrow::to_record_batch(single_row)
        .expect("Failed to convert to record batch");
    println!("Generated record batch: {:#?}", batch);

    // Decode the batch and compare the first (and only) row with the input.
    let decoded = ExampleArrow::try_from_record_batch(&batch)
        .expect("Failed to read from record batch");
    let decoded_id = decoded.ids.value(0);
    assert_eq!(original.id, decoded_id);
}

Enums§

BodkinError
Error used internally by the Bodkin library.

Type Aliases§

Result
A specialized Result type to be used by the code generated by the ArrowIntegration derive macro.