1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
//! GPU-resident column type.
//!
//! A [`Column`] owns GPU memory containing a single array of typed data,
//! optionally with a null bitmask. It is the fundamental building block
//! for GPU-accelerated DataFrame operations.
//!
//! # Examples
//!
//! ```rust,no_run
//! use cudf::Column;
//!
//! // Create from a Rust slice (copies data to GPU)
//! let col = Column::from_slice(&[1i32, 2, 3, 4, 5]).unwrap();
//! assert_eq!(col.len(), 5);
//!
//! // Read back to host
//! let data: Vec<i32> = col.to_vec().unwrap();
//! assert_eq!(data, vec![1, 2, 3, 4, 5]);
//! ```
use std::fmt;
use cxx::UniquePtr;
use crate::error::{CudfError, Result};
use crate::types::{DataType, TypeId, checked_i32};
/// An owning, GPU-resident column of typed data.
///
/// `Column` wraps a `std::unique_ptr<cudf::column>` on the C++ side.
/// Dropping a `Column` frees the associated GPU memory.
///
/// # Thread Safety
///
/// `Column` implements [`Send`] (can be moved between threads) but not
/// [`Sync`] (cannot be shared between threads without synchronization).
/// Use `Arc<Mutex<Column>>` if shared access is needed.
pub struct Column {
/// Owning `cxx::UniquePtr` handle to the C++-side column object.
/// Visible crate-wide (`pub(crate)`), not part of the public API.
pub(crate) inner: UniquePtr<cudf_cxx::column::ffi::OwnedColumn>,
}
// SAFETY: GPU memory is process-global, so a `Column` can be safely moved to another thread.
// CUDA operations must still be serialized per-stream, but transferring ownership of the
// column handle itself is safe.
// NOTE(review): this assumes the C++ `OwnedColumn` holds no thread-affine state — confirm
// against the cudf-cxx bridge. `Sync` is deliberately not implemented (see the type docs).
unsafe impl Send for Column {}
// Generates a `from_optional_*` constructor for one primitive element type.
//
// Arguments: the method name, the element type, the placeholder value sent for
// `None` entries, and the FFI constructor taking `(values, validity)`.
macro_rules! impl_from_optional {
    ($fn_name:ident, $ty:ty, $default:expr, $ffi_fn:path) => {
        /// Create a nullable column from optional values.
        ///
        /// `None` values become null entries in the GPU column.
        pub fn $fn_name(data: &[Option<$ty>]) -> Result<Self> {
            // Validate the length up front; the converted value itself is not needed.
            checked_i32(data.len())?;
            // One pass over the input: a placeholder stands in for each `None`,
            // and the parallel flag records whether the slot held a value.
            let (values, validity): (Vec<$ty>, Vec<bool>) = data
                .iter()
                .map(|slot| (slot.unwrap_or($default), slot.is_some()))
                .unzip();
            let inner = $ffi_fn(&values, &validity).map_err(CudfError::from_cxx)?;
            Ok(Self { inner })
        }
    };
}
// Expands to a `match` on `T::TYPE_ID` that reinterprets `$data` (a `&[T]`) as a
// slice of the concrete element type for the matching variant and passes it to the
// listed FFI constructor. Must be used inside a function that is generic over
// `T: CudfType` and returns `Result<_, CudfError>`: the expansion uses `?` and
// returns early with `InvalidArgument` for any type id that is not listed.
macro_rules! dispatch_from_slice {
    ($data:expr, $($variant:ident => $ty:ty, $ffi_fn:path);+ $(;)?) => {
        match T::TYPE_ID {
            $(
                TypeId::$variant => {
                    // SAFETY: T matches TYPE_ID by sealed CudfType trait; same size and repr.
                    let elems: &[$ty] = unsafe {
                        std::slice::from_raw_parts($data.as_ptr().cast::<$ty>(), $data.len())
                    };
                    $ffi_fn(elems).map_err(CudfError::from_cxx)?
                }
            )+
            _ => {
                return Err(CudfError::InvalidArgument(format!(
                    "from_slice does not support {:?}", T::TYPE_ID
                )));
            }
        }
    };
}
// Expands to a `match` on `T::TYPE_ID` that copies column data to host memory.
//
// Arguments: `$self_` (an expression with an `inner` FFI column handle), `$ptr`
// (a raw pointer to the output buffer), `$len` (element count), then a list of
// `Variant => rust_type, ffi_copy_fn;` entries.
//
// Listed variants fall through after a successful copy, leaving the caller to
// finalize its buffer. The `Bool8` arm and the fallback arm both `return` from
// the enclosing function, which must therefore return `Result<Vec<T>, CudfError>`.
macro_rules! dispatch_to_vec {
($self_:expr, $ptr:expr, $len:expr, $($variant:ident => $ty:ty, $ffi_fn:path);+ $(;)?) => {
match T::TYPE_ID {
$(TypeId::$variant => {
// SAFETY: this arm only runs when `T::TYPE_ID` is this variant, and the
// CudfType impls map each variant to exactly one Rust type, so `T` and `$ty`
// are the same type here. The caller must supply a `$ptr` that is valid for
// writes of `$len` elements and whose memory may be exposed as a slice.
let out = unsafe { std::slice::from_raw_parts_mut($ptr as *mut $ty, $len) };
$ffi_fn(&$self_.inner, out).map_err(CudfError::from_cxx)?;
})+
TypeId::Bool8 => {
// Read into a temporary u8 buffer, then convert to bool safely.
// Direct write to Vec<bool> memory is UB if GPU data contains values != 0/1.
let mut u8_buf = vec![0u8; $len];
cudf_cxx::column::ffi::column_to_u8(&$self_.inner, &mut u8_buf)
.map_err(CudfError::from_cxx)?;
let bools: Vec<bool> = u8_buf.into_iter().map(|v| v != 0).collect();
// SAFETY: T is guaranteed to be bool by the sealed CudfType trait
// (Bool8 maps only to bool). Vec<bool> and Vec<T=bool> have
// identical layout. We use manual deconstruction to avoid
// relying on transmute's layout guarantees for Vec.
let mut bools = std::mem::ManuallyDrop::new(bools);
let result = unsafe {
Vec::from_raw_parts(
bools.as_mut_ptr() as *mut T,
bools.len(),
bools.capacity(),
)
};
// Early return: the `$ptr` buffer is not used for the Bool8 path.
return Ok(result);
}
_ => {
return Err(CudfError::InvalidArgument(format!(
"to_vec does not yet support {:?}",
T::TYPE_ID
)));
}
}
};
}
impl Column {
// -- Accessors --
/// Number of elements in this column.
///
/// Reads the size reported by the underlying FFI column handle and converts
/// it to `usize`.
pub fn len(&self) -> usize {
self.inner.size() as usize
}
/// Whether this column has zero elements.
///
/// Equivalent to `self.len() == 0`.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the data type of this column.
///
/// This is the panicking convenience wrapper around
/// [`try_data_type`](Self::try_data_type).
///
/// # Panics
///
/// Panics if [`try_data_type`](Self::try_data_type) fails, e.g. when the
/// underlying libcudf column reports a type_id this crate does not recognize
/// (possible libcudf version mismatch). Call `try_data_type` directly to
/// handle that case as an error instead.
pub fn data_type(&self) -> DataType {
    match self.try_data_type() {
        Ok(dtype) => dtype,
        Err(e) => panic!("cudf: data_type() failed — {e}"),
    }
}
/// Returns the data type of this column without panicking.
///
/// The raw type id and scale are read from the FFI column. A non-zero scale is
/// treated as a decimal type and validated through `DataType::decimal`.
///
/// # Errors
///
/// Returns `CudfError::InvalidArgument` when the raw type id is not recognized
/// by `TypeId::from_raw`, or when `DataType::decimal` rejects the id/scale pair.
pub fn try_data_type(&self) -> Result<DataType> {
    let raw = self.inner.type_id();
    let Some(id) = TypeId::from_raw(raw) else {
        return Err(CudfError::InvalidArgument(format!(
            "unrecognized type_id {} from FFI — possible libcudf version mismatch",
            raw
        )));
    };

    let scale = self.inner.type_scale();
    if scale == 0 {
        // A zero scale carries no decimal parameters.
        return Ok(DataType::new(id));
    }

    DataType::decimal(id, scale).map_err(|e| {
        CudfError::InvalidArgument(format!(
            "invalid decimal type — type_id {:?} with scale {}: {}",
            id, scale, e
        ))
    })
}
/// Number of null elements. Returns 0 if the column is not nullable.
///
/// The count is taken from the underlying FFI column handle and converted
/// to `usize`.
pub fn null_count(&self) -> usize {
self.inner.null_count() as usize
}
/// Whether this column can contain null values (has a validity bitmask).
///
/// Forwards to the underlying FFI column handle. A nullable column does not
/// necessarily contain nulls; see [`has_nulls`](Self::has_nulls).
pub fn is_nullable(&self) -> bool {
self.inner.is_nullable()
}
/// Whether this column actually contains any null values.
///
/// Forwards to the underlying FFI column handle. The host-transfer methods in
/// this file check this to decide whether the null mask must be read.
pub fn has_nulls(&self) -> bool {
self.inner.has_nulls()
}
/// Number of child columns (non-zero for nested types like LIST, STRUCT).
///
/// The count is taken from the underlying FFI column handle and converted
/// to `usize`.
pub fn num_children(&self) -> usize {
self.inner.num_children() as usize
}
// -- Construction --
/// Allocate a column of `size` elements of the given numeric type.
///
/// Currently only numeric types are supported (the underlying C++ uses
/// `cudf::make_numeric_column`). Non-numeric types will return an error.
///
/// NOTE(review): the previous description said the column is created "with
/// all nulls"; the initial null state is decided by the C++ `column_empty`
/// implementation — confirm.
///
/// # Errors
///
/// Returns `CudfError::InvalidArgument` for non-numeric types, an error from
/// `checked_i32` when `size` is rejected, or the converted FFI error.
pub fn empty(dtype: DataType, size: usize) -> Result<Self> {
    let type_id = dtype.id();
    if type_id.is_numeric() {
        let rows = checked_i32(size)?;
        let inner = cudf_cxx::column::ffi::column_empty(type_id as i32, rows)
            .map_err(CudfError::from_cxx)?;
        Ok(Self { inner })
    } else {
        Err(CudfError::InvalidArgument(format!(
            "Column::empty currently supports only numeric types, got {:?}",
            type_id
        )))
    }
}
/// Build a string column from string-like values, copying the data to the GPU.
///
/// Each element is first copied into an owned `String` on the host and the
/// resulting list is handed to the FFI constructor.
///
/// # Errors
///
/// Returns an error if `checked_i32` rejects the element count, or if the
/// FFI call fails.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_strings(&["hello", "world", "!"]).unwrap();
/// assert_eq!(col.len(), 3);
/// ```
pub fn from_strings(data: &[impl AsRef<str>]) -> Result<Self> {
    // Validate the length up front; the converted value itself is not needed.
    checked_i32(data.len())?;
    let mut owned: Vec<String> = Vec::with_capacity(data.len());
    owned.extend(data.iter().map(|s| s.as_ref().to_owned()));
    let inner =
        cudf_cxx::column::ffi::column_from_strings(&owned).map_err(CudfError::from_cxx)?;
    Ok(Self { inner })
}
// Nullable constructors for the primitive integer and floating-point types.
// Each invocation generates `pub fn from_optional_<ty>(data: &[Option<ty>]) -> Result<Self>`;
// the third argument is the placeholder value sent to the FFI for `None` entries.
impl_from_optional!(
from_optional_i8,
i8,
0,
cudf_cxx::column::ffi::column_from_i8_nullable
);
impl_from_optional!(
from_optional_i16,
i16,
0,
cudf_cxx::column::ffi::column_from_i16_nullable
);
impl_from_optional!(
from_optional_i32,
i32,
0,
cudf_cxx::column::ffi::column_from_i32_nullable
);
impl_from_optional!(
from_optional_i64,
i64,
0,
cudf_cxx::column::ffi::column_from_i64_nullable
);
impl_from_optional!(
from_optional_u8,
u8,
0,
cudf_cxx::column::ffi::column_from_u8_nullable
);
impl_from_optional!(
from_optional_u16,
u16,
0,
cudf_cxx::column::ffi::column_from_u16_nullable
);
impl_from_optional!(
from_optional_u32,
u32,
0,
cudf_cxx::column::ffi::column_from_u32_nullable
);
impl_from_optional!(
from_optional_u64,
u64,
0,
cudf_cxx::column::ffi::column_from_u64_nullable
);
impl_from_optional!(
from_optional_f32,
f32,
0.0,
cudf_cxx::column::ffi::column_from_f32_nullable
);
impl_from_optional!(
from_optional_f64,
f64,
0.0,
cudf_cxx::column::ffi::column_from_f64_nullable
);
/// Build a nullable string column from optional string-like values.
///
/// `None` values become null entries in the GPU column. For each `None` an
/// empty string is sent as a placeholder, and a parallel list of validity
/// flags records which entries are actually null.
///
/// # Errors
///
/// Returns an error if `checked_i32` rejects the element count, or if the
/// FFI call fails.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_optional_strings(&[Some("hello"), None, Some("world")]).unwrap();
/// assert_eq!(col.len(), 3);
/// assert_eq!(col.null_count(), 1);
/// ```
pub fn from_optional_strings(data: &[Option<impl AsRef<str>>]) -> Result<Self> {
    // Validate the length up front; the converted value itself is not needed.
    checked_i32(data.len())?;

    let mut strings: Vec<String> = Vec::with_capacity(data.len());
    let mut validity: Vec<bool> = Vec::with_capacity(data.len());
    for entry in data {
        match entry {
            Some(text) => {
                strings.push(text.as_ref().to_owned());
                validity.push(true);
            }
            None => {
                strings.push(String::new());
                validity.push(false);
            }
        }
    }

    let inner = cudf_cxx::column::ffi::column_from_strings_nullable(&strings, &validity)
        .map_err(CudfError::from_cxx)?;
    Ok(Self { inner })
}
/// Build a nullable bool column from optional values.
///
/// `None` values become null entries in the GPU column. `false` is sent as the
/// placeholder value for each `None`, alongside a per-element validity flag.
///
/// # Errors
///
/// Returns an error if `checked_i32` rejects the element count, or if the
/// FFI call fails.
pub fn from_optional_bool(data: &[Option<bool>]) -> Result<Self> {
    // Validate the length up front; the converted value itself is not needed.
    checked_i32(data.len())?;
    let (values, validity): (Vec<bool>, Vec<bool>) = data
        .iter()
        .map(|slot| (slot.unwrap_or(false), slot.is_some()))
        .unzip();
    let inner = cudf_cxx::column::ffi::column_from_bool_nullable(&values, &validity)
        .map_err(CudfError::from_cxx)?;
    Ok(Self { inner })
}
// -- String Data Transfer --
/// Extract all strings from a string column to host.
///
/// Returns `Err` if the column is not a `STRING` type.
/// For nullable columns, null entries are returned as empty strings.
/// Use [`null_mask_to_host`](Self::null_mask_to_host) to distinguish nulls from
/// actual empty strings, or call
/// [`to_optional_strings`](Self::to_optional_strings) to get `Option<String>` values.
///
/// # Errors
///
/// Any error reported by the FFI `column_to_strings` call is converted with
/// `CudfError::from_cxx` and returned.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_strings(&["hello", "world"]).unwrap();
/// let strings = col.to_strings().unwrap();
/// assert_eq!(strings, vec!["hello".to_string(), "world".to_string()]);
/// ```
pub fn to_strings(&self) -> Result<Vec<String>> {
cudf_cxx::column::ffi::column_to_strings(&self.inner).map_err(CudfError::from_cxx)
}
/// Copy a string column to host as `Option<String>` values.
///
/// Valid entries are wrapped in `Some`, null entries become `None`. When the
/// column reports no nulls, the null mask is not read at all.
///
/// # Errors
///
/// Propagates any error from [`to_strings`](Self::to_strings) or
/// [`null_mask_to_host`](Self::null_mask_to_host).
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_optional_strings(&[Some("hello"), None, Some("world")]).unwrap();
/// let strings = col.to_optional_strings().unwrap();
/// assert_eq!(strings, vec![Some("hello".to_string()), None, Some("world".to_string())]);
/// ```
pub fn to_optional_strings(&self) -> Result<Vec<Option<String>>> {
    let strings = self.to_strings()?;
    if self.has_nulls() {
        let mask = self.null_mask_to_host()?;
        // Bit `row % 8` of byte `row / 8` is set when the row is valid.
        let is_valid = |row: usize| ((mask[row / 8] >> (row % 8)) & 1) == 1;
        Ok(strings
            .into_iter()
            .enumerate()
            .map(|(row, text)| is_valid(row).then_some(text))
            .collect())
    } else {
        Ok(strings.into_iter().map(Some).collect())
    }
}
// -- Data Transfer --
/// Copy the null bitmask to a host byte vector.
///
/// Each bit indicates whether the corresponding element is valid (1) or null (0).
/// The returned vector holds exactly `ceil(len / 8)` bytes.
///
/// Internally the transfer buffer is sized to match libcudf's
/// `bitmask_allocation_size_bytes`, which pads to 64-byte alignment; the padding
/// is trimmed off before returning.
///
/// NOTE(review): what the C++ `column_null_mask` writes for a column without a
/// validity mask is not visible here — confirm before relying on the result
/// for non-nullable columns.
pub fn null_mask_to_host(&self) -> Result<Vec<u8>> {
    // libcudf pads bitmask allocations to 64-byte boundaries; the buffer must be
    // at least that large to avoid "Output buffer too small" from the C++ side.
    const PAD_BYTES: usize = 64;

    let mask_bytes = self.len().div_ceil(8);
    // Round up to the padding boundary, with a floor of one full padding unit.
    let padded_bytes = mask_bytes.next_multiple_of(PAD_BYTES).max(PAD_BYTES);

    let mut buf = vec![0u8; padded_bytes];
    cudf_cxx::column::ffi::column_null_mask(&self.inner, &mut buf)
        .map_err(CudfError::from_cxx)?;

    // Keep only the bytes that carry bits for real elements.
    buf.truncate(mask_bytes);
    Ok(buf)
}
}
// -- Type-specific construction and transfer --
// Private module holding the supertrait used to seal `CudfType`: the trait is
// `pub` inside a non-public module, so code outside this module's parent cannot
// name it and therefore cannot implement `CudfType`.
mod private {
pub trait Sealed {}
}
/// Trait for types that can be stored in a GPU column.
///
/// This is implemented for the primitive integer and floating-point types
/// supported by libcudf, plus `bool` (mapped to `TypeId::Bool8`).
/// It enables generic `Column::from_slice` and `Column::to_vec` operations.
///
/// This trait is sealed and cannot be implemented outside of the `cudf` crate.
pub trait CudfType: Copy + Send + 'static + private::Sealed {
/// The corresponding libcudf type ID.
const TYPE_ID: TypeId;
}
// Implements the sealing marker and `CudfType` for each `RustType => TypeIdVariant`
// pair, binding the type to its libcudf type id.
macro_rules! impl_cudf_type {
    ($($rust_ty:ty => $variant:ident),+ $(,)?) => {
        $(
            impl private::Sealed for $rust_ty {}

            impl CudfType for $rust_ty {
                const TYPE_ID: TypeId = TypeId::$variant;
            }
        )+
    };
}
// The complete set of host element types accepted by `Column::from_slice` /
// `Column::to_vec`, each paired with its `TypeId` variant.
impl_cudf_type! {
i8 => Int8, i16 => Int16, i32 => Int32, i64 => Int64,
u8 => Uint8, u16 => Uint16, u32 => Uint32, u64 => Uint64,
f32 => Float32, f64 => Float64, bool => Bool8,
}
impl Column {
/// Build a column from a host slice, copying the data to the GPU.
///
/// The element type selects the FFI constructor through `T::TYPE_ID`.
///
/// # Errors
///
/// Returns an error if `checked_i32` rejects the slice length, if the FFI
/// constructor fails, or `CudfError::InvalidArgument` if `T::TYPE_ID` has no
/// constructor listed below.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let ints = Column::from_slice(&[1i32, 2, 3]).unwrap();
/// let floats = Column::from_slice(&[1.0f64, 2.0, 3.0]).unwrap();
/// ```
pub fn from_slice<T: CudfType>(data: &[T]) -> Result<Self> {
    // Validate the length up front; the converted value itself is not needed.
    checked_i32(data.len())?;
    let column = dispatch_from_slice! {
        data,
        Int8 => i8, cudf_cxx::column::ffi::column_from_i8;
        Int16 => i16, cudf_cxx::column::ffi::column_from_i16;
        Int32 => i32, cudf_cxx::column::ffi::column_from_i32;
        Int64 => i64, cudf_cxx::column::ffi::column_from_i64;
        Uint8 => u8, cudf_cxx::column::ffi::column_from_u8;
        Uint16 => u16, cudf_cxx::column::ffi::column_from_u16;
        Uint32 => u32, cudf_cxx::column::ffi::column_from_u32;
        Uint64 => u64, cudf_cxx::column::ffi::column_from_u64;
        Float32 => f32, cudf_cxx::column::ffi::column_from_f32;
        Float64 => f64, cudf_cxx::column::ffi::column_from_f64;
        Bool8 => bool, cudf_cxx::column::ffi::column_from_bool;
    };
    Ok(Self { inner: column })
}
/// Copy the column's data back to host as a `Vec<T>`.
///
/// # Type Safety
///
/// The type parameter `T` must match the column's actual data type.
/// Returns `Err(CudfError::TypeMismatch)` if they don't match.
///
/// # Nullability
///
/// Returns `Err(CudfError::InvalidArgument)` if the column contains nulls,
/// because the values at null positions would be indistinguishable from real
/// data. The null check happens before the type check.
/// Use [`to_optional_vec`](Self::to_optional_vec) instead for nullable columns.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_slice(&[1i32, 2, 3]).unwrap();
/// let data: Vec<i32> = col.to_vec().unwrap();
/// assert_eq!(data, vec![1, 2, 3]);
/// ```
pub fn to_vec<T: CudfType>(&self) -> Result<Vec<T>> {
    if !self.has_nulls() {
        return self.to_vec_raw();
    }
    Err(CudfError::InvalidArgument(
        "to_vec() cannot be used on columns with null values (null values would be \
         indistinguishable from real data). Use to_optional_vec() instead, or use \
         null_mask_to_host() to get the null mask separately."
            .into(),
    ))
}
/// Copy the column's data back to host as a `Vec<Option<T>>`, keeping null information.
///
/// Valid elements are wrapped in `Some`, null elements become `None`.
/// If the column has no nulls, every element is `Some` and the null mask is
/// not read.
///
/// # Type Safety
///
/// The type parameter `T` must match the column's actual data type.
/// Returns `Err(CudfError::TypeMismatch)` if they don't match.
///
/// # Examples
///
/// ```rust,no_run
/// use cudf::Column;
///
/// let col = Column::from_optional_i32(&[Some(1), None, Some(3)]).unwrap();
/// let data: Vec<Option<i32>> = col.to_optional_vec().unwrap();
/// assert_eq!(data, vec![Some(1), None, Some(3)]);
/// ```
pub fn to_optional_vec<T: CudfType>(&self) -> Result<Vec<Option<T>>> {
    let values = self.to_vec_raw::<T>()?;
    if self.has_nulls() {
        let mask = self.null_mask_to_host()?;
        // Bit `row % 8` of byte `row / 8` is set when the row is valid.
        let is_valid = |row: usize| ((mask[row / 8] >> (row % 8)) & 1) == 1;
        Ok(values
            .into_iter()
            .enumerate()
            .map(|(row, value)| is_valid(row).then_some(value))
            .collect())
    } else {
        Ok(values.into_iter().map(Some).collect())
    }
}
/// Internal: transfer column data to host without null checking.
///
/// Values at null positions (if any) are returned as-is; callers are
/// responsible for consulting the null mask.
///
/// # Errors
///
/// - Propagates the error from [`try_data_type`](Self::try_data_type) when the
///   column's type cannot be determined (instead of panicking).
/// - `CudfError::TypeMismatch` if `T::TYPE_ID` differs from the column's type id.
/// - The converted FFI error if the device-to-host copy fails.
/// - `CudfError::InvalidArgument` if `T::TYPE_ID` has no transfer function.
fn to_vec_raw<T: CudfType>(&self) -> Result<Vec<T>> {
    // Type check. Use the fallible accessor: this function already returns a
    // `Result`, so an unrecognized FFI type id must surface as an error rather
    // than a panic.
    let actual = self.try_data_type()?.id();
    if actual != T::TYPE_ID {
        return Err(CudfError::TypeMismatch {
            expected: format!("{:?}", T::TYPE_ID),
            actual: format!("{:?}", actual),
        });
    }
    let len = self.len();
    if len == 0 {
        return Ok(Vec::new());
    }
    // Allocate the output buffer. The length stays 0 until the C++ side has
    // successfully filled the data, so an early error return drops an empty Vec.
    let mut result: Vec<T> = Vec::with_capacity(len);
    let ptr = result.as_mut_ptr();
    // SAFETY: `ptr` comes from a Vec with capacity >= `len`, so it is valid for
    // writes of `len` elements. Every `CudfType` implementor is an integer,
    // float, or `bool`, for which the all-zero bit pattern is a valid value.
    // Zero-filling means the `&mut [T]` formed by the dispatch macro below never
    // refers to uninitialized memory.
    unsafe {
        std::ptr::write_bytes(ptr, 0, len);
    }
    // Dispatch to the appropriate cudf-cxx transfer function.
    dispatch_to_vec! {
        self, ptr, len,
        Int8 => i8, cudf_cxx::column::ffi::column_to_i8;
        Int16 => i16, cudf_cxx::column::ffi::column_to_i16;
        Int32 => i32, cudf_cxx::column::ffi::column_to_i32;
        Int64 => i64, cudf_cxx::column::ffi::column_to_i64;
        Uint8 => u8, cudf_cxx::column::ffi::column_to_u8;
        Uint16 => u16, cudf_cxx::column::ffi::column_to_u16;
        Uint32 => u32, cudf_cxx::column::ffi::column_to_u32;
        Uint64 => u64, cudf_cxx::column::ffi::column_to_u64;
        Float32 => f32, cudf_cxx::column::ffi::column_to_f32;
        Float64 => f64, cudf_cxx::column::ffi::column_to_f64;
    }
    // SAFETY: The first `len` elements are initialized (zero-filled above, then
    // overwritten by the C++ side), and `len` does not exceed the capacity.
    // The element type is guaranteed correct by the type check above.
    unsafe {
        result.set_len(len);
    }
    Ok(result)
}
}
/// `Debug` output mirrors the `Display` summary; the column's element data is
/// not printed.
impl fmt::Debug for Column {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Display>::fmt(self, f)
    }
}
impl fmt::Display for Column {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Column({}, len={}, nulls={})",
self.data_type(),
self.len(),
self.null_count()
)
}
}