kermit-ds 0.1.1

Data structures used in Kermit
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
//! Core relation abstraction: the [`Relation`] trait that every storage
//! backend implements (see `TreeTrie` and `ColumnTrie` in [`crate::ds`]),
//! plus the blanket [`RelationFileExt`] for loading from CSV or Parquet.
//!
//! The trait exists so join algorithms in `kermit-algos` can be written
//! generically over different trie layouts without coupling to a specific
//! representation. All tuple values are `usize` keys — typically
//! dictionary-encoded IDs from a separate symbol table — so a relation never
//! stores raw strings or domain values directly.
use {
    arrow::array::AsArray,
    kermit_iters::JoinIterable,
    parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder,
    std::{fmt, fs::File, path::Path},
};

/// Error type for relation file operations (CSV and Parquet).
///
/// Each wrapping variant carries the underlying library error unchanged; a
/// `From` impl for each wrapped error type is provided in this module so the
/// `?` operator can be used to convert them. [`RelationError::InvalidData`]
/// is the only variant that originates in this crate and carries a
/// human-readable message instead of a source error.
#[derive(Debug)]
pub enum RelationError {
    /// A CSV library error (e.g. a malformed header or record).
    Csv(csv::Error),
    /// A filesystem I/O error (e.g. the file could not be opened).
    Io(std::io::Error),
    /// A Parquet library error (e.g. the file is not valid Parquet).
    Parquet(parquet::errors::ParquetError),
    /// An Arrow conversion error (e.g. a record batch failed to decode).
    Arrow(arrow::error::ArrowError),
    /// A data value that could not be converted (e.g. non-integer in a CSV).
    /// The payload is the diagnostic message shown by `Display`.
    InvalidData(String),
}

impl fmt::Display for RelationError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            | RelationError::Csv(e) => write!(f, "CSV error: {e}"),
            | RelationError::Io(e) => write!(f, "I/O error: {e}"),
            | RelationError::Parquet(e) => write!(f, "Parquet error: {e}"),
            | RelationError::Arrow(e) => write!(f, "Arrow error: {e}"),
            | RelationError::InvalidData(msg) => write!(f, "Invalid data: {msg}"),
        }
    }
}

impl std::error::Error for RelationError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            | RelationError::Csv(e) => Some(e),
            | RelationError::Io(e) => Some(e),
            | RelationError::Parquet(e) => Some(e),
            | RelationError::Arrow(e) => Some(e),
            | RelationError::InvalidData(_) => None,
        }
    }
}

impl From<csv::Error> for RelationError {
    fn from(e: csv::Error) -> Self { RelationError::Csv(e) }
}

impl From<std::io::Error> for RelationError {
    fn from(e: std::io::Error) -> Self { RelationError::Io(e) }
}

/// Wraps a Parquet library error so `?` can be used on `parquet` results.
impl From<parquet::errors::ParquetError> for RelationError {
    fn from(source: parquet::errors::ParquetError) -> Self { Self::Parquet(source) }
}

impl From<arrow::error::ArrowError> for RelationError {
    fn from(e: arrow::error::ArrowError) -> Self { RelationError::Arrow(e) }
}

/// Whether a relation's attributes are identified by name or by position.
///
/// Returned by [`RelationHeader::model_type`]. A header is [`Named`] when it
/// carries explicit attribute names (e.g. from a CSV/Parquet schema), and
/// [`Positional`] otherwise — typical for intermediate relations produced
/// during query evaluation where only arity matters.
///
/// The type is a plain two-state tag, so it is `Copy` and comparable with
/// `==`, and it implements `Debug` so it can appear in assertions and logs.
///
/// [`Named`]: ModelType::Named
/// [`Positional`]: ModelType::Positional
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelType {
    /// Attributes are accessed by column index only; attribute names are
    /// absent.
    Positional,
    /// Attributes have explicit string names, typically sourced from a file
    /// header or schema.
    Named,
}

/// Metadata for a relation: its name, attribute names, and arity.
///
/// A header is **named** when `attrs` is non-empty (then `arity ==
/// attrs.len()`) and **positional** when `attrs` is empty (then `arity` is
/// the only authoritative column count). Orthogonally, a header is
/// **nameless** when its `name` is empty — used for intermediate or
/// projected relations whose origin no longer matters.
///
/// All fields are private; values are built through the `new*` constructors
/// and read through the accessor methods.
#[derive(Clone, Debug)]
pub struct RelationHeader {
    /// Relation name. Empty iff this is a nameless header.
    name: String,
    /// Attribute names. Empty iff this is a positional header.
    attrs: Vec<String>,
    /// Number of columns. For named headers this equals `attrs.len()`; for
    /// positional headers (`attrs.is_empty()`) it is the only authoritative
    /// column count.
    arity: usize,
}

impl RelationHeader {
    /// Builds a header called `name` whose columns are labelled by `attrs`.
    /// The arity is taken from `attrs.len()`.
    pub fn new(name: impl Into<String>, attrs: Vec<String>) -> Self {
        Self {
            arity: attrs.len(),
            name: name.into(),
            attrs,
        }
    }

    /// Builds a header with an empty name whose columns are labelled by
    /// `attrs`. The arity is taken from `attrs.len()`.
    pub fn new_nameless(attrs: Vec<String>) -> Self { Self::new(String::new(), attrs) }

    /// Builds a header called `name` with `arity` unlabelled (positional)
    /// columns.
    pub fn new_positional(name: impl Into<String>, arity: usize) -> Self {
        Self {
            name: name.into(),
            attrs: Vec::new(),
            arity,
        }
    }

    /// Builds a header with an empty name and `arity` unlabelled
    /// (positional) columns.
    pub fn new_nameless_positional(arity: usize) -> Self {
        Self::new_positional(String::new(), arity)
    }

    /// Reports whether the relation name is empty, as it is for headers
    /// produced by the `new_nameless*` constructors.
    pub fn is_nameless(&self) -> bool { self.name.is_empty() }

    /// The relation's name; the empty string when the header is nameless.
    pub fn name(&self) -> &str { self.name.as_str() }

    /// The attribute names; an empty slice for positional headers.
    pub fn attrs(&self) -> &[String] { self.attrs.as_slice() }

    /// The number of columns in the relation.
    pub fn arity(&self) -> usize { self.arity }

    /// Classifies the header: [`ModelType::Positional`] when no attribute
    /// names are stored, [`ModelType::Named`] otherwise.
    pub fn model_type(&self) -> ModelType {
        match self.attrs.as_slice() {
            | [] => ModelType::Positional,
            | _ => ModelType::Named,
        }
    }
}

impl From<usize> for RelationHeader {
    fn from(value: usize) -> RelationHeader { RelationHeader::new_nameless_positional(value) }
}

/// A relation that can produce a new relation containing only the specified
/// columns.
///
/// Projection is the π operator from relational algebra: given column indices
/// `[c₀, c₁, …]` it yields a relation whose `i`-th column is the `cᵢ`-th
/// column of the source. Duplicate and reordered indices are permitted; the
/// resulting relation has arity `columns.len()`.
///
/// For example, projecting a ternary relation with `vec![2, 0]` yields a
/// binary relation whose first column is the source's third column and whose
/// second column is the source's first.
pub trait Projectable {
    /// Returns a new relation containing only the columns at the given
    /// indices, in the order supplied. The receiver is borrowed and left
    /// unchanged.
    ///
    /// # Panics
    ///
    /// Panics if any element of `columns` is `>= self.header().arity()`.
    fn project(&self, columns: Vec<usize>) -> Self;
}

/// A relational data structure that stores tuples of `usize` keys and can
/// participate in joins.
///
/// Tuple values are `usize` keys (typically dictionary-encoded — see the
/// module-level docs). The supertraits expose:
///
/// - [`JoinIterable`] — produces iterators that the join algorithms in
///   `kermit-algos` consume. Implementors typically also implement
///   [`TrieIterable`](kermit_iters::TrieIterable) so the iterator can be driven
///   hierarchically.
/// - [`Projectable`] — the relational π operator (column projection).
///
/// Every method here is required; the trait supplies no default
/// implementations.
pub trait Relation: JoinIterable + Projectable {
    /// Returns the header describing this relation's name, attributes, and
    /// arity.
    fn header(&self) -> &RelationHeader;

    /// Creates an empty relation matching `header`.
    fn new(header: RelationHeader) -> Self;

    /// Creates a relation populated with `tuples`, matching `header`.
    /// Implementations may sort or deduplicate during bulk construction;
    /// prefer this over `new` followed by repeated `insert` calls when all
    /// tuples are known up front.
    ///
    /// # Panics
    ///
    /// Panics if any tuple's length does not equal `header.arity()`.
    fn from_tuples(header: RelationHeader, tuples: Vec<Vec<usize>>) -> Self;

    /// Inserts a tuple. Duplicate tuples are silently absorbed (the relation
    /// behaves as a set).
    ///
    /// # Panics
    ///
    /// Panics if `tuple.len() != self.header().arity()`.
    fn insert(&mut self, tuple: Vec<usize>);

    /// Inserts every tuple in `tuples`. Equivalent to calling
    /// [`insert`](Self::insert) in a loop; declared as its own method so
    /// implementations can specialise bulk insertion.
    ///
    /// # Panics
    ///
    /// Panics if any tuple's length does not match the relation's arity.
    fn insert_all(&mut self, tuples: Vec<Vec<usize>>);
}

/// Loads a [`Relation`] from a CSV or Parquet file.
///
/// Defined as an extension trait (with a blanket impl over every
/// [`Relation`]) so file-loading is added without bloating the core trait
/// or requiring each concrete data structure to reimplement it. Anything
/// that implements [`Relation`] automatically gains
/// [`from_csv`](Self::from_csv) and [`from_parquet`](Self::from_parquet).
pub trait RelationFileExt: Relation {
    /// Creates a new relation from a Parquet file.
    ///
    /// Column names are extracted from the Parquet schema and the relation
    /// name is taken from the file stem. All columns must be `Int64`, must
    /// not contain nulls, and every value must be non-negative so it fits in
    /// `usize`.
    ///
    /// # Errors
    ///
    /// Returns a [`RelationError`] if any of the following occur:
    /// - [`RelationError::Io`] — the file cannot be opened.
    /// - [`RelationError::Parquet`] — the file is not a valid Parquet file or
    ///   the reader cannot be constructed.
    /// - [`RelationError::Arrow`] — a record batch fails to decode.
    /// - [`RelationError::InvalidData`] — a column is not `Int64`, a value is
    ///   null, or an `Int64` value cannot be converted to `usize` (e.g. it is
    ///   negative). Cell-level messages identify the zero-based row (counted
    ///   across all record batches) and column.
    fn from_parquet<P: AsRef<Path>>(filepath: P) -> Result<Self, RelationError>
    where
        Self: Sized;

    /// Creates a new relation from a CSV file.
    ///
    /// The first row is treated as a header providing attribute names; each
    /// subsequent row is one tuple. Every field must parse as a `usize`. The
    /// relation name is taken from the file stem.
    ///
    /// The reader uses `,` as the delimiter, `\` as the escape character
    /// (doubled quotes are not treated as an escape), skips lines starting
    /// with `#`, and requires every record to have the same number of fields.
    ///
    /// # Errors
    ///
    /// Returns a [`RelationError`] if any of the following occur:
    /// - [`RelationError::Io`] — the file cannot be opened.
    /// - [`RelationError::Csv`] — the CSV reader cannot parse the header or a
    ///   row (e.g. inconsistent column count).
    /// - [`RelationError::InvalidData`] — a field cannot be parsed as a
    ///   `usize`; the message identifies the offending row and column (both
    ///   zero-based, with the header row not counted).
    fn from_csv<P: AsRef<Path>>(filepath: P) -> Result<Self, RelationError>
    where
        Self: Sized;
}

/// Blanket implementation of `RelationFileExt` for any type that
/// implements `Relation`.
impl<R> RelationFileExt for R
where
    R: Relation,
{
    fn from_csv<P: AsRef<Path>>(filepath: P) -> Result<Self, RelationError> {
        let path = filepath.as_ref();
        let file = File::open(path)?;

        let mut rdr = csv::ReaderBuilder::new()
            .has_headers(true)
            .delimiter(b',')
            .double_quote(false)
            .escape(Some(b'\\'))
            .flexible(false)
            .comment(Some(b'#'))
            .from_reader(file);

        // Attribute names come from the CSV header row; the relation name
        // comes from the file stem.
        let attrs: Vec<String> = rdr.headers()?.iter().map(|s| s.to_string()).collect();
        let header = RelationHeader::new(relation_name_from_path(path), attrs);

        let mut tuples = Vec::new();
        for (row_idx, result) in rdr.records().enumerate() {
            let record = result?;
            // Collecting into `Result` stops at the first unparsable field.
            let tuple = record
                .iter()
                .enumerate()
                .map(|(col_idx, field)| {
                    field.parse::<usize>().map_err(|_| {
                        RelationError::InvalidData(format!(
                            "row {row_idx}, column {col_idx}: cannot parse {:?} as usize",
                            field,
                        ))
                    })
                })
                .collect::<Result<Vec<usize>, RelationError>>()?;
            tuples.push(tuple);
        }
        Ok(R::from_tuples(header, tuples))
    }

    fn from_parquet<P: AsRef<Path>>(filepath: P) -> Result<Self, RelationError> {
        let path = filepath.as_ref();
        let file = File::open(path)?;

        let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

        // Attribute names come from the Parquet schema; the relation name
        // comes from the file stem.
        let attrs: Vec<String> = builder
            .schema()
            .fields()
            .iter()
            .map(|field| field.name().clone())
            .collect();
        let header = RelationHeader::new(relation_name_from_path(path), attrs);

        let reader = builder.build()?;

        // All tuples are gathered first so the relation can be built in one
        // `from_tuples` call.
        let mut tuples: Vec<Vec<usize>> = Vec::new();

        for batch_result in reader {
            let batch = batch_result?;

            // Rows of this batch occupy `tuples[batch_start..]`; since one
            // tuple is stored per row, `batch_start` is also the number of
            // rows read from earlier batches.
            let batch_start = tuples.len();
            let num_cols = batch.num_columns();
            tuples.extend((0..batch.num_rows()).map(|_| Vec::with_capacity(num_cols)));

            // Walk column by column so each column is downcast exactly once
            // per batch; values are appended in column order, so every tuple
            // still ends up in schema order.
            for (col_idx, column) in batch.columns().iter().enumerate() {
                // `as_primitive` would panic on a non-Int64 column; report
                // it as invalid data instead.
                let int_array = column
                    .as_primitive_opt::<arrow::datatypes::Int64Type>()
                    .ok_or_else(|| {
                        RelationError::InvalidData(format!(
                            "column {col_idx}: expected Int64, found {:?}",
                            batch.schema().field(col_idx).data_type(),
                        ))
                    })?;

                let batch_rows = tuples[batch_start..].iter_mut();
                for (offset, (tuple, cell)) in batch_rows.zip(int_array.iter()).enumerate() {
                    let row_idx = batch_start + offset;

                    // The iterator yields `None` for null slots, whose
                    // stored value is arbitrary and must not be read.
                    let raw = cell.ok_or_else(|| {
                        RelationError::InvalidData(format!(
                            "row {row_idx}, column {col_idx}: null value cannot be used as a key",
                        ))
                    })?;
                    let value = usize::try_from(raw).map_err(|_| {
                        RelationError::InvalidData(format!(
                            "row {row_idx}, column {col_idx}: cannot convert {raw} to usize",
                        ))
                    })?;
                    tuple.push(value);
                }
            }
        }

        Ok(R::from_tuples(header, tuples))
    }
}

/// Derives a relation name from the file stem of `path` (the file name
/// without its extension). Falls back to the empty string when the path has
/// no file stem or the stem is not valid UTF-8.
fn relation_name_from_path(path: &Path) -> String {
    path.file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or("")
        .to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a path inside the system temp directory whose file name is
    /// prefixed with this process's id, so concurrently running test
    /// processes never read, overwrite, or delete each other's fixtures.
    fn temp_path(file_name: &str) -> std::path::PathBuf {
        std::env::temp_dir().join(format!("kermit_ds_{}_{file_name}", std::process::id()))
    }

    // ── RelationError Display ──────────────────────────────────────────

    #[test]
    fn relation_error_display_csv() {
        let csv_err = csv::Error::from(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            "file not found",
        ));
        let err = RelationError::from(csv_err);
        let msg = err.to_string();
        assert!(msg.starts_with("CSV error:"), "got: {msg}");
    }

    #[test]
    fn relation_error_display_io() {
        let err = RelationError::from(std::io::Error::new(std::io::ErrorKind::NotFound, "gone"));
        assert!(err.to_string().starts_with("I/O error:"));
    }

    #[test]
    fn relation_error_display_invalid_data() {
        let err = RelationError::InvalidData("bad value".into());
        assert_eq!(err.to_string(), "Invalid data: bad value");
    }

    #[test]
    fn relation_error_source_delegates() {
        use std::error::Error;

        let io_err = std::io::Error::other("inner");
        let err = RelationError::Io(io_err);
        assert!(err.source().is_some());

        let err = RelationError::InvalidData("no source".into());
        assert!(err.source().is_none());
    }

    // ── from_csv error on invalid data ─────────────────────────────────

    #[test]
    fn from_csv_rejects_non_integer_values() {
        use crate::ds::TreeTrie;

        let path = temp_path("bad_value.csv");
        std::fs::write(&path, "a,b\n1,2\n3,hello\n").unwrap();

        let result = TreeTrie::from_csv(&path);
        // Remove the fixture before asserting so a failed assertion does not
        // leave it behind.
        std::fs::remove_file(&path).ok();

        assert!(result.is_err(), "expected error for non-integer CSV value");

        let err = result.unwrap_err();
        let msg = err.to_string();
        assert!(
            msg.contains("hello"),
            "error should mention the bad value, got: {msg}"
        );
        assert!(
            msg.contains("row 1"),
            "error should mention the row, got: {msg}"
        );
        assert!(
            msg.contains("column 1"),
            "error should mention the column, got: {msg}"
        );
    }

    #[test]
    fn from_csv_missing_file_returns_error() {
        use crate::ds::TreeTrie;

        // This path is never created by any test.
        let result = TreeTrie::from_csv(temp_path("nonexistent.csv"));
        assert!(result.is_err());
        assert!(
            matches!(result.unwrap_err(), RelationError::Io(_)),
            "expected Io variant for missing file"
        );
    }

    // ── from_parquet error paths ───────────────────────────────────────

    #[test]
    fn from_parquet_missing_file_returns_error() {
        use crate::ds::TreeTrie;

        // This path is never created by any test.
        let result = TreeTrie::from_parquet(temp_path("nonexistent.parquet"));
        assert!(result.is_err());
        assert!(
            matches!(result.unwrap_err(), RelationError::Io(_)),
            "expected Io variant for missing file"
        );
    }

    #[test]
    fn from_parquet_invalid_file_returns_error() {
        use crate::ds::TreeTrie;

        let path = temp_path("bad_parquet.parquet");
        std::fs::write(&path, b"this is not a parquet file").unwrap();

        let result = TreeTrie::from_parquet(&path);
        // Remove the fixture before asserting so a failed assertion does not
        // leave it behind.
        std::fs::remove_file(&path).ok();

        assert!(result.is_err());
        assert!(
            matches!(result.unwrap_err(), RelationError::Parquet(_)),
            "expected Parquet variant for corrupt file"
        );
    }
}