etl-unit 0.1.0

Semantic data model for ETL units — qualities and measurements over subjects and time. Built on Polars.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
//! Fragment data references for the ETL pipeline.
//!
//! # Type Hierarchy
//!
//! Data flows through these types as it is refined from raw source data
//! into measurements in the Universe:
//!
//! ```text
//! BoundSource (Arc<DataFrame>)
//!   The raw data from a source chain, with column mappings from
//!   physical names to canonical names. The Arc allows multiple
//!   measurements to share a reference to the same data.
//!
//!   → MeasurementFragment (FragmentRef)
//!       One source's contribution to one measurement. Holds a
//!       FragmentRef — typically a ColumnRef pointing back into the
//!       BoundSource's Arc<DataFrame>. No data is copied at this stage.
//!
//!     → ComposedMeasurement (FragmentRef or materialized)
//!         All sources' fragments for one measurement, composed.
//!         Single-source measurements pass through as ColumnRef.
//!         Multi-source measurements stack or materialize.
//!
//!       → MeasurementData (materializes on demand)
//!           The final measurement in the Universe. Materializes
//!           the FragmentRef into an owned DataFrame only when the
//!           subset path needs to read the data (signal policy,
//!           crush, resample, join).
//! ```
//!
//! # FragmentRef variants
//!
//! - **ColumnRef** — a column reference into a shared source DataFrame.
//!   Used for direct measurements (sump, discharge) from a single source.
//!   The `Arc<DataFrame>` is shared across all measurements extracted from
//!   the same source. No data is copied.
//!
//! - **Stacked** — multiple fragments from different sources, each holding
//!   a reference to its source DataFrame. Used when the same measurement
//!   comes from multiple sources with the same shape. Not materialized
//!   until the subset path needs a contiguous DataFrame.
//!
//! - **Materialized** — an owned DataFrame. Used when the shape changed
//!   from the source (unpivot, component crush, truth mapping) or when
//!   explicit concatenation was required.

use std::sync::Arc;

use memuse::DynamicUsage;
use polars::prelude::*;
use serde::Serialize;

use crate::column::{CanonicalColumnName, SourceColumnName};

// ============================================================================
// Deferred Transform
// ============================================================================

/// A deferred, shape-preserving transformation applied at materialization.
///
/// Wraps a closure that transforms a LazyFrame. Used for isomorphic
/// operations like null-value substitution and binary truth mapping
/// that don't change the row count or column structure.
///
/// Cloneable via Arc. Debug prints a label, not the closure body.
#[derive(Clone)]
pub struct DeferredTransform {
    label: String,
    apply: Arc<dyn Fn(LazyFrame) -> LazyFrame + Send + Sync>,
}

impl DeferredTransform {
    /// Create a new deferred transform with a descriptive label.
    pub fn new(
        label: impl Into<String>,
        apply: impl Fn(LazyFrame) -> LazyFrame + Send + Sync + 'static,
    ) -> Self {
        Self {
            label: label.into(),
            apply: Arc::new(apply),
        }
    }

    /// Apply the transform to a LazyFrame.
    pub fn apply(&self, lf: LazyFrame) -> LazyFrame {
        (self.apply)(lf)
    }

    /// The label describing what this transform does.
    pub fn label(&self) -> &str {
        &self.label
    }
}

impl std::fmt::Debug for DeferredTransform {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "DeferredTransform({})", self.label)
    }
}

/// Identifier for a data source (e.g., "scada", "mrms").
/// Defined here in etl-unit so FragmentRef can use it
/// without depending on the data-pipeline crate.
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct DataSourceName(String);

impl DataSourceName {
    pub fn new(name: impl Into<String>) -> Self {
        Self(name.into())
    }
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

impl std::fmt::Display for DataSourceName {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}

impl From<&str> for DataSourceName {
    fn from(s: &str) -> Self {
        Self(s.to_string())
    }
}

impl From<String> for DataSourceName {
    fn from(s: String) -> Self {
        Self(s)
    }
}

/// How a measurement fragment references its data.
///
/// A `FragmentRef` is how a `MeasurementFragment` holds its data —
/// either as a cheap pointer into a shared source DataFrame, or as an
/// owned DataFrame when the data has been transformed.
///
/// See the [module-level docs](self) for where this fits in the type hierarchy.
#[derive(Debug, Clone)]
pub enum FragmentRef {
    /// Column reference into a shared source DataFrame.
    ///
    /// The source DataFrame contains all columns from the source
    /// (subject, time, multiple measurements). This variant records
    /// which value column to extract and how to rename columns from
    /// source names to canonical names. No data is copied.
    ColumnRef(ColumnRefData),

    /// Multiple same-shape fragments from different sources.
    ///
    /// Each fragment references its source DataFrame. Materialized
    /// only when `as_dataframe()` is called (e.g., during subset).
    Stacked(Vec<StackedFragment>),

    /// Owned DataFrame — shape differs from source.
    ///
    /// Used for: unpivoted measurements, component-crushed measurements,
    /// truth-mapped binaries, or any case where the data couldn't be
    /// expressed as a column reference into the original source.
    Materialized(DataFrame),
}

/// A column reference into a shared source DataFrame.
#[derive(Debug, Clone)]
pub struct ColumnRefData {
    /// The shared source DataFrame (e.g., one BoundSource's data).
    pub source: Arc<DataFrame>,
    /// Source column name for this measurement's values.
    pub value_column: SourceColumnName,
    /// Canonical name for the measurement (used as alias in output).
    pub canonical_name: CanonicalColumnName,
    /// Which source this came from.
    pub source_name: DataSourceName,
    /// Column mappings for subject, time, and components.
    /// Each entry is (source_name_in_df, canonical_name_in_output).
    pub column_mappings: Vec<(SourceColumnName, CanonicalColumnName)>,
    /// Optional deferred transform applied at materialization.
    /// Shape-preserving: null fill, truth mapping, column arithmetic, etc.
    pub transform: Option<DeferredTransform>,
}

/// One fragment in a stacked measurement — references a source DataFrame.
#[derive(Debug, Clone)]
pub struct StackedFragment {
    /// The shared source DataFrame.
    pub source: Arc<DataFrame>,
    /// Source column name for this measurement's values.
    pub value_column: SourceColumnName,
    /// Canonical name for the measurement.
    pub canonical_name: CanonicalColumnName,
    /// Which source contributed this fragment.
    pub source_name: DataSourceName,
    /// Column mappings (same as ColumnRefData).
    pub column_mappings: Vec<(SourceColumnName, CanonicalColumnName)>,
    /// Optional deferred transform (same as ColumnRefData).
    pub transform: Option<DeferredTransform>,
}

impl FragmentRef {
    /// Resolve to a DataFrame.
    ///
    /// - `ColumnRef`: selects, renames, and applies deferred transforms.
    /// - `Stacked`: selects + transforms + vertical stacks all fragments.
    /// - `Materialized`: returns a clone.
    pub fn as_dataframe(&self) -> Result<DataFrame, PolarsError> {
        self.as_lazy()?.collect()
    }

    /// Resolve to a LazyFrame for predicate pushdown.
    ///
    /// Allows filters (time range, subject) to be pushed into the
    /// column selection plan alongside deferred transforms, so Polars
    /// can optimize them together.
    pub fn as_lazy(&self) -> Result<LazyFrame, PolarsError> {
        match self {
            FragmentRef::ColumnRef(ref_data) => {
                let exprs = build_select_exprs(
                    &ref_data.column_mappings,
                    &ref_data.value_column,
                    &ref_data.canonical_name,
                );
                let mut lf = (*ref_data.source).clone().lazy().select(exprs);
                if let Some(ref transform) = ref_data.transform {
                    lf = transform.apply(lf);
                }
                Ok(lf)
            }
            FragmentRef::Stacked(fragments) => {
                if fragments.is_empty() {
                    return Ok(DataFrame::empty().lazy());
                }
                if fragments.len() == 1 {
                    let f = &fragments[0];
                    let exprs =
                        build_select_exprs(&f.column_mappings, &f.value_column, &f.canonical_name);
                    let mut lf = (*f.source).clone().lazy().select(exprs);
                    if let Some(ref transform) = f.transform {
                        lf = transform.apply(lf);
                    }
                    return Ok(lf);
                }

                let mut lazy_frames = Vec::with_capacity(fragments.len());
                for frag in fragments {
                    let exprs = build_select_exprs(
                        &frag.column_mappings,
                        &frag.value_column,
                        &frag.canonical_name,
                    );
                    let mut lf = (*frag.source).clone().lazy().select(exprs);
                    if let Some(ref transform) = frag.transform {
                        lf = transform.apply(lf);
                    }
                    lazy_frames.push(lf);
                }

                concat(lazy_frames, UnionArgs::default())
            }
            FragmentRef::Materialized(df) => Ok(df.clone().lazy()),
        }
    }

    /// Row count.
    /// For ColumnRef, this is the source DataFrame's height (may include
    /// rows for other measurements). For Stacked, sum of all fragments.
    pub fn height(&self) -> usize {
        match self {
            FragmentRef::ColumnRef(r) => r.source.height(),
            FragmentRef::Stacked(frags) => frags.iter().map(|f| f.source.height()).sum(),
            FragmentRef::Materialized(df) => df.height(),
        }
    }

    /// Whether this is already materialized.
    pub fn is_materialized(&self) -> bool {
        matches!(self, FragmentRef::Materialized(_))
    }

    /// Estimated memory usage in bytes.
    ///
    /// - `ColumnRef`: 0 — data lives in the shared source, not owned here.
    ///   The source size is reported separately via `shared_source_bytes()`.
    /// - `Stacked`: 0 — same as ColumnRef, references only.
    /// - `Materialized`: the DataFrame's estimated heap size.
    pub fn owned_bytes(&self) -> usize {
        match self {
            FragmentRef::ColumnRef(_) => 0,
            FragmentRef::Stacked(_) => 0,
            FragmentRef::Materialized(df) => df.estimated_size(),
        }
    }

    /// Size of the shared source DataFrame(s) this fragment references.
    ///
    /// For ColumnRef/Stacked, this is the source DataFrame size (shared
    /// with other measurements). For Materialized, returns 0 since there's
    /// no shared source.
    pub fn shared_source_bytes(&self) -> usize {
        match self {
            FragmentRef::ColumnRef(r) => r.source.estimated_size(),
            FragmentRef::Stacked(frags) => frags.iter().map(|f| f.source.estimated_size()).sum(),
            FragmentRef::Materialized(_) => 0,
        }
    }

    /// Raw pointers to the underlying Arc<DataFrame>(s) for deduplication.
    ///
    /// Two ColumnRefs from the same BoundSource return the same pointer.
    /// Used by `memory_summary()` to avoid double-counting shared memory.
    pub fn source_arc_ptrs(&self) -> Vec<usize> {
        match self {
            FragmentRef::ColumnRef(r) => vec![Arc::as_ptr(&r.source) as usize],
            FragmentRef::Stacked(frags) => frags
                .iter()
                .map(|f| Arc::as_ptr(&f.source) as usize)
                .collect(),
            FragmentRef::Materialized(_) => vec![],
        }
    }

    /// Shared source bytes for a specific Arc pointer.
    pub fn shared_source_bytes_for_ptr(&self, ptr: usize) -> usize {
        match self {
            FragmentRef::ColumnRef(r) => {
                if Arc::as_ptr(&r.source) as usize == ptr {
                    r.source.estimated_size()
                } else {
                    0
                }
            }
            FragmentRef::Stacked(frags) => frags
                .iter()
                .find(|f| Arc::as_ptr(&f.source) as usize == ptr)
                .map(|f| f.source.estimated_size())
                .unwrap_or(0),
            FragmentRef::Materialized(_) => 0,
        }
    }

    /// Get the raw source DataFrame without column selection or renaming.
    /// For ColumnRef: returns the full shared DataFrame (all columns from the source).
    /// For Materialized: returns the owned DataFrame (already has only its columns).
    /// For Stacked: returns None (multiple sources).
    pub fn raw_source_dataframe(&self) -> Option<&DataFrame> {
        match self {
            FragmentRef::ColumnRef(r) => Some(&*r.source),
            FragmentRef::Materialized(df) => Some(df),
            FragmentRef::Stacked(_) => None,
        }
    }

    /// The logical source name for this fragment, if any.
    ///
    /// - `ColumnRef`: the source the column points into.
    /// - `Stacked`: the *first* fragment's source name. (Stacked fragments
    ///   may span multiple sources; the plan layer treats the first one as
    ///   the primary identity, consistent with `source_arc_ptrs().first()`.)
    /// - `Materialized`: `None` — no upstream source identity preserved.
    pub fn source_name(&self) -> Option<&DataSourceName> {
        match self {
            FragmentRef::ColumnRef(r) => Some(&r.source_name),
            FragmentRef::Stacked(frags) => frags.first().map(|f| &f.source_name),
            FragmentRef::Materialized(_) => None,
        }
    }

    /// Get the physical (source) column name for a canonical column name.
    /// Checks both the column_mappings (subject, time, components) and the
    /// value_column/canonical_name pair.
    /// Returns None if not a ColumnRef or the canonical name isn't mapped.
    /// For Materialized: physical = canonical (already renamed).
    pub fn physical_column_name(&self, canonical: &str) -> Option<String> {
        match self {
            FragmentRef::ColumnRef(r) => {
                // Check value column
                if r.canonical_name.as_str() == canonical {
                    return Some(r.value_column.as_str().to_string());
                }
                // Check column mappings (subject, time, components)
                r.column_mappings
                    .iter()
                    .find(|(_, canon)| canon.as_str() == canonical)
                    .map(|(src, _)| src.as_str().to_string())
            }
            _ => Some(canonical.to_string()),
        }
    }

    /// Whether this fragment has a deferred transform.
    pub fn has_transform(&self) -> bool {
        match self {
            FragmentRef::ColumnRef(r) => r.transform.is_some(),
            FragmentRef::Stacked(frags) => frags.iter().any(|f| f.transform.is_some()),
            FragmentRef::Materialized(_) => false,
        }
    }

    /// Diagnostic description.
    pub fn storage_description(&self) -> StorageDescription {
        match self {
            FragmentRef::ColumnRef(r) => StorageDescription {
                kind: "column_ref".to_string(),
                source_count: 1,
                sources: vec![r.source_name.to_string()],
                rows: r.source.height(),
            },
            FragmentRef::Stacked(frags) => StorageDescription {
                kind: "stacked".to_string(),
                source_count: frags.len(),
                sources: frags.iter().map(|f| f.source_name.to_string()).collect(),
                rows: frags.iter().map(|f| f.source.height()).sum(),
            },
            FragmentRef::Materialized(df) => StorageDescription {
                kind: "materialized".to_string(),
                source_count: 1,
                sources: vec![],
                rows: df.height(),
            },
        }
    }
}

// ============================================================================
// Memory Measurement (memuse)
// ============================================================================

impl DynamicUsage for FragmentRef {
    fn dynamic_usage(&self) -> usize {
        match self {
            // ColumnRef: the fragment itself owns no data — just an Arc pointer
            // and some small vecs for column mappings. The source DataFrame
            // is shared and counted separately.
            FragmentRef::ColumnRef(r) => {
                // Column mappings vec overhead
                r.column_mappings.len()
                    * std::mem::size_of::<(SourceColumnName, CanonicalColumnName)>()
                    + r.value_column.as_str().len()
                    + r.canonical_name.as_str().len()
                    + r.source_name.as_str().len()
            }
            FragmentRef::Stacked(frags) => frags
                .iter()
                .map(|f| {
                    f.column_mappings.len()
                        * std::mem::size_of::<(SourceColumnName, CanonicalColumnName)>()
                        + f.value_column.as_str().len()
                        + f.canonical_name.as_str().len()
                        + f.source_name.as_str().len()
                })
                .sum(),
            // Materialized: owns the DataFrame data on the heap.
            FragmentRef::Materialized(df) => df.estimated_size(),
        }
    }

    fn dynamic_usage_bounds(&self) -> (usize, Option<usize>) {
        let usage = self.dynamic_usage();
        (usage, Some(usage))
    }
}

impl DynamicUsage for ColumnRefData {
    fn dynamic_usage(&self) -> usize {
        // Only counts the metadata — the Arc<DataFrame> is shared
        self.column_mappings.len() * std::mem::size_of::<(SourceColumnName, CanonicalColumnName)>()
            + self.value_column.as_str().len()
            + self.canonical_name.as_str().len()
            + self.source_name.as_str().len()
    }

    fn dynamic_usage_bounds(&self) -> (usize, Option<usize>) {
        let usage = self.dynamic_usage();
        (usage, Some(usage))
    }
}

/// Diagnostic info about how a fragment's data is stored.
#[derive(Debug, Clone, Serialize)]
pub struct StorageDescription {
    pub kind: String,
    pub source_count: usize,
    pub sources: Vec<String>,
    pub rows: usize,
}

/// Build select expressions with source→canonical renaming.
fn build_select_exprs(
    column_mappings: &[(SourceColumnName, CanonicalColumnName)],
    value_source: &SourceColumnName,
    value_canonical: &CanonicalColumnName,
) -> Vec<Expr> {
    let mut exprs: Vec<Expr> = column_mappings
        .iter()
        .map(|(src, canon)| col(src.as_str()).alias(canon.as_str()))
        .collect();
    exprs.push(col(value_source.as_str()).alias(value_canonical.as_str()));
    exprs
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_source_df() -> DataFrame {
        DataFrame::new(vec![
            Column::new("station_id".into(), &["A", "A", "B", "B"]),
            Column::new("obs_time".into(), &[1i64, 2, 1, 2]),
            Column::new("sump_reading".into(), &[1.0f64, 2.0, 3.0, 4.0]),
            Column::new("discharge_reading".into(), &[10.0f64, 20.0, 30.0, 40.0]),
        ])
        .unwrap()
    }

    fn standard_mappings() -> Vec<(SourceColumnName, CanonicalColumnName)> {
        vec![
            (
                SourceColumnName::new("station_id"),
                CanonicalColumnName::new("station_name"),
            ),
            (
                SourceColumnName::new("obs_time"),
                CanonicalColumnName::new("timestamp"),
            ),
        ]
    }

    #[test]
    fn test_column_ref_selects_and_renames() {
        let source = Arc::new(make_source_df());
        let storage = FragmentRef::ColumnRef(ColumnRefData {
            source: source.clone(),
            value_column: SourceColumnName::new("sump_reading"),
            canonical_name: CanonicalColumnName::new("sump"),
            source_name: DataSourceName::new("scada"),
            column_mappings: standard_mappings(),
            transform: None,
        });

        let df = storage.as_dataframe().unwrap();
        assert_eq!(df.width(), 3); // station_name, timestamp, sump
        assert_eq!(df.height(), 4);
        assert!(df.column("sump").is_ok());
        assert!(df.column("station_name").is_ok());
        assert!(df.column("timestamp").is_ok());
        // Source names should not be in output
        assert!(df.column("station_id").is_err());
        assert!(df.column("sump_reading").is_err());
    }

    #[test]
    fn test_stacked_combines_fragments() {
        let source_a = Arc::new(
            DataFrame::new(vec![
                Column::new("station_id".into(), &["A", "A"]),
                Column::new("obs_time".into(), &[1i64, 2]),
                Column::new("sump_reading".into(), &[1.0f64, 2.0]),
            ])
            .unwrap(),
        );

        let source_b = Arc::new(
            DataFrame::new(vec![
                Column::new("station_id".into(), &["B", "B"]),
                Column::new("obs_time".into(), &[1i64, 2]),
                Column::new("sump_reading".into(), &[3.0f64, 4.0]),
            ])
            .unwrap(),
        );

        let mappings = standard_mappings();
        let storage = FragmentRef::Stacked(vec![
            StackedFragment {
                source: source_a,
                value_column: SourceColumnName::new("sump_reading"),
                canonical_name: CanonicalColumnName::new("sump"),
                source_name: DataSourceName::new("store"),
                column_mappings: mappings.clone(),
                transform: None,
            },
            StackedFragment {
                source: source_b,
                value_column: SourceColumnName::new("sump_reading"),
                canonical_name: CanonicalColumnName::new("sump"),
                source_name: DataSourceName::new("adhoc"),
                column_mappings: mappings,
                transform: None,
            },
        ]);

        let df = storage.as_dataframe().unwrap();
        assert_eq!(df.height(), 4);
        assert_eq!(df.width(), 3);
        assert!(df.column("sump").is_ok());
        assert!(df.column("station_name").is_ok());
    }

    #[test]
    fn test_materialized_returns_as_is() {
        let df = DataFrame::new(vec![
            Column::new("station_name".into(), &["A"]),
            Column::new("timestamp".into(), &[1i64]),
            Column::new("engine".into(), &["1"]),
            Column::new("engines_on_count".into(), &[1.0f64]),
        ])
        .unwrap();

        let storage = FragmentRef::Materialized(df.clone());
        let result = storage.as_dataframe().unwrap();
        assert_eq!(result.height(), 1);
        assert_eq!(result.width(), 4);
    }

    #[test]
    fn test_column_ref_shared_across_measurements() {
        let source = Arc::new(make_source_df());
        let mappings = standard_mappings();

        let sump_ref = FragmentRef::ColumnRef(ColumnRefData {
            source: source.clone(),
            value_column: SourceColumnName::new("sump_reading"),
            canonical_name: CanonicalColumnName::new("sump"),
            source_name: DataSourceName::new("scada"),
            column_mappings: mappings.clone(),
            transform: None,
        });

        let discharge_ref = FragmentRef::ColumnRef(ColumnRefData {
            source: source.clone(),
            value_column: SourceColumnName::new("discharge_reading"),
            canonical_name: CanonicalColumnName::new("discharge"),
            source_name: DataSourceName::new("scada"),
            column_mappings: mappings,
            transform: None,
        });

        // Both reference the same source — Arc refcount is 3
        assert_eq!(Arc::strong_count(&source), 3);

        let sump_df = sump_ref.as_dataframe().unwrap();
        let discharge_df = discharge_ref.as_dataframe().unwrap();

        assert!(sump_df.column("sump").is_ok());
        assert!(discharge_df.column("discharge").is_ok());
        // Each has its own measurement column
        assert!(sump_df.column("discharge").is_err());
        assert!(discharge_df.column("sump").is_err());
    }

    #[test]
    fn test_lazy_produces_same_result() {
        let source = Arc::new(make_source_df());
        let storage = FragmentRef::ColumnRef(ColumnRefData {
            source,
            value_column: SourceColumnName::new("sump_reading"),
            canonical_name: CanonicalColumnName::new("sump"),
            source_name: DataSourceName::new("scada"),
            column_mappings: standard_mappings(),
            transform: None,
        });

        let eager = storage.as_dataframe().unwrap();
        let lazy = storage.as_lazy().unwrap().collect().unwrap();

        assert_eq!(eager.height(), lazy.height());
        assert_eq!(eager.width(), lazy.width());
        assert_eq!(eager.get_column_names(), lazy.get_column_names());
    }
}