Skip to main content

triplets_core/source/backends/
csv_source.rs

1use chrono::{DateTime, Utc};
2use std::path::PathBuf;
3
4use crate::config::{NegativeStrategy, SamplerConfig, Selector, TripletRecipe};
5use crate::data::{DataRecord, QualityScore, SectionRole};
6use crate::errors::SamplerError;
7use crate::source::{DataSource, IndexablePager, IndexableSource, SourceCursor, SourceSnapshot};
8use crate::types::SourceId;
9use crate::utils::{file_times, make_section, normalize_inline_whitespace};
10
/// Name of the role-mode recipe whose negative is selected by the `Context` role.
const CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE: &str = "csv_anchor_positive_wrong_article";
/// Name of the role-mode recipe whose negative is selected by the `Anchor` role.
const CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE: &str = "csv_anchor_anchor_wrong_article";
/// Default CSV text-columns-mode SimCSE-style recipe name.
pub const CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE: &str = "csv_text_simcse_wrong_article";
15
16/// Configuration for a CSV-backed data source.
17///
18/// Two modes are supported:
19///
20/// - **Role mode** — `anchor_column` is set (with an optional `positive_column`).
21///   Each row produces an `Anchor` section from `anchor_column` and a `Context`
22///   section from `positive_column` (or the anchor text when `positive_column` is
23///   absent).
24///
25/// - **Text mode** — only `text_column` is set.  Each row produces both an
26///   `Anchor` and a `Context` section from the same column (SimCSE-style).
27///
28/// `anchor_column` and `text_column` are mutually exclusive.
29///
30/// The CSV file **must** have a named header row.  Columns are always looked up
31/// by name, so a header-free file cannot be used with this source.
32#[derive(Clone, Debug)]
33pub struct CsvSourceConfig {
34    /// Stable source identifier used in records and persistence keys.
35    pub source_id: SourceId,
36    /// Path to the CSV file.
37    pub path: PathBuf,
38    /// Column name for anchor text.  Enables role mode when set.
39    ///
40    /// Mutually exclusive with `text_column`.
41    pub anchor_column: Option<String>,
42    /// Column name for positive/context text.  Used with `anchor_column`.
43    ///
44    /// When absent in role mode, the anchor text is reused as the context
45    /// (identical-positive fallback, suitable for contrastive pre-training).
46    pub positive_column: Option<String>,
47    /// Column name for single-text mode.
48    ///
49    /// Mutually exclusive with `anchor_column`.
50    pub text_column: Option<String>,
51    /// Trust/quality score assigned to every record from this source.
52    pub trust: f32,
53}
54
55impl CsvSourceConfig {
56    /// Create a config for a CSV source with the given identifier and path.
57    pub fn new(source_id: impl Into<SourceId>, path: impl Into<PathBuf>) -> Self {
58        Self {
59            source_id: source_id.into(),
60            path: path.into(),
61            anchor_column: None,
62            positive_column: None,
63            text_column: None,
64            trust: 0.85,
65        }
66    }
67
68    /// Set the column used as the anchor (enables role mode).
69    pub fn with_anchor_column(mut self, column: impl Into<String>) -> Self {
70        self.anchor_column = Some(column.into());
71        self
72    }
73
74    /// Set the column used as the positive/context (role mode only).
75    pub fn with_positive_column(mut self, column: impl Into<String>) -> Self {
76        self.positive_column = Some(column.into());
77        self
78    }
79
80    /// Set the column used as the single text field (enables text mode).
81    pub fn with_text_column(mut self, column: impl Into<String>) -> Self {
82        self.text_column = Some(column.into());
83        self
84    }
85
86    /// Override the default trust score.
87    pub fn with_trust(mut self, trust: f32) -> Self {
88        self.trust = trust;
89        self
90    }
91
92    fn is_role_mode(&self) -> bool {
93        self.anchor_column.is_some()
94    }
95
96    fn validate(&self) -> Result<(), SamplerError> {
97        if self.anchor_column.is_some() && self.text_column.is_some() {
98            return Err(SamplerError::Configuration(
99                "CsvSourceConfig: `anchor_column` and `text_column` are mutually exclusive"
100                    .to_string(),
101            ));
102        }
103        if self.anchor_column.is_none() && self.text_column.is_none() {
104            return Err(SamplerError::Configuration(
105                "CsvSourceConfig: one of `anchor_column` or `text_column` must be set".to_string(),
106            ));
107        }
108        if self.positive_column.is_some() && self.anchor_column.is_none() {
109            return Err(SamplerError::Configuration(
110                "CsvSourceConfig: `positive_column` requires `anchor_column` to be set".to_string(),
111            ));
112        }
113        Ok(())
114    }
115}
116
117/// Column-mapped CSV data source.
118///
119/// Reads all rows from a CSV file at construction and exposes them as
120/// [`DataRecord`]s.  Suitable for small-to-medium datasets that fit comfortably
121/// in memory.
122///
123/// ## Modes
124///
125/// Configure the source with either anchor/positive columns (role mode) or a
126/// single text column (text mode):
127///
128/// ```rust,no_run
129/// use triplets_core::source::{CsvSource, CsvSourceConfig};
130///
131/// // Role mode: explicit anchor + positive columns.
132/// let config = CsvSourceConfig::new("my_qna", "data/qna.csv")
133///     .with_anchor_column("question")
134///     .with_positive_column("answer")
135///     .with_trust(0.9);
136/// let source = CsvSource::new(config).unwrap();
137///
138/// // Text mode: single text column.
139/// let config2 = CsvSourceConfig::new("my_corpus", "data/corpus.csv")
140///     .with_text_column("text");
141/// let source2 = CsvSource::new(config2).unwrap();
142/// ```
143#[derive(Debug)]
144pub struct CsvSource {
145    config: CsvSourceConfig,
146    records: Vec<DataRecord>,
147}
148
149impl CsvSource {
150    /// Load a CSV source from the given configuration.
151    ///
152    /// Returns a `SamplerError::Configuration` error if the config is invalid,
153    /// or a `SamplerError::SourceUnavailable` error if the CSV file cannot be
154    /// opened or parsed.
155    pub fn new(config: CsvSourceConfig) -> Result<Self, SamplerError> {
156        config.validate()?;
157        let records = Self::load_records(&config)?;
158        Ok(Self { config, records })
159    }
160
161    fn load_records(config: &CsvSourceConfig) -> Result<Vec<DataRecord>, SamplerError> {
162        let (created_at, updated_at) = file_times(&config.path);
163
164        let mut reader = csv::ReaderBuilder::new()
165            .has_headers(true)
166            .flexible(false)
167            .trim(csv::Trim::All)
168            .from_path(&config.path)
169            .map_err(|err| SamplerError::SourceUnavailable {
170                source_id: config.source_id.clone(),
171                reason: format!("failed to open CSV file '{}': {err}", config.path.display()),
172            })?;
173
174        // Columns are selected by name against the header row.
175        let headers = reader
176            .headers()
177            .map_err(|err| SamplerError::SourceUnavailable {
178                source_id: config.source_id.clone(),
179                reason: format!(
180                    "failed to read CSV headers in '{}': {err}",
181                    config.path.display()
182                ),
183            })?
184            .clone();
185
186        // Pre-resolve column indices so we error early on bad config rather
187        // than silently skipping every row.
188        let anchor_idx = if let Some(col) = &config.anchor_column {
189            Some(column_index(&headers, col).ok_or_else(|| {
190                SamplerError::Configuration(format!(
191                    "anchor_column '{}' not found in CSV headers of '{}'",
192                    col,
193                    config.path.display()
194                ))
195            })?)
196        } else {
197            None
198        };
199
200        let positive_idx = if let Some(col) = &config.positive_column {
201            Some(column_index(&headers, col).ok_or_else(|| {
202                SamplerError::Configuration(format!(
203                    "positive_column '{}' not found in CSV headers of '{}'",
204                    col,
205                    config.path.display()
206                ))
207            })?)
208        } else {
209            None
210        };
211
212        let text_idx = if let Some(col) = &config.text_column {
213            Some(column_index(&headers, col).ok_or_else(|| {
214                SamplerError::Configuration(format!(
215                    "text_column '{}' not found in CSV headers of '{}'",
216                    col,
217                    config.path.display()
218                ))
219            })?)
220        } else {
221            None
222        };
223
224        let mut records = Vec::new();
225
226        let cols = ColumnIndices {
227            anchor: anchor_idx,
228            positive: positive_idx,
229            text: text_idx,
230        };
231
232        for (row_idx, result) in reader.records().enumerate() {
233            let row = result.map_err(|err| SamplerError::SourceUnavailable {
234                source_id: config.source_id.clone(),
235                reason: format!(
236                    "failed to read row {} in '{}': {err}",
237                    row_idx,
238                    config.path.display()
239                ),
240            })?;
241
242            if let Some(record) = build_record(config, &row, row_idx, &cols, created_at, updated_at)
243            {
244                records.push(record);
245            }
246        }
247
248        Ok(records)
249    }
250}
251
252/// Resolve a column name to its zero-based index in a header record.
253fn column_index(headers: &csv::StringRecord, name: &str) -> Option<usize> {
254    headers.iter().position(|h| h.eq_ignore_ascii_case(name))
255}
256
257/// Pre-resolved column indices for a CSV source.
258struct ColumnIndices {
259    anchor: Option<usize>,
260    positive: Option<usize>,
261    text: Option<usize>,
262}
263
264/// Build a [`DataRecord`] from a single CSV row.
265///
266/// Returns `None` when required column values are empty or missing.
267fn build_record(
268    config: &CsvSourceConfig,
269    row: &csv::StringRecord,
270    row_idx: usize,
271    cols: &ColumnIndices,
272    created_at: DateTime<Utc>,
273    updated_at: DateTime<Utc>,
274) -> Option<DataRecord> {
275    let id = format!("{}::row_{}", config.source_id, row_idx);
276
277    let sections = if config.is_role_mode() {
278        // Role mode: anchor + optional positive
279        let anchor_raw = cols.anchor.and_then(|i| row.get(i)).unwrap_or("");
280        let anchor_text = normalize_inline_whitespace(anchor_raw);
281        if anchor_text.is_empty() {
282            return None;
283        }
284
285        let positive_text = if let Some(pidx) = cols.positive {
286            let raw = row.get(pidx).unwrap_or("");
287            let normalized = normalize_inline_whitespace(raw);
288            if normalized.is_empty() {
289                return None;
290            }
291            normalized
292        } else {
293            // Fall back to anchor text as positive when no positive column is set.
294            anchor_text.clone()
295        };
296
297        let anchor_heading = config.anchor_column.as_deref();
298        let positive_heading = config
299            .positive_column
300            .as_deref()
301            .or(config.anchor_column.as_deref());
302
303        vec![
304            make_section(SectionRole::Anchor, anchor_heading, &anchor_text),
305            make_section(SectionRole::Context, positive_heading, &positive_text),
306        ]
307    } else {
308        // Text mode: single column used for both Anchor and Context (SimCSE pattern).
309        let raw = cols.text.and_then(|i| row.get(i)).unwrap_or("");
310        let text = normalize_inline_whitespace(raw);
311        if text.is_empty() {
312            return None;
313        }
314
315        let heading = config.text_column.as_deref();
316        vec![
317            make_section(SectionRole::Anchor, heading, &text),
318            make_section(SectionRole::Context, heading, &text),
319        ]
320    };
321
322    Some(DataRecord {
323        id,
324        source: config.source_id.clone(),
325        created_at,
326        updated_at,
327        quality: QualityScore {
328            trust: config.trust,
329        },
330        taxonomy: vec![config.source_id.clone()],
331        sections,
332        meta_prefix: None,
333    })
334}
335
336impl IndexableSource for CsvSource {
337    fn id(&self) -> &str {
338        &self.config.source_id
339    }
340
341    fn len_hint(&self) -> Option<usize> {
342        Some(self.records.len())
343    }
344
345    fn record_at(&self, idx: usize) -> Result<Option<DataRecord>, SamplerError> {
346        Ok(self.records.get(idx).cloned())
347    }
348}
349
350impl DataSource for CsvSource {
351    fn id(&self) -> &str {
352        &self.config.source_id
353    }
354
355    fn refresh(
356        &self,
357        _config: &SamplerConfig,
358        cursor: Option<&SourceCursor>,
359        limit: Option<usize>,
360    ) -> Result<SourceSnapshot, SamplerError> {
361        IndexablePager::new(&self.config.source_id).refresh(self, cursor, limit)
362    }
363
364    fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
365        Ok(self.records.len() as u128)
366    }
367
368    fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
369        if !self.config.is_role_mode() {
370            // Text mode: SimCSE-style recipe that allows same anchor/positive text.
371            // Dropout noise provides the necessary embedding variation between
372            // the two identical slots; the negative comes from a different record.
373            return vec![TripletRecipe {
374                name: CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE.into(),
375                anchor: Selector::Role(SectionRole::Anchor),
376                positive_selector: Selector::Role(SectionRole::Context),
377                negative_selector: Selector::Role(SectionRole::Context),
378                negative_strategy: NegativeStrategy::WrongArticle,
379                weight: 1.0,
380                instruction: None,
381                allow_same_anchor_positive: true,
382            }];
383        }
384
385        vec![
386            // Primary lane: context (positive) negatives for broad coverage.
387            TripletRecipe {
388                name: CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE.into(),
389                anchor: Selector::Role(SectionRole::Anchor),
390                positive_selector: Selector::Role(SectionRole::Context),
391                negative_selector: Selector::Role(SectionRole::Context),
392                negative_strategy: NegativeStrategy::WrongArticle,
393                weight: 0.75,
394                instruction: None,
395                allow_same_anchor_positive: false,
396            },
397            // Medium-hard lane: anchor-as-negative for discrimination pressure.
398            TripletRecipe {
399                name: CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE.into(),
400                anchor: Selector::Role(SectionRole::Anchor),
401                positive_selector: Selector::Role(SectionRole::Context),
402                negative_selector: Selector::Role(SectionRole::Anchor),
403                negative_strategy: NegativeStrategy::WrongArticle,
404                weight: 0.25,
405                instruction: None,
406                allow_same_anchor_positive: false,
407            },
408        ]
409    }
410}
411
#[cfg(test)]
mod tests {
    //! Unit tests for the CSV source: config validation, both column modes,
    //! default recipes, indexing, paging and error mapping.

    use super::*;
    use crate::config::SamplerConfig;
    use crate::source::DataSource;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // ---------------------------------------------------------------- helpers

    /// Write `content` to a fresh temporary file and return its handle.
    fn write_csv(content: &str) -> NamedTempFile {
        let mut f = NamedTempFile::new().unwrap();
        write!(f, "{content}").unwrap();
        f
    }

    /// Sampler configuration with a fixed seed.
    fn sampler_config() -> SamplerConfig {
        SamplerConfig {
            seed: 42,
            ..SamplerConfig::default()
        }
    }

    /// Text-mode config reading the `text` column of `file`.
    fn text_config(id: &'static str, file: &NamedTempFile) -> CsvSourceConfig {
        CsvSourceConfig::new(id, file.path()).with_text_column("text")
    }

    /// Role-mode config reading the `question` / `answer` columns of `file`.
    fn qna_config(id: &'static str, file: &NamedTempFile) -> CsvSourceConfig {
        CsvSourceConfig::new(id, file.path())
            .with_anchor_column("question")
            .with_positive_column("answer")
    }

    /// Load a text-mode source from `content`.
    ///
    /// The temporary file is dropped on return; `CsvSource::new` has already
    /// loaded every record into memory by then.
    fn text_source(id: &'static str, content: &str) -> CsvSource {
        let file = write_csv(content);
        CsvSource::new(text_config(id, &file)).unwrap()
    }

    /// Load a role-mode source with id `qna` from `content`.
    fn qna_source(content: &str) -> CsvSource {
        let file = write_csv(content);
        CsvSource::new(qna_config("qna", &file)).unwrap()
    }

    /// Refresh `source` without a cursor or a limit.
    fn full_snapshot(source: &CsvSource) -> SourceSnapshot {
        source.refresh(&sampler_config(), None, None).unwrap()
    }

    /// Assert that building a source from `config` fails with a
    /// `SamplerError::Configuration`.
    fn expect_configuration_error(config: CsvSourceConfig) {
        let err = CsvSource::new(config).unwrap_err();
        assert!(
            matches!(err, SamplerError::Configuration(_)),
            "expected Configuration error, got {err:?}"
        );
    }

    // ----------------------------------------------------------- construction

    #[test]
    fn rejects_anchor_and_text_columns_together() {
        let f = write_csv("anchor,text\nhello,world\n");
        expect_configuration_error(
            CsvSourceConfig::new("src", f.path())
                .with_anchor_column("anchor")
                .with_text_column("text"),
        );
    }

    #[test]
    fn rejects_missing_column_spec() {
        let f = write_csv("anchor,text\nhello,world\n");
        expect_configuration_error(CsvSourceConfig::new("src", f.path()));
    }

    #[test]
    fn rejects_positive_without_anchor() {
        let f = write_csv("anchor,text\nhello,world\n");
        expect_configuration_error(
            CsvSourceConfig::new("src", f.path()).with_positive_column("text"),
        );
    }

    #[test]
    fn rejects_missing_anchor_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        expect_configuration_error(
            CsvSourceConfig::new("src", f.path()).with_anchor_column("missing_col"),
        );
    }

    #[test]
    fn rejects_missing_text_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        expect_configuration_error(
            CsvSourceConfig::new("src", f.path()).with_text_column("missing_col"),
        );
    }

    #[test]
    fn rejects_missing_positive_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        expect_configuration_error(
            CsvSourceConfig::new("src", f.path())
                .with_anchor_column("question")
                .with_positive_column("missing_col"),
        );
    }

    // -------------------------------------------------------------- role mode

    #[test]
    fn role_mode_anchor_and_positive() {
        let source = qna_source("question,answer\nWhat is Rust?,A systems language.\n");

        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.source, "qna");
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[0].text, "What is Rust?");
        assert_eq!(record.sections[1].role, SectionRole::Context);
        assert_eq!(record.sections[1].text, "A systems language.");
    }

    #[test]
    fn role_mode_anchor_only_duplicates_to_context() {
        let f = write_csv("sentence\nHello world\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("anchors", f.path()).with_anchor_column("sentence"),
        )
        .unwrap();

        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[1].role, SectionRole::Context);
        // Context must mirror the anchor text.
        assert_eq!(record.sections[0].text, record.sections[1].text);
    }

    #[test]
    fn role_mode_skips_rows_with_empty_anchor() {
        let source = qna_source(
            "question,answer\n\
             What is Rust?,A systems language.\n\
             ,Missing anchor\n\
             What is Go?,A concurrent language.\n",
        );
        assert_eq!(full_snapshot(&source).records.len(), 2);
    }

    #[test]
    fn role_mode_skips_rows_with_empty_positive() {
        let source = qna_source(
            "question,answer\n\
             What is Rust?,A systems language.\n\
             What is Go?,\n",
        );
        assert_eq!(full_snapshot(&source).records.len(), 1);
    }

    // -------------------------------------------------------------- text mode

    #[test]
    fn text_mode_produces_identical_anchor_and_context() {
        let source = text_source("corpus", "text\nThe quick brown fox\n");

        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[1].role, SectionRole::Context);
        assert_eq!(record.sections[0].text, record.sections[1].text);
    }

    #[test]
    fn text_mode_skips_empty_rows() {
        let source = text_source("corpus", "text\nHello\n\nWorld\n");
        assert_eq!(full_snapshot(&source).records.len(), 2);
    }

    // -------------------------------------------------------- quality / trust

    #[test]
    fn applies_trust_score() {
        let f = write_csv("text\nHello world\n");
        let source = CsvSource::new(text_config("corpus", &f).with_trust(0.7)).unwrap();
        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records[0].quality.trust, 0.7);
    }

    // -------------------------------------------------------- default recipes

    #[test]
    fn text_mode_default_recipes_is_simcse() {
        let source = text_source("corpus", "text\nHello\n");
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 1);
        assert_eq!(recipes[0].name, CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE);
        assert!(
            recipes[0].allow_same_anchor_positive,
            "SimCSE recipe must allow same anchor/positive"
        );
    }

    #[test]
    fn role_mode_default_recipes_returns_two_recipes() {
        let source = qna_source("question,answer\nQ,A\n");
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 2);
        let names: Vec<&str> = recipes.iter().map(|r| r.name.as_ref()).collect();
        assert!(names.contains(&CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE));
        assert!(names.contains(&CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE));
        assert!(
            recipes.iter().all(|r| !r.allow_same_anchor_positive),
            "role-mode recipes must not allow same anchor/positive"
        );
    }

    // -------------------------------------------------------- IndexableSource

    #[test]
    fn len_hint_matches_loaded_record_count() {
        let source = text_source("corpus", "text\nAlpha\nBeta\nGamma\n");
        assert_eq!(source.len_hint(), Some(3));
    }

    #[test]
    fn record_at_returns_correct_record() {
        let source = qna_source("question,answer\nFirst?,Yes.\nSecond?,No.\n");
        let r0 = source.record_at(0).unwrap().unwrap();
        let r1 = source.record_at(1).unwrap().unwrap();
        assert_eq!(r0.sections[0].text, "First?");
        assert_eq!(r1.sections[0].text, "Second?");
        assert!(source.record_at(99).unwrap().is_none());
    }

    // --------------------------------------------------------- reported count

    #[test]
    fn reported_record_count_matches_loaded_records() {
        let source = text_source("corpus", "text\nAlpha\nBeta\n");
        let count = source.reported_record_count(&sampler_config()).unwrap();
        assert_eq!(count, 2);
    }

    // ------------------------------------------------------ stable record IDs

    #[test]
    fn record_ids_are_stable_across_refreshes() {
        let source = text_source("corpus", "text\nAlpha\nBeta\n");

        // IDs must be the same set (order may differ due to pager
        // permutation), so compare them sorted.
        let sorted_ids = |source: &CsvSource| {
            let mut ids: Vec<_> = full_snapshot(source)
                .records
                .iter()
                .map(|r| r.id.clone())
                .collect();
            ids.sort();
            ids
        };

        assert_eq!(sorted_ids(&source), sorted_ids(&source));
    }

    // -------------------------------------------------------------- source id

    #[test]
    fn source_id_is_propagated_to_records() {
        let source = text_source("my_source", "text\nHello\n");
        assert_eq!(DataSource::id(&source), "my_source");
        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records[0].source, "my_source");
    }

    // ------------------------------------------ case-insensitive column lookup

    #[test]
    fn column_lookup_is_case_insensitive() {
        // Lower-case lookup against mixed-case headers.
        let source = qna_source("Question,Answer\nWhat is Rust?,A systems language.\n");
        let snapshot = full_snapshot(&source);
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections[0].text, "What is Rust?");
    }

    // ------------------------------------------------------- multi-row paging

    #[test]
    fn refresh_with_limit_returns_at_most_limit_records() {
        let source = text_source("corpus", "text\nA\nB\nC\nD\nE\n");
        let snapshot = source.refresh(&sampler_config(), None, Some(3)).unwrap();
        assert!(
            snapshot.records.len() <= 3,
            "expected at most 3 records, got {}",
            snapshot.records.len()
        );
    }

    // ------------------------------------------- validate: third error branch

    #[test]
    fn rejects_positive_and_text_column_without_anchor() {
        // positive_column + text_column (but no anchor_column) must reach the
        // third validate() check after passing the first two guards.
        let f = write_csv("text,answer\nhello,world\n");
        expect_configuration_error(text_config("src", &f).with_positive_column("answer"));
    }

    // ------------------------------------------------- file open failure path

    #[test]
    fn returns_source_unavailable_for_nonexistent_file() {
        // Exercises the from_path error closure.
        let err = CsvSource::new(
            CsvSourceConfig::new("src", "/nonexistent/does-not-exist.csv").with_text_column("text"),
        )
        .unwrap_err();
        assert!(
            matches!(err, SamplerError::SourceUnavailable { .. }),
            "expected SourceUnavailable, got {err:?}"
        );
    }

    // --------------------------------------------------- row parse error path

    #[test]
    fn returns_source_unavailable_for_malformed_row() {
        // With flexible(false), a data row that has more columns than the header
        // triggers a csv parse error, which must map to SourceUnavailable.
        let f = write_csv("question,answer\nWhat is Rust?,Good language.,extra_column\n");
        let err = CsvSource::new(qna_config("src", &f)).unwrap_err();
        assert!(
            matches!(err, SamplerError::SourceUnavailable { .. }),
            "expected SourceUnavailable for malformed row, got {err:?}"
        );
    }

    // ------------------------------ text mode: whitespace-only cell is skipped

    #[test]
    fn text_mode_skips_whitespace_only_cells() {
        // A cell containing only spaces is trimmed to "" by the csv reader
        // (Trim::All).  Our normalize_inline_whitespace("") returns "", so
        // the record is skipped via the empty-text guard in build_record.
        // Blank lines (just "\n") are silently dropped by the csv crate before
        // reaching build_record, so we need an actual whitespace-valued cell.
        let source = text_source("corpus", "text\nHello\n   \nWorld\n");
        assert_eq!(
            full_snapshot(&source).records.len(),
            2,
            "whitespace-only cell should be skipped"
        );
    }

    // -------------------------------------- IndexableSource::id() is reachable

    #[test]
    fn indexable_source_id_matches_config() {
        // Both DataSource and IndexableSource define `id()`; name the trait
        // explicitly so the IndexableSource implementation is exercised.
        let source = text_source("explicit_id", "text\nHello\n");
        assert_eq!(IndexableSource::id(&source), "explicit_id");
    }
}