1use chrono::{DateTime, Utc};
2use std::path::PathBuf;
3
4use crate::config::{NegativeStrategy, SamplerConfig, Selector, TripletRecipe};
5use crate::data::{DataRecord, QualityScore, SectionRole};
6use crate::errors::SamplerError;
7use crate::source::{DataSource, IndexablePager, IndexableSource, SourceCursor, SourceSnapshot};
8use crate::types::SourceId;
9use crate::utils::{file_times, make_section, normalize_inline_whitespace};
10
/// Name of the role-mode default recipe whose negative is selected from the context role.
const CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE: &str = "csv_anchor_positive_wrong_article";
/// Name of the role-mode default recipe whose negative is selected from the anchor role.
const CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE: &str = "csv_anchor_anchor_wrong_article";
/// Name of the single default recipe returned when the source runs in text mode
/// (`text_column` configured instead of `anchor_column`).
pub const CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE: &str = "csv_text_simcse_wrong_article";
15
/// Configuration for a [`CsvSource`].
///
/// Exactly one of `anchor_column` (role mode) or `text_column` (text mode) must be set;
/// `positive_column` is only accepted together with `anchor_column`. These rules are
/// enforced by `validate`, which `CsvSource::new` runs before reading the file.
#[derive(Clone, Debug)]
pub struct CsvSourceConfig {
    /// Identifier reported by the source; also copied into every record's `source` and
    /// `taxonomy`, and used as the prefix of each record id.
    pub source_id: SourceId,
    /// Location of the CSV file to load.
    pub path: PathBuf,
    /// Header name of the column that fills the anchor section (role mode).
    /// Matched against the file's headers ignoring ASCII case.
    pub anchor_column: Option<String>,
    /// Header name of the column that fills the context section in role mode.
    /// When unset, the anchor text is reused for the context section.
    pub positive_column: Option<String>,
    /// Header name of the column used for both the anchor and context sections (text mode).
    pub text_column: Option<String>,
    /// Trust value stored in every record's `quality.trust`; `new` initializes it to `0.85`.
    pub trust: f32,
}
54
55impl CsvSourceConfig {
56 pub fn new(source_id: impl Into<SourceId>, path: impl Into<PathBuf>) -> Self {
58 Self {
59 source_id: source_id.into(),
60 path: path.into(),
61 anchor_column: None,
62 positive_column: None,
63 text_column: None,
64 trust: 0.85,
65 }
66 }
67
68 pub fn with_anchor_column(mut self, column: impl Into<String>) -> Self {
70 self.anchor_column = Some(column.into());
71 self
72 }
73
74 pub fn with_positive_column(mut self, column: impl Into<String>) -> Self {
76 self.positive_column = Some(column.into());
77 self
78 }
79
80 pub fn with_text_column(mut self, column: impl Into<String>) -> Self {
82 self.text_column = Some(column.into());
83 self
84 }
85
86 pub fn with_trust(mut self, trust: f32) -> Self {
88 self.trust = trust;
89 self
90 }
91
92 fn is_role_mode(&self) -> bool {
93 self.anchor_column.is_some()
94 }
95
96 fn validate(&self) -> Result<(), SamplerError> {
97 if self.anchor_column.is_some() && self.text_column.is_some() {
98 return Err(SamplerError::Configuration(
99 "CsvSourceConfig: `anchor_column` and `text_column` are mutually exclusive"
100 .to_string(),
101 ));
102 }
103 if self.anchor_column.is_none() && self.text_column.is_none() {
104 return Err(SamplerError::Configuration(
105 "CsvSourceConfig: one of `anchor_column` or `text_column` must be set".to_string(),
106 ));
107 }
108 if self.positive_column.is_some() && self.anchor_column.is_none() {
109 return Err(SamplerError::Configuration(
110 "CsvSourceConfig: `positive_column` requires `anchor_column` to be set".to_string(),
111 ));
112 }
113 Ok(())
114 }
115}
116
/// Data source backed by a single CSV file.
///
/// All usable rows are converted to records once, when the source is constructed, and are
/// served from memory afterwards.
#[derive(Debug)]
pub struct CsvSource {
    /// Configuration the source was built from (already validated by `new`).
    config: CsvSourceConfig,
    /// Records built from the file's rows, in file order; skipped rows are not present.
    records: Vec<DataRecord>,
}
148
149impl CsvSource {
150 pub fn new(config: CsvSourceConfig) -> Result<Self, SamplerError> {
156 config.validate()?;
157 let records = Self::load_records(&config)?;
158 Ok(Self { config, records })
159 }
160
161 fn load_records(config: &CsvSourceConfig) -> Result<Vec<DataRecord>, SamplerError> {
162 let (created_at, updated_at) = file_times(&config.path);
163
164 let mut reader = csv::ReaderBuilder::new()
165 .has_headers(true)
166 .flexible(false)
167 .trim(csv::Trim::All)
168 .from_path(&config.path)
169 .map_err(|err| SamplerError::SourceUnavailable {
170 source_id: config.source_id.clone(),
171 reason: format!("failed to open CSV file '{}': {err}", config.path.display()),
172 })?;
173
174 let headers = reader
176 .headers()
177 .map_err(|err| SamplerError::SourceUnavailable {
178 source_id: config.source_id.clone(),
179 reason: format!(
180 "failed to read CSV headers in '{}': {err}",
181 config.path.display()
182 ),
183 })?
184 .clone();
185
186 let anchor_idx = if let Some(col) = &config.anchor_column {
189 Some(column_index(&headers, col).ok_or_else(|| {
190 SamplerError::Configuration(format!(
191 "anchor_column '{}' not found in CSV headers of '{}'",
192 col,
193 config.path.display()
194 ))
195 })?)
196 } else {
197 None
198 };
199
200 let positive_idx = if let Some(col) = &config.positive_column {
201 Some(column_index(&headers, col).ok_or_else(|| {
202 SamplerError::Configuration(format!(
203 "positive_column '{}' not found in CSV headers of '{}'",
204 col,
205 config.path.display()
206 ))
207 })?)
208 } else {
209 None
210 };
211
212 let text_idx = if let Some(col) = &config.text_column {
213 Some(column_index(&headers, col).ok_or_else(|| {
214 SamplerError::Configuration(format!(
215 "text_column '{}' not found in CSV headers of '{}'",
216 col,
217 config.path.display()
218 ))
219 })?)
220 } else {
221 None
222 };
223
224 let mut records = Vec::new();
225
226 let cols = ColumnIndices {
227 anchor: anchor_idx,
228 positive: positive_idx,
229 text: text_idx,
230 };
231
232 for (row_idx, result) in reader.records().enumerate() {
233 let row = result.map_err(|err| SamplerError::SourceUnavailable {
234 source_id: config.source_id.clone(),
235 reason: format!(
236 "failed to read row {} in '{}': {err}",
237 row_idx,
238 config.path.display()
239 ),
240 })?;
241
242 if let Some(record) = build_record(config, &row, row_idx, &cols, created_at, updated_at)
243 {
244 records.push(record);
245 }
246 }
247
248 Ok(records)
249 }
250}
251
252fn column_index(headers: &csv::StringRecord, name: &str) -> Option<usize> {
254 headers.iter().position(|h| h.eq_ignore_ascii_case(name))
255}
256
/// Header positions of the configured columns, resolved once per file.
/// A field is `None` when the corresponding column was not configured.
struct ColumnIndices {
    /// Position of `anchor_column` in the header row.
    anchor: Option<usize>,
    /// Position of `positive_column` in the header row.
    positive: Option<usize>,
    /// Position of `text_column` in the header row.
    text: Option<usize>,
}
263
264fn build_record(
268 config: &CsvSourceConfig,
269 row: &csv::StringRecord,
270 row_idx: usize,
271 cols: &ColumnIndices,
272 created_at: DateTime<Utc>,
273 updated_at: DateTime<Utc>,
274) -> Option<DataRecord> {
275 let id = format!("{}::row_{}", config.source_id, row_idx);
276
277 let sections = if config.is_role_mode() {
278 let anchor_raw = cols.anchor.and_then(|i| row.get(i)).unwrap_or("");
280 let anchor_text = normalize_inline_whitespace(anchor_raw);
281 if anchor_text.is_empty() {
282 return None;
283 }
284
285 let positive_text = if let Some(pidx) = cols.positive {
286 let raw = row.get(pidx).unwrap_or("");
287 let normalized = normalize_inline_whitespace(raw);
288 if normalized.is_empty() {
289 return None;
290 }
291 normalized
292 } else {
293 anchor_text.clone()
295 };
296
297 let anchor_heading = config.anchor_column.as_deref();
298 let positive_heading = config
299 .positive_column
300 .as_deref()
301 .or(config.anchor_column.as_deref());
302
303 vec![
304 make_section(SectionRole::Anchor, anchor_heading, &anchor_text),
305 make_section(SectionRole::Context, positive_heading, &positive_text),
306 ]
307 } else {
308 let raw = cols.text.and_then(|i| row.get(i)).unwrap_or("");
310 let text = normalize_inline_whitespace(raw);
311 if text.is_empty() {
312 return None;
313 }
314
315 let heading = config.text_column.as_deref();
316 vec![
317 make_section(SectionRole::Anchor, heading, &text),
318 make_section(SectionRole::Context, heading, &text),
319 ]
320 };
321
322 Some(DataRecord {
323 id,
324 source: config.source_id.clone(),
325 created_at,
326 updated_at,
327 quality: QualityScore {
328 trust: config.trust,
329 },
330 taxonomy: vec![config.source_id.clone()],
331 sections,
332 meta_prefix: None,
333 })
334}
335
336impl IndexableSource for CsvSource {
337 fn id(&self) -> &str {
338 &self.config.source_id
339 }
340
341 fn len_hint(&self) -> Option<usize> {
342 Some(self.records.len())
343 }
344
345 fn record_at(&self, idx: usize) -> Result<Option<DataRecord>, SamplerError> {
346 Ok(self.records.get(idx).cloned())
347 }
348}
349
350impl DataSource for CsvSource {
351 fn id(&self) -> &str {
352 &self.config.source_id
353 }
354
355 fn refresh(
356 &self,
357 _config: &SamplerConfig,
358 cursor: Option<&SourceCursor>,
359 limit: Option<usize>,
360 ) -> Result<SourceSnapshot, SamplerError> {
361 IndexablePager::new(&self.config.source_id).refresh(self, cursor, limit)
362 }
363
364 fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
365 Ok(self.records.len() as u128)
366 }
367
368 fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
369 if !self.config.is_role_mode() {
370 return vec![TripletRecipe {
374 name: CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE.into(),
375 anchor: Selector::Role(SectionRole::Anchor),
376 positive_selector: Selector::Role(SectionRole::Context),
377 negative_selector: Selector::Role(SectionRole::Context),
378 negative_strategy: NegativeStrategy::WrongArticle,
379 weight: 1.0,
380 instruction: None,
381 allow_same_anchor_positive: true,
382 }];
383 }
384
385 vec![
386 TripletRecipe {
388 name: CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE.into(),
389 anchor: Selector::Role(SectionRole::Anchor),
390 positive_selector: Selector::Role(SectionRole::Context),
391 negative_selector: Selector::Role(SectionRole::Context),
392 negative_strategy: NegativeStrategy::WrongArticle,
393 weight: 0.75,
394 instruction: None,
395 allow_same_anchor_positive: false,
396 },
397 TripletRecipe {
399 name: CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE.into(),
400 anchor: Selector::Role(SectionRole::Anchor),
401 positive_selector: Selector::Role(SectionRole::Context),
402 negative_selector: Selector::Role(SectionRole::Anchor),
403 negative_strategy: NegativeStrategy::WrongArticle,
404 weight: 0.25,
405 instruction: None,
406 allow_same_anchor_positive: false,
407 },
408 ]
409 }
410}
411
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::SamplerConfig;
    use crate::source::DataSource;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `content` to a new temporary file and returns the handle so a test can pass
    /// its path to the source under test.
    fn write_csv(content: &str) -> NamedTempFile {
        let mut f = NamedTempFile::new().unwrap();
        write!(f, "{content}").unwrap();
        f
    }

    /// Default sampler configuration with a fixed seed.
    fn sampler_config() -> SamplerConfig {
        SamplerConfig {
            seed: 42,
            ..SamplerConfig::default()
        }
    }

    // ── configuration validation ─────────────────────────────────────────────

    /// Setting both `anchor_column` and `text_column` is a configuration error.
    #[test]
    fn rejects_anchor_and_text_columns_together() {
        let f = write_csv("anchor,text\nhello,world\n");
        let err = CsvSource::new(
            CsvSourceConfig::new("src", f.path())
                .with_anchor_column("anchor")
                .with_text_column("text"),
        )
        .unwrap_err();
        assert!(
            matches!(err, SamplerError::Configuration(_)),
            "expected Configuration error, got {err:?}"
        );
    }

    /// A configuration that selects no column at all is rejected.
    #[test]
    fn rejects_missing_column_spec() {
        let f = write_csv("anchor,text\nhello,world\n");
        let err = CsvSource::new(CsvSourceConfig::new("src", f.path())).unwrap_err();
        assert!(matches!(err, SamplerError::Configuration(_)));
    }

    /// `positive_column` on its own (no anchor, no text) is rejected.
    #[test]
    fn rejects_positive_without_anchor() {
        let f = write_csv("anchor,text\nhello,world\n");
        let err =
            CsvSource::new(CsvSourceConfig::new("src", f.path()).with_positive_column("text"))
                .unwrap_err();
        assert!(matches!(err, SamplerError::Configuration(_)));
    }

    /// An anchor column that is not present in the header row is a configuration error.
    #[test]
    fn rejects_missing_anchor_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        let err =
            CsvSource::new(CsvSourceConfig::new("src", f.path()).with_anchor_column("missing_col"))
                .unwrap_err();
        assert!(matches!(err, SamplerError::Configuration(_)));
    }

    /// A text column that is not present in the header row is a configuration error.
    #[test]
    fn rejects_missing_text_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        let err =
            CsvSource::new(CsvSourceConfig::new("src", f.path()).with_text_column("missing_col"))
                .unwrap_err();
        assert!(matches!(err, SamplerError::Configuration(_)));
    }

    /// A positive column that is not present in the header row is a configuration error.
    #[test]
    fn rejects_missing_positive_column_in_file() {
        let f = write_csv("question,answer\nhello,world\n");
        let err = CsvSource::new(
            CsvSourceConfig::new("src", f.path())
                .with_anchor_column("question")
                .with_positive_column("missing_col"),
        )
        .unwrap_err();
        assert!(matches!(err, SamplerError::Configuration(_)));
    }

    // ── role mode ────────────────────────────────────────────────────────────

    /// Anchor and positive cells land in the anchor and context sections respectively.
    #[test]
    fn role_mode_anchor_and_positive() {
        let f = write_csv("question,answer\nWhat is Rust?,A systems language.\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();

        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.source, "qna");
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[0].text, "What is Rust?");
        assert_eq!(record.sections[1].role, SectionRole::Context);
        assert_eq!(record.sections[1].text, "A systems language.");
    }

    /// Without a positive column the anchor text is reused for the context section.
    #[test]
    fn role_mode_anchor_only_duplicates_to_context() {
        let f = write_csv("sentence\nHello world\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("anchors", f.path()).with_anchor_column("sentence"),
        )
        .unwrap();

        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[1].role, SectionRole::Context);
        assert_eq!(record.sections[0].text, record.sections[1].text);
    }

    /// The middle row has an empty anchor cell and must not produce a record.
    #[test]
    fn role_mode_skips_rows_with_empty_anchor() {
        let f = write_csv(
            "question,answer\n\
             What is Rust?,A systems language.\n\
             ,Missing anchor\n\
             What is Go?,A concurrent language.\n",
        );
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 2);
    }

    /// The last row has an empty positive cell and must not produce a record.
    #[test]
    fn role_mode_skips_rows_with_empty_positive() {
        let f = write_csv(
            "question,answer\n\
             What is Rust?,A systems language.\n\
             What is Go?,\n",
        );
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
    }

    // ── text mode ────────────────────────────────────────────────────────────

    /// In text mode the same cell text fills both the anchor and the context section.
    #[test]
    fn text_mode_produces_identical_anchor_and_context() {
        let f = write_csv("text\nThe quick brown fox\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();

        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        let record = &snapshot.records[0];
        assert_eq!(record.sections.len(), 2);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[1].role, SectionRole::Context);
        assert_eq!(record.sections[0].text, record.sections[1].text);
    }

    /// A blank line between two rows must not produce a record.
    #[test]
    fn text_mode_skips_empty_rows() {
        let f = write_csv("text\nHello\n\nWorld\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 2);
    }

    // ── record metadata ──────────────────────────────────────────────────────

    /// The configured trust value is copied into each record's quality score.
    #[test]
    fn applies_trust_score() {
        let f = write_csv("text\nHello world\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("corpus", f.path())
                .with_text_column("text")
                .with_trust(0.7),
        )
        .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        // Exact comparison is intended: the value is stored, not computed.
        assert_eq!(snapshot.records[0].quality.trust, 0.7);
    }

    // ── default recipes ──────────────────────────────────────────────────────

    /// Text mode exposes a single recipe that allows identical anchor and positive.
    #[test]
    fn text_mode_default_recipes_is_simcse() {
        let f = write_csv("text\nHello\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 1);
        assert_eq!(recipes[0].name, CSV_RECIPE_TEXT_SIMCSE_WRONG_ARTICLE);
        assert!(
            recipes[0].allow_same_anchor_positive,
            "SimCSE recipe must allow same anchor/positive"
        );
    }

    /// Role mode exposes both role-mode recipes, neither allowing identical anchor/positive.
    #[test]
    fn role_mode_default_recipes_returns_two_recipes() {
        let f = write_csv("question,answer\nQ,A\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 2);
        let names: Vec<&str> = recipes.iter().map(|r| r.name.as_ref()).collect();
        assert!(names.contains(&CSV_RECIPE_ANCHOR_POSITIVE_WRONG_ARTICLE));
        assert!(names.contains(&CSV_RECIPE_ANCHOR_ANCHOR_WRONG_ARTICLE));
        assert!(
            recipes.iter().all(|r| !r.allow_same_anchor_positive),
            "role-mode recipes must not allow same anchor/positive"
        );
    }

    // ── IndexableSource ──────────────────────────────────────────────────────

    /// `len_hint` reports the number of rows that became records.
    #[test]
    fn len_hint_matches_loaded_record_count() {
        let f = write_csv("text\nAlpha\nBeta\nGamma\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        assert_eq!(source.len_hint(), Some(3));
    }

    /// `record_at` returns records in file order and `None` past the end.
    #[test]
    fn record_at_returns_correct_record() {
        let f = write_csv("question,answer\nFirst?,Yes.\nSecond?,No.\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();
        let r0 = source.record_at(0).unwrap().unwrap();
        let r1 = source.record_at(1).unwrap().unwrap();
        assert_eq!(r0.sections[0].text, "First?");
        assert_eq!(r1.sections[0].text, "Second?");
        assert!(source.record_at(99).unwrap().is_none());
    }

    // ── DataSource ───────────────────────────────────────────────────────────

    /// `reported_record_count` equals the number of loaded records.
    #[test]
    fn reported_record_count_matches_loaded_records() {
        let f = write_csv("text\nAlpha\nBeta\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let count = source.reported_record_count(&sampler_config()).unwrap();
        assert_eq!(count, 2);
    }

    /// Two refreshes of the same source yield the same set of record ids.
    #[test]
    fn record_ids_are_stable_across_refreshes() {
        let f = write_csv("text\nAlpha\nBeta\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let ids_a: Vec<_> = source
            .refresh(&sampler_config(), None, None)
            .unwrap()
            .records
            .iter()
            .map(|r| r.id.clone())
            .collect();
        let ids_b: Vec<_> = source
            .refresh(&sampler_config(), None, None)
            .unwrap()
            .records
            .iter()
            .map(|r| r.id.clone())
            .collect();
        // Sorted before comparing so the check does not depend on refresh ordering.
        let mut sorted_a = ids_a.clone();
        let mut sorted_b = ids_b.clone();
        sorted_a.sort();
        sorted_b.sort();
        assert_eq!(sorted_a, sorted_b);
    }

    /// The configured id is reported by the source and stamped on its records.
    #[test]
    fn source_id_is_propagated_to_records() {
        let f = write_csv("text\nHello\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("my_source", f.path()).with_text_column("text"))
                .unwrap();
        assert_eq!(DataSource::id(&source), "my_source");
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records[0].source, "my_source");
    }

    // ── header matching ──────────────────────────────────────────────────────

    /// Lower-case column names in the config match capitalized headers in the file.
    #[test]
    fn column_lookup_is_case_insensitive() {
        let f = write_csv("Question,Answer\nWhat is Rust?,A systems language.\n");
        let source = CsvSource::new(
            CsvSourceConfig::new("qna", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections[0].text, "What is Rust?");
    }

    // ── paging ───────────────────────────────────────────────────────────────

    /// A refresh with a limit never returns more records than the limit.
    #[test]
    fn refresh_with_limit_returns_at_most_limit_records() {
        let f = write_csv("text\nA\nB\nC\nD\nE\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, Some(3)).unwrap();
        assert!(
            snapshot.records.len() <= 3,
            "expected at most 3 records, got {}",
            snapshot.records.len()
        );
    }

    // ── error paths ──────────────────────────────────────────────────────────

    /// `positive_column` combined with `text_column` (and no anchor) is rejected.
    #[test]
    fn rejects_positive_and_text_column_without_anchor() {
        let f = write_csv("text,answer\nhello,world\n");
        let err = CsvSource::new(
            CsvSourceConfig::new("src", f.path())
                .with_text_column("text")
                .with_positive_column("answer"),
        )
        .unwrap_err();
        assert!(
            matches!(err, SamplerError::Configuration(_)),
            "expected Configuration error, got {err:?}"
        );
    }

    /// A path that cannot be opened surfaces as `SourceUnavailable`.
    #[test]
    fn returns_source_unavailable_for_nonexistent_file() {
        let err = CsvSource::new(
            CsvSourceConfig::new("src", "/nonexistent/does-not-exist.csv").with_text_column("text"),
        )
        .unwrap_err();
        assert!(
            matches!(err, SamplerError::SourceUnavailable { .. }),
            "expected SourceUnavailable, got {err:?}"
        );
    }

    /// The data row has three fields under a two-column header; the reader is configured
    /// with `flexible(false)`, so the row is a read error.
    #[test]
    fn returns_source_unavailable_for_malformed_row() {
        let f = write_csv("question,answer\nWhat is Rust?,Good language.,extra_column\n");
        let err = CsvSource::new(
            CsvSourceConfig::new("src", f.path())
                .with_anchor_column("question")
                .with_positive_column("answer"),
        )
        .unwrap_err();
        assert!(
            matches!(err, SamplerError::SourceUnavailable { .. }),
            "expected SourceUnavailable for malformed row, got {err:?}"
        );
    }

    /// A line holding only whitespace must not produce a record.
    #[test]
    fn text_mode_skips_whitespace_only_cells() {
        let f = write_csv("text\nHello\n \nWorld\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("corpus", f.path()).with_text_column("text"))
                .unwrap();
        let snapshot = source.refresh(&sampler_config(), None, None).unwrap();
        assert_eq!(
            snapshot.records.len(),
            2,
            "whitespace-only cell should be skipped"
        );
    }

    /// The `IndexableSource` view of the source reports the configured id.
    #[test]
    fn indexable_source_id_matches_config() {
        let f = write_csv("text\nHello\n");
        let source =
            CsvSource::new(CsvSourceConfig::new("explicit_id", f.path()).with_text_column("text"))
                .unwrap();
        assert_eq!(IndexableSource::id(&source), "explicit_id");
    }
}