1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::Arc;
4
5use crate::config::{NegativeStrategy, SamplerConfig, Selector, TripletRecipe};
6use crate::data::{DataRecord, QualityScore, RecordSection, SectionRole};
7use crate::errors::SamplerError;
8use crate::source::indexing::file_corpus::FileCorpusIndex;
9use crate::source::{DataSource, SourceCursor, SourceSnapshot};
10use crate::types::{CategoryId, SourceId, TaxonomyValue};
11use crate::utils::{file_times, is_text_file};
12use crate::utils::{make_section, normalize_inline_whitespace};
13
// Names of the built-in triplet recipes assembled by
// `default_title_context_triplet_recipes`.

/// Name of the date-aware recipe whose negative is a `Context` section selected with
/// `NegativeStrategy::WrongPublicationDate`.
const FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE: &str = "title_context_wrong_date";
/// Name of the date-aware recipe whose negative is an `Anchor` section selected with
/// `NegativeStrategy::WrongPublicationDate`.
const FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE: &str = "title_anchor_wrong_date";
/// Name of the recipe whose negative is a `Context` section selected with
/// `NegativeStrategy::WrongArticle`.
const FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE: &str = "title_context_wrong_article";
/// Name of the recipe whose negative is an `Anchor` section selected with
/// `NegativeStrategy::WrongArticle`.
const FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE: &str = "title_anchor_wrong_article";
18
/// Shared callback that derives a record's taxonomy.
///
/// `FileSource` invokes it as `(root, file_path, source_id)` and stores the returned
/// values on the record. When resolving per-category trust, `FileSource` skips the
/// first returned value and matches the remaining ones (lowercased) against the
/// configured category names; the default builder, [`taxonomy_from_path`], puts the
/// source id in that first slot.
pub type TaxonomyBuilder =
    Arc<dyn Fn(&Path, &Path, &SourceId) -> Vec<TaxonomyValue> + Send + Sync + 'static>;
22
/// Shared callback that turns a file into record sections.
///
/// `FileSource` invokes it as `(title, body)`, where `title` is the normalized file-stem
/// title and `body` is the whitespace-normalized file content. The default builder is
/// [`anchor_context_sections`].
pub type SectionBuilder = Arc<dyn Fn(&str, &str) -> Vec<RecordSection> + Send + Sync + 'static>;
25
/// Configuration for a [`FileSource`].
///
/// Construct it with [`FileSourceConfig::new`] and adjust it through the `with_*`
/// builder methods; all fields are also public.
#[derive(Clone)]
pub struct FileSourceConfig {
    /// Identifier of the source; copied onto every record and returned by `DataSource::id`.
    pub source_id: SourceId,
    /// Root directory handed to `FileCorpusIndex`; the default taxonomy builder derives
    /// categories from paths relative to it.
    pub root: PathBuf,
    /// Trust score used for a record when no entry of `category_trust` matches its taxonomy.
    pub trust: f32,
    /// Per-category trust overrides. `with_category_trust` stores keys lowercased, and
    /// lookups lowercase each taxonomy segment, so keys inserted directly must already
    /// be lowercase to match.
    pub category_trust: HashMap<CategoryId, f32>,
    /// Forwarded to `FileCorpusIndex::with_follow_links`
    /// (presumably controls symlink traversal — confirm against the index).
    pub follow_links: bool,
    /// Forwarded to `FileCorpusIndex::with_text_files_only`. Independently of this flag,
    /// `FileSource` skips every path for which `is_text_file` returns `false`.
    pub text_files_only: bool,
    /// Forwarded to `FileCorpusIndex::with_directory_grouping`.
    pub group_by_directory: bool,
    /// Passed to `FileCorpusIndex::normalized_title_from_stem`; with `true` a stem such as
    /// `What_is_delta` yields the title `What is delta`, with `false` it is kept as-is.
    pub title_replace_underscores: bool,
    /// Records whether the date-aware default recipes were requested. Only written in this
    /// file (by `new` and `with_date_aware_default_recipe`); the effective recipes live in
    /// `default_triplet_recipes`.
    pub include_date_aware_default_recipe: bool,
    /// Optional directory forwarded to `FileCorpusIndex::with_index_dir`; when `None` the
    /// index is built without that call.
    pub index_dir: Option<PathBuf>,
    /// Recipes returned by `DataSource::default_triplet_recipes` for this source.
    pub default_triplet_recipes: Vec<TripletRecipe>,
    /// Callback that produces each record's taxonomy.
    pub taxonomy_builder: TaxonomyBuilder,
    /// Callback that produces each record's sections from its title and body.
    pub section_builder: SectionBuilder,
}
59
60impl FileSourceConfig {
61 pub fn new(source_id: impl Into<SourceId>, root: impl Into<PathBuf>) -> Self {
63 Self {
64 source_id: source_id.into(),
65 root: root.into(),
66 trust: 0.85,
67 category_trust: HashMap::new(),
68 follow_links: true,
69 text_files_only: false,
70 group_by_directory: true,
71 title_replace_underscores: true,
72 include_date_aware_default_recipe: false,
73 index_dir: None,
74 default_triplet_recipes: default_title_context_triplet_recipes(false),
75 taxonomy_builder: Arc::new(taxonomy_from_path),
76 section_builder: Arc::new(anchor_context_sections),
77 }
78 }
79
80 pub fn with_trust(mut self, trust: f32) -> Self {
82 self.trust = trust;
83 self
84 }
85
86 pub fn with_category_trust(mut self, category: impl Into<String>, trust: f32) -> Self {
88 self.category_trust
89 .insert(category.into().to_lowercase(), trust);
90 self
91 }
92
93 pub fn with_follow_links(mut self, follow_links: bool) -> Self {
95 self.follow_links = follow_links;
96 self
97 }
98
99 pub fn with_text_files_only(mut self, text_files_only: bool) -> Self {
101 self.text_files_only = text_files_only;
102 self
103 }
104
105 pub fn with_directory_grouping(mut self, group_by_directory: bool) -> Self {
107 self.group_by_directory = group_by_directory;
108 self
109 }
110
111 pub fn with_title_replace_underscores(mut self, replace_underscores: bool) -> Self {
113 self.title_replace_underscores = replace_underscores;
114 self
115 }
116
117 pub fn with_date_aware_default_recipe(mut self, include: bool) -> Self {
122 self.include_date_aware_default_recipe = include;
123 self.default_triplet_recipes = default_title_context_triplet_recipes(include);
124 self
125 }
126
127 pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
129 self.index_dir = Some(index_dir.into());
130 self
131 }
132
133 pub fn with_default_triplet_recipes(mut self, recipes: Vec<TripletRecipe>) -> Self {
135 self.default_triplet_recipes = recipes;
136 self
137 }
138
139 pub fn with_taxonomy_builder(mut self, taxonomy_builder: TaxonomyBuilder) -> Self {
141 self.taxonomy_builder = taxonomy_builder;
142 self
143 }
144
145 pub fn with_section_builder(mut self, section_builder: SectionBuilder) -> Self {
147 self.section_builder = section_builder;
148 self
149 }
150}
151
152pub fn default_title_context_triplet_recipes(include_date_aware: bool) -> Vec<TripletRecipe> {
157 let mut recipes = Vec::new();
158 if include_date_aware {
159 recipes.push(TripletRecipe {
162 name: FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE.into(),
163 anchor: Selector::Role(SectionRole::Anchor),
164 positive_selector: Selector::Role(SectionRole::Context),
165 negative_selector: Selector::Role(SectionRole::Context),
166 negative_strategy: NegativeStrategy::WrongPublicationDate,
167 weight: 0.30,
168 instruction: None,
169 allow_same_anchor_positive: false,
170 });
171 recipes.push(TripletRecipe {
175 name: FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE.into(),
176 anchor: Selector::Role(SectionRole::Anchor),
177 positive_selector: Selector::Role(SectionRole::Context),
178 negative_selector: Selector::Role(SectionRole::Anchor),
179 negative_strategy: NegativeStrategy::WrongPublicationDate,
180 weight: 0.10,
181 instruction: None,
182 allow_same_anchor_positive: false,
183 });
184 }
185 recipes.push(TripletRecipe {
188 name: FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE.into(),
189 anchor: Selector::Role(SectionRole::Anchor),
190 positive_selector: Selector::Role(SectionRole::Context),
191 negative_selector: Selector::Role(SectionRole::Context),
192 negative_strategy: NegativeStrategy::WrongArticle,
193 weight: if include_date_aware { 0.35 } else { 0.75 },
194 instruction: None,
195 allow_same_anchor_positive: false,
196 });
197 recipes.push(TripletRecipe {
200 name: FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE.into(),
201 anchor: Selector::Role(SectionRole::Anchor),
202 positive_selector: Selector::Role(SectionRole::Context),
203 negative_selector: Selector::Role(SectionRole::Anchor),
204 negative_strategy: NegativeStrategy::WrongArticle,
205 weight: 0.25,
206 instruction: None,
207 allow_same_anchor_positive: false,
208 });
209 recipes
210}
211
/// `DataSource` implementation backed by files under a configured root directory.
///
/// Each accepted file becomes one `DataRecord` whose sections are derived from the
/// file-stem title and the file body.
pub struct FileSource {
    /// Settings supplied at construction time; read by every operation of the source.
    config: FileSourceConfig,
}
216
217impl FileSource {
218 pub fn new(config: FileSourceConfig) -> Self {
220 Self { config }
221 }
222
223 fn file_corpus_index(&self, sampler_seed: u64) -> FileCorpusIndex {
224 let mut index = FileCorpusIndex::new(&self.config.root, &self.config.source_id)
225 .with_sampler_seed(sampler_seed)
226 .with_follow_links(self.config.follow_links)
227 .with_text_files_only(self.config.text_files_only)
228 .with_directory_grouping(self.config.group_by_directory);
229
230 if let Some(index_dir) = &self.config.index_dir {
231 index = index.with_index_dir(index_dir.clone());
232 }
233
234 index
235 }
236
237 fn trust_for_taxonomy(&self, taxonomy: &[String]) -> f32 {
238 for segment in taxonomy.iter().skip(1) {
239 if let Some(weight) = self.config.category_trust.get(&segment.to_lowercase()) {
240 return *weight;
241 }
242 }
243 self.config.trust
244 }
245
246 fn build_record(&self, path: &Path) -> Result<Option<DataRecord>, SamplerError> {
247 if !is_text_file(path) {
248 return Ok(None);
249 }
250 let title = FileCorpusIndex::normalized_title_from_stem(
251 path,
252 &self.config.source_id,
253 self.config.title_replace_underscores,
254 )?;
255 if title.is_empty() {
256 return Ok(None);
257 }
258
259 let body_raw = std::fs::read_to_string(path)?;
260 let body = normalize_inline_whitespace(body_raw);
261 if body.is_empty() {
262 return Ok(None);
263 }
264
265 let taxonomy =
266 (self.config.taxonomy_builder)(&self.config.root, path, &self.config.source_id);
267 let sections = (self.config.section_builder)(&title, &body);
268 let trust = self.trust_for_taxonomy(&taxonomy);
269 let (created_at, updated_at) = file_times(path);
270
271 Ok(Some(DataRecord {
272 id: FileCorpusIndex::source_scoped_record_id(
273 &self.config.source_id,
274 &self.config.root,
275 path,
276 ),
277 source: self.config.source_id.clone(),
278 created_at,
279 updated_at,
280 quality: QualityScore { trust },
281 taxonomy,
282 sections,
283 meta_prefix: None,
284 }))
285 }
286}
287
288impl DataSource for FileSource {
289 fn id(&self) -> &str {
290 &self.config.source_id
291 }
292
293 fn refresh(
294 &self,
295 config: &SamplerConfig,
296 cursor: Option<&SourceCursor>,
297 limit: Option<usize>,
298 ) -> Result<SourceSnapshot, SamplerError> {
299 self.file_corpus_index(config.seed)
300 .refresh_indexable(cursor, limit, |path| self.build_record(path))
301 }
302
303 fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
304 self.file_corpus_index(config.seed)
305 .indexed_record_count()
306 .map(|count| count as u128)
307 }
308
309 fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
310 self.config.default_triplet_recipes.clone()
311 }
312}
313
314pub fn taxonomy_from_path(root: &Path, path: &Path, source_id: &SourceId) -> Vec<TaxonomyValue> {
318 let mut taxonomy = vec![source_id.to_string()];
319 if let Ok(rel) = path.strip_prefix(root)
320 && let Some(parent) = rel.parent()
321 {
322 for segment in parent.iter() {
323 taxonomy.push(segment.to_string_lossy().to_string());
324 }
325 }
326 taxonomy
327}
328
329pub fn anchor_context_sections(title: &str, body: &str) -> Vec<RecordSection> {
331 vec![
332 make_section(SectionRole::Anchor, None, title),
333 make_section(SectionRole::Context, None, body),
334 ]
335}
336
// Unit tests for `FileSource` and its configuration. Each test builds its corpus in a
// fresh temporary directory, so the tests do not share on-disk state.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{NegativeStrategy, Selector};
    use tempfile::tempdir;

    // Default sampler configuration with only the seed overridden.
    fn sampler_config(seed: u64) -> SamplerConfig {
        SamplerConfig {
            seed,
            ..SamplerConfig::default()
        }
    }

    // A caller-chosen source id is stamped onto the produced records.
    #[test]
    fn reads_records_without_default_source_id() {
        let temp = tempdir().unwrap();
        let category = temp.path().join("factual");
        std::fs::create_dir_all(&category).unwrap();
        std::fs::write(
            category.join("What_is_alpha.txt"),
            "Alpha measures risk-adjusted outperformance.",
        )
        .unwrap();

        let source = FileSource::new(FileSourceConfig::new("qa_custom", temp.path()));
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();

        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].source, "qa_custom");
    }

    // Records take the trust override of the directory (category) they live in.
    #[test]
    fn applies_category_trust_overrides() {
        let temp = tempdir().unwrap();
        let factual = temp.path().join("factual");
        let opinion = temp.path().join("opinionated");
        std::fs::create_dir_all(&factual).unwrap();
        std::fs::create_dir_all(&opinion).unwrap();
        std::fs::write(
            factual.join("What_is_beta.txt"),
            "Beta compares volatility.",
        )
        .unwrap();
        std::fs::write(
            opinion.join("Will_rates_fall.txt"),
            "Probably not this year.",
        )
        .unwrap();

        let source = FileSource::new(
            FileSourceConfig::new("qa_weighted", temp.path())
                .with_category_trust("factual", 0.95)
                .with_category_trust("opinionated", 0.6),
        );
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();

        let factual_record = snapshot
            .records
            .iter()
            .find(|record| record.taxonomy.iter().any(|value| value == "factual"))
            .unwrap();
        let opinion_record = snapshot
            .records
            .iter()
            .find(|record| record.taxonomy.iter().any(|value| value == "opinionated"))
            .unwrap();
        assert_eq!(factual_record.quality.trust, 0.95);
        assert_eq!(opinion_record.quality.trust, 0.6);
    }

    // A custom section builder and custom default recipes are both honoured.
    #[test]
    fn supports_custom_sections_and_default_recipes() {
        let temp = tempdir().unwrap();
        std::fs::write(
            temp.path().join("What_is_gamma.txt"),
            "Gamma measures convexity.",
        )
        .unwrap();

        let sections: SectionBuilder = Arc::new(|question, answer| {
            vec![
                make_section(SectionRole::Anchor, Some("Question"), question),
                make_section(SectionRole::Context, Some("Answer"), answer),
            ]
        });

        let recipes = vec![TripletRecipe {
            name: "question_answer".into(),
            anchor: Selector::Role(SectionRole::Anchor),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::QuestionAnswerMismatch,
            weight: 1.0,
            instruction: None,
            allow_same_anchor_positive: false,
        }];

        let source = FileSource::new(
            FileSourceConfig::new("qa_sections", temp.path())
                .with_section_builder(sections)
                .with_default_triplet_recipes(recipes.clone()),
        );

        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections.len(), 2);
        assert_eq!(source.default_triplet_recipes().len(), recipes.len());
    }

    // `new` ships only the wrong-article recipes, weighted 0.75 / 0.25.
    #[test]
    fn file_source_config_new_has_explicit_default_triplet_recipes() {
        let temp = tempdir().unwrap();
        let source = FileSource::new(FileSourceConfig::new("qa_defaults", temp.path()));
        let defaults = source.default_triplet_recipes();
        assert!(!defaults.is_empty());
        let names: Vec<&str> = defaults.iter().map(|recipe| recipe.name.as_ref()).collect();
        assert!(!names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE));
        assert!(!names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE));
        let summary_wrong_article = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE)
            .unwrap();
        let anchor_wrong_article = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE)
            .unwrap();
        assert_eq!(summary_wrong_article.weight, 0.75);
        assert_eq!(anchor_wrong_article.weight, 0.25);
    }

    // Enabling date awareness adds the wrong-date recipes and rebalances the weights.
    #[test]
    fn file_source_config_can_enable_date_aware_default_recipe() {
        let temp = tempdir().unwrap();
        let source = FileSource::new(
            FileSourceConfig::new("qa_defaults_with_date", temp.path())
                .with_date_aware_default_recipe(true),
        );
        let defaults = source.default_triplet_recipes();
        let names: Vec<&str> = defaults.iter().map(|recipe| recipe.name.as_ref()).collect();
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE));
        let summary_wrong_date = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE)
            .unwrap();
        let anchor_wrong_date = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE)
            .unwrap();
        let summary_wrong_article = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE)
            .unwrap();
        let anchor_wrong_article = defaults
            .iter()
            .find(|recipe| recipe.name == FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE)
            .unwrap();
        assert_eq!(summary_wrong_date.weight, 0.30);
        assert_eq!(anchor_wrong_date.weight, 0.10);
        assert_eq!(summary_wrong_article.weight, 0.35);
        assert_eq!(anchor_wrong_article.weight, 0.25);
    }

    // Explicit recipes replace the built-in defaults instead of extending them.
    #[test]
    fn file_source_config_override_replaces_default_triplet_recipes() {
        let temp = tempdir().unwrap();
        let custom = vec![TripletRecipe {
            name: "custom_only".into(),
            anchor: Selector::Role(SectionRole::Context),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::WrongArticle,
            weight: 1.0,
            instruction: None,
            allow_same_anchor_positive: false,
        }];
        let source = FileSource::new(
            FileSourceConfig::new("qa_defaults_override", temp.path())
                .with_default_triplet_recipes(custom.clone()),
        );
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 1);
        assert_eq!(recipes[0].name.as_ref(), "custom_only");
    }

    // Nested directories become taxonomy segments; paths outside the root yield only the id.
    #[test]
    fn taxonomy_from_path_handles_nested_and_non_descendant_paths() {
        let temp = tempdir().unwrap();
        let root = temp.path().join("root");
        std::fs::create_dir_all(root.join("topic/subtopic")).unwrap();

        let nested = root.join("topic/subtopic/doc.txt");
        let taxonomy = taxonomy_from_path(&root, &nested, &"qa_tax".to_string());
        assert_eq!(taxonomy, vec!["qa_tax", "topic", "subtopic"]);

        let outside = temp.path().join("outside.txt");
        let outside_taxonomy = taxonomy_from_path(&root, &outside, &"qa_tax".to_string());
        assert_eq!(outside_taxonomy, vec!["qa_tax"]);
    }

    // The default section builder emits the title as anchor and the body as context.
    #[test]
    fn anchor_context_sections_build_expected_roles_and_text() {
        let sections = anchor_context_sections("What is delta", "Delta is change over time.");
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].role, SectionRole::Anchor);
        assert_eq!(sections[0].text, "What is delta");
        assert_eq!(sections[1].role, SectionRole::Context);
        assert_eq!(sections[1].text, "Delta is change over time.");
    }

    // Underscore replacement in titles is on by default and can be switched off.
    #[test]
    fn title_replace_underscores_toggle_changes_anchor_title_text() {
        let temp = tempdir().unwrap();
        std::fs::write(
            temp.path().join("What_is_delta.txt"),
            "Delta captures directional change.",
        )
        .unwrap();

        let source_default =
            FileSource::new(FileSourceConfig::new("qa_title_default", temp.path()));
        let default_snapshot = source_default
            .refresh(&sampler_config(101), None, Some(1))
            .unwrap();
        assert_eq!(default_snapshot.records.len(), 1);
        assert_eq!(
            default_snapshot.records[0].sections[0].text,
            "What is delta"
        );

        let source_preserve = FileSource::new(
            FileSourceConfig::new("qa_title_preserve", temp.path())
                .with_title_replace_underscores(false),
        );
        let preserve_snapshot = source_preserve
            .refresh(&sampler_config(101), None, Some(1))
            .unwrap();
        assert_eq!(preserve_snapshot.records.len(), 1);
        assert_eq!(
            preserve_snapshot.records[0].sections[0].text,
            "What_is_delta"
        );
    }

    // `build_record` filters on `is_text_file` regardless of the `text_files_only` flag.
    #[test]
    fn refresh_skips_non_txt_files_even_when_text_only_disabled() {
        let temp = tempdir().unwrap();
        std::fs::write(temp.path().join("notes.md"), "markdown should be skipped").unwrap();
        std::fs::write(temp.path().join("doc.txt"), "plain text should be indexed").unwrap();

        let source = FileSource::new(
            FileSourceConfig::new("qa_filtering", temp.path()).with_text_files_only(false),
        );
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert!(snapshot.records[0].id.contains("doc.txt"));
    }

    // With no matching category the default trust applies; `id` and the record count
    // are reachable through the `DataSource` trait.
    #[test]
    fn trust_falls_back_to_default_and_count_and_id_are_exposed() {
        let temp = tempdir().unwrap();
        let docs = temp.path().join("docs");
        std::fs::create_dir_all(&docs).unwrap();
        std::fs::write(docs.join("alpha.txt"), "Alpha body.").unwrap();

        let source = FileSource::new(
            FileSourceConfig::new("qa_count", temp.path())
                .with_trust(0.42)
                .with_category_trust("factual", 0.95)
                .with_taxonomy_builder(Arc::new(|_, _, source_id| {
                    vec![source_id.clone(), "UNMATCHED".to_string()]
                })),
        );

        let seed_101 = sampler_config(101);
        let snapshot = source.refresh(&seed_101, None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].quality.trust, 0.42);
        assert_eq!(source.id(), "qa_count");
        assert_eq!(source.reported_record_count(&seed_101).unwrap(), 1);
    }

    // Same seed => same refresh order; a different seed => a different order. All three
    // sources share one source id so that the seed is the only varying input.
    #[test]
    fn sampler_seed_controls_file_source_refresh_order() {
        let temp = tempdir().unwrap();
        for idx in 0..12 {
            std::fs::write(
                temp.path().join(format!("doc_{idx:02}.txt")),
                format!("Body text for {idx}"),
            )
            .unwrap();
        }

        let source_a = FileSource::new(FileSourceConfig::new("seeded_a", temp.path()));
        let source_b = FileSource::new(FileSourceConfig::new("seeded_a", temp.path()));
        let source_c = FileSource::new(FileSourceConfig::new("seeded_a", temp.path()));

        let ids_a: Vec<String> = source_a
            .refresh(&sampler_config(11), None, Some(8))
            .unwrap()
            .records
            .into_iter()
            .map(|record| record.id)
            .collect();
        let ids_b: Vec<String> = source_b
            .refresh(&sampler_config(11), None, Some(8))
            .unwrap()
            .records
            .into_iter()
            .map(|record| record.id)
            .collect();
        let ids_c: Vec<String> = source_c
            .refresh(&sampler_config(29), None, Some(8))
            .unwrap()
            .records
            .into_iter()
            .map(|record| record.id)
            .collect();

        assert_eq!(ids_a, ids_b);
        assert_ne!(ids_a, ids_c);
    }

    // `refresh` persists the index store under the custom index directory.
    #[test]
    fn with_index_dir_persists_refresh_index_store_under_custom_directory() {
        let temp = tempdir().unwrap();
        let index_temp = tempdir().unwrap();
        std::fs::write(temp.path().join("alpha.txt"), "Alpha body").unwrap();

        let custom_index_dir = index_temp.path().join("custom_index_store");
        std::fs::create_dir_all(&custom_index_dir).unwrap();

        let source_id = "qa_custom_index_refresh".to_string();
        let source = FileSource::new(
            FileSourceConfig::new(source_id.clone(), temp.path())
                .with_index_dir(custom_index_dir.clone()),
        );

        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);

        let expected_store_path = FileCorpusIndex::index_store_path_for(
            Some(custom_index_dir.as_path()),
            temp.path(),
            &source_id,
        );
        assert!(
            expected_store_path.is_file(),
            "expected index store to exist at {}",
            expected_store_path.display()
        );
    }

    // `reported_record_count` alone also persists the index store under the custom directory.
    #[test]
    fn with_index_dir_persists_count_index_store_under_custom_directory() {
        let temp = tempdir().unwrap();
        let index_temp = tempdir().unwrap();
        std::fs::write(temp.path().join("alpha.txt"), "Alpha body").unwrap();

        let custom_index_dir = index_temp.path().join("custom_index_store");
        std::fs::create_dir_all(&custom_index_dir).unwrap();

        let source_id = "qa_custom_index_count".to_string();
        let source = FileSource::new(
            FileSourceConfig::new(source_id.clone(), temp.path())
                .with_index_dir(custom_index_dir.clone()),
        );

        assert_eq!(
            source.reported_record_count(&sampler_config(101)).unwrap(),
            1
        );

        let expected_store_path = FileCorpusIndex::index_store_path_for(
            Some(custom_index_dir.as_path()),
            temp.path(),
            &source_id,
        );
        assert!(
            expected_store_path.is_file(),
            "expected index store to exist at {}",
            expected_store_path.display()
        );
    }
}