Skip to main content

triplets_core/source/backends/
file_source.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::Arc;
4
5use crate::config::{NegativeStrategy, SamplerConfig, Selector, TripletRecipe};
6use crate::data::{DataRecord, QualityScore, RecordSection, SectionRole};
7use crate::errors::SamplerError;
8use crate::source::indexing::file_corpus::FileCorpusIndex;
9use crate::source::{DataSource, SourceCursor, SourceSnapshot};
10use crate::types::{CategoryId, SourceId, TaxonomyValue};
11use crate::utils::{file_times, is_text_file};
12use crate::utils::{make_section, normalize_inline_whitespace};
13
// Names of the recipes produced by `default_title_context_triplet_recipes`.

/// Date-aware lane whose negative is taken from context sections.
const FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE: &str = "title_context_wrong_date";
/// Date-aware lane whose negative is taken from anchor sections.
const FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE: &str = "title_anchor_wrong_date";
/// Wrong-article lane whose negative is taken from context sections.
const FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE: &str = "title_context_wrong_article";
/// Wrong-article lane whose negative is taken from anchor sections.
const FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE: &str = "title_anchor_wrong_article";
18
19/// Builds taxonomy values from a root path and file path.
20pub type TaxonomyBuilder =
21    Arc<dyn Fn(&Path, &Path, &SourceId) -> Vec<TaxonomyValue> + Send + Sync + 'static>;
22
23/// Builds record sections from a normalized title and body.
24pub type SectionBuilder = Arc<dyn Fn(&str, &str) -> Vec<RecordSection> + Send + Sync + 'static>;
25
26/// Configuration for a generic filesystem-backed data source.
27#[derive(Clone)]
28pub struct FileSourceConfig {
29    /// Stable source identifier used in records and persistence keys.
30    pub source_id: SourceId,
31    /// Root directory containing source files.
32    pub root: PathBuf,
33    /// Default quality trust score applied to generated records.
34    pub trust: f32,
35    /// Optional trust overrides keyed by taxonomy segment.
36    pub category_trust: HashMap<CategoryId, f32>,
37    /// Whether to follow symlinks during index walking.
38    pub follow_links: bool,
39    /// Whether indexing should include only text files.
40    pub text_files_only: bool,
41    /// Whether deterministic directory grouping is enabled.
42    pub group_by_directory: bool,
43    /// Whether title extraction should replace underscores with spaces.
44    pub title_replace_underscores: bool,
45    /// Whether default recipe set includes the date-aware negative lane.
46    pub include_date_aware_default_recipe: bool,
47    /// Optional directory used for persisted file-corpus index stores.
48    ///
49    /// When `None`, file-corpus indexing uses the managed cache discovery root.
50    /// Set this in tests to keep index writes inside temporary directories.
51    pub index_dir: Option<PathBuf>,
52    /// Optional default recipes returned by this source.
53    pub default_triplet_recipes: Vec<TripletRecipe>,
54    /// Taxonomy builder invoked per file.
55    pub taxonomy_builder: TaxonomyBuilder,
56    /// Section builder invoked per file.
57    pub section_builder: SectionBuilder,
58}
59
60impl FileSourceConfig {
61    /// Create a config for a filesystem source with explicit id and root.
62    pub fn new(source_id: impl Into<SourceId>, root: impl Into<PathBuf>) -> Self {
63        Self {
64            source_id: source_id.into(),
65            root: root.into(),
66            trust: 0.85,
67            category_trust: HashMap::new(),
68            follow_links: true,
69            text_files_only: false,
70            group_by_directory: true,
71            title_replace_underscores: true,
72            include_date_aware_default_recipe: false,
73            index_dir: None,
74            default_triplet_recipes: default_title_context_triplet_recipes(false),
75            taxonomy_builder: Arc::new(taxonomy_from_path),
76            section_builder: Arc::new(anchor_context_sections),
77        }
78    }
79
80    /// Override default trust score.
81    pub fn with_trust(mut self, trust: f32) -> Self {
82        self.trust = trust;
83        self
84    }
85
86    /// Add a taxonomy-segment trust override.
87    pub fn with_category_trust(mut self, category: impl Into<String>, trust: f32) -> Self {
88        self.category_trust
89            .insert(category.into().to_lowercase(), trust);
90        self
91    }
92
93    /// Override whether symlinks are followed during index walk.
94    pub fn with_follow_links(mut self, follow_links: bool) -> Self {
95        self.follow_links = follow_links;
96        self
97    }
98
99    /// Override whether index walk includes only text files.
100    pub fn with_text_files_only(mut self, text_files_only: bool) -> Self {
101        self.text_files_only = text_files_only;
102        self
103    }
104
105    /// Enable or disable deterministic directory grouping.
106    pub fn with_directory_grouping(mut self, group_by_directory: bool) -> Self {
107        self.group_by_directory = group_by_directory;
108        self
109    }
110
111    /// Set whether title extraction replaces underscores with spaces.
112    pub fn with_title_replace_underscores(mut self, replace_underscores: bool) -> Self {
113        self.title_replace_underscores = replace_underscores;
114        self
115    }
116
117    /// Enable/disable the date-aware default recipe lane (`WrongPublicationDate`).
118    ///
119    /// "Date-aware" here uses publication-date metadata on records (for example
120    /// taxonomy/meta date fields), not filesystem timestamps from source files.
121    pub fn with_date_aware_default_recipe(mut self, include: bool) -> Self {
122        self.include_date_aware_default_recipe = include;
123        self.default_triplet_recipes = default_title_context_triplet_recipes(include);
124        self
125    }
126
127    /// Override the directory used to persist file-corpus index stores.
128    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
129        self.index_dir = Some(index_dir.into());
130        self
131    }
132
133    /// Set source-provided default triplet recipes.
134    pub fn with_default_triplet_recipes(mut self, recipes: Vec<TripletRecipe>) -> Self {
135        self.default_triplet_recipes = recipes;
136        self
137    }
138
139    /// Set a custom taxonomy builder.
140    pub fn with_taxonomy_builder(mut self, taxonomy_builder: TaxonomyBuilder) -> Self {
141        self.taxonomy_builder = taxonomy_builder;
142        self
143    }
144
145    /// Set a custom section builder.
146    pub fn with_section_builder(mut self, section_builder: SectionBuilder) -> Self {
147        self.section_builder = section_builder;
148        self
149    }
150}
151
152/// Default mixed-negative recipes used by `FileSource` title/body corpora.
153///
154/// When `include_date_aware` is enabled, the date-aware lane compares metadata
155/// publication dates, not filesystem mtime/ctime/atime values.
156pub fn default_title_context_triplet_recipes(include_date_aware: bool) -> Vec<TripletRecipe> {
157    let mut recipes = Vec::new();
158    if include_date_aware {
159        // Make date-aware summary negatives nearly as common as summary wrong-article
160        // negatives so temporal contrast is meaningfully represented.
161        recipes.push(TripletRecipe {
162            name: FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE.into(),
163            anchor: Selector::Role(SectionRole::Anchor),
164            positive_selector: Selector::Role(SectionRole::Context),
165            negative_selector: Selector::Role(SectionRole::Context),
166            negative_strategy: NegativeStrategy::WrongPublicationDate,
167            weight: 0.30,
168            instruction: None,
169            allow_same_anchor_positive: false,
170        });
171        // Keep a smaller anchor-negative date-aware lane for harder examples
172        // without overwhelming the primary summary-driven objectives.
173        // Date-aware means publication-date metadata comparison, not file mtime.
174        recipes.push(TripletRecipe {
175            name: FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE.into(),
176            anchor: Selector::Role(SectionRole::Anchor),
177            positive_selector: Selector::Role(SectionRole::Context),
178            negative_selector: Selector::Role(SectionRole::Anchor),
179            negative_strategy: NegativeStrategy::WrongPublicationDate,
180            weight: 0.10,
181            instruction: None,
182            allow_same_anchor_positive: false,
183        });
184    }
185    // Rebalance summary wrong-article depending on whether date-aware lanes are
186    // enabled so the full pool stays intentionally weighted in both modes.
187    recipes.push(TripletRecipe {
188        name: FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE.into(),
189        anchor: Selector::Role(SectionRole::Anchor),
190        positive_selector: Selector::Role(SectionRole::Context),
191        negative_selector: Selector::Role(SectionRole::Context),
192        negative_strategy: NegativeStrategy::WrongArticle,
193        weight: if include_date_aware { 0.35 } else { 0.75 },
194        instruction: None,
195        allow_same_anchor_positive: false,
196    });
197    // Medium-hard lane adds anchor-as-negative pressure to improve
198    // discrimination among title-like anchor fields.
199    recipes.push(TripletRecipe {
200        name: FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE.into(),
201        anchor: Selector::Role(SectionRole::Anchor),
202        positive_selector: Selector::Role(SectionRole::Context),
203        negative_selector: Selector::Role(SectionRole::Anchor),
204        negative_strategy: NegativeStrategy::WrongArticle,
205        weight: 0.25,
206        instruction: None,
207        allow_same_anchor_positive: false,
208    });
209    recipes
210}
211
212/// Generic filesystem-backed source with configurable taxonomy and section mapping.
213pub struct FileSource {
214    config: FileSourceConfig,
215}
216
217impl FileSource {
218    /// Create a generic file source from configuration.
219    pub fn new(config: FileSourceConfig) -> Self {
220        Self { config }
221    }
222
223    fn file_corpus_index(&self, sampler_seed: u64) -> FileCorpusIndex {
224        let mut index = FileCorpusIndex::new(&self.config.root, &self.config.source_id)
225            .with_sampler_seed(sampler_seed)
226            .with_follow_links(self.config.follow_links)
227            .with_text_files_only(self.config.text_files_only)
228            .with_directory_grouping(self.config.group_by_directory);
229
230        if let Some(index_dir) = &self.config.index_dir {
231            index = index.with_index_dir(index_dir.clone());
232        }
233
234        index
235    }
236
237    fn trust_for_taxonomy(&self, taxonomy: &[String]) -> f32 {
238        for segment in taxonomy.iter().skip(1) {
239            if let Some(weight) = self.config.category_trust.get(&segment.to_lowercase()) {
240                return *weight;
241            }
242        }
243        self.config.trust
244    }
245
246    fn build_record(&self, path: &Path) -> Result<Option<DataRecord>, SamplerError> {
247        if !is_text_file(path) {
248            return Ok(None);
249        }
250        let title = FileCorpusIndex::normalized_title_from_stem(
251            path,
252            &self.config.source_id,
253            self.config.title_replace_underscores,
254        )?;
255        if title.is_empty() {
256            return Ok(None);
257        }
258
259        let body_raw = std::fs::read_to_string(path)?;
260        let body = normalize_inline_whitespace(body_raw);
261        if body.is_empty() {
262            return Ok(None);
263        }
264
265        let taxonomy =
266            (self.config.taxonomy_builder)(&self.config.root, path, &self.config.source_id);
267        let sections = (self.config.section_builder)(&title, &body);
268        let trust = self.trust_for_taxonomy(&taxonomy);
269        let (created_at, updated_at) = file_times(path);
270
271        Ok(Some(DataRecord {
272            id: FileCorpusIndex::source_scoped_record_id(
273                &self.config.source_id,
274                &self.config.root,
275                path,
276            ),
277            source: self.config.source_id.clone(),
278            created_at,
279            updated_at,
280            quality: QualityScore { trust },
281            taxonomy,
282            sections,
283            meta_prefix: None,
284        }))
285    }
286}
287
288impl DataSource for FileSource {
289    fn id(&self) -> &str {
290        &self.config.source_id
291    }
292
293    fn refresh(
294        &self,
295        config: &SamplerConfig,
296        cursor: Option<&SourceCursor>,
297        limit: Option<usize>,
298    ) -> Result<SourceSnapshot, SamplerError> {
299        self.file_corpus_index(config.seed)
300            .refresh_indexable(cursor, limit, |path| self.build_record(path))
301    }
302
303    fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
304        self.file_corpus_index(config.seed)
305            .indexed_record_count()
306            .map(|count| count as u128)
307    }
308
309    fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
310        self.config.default_triplet_recipes.clone()
311    }
312}
313
314/// Build default taxonomy from the file path relative to `root`.
315///
316/// Output shape is `[source_id, <parent segments...>]`.
317pub fn taxonomy_from_path(root: &Path, path: &Path, source_id: &SourceId) -> Vec<TaxonomyValue> {
318    let mut taxonomy = vec![source_id.to_string()];
319    if let Ok(rel) = path.strip_prefix(root)
320        && let Some(parent) = rel.parent()
321    {
322        for segment in parent.iter() {
323            taxonomy.push(segment.to_string_lossy().to_string());
324        }
325    }
326    taxonomy
327}
328
329/// Build a default two-section payload of title anchor and body context.
330pub fn anchor_context_sections(title: &str, body: &str) -> Vec<RecordSection> {
331    vec![
332        make_section(SectionRole::Anchor, None, title),
333        make_section(SectionRole::Context, None, body),
334    ]
335}
336
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{NegativeStrategy, Selector};
    use tempfile::{TempDir, tempdir};

    /// Sampler configuration that differs from the default only by `seed`.
    fn sampler_config(seed: u64) -> SamplerConfig {
        SamplerConfig {
            seed,
            ..SamplerConfig::default()
        }
    }

    /// Look up a recipe by name, failing the test with the name if it is absent.
    fn recipe_named<'a>(recipes: &'a [TripletRecipe], name: &str) -> &'a TripletRecipe {
        recipes
            .iter()
            .find(|recipe| recipe.name == name)
            .unwrap_or_else(|| panic!("missing default recipe `{name}`"))
    }

    /// Temporary corpus root paired with a separate temporary index directory.
    ///
    /// `FileSourceConfig::index_dir` falls back to the managed cache root when
    /// unset, so every test that refreshes or counts builds its config through
    /// [`Corpus::config`] to keep persisted index stores inside directories
    /// that are removed when the test ends. The index directory is not nested
    /// under the corpus root, so the store file cannot be walked as a document.
    struct Corpus {
        root: TempDir,
        index: TempDir,
    }

    impl Corpus {
        fn new() -> Self {
            Self {
                root: tempdir().unwrap(),
                index: tempdir().unwrap(),
            }
        }

        fn root(&self) -> &Path {
            self.root.path()
        }

        fn index_dir(&self) -> &Path {
            self.index.path()
        }

        /// Write `body` to `relative` under the corpus root, creating parent
        /// directories as needed.
        fn write(&self, relative: &str, body: &str) {
            let target = self.root().join(relative);
            if let Some(parent) = target.parent() {
                std::fs::create_dir_all(parent).unwrap();
            }
            std::fs::write(target, body).unwrap();
        }

        /// Default config for this corpus with index writes redirected to the
        /// temporary index directory.
        fn config(&self, source_id: &str) -> FileSourceConfig {
            FileSourceConfig::new(source_id, self.root()).with_index_dir(self.index_dir())
        }
    }

    #[test]
    fn reads_records_without_default_source_id() {
        let corpus = Corpus::new();
        corpus.write(
            "factual/What_is_alpha.txt",
            "Alpha measures risk-adjusted outperformance.",
        );

        let source = FileSource::new(corpus.config("qa_custom"));
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();

        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].source, "qa_custom");
    }

    #[test]
    fn applies_category_trust_overrides() {
        let corpus = Corpus::new();
        corpus.write("factual/What_is_beta.txt", "Beta compares volatility.");
        corpus.write("opinionated/Will_rates_fall.txt", "Probably not this year.");

        let source = FileSource::new(
            corpus
                .config("qa_weighted")
                .with_category_trust("factual", 0.95)
                .with_category_trust("opinionated", 0.6),
        );
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();

        let factual_record = snapshot
            .records
            .iter()
            .find(|record| record.taxonomy.iter().any(|value| value == "factual"))
            .unwrap();
        let opinion_record = snapshot
            .records
            .iter()
            .find(|record| record.taxonomy.iter().any(|value| value == "opinionated"))
            .unwrap();
        assert_eq!(factual_record.quality.trust, 0.95);
        assert_eq!(opinion_record.quality.trust, 0.6);
    }

    #[test]
    fn supports_custom_sections_and_default_recipes() {
        let corpus = Corpus::new();
        corpus.write("What_is_gamma.txt", "Gamma measures convexity.");

        let sections: SectionBuilder = Arc::new(|question, answer| {
            vec![
                make_section(SectionRole::Anchor, Some("Question"), question),
                make_section(SectionRole::Context, Some("Answer"), answer),
            ]
        });

        let recipes = vec![TripletRecipe {
            name: "question_answer".into(),
            anchor: Selector::Role(SectionRole::Anchor),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::QuestionAnswerMismatch,
            weight: 1.0,
            instruction: None,
            allow_same_anchor_positive: false,
        }];

        let source = FileSource::new(
            corpus
                .config("qa_sections")
                .with_section_builder(sections)
                .with_default_triplet_recipes(recipes.clone()),
        );

        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections.len(), 2);
        assert_eq!(source.default_triplet_recipes().len(), recipes.len());
    }

    #[test]
    fn file_source_config_new_has_explicit_default_triplet_recipes() {
        // No refresh or count happens here, so no index store is written.
        let temp = tempdir().unwrap();
        let source = FileSource::new(FileSourceConfig::new("qa_defaults", temp.path()));
        let defaults = source.default_triplet_recipes();
        assert!(!defaults.is_empty());

        let names: Vec<&str> = defaults.iter().map(|recipe| recipe.name.as_ref()).collect();
        assert!(!names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE));
        assert!(!names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE));

        let summary_wrong_article =
            recipe_named(&defaults, FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE);
        let anchor_wrong_article = recipe_named(&defaults, FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE);
        assert_eq!(summary_wrong_article.weight, 0.75);
        assert_eq!(anchor_wrong_article.weight, 0.25);
    }

    #[test]
    fn file_source_config_can_enable_date_aware_default_recipe() {
        // No refresh or count happens here, so no index store is written.
        let temp = tempdir().unwrap();
        let source = FileSource::new(
            FileSourceConfig::new("qa_defaults_with_date", temp.path())
                .with_date_aware_default_recipe(true),
        );
        let defaults = source.default_triplet_recipes();

        let names: Vec<&str> = defaults.iter().map(|recipe| recipe.name.as_ref()).collect();
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE));
        assert!(names.contains(&FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE));
        assert!(names.contains(&FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE));

        let summary_wrong_date = recipe_named(&defaults, FILE_RECIPE_TITLE_CONTEXT_WRONG_DATE);
        let anchor_wrong_date = recipe_named(&defaults, FILE_RECIPE_TITLE_ANCHOR_WRONG_DATE);
        let summary_wrong_article =
            recipe_named(&defaults, FILE_RECIPE_TITLE_CONTEXT_WRONG_ARTICLE);
        let anchor_wrong_article = recipe_named(&defaults, FILE_RECIPE_TITLE_ANCHOR_WRONG_ARTICLE);
        assert_eq!(summary_wrong_date.weight, 0.30);
        assert_eq!(anchor_wrong_date.weight, 0.10);
        assert_eq!(summary_wrong_article.weight, 0.35);
        assert_eq!(anchor_wrong_article.weight, 0.25);
    }

    #[test]
    fn file_source_config_override_replaces_default_triplet_recipes() {
        // No refresh or count happens here, so no index store is written.
        let temp = tempdir().unwrap();
        let custom = vec![TripletRecipe {
            name: "custom_only".into(),
            anchor: Selector::Role(SectionRole::Context),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::WrongArticle,
            weight: 1.0,
            instruction: None,
            allow_same_anchor_positive: false,
        }];
        let source = FileSource::new(
            FileSourceConfig::new("qa_defaults_override", temp.path())
                .with_default_triplet_recipes(custom.clone()),
        );
        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 1);
        assert_eq!(recipes[0].name.as_ref(), "custom_only");
    }

    #[test]
    fn taxonomy_from_path_handles_nested_and_non_descendant_paths() {
        let temp = tempdir().unwrap();
        let root = temp.path().join("root");
        std::fs::create_dir_all(root.join("topic/subtopic")).unwrap();

        let nested = root.join("topic/subtopic/doc.txt");
        let taxonomy = taxonomy_from_path(&root, &nested, &"qa_tax".to_string());
        assert_eq!(taxonomy, vec!["qa_tax", "topic", "subtopic"]);

        let outside = temp.path().join("outside.txt");
        let outside_taxonomy = taxonomy_from_path(&root, &outside, &"qa_tax".to_string());
        assert_eq!(outside_taxonomy, vec!["qa_tax"]);
    }

    #[test]
    fn anchor_context_sections_build_expected_roles_and_text() {
        let sections = anchor_context_sections("What is delta", "Delta is change over time.");
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].role, SectionRole::Anchor);
        assert_eq!(sections[0].text, "What is delta");
        assert_eq!(sections[1].role, SectionRole::Context);
        assert_eq!(sections[1].text, "Delta is change over time.");
    }

    #[test]
    fn title_replace_underscores_toggle_changes_anchor_title_text() {
        let corpus = Corpus::new();
        corpus.write("What_is_delta.txt", "Delta captures directional change.");

        let source_default = FileSource::new(corpus.config("qa_title_default"));
        let default_snapshot = source_default
            .refresh(&sampler_config(101), None, Some(1))
            .unwrap();
        assert_eq!(default_snapshot.records.len(), 1);
        assert_eq!(
            default_snapshot.records[0].sections[0].text,
            "What is delta"
        );

        let source_preserve = FileSource::new(
            corpus
                .config("qa_title_preserve")
                .with_title_replace_underscores(false),
        );
        let preserve_snapshot = source_preserve
            .refresh(&sampler_config(101), None, Some(1))
            .unwrap();
        assert_eq!(preserve_snapshot.records.len(), 1);
        assert_eq!(
            preserve_snapshot.records[0].sections[0].text,
            "What_is_delta"
        );
    }

    #[test]
    fn refresh_skips_non_txt_files_even_when_text_only_disabled() {
        let corpus = Corpus::new();
        corpus.write("notes.md", "markdown should be skipped");
        corpus.write("doc.txt", "plain text should be indexed");

        let source = FileSource::new(corpus.config("qa_filtering").with_text_files_only(false));
        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert!(snapshot.records[0].id.contains("doc.txt"));
    }

    #[test]
    fn trust_falls_back_to_default_and_count_and_id_are_exposed() {
        let corpus = Corpus::new();
        corpus.write("docs/alpha.txt", "Alpha body.");

        let source = FileSource::new(
            corpus
                .config("qa_count")
                .with_trust(0.42)
                .with_category_trust("factual", 0.95)
                .with_taxonomy_builder(Arc::new(|_, _, source_id| {
                    vec![source_id.clone(), "UNMATCHED".to_string()]
                })),
        );

        let seed_101 = sampler_config(101);
        let snapshot = source.refresh(&seed_101, None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].quality.trust, 0.42);
        assert_eq!(source.id(), "qa_count");
        assert_eq!(source.reported_record_count(&seed_101).unwrap(), 1);
    }

    #[test]
    fn sampler_seed_controls_file_source_refresh_order() {
        let corpus = Corpus::new();
        for idx in 0..12 {
            corpus.write(
                &format!("doc_{idx:02}.txt"),
                &format!("Body text for {idx}"),
            );
        }

        // All three sources share the id, root and index directory so that the
        // sampler seed is the only thing that varies between refreshes.
        let refreshed_ids = |seed: u64| -> Vec<String> {
            FileSource::new(corpus.config("seeded_a"))
                .refresh(&sampler_config(seed), None, Some(8))
                .unwrap()
                .records
                .into_iter()
                .map(|record| record.id)
                .collect()
        };

        let ids_a = refreshed_ids(11);
        let ids_b = refreshed_ids(11);
        let ids_c = refreshed_ids(29);

        assert_eq!(ids_a, ids_b);
        assert_ne!(ids_a, ids_c);
    }

    #[test]
    fn with_index_dir_persists_refresh_index_store_under_custom_directory() {
        let corpus = Corpus::new();
        corpus.write("alpha.txt", "Alpha body");

        let custom_index_dir = corpus.index_dir().join("custom_index_store");
        std::fs::create_dir_all(&custom_index_dir).unwrap();

        let source_id = "qa_custom_index_refresh".to_string();
        let source = FileSource::new(
            FileSourceConfig::new(source_id.clone(), corpus.root())
                .with_index_dir(custom_index_dir.clone()),
        );

        let snapshot = source.refresh(&sampler_config(101), None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);

        let expected_store_path = FileCorpusIndex::index_store_path_for(
            Some(custom_index_dir.as_path()),
            corpus.root(),
            &source_id,
        );
        assert!(
            expected_store_path.is_file(),
            "expected index store to exist at {}",
            expected_store_path.display()
        );
    }

    #[test]
    fn with_index_dir_persists_count_index_store_under_custom_directory() {
        let corpus = Corpus::new();
        corpus.write("alpha.txt", "Alpha body");

        let custom_index_dir = corpus.index_dir().join("custom_index_store");
        std::fs::create_dir_all(&custom_index_dir).unwrap();

        let source_id = "qa_custom_index_count".to_string();
        let source = FileSource::new(
            FileSourceConfig::new(source_id.clone(), corpus.root())
                .with_index_dir(custom_index_dir.clone()),
        );

        assert_eq!(
            source.reported_record_count(&sampler_config(101)).unwrap(),
            1
        );

        let expected_store_path = FileCorpusIndex::index_store_path_for(
            Some(custom_index_dir.as_path()),
            corpus.root(),
            &source_id,
        );
        assert!(
            expected_store_path.is_file(),
            "expected index store to exist at {}",
            expected_store_path.display()
        );
    }
}
725}