1use std::collections::HashMap;
43use std::fs;
44use std::io;
45use std::path::{Path, PathBuf};
46
47use crate::config::{PipelineConfig, ProjectionKind};
48use crate::corpus_features::{CORPUS_FEATURE_COUNT, CorpusFeatures};
49use crate::feedback::FeedbackSummary;
50use crate::tuner::TuneReport;
51use crate::util::{default_timestamp, migrate_legacy_array_to_jsonl, sphereql_home_dir};
52
/// One training observation for the meta-models in this file: the feature
/// profile of a corpus paired with the best configuration a tuning run found
/// for it.
///
/// The record is (de)serialized with serde; `score_lift` may be absent from
/// the input and then deserializes as `None`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MetaTrainingRecord {
    /// Caller-supplied identifier of the corpus the record describes.
    pub corpus_id: String,
    /// Feature profile of the corpus; the meta-models compare records by it.
    pub features: CorpusFeatures,
    /// Configuration copied from the tune report's `best_config`.
    pub best_config: PipelineConfig,
    /// Score copied from the tune report's `best_score`.
    pub best_score: f64,
    /// Lift of the best score over the mean trial score, as computed by
    /// `score_lift_from_report`; `None` when the report had fewer than two
    /// trials or when the field is missing from the serialized form.
    #[serde(default)]
    pub score_lift: Option<f64>,
    /// Name of the metric the scores were measured with. Models only train on
    /// records sharing the most common metric name.
    pub metric_name: String,
    /// Free-form label of the search strategy that produced the record.
    pub strategy: String,
    /// Creation timestamp; `default_timestamp()` unless overridden through
    /// `with_timestamp`.
    pub timestamp: String,
}
90
91impl MetaTrainingRecord {
92 pub fn from_tune_result(
100 corpus_id: impl Into<String>,
101 features: CorpusFeatures,
102 report: &TuneReport,
103 strategy_label: impl Into<String>,
104 ) -> Self {
105 Self {
106 corpus_id: corpus_id.into(),
107 features,
108 best_config: report.best_config.clone(),
109 best_score: report.best_score,
110 score_lift: score_lift_from_report(report),
111 metric_name: report.metric_name.clone(),
112 strategy: strategy_label.into(),
113 timestamp: default_timestamp(),
114 }
115 }
116
117 pub fn with_timestamp(mut self, ts: impl Into<String>) -> Self {
120 self.timestamp = ts.into();
121 self
122 }
123
124 pub fn save_list(records: &[Self], path: impl AsRef<Path>) -> io::Result<()> {
133 let path = path.as_ref();
134 if let Some(parent) = path.parent()
135 && !parent.as_os_str().is_empty()
136 {
137 fs::create_dir_all(parent)?;
138 }
139 let json = serde_json::to_string_pretty(records)
140 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
141 fs::write(path, json)
142 }
143
144 pub fn load_list(path: impl AsRef<Path>) -> io::Result<Vec<Self>> {
152 let path = path.as_ref();
153 if !path.exists() {
154 return Ok(Vec::new());
155 }
156 let raw = fs::read_to_string(path)?;
157 let trimmed = raw.trim_start();
158 if trimmed.is_empty() {
159 return Ok(Vec::new());
160 }
161 if trimmed.starts_with('[') {
162 return serde_json::from_str(trimmed)
164 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e));
165 }
166 trimmed
168 .lines()
169 .filter(|l| !l.trim().is_empty())
170 .map(|l| {
171 serde_json::from_str::<Self>(l)
172 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
173 })
174 .collect()
175 }
176
177 pub fn default_store_path() -> io::Result<PathBuf> {
179 Ok(sphereql_home_dir()?.join("meta_records.json"))
180 }
181
182 pub fn append_to_default_store(&self) -> io::Result<PathBuf> {
193 let path = Self::default_store_path()?;
194 self.append_to(&path)?;
195 Ok(path)
196 }
197
198 pub fn append_to(&self, path: impl AsRef<Path>) -> io::Result<()> {
201 use std::io::Write;
202
203 let path = path.as_ref();
204 if let Some(parent) = path.parent()
205 && !parent.as_os_str().is_empty()
206 {
207 fs::create_dir_all(parent)?;
208 }
209
210 migrate_legacy_array_to_jsonl(path, |head| {
214 let records: Vec<Self> = serde_json::from_str(head.trim_start())
215 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
216 let mut migrated = String::with_capacity(head.len());
217 for r in &records {
218 serde_json::to_string(r)
219 .map(|line| {
220 migrated.push_str(&line);
221 migrated.push('\n');
222 })
223 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
224 }
225 Ok(migrated)
226 })?;
227
228 let mut f = fs::OpenOptions::new()
229 .create(true)
230 .append(true)
231 .open(path)?;
232 let line = serde_json::to_string(self)
233 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
234 writeln!(f, "{line}")
235 }
236
237 pub fn load_default_store() -> io::Result<Vec<Self>> {
240 Self::load_list(Self::default_store_path()?)
241 }
242
243 pub fn adjust_score_with_feedback(&self, summary: &FeedbackSummary, alpha: f64) -> f64 {
262 let a = alpha.clamp(0.0, 1.0);
263 (1.0 - a) * self.best_score + a * summary.mean_score
264 }
265}
266
267fn score_lift_from_report(report: &TuneReport) -> Option<f64> {
273 if report.trials.len() < 2 {
274 return None;
275 }
276 let mean = report.mean_score();
277 let headroom = 1.0 - mean;
278 if headroom < 1e-9 {
279 return Some(0.0);
280 }
281 Some(((report.best_score - mean) / headroom).clamp(0.0, 1.0))
282}
283
/// Positions in the raw feature vector that `to_model_space` replaces with
/// `ln(1 + max(x, 0))`; all other positions pass through unchanged.
// NOTE(review): presumably these are the size/count-like entries of
// `CorpusFeatures::to_vec()` — confirm against that method's ordering.
const LOG_SCALED_FEATURES: [usize; 4] = [0, 1, 2, 3];
293
294fn to_model_space(raw: &[f64; CORPUS_FEATURE_COUNT]) -> [f64; CORPUS_FEATURE_COUNT] {
298 let mut out = *raw;
299 for &i in &LOG_SCALED_FEATURES {
300 out[i] = out[i].max(0.0).ln_1p();
301 }
302 out
303}
304
305fn filter_dominant_metric(records: &[MetaTrainingRecord]) -> Vec<MetaTrainingRecord> {
312 if records.is_empty() {
313 return Vec::new();
314 }
315 let mut counts: HashMap<&str, usize> = HashMap::new();
316 for r in records {
317 *counts.entry(r.metric_name.as_str()).or_default() += 1;
318 }
319 if counts.len() <= 1 {
320 return records.to_vec();
321 }
322 let dominant = counts
323 .iter()
324 .max_by(|a, b| a.1.cmp(b.1).then(a.0.cmp(b.0)))
325 .map(|(k, _)| (*k).to_string())
326 .expect("counts non-empty");
327 records
328 .iter()
329 .filter(|r| r.metric_name == dominant)
330 .cloned()
331 .collect()
332}
333
334fn compute_feature_stats(
343 records: &[MetaTrainingRecord],
344) -> ([f64; CORPUS_FEATURE_COUNT], [f64; CORPUS_FEATURE_COUNT]) {
345 let mut means = [0.0; CORPUS_FEATURE_COUNT];
346 let mut stds = [0.0; CORPUS_FEATURE_COUNT];
347 let n = records.len();
348 if n == 0 {
349 return (means, [1.0; CORPUS_FEATURE_COUNT]);
350 }
351 let vecs: Vec<[f64; CORPUS_FEATURE_COUNT]> = records
352 .iter()
353 .map(|r| to_model_space(&r.features.to_vec()))
354 .collect();
355
356 for i in 0..CORPUS_FEATURE_COUNT {
357 let mean: f64 = vecs.iter().map(|v| v[i]).sum::<f64>() / n as f64;
358 means[i] = mean;
359 let var: f64 =
360 vecs.iter().map(|v| (v[i] - mean).powi(2)).sum::<f64>() / (n - 1).max(1) as f64;
361 let sd = var.sqrt();
362 stds[i] = if sd > f64::EPSILON { sd } else { 0.0 };
363 }
364 (means, stds)
365}
366
367fn normalize_features(
372 model_space: &[f64; CORPUS_FEATURE_COUNT],
373 means: &[f64; CORPUS_FEATURE_COUNT],
374 stds: &[f64; CORPUS_FEATURE_COUNT],
375) -> [f64; CORPUS_FEATURE_COUNT] {
376 let mut out = [0.0; CORPUS_FEATURE_COUNT];
377 for i in 0..CORPUS_FEATURE_COUNT {
378 let sd = stds[i];
379 out[i] = if sd > f64::EPSILON {
380 (model_space[i] - means[i]) / sd
381 } else {
382 0.0
383 };
384 }
385 out
386}
387
388fn normalized_euclidean(a: &[f64; CORPUS_FEATURE_COUNT], b: &[f64; CORPUS_FEATURE_COUNT]) -> f64 {
390 a.iter()
391 .zip(b.iter())
392 .map(|(x, y)| (x - y).powi(2))
393 .sum::<f64>()
394 .sqrt()
395}
396
/// Returns the element at position `len / 2` of the values ordered by
/// `f64::total_cmp` (the upper median for even-length input).
///
/// # Panics
///
/// Panics when `values` yields nothing.
fn median_f64(values: impl Iterator<Item = f64>) -> f64 {
    let mut samples: Vec<f64> = values.collect();
    assert!(!samples.is_empty(), "median of empty sequence");
    let mid = samples.len() / 2;
    // Selection puts the same element at `mid` that a full sort would.
    let (_, median, _) = samples.select_nth_unstable_by(mid, f64::total_cmp);
    *median
}
404
/// Returns the element at position `len / 2` of the sorted values (the upper
/// median for even-length input).
///
/// # Panics
///
/// Panics when `values` yields nothing.
fn median_usize(values: impl Iterator<Item = usize>) -> usize {
    let mut samples: Vec<usize> = values.collect();
    assert!(!samples.is_empty(), "median of empty sequence");
    let mid = samples.len() / 2;
    let (_, median, _) = samples.select_nth_unstable(mid);
    *median
}
412
/// Common interface of models that recommend a pipeline configuration from
/// the features of a corpus.
pub trait MetaModel {
    /// Trains the model on `records`. The implementations in this file
    /// replace any previously fitted state.
    fn fit(&mut self, records: &[MetaTrainingRecord]);

    /// Reports whether the model currently holds data to predict from.
    fn is_fitted(&self) -> bool;

    /// Recommends a configuration for a corpus with the given features.
    ///
    /// The implementations in this file panic when called while
    /// `is_fitted()` is `false`.
    fn predict(&self, features: &CorpusFeatures) -> PipelineConfig;

    /// Short identifier of the model implementation.
    fn name(&self) -> &str;
}
441
/// Meta-model that recommends the configuration of the training record whose
/// standardized features lie closest to the query.
#[derive(Debug, Clone)]
pub struct NearestNeighborMetaModel {
    /// Training records kept by the last `fit` (dominant metric only).
    records: Vec<MetaTrainingRecord>,
    /// Per-feature means of the kept records, in model space.
    feature_means: [f64; CORPUS_FEATURE_COUNT],
    /// Per-feature standard deviations of the kept records, in model space.
    feature_stds: [f64; CORPUS_FEATURE_COUNT],
}
463
464impl Default for NearestNeighborMetaModel {
465 fn default() -> Self {
466 Self {
467 records: Vec::new(),
468 feature_means: [0.0; CORPUS_FEATURE_COUNT],
469 feature_stds: [1.0; CORPUS_FEATURE_COUNT],
470 }
471 }
472}
473
474impl NearestNeighborMetaModel {
475 pub fn new() -> Self {
476 Self::default()
477 }
478
479 pub fn records(&self) -> &[MetaTrainingRecord] {
483 &self.records
484 }
485
486 pub fn rank_candidates(&self, features: &CorpusFeatures) -> Vec<(usize, f64)> {
489 let q = normalize_features(
490 &to_model_space(&features.to_vec()),
491 &self.feature_means,
492 &self.feature_stds,
493 );
494 let mut ranked: Vec<(usize, f64)> = self
495 .records
496 .iter()
497 .enumerate()
498 .map(|(i, r)| {
499 let v = normalize_features(
500 &to_model_space(&r.features.to_vec()),
501 &self.feature_means,
502 &self.feature_stds,
503 );
504 (i, normalized_euclidean(&q, &v))
505 })
506 .collect();
507 ranked.sort_by(|a, b| a.1.total_cmp(&b.1));
512 ranked
513 }
514
515 pub fn predict_blended(&self, features: &CorpusFeatures, k: usize) -> PipelineConfig {
541 assert!(
542 !self.records.is_empty(),
543 "NearestNeighborMetaModel::predict_blended called before fit(); \
544 call .fit(records) with at least one record first"
545 );
546 let ranked = self.rank_candidates(features);
547 let k = k.clamp(1, ranked.len());
548 let top: Vec<&MetaTrainingRecord> =
549 ranked[..k].iter().map(|&(i, _)| &self.records[i]).collect();
550
551 let mut kind_counts: HashMap<ProjectionKind, usize> = HashMap::new();
554 for r in &top {
555 *kind_counts
556 .entry(r.best_config.projection_kind)
557 .or_default() += 1;
558 }
559 let max_count = kind_counts.values().copied().max().unwrap_or(0);
560 let kind = top
561 .iter()
562 .map(|r| r.best_config.projection_kind)
563 .find(|kk| kind_counts[kk] == max_count)
564 .unwrap_or(top[0].best_config.projection_kind);
565
566 let mut cfg = top[0].best_config.clone();
574 cfg.projection_kind = kind;
575
576 cfg.routing.num_domain_groups =
578 median_usize(top.iter().map(|r| r.best_config.routing.num_domain_groups));
579 cfg.routing.low_evr_threshold =
580 median_f64(top.iter().map(|r| r.best_config.routing.low_evr_threshold));
581 cfg.bridges.threshold_base =
582 median_f64(top.iter().map(|r| r.best_config.bridges.threshold_base));
583 cfg.bridges.threshold_evr_penalty = median_f64(
584 top.iter()
585 .map(|r| r.best_config.bridges.threshold_evr_penalty),
586 );
587 cfg.bridges.overlap_artifact_territorial = median_f64(
588 top.iter()
589 .map(|r| r.best_config.bridges.overlap_artifact_territorial),
590 );
591 cfg.inner_sphere.min_evr_improvement = median_f64(
592 top.iter()
593 .map(|r| r.best_config.inner_sphere.min_evr_improvement),
594 );
595
596 let kind_matching: Vec<&&MetaTrainingRecord> = top
600 .iter()
601 .filter(|r| r.best_config.projection_kind == kind)
602 .collect();
603 if !kind_matching.is_empty() {
604 match kind {
605 ProjectionKind::LaplacianEigenmap => {
606 cfg.laplacian.k_neighbors = median_usize(
607 kind_matching
608 .iter()
609 .map(|r| r.best_config.laplacian.k_neighbors),
610 );
611 cfg.laplacian.active_threshold = median_f64(
612 kind_matching
613 .iter()
614 .map(|r| r.best_config.laplacian.active_threshold),
615 );
616 }
617 ProjectionKind::UmapSphere => {
618 cfg.umap.n_neighbors =
619 median_usize(kind_matching.iter().map(|r| r.best_config.umap.n_neighbors));
620 cfg.umap.n_epochs =
621 median_usize(kind_matching.iter().map(|r| r.best_config.umap.n_epochs));
622 cfg.umap.category_weight = median_f64(
623 kind_matching
624 .iter()
625 .map(|r| r.best_config.umap.category_weight),
626 );
627 cfg.umap.min_dist =
628 median_f64(kind_matching.iter().map(|r| r.best_config.umap.min_dist));
629 }
630 ProjectionKind::Pca | ProjectionKind::KernelPca => {}
631 }
632 }
633 cfg
634 }
635}
636
637impl MetaModel for NearestNeighborMetaModel {
638 fn fit(&mut self, records: &[MetaTrainingRecord]) {
639 self.records = filter_dominant_metric(records);
640 let (means, stds) = compute_feature_stats(&self.records);
641 self.feature_means = means;
642 self.feature_stds = if self.records.is_empty() {
643 [1.0; CORPUS_FEATURE_COUNT]
644 } else {
645 stds
646 };
647 }
648
649 fn is_fitted(&self) -> bool {
650 !self.records.is_empty()
651 }
652
653 fn predict(&self, features: &CorpusFeatures) -> PipelineConfig {
654 assert!(
659 !self.records.is_empty(),
660 "NearestNeighborMetaModel::predict called before fit(); \
661 call .fit(records) with at least one record first"
662 );
663 let ranked = self.rank_candidates(features);
664 let best_idx = ranked[0].0;
665 self.records[best_idx].best_config.clone()
666 }
667
668 fn name(&self) -> &str {
669 "nearest_neighbor"
670 }
671}
672
/// Meta-model that scores each training record as `evidence / (distance +
/// epsilon)` and recommends the configuration of the highest-scoring one.
#[derive(Debug, Clone)]
pub struct DistanceWeightedMetaModel {
    /// Training records kept by the last `fit` (dominant metric only).
    records: Vec<MetaTrainingRecord>,
    /// Per-feature means of the kept records, in model space.
    feature_means: [f64; CORPUS_FEATURE_COUNT],
    /// Per-feature standard deviations of the kept records, in model space.
    feature_stds: [f64; CORPUS_FEATURE_COUNT],
    /// Added to the distance in the score denominator, so a zero distance
    /// does not divide by zero. Defaults to `0.1`.
    epsilon: f64,
}
700
701impl Default for DistanceWeightedMetaModel {
702 fn default() -> Self {
703 Self {
704 records: Vec::new(),
705 feature_means: [0.0; CORPUS_FEATURE_COUNT],
706 feature_stds: [1.0; CORPUS_FEATURE_COUNT],
707 epsilon: 0.1,
708 }
709 }
710}
711
712impl DistanceWeightedMetaModel {
713 pub fn new() -> Self {
714 Self::default()
715 }
716
717 pub fn with_epsilon(mut self, epsilon: f64) -> Self {
723 self.epsilon = epsilon.max(1e-12);
724 self
725 }
726
727 pub fn records(&self) -> &[MetaTrainingRecord] {
728 &self.records
729 }
730
731 pub fn score_candidates(&self, features: &CorpusFeatures) -> Vec<(usize, f64, f64)> {
735 let q = normalize_features(
736 &to_model_space(&features.to_vec()),
737 &self.feature_means,
738 &self.feature_stds,
739 );
740 let mut out: Vec<(usize, f64, f64)> = self
741 .records
742 .iter()
743 .enumerate()
744 .filter_map(|(i, r)| {
745 let evidence = r.score_lift.unwrap_or(r.best_score);
748 if !evidence.is_finite() {
754 return None;
755 }
756 let v = normalize_features(
757 &to_model_space(&r.features.to_vec()),
758 &self.feature_means,
759 &self.feature_stds,
760 );
761 let d = normalized_euclidean(&q, &v);
762 let weighted = evidence / (d + self.epsilon);
763 if !weighted.is_finite() {
764 return None;
765 }
766 Some((i, weighted, d))
767 })
768 .collect();
769 out.sort_by(|a, b| b.1.total_cmp(&a.1));
772 out
773 }
774}
775
776impl MetaModel for DistanceWeightedMetaModel {
777 fn fit(&mut self, records: &[MetaTrainingRecord]) {
778 self.records = filter_dominant_metric(records);
779 let (means, stds) = compute_feature_stats(&self.records);
780 self.feature_means = means;
781 self.feature_stds = if self.records.is_empty() {
782 [1.0; CORPUS_FEATURE_COUNT]
783 } else {
784 stds
785 };
786 }
787
788 fn is_fitted(&self) -> bool {
789 !self.records.is_empty()
790 }
791
792 fn predict(&self, features: &CorpusFeatures) -> PipelineConfig {
793 assert!(
798 !self.records.is_empty(),
799 "DistanceWeightedMetaModel::predict called before fit(); \
800 call .fit(records) with at least one record first"
801 );
802 let ranked = self.score_candidates(features);
803 let best_idx = ranked.first().map_or(0, |&(idx, _, _)| idx);
807 self.records[best_idx].best_config.clone()
808 }
809
810 fn name(&self) -> &str {
811 "distance_weighted"
812 }
813}
814
// Unit tests for record persistence, feature scaling and both meta-models.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::ProjectionKind;
    use crate::tuner::TrialRecord;

    // Builds a feature profile; only size, category count, sparsity and
    // intra-category similarity vary between tests.
    fn feat(n: usize, c: usize, sparsity: f64, intra: f64) -> CorpusFeatures {
        CorpusFeatures {
            n_items: n,
            n_categories: c,
            dim: 128,
            mean_members_per_category: n as f64 / c as f64,
            category_size_entropy: 1.0,
            mean_sparsity: sparsity,
            axis_utilization_entropy: 0.9,
            noise_estimate: 0.02,
            mean_intra_category_similarity: intra,
            mean_inter_category_similarity: 0.1,
            category_separation_ratio: intra / 0.1,
        }
    }

    // Builds a record with a default config of the given projection kind and
    // no stored lift.
    fn record(id: &str, f: CorpusFeatures, kind: ProjectionKind, score: f64) -> MetaTrainingRecord {
        MetaTrainingRecord {
            corpus_id: id.to_string(),
            features: f,
            best_config: PipelineConfig {
                projection_kind: kind,
                ..Default::default()
            },
            best_score: score,
            score_lift: None,
            metric_name: "test_metric".to_string(),
            strategy: "test_strategy".to_string(),
            timestamp: "2026-04-22T00:00:00Z".to_string(),
        }
    }

    // Builds a trial with the given score and otherwise default contents.
    fn trial(score: f64) -> TrialRecord {
        TrialRecord {
            config: PipelineConfig::default(),
            score,
            build_ms: 0,
            components: Vec::new(),
        }
    }

    #[test]
    fn record_json_roundtrip() {
        let r = record("r1", feat(100, 5, 0.2, 0.6), ProjectionKind::Pca, 0.5);
        let json = serde_json::to_string(&r).unwrap();
        let back: MetaTrainingRecord = serde_json::from_str(&json).unwrap();
        assert_eq!(back.corpus_id, "r1");
        assert_eq!(back.best_config.projection_kind, ProjectionKind::Pca);
        assert!((back.best_score - 0.5).abs() < 1e-12);
    }

    #[test]
    fn record_without_score_lift_field_still_deserializes() {
        let r = record("r1", feat(100, 5, 0.2, 0.6), ProjectionKind::Pca, 0.5);
        let mut json: serde_json::Value = serde_json::to_value(&r).unwrap();
        json.as_object_mut().unwrap().remove("score_lift");
        let back: MetaTrainingRecord = serde_json::from_value(json).unwrap();
        assert!(back.score_lift.is_none());
    }

    #[test]
    fn to_model_space_log_compresses_scale_features_only() {
        let f = feat(500, 20, 0.25, 0.6);
        let raw = f.to_vec();
        let ms = to_model_space(&raw);
        for &i in &LOG_SCALED_FEATURES {
            assert!(
                (ms[i] - raw[i].ln_1p()).abs() < 1e-12,
                "scale feature {i} should be ln(1+x)"
            );
        }
        for i in 0..CORPUS_FEATURE_COUNT {
            if !LOG_SCALED_FEATURES.contains(&i) {
                assert_eq!(ms[i], raw[i], "non-scale feature {i} must pass through");
            }
        }
    }

    #[test]
    fn is_fitted_flips_after_fit() {
        let mut nn = NearestNeighborMetaModel::new();
        let mut dw = DistanceWeightedMetaModel::new();
        assert!(!nn.is_fitted());
        assert!(!dw.is_fitted());

        let r = record("only", feat(500, 20, 0.1, 0.4), ProjectionKind::Pca, 0.7);
        nn.fit(std::slice::from_ref(&r));
        dw.fit(std::slice::from_ref(&r));
        assert!(nn.is_fitted());
        assert!(dw.is_fitted());

        nn.fit(&[]);
        assert!(
            !nn.is_fitted(),
            "refit on empty set must clear fitted state"
        );
    }

    #[test]
    fn nn_predict_single_record_returns_its_config() {
        let r = record(
            "only",
            feat(500, 20, 0.1, 0.4),
            ProjectionKind::LaplacianEigenmap,
            0.7,
        );
        let mut m = NearestNeighborMetaModel::new();
        m.fit(std::slice::from_ref(&r));
        let predicted = m.predict(&feat(1000, 30, 0.05, 0.3));
        assert_eq!(predicted.projection_kind, ProjectionKind::LaplacianEigenmap);
    }

    #[test]
    fn nn_predict_picks_nearest_neighbor() {
        let r_a = record(
            "sparse",
            feat(500, 5, 0.05, 0.8),
            ProjectionKind::LaplacianEigenmap,
            0.7,
        );
        let r_b = record("dense", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.6);
        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r_a.clone(), r_b.clone()]);

        let query_near_a = feat(500, 5, 0.06, 0.78);
        let query_near_b = feat(500, 5, 0.48, 0.22);

        assert_eq!(
            m.predict(&query_near_a).projection_kind,
            ProjectionKind::LaplacianEigenmap,
        );
        assert_eq!(
            m.predict(&query_near_b).projection_kind,
            ProjectionKind::Pca,
        );
    }

    #[test]
    fn nn_rank_candidates_sorted_ascending() {
        let r_a = record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7);
        let r_b = record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::KernelPca, 0.6);
        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r_a, r_b]);
        let q = feat(500, 5, 0.07, 0.75);
        let ranked = m.rank_candidates(&q);
        assert_eq!(ranked.len(), 2);
        assert!(ranked[0].1 <= ranked[1].1);
    }

    // Both records share n, c and dim, so several features have zero variance.
    #[test]
    fn nn_handles_zero_variance_feature() {
        let r_a = record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7);
        let r_b = record(
            "b",
            feat(500, 5, 0.50, 0.2),
            ProjectionKind::LaplacianEigenmap,
            0.6,
        );
        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r_a, r_b]);
        let q = feat(500, 5, 0.1, 0.7);
        let ranked = m.rank_candidates(&q);
        assert!(ranked[0].1.is_finite());
        assert!(ranked[1].1.is_finite());
    }

    #[test]
    fn fit_stratifies_to_dominant_metric() {
        let r1 = record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7);
        let r2 = record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.6);
        let mut alien = record("c", feat(500, 5, 0.30, 0.5), ProjectionKind::KernelPca, 0.9);
        alien.metric_name = "other_metric".to_string();

        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r1, r2, alien.clone()]);
        assert_eq!(m.records().len(), 2, "dominant-metric records retained");
        assert!(m.records().iter().all(|r| r.metric_name == "test_metric"));
        let predicted = m.predict(&alien.features);
        assert_ne!(predicted.projection_kind, ProjectionKind::KernelPca);
    }

    #[test]
    fn filter_dominant_metric_tie_picks_lexicographically_largest() {
        let mut r1 = record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7);
        r1.metric_name = "alpha".to_string();
        let mut r2 = record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.6);
        r2.metric_name = "beta".to_string();
        let mut r3 = record("c", feat(500, 5, 0.30, 0.5), ProjectionKind::Pca, 0.5);
        r3.metric_name = "gamma".to_string();

        let kept = filter_dominant_metric(&[r1, r2, r3]);
        assert_eq!(kept.len(), 1);
        assert_eq!(kept[0].metric_name, "gamma");
    }

    #[test]
    fn single_metric_training_set_is_untouched() {
        let records = vec![
            record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7),
            record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.6),
        ];
        let mut m = NearestNeighborMetaModel::new();
        m.fit(&records);
        assert_eq!(m.records().len(), 2);
    }

    #[test]
    #[should_panic(expected = "called before fit")]
    fn nn_predict_before_fit_panics() {
        let m = NearestNeighborMetaModel::new();
        let _ = m.predict(&feat(100, 5, 0.1, 0.3));
    }

    #[test]
    fn predict_blended_k1_matches_predict() {
        let r_a = record(
            "a",
            feat(500, 5, 0.05, 0.8),
            ProjectionKind::LaplacianEigenmap,
            0.7,
        );
        let r_b = record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.6);
        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r_a, r_b]);
        let q = feat(500, 5, 0.06, 0.78);
        let single = m.predict(&q);
        let blended = m.predict_blended(&q, 1);
        assert_eq!(blended.projection_kind, single.projection_kind);
        assert_eq!(
            blended.routing.num_domain_groups,
            single.routing.num_domain_groups
        );
        assert!((blended.bridges.threshold_base - single.bridges.threshold_base).abs() < 1e-12);
    }

    #[test]
    fn predict_blended_takes_median_of_knobs() {
        let mut r1 = record("a", feat(500, 5, 0.10, 0.70), ProjectionKind::Pca, 0.7);
        r1.best_config.routing.num_domain_groups = 3;
        let mut r2 = record("b", feat(500, 5, 0.12, 0.68), ProjectionKind::Pca, 0.6);
        r2.best_config.routing.num_domain_groups = 5;
        let mut r3 = record("c", feat(500, 5, 0.14, 0.66), ProjectionKind::Pca, 0.5);
        r3.best_config.routing.num_domain_groups = 9;

        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r1, r2, r3]);
        let blended = m.predict_blended(&feat(500, 5, 0.12, 0.68), 3);
        assert_eq!(blended.projection_kind, ProjectionKind::Pca);
        assert_eq!(blended.routing.num_domain_groups, 5);
    }

    #[test]
    fn predict_blended_majority_kind_wins() {
        let mut r1 = record(
            "a",
            feat(500, 5, 0.10, 0.70),
            ProjectionKind::LaplacianEigenmap,
            0.7,
        );
        r1.best_config.laplacian.k_neighbors = 10;
        let mut r2 = record(
            "b",
            feat(500, 5, 0.12, 0.68),
            ProjectionKind::LaplacianEigenmap,
            0.6,
        );
        r2.best_config.laplacian.k_neighbors = 20;
        let r3 = record("c", feat(500, 5, 0.14, 0.66), ProjectionKind::Pca, 0.5);

        let mut m = NearestNeighborMetaModel::new();
        m.fit(&[r1, r2, r3]);
        let blended = m.predict_blended(&feat(500, 5, 0.12, 0.68), 3);
        assert_eq!(blended.projection_kind, ProjectionKind::LaplacianEigenmap);
        // Two Laplacian neighbours: the upper median of {10, 20} is 20.
        assert_eq!(blended.laplacian.k_neighbors, 20);
    }

    #[test]
    fn save_and_load_list_roundtrip() {
        // The process id keeps concurrent test processes from sharing the file.
        let path = std::env::temp_dir()
            .join(format!("sphereql_meta_test_{}.json", std::process::id()));
        let _ = fs::remove_file(&path);

        let records = vec![
            record("r1", feat(100, 5, 0.2, 0.5), ProjectionKind::Pca, 0.4),
            record(
                "r2",
                feat(800, 30, 0.05, 0.6),
                ProjectionKind::LaplacianEigenmap,
                0.5,
            ),
        ];
        MetaTrainingRecord::save_list(&records, &path).unwrap();

        let loaded = MetaTrainingRecord::load_list(&path).unwrap();
        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded[0].corpus_id, "r1");
        assert_eq!(
            loaded[1].best_config.projection_kind,
            ProjectionKind::LaplacianEigenmap
        );

        let _ = fs::remove_file(&path);
    }

    #[test]
    fn load_nonexistent_returns_empty() {
        let path = std::env::temp_dir().join("sphereql_nonexistent_12345.json");
        let loaded = MetaTrainingRecord::load_list(&path).unwrap();
        assert!(loaded.is_empty());
    }

    #[test]
    fn append_to_migrates_legacy_array_file() {
        let dir =
            std::env::temp_dir().join(format!("sphereql_meta_migrate_{}", std::process::id()));
        let _ = fs::remove_dir_all(&dir);
        let path = dir.join("records.json");

        // Seed the file in the JSON-array layout.
        let legacy = vec![
            record("r1", feat(100, 5, 0.2, 0.5), ProjectionKind::Pca, 0.4),
            record(
                "r2",
                feat(800, 30, 0.05, 0.6),
                ProjectionKind::LaplacianEigenmap,
                0.5,
            ),
        ];
        MetaTrainingRecord::save_list(&legacy, &path).unwrap();

        record("r3", feat(200, 8, 0.1, 0.4), ProjectionKind::KernelPca, 0.6)
            .append_to(&path)
            .unwrap();

        let loaded = MetaTrainingRecord::load_list(&path).unwrap();
        assert_eq!(loaded.len(), 3);
        assert_eq!(loaded[0].corpus_id, "r1");
        assert_eq!(loaded[1].corpus_id, "r2");
        assert_eq!(loaded[2].corpus_id, "r3");
        assert_eq!(
            loaded[1].best_config.projection_kind,
            ProjectionKind::LaplacianEigenmap
        );

        // The file itself must now be line-delimited, one record per line.
        let raw = fs::read_to_string(&path).unwrap();
        assert!(!raw.trim_start().starts_with('['));
        assert_eq!(raw.lines().count(), 3);

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn from_tune_result_copies_fields() {
        let cfg = PipelineConfig {
            projection_kind: ProjectionKind::LaplacianEigenmap,
            ..Default::default()
        };
        let report = TuneReport {
            metric_name: "connectivity_composite".to_string(),
            best_score: 0.42,
            best_config: cfg.clone(),
            trials: Vec::new(),
            failures: Vec::new(),
            umap_graph_builds: 0,
        };
        let r = MetaTrainingRecord::from_tune_result(
            "test_corpus",
            feat(100, 5, 0.1, 0.5),
            &report,
            "random{budget=24,seed=42}",
        );
        assert_eq!(r.corpus_id, "test_corpus");
        assert_eq!(r.metric_name, "connectivity_composite");
        assert!((r.best_score - 0.42).abs() < 1e-12);
        assert!(r.score_lift.is_none());
        assert_eq!(
            r.best_config.projection_kind,
            ProjectionKind::LaplacianEigenmap
        );
        assert_eq!(r.strategy, "random{budget=24,seed=42}");
        assert!(!r.timestamp.is_empty());
        assert!(r.timestamp.parse::<u64>().is_ok());
    }

    #[test]
    fn from_tune_result_computes_headroom_lift() {
        // mean = 0.6, headroom = 0.4, best - mean = 0.2 -> lift 0.5.
        let report = TuneReport {
            metric_name: "m".to_string(),
            best_score: 0.8,
            best_config: PipelineConfig::default(),
            trials: vec![trial(0.4), trial(0.6), trial(0.8)],
            failures: Vec::new(),
            umap_graph_builds: 0,
        };
        let r = MetaTrainingRecord::from_tune_result("c", feat(10, 2, 0.1, 0.3), &report, "s");
        let lift = r.score_lift.expect("two or more trials produce lift");
        assert!((lift - 0.5).abs() < 1e-12, "got {lift}");
    }

    #[test]
    fn from_tune_result_single_trial_has_no_lift() {
        let report = TuneReport {
            metric_name: "m".to_string(),
            best_score: 0.7,
            best_config: PipelineConfig::default(),
            trials: vec![trial(0.7)],
            failures: Vec::new(),
            umap_graph_builds: 0,
        };
        let r = MetaTrainingRecord::from_tune_result("c", feat(10, 2, 0.1, 0.3), &report, "s");
        assert!(r.score_lift.is_none());
    }

    #[test]
    fn from_tune_result_lift_zero_when_landscape_saturated() {
        let report = TuneReport {
            metric_name: "m".to_string(),
            best_score: 1.0,
            best_config: PipelineConfig::default(),
            trials: vec![trial(1.0), trial(1.0)],
            failures: Vec::new(),
            umap_graph_builds: 0,
        };
        let r = MetaTrainingRecord::from_tune_result("c", feat(10, 2, 0.1, 0.3), &report, "s");
        assert_eq!(r.score_lift, Some(0.0));
    }

    #[test]
    fn with_timestamp_overrides_default() {
        let report = TuneReport {
            metric_name: "m".to_string(),
            best_score: 0.5,
            best_config: PipelineConfig::default(),
            trials: Vec::new(),
            failures: Vec::new(),
            umap_graph_builds: 0,
        };
        let r = MetaTrainingRecord::from_tune_result("c", feat(10, 2, 0.1, 0.3), &report, "s")
            .with_timestamp("2026-04-22T12:00:00Z");
        assert_eq!(r.timestamp, "2026-04-22T12:00:00Z");
    }

    #[test]
    fn save_list_creates_parent_dirs() {
        let dir = std::env::temp_dir().join(format!("sphereql_create_test_{}", std::process::id()));
        let _ = fs::remove_dir_all(&dir);
        let path = dir.join("nested").join("records.json");

        let r = record("r1", feat(100, 5, 0.1, 0.5), ProjectionKind::Pca, 0.4);
        MetaTrainingRecord::save_list(&[r], &path).unwrap();
        assert!(path.exists());

        let _ = fs::remove_dir_all(&dir);
    }

    #[test]
    fn default_store_path_resolves() {
        let path = MetaTrainingRecord::default_store_path().unwrap();
        assert!(path.ends_with("meta_records.json"));
        assert!(path.iter().any(|c| c.to_string_lossy() == ".sphereql"));
    }

    #[test]
    fn dw_predict_single_record_returns_its_config() {
        let r = record(
            "only",
            feat(500, 20, 0.1, 0.4),
            ProjectionKind::LaplacianEigenmap,
            0.7,
        );
        let mut m = DistanceWeightedMetaModel::new();
        m.fit(std::slice::from_ref(&r));
        let predicted = m.predict(&feat(1000, 30, 0.05, 0.3));
        assert_eq!(predicted.projection_kind, ProjectionKind::LaplacianEigenmap);
    }

    #[test]
    fn dw_prefers_higher_score_when_equidistant() {
        // Identical features: distance is equal, so the score decides.
        let shared_feat = feat(500, 5, 0.1, 0.5);
        let lo = record(
            "low",
            shared_feat.clone(),
            ProjectionKind::LaplacianEigenmap,
            0.2,
        );
        let hi = record("high", shared_feat.clone(), ProjectionKind::Pca, 0.9);

        let mut m = DistanceWeightedMetaModel::new();
        m.fit(&[lo, hi]);
        let predicted = m.predict(&shared_feat);
        assert_eq!(predicted.projection_kind, ProjectionKind::Pca);
    }

    #[test]
    fn dw_prefers_lift_evidence_over_raw_score() {
        // "easy" has the higher raw score but zero lift; "hard" has real lift.
        let shared_feat = feat(500, 5, 0.1, 0.5);
        let mut easy = record("easy", shared_feat.clone(), ProjectionKind::KernelPca, 0.9);
        easy.score_lift = Some(0.0);
        let mut hard = record(
            "hard",
            shared_feat.clone(),
            ProjectionKind::LaplacianEigenmap,
            0.6,
        );
        hard.score_lift = Some(0.8);

        let mut m = DistanceWeightedMetaModel::new();
        m.fit(&[easy, hard]);
        let predicted = m.predict(&shared_feat);
        assert_eq!(predicted.projection_kind, ProjectionKind::LaplacianEigenmap);
    }

    #[test]
    fn dw_all_records_without_lift_fall_back_to_best_score() {
        let shared_feat = feat(500, 5, 0.1, 0.5);
        let lo = record("lo", shared_feat.clone(), ProjectionKind::Pca, 0.2);
        let hi = record(
            "hi",
            shared_feat.clone(),
            ProjectionKind::LaplacianEigenmap,
            0.9,
        );
        assert!(lo.score_lift.is_none() && hi.score_lift.is_none());

        let mut m = DistanceWeightedMetaModel::new();
        m.fit(&[lo, hi]);
        let ranked = m.score_candidates(&shared_feat);
        assert_eq!(ranked.len(), 2, "no record filtered as non-finite");
        let predicted = m.predict(&shared_feat);
        assert_eq!(predicted.projection_kind, ProjectionKind::LaplacianEigenmap);
    }

    #[test]
    fn dw_prefers_closer_when_similar_score() {
        let close = record(
            "close",
            feat(500, 5, 0.06, 0.82),
            ProjectionKind::LaplacianEigenmap,
            0.70,
        );
        let far = record(
            "far",
            feat(500, 5, 0.55, 0.15),
            ProjectionKind::Pca,
            0.72,
        );
        let mut m = DistanceWeightedMetaModel::new();
        m.fit(&[close, far]);
        let q = feat(500, 5, 0.05, 0.80);
        assert_eq!(
            m.predict(&q).projection_kind,
            ProjectionKind::LaplacianEigenmap,
        );
    }

    #[test]
    fn dw_score_candidates_sorted_descending() {
        let ra = record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.6);
        let rb = record("b", feat(500, 5, 0.50, 0.2), ProjectionKind::Pca, 0.9);
        let mut m = DistanceWeightedMetaModel::new();
        m.fit(&[ra, rb]);
        let ranked = m.score_candidates(&feat(500, 5, 0.07, 0.78));
        assert_eq!(ranked.len(), 2);
        assert!(ranked[0].1 >= ranked[1].1);
    }

    #[test]
    fn dw_is_deterministic() {
        let records = vec![
            record("a", feat(500, 5, 0.05, 0.8), ProjectionKind::Pca, 0.7),
            record(
                "b",
                feat(500, 5, 0.50, 0.2),
                ProjectionKind::LaplacianEigenmap,
                0.6,
            ),
        ];
        let mut m1 = DistanceWeightedMetaModel::new();
        m1.fit(&records);
        let mut m2 = DistanceWeightedMetaModel::new();
        m2.fit(&records);
        let q = feat(500, 5, 0.10, 0.7);
        assert_eq!(
            m1.predict(&q).projection_kind,
            m2.predict(&q).projection_kind
        );
    }

    #[test]
    fn dw_epsilon_clamps_non_positive() {
        // Querying with the record's own features gives distance 0, so the
        // score is finite only if the negative epsilon was floored.
        let mut m = DistanceWeightedMetaModel::new().with_epsilon(-1.0);
        let r = record("r", feat(100, 5, 0.1, 0.3), ProjectionKind::Pca, 0.5);
        m.fit(std::slice::from_ref(&r));
        let ranked = m.score_candidates(&r.features);
        assert!(ranked[0].1.is_finite());
    }

    #[test]
    #[should_panic(expected = "called before fit")]
    fn dw_predict_before_fit_panics() {
        let m = DistanceWeightedMetaModel::new();
        let _ = m.predict(&feat(100, 5, 0.1, 0.3));
    }

    #[test]
    fn dw_name_stable() {
        let m = DistanceWeightedMetaModel::new();
        assert_eq!(m.name(), "distance_weighted");
    }

    #[test]
    fn adjust_score_with_feedback_blends_at_alpha() {
        let r = record("r", feat(100, 5, 0.1, 0.3), ProjectionKind::Pca, 0.8);
        let summary = FeedbackSummary {
            corpus_id: "r".into(),
            n_events: 10,
            mean_score: 0.4,
            min_score: 0.1,
            max_score: 0.9,
        };
        // alpha = 0 keeps the tuning score; alpha = 1 takes the feedback mean.
        assert!((r.adjust_score_with_feedback(&summary, 0.0) - 0.8).abs() < 1e-12);
        assert!((r.adjust_score_with_feedback(&summary, 1.0) - 0.4).abs() < 1e-12);
        assert!((r.adjust_score_with_feedback(&summary, 0.5) - 0.6).abs() < 1e-12);
        // Out-of-range alphas are clamped to [0, 1].
        assert!((r.adjust_score_with_feedback(&summary, 2.0) - 0.4).abs() < 1e-12);
        assert!((r.adjust_score_with_feedback(&summary, -1.0) - 0.8).abs() < 1e-12);
    }
}