oxirs_embed/application_tasks/
mod.rs

//! Application-specific evaluation tasks for embedding models
//!
//! This module implements evaluation metrics and benchmarks for real-world
//! applications of knowledge graph embeddings, including recommendation,
//! search relevance, clustering, classification, document retrieval, and
//! query answering.
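//!
//! A minimal usage sketch (marked `ignore`: it assumes a concrete
//! `EmbeddingModel` implementation, an async runtime, and that this module is
//! reachable as `oxirs_embed::application_tasks`):
//!
//! ```ignore
//! use oxirs_embed::application_tasks::{ApplicationEvalConfig, ApplicationTaskEvaluator};
//!
//! let evaluator = ApplicationTaskEvaluator::new(ApplicationEvalConfig::default());
//! let results = evaluator.evaluate_all_tasks(&model).await?;
//! println!("Overall application score: {:.3}", results.overall_score);
//! ```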

pub mod classification;
pub mod clustering;
pub mod query_answering;
pub mod recommendation;
pub mod retrieval;
pub mod search;

use crate::EmbeddingModel;
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::sync::{Arc, RwLock};
use std::time::Instant;

// Re-export all public types from submodules
pub use classification::{
    ClassResults, ClassificationEvaluator, ClassificationMetric, ClassificationReport,
    ClassificationResults, SimpleClassifier,
};
pub use clustering::{
    ClusterAnalysis, ClusteringEvaluator, ClusteringMetric, ClusteringResults,
    ClusteringStabilityAnalysis,
};
pub use query_answering::{
    ApplicationQueryAnsweringEvaluator, ComplexityResults, QueryAnsweringMetric,
    QueryAnsweringResults, QueryComplexity, QueryResult, QueryType, QuestionAnswerPair,
    ReasoningAnalysis, TypeResults,
};
pub use recommendation::{
    ABTestResults, CoverageStats, DiversityAnalysis, InteractionType, ItemMetadata,
    RecommendationEvaluator, RecommendationMetric, RecommendationResults, UserInteraction,
    UserRecommendationResults,
};
pub use retrieval::{
    DocumentMetadata, RetrievalAnalysis, RetrievalEvaluator, RetrievalMetric, RetrievalQuery,
    RetrievalResults,
};
pub use search::{
    QueryPerformanceAnalysis, QueryResults, RelevanceJudgment, SearchEffectivenessMetrics,
    SearchEvaluator, SearchMetric, SearchResults,
};

/// Configuration for application evaluation
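///
/// A sketch of overriding a few fields while keeping the remaining defaults
/// via struct-update syntax (the field values shown are illustrative; marked
/// `ignore` so it is not run as a doctest):
///
/// ```ignore
/// let config = ApplicationEvalConfig {
///     sample_size: 500,
///     num_clusters: 8,
///     ..ApplicationEvalConfig::default()
/// };
/// ```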
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ApplicationEvalConfig {
    /// Enable recommendation evaluation
    pub enable_recommendation_eval: bool,
    /// Enable search relevance evaluation
    pub enable_search_eval: bool,
    /// Enable clustering evaluation
    pub enable_clustering_eval: bool,
    /// Enable classification evaluation
    pub enable_classification_eval: bool,
    /// Enable retrieval evaluation
    pub enable_retrieval_eval: bool,
    /// Enable query answering evaluation
    pub enable_query_answering_eval: bool,
    /// Sample size for evaluations
    pub sample_size: usize,
    /// Number of recommendations to generate
    pub num_recommendations: usize,
    /// Number of clusters for clustering evaluation
    pub num_clusters: usize,
    /// Cross-validation folds
    pub cv_folds: usize,
    /// Enable user satisfaction simulation
    pub enable_user_satisfaction: bool,
    /// Number of query answering tests
    pub num_query_tests: usize,
}

impl Default for ApplicationEvalConfig {
    fn default() -> Self {
        Self {
            enable_recommendation_eval: true,
            enable_search_eval: true,
            enable_clustering_eval: true,
            enable_classification_eval: true,
            enable_retrieval_eval: true,
            enable_query_answering_eval: true,
            sample_size: 1000,
            num_recommendations: 10,
            num_clusters: 5,
            cv_folds: 5,
            enable_user_satisfaction: true,
            num_query_tests: 100,
        }
    }
}

/// Application evaluation results
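///
/// The struct derives `Serialize`/`Deserialize`, so a finished evaluation can
/// be exported, e.g. as JSON (sketch only; assumes a `serde_json` dependency):
///
/// ```ignore
/// let json = serde_json::to_string_pretty(&results)?;
/// std::fs::write("application_eval.json", json)?;
/// ```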
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ApplicationEvalResults {
    /// Timestamp of evaluation
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Model identifier
    pub model_id: String,
    /// Recommendation evaluation results
    pub recommendation_results: Option<RecommendationResults>,
    /// Search evaluation results
    pub search_results: Option<SearchResults>,
    /// Clustering evaluation results
    pub clustering_results: Option<ClusteringResults>,
    /// Classification evaluation results
    pub classification_results: Option<ClassificationResults>,
    /// Retrieval evaluation results
    pub retrieval_results: Option<RetrievalResults>,
    /// Query answering evaluation results
    pub query_answering_results: Option<QueryAnsweringResults>,
    /// Overall application score
    pub overall_score: f64,
    /// Evaluation time (seconds)
    pub evaluation_time: f64,
}

/// Application-specific task evaluator
pub struct ApplicationTaskEvaluator {
    /// Evaluation configuration
    config: ApplicationEvalConfig,
    /// Task-specific evaluators
    recommendation_evaluator: RecommendationEvaluator,
    search_evaluator: SearchEvaluator,
    clustering_evaluator: ClusteringEvaluator,
    classification_evaluator: ClassificationEvaluator,
    retrieval_evaluator: RetrievalEvaluator,
    query_answering_evaluator: ApplicationQueryAnsweringEvaluator,
    /// Bounded evaluation history (the 100 most recent results are retained)
    evaluation_history: Arc<RwLock<VecDeque<ApplicationEvalResults>>>,
}

impl ApplicationTaskEvaluator {
    /// Create a new application task evaluator
    pub fn new(config: ApplicationEvalConfig) -> Self {
        Self {
            config,
            recommendation_evaluator: RecommendationEvaluator::new(),
            search_evaluator: SearchEvaluator::new(),
            clustering_evaluator: ClusteringEvaluator::new(),
            classification_evaluator: ClassificationEvaluator::new(),
            retrieval_evaluator: RetrievalEvaluator::new(),
            query_answering_evaluator: ApplicationQueryAnsweringEvaluator::new(),
            evaluation_history: Arc::new(RwLock::new(VecDeque::new())),
        }
    }

    /// Run all enabled application evaluations and aggregate the results
    pub async fn evaluate_all_tasks(
        &self,
        model: &dyn EmbeddingModel,
    ) -> Result<ApplicationEvalResults> {
        let start_time = Instant::now();
        let model_id = model.model_id().to_string();

        let mut recommendation_results = None;
        let mut search_results = None;
        let mut clustering_results = None;
        let mut classification_results = None;
        let mut retrieval_results = None;
        let mut query_answering_results = None;

        // Run enabled evaluations
        if self.config.enable_recommendation_eval {
            recommendation_results = Some(
                self.recommendation_evaluator
                    .evaluate(model, &self.config)
                    .await?,
            );
        }

        if self.config.enable_search_eval {
            search_results = Some(self.search_evaluator.evaluate(model, &self.config).await?);
        }

        if self.config.enable_clustering_eval {
            clustering_results = Some(
                self.clustering_evaluator
                    .evaluate(model, &self.config)
                    .await?,
            );
        }

        if self.config.enable_classification_eval {
            classification_results = Some(
                self.classification_evaluator
                    .evaluate(model, &self.config)
                    .await?,
            );
        }

        if self.config.enable_retrieval_eval {
            retrieval_results = Some(
                self.retrieval_evaluator
                    .evaluate(model, &self.config)
                    .await?,
            );
        }

        if self.config.enable_query_answering_eval {
            query_answering_results = Some(
                self.query_answering_evaluator
                    .evaluate(model, &self.config)
                    .await?,
            );
        }

        let evaluation_time = start_time.elapsed().as_secs_f64();

        // Calculate overall score
        let overall_score = self.calculate_overall_score(
            &recommendation_results,
            &search_results,
            &clustering_results,
            &classification_results,
            &retrieval_results,
            &query_answering_results,
        );

        let results = ApplicationEvalResults {
            timestamp: chrono::Utc::now(),
            model_id,
            recommendation_results,
            search_results,
            clustering_results,
            classification_results,
            retrieval_results,
            query_answering_results,
            overall_score,
            evaluation_time,
        };

        // Store in history, keeping only the 100 most recent results
        if let Ok(mut history) = self.evaluation_history.write() {
            history.push_back(results.clone());
            if history.len() > 100 {
                history.pop_front();
            }
        }

        Ok(results)
    }

    /// Calculate overall score from individual evaluation results
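    ///
    /// Each enabled task contributes one component score (typically the mean
    /// of two of its headline metrics), and the overall score is the
    /// unweighted mean of the available components. Illustrative example: if
    /// only recommendation (P@5 = 0.4, Coverage = 0.6, giving 0.5) and query
    /// answering (accuracy = 0.7) produced results, the overall score is
    /// (0.5 + 0.7) / 2 = 0.6.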
    fn calculate_overall_score(
        &self,
        recommendation: &Option<RecommendationResults>,
        search: &Option<SearchResults>,
        clustering: &Option<ClusteringResults>,
        classification: &Option<ClassificationResults>,
        retrieval: &Option<RetrievalResults>,
        query_answering: &Option<QueryAnsweringResults>,
    ) -> f64 {
        let mut total_score = 0.0;
        let mut component_count = 0;

        if let Some(rec) = recommendation {
            // Extract key metrics from recommendation results
            let precision = rec.metric_scores.get("PrecisionAtK(5)").unwrap_or(&0.0);
            let coverage = rec.metric_scores.get("Coverage").unwrap_or(&0.0);
            total_score += (precision + coverage) / 2.0;
            component_count += 1;
        }

        if let Some(search) = search {
            let ndcg = search.metric_scores.get("NDCG(10)").unwrap_or(&0.0);
            let map = search.metric_scores.get("MAP").unwrap_or(&0.0);
            total_score += (ndcg + map) / 2.0;
            component_count += 1;
        }

        if let Some(clustering) = clustering {
            let silhouette = clustering
                .metric_scores
                .get("SilhouetteScore")
                .unwrap_or(&0.0);
            // Silhouette ranges from -1 to 1; use its magnitude so the
            // contribution stays non-negative
            total_score += silhouette.abs();
            component_count += 1;
        }

        if let Some(classification) = classification {
            let accuracy = classification.metric_scores.get("Accuracy").unwrap_or(&0.0);
            let f1 = classification.metric_scores.get("F1Score").unwrap_or(&0.0);
            total_score += (accuracy + f1) / 2.0;
            component_count += 1;
        }

        if let Some(retrieval) = retrieval {
            let precision = retrieval
                .metric_scores
                .get("PrecisionAtK(10)")
                .unwrap_or(&0.0);
            let recall = retrieval.metric_scores.get("RecallAtK(10)").unwrap_or(&0.0);
            total_score += (precision + recall) / 2.0;
            component_count += 1;
        }

        if let Some(qa) = query_answering {
            total_score += qa.overall_accuracy;
            component_count += 1;
        }

        if component_count > 0 {
            total_score / component_count as f64
        } else {
            0.0
        }
    }

    /// Get evaluation history
    pub fn get_evaluation_history(&self) -> Result<Vec<ApplicationEvalResults>> {
        match self.evaluation_history.read() {
            Ok(history) => Ok(history.iter().cloned().collect()),
            _ => Err(anyhow!("Failed to read evaluation history")),
        }
    }

    /// Clear evaluation history
    pub fn clear_history(&self) -> Result<()> {
        match self.evaluation_history.write() {
            Ok(mut history) => {
                history.clear();
                Ok(())
            }
            _ => Err(anyhow!("Failed to clear evaluation history")),
        }
    }

    /// Get configuration
    pub fn config(&self) -> &ApplicationEvalConfig {
        &self.config
    }

    /// Update configuration
    pub fn set_config(&mut self, config: ApplicationEvalConfig) {
        self.config = config;
    }
}
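
// A minimal test sketch: these cases only exercise behavior that is directly
// visible in this module (default configuration values and the empty-result
// scoring path); they do not require a trained embedding model.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn default_config_enables_all_tasks() {
        let config = ApplicationEvalConfig::default();
        assert!(config.enable_recommendation_eval);
        assert!(config.enable_search_eval);
        assert!(config.enable_clustering_eval);
        assert!(config.enable_classification_eval);
        assert!(config.enable_retrieval_eval);
        assert!(config.enable_query_answering_eval);
        assert_eq!(config.num_recommendations, 10);
        assert_eq!(config.cv_folds, 5);
    }

    #[test]
    fn overall_score_is_zero_when_no_tasks_produced_results() {
        let evaluator = ApplicationTaskEvaluator::new(ApplicationEvalConfig::default());
        let score = evaluator.calculate_overall_score(&None, &None, &None, &None, &None, &None);
        assert_eq!(score, 0.0);
    }
}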