cp_validator/
evaluation.rs

1//! Evaluation pipeline for content contributors and live peers.
2//!
3//! Per CP-015 section 5: Validators download content from Arweave, index it
4//! locally in an isolated substrate, run test queries, and measure retrieval
5//! quality using precision@10, NDCG@10, and MRR.
6//!
7//! Per CP-015 section 13: Validators test live peers by connecting over Tor,
8//! sending test queries, and measuring search quality and latency.
9
10use crate::corpus::TestQuery;
11use crate::rating::Rating;
12use crate::window::ValidationWindow;
13use cp_arweave::ArweaveClient;
14use cp_graph::GraphStore;
15use cp_tor::types::{PeerRegistration, SearchResponse, SearchStatus};
16use ed25519_dalek::{Signer, SigningKey};
17use serde::{Deserialize, Serialize};
18use serde_big_array::BigArray;
19use std::collections::HashMap;
20use std::time::Instant;
21use tracing::{debug, info, warn};
22use uuid::Uuid;
23
/// Quality metrics for a contributor's content, measured against ground truth.
///
/// Per CP-015 section 1.
///
/// `evaluate_contributor` fills the three retrieval fields with averages over
/// the test queries whose search succeeded, and `composite_score` folds the
/// whole struct into a single ranking score.
///
/// The struct is serialized as part of `EvaluationResult::signing_bytes`, so
/// renaming or reordering fields changes the bytes that get hashed and signed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Fraction of top-10 results that are relevant.
    pub precision_at_10: f32,
    /// Normalized discounted cumulative gain at 10.
    pub ndcg_at_10: f32,
    /// Mean reciprocal rank of first relevant result.
    pub mrr: f32,
    /// Number of valid Merkle proofs in content.
    ///
    /// In `evaluate_contributor` this is the number of indexed documents whose
    /// stored `hierarchical_hash` equals the hash recomputed from their chunks.
    pub merkle_proofs_valid: u32,
    /// Number of invalid or missing proofs.
    ///
    /// In `evaluate_contributor` this covers documents with an all-zero
    /// placeholder hash, documents without chunks, and hash mismatches.
    pub merkle_proofs_invalid: u32,
    /// Adapter perplexity delta (negative means improvement).
    ///
    /// `composite_score` applies a 0.8 multiplier when this is positive.
    /// `evaluate_contributor` currently always records `None`.
    pub adapter_perplexity_delta: Option<f32>,
}
42
/// Result of evaluating a single contributor's content.
///
/// Per CP-015 section 10.
///
/// Built and signed by `evaluate_contributor`; consumed by
/// `update_contributor_ratings`, which ranks results by `composite_score`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationResult {
    /// Ed25519 public key of the contributor.
    pub contributor_key: [u8; 32],
    /// Node ID of the validator that performed this evaluation.
    ///
    /// NOTE(review): this field is not part of `signing_bytes`, so the
    /// signature does not bind the result to this node ID — confirm that is
    /// intended.
    pub validator_node_id: [u8; 16],
    /// Window ID when this evaluation was performed.
    pub window_id: u64,
    /// Aggregated quality metrics.
    pub metrics: QualityMetrics,
    /// Timestamp of the evaluation (Unix milliseconds).
    pub timestamp: i64,
    /// Ed25519 signature over the digest returned by `signing_bytes`, which
    /// covers `contributor_key`, `window_id`, `metrics` and `timestamp`.
    // BigArray is needed because serde's built-in array support stops at 32 elements.
    #[serde(with = "BigArray")]
    pub signature: [u8; 64],
}
62
63impl EvaluationResult {
64    /// Compute the bytes to sign: BLAKE3(CBOR(contributor_key || window_id || metrics)).
65    pub fn signing_bytes(&self) -> [u8; 32] {
66        let signable = EvaluationResultSignable {
67            contributor_key: &self.contributor_key,
68            window_id: self.window_id,
69            metrics: &self.metrics,
70            timestamp: self.timestamp,
71        };
72        let mut buf = Vec::new();
73        ciborium::into_writer(&signable, &mut buf).expect("CBOR serialization cannot fail");
74        *blake3::hash(&buf).as_bytes()
75    }
76}
77
/// Borrowed view of the `EvaluationResult` fields that are covered by the
/// signature; serialized to CBOR by `EvaluationResult::signing_bytes`.
///
/// Changing the fields or their order changes the signed digest.
#[derive(Serialize)]
struct EvaluationResultSignable<'a> {
    /// Contributor public key, borrowed from the result.
    contributor_key: &'a [u8; 32],
    /// Validation window of the evaluation.
    window_id: u64,
    /// Metrics being attested.
    metrics: &'a QualityMetrics,
    /// Evaluation timestamp (Unix milliseconds).
    timestamp: i64,
}
85
/// Quality metrics for a live peer, including latency and availability.
///
/// Per CP-015 section 13.
///
/// Produced by `evaluate_peer` and scored by `peer_composite_score`. The
/// struct is serialized as part of `PeerTestResult::signing_bytes`, so
/// renaming or reordering fields changes the signed digest.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerQualityMetrics {
    /// Fraction of top-10 results that are relevant.
    pub precision_at_10: f32,
    /// Normalized discounted cumulative gain at 10.
    pub ndcg_at_10: f32,
    /// Mean reciprocal rank of first relevant result.
    pub mrr: f32,
    /// Average response latency in milliseconds.
    ///
    /// `evaluate_peer` averages over successful queries only and saturates
    /// the value at `u16::MAX`.
    pub response_latency_ms: u16,
    /// Fraction of results with valid Merkle proofs.
    pub proof_validity_rate: f32,
    /// Fraction of test windows where peer was reachable.
    ///
    /// `evaluate_peer` covers a single window and records 1.0 when at least
    /// one query succeeded, 0.0 otherwise.
    pub availability_rate: f32,
}
104
/// Result of testing a live peer.
///
/// Per CP-015 section 13.
///
/// Built and signed by `evaluate_peer`; consumed by `update_peer_ratings`,
/// which ranks results by `peer_composite_score`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerTestResult {
    /// Node ID of the tested peer.
    pub peer_node_id: [u8; 16],
    /// Onion address of the tested peer.
    pub peer_onion: String,
    /// Node ID of the validator.
    ///
    /// NOTE(review): this field is not part of `signing_bytes`, so the
    /// signature does not bind the result to this node ID — confirm that is
    /// intended.
    pub validator_node_id: [u8; 16],
    /// Window ID when this test was performed.
    pub window_id: u64,
    /// Quality metrics.
    pub metrics: PeerQualityMetrics,
    /// Timestamp (Unix milliseconds).
    pub timestamp: i64,
    /// Ed25519 signature over the digest returned by `signing_bytes`, which
    /// covers `peer_node_id`, `peer_onion`, `window_id`, `metrics` and
    /// `timestamp`.
    // BigArray is needed because serde's built-in array support stops at 32 elements.
    #[serde(with = "BigArray")]
    pub signature: [u8; 64],
}
126
127impl PeerTestResult {
128    /// Compute the bytes to sign.
129    pub fn signing_bytes(&self) -> [u8; 32] {
130        let signable = PeerTestResultSignable {
131            peer_node_id: &self.peer_node_id,
132            peer_onion: &self.peer_onion,
133            window_id: self.window_id,
134            metrics: &self.metrics,
135            timestamp: self.timestamp,
136        };
137        let mut buf = Vec::new();
138        ciborium::into_writer(&signable, &mut buf).expect("CBOR serialization cannot fail");
139        *blake3::hash(&buf).as_bytes()
140    }
141}
142
/// Borrowed view of the `PeerTestResult` fields that are covered by the
/// signature; serialized to CBOR by `PeerTestResult::signing_bytes`.
///
/// Changing the fields or their order changes the signed digest.
#[derive(Serialize)]
struct PeerTestResultSignable<'a> {
    /// Node ID of the tested peer.
    peer_node_id: &'a [u8; 16],
    /// Onion address of the tested peer.
    peer_onion: &'a str,
    /// Validation window of the test.
    window_id: u64,
    /// Metrics being attested.
    metrics: &'a PeerQualityMetrics,
    /// Test timestamp (Unix milliseconds).
    timestamp: i64,
}
151
152// ============================================================================
153// Retrieval quality metrics
154// ============================================================================
155
156/// Precision at K: fraction of the top-K results that are relevant.
157///
158/// Given a ranked list of returned IDs and a set of known relevant IDs,
159/// counts how many of the top K returned IDs appear in the relevant set.
160///
161/// Returns 0.0 if k is 0 or returned is empty.
162pub fn precision_at_k(returned: &[Uuid], relevant: &[Uuid], k: usize) -> f32 {
163    if k == 0 || returned.is_empty() {
164        return 0.0;
165    }
166
167    let take = returned.len().min(k);
168    let relevant_count = returned[..take]
169        .iter()
170        .filter(|id| relevant.contains(id))
171        .count();
172
173    relevant_count as f32 / k as f32
174}
175
176/// Normalized Discounted Cumulative Gain at K.
177///
178/// Measures ranking quality accounting for position: relevant results
179/// appearing earlier are more valuable. Uses graded relevance (0-3)
180/// when available, falling back to binary relevance from the relevant set.
181///
182/// The relevance_grades map provides graded relevance for each ID.
183/// IDs not in the map but present in the relevant set get grade 1.
184/// IDs in neither get grade 0.
185///
186/// DCG@K = sum_{i=1}^{K} (2^rel_i - 1) / log2(i + 1)
187/// NDCG@K = DCG@K / IDCG@K
188///
189/// where IDCG@K is the DCG of the ideal ranking.
190pub fn ndcg_at_k(
191    returned: &[Uuid],
192    relevant: &[Uuid],
193    relevance_grades: &HashMap<Uuid, u8>,
194    k: usize,
195) -> f32 {
196    if k == 0 || returned.is_empty() || relevant.is_empty() {
197        return 0.0;
198    }
199
200    let take = returned.len().min(k);
201
202    // Compute DCG for the returned ranking
203    let dcg = compute_dcg(&returned[..take], relevant, relevance_grades);
204
205    // Compute ideal DCG: sort all known relevant items by their grade, descending
206    let mut ideal_grades: Vec<u8> = relevant
207        .iter()
208        .map(|id| *relevance_grades.get(id).unwrap_or(&1))
209        .collect();
210    ideal_grades.sort_unstable_by(|a, b| b.cmp(a));
211    ideal_grades.truncate(k);
212
213    let idcg: f64 = ideal_grades
214        .iter()
215        .enumerate()
216        .map(|(i, &grade)| {
217            let gain = (2.0_f64).powi(grade as i32) - 1.0;
218            gain / (i as f64 + 2.0).log2()
219        })
220        .sum();
221
222    if idcg == 0.0 {
223        return 0.0;
224    }
225
226    (dcg / idcg) as f32
227}
228
229/// Compute Discounted Cumulative Gain for a ranked list.
230fn compute_dcg(
231    ranked: &[Uuid],
232    relevant: &[Uuid],
233    relevance_grades: &HashMap<Uuid, u8>,
234) -> f64 {
235    ranked
236        .iter()
237        .enumerate()
238        .map(|(i, id)| {
239            let grade = if let Some(&g) = relevance_grades.get(id) {
240                g
241            } else if relevant.contains(id) {
242                1
243            } else {
244                0
245            };
246            let gain = (2.0_f64).powi(grade as i32) - 1.0;
247            gain / (i as f64 + 2.0).log2()
248        })
249        .sum()
250}
251
252/// Mean Reciprocal Rank: the reciprocal of the rank of the first relevant result.
253///
254/// If no relevant result is found in the returned list, returns 0.0.
255/// Rank is 1-indexed (first position = rank 1).
256pub fn mrr(returned: &[Uuid], relevant: &[Uuid]) -> f32 {
257    for (i, id) in returned.iter().enumerate() {
258        if relevant.contains(id) {
259            return 1.0 / (i as f32 + 1.0);
260        }
261    }
262    0.0
263}
264
265/// Composite quality score for a contributor evaluation result.
266///
267/// Per CP-015 section 7:
268///   score = 0.4 * ndcg@10 + 0.3 * precision@10 + 0.2 * mrr
269///         + 0.1 * proof_validity_rate
270///   With a 0.8 penalty multiplier if adapter_perplexity_delta > 0.
271pub fn composite_score(metrics: &QualityMetrics) -> f32 {
272    let proof_rate = if metrics.merkle_proofs_valid + metrics.merkle_proofs_invalid > 0 {
273        metrics.merkle_proofs_valid as f32
274            / (metrics.merkle_proofs_valid + metrics.merkle_proofs_invalid) as f32
275    } else {
276        0.0
277    };
278
279    let mut score = 0.4 * metrics.ndcg_at_10
280        + 0.3 * metrics.precision_at_10
281        + 0.2 * metrics.mrr
282        + 0.1 * proof_rate;
283
284    // Penalty for adapter degradation
285    if let Some(delta) = metrics.adapter_perplexity_delta {
286        if delta > 0.0 {
287            score *= 0.8;
288        }
289    }
290
291    score
292}
293
294/// Composite quality score for a peer test result.
295///
296/// Per CP-015 section 14:
297///   70% quality + 20% reliability + 10% latency
298pub fn peer_composite_score(metrics: &PeerQualityMetrics) -> f32 {
299    let quality = 0.4 * metrics.ndcg_at_10
300        + 0.2 * metrics.precision_at_10
301        + 0.1 * metrics.mrr;
302
303    let reliability = metrics.proof_validity_rate * metrics.availability_rate;
304
305    // Latency score: 1.0 at 0ms, 0.0 at 5000ms, linear
306    let latency_score = (1.0 - metrics.response_latency_ms as f32 / 5000.0).max(0.0);
307
308    0.7 * quality + 0.2 * reliability + 0.1 * latency_score
309}
310
311/// Evaluate a contributor by downloading their content from Arweave,
312/// indexing it in an isolated substrate, and running test queries.
313///
314/// Per CP-015 section 5.
315pub async fn evaluate_contributor(
316    contributor_key: [u8; 32],
317    arweave: &ArweaveClient,
318    test_queries: &[TestQuery],
319    validator_node_id: [u8; 16],
320    signing_key: &SigningKey,
321) -> Result<EvaluationResult, crate::ValidatorError> {
322    info!(
323        contributor = hex::encode(contributor_key),
324        queries = test_queries.len(),
325        "Evaluating contributor"
326    );
327
328    // 1. Discover contributor's content on Arweave
329    let contributor_hex = hex::encode(contributor_key);
330    let (discovered, _cursor) =
331        cp_arweave::discover_diffs_by_contributor(arweave, &contributor_hex, 50, None)
332            .await
333            .map_err(|e| crate::ValidatorError::Arweave(e.to_string()))?;
334
335    if discovered.is_empty() {
336        return Err(crate::ValidatorError::NoContent(contributor_hex));
337    }
338
339    // 2. Build a temporary local index of the contributor's content
340    let temp_dir = tempfile::tempdir()
341        .map_err(|e| crate::ValidatorError::Internal(format!("Failed to create temp dir: {}", e)))?;
342    let db_path = temp_dir.path().join("eval.db");
343    let db_path_str = db_path.to_string_lossy().to_string();
344
345    let mut local_index = GraphStore::open(&db_path_str)
346        .map_err(|e| crate::ValidatorError::Internal(format!("Failed to open graph store: {}", e)))?;
347
348    let mut downloaded_count = 0u64;
349    for diff_meta in &discovered {
350        match cp_arweave::download_diff(arweave, diff_meta).await {
351            Ok(downloaded) => {
352                for doc in &downloaded.diff.added_docs {
353                    let _ = local_index.insert_document(doc);
354                }
355                for chunk in &downloaded.diff.added_chunks {
356                    let _ = local_index.insert_chunk(chunk);
357                }
358                for emb in &downloaded.diff.added_embeddings {
359                    let _ = local_index.insert_embedding(emb);
360                }
361                for edge in &downloaded.diff.added_edges {
362                    let _ = local_index.add_edge(edge);
363                }
364                downloaded_count += 1;
365            }
366            Err(e) => {
367                warn!(tx_id = diff_meta.tx_id, error = %e, "Failed to download diff");
368            }
369        }
370    }
371
372    if downloaded_count == 0 {
373        return Err(crate::ValidatorError::NoContent(contributor_hex));
374    }
375
376    info!(
377        contributor = hex::encode(contributor_key),
378        diffs = downloaded_count,
379        "Indexed contributor content"
380    );
381
382    // 3. Run test queries against the contributor's content
383    let mut precision_sum = 0.0f32;
384    let mut ndcg_sum = 0.0f32;
385    let mut mrr_sum = 0.0f32;
386    let mut query_count = 0u32;
387
388    for query in test_queries {
389        // Convert i16 query embedding to f32 for search
390        let query_f32: Vec<f32> = query.query_embedding.iter().map(|&v| v as f32 / 32767.0).collect();
391
392        let results = local_index.search(&query_f32, 10);
393
394        match results {
395            Ok(result_ids) => {
396                let returned_uuids: Vec<Uuid> = result_ids
397                    .iter()
398                    .map(|(id, _score)| *id)
399                    .collect();
400
401                let relevant_uuids: Vec<Uuid> = query.relevant_chunk_ids.clone();
402
403                precision_sum += precision_at_k(&returned_uuids, &relevant_uuids, 10);
404                ndcg_sum += ndcg_at_k(
405                    &returned_uuids,
406                    &relevant_uuids,
407                    &query.relevance_grade_map(),
408                    10,
409                );
410                mrr_sum += mrr(&returned_uuids, &relevant_uuids);
411                query_count += 1;
412            }
413            Err(e) => {
414                debug!(error = %e, "Search failed for query");
415            }
416        }
417    }
418
419    // 4. Verify corpus integrity: check that each document's hierarchical_hash
420    // matches the Merkle root of its chunk text_hashes (Bug 7 + 8 fix)
421    let mut proofs_valid = 0u32;
422    let mut proofs_invalid = 0u32;
423    let all_docs = local_index
424        .get_all_documents()
425        .unwrap_or_default();
426    for doc in &all_docs {
427        if doc.hierarchical_hash == [0u8; 32] {
428            // Placeholder hash — contributor never set it
429            proofs_invalid += 1;
430            continue;
431        }
432        let chunks = local_index.get_chunks_for_doc(doc.id).unwrap_or_default();
433        if chunks.is_empty() {
434            proofs_invalid += 1;
435            continue;
436        }
437        let chunk_hashes: Vec<[u8; 32]> = chunks.iter().map(|c| c.text_hash).collect();
438        let recomputed = cp_core::Document::compute_hierarchical_hash(&chunk_hashes);
439        if recomputed == doc.hierarchical_hash {
440            proofs_valid += 1;
441        } else {
442            proofs_invalid += 1;
443        }
444    }
445
446    let metrics = if query_count > 0 {
447        QualityMetrics {
448            precision_at_10: precision_sum / query_count as f32,
449            ndcg_at_10: ndcg_sum / query_count as f32,
450            mrr: mrr_sum / query_count as f32,
451            merkle_proofs_valid: proofs_valid,
452            merkle_proofs_invalid: proofs_invalid,
453            adapter_perplexity_delta: None,
454        }
455    } else {
456        QualityMetrics {
457            precision_at_10: 0.0,
458            ndcg_at_10: 0.0,
459            mrr: 0.0,
460            merkle_proofs_valid: proofs_valid,
461            merkle_proofs_invalid: proofs_invalid,
462            adapter_perplexity_delta: None,
463        }
464    };
465
466    let window = ValidationWindow::current();
467    let now = std::time::SystemTime::now()
468        .duration_since(std::time::UNIX_EPOCH)
469        .unwrap()
470        .as_millis() as i64;
471
472    let mut result = EvaluationResult {
473        contributor_key,
474        validator_node_id,
475        window_id: window.window_id,
476        metrics,
477        timestamp: now,
478        signature: [0u8; 64],
479    };
480
481    // Sign the result
482    let hash = result.signing_bytes();
483    let sig = signing_key.sign(&hash);
484    result.signature = sig.to_bytes();
485
486    Ok(result)
487}
488
489/// Evaluate a live peer by connecting over Tor, sending test queries, and
490/// measuring search quality and latency.
491///
492/// Per CP-015 section 13. Requires a running TorRuntime to establish
493/// connections to the peer's onion service.
494pub async fn evaluate_peer(
495    peer: &PeerRegistration,
496    test_queries: &[TestQuery],
497    validator_node_id: [u8; 16],
498    signing_key: &SigningKey,
499    model_hash: [u8; 32],
500    tor_runtime: &cp_tor::TorRuntime,
501) -> Result<PeerTestResult, crate::ValidatorError> {
502    info!(
503        peer = hex::encode(peer.node_id),
504        onion = &peer.onion_address,
505        "Testing live peer"
506    );
507
508    // Select up to 5 test queries per validation window
509    let selected = if test_queries.len() > 5 {
510        &test_queries[..5]
511    } else {
512        test_queries
513    };
514
515    let mut precision_sum = 0.0f32;
516    let mut ndcg_sum = 0.0f32;
517    let mut mrr_sum = 0.0f32;
518    let mut latency_sum = 0u64;
519    let mut valid_proofs = 0u32;
520    let mut total_proofs = 0u32;
521    let mut successful_queries = 0u32;
522
523    // Derive an ephemeral session key for these queries
524    let session_key = cp_tor::SessionKey::derive_with_salt(
525        &signing_key.to_bytes(),
526        &blake3::hash(b"validator-peer-test").as_bytes()[..32].try_into().unwrap(),
527    );
528
529    for query in selected {
530        let start = Instant::now();
531
532        // Connect to peer and send query
533        let response_result = tokio::time::timeout(
534            std::time::Duration::from_secs(5),
535            send_query_to_peer(
536                tor_runtime,
537                &peer.onion_address,
538                &query.query_embedding,
539                Some(query.query_text.clone()),
540                model_hash,
541                &session_key,
542                &peer.public_key,
543            ),
544        )
545        .await;
546
547        let elapsed_ms = start.elapsed().as_millis() as u64;
548
549        match response_result {
550            Ok(Ok(response)) if response.status == SearchStatus::Ok => {
551                latency_sum += elapsed_ms;
552
553                let returned_uuids: Vec<Uuid> = response
554                    .results
555                    .iter()
556                    .map(|r| Uuid::from_bytes(r.chunk_id))
557                    .collect();
558
559                let relevant_uuids: Vec<Uuid> = query.relevant_chunk_ids.clone();
560
561                precision_sum += precision_at_k(&returned_uuids, &relevant_uuids, 10);
562                ndcg_sum += ndcg_at_k(
563                    &returned_uuids,
564                    &relevant_uuids,
565                    &query.relevance_grade_map(),
566                    10,
567                );
568                mrr_sum += mrr(&returned_uuids, &relevant_uuids);
569
570                // Count proof validity
571                for result in &response.results {
572                    if let Some(proof_hashes) = &result.merkle_proof {
573                        total_proofs += 1;
574                        // The Tor search response provides proof as Vec<[u8; 32]>
575                        // (sibling hashes only). To use cp_core::state::verify_merkle_proof
576                        // we would need (hash, is_left) tuples and a leaf index.
577                        // For now, verify the proof chain manually: hash the chunk_id
578                        // as the leaf and walk up the tree. We assume left-first ordering
579                        // matching the canonical Merkle tree construction.
580                        let leaf = *blake3::hash(&result.chunk_id).as_bytes();
581                        let computed_root = proof_hashes.iter().fold(leaf, |current, sibling| {
582                            let mut hasher = blake3::Hasher::new();
583                            if current <= *sibling {
584                                hasher.update(&current);
585                                hasher.update(sibling);
586                            } else {
587                                hasher.update(sibling);
588                                hasher.update(&current);
589                            }
590                            *hasher.finalize().as_bytes()
591                        });
592                        if computed_root == response.peer_state_root {
593                            valid_proofs += 1;
594                        }
595                    }
596                }
597
598                successful_queries += 1;
599            }
600            Ok(Ok(response)) => {
601                debug!(status = ?response.status, "Peer returned non-OK status");
602            }
603            Ok(Err(e)) => {
604                debug!(error = %e, "Search request failed");
605            }
606            Err(_) => {
607                debug!("Search request timed out");
608            }
609        }
610    }
611
612    let metrics = if successful_queries > 0 {
613        PeerQualityMetrics {
614            precision_at_10: precision_sum / successful_queries as f32,
615            ndcg_at_10: ndcg_sum / successful_queries as f32,
616            mrr: mrr_sum / successful_queries as f32,
617            response_latency_ms: (latency_sum / successful_queries as u64).min(u16::MAX as u64) as u16,
618            proof_validity_rate: if total_proofs > 0 {
619                valid_proofs as f32 / total_proofs as f32
620            } else {
621                0.0
622            },
623            availability_rate: 1.0,
624        }
625    } else {
626        PeerQualityMetrics {
627            precision_at_10: 0.0,
628            ndcg_at_10: 0.0,
629            mrr: 0.0,
630            response_latency_ms: 0,
631            proof_validity_rate: 0.0,
632            availability_rate: 0.0,
633        }
634    };
635
636    let window = ValidationWindow::current();
637    let now = std::time::SystemTime::now()
638        .duration_since(std::time::UNIX_EPOCH)
639        .unwrap()
640        .as_millis() as i64;
641
642    let mut result = PeerTestResult {
643        peer_node_id: peer.node_id,
644        peer_onion: peer.onion_address.clone(),
645        validator_node_id,
646        window_id: window.window_id,
647        metrics,
648        timestamp: now,
649        signature: [0u8; 64],
650    };
651
652    let hash = result.signing_bytes();
653    let sig = signing_key.sign(&hash);
654    result.signature = sig.to_bytes();
655
656    Ok(result)
657}
658
659/// Send a search query to a peer over Tor using the cp-tor client.
660async fn send_query_to_peer(
661    tor_runtime: &cp_tor::TorRuntime,
662    onion_address: &str,
663    query_embedding: &[i16],
664    query_text: Option<String>,
665    model_hash: [u8; 32],
666    session_key: &cp_tor::SessionKey,
667    peer_public_key: &[u8; 32],
668) -> Result<SearchResponse, crate::ValidatorError> {
669    let mut stream = tor_runtime
670        .connect_to_peer(onion_address)
671        .await
672        .map_err(|e| crate::ValidatorError::Tor(e.to_string()))?;
673
674    let response = cp_tor::search(
675        &mut stream,
676        query_embedding.to_vec(),
677        query_text,
678        10,
679        true,
680        model_hash,
681        session_key,
682        peer_public_key,
683    )
684    .await
685    .map_err(|e| crate::ValidatorError::Tor(e.to_string()))?;
686
687    Ok(response)
688}
689
690/// Update contributor ratings based on evaluation results using pairwise
691/// OpenSkill comparisons.
692///
693/// Per CP-015 section 7: Results are sorted by composite score, then
694/// adjacent pairs are compared using the OpenSkill update rule.
695pub fn update_contributor_ratings(
696    results: &[EvaluationResult],
697    ratings: &mut HashMap<[u8; 32], (Rating, u64, i64)>,
698) {
699    if results.len() < 2 {
700        return;
701    }
702
703    // Sort by composite score, descending
704    let mut ranked: Vec<&EvaluationResult> = results.iter().collect();
705    ranked.sort_by(|a, b| {
706        composite_score(&b.metrics)
707            .partial_cmp(&composite_score(&a.metrics))
708            .unwrap_or(std::cmp::Ordering::Equal)
709    });
710
711    let now = std::time::SystemTime::now()
712        .duration_since(std::time::UNIX_EPOCH)
713        .unwrap()
714        .as_millis() as i64;
715
716    // Pairwise comparisons: each adjacent pair
717    for i in 0..ranked.len() - 1 {
718        let winner_key = ranked[i].contributor_key;
719        let loser_key = ranked[i + 1].contributor_key;
720
721        let winner_entry = ratings
722            .entry(winner_key)
723            .or_insert((Rating::default(), 0, now));
724        let winner_rating = winner_entry.0;
725
726        let loser_entry = ratings
727            .entry(loser_key)
728            .or_insert((Rating::default(), 0, now));
729        let loser_rating = loser_entry.0;
730
731        let (new_winner, new_loser) =
732            crate::rating::pairwise_update(&winner_rating, &loser_rating);
733
734        ratings.get_mut(&winner_key).unwrap().0 = new_winner;
735        ratings.get_mut(&winner_key).unwrap().1 += 1;
736        ratings.get_mut(&winner_key).unwrap().2 = now;
737
738        ratings.get_mut(&loser_key).unwrap().0 = new_loser;
739        ratings.get_mut(&loser_key).unwrap().1 += 1;
740        ratings.get_mut(&loser_key).unwrap().2 = now;
741    }
742}
743
744/// Update peer ratings based on peer test results using pairwise
745/// OpenSkill comparisons.
746///
747/// Same mechanism as contributor ratings (CP-015 section 14) but uses
748/// peer_composite_score for ranking.
749pub fn update_peer_ratings(
750    results: &[PeerTestResult],
751    ratings: &mut HashMap<[u8; 16], (Rating, u64, i64)>,
752) {
753    if results.len() < 2 {
754        return;
755    }
756
757    let mut ranked: Vec<&PeerTestResult> = results.iter().collect();
758    ranked.sort_by(|a, b| {
759        peer_composite_score(&b.metrics)
760            .partial_cmp(&peer_composite_score(&a.metrics))
761            .unwrap_or(std::cmp::Ordering::Equal)
762    });
763
764    let now = std::time::SystemTime::now()
765        .duration_since(std::time::UNIX_EPOCH)
766        .unwrap()
767        .as_millis() as i64;
768
769    for i in 0..ranked.len() - 1 {
770        let winner_key = ranked[i].peer_node_id;
771        let loser_key = ranked[i + 1].peer_node_id;
772
773        let winner_entry = ratings
774            .entry(winner_key)
775            .or_insert((Rating::default(), 0, now));
776        let winner_rating = winner_entry.0;
777
778        let loser_entry = ratings
779            .entry(loser_key)
780            .or_insert((Rating::default(), 0, now));
781        let loser_rating = loser_entry.0;
782
783        let (new_winner, new_loser) =
784            crate::rating::pairwise_update(&winner_rating, &loser_rating);
785
786        ratings.get_mut(&winner_key).unwrap().0 = new_winner;
787        ratings.get_mut(&winner_key).unwrap().1 += 1;
788        ratings.get_mut(&winner_key).unwrap().2 = now;
789
790        ratings.get_mut(&loser_key).unwrap().0 = new_loser;
791        ratings.get_mut(&loser_key).unwrap().1 += 1;
792        ratings.get_mut(&loser_key).unwrap().2 = now;
793    }
794}
795
796#[cfg(test)]
797mod tests {
798    use super::*;
799
800    fn make_uuids(n: usize) -> Vec<Uuid> {
801        (0..n)
802            .map(|i| {
803                let mut bytes = [0u8; 16];
804                bytes[0] = i as u8;
805                Uuid::from_bytes(bytes)
806            })
807            .collect()
808    }
809
810    // ---- precision_at_k ----
811
812    #[test]
813    fn test_precision_at_k_all_relevant() {
814        let ids = make_uuids(10);
815        let relevant = ids.clone();
816        assert!((precision_at_k(&ids, &relevant, 10) - 1.0).abs() < 1e-6);
817    }
818
819    #[test]
820    fn test_precision_at_k_none_relevant() {
821        let returned = make_uuids(10);
822        let relevant = vec![Uuid::from_bytes([99u8; 16])];
823        assert!((precision_at_k(&returned, &relevant, 10)).abs() < 1e-6);
824    }
825
826    #[test]
827    fn test_precision_at_k_half_relevant() {
828        let returned = make_uuids(10);
829        let relevant: Vec<Uuid> = returned[..5].to_vec();
830        assert!((precision_at_k(&returned, &relevant, 10) - 0.5).abs() < 1e-6);
831    }
832
833    #[test]
834    fn test_precision_at_k_fewer_results_than_k() {
835        let returned = make_uuids(3);
836        let relevant = returned.clone();
837        // 3 relevant out of k=10 -> 3/10 = 0.3
838        assert!((precision_at_k(&returned, &relevant, 10) - 0.3).abs() < 1e-6);
839    }
840
841    #[test]
842    fn test_precision_at_k_zero() {
843        let returned = make_uuids(5);
844        let relevant = returned.clone();
845        assert!((precision_at_k(&returned, &relevant, 0)).abs() < 1e-6);
846    }
847
848    #[test]
849    fn test_precision_at_k_empty_returned() {
850        let relevant = make_uuids(5);
851        assert!((precision_at_k(&[], &relevant, 10)).abs() < 1e-6);
852    }
853
854    // ---- ndcg_at_k ----
855
856    #[test]
857    fn test_ndcg_at_k_perfect_ranking() {
858        let ids = make_uuids(5);
859        let relevant = ids.clone();
860        let mut grades = HashMap::new();
861        for (i, id) in ids.iter().enumerate() {
862            grades.insert(*id, (5 - i) as u8);
863        }
864        let score = ndcg_at_k(&ids, &relevant, &grades, 5);
865        assert!((score - 1.0).abs() < 1e-5, "Perfect NDCG should be 1.0, got {}", score);
866    }
867
868    #[test]
869    fn test_ndcg_at_k_no_relevant() {
870        let returned = make_uuids(5);
871        let relevant: Vec<Uuid> = vec![];
872        let grades = HashMap::new();
873        assert!((ndcg_at_k(&returned, &relevant, &grades, 5)).abs() < 1e-6);
874    }
875
876    #[test]
877    fn test_ndcg_at_k_binary_relevance() {
878        let returned = make_uuids(5);
879        let relevant = vec![returned[4]];
880        let grades = HashMap::new();
881
882        let score = ndcg_at_k(&returned, &relevant, &grades, 5);
883        assert!(score > 0.3 && score < 0.5, "Got NDCG = {}", score);
884    }
885
886    #[test]
887    fn test_ndcg_at_k_zero_k() {
888        let ids = make_uuids(5);
889        let grades = HashMap::new();
890        assert!((ndcg_at_k(&ids, &ids, &grades, 0)).abs() < 1e-6);
891    }
892
893    // ---- mrr ----
894
895    #[test]
896    fn test_mrr_first_position() {
897        let returned = make_uuids(5);
898        let relevant = vec![returned[0]];
899        assert!((mrr(&returned, &relevant) - 1.0).abs() < 1e-6);
900    }
901
902    #[test]
903    fn test_mrr_second_position() {
904        let returned = make_uuids(5);
905        let relevant = vec![returned[1]];
906        assert!((mrr(&returned, &relevant) - 0.5).abs() < 1e-6);
907    }
908
909    #[test]
910    fn test_mrr_third_position() {
911        let returned = make_uuids(5);
912        let relevant = vec![returned[2]];
913        let score = mrr(&returned, &relevant);
914        assert!((score - 1.0 / 3.0).abs() < 1e-6, "Expected 1/3, got {}", score);
915    }
916
917    #[test]
918    fn test_mrr_no_relevant() {
919        let returned = make_uuids(5);
920        let relevant = vec![Uuid::from_bytes([99u8; 16])];
921        assert!((mrr(&returned, &relevant)).abs() < 1e-6);
922    }
923
924    #[test]
925    fn test_mrr_empty() {
926        let relevant = make_uuids(3);
927        assert!((mrr(&[], &relevant)).abs() < 1e-6);
928    }
929
930    #[test]
931    fn test_mrr_multiple_relevant_returns_first() {
932        let returned = make_uuids(5);
933        let relevant = vec![returned[1], returned[3]];
934        assert!((mrr(&returned, &relevant) - 0.5).abs() < 1e-6);
935    }
936
937    // ---- composite_score ----
938
939    #[test]
940    fn test_composite_score_perfect() {
941        let metrics = QualityMetrics {
942            precision_at_10: 1.0,
943            ndcg_at_10: 1.0,
944            mrr: 1.0,
945            merkle_proofs_valid: 10,
946            merkle_proofs_invalid: 0,
947            adapter_perplexity_delta: None,
948        };
949        let score = composite_score(&metrics);
950        assert!((score - 1.0).abs() < 1e-6, "Perfect score should be 1.0, got {}", score);
951    }
952
953    #[test]
954    fn test_composite_score_zero() {
955        let metrics = QualityMetrics {
956            precision_at_10: 0.0,
957            ndcg_at_10: 0.0,
958            mrr: 0.0,
959            merkle_proofs_valid: 0,
960            merkle_proofs_invalid: 0,
961            adapter_perplexity_delta: None,
962        };
963        assert!((composite_score(&metrics)).abs() < 1e-6);
964    }
965
966    #[test]
967    fn test_composite_score_adapter_penalty() {
968        let metrics = QualityMetrics {
969            precision_at_10: 1.0,
970            ndcg_at_10: 1.0,
971            mrr: 1.0,
972            merkle_proofs_valid: 10,
973            merkle_proofs_invalid: 0,
974            adapter_perplexity_delta: Some(0.5),
975        };
976        let score = composite_score(&metrics);
977        assert!((score - 0.8).abs() < 1e-6, "Penalized score should be 0.8, got {}", score);
978    }
979
980    #[test]
981    fn test_composite_score_adapter_improvement() {
982        let metrics = QualityMetrics {
983            precision_at_10: 1.0,
984            ndcg_at_10: 1.0,
985            mrr: 1.0,
986            merkle_proofs_valid: 10,
987            merkle_proofs_invalid: 0,
988            adapter_perplexity_delta: Some(-0.5),
989        };
990        let score = composite_score(&metrics);
991        assert!((score - 1.0).abs() < 1e-6);
992    }
993
994    // ---- peer_composite_score ----
995
996    #[test]
997    fn test_peer_composite_score_fast_good_peer() {
998        let metrics = PeerQualityMetrics {
999            precision_at_10: 1.0,
1000            ndcg_at_10: 1.0,
1001            mrr: 1.0,
1002            response_latency_ms: 0,
1003            proof_validity_rate: 1.0,
1004            availability_rate: 1.0,
1005        };
1006        let score = peer_composite_score(&metrics);
1007        // quality = 0.4 + 0.2 + 0.1 = 0.7
1008        // reliability = 1.0
1009        // latency_score = 1.0
1010        // 0.7*0.7 + 0.2*1.0 + 0.1*1.0 = 0.49 + 0.2 + 0.1 = 0.79
1011        assert!((score - 0.79).abs() < 0.01, "Got {}", score);
1012    }
1013
1014    #[test]
1015    fn test_peer_composite_score_slow_peer() {
1016        let metrics = PeerQualityMetrics {
1017            precision_at_10: 1.0,
1018            ndcg_at_10: 1.0,
1019            mrr: 1.0,
1020            response_latency_ms: 5000,
1021            proof_validity_rate: 1.0,
1022            availability_rate: 1.0,
1023        };
1024        let score = peer_composite_score(&metrics);
1025        assert!((score - 0.69).abs() < 0.01, "Got {}", score);
1026    }
1027
1028    #[test]
1029    fn test_peer_composite_score_unreachable() {
1030        let metrics = PeerQualityMetrics {
1031            precision_at_10: 0.0,
1032            ndcg_at_10: 0.0,
1033            mrr: 0.0,
1034            response_latency_ms: 0,
1035            proof_validity_rate: 0.0,
1036            availability_rate: 0.0,
1037        };
1038        let score = peer_composite_score(&metrics);
1039        assert!((score - 0.1).abs() < 0.01, "Got {}", score);
1040    }
1041
1042    // ---- update_contributor_ratings ----
1043
1044    #[test]
1045    fn test_update_contributor_ratings_single_result() {
1046        let result = EvaluationResult {
1047            contributor_key: [1u8; 32],
1048            validator_node_id: [0u8; 16],
1049            window_id: 100,
1050            metrics: QualityMetrics {
1051                precision_at_10: 0.8,
1052                ndcg_at_10: 0.7,
1053                mrr: 0.9,
1054                merkle_proofs_valid: 5,
1055                merkle_proofs_invalid: 0,
1056                adapter_perplexity_delta: None,
1057            },
1058            timestamp: 0,
1059            signature: [0u8; 64],
1060        };
1061
1062        let mut ratings = HashMap::new();
1063        update_contributor_ratings(&[result], &mut ratings);
1064        assert!(ratings.is_empty());
1065    }
1066
1067    #[test]
1068    fn test_update_contributor_ratings_two_results() {
1069        let good = EvaluationResult {
1070            contributor_key: [1u8; 32],
1071            validator_node_id: [0u8; 16],
1072            window_id: 100,
1073            metrics: QualityMetrics {
1074                precision_at_10: 0.9,
1075                ndcg_at_10: 0.9,
1076                mrr: 0.9,
1077                merkle_proofs_valid: 10,
1078                merkle_proofs_invalid: 0,
1079                adapter_perplexity_delta: None,
1080            },
1081            timestamp: 0,
1082            signature: [0u8; 64],
1083        };
1084
1085        let bad = EvaluationResult {
1086            contributor_key: [2u8; 32],
1087            validator_node_id: [0u8; 16],
1088            window_id: 100,
1089            metrics: QualityMetrics {
1090                precision_at_10: 0.1,
1091                ndcg_at_10: 0.1,
1092                mrr: 0.1,
1093                merkle_proofs_valid: 1,
1094                merkle_proofs_invalid: 9,
1095                adapter_perplexity_delta: None,
1096            },
1097            timestamp: 0,
1098            signature: [0u8; 64],
1099        };
1100
1101        let mut ratings = HashMap::new();
1102        update_contributor_ratings(&[good, bad], &mut ratings);
1103
1104        let good_rating = ratings.get(&[1u8; 32]).unwrap();
1105        let bad_rating = ratings.get(&[2u8; 32]).unwrap();
1106
1107        assert!(
1108            good_rating.0.mu > bad_rating.0.mu,
1109            "Good contributor mu ({}) should exceed bad contributor mu ({})",
1110            good_rating.0.mu,
1111            bad_rating.0.mu
1112        );
1113    }
1114}
cp_validator/evaluation.rs

cp_validator/
evaluation.rs