1use cp_arweave::ArweaveClient;
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::path::Path;
12use tracing::info;
13use uuid::Uuid;
14
/// A single test query together with its embedding and graded relevant chunks.
///
/// `relevant_chunk_ids` and `relevance_grades` are parallel vectors: they are
/// paired by position in [`TestQuery::relevance_grade_map`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestQuery {
    /// Text of the query.
    pub query_text: String,

    /// Query embedding stored as `i16` components.
    ///
    /// [`TestQuery::query_embedding_f32`] converts it to `f32` by dividing each
    /// component by `32767.0`.
    pub query_embedding: Vec<i16>,

    /// Identifiers of the chunks considered relevant to this query.
    pub relevant_chunk_ids: Vec<Uuid>,

    /// Relevance grade for each entry of `relevant_chunk_ids`, matched by index.
    ///
    /// NOTE(review): the grade scale is not defined in this file — presumably a
    /// higher value means more relevant; confirm against the corpus producer.
    pub relevance_grades: Vec<u8>,
}
39
40impl TestQuery {
41 pub fn query_embedding_f32(&self) -> Vec<f32> {
44 self.query_embedding
45 .iter()
46 .map(|&v| f32::from(v) / 32767.0)
47 .collect()
48 }
49
50 pub fn query_embedding_i16(&self) -> Vec<i16> {
52 self.query_embedding.clone()
53 }
54
55 pub fn relevance_grade_map(&self) -> HashMap<Uuid, u8> {
57 self.relevant_chunk_ids
58 .iter()
59 .zip(self.relevance_grades.iter())
60 .map(|(id, &grade)| (*id, grade))
61 .collect()
62 }
63}
64
/// A collection of [`TestQuery`] entries.
///
/// The corpus is (de)serialized through serde; the loaders in this file read it
/// from CBOR or JSON, and `save_to_file` writes it as CBOR.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestCorpus {
    /// The queries contained in this corpus.
    pub queries: Vec<TestQuery>,
}
71
72impl TestCorpus {
73 pub fn new() -> Self {
75 Self {
76 queries: Vec::new(),
77 }
78 }
79
80 pub fn len(&self) -> usize {
82 self.queries.len()
83 }
84
85 pub fn is_empty(&self) -> bool {
87 self.queries.is_empty()
88 }
89
90 pub fn load_from_file(path: &Path) -> Result<Self, crate::ValidatorError> {
94 let bytes = std::fs::read(path).map_err(|e| {
95 crate::ValidatorError::Corpus(format!(
96 "Failed to read corpus file {}: {}",
97 path.display(),
98 e
99 ))
100 })?;
101
102 if let Ok(corpus) = ciborium::from_reader::<TestCorpus, _>(bytes.as_slice()) {
104 info!(queries = corpus.queries.len(), path = %path.display(), "Loaded corpus from CBOR");
105 return Ok(corpus);
106 }
107
108 let corpus: TestCorpus = serde_json::from_slice(&bytes).map_err(|e| {
109 crate::ValidatorError::Corpus(format!(
110 "Failed to parse corpus file {} (tried CBOR and JSON): {}",
111 path.display(),
112 e
113 ))
114 })?;
115
116 info!(queries = corpus.queries.len(), path = %path.display(), "Loaded corpus from JSON");
117 Ok(corpus)
118 }
119
120 pub fn save_to_file(&self, path: &Path) -> Result<(), crate::ValidatorError> {
122 let mut buf = Vec::new();
123 ciborium::into_writer(self, &mut buf).map_err(|e| {
124 crate::ValidatorError::Corpus(format!("Failed to serialize corpus: {e}"))
125 })?;
126
127 std::fs::write(path, &buf).map_err(|e| {
128 crate::ValidatorError::Corpus(format!(
129 "Failed to write corpus file {}: {}",
130 path.display(),
131 e
132 ))
133 })?;
134
135 info!(queries = self.queries.len(), path = %path.display(), "Saved corpus to CBOR");
136 Ok(())
137 }
138
139 pub async fn load_from_arweave(arweave: &ArweaveClient) -> Result<Self, crate::ValidatorError> {
145 info!("Loading test corpus from Arweave");
146
147 let tags = vec![cp_arweave::TagFilter::new(
149 "Content-Type",
150 &["application/x-canon-test-corpus"],
151 )];
152
153 let result = arweave
154 .query_transactions(&tags, 1, None, cp_arweave::SortOrder::HeightDesc)
155 .await
156 .map_err(|e| crate::ValidatorError::Arweave(format!("Failed to query corpus: {e}")))?;
157
158 if result.edges.is_empty() {
159 return Err(crate::ValidatorError::Corpus(
160 "No test corpus found on Arweave".to_string(),
161 ));
162 }
163
164 let tx_id = &result.edges[0].node.id;
165 info!(tx_id = tx_id, "Found corpus transaction");
166
167 let data = arweave.get_transaction_data(tx_id).await.map_err(|e| {
168 crate::ValidatorError::Arweave(format!("Failed to download corpus: {e}"))
169 })?;
170
171 let corpus_bytes = match zstd::decode_all(data.as_slice()) {
173 Ok(decompressed) => decompressed,
174 Err(_) => data,
175 };
176
177 let corpus: TestCorpus = ciborium::from_reader(corpus_bytes.as_slice()).map_err(|e| {
178 crate::ValidatorError::Corpus(format!("Failed to parse corpus from Arweave: {e}"))
179 })?;
180
181 info!(queries = corpus.queries.len(), "Loaded corpus from Arweave");
182 Ok(corpus)
183 }
184}
185
186impl Default for TestCorpus {
187 fn default() -> Self {
188 Self::new()
189 }
190}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a deterministic query whose text, embedding and chunk ids are all
    /// derived from `i`; the two chunks are graded 3 and 1.
    fn sample_query(i: usize) -> TestQuery {
        let first_id = Uuid::from_bytes([i as u8; 16]);
        let second_id = Uuid::from_bytes([(i + 1) as u8; 16]);

        TestQuery {
            query_text: format!("What is topic {i}?"),
            query_embedding: vec![i as i16; 10],
            relevant_chunk_ids: vec![first_id, second_id],
            relevance_grades: vec![3, 1],
        }
    }

    // The grade map pairs each chunk id with the grade at the same index.
    #[test]
    fn test_test_query_relevance_grade_map() {
        let grades = sample_query(0).relevance_grade_map();

        assert_eq!(grades.len(), 2);
        assert_eq!(grades.get(&Uuid::from_bytes([0u8; 16])).copied(), Some(3));
        assert_eq!(grades.get(&Uuid::from_bytes([1u8; 16])).copied(), Some(1));
    }

    // The i16 accessor hands back the stored embedding unchanged.
    #[test]
    fn test_test_query_embedding_i16() {
        let raw = sample_query(5).query_embedding_i16();
        assert_eq!(raw, vec![5i16; 10]);
    }

    // The f32 accessor keeps the length and scales by 1 / 32767.
    #[test]
    fn test_test_query_embedding_f32() {
        let values = sample_query(5).query_embedding_f32();
        let expected: f32 = 5.0 / 32767.0;

        assert_eq!(values.len(), 10);
        assert!((values[0] - expected).abs() < 1e-6);
    }

    #[test]
    fn test_corpus_new_empty() {
        let empty = TestCorpus::new();

        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    #[test]
    fn test_corpus_with_queries() {
        let queries: Vec<TestQuery> = (0..10).map(sample_query).collect();
        let corpus = TestCorpus { queries };

        assert!(!corpus.is_empty());
        assert_eq!(corpus.len(), 10);
    }

    // Saving then loading through a temporary file preserves every field.
    #[test]
    fn test_corpus_save_load_cbor_roundtrip() {
        let original = TestCorpus {
            queries: (0..5).map(sample_query).collect(),
        };

        let file = tempfile::NamedTempFile::new().unwrap();
        original.save_to_file(file.path()).unwrap();
        let reloaded = TestCorpus::load_from_file(file.path()).unwrap();

        assert_eq!(reloaded.len(), original.len());
        for (got, want) in reloaded.queries.iter().zip(&original.queries) {
            assert_eq!(got.query_text, want.query_text);
            assert_eq!(got.query_embedding, want.query_embedding);
            assert_eq!(got.relevant_chunk_ids, want.relevant_chunk_ids);
            assert_eq!(got.relevance_grades, want.relevance_grades);
        }
    }

    // A JSON-encoded corpus is accepted by the file loader as well.
    #[test]
    fn test_corpus_load_json() {
        let original = TestCorpus {
            queries: vec![sample_query(0)],
        };
        let encoded = serde_json::to_vec(&original).unwrap();

        let file = tempfile::NamedTempFile::new().unwrap();
        std::fs::write(file.path(), &encoded).unwrap();

        let reloaded = TestCorpus::load_from_file(file.path()).unwrap();
        assert_eq!(reloaded.len(), 1);
        assert_eq!(reloaded.queries[0].query_text, "What is topic 0?");
    }

    #[test]
    fn test_corpus_load_nonexistent() {
        let missing = Path::new("/nonexistent/corpus.cbor");
        assert!(TestCorpus::load_from_file(missing).is_err());
    }

    // Direct ciborium encode/decode, bypassing the file helpers.
    #[test]
    fn test_corpus_cbor_serialization() {
        let original = TestCorpus {
            queries: (0..3).map(sample_query).collect(),
        };

        let mut encoded = Vec::new();
        ciborium::into_writer(&original, &mut encoded).unwrap();
        let restored: TestCorpus = ciborium::from_reader(encoded.as_slice()).unwrap();

        assert_eq!(restored.len(), 3);
    }

    #[test]
    fn test_test_query_serialization() {
        let original = sample_query(42);

        let encoded = serde_json::to_string(&original).unwrap();
        let restored: TestQuery = serde_json::from_str(&encoded).unwrap();

        assert_eq!(restored.query_text, original.query_text);
        assert_eq!(restored.relevant_chunk_ids, original.relevant_chunk_ids);
    }
}