1use crate::decode::*;
7use crate::encode::*;
8use byteorder::{BigEndian, ReadBytesExt};
9use std::collections::HashMap;
10use std::io::Error;
11
12pub fn relabel_ben_lines<R: Read, W: Write>(
27 mut reader: R,
28 mut writer: W,
29 variant: BenVariant,
30) -> io::Result<()> {
31 let mut sample_number = 0;
32 loop {
33 let mut tmp_buffer = [0u8];
34 let max_val_bits = match reader.read_exact(&mut tmp_buffer) {
35 Ok(_) => tmp_buffer[0],
36 Err(e) => {
37 if e.kind() == io::ErrorKind::UnexpectedEof {
38 break;
39 }
40 return Err(e);
41 }
42 };
43
44 let max_len_bits = reader.read_u8()?;
45 let n_bytes = reader.read_u32::<BigEndian>()?;
46
47 let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?;
48
49 let mut label = 0;
51 let mut label_map = HashMap::new();
52 for (val, _len) in ben_line.iter_mut() {
53 let new_val = match label_map.get(val) {
54 Some(v) => *v,
55 None => {
56 label += 1;
57 label_map.insert(*val, label);
58 label
59 }
60 };
61 *val = new_val;
62 }
63
64 let relabeled = encode_ben_vec_from_rle(ben_line);
65 writer.write_all(&relabeled)?;
66
67 let count_occurrences = if variant == BenVariant::MkvChain {
68 let count = reader.read_u16::<BigEndian>()?;
69 writer.write_all(&count.to_be_bytes())?;
70 count
71 } else {
72 1
73 };
74
75 sample_number += count_occurrences as usize;
76
77 log!("Relabeling line: {}\r", sample_number);
78 }
79 logln!();
80 logln!("Done!");
81
82 Ok(())
83}
84
85pub fn relabel_ben_file<R: Read, W: Write>(mut reader: R, mut writer: W) -> io::Result<()> {
101 let mut check_buffer = [0u8; 17];
102 reader.read_exact(&mut check_buffer)?;
103
104 let variant = match &check_buffer {
105 b"STANDARD BEN FILE" => BenVariant::Standard,
106 b"MKVCHAIN BEN FILE" => BenVariant::MkvChain,
107 _ => {
108 return Err(Error::new(
109 io::ErrorKind::InvalidData,
110 "Invalid file format",
111 ));
112 }
113 };
114
115 writer.write_all(&check_buffer)?;
116
117 relabel_ben_lines(&mut reader, &mut writer, variant)?;
118
119 Ok(())
120}
121
122pub fn relabel_ben_lines_with_map<R: Read, W: Write>(
141 mut reader: R,
142 mut writer: W,
143 new_to_old_node_map: HashMap<usize, usize>,
144 variant: BenVariant,
145) -> io::Result<()> {
146 let mut sample_number = 0;
147 loop {
148 let mut tmp_buffer = [0u8];
149 let max_val_bits = match reader.read_exact(&mut tmp_buffer) {
150 Ok(_) => tmp_buffer[0],
151 Err(e) => {
152 if e.kind() == io::ErrorKind::UnexpectedEof {
153 break;
154 }
155 return Err(e);
156 }
157 };
158
159 let max_len_bits = reader.read_u8()?;
160 let n_bytes = reader.read_u32::<BigEndian>()?;
161
162 let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?;
163
164 let assignment_vec = rle_to_vec(ben_line);
165 let new_assignment_vec = assignment_vec
166 .iter()
167 .enumerate()
168 .map(|(i, _)| {
169 let new_val_pos = new_to_old_node_map.get(&i).unwrap();
171 let new_val = assignment_vec[*new_val_pos];
173 new_val
174 })
175 .collect::<Vec<u16>>();
176
177 let new_rle = assign_to_rle(new_assignment_vec);
178
179 let relabeled = encode_ben_vec_from_rle(new_rle);
180 writer.write_all(&relabeled)?;
181
182 let count_occurrences = if variant == BenVariant::MkvChain {
183 let count = reader.read_u16::<BigEndian>()?;
184 writer.write_all(&count.to_be_bytes())?;
185 count
186 } else {
187 1
188 };
189
190 sample_number += count_occurrences as usize;
191 log!("Relabeling line: {}\r", sample_number);
192 }
193 logln!();
194 logln!("Done!");
195
196 Ok(())
197}
198
199pub fn relabel_ben_file_with_map<R: Read, W: Write>(
218 mut reader: R,
219 mut writer: W,
220 new_to_old_node_map: HashMap<usize, usize>,
221) -> io::Result<()> {
222 let mut check_buffer = [0u8; 17];
223 reader.read_exact(&mut check_buffer)?;
224
225 let variant = match &check_buffer {
226 b"STANDARD BEN FILE" => BenVariant::Standard,
227 b"MKVCHAIN BEN FILE" => BenVariant::MkvChain,
228 _ => {
229 return Err(Error::new(
230 io::ErrorKind::InvalidData,
231 "Invalid file format",
232 ));
233 }
234 };
235
236 writer.write_all(&check_buffer)?;
237
238 relabel_ben_lines_with_map(&mut reader, &mut writer, new_to_old_node_map, variant)?;
239
240 Ok(())
241}
242
#[cfg(test)]
mod tests {
    use super::*;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rand_distr::{Distribution, Uniform};

    /// JSONL samples used as input by both standard-variant file tests.
    const SEVEN_SAMPLES: [&str; 7] = [
        r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":1}"#,
        r#"{"assignment":[2,1,3,4,5,5,3,4,2],"sample":2}"#,
        r#"{"assignment":[3,3,1,1,2,2,3,3,4],"sample":3}"#,
        r#"{"assignment":[4,3,2,1,4,3,2,1,1],"sample":4}"#,
        r#"{"assignment":[3,2,2,4,1,3,1,4,3],"sample":5}"#,
        r#"{"assignment":[2,2,3,3,4,4,5,5,1],"sample":6}"#,
        r#"{"assignment":[2,4,1,5,2,4,3,1,3],"sample":7}"#,
    ];

    /// Shuffles `vec` in place with a fixed seed and returns a map from each
    /// new index to the index of the first equal element in the pre-shuffle
    /// vector.
    fn shuffle_with_mapping<T>(vec: &mut Vec<T>) -> HashMap<usize, usize>
    where
        T: Clone + std::cmp::PartialEq,
    {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let before = vec.clone();
        vec.shuffle(&mut rng);

        vec.iter()
            .enumerate()
            .map(|(new_index, item)| {
                let original_index = before
                    .iter()
                    .position(|candidate| candidate == item)
                    .unwrap();
                (new_index, original_index)
            })
            .collect()
    }

    /// Joins `lines` into one string, terminating every line with `\n`.
    fn jsonl_lines(lines: &[&str]) -> String {
        let mut text = String::new();
        for line in lines {
            text.push_str(line);
            text.push('\n');
        }
        text
    }

    /// Encodes JSONL text into BEN bytes of the requested variant.
    fn jsonl_to_ben(jsonl: &str, variant: BenVariant) -> Vec<u8> {
        let mut ben = Vec::new();
        encode_jsonl_to_ben(jsonl.as_bytes(), io::BufWriter::new(&mut ben), variant).unwrap();
        ben
    }

    /// Decodes BEN bytes back into JSONL text.
    fn ben_to_jsonl(ben: &[u8]) -> String {
        let mut jsonl = Vec::new();
        decode_ben_to_jsonl(ben, io::BufWriter::new(&mut jsonl)).unwrap();
        String::from_utf8(jsonl).unwrap()
    }

    /// Runs `relabel_ben_file` over in-memory BEN bytes.
    fn relabel_file(ben: &[u8]) -> Vec<u8> {
        let mut relabeled = Vec::new();
        relabel_ben_file(ben, io::BufWriter::new(&mut relabeled)).unwrap();
        relabeled
    }

    /// Runs `relabel_ben_file_with_map` over in-memory BEN bytes.
    fn relabel_file_with_map(ben: &[u8], map: HashMap<usize, usize>) -> Vec<u8> {
        let mut relabeled = Vec::new();
        relabel_ben_file_with_map(ben, io::BufWriter::new(&mut relabeled), map).unwrap();
        relabeled
    }

    /// Encodes a flat assignment vector as a single BEN line.
    fn encode_assignment(assignment: Vec<u16>) -> Vec<u8> {
        encode_ben_vec_from_rle(assign_to_rle(assignment))
    }

    /// Runs `relabel_ben_lines_with_map` (standard variant) over encoded lines.
    fn relabel_lines_with_map(input: &[u8], map: HashMap<usize, usize>) -> Vec<u8> {
        let mut relabeled = Vec::new();
        relabel_ben_lines_with_map(input, &mut relabeled, map, BenVariant::Standard).unwrap();
        relabeled
    }

    /// Map over nine positions where new position `i` reads old position
    /// `(i + 2) % 9`, i.e. 0->2, 1->3, ..., 6->8, 7->0, 8->1.
    fn rotate_by_two_map() -> HashMap<usize, usize> {
        (0..9).map(|new_pos| (new_pos, (new_pos + 2) % 9)).collect()
    }

    #[test]
    fn test_relabel_ben_line_simple() {
        let input = encode_ben_vec_from_rle(vec![(2, 2), (3, 2), (1, 2), (4, 2)]);
        let expected = encode_ben_vec_from_rle(vec![(1, 2), (2, 2), (3, 2), (4, 2)]);

        let mut relabeled = Vec::new();
        relabel_ben_lines(input.as_slice(), &mut relabeled, BenVariant::Standard).unwrap();

        assert_eq!(relabeled, expected);
    }

    #[test]
    fn test_relabel_simple_file() {
        let ben = jsonl_to_ben(&jsonl_lines(&SEVEN_SAMPLES), BenVariant::Standard);

        let output_str = ben_to_jsonl(&relabel_file(&ben));

        let out_file = jsonl_lines(&[
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":1}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,1],"sample":2}"#,
            r#"{"assignment":[1,1,2,2,3,3,1,1,4],"sample":3}"#,
            r#"{"assignment":[1,2,3,4,1,2,3,4,4],"sample":4}"#,
            r#"{"assignment":[1,2,2,3,4,1,4,3,1],"sample":5}"#,
            r#"{"assignment":[1,1,2,2,3,3,4,4,5],"sample":6}"#,
            r#"{"assignment":[1,2,3,4,1,2,5,3,5],"sample":7}"#,
        ]);

        assert_eq!(output_str, out_file);
    }

    #[test]
    fn test_relabel_simple_file_mkv() {
        let file = jsonl_lines(&[
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":1}"#,
            r#"{"assignment":[2,1,3,4,5,5,3,4,2],"sample":2}"#,
            r#"{"assignment":[3,3,1,1,2,2,3,3,4],"sample":3}"#,
            r#"{"assignment":[4,3,2,1,4,3,2,1,1],"sample":4}"#,
            r#"{"assignment":[3,2,2,4,1,3,1,4,3],"sample":5}"#,
            r#"{"assignment":[3,2,2,4,1,3,1,4,3],"sample":6}"#,
            r#"{"assignment":[3,2,2,4,1,3,1,4,3],"sample":7}"#,
            r#"{"assignment":[2,2,3,3,4,4,5,5,1],"sample":8}"#,
            r#"{"assignment":[2,4,1,5,2,4,3,1,3],"sample":9}"#,
            r#"{"assignment":[2,4,1,5,2,4,3,1,3],"sample":10}"#,
        ]);

        let ben = jsonl_to_ben(&file, BenVariant::MkvChain);

        let output_str = ben_to_jsonl(&relabel_file(&ben));

        let out_file = jsonl_lines(&[
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":1}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,1],"sample":2}"#,
            r#"{"assignment":[1,1,2,2,3,3,1,1,4],"sample":3}"#,
            r#"{"assignment":[1,2,3,4,1,2,3,4,4],"sample":4}"#,
            r#"{"assignment":[1,2,2,3,4,1,4,3,1],"sample":5}"#,
            r#"{"assignment":[1,2,2,3,4,1,4,3,1],"sample":6}"#,
            r#"{"assignment":[1,2,2,3,4,1,4,3,1],"sample":7}"#,
            r#"{"assignment":[1,1,2,2,3,3,4,4,5],"sample":8}"#,
            r#"{"assignment":[1,2,3,4,1,2,5,3,5],"sample":9}"#,
            r#"{"assignment":[1,2,3,4,1,2,5,3,5],"sample":10}"#,
        ]);

        assert_eq!(output_str, out_file);
    }

    #[test]
    fn test_relabel_ben_line_with_map() {
        let input = encode_assignment(vec![2, 3, 1, 4, 5, 5, 3, 4, 2]);
        let expected = encode_assignment(vec![1, 2, 2, 3, 3, 4, 4, 5, 5]);

        // Entry `k` of this table is the old position read by new position `k`.
        let new_to_old_map: HashMap<usize, usize> = [2, 0, 8, 1, 6, 3, 7, 4, 5]
            .iter()
            .copied()
            .enumerate()
            .collect();

        let relabeled = relabel_lines_with_map(input.as_slice(), new_to_old_map);

        assert_eq!(relabeled, expected);
    }

    #[test]
    fn test_relabel_ben_line_with_shuffle() {
        let in_assign: Vec<u16> = vec![2, 3, 1, 4, 5, 5, 3, 4, 2];
        let mut out_assign = in_assign.clone();

        let input = encode_assignment(in_assign);

        let new_to_old_map = shuffle_with_mapping(&mut out_assign);
        let expected = encode_assignment(out_assign);

        let relabeled = relabel_lines_with_map(input.as_slice(), new_to_old_map);

        assert_eq!(relabeled, expected);
    }

    #[test]
    fn test_relabel_ben_line_with_large_shuffle() {
        let seed = 129530786u64;
        let mut rng = ChaCha8Rng::seed_from_u64(seed);

        let label_sampler = Uniform::new(1, 21).expect("Could not make uniform sampler");

        let in_assign = (0..100_000)
            .map(|_| label_sampler.sample(&mut rng) as u16)
            .collect::<Vec<u16>>();
        let mut out_assign = in_assign.clone();

        let input = encode_assignment(in_assign.to_vec());

        let new_to_old_map = shuffle_with_mapping(&mut out_assign);
        let expected = encode_assignment(out_assign);

        let relabeled = relabel_lines_with_map(input.as_slice(), new_to_old_map);

        assert_eq!(relabeled, expected);
    }

    #[test]
    fn test_relabel_simple_file_with_map() {
        let ben = jsonl_to_ben(&jsonl_lines(&SEVEN_SAMPLES), BenVariant::Standard);

        let output_str = ben_to_jsonl(&relabel_file_with_map(&ben, rotate_by_two_map()));

        let out_file = jsonl_lines(&[
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":1}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,2,1],"sample":2}"#,
            r#"{"assignment":[1,1,2,2,3,3,4,3,3],"sample":3}"#,
            r#"{"assignment":[2,1,4,3,2,1,1,4,3],"sample":4}"#,
            r#"{"assignment":[2,4,1,3,1,4,3,3,2],"sample":5}"#,
            r#"{"assignment":[3,3,4,4,5,5,1,2,2],"sample":6}"#,
            r#"{"assignment":[1,5,2,4,3,1,3,2,4],"sample":7}"#,
        ]);

        assert_eq!(output_str, out_file);
    }

    #[test]
    fn test_relabel_simple_file_with_map_mkv() {
        let file = jsonl_lines(&[
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":1}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":2}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":3}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":4}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":5}"#,
            r#"{"assignment":[1,2,3,4,5,5,3,4,2],"sample":6}"#,
            r#"{"assignment":[2,1,3,4,5,5,3,4,2],"sample":7}"#,
            r#"{"assignment":[2,1,3,4,5,5,3,4,2],"sample":8}"#,
            r#"{"assignment":[2,1,3,4,5,5,3,4,2],"sample":9}"#,
            r#"{"assignment":[2,4,1,5,2,4,3,1,3],"sample":10}"#,
        ]);

        let ben = jsonl_to_ben(&file, BenVariant::MkvChain);

        let output_str = ben_to_jsonl(&relabel_file_with_map(&ben, rotate_by_two_map()));

        let out_file = jsonl_lines(&[
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":1}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":2}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":3}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":4}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":5}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,1,2],"sample":6}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,2,1],"sample":7}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,2,1],"sample":8}"#,
            r#"{"assignment":[3,4,5,5,3,4,2,2,1],"sample":9}"#,
            r#"{"assignment":[1,5,2,4,3,1,3,2,4],"sample":10}"#,
        ]);

        assert_eq!(output_str, out_file);
    }
}