ben/encode/
relabel.rs

//! This module contains the main functions that are used in the `reben` binary
//! for relabeling the assignment vectors in a BEN file. The relabeling can be
//! done either so that the values are in ascending order or according to a
//! mapping provided by the user in a map file.
5
6use crate::decode::*;
7use crate::encode::*;
8use byteorder::{BigEndian, ReadBytesExt};
9use std::collections::HashMap;
10use std::io::Error;
11
12/// Relabels each of the assignment vectors in a BEN file so that the values are
13/// in ascending order.
14///
15/// # Arguments
16///
17/// * `reader` - A reader that implements the `Read` trait containing the BEN file to
18/// be relabeled.
19/// * `writer` - A writer that implements the `Write` trait and which will contain the
20/// relabeled BEN file.
21///
22/// # Errors
23///
24/// Returns an error if the file format is invalid or if there is an issue reading or writing
25/// the file.
26pub fn relabel_ben_lines<R: Read, W: Write>(
27    mut reader: R,
28    mut writer: W,
29    variant: BenVariant,
30) -> io::Result<()> {
31    let mut sample_number = 0;
32    loop {
33        let mut tmp_buffer = [0u8];
34        let max_val_bits = match reader.read_exact(&mut tmp_buffer) {
35            Ok(_) => tmp_buffer[0],
36            Err(e) => {
37                if e.kind() == io::ErrorKind::UnexpectedEof {
38                    break;
39                }
40                return Err(e);
41            }
42        };
43
44        let max_len_bits = reader.read_u8()?;
45        let n_bytes = reader.read_u32::<BigEndian>()?;
46
47        let mut ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?;
48
49        // relabel the line
50        let mut label = 0;
51        let mut label_map = HashMap::new();
52        for (val, _len) in ben_line.iter_mut() {
53            let new_val = match label_map.get(val) {
54                Some(v) => *v,
55                None => {
56                    label += 1;
57                    label_map.insert(*val, label);
58                    label
59                }
60            };
61            *val = new_val;
62        }
63
64        let relabeled = encode_ben_vec_from_rle(ben_line);
65        writer.write_all(&relabeled)?;
66
67        let count_occurrences = if variant == BenVariant::MkvChain {
68            let count = reader.read_u16::<BigEndian>()?;
69            writer.write_all(&count.to_be_bytes())?;
70            count
71        } else {
72            1
73        };
74
75        sample_number += count_occurrences as usize;
76
77        log!("Relabeling line: {}\r", sample_number);
78    }
79    logln!();
80    logln!("Done!");
81
82    Ok(())
83}
84
85/// Relabels the values in a BEN file so that the assignment vector values are
86/// in ascending order. So , if the assignment vector is [2, 3, 1, 4, 5, 5, 3, 4, 2]
87/// the relabeled assignment vector will be [1, 2, 3, 4, 5, 5, 2, 4, 1].
88///
89/// # Arguments
90///
91/// * `reader` - A reader that implements the `Read` trait containing the BEN file to
92/// be relabeled.
93/// * `writer` - A writer that implements the `Write` trait and which will contain the
94/// relabeled BEN file.
95///
96/// # Errors
97///
98/// Returns an error if the file format is invalid or if there is an issue reading or writing
99/// the file.
100pub fn relabel_ben_file<R: Read, W: Write>(mut reader: R, mut writer: W) -> io::Result<()> {
101    let mut check_buffer = [0u8; 17];
102    reader.read_exact(&mut check_buffer)?;
103
104    let variant = match &check_buffer {
105        b"STANDARD BEN FILE" => BenVariant::Standard,
106        b"MKVCHAIN BEN FILE" => BenVariant::MkvChain,
107        _ => {
108            return Err(Error::new(
109                io::ErrorKind::InvalidData,
110                "Invalid file format",
111            ));
112        }
113    };
114
115    writer.write_all(&check_buffer)?;
116
117    relabel_ben_lines(&mut reader, &mut writer, variant)?;
118
119    Ok(())
120}
121
122/// Relabels the values in a BEN file so that the assignment vector values are
123/// sorted according to a mapping. The mapping is a HashMap where the key is the
124/// position in the new assignment vector and the value is the position in the old
125/// assignment vector.
126///
127/// # Arguments
128///
129/// * `reader` - A reader that implements the `Read` trait containing the BEN file to
130/// be relabeled.
131/// * `writer` - A writer that implements the `Write` trait and which will contain the
132/// relabeled BEN file.
133/// * `new_to_old_node_map` - A HashMap where the key is the position in the new assignment
134/// vector and the value is the position in the old assignment vector.
135///
136/// # Errors
137///
138/// Returns an error if the file format is invalid or if there is an issue reading or writing
139/// the file.
140pub fn relabel_ben_lines_with_map<R: Read, W: Write>(
141    mut reader: R,
142    mut writer: W,
143    new_to_old_node_map: HashMap<usize, usize>,
144    variant: BenVariant,
145) -> io::Result<()> {
146    let mut sample_number = 0;
147    loop {
148        let mut tmp_buffer = [0u8];
149        let max_val_bits = match reader.read_exact(&mut tmp_buffer) {
150            Ok(_) => tmp_buffer[0],
151            Err(e) => {
152                if e.kind() == io::ErrorKind::UnexpectedEof {
153                    break;
154                }
155                return Err(e);
156            }
157        };
158
159        let max_len_bits = reader.read_u8()?;
160        let n_bytes = reader.read_u32::<BigEndian>()?;
161
162        let ben_line = decode_ben_line(&mut reader, max_val_bits, max_len_bits, n_bytes)?;
163
164        let assignment_vec = rle_to_vec(ben_line);
165        let new_assignment_vec = assignment_vec
166            .iter()
167            .enumerate()
168            .map(|(i, _)| {
169                // position of the new value in the old assignment
170                let new_val_pos = new_to_old_node_map.get(&i).unwrap();
171                // get the new value from the old assignment
172                let new_val = assignment_vec[*new_val_pos];
173                new_val
174            })
175            .collect::<Vec<u16>>();
176
177        let new_rle = assign_to_rle(new_assignment_vec);
178
179        let relabeled = encode_ben_vec_from_rle(new_rle);
180        writer.write_all(&relabeled)?;
181
182        let count_occurrences = if variant == BenVariant::MkvChain {
183            let count = reader.read_u16::<BigEndian>()?;
184            writer.write_all(&count.to_be_bytes())?;
185            count
186        } else {
187            1
188        };
189
190        sample_number += count_occurrences as usize;
191        log!("Relabeling line: {}\r", sample_number);
192    }
193    logln!();
194    logln!("Done!");
195
196    Ok(())
197}
198
199/// Relabels the values in a BEN file so that the assignment vector values are
200/// sorted according to a mapping. The mapping is a HashMap where the key is the
201/// position in the new assignment vector and the value is the position in the old
202/// assignment vector.
203///
204/// # Arguments
205///
206/// * `reader` - A reader that implements the `Read` trait containing the BEN file to
207/// be relabeled.
208/// * `writer` - A writer that implements the `Write` trait and which will contain the
209/// relabeled BEN file.
210/// * `new_to_old_node_map` - A HashMap where the key is the position in the new assignment
211/// vector and the value is the position in the old assignment vector.
212///
213/// # Errors
214///
215/// Returns an error if the file format is invalid or if there is an issue reading or writing
216/// the file.d according to a mapping. The mapping is a HashMap where the key is the
217pub fn relabel_ben_file_with_map<R: Read, W: Write>(
218    mut reader: R,
219    mut writer: W,
220    new_to_old_node_map: HashMap<usize, usize>,
221) -> io::Result<()> {
222    let mut check_buffer = [0u8; 17];
223    reader.read_exact(&mut check_buffer)?;
224
225    let variant = match &check_buffer {
226        b"STANDARD BEN FILE" => BenVariant::Standard,
227        b"MKVCHAIN BEN FILE" => BenVariant::MkvChain,
228        _ => {
229            return Err(Error::new(
230                io::ErrorKind::InvalidData,
231                "Invalid file format",
232            ));
233        }
234    };
235
236    writer.write_all(&check_buffer)?;
237
238    relabel_ben_lines_with_map(&mut reader, &mut writer, new_to_old_node_map, variant)?;
239
240    Ok(())
241}
242
243#[cfg(test)]
244mod tests {
245    use super::*;
246    use rand::seq::SliceRandom;
247    use rand::SeedableRng;
248    use rand_chacha::ChaCha8Rng;
249    use rand_distr::{Distribution, Uniform};
250
251    fn shuffle_with_mapping<T>(vec: &mut Vec<T>) -> HashMap<usize, usize>
252    where
253        T: Clone + std::cmp::PartialEq,
254    {
255        let mut rng = ChaCha8Rng::seed_from_u64(42);
256        let original_vec = vec.clone(); // Clone the original vector to preserve initial values
257        vec.shuffle(&mut rng);
258
259        let mut map = HashMap::new();
260        for (new_index, item) in vec.iter().enumerate() {
261            let original_index = original_vec.iter().position(|i| i == item).unwrap();
262            map.insert(new_index, original_index);
263        }
264        map
265    }
266
267    #[test]
268    fn test_relabel_ben_line_simple() {
269        let in_rle = vec![(2, 2), (3, 2), (1, 2), (4, 2)];
270
271        let input = encode_ben_vec_from_rle(in_rle);
272
273        let out_rle = vec![(1, 2), (2, 2), (3, 2), (4, 2)];
274        let expected = encode_ben_vec_from_rle(out_rle);
275
276        let mut buf = Vec::new();
277        relabel_ben_lines(input.as_slice(), &mut buf, BenVariant::Standard).unwrap();
278
279        assert_eq!(buf, expected);
280    }
281
282    #[test]
283    fn test_relabel_simple_file() {
284        let file = format!(
285            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
286            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
287            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}",
288            "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}",
289            "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}",
290            "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}",
291            "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}",
292            "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}"
293        );
294
295        let input = file.as_bytes();
296
297        let mut output = Vec::new();
298        let writer = io::BufWriter::new(&mut output);
299
300        encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap();
301
302        let mut output2 = Vec::new();
303        let writer2 = io::BufWriter::new(&mut output2);
304        relabel_ben_file(output.as_slice(), writer2).unwrap();
305
306        let mut output3 = Vec::new();
307        let writer3 = io::BufWriter::new(&mut output3);
308        decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap();
309
310        let output_str = String::from_utf8(output3).unwrap();
311
312        let out_file = format!(
313            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
314            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
315            "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}",
316            "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}",
317            "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}",
318            "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}",
319            "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":6}",
320            "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":7}"
321        );
322
323        assert_eq!(output_str, out_file);
324    }
325
326    #[test]
327    fn test_relabel_simple_file_mkv() {
328        let file = format!(
329            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
330            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
331            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}",
332            "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}",
333            "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}",
334            "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}",
335            "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":6}",
336            "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":7}",
337            "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":8}",
338            "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":9}",
339            "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}"
340        );
341
342        let input = file.as_bytes();
343
344        let mut output = Vec::new();
345        let writer = io::BufWriter::new(&mut output);
346
347        encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap();
348
349        let mut output2 = Vec::new();
350        let writer2 = io::BufWriter::new(&mut output2);
351        relabel_ben_file(output.as_slice(), writer2).unwrap();
352
353        let mut output3 = Vec::new();
354        let writer3 = io::BufWriter::new(&mut output3);
355        decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap();
356
357        let output_str = String::from_utf8(output3).unwrap();
358
359        let out_file = format!(
360            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
361            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
362            "{\"assignment\":[1,2,3,4,5,5,3,4,1],\"sample\":2}",
363            "{\"assignment\":[1,1,2,2,3,3,1,1,4],\"sample\":3}",
364            "{\"assignment\":[1,2,3,4,1,2,3,4,4],\"sample\":4}",
365            "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":5}",
366            "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":6}",
367            "{\"assignment\":[1,2,2,3,4,1,4,3,1],\"sample\":7}",
368            "{\"assignment\":[1,1,2,2,3,3,4,4,5],\"sample\":8}",
369            "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":9}",
370            "{\"assignment\":[1,2,3,4,1,2,5,3,5],\"sample\":10}"
371        );
372
373        assert_eq!(output_str, out_file);
374    }
375
376    #[test]
377    fn test_relabel_ben_line_with_map() {
378        let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2];
379        let in_rle = assign_to_rle(in_assign);
380
381        let input = encode_ben_vec_from_rle(in_rle);
382
383        let out_assign = vec![1, 2, 2, 3, 3, 4, 4, 5, 5];
384        let out_rle = assign_to_rle(out_assign);
385        let expected = encode_ben_vec_from_rle(out_rle);
386
387        let mut new_to_old_map = HashMap::new();
388        new_to_old_map.insert(0, 2);
389        new_to_old_map.insert(1, 0);
390        new_to_old_map.insert(2, 8);
391        new_to_old_map.insert(3, 1);
392        new_to_old_map.insert(4, 6);
393        new_to_old_map.insert(5, 3);
394        new_to_old_map.insert(6, 7);
395        new_to_old_map.insert(7, 4);
396        new_to_old_map.insert(8, 5);
397
398        let mut buf = Vec::new();
399        relabel_ben_lines_with_map(
400            input.as_slice(),
401            &mut buf,
402            new_to_old_map,
403            BenVariant::Standard,
404        )
405        .unwrap();
406
407        assert_eq!(buf, expected);
408    }
409
410    #[test]
411    fn test_relabel_ben_line_with_shuffle() {
412        let in_assign = vec![2, 3, 1, 4, 5, 5, 3, 4, 2];
413        let mut out_assign = in_assign.clone();
414
415        let in_rle = assign_to_rle(in_assign);
416        let input = encode_ben_vec_from_rle(in_rle);
417
418        let new_to_old_map = shuffle_with_mapping(&mut out_assign);
419        let out_rle = assign_to_rle(out_assign);
420        let expected = encode_ben_vec_from_rle(out_rle);
421
422        let mut buf = Vec::new();
423        relabel_ben_lines_with_map(
424            input.as_slice(),
425            &mut buf,
426            new_to_old_map,
427            BenVariant::Standard,
428        )
429        .unwrap();
430
431        assert_eq!(buf, expected);
432    }
433
434    #[test]
435    fn test_relabel_ben_line_with_large_shuffle() {
436        let seed = 129530786u64;
437        let mut rng = ChaCha8Rng::seed_from_u64(seed);
438
439        let mu = Uniform::new(1, 21).expect("Could not make uniform sampler");
440
441        let in_assign = (0..100_000)
442            .map(|_| mu.sample(&mut rng) as u16)
443            .collect::<Vec<u16>>();
444        let mut out_assign = in_assign.clone();
445
446        let in_rle = assign_to_rle(in_assign.to_vec());
447        let input = encode_ben_vec_from_rle(in_rle);
448
449        let new_to_old_map = shuffle_with_mapping(&mut out_assign);
450        let out_rle = assign_to_rle(out_assign);
451        let expected = encode_ben_vec_from_rle(out_rle);
452
453        let mut buf = Vec::new();
454        relabel_ben_lines_with_map(
455            input.as_slice(),
456            &mut buf,
457            new_to_old_map,
458            BenVariant::Standard,
459        )
460        .unwrap();
461
462        assert_eq!(buf, expected);
463    }
464
465    #[test]
466    fn test_relabel_simple_file_with_map() {
467        let file = format!(
468            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
469            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
470            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":2}",
471            "{\"assignment\":[3,3,1,1,2,2,3,3,4],\"sample\":3}",
472            "{\"assignment\":[4,3,2,1,4,3,2,1,1],\"sample\":4}",
473            "{\"assignment\":[3,2,2,4,1,3,1,4,3],\"sample\":5}",
474            "{\"assignment\":[2,2,3,3,4,4,5,5,1],\"sample\":6}",
475            "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":7}"
476        );
477
478        let new_to_old_map: HashMap<usize, usize> = [
479            (0, 2),
480            (1, 3),
481            (2, 4),
482            (3, 5),
483            (4, 6),
484            (5, 7),
485            (6, 8),
486            (7, 0),
487            (8, 1),
488        ]
489        .iter()
490        .cloned()
491        .collect();
492
493        let input = file.as_bytes();
494
495        let mut output = Vec::new();
496        let writer = io::BufWriter::new(&mut output);
497
498        encode_jsonl_to_ben(input, writer, BenVariant::Standard).unwrap();
499
500        let mut output2 = Vec::new();
501        let writer2 = io::BufWriter::new(&mut output2);
502        relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap();
503
504        let mut output3 = Vec::new();
505        let writer3 = io::BufWriter::new(&mut output3);
506        decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap();
507
508        let output_str = String::from_utf8(output3).unwrap();
509
510        let out_file = format!(
511            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
512            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}",
513            "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":2}",
514            "{\"assignment\":[1,1,2,2,3,3,4,3,3],\"sample\":3}",
515            "{\"assignment\":[2,1,4,3,2,1,1,4,3],\"sample\":4}",
516            "{\"assignment\":[2,4,1,3,1,4,3,3,2],\"sample\":5}",
517            "{\"assignment\":[3,3,4,4,5,5,1,2,2],\"sample\":6}",
518            "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":7}"
519        );
520
521        assert_eq!(output_str, out_file);
522    }
523
524    #[test]
525    fn test_relabel_simple_file_with_map_mkv() {
526        let file = format!(
527            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
528            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":1}",
529            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":2}",
530            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":3}",
531            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":4}",
532            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":5}",
533            "{\"assignment\":[1,2,3,4,5,5,3,4,2],\"sample\":6}",
534            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":7}",
535            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":8}",
536            "{\"assignment\":[2,1,3,4,5,5,3,4,2],\"sample\":9}",
537            "{\"assignment\":[2,4,1,5,2,4,3,1,3],\"sample\":10}",
538        );
539
540        let new_to_old_map: HashMap<usize, usize> = [
541            (0, 2),
542            (1, 3),
543            (2, 4),
544            (3, 5),
545            (4, 6),
546            (5, 7),
547            (6, 8),
548            (7, 0),
549            (8, 1),
550        ]
551        .iter()
552        .cloned()
553        .collect();
554
555        let input = file.as_bytes();
556
557        let mut output = Vec::new();
558        let writer = io::BufWriter::new(&mut output);
559
560        encode_jsonl_to_ben(input, writer, BenVariant::MkvChain).unwrap();
561
562        let mut output2 = Vec::new();
563        let writer2 = io::BufWriter::new(&mut output2);
564        relabel_ben_file_with_map(output.as_slice(), writer2, new_to_old_map).unwrap();
565
566        let mut output3 = Vec::new();
567        let writer3 = io::BufWriter::new(&mut output3);
568        decode_ben_to_jsonl(output2.as_slice(), writer3).unwrap();
569
570        let output_str = String::from_utf8(output3).unwrap();
571
572        let out_file = format!(
573            "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}\n",
574            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":1}",
575            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":2}",
576            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":3}",
577            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":4}",
578            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":5}",
579            "{\"assignment\":[3,4,5,5,3,4,2,1,2],\"sample\":6}",
580            "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":7}",
581            "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":8}",
582            "{\"assignment\":[3,4,5,5,3,4,2,2,1],\"sample\":9}",
583            "{\"assignment\":[1,5,2,4,3,1,3,2,4],\"sample\":10}",
584        );
585
586        assert_eq!(output_str, out_file);
587    }
588}