1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
//! This module contains the main encoding functions for turning an
//! input JSONL or BEN file into a BEN or XBEN file.
//!
//! Any input JSONL file is expected to be in the standard
//!
//! ```json
//! {"assignment": [...], "sample": #}
//! ```
//!
//! format.
//!
//! The BEN format is
//! a simple bit-packed run-length encoded assignment vector with
//! some special headers that allow the decoder to know how many
//! bytes to read for each sample.
//!
//!
//! The XBEN format applies LZMA2 dictionary compression to
//! a byte-level expansion of the BEN format (known as ben32),
//! which achieves better compression ratios than applying
//! LZMA2 compression directly to the bit-packed BEN format.

pub mod relabel;
pub mod translate;

use crate::utils::*;
use serde_json::Value;
use std::io::{self, BufRead, Cursor, Read, Result, Write};
use xz2::write::XzEncoder;

use self::translate::ben_to_ben32_lines;
use super::{log, logln};

/// A streaming writer for BEN files.
///
/// Wraps any [`Write`] sink. Constructing the encoder with
/// [`BenEncoder::new`] writes the `STANDARD BEN FILE` header, and each
/// subsequent `write_*` call appends one encoded sample to the sink.
///
/// # Example
///
/// ```
/// use ben::encode::BenEncoder;
///
/// let mut buffer = Vec::new();
/// let mut ben_encoder = BenEncoder::new(&mut buffer);
///
/// ben_encoder.write_assignment(vec![1, 1, 1, 2, 2, 2]).unwrap();
/// ```
pub struct BenEncoder<W: Write> {
    /// Destination that receives the header and every encoded sample.
    writer: W,
}

impl<W: Write> BenEncoder<W> {
    /// Create a new BenEncoder instance and handles
    /// the BEN file header.
    pub fn new(mut writer: W) -> Self {
        writer.write_all(b"STANDARD BEN FILE").unwrap();
        BenEncoder { writer }
    }

    /// Write a run-length encoded assignment vector to the
    /// BEN file.
    pub fn write_rle(&mut self, rle_vec: Vec<(u16, u16)>) -> Result<()> {
        let encoded = encode_ben_vec_from_rle(rle_vec);
        self.writer.write_all(&encoded)?;
        Ok(())
    }

    /// Write an assignment vector to the BEN file.
    pub fn write_assignment(&mut self, assign_vec: Vec<u16>) -> Result<()> {
        let rle_vec = assign_to_rle(assign_vec);
        self.write_rle(rle_vec)?;
        Ok(())
    }

    /// Write a JSON value containing an assignment vector to the BEN file.
    pub fn write_json_value(&mut self, data: Value) -> Result<()> {
        let assign_vec = data["assignment"].as_array().unwrap();
        let rle_vec = assign_to_rle(
            assign_vec
                .into_iter()
                .map(|x| x.as_u64().unwrap() as u16)
                .collect(),
        );
        self.write_rle(rle_vec)?;
        Ok(())
    }
}

/// A streaming writer for XBEN files.
///
/// Wraps an [`XzEncoder`]: everything written through this type, including
/// the header emitted by [`XBenEncoder::new`], passes through that encoder.
pub struct XBenEncoder<W: Write> {
    /// xz encoder that receives the header and the encoded samples.
    encoder: XzEncoder<W>,
}

impl<W: Write> XBenEncoder<W> {
    /// Create a new XBenEncoder instance and handles
    /// the BEN file header.
    pub fn new(mut encoder: XzEncoder<W>) -> Self {
        encoder.write_all(b"STANDARD BEN FILE").unwrap();
        XBenEncoder { encoder }
    }

    /// Write a an assigment vector encoded as a JSON value
    /// to the XBEN file.
    pub fn write_json_value(&mut self, data: Value) -> Result<()> {
        let encoded = encode_ben32_line(data);
        self.encoder.write_all(&encoded)?;
        Ok(())
    }

    /// Converts a raw BEN assignment file into to an XBEN file.
    /// This function will check to see if the header is there and then
    /// handle it accordingly.
    pub fn write_ben_file(&mut self, mut reader: impl BufRead) -> Result<()> {
        let mut buff = [0u8; 17];
        reader.read_exact(&mut buff)?;

        // Create a new reader that prepends buff back onto the original reader
        let mut reader = if buff != b"STANDARD BEN FILE".as_slice() {
            let cursor = Cursor::new(buff.to_vec());
            let reader = cursor.chain(reader);
            Box::new(reader) as Box<dyn BufRead>
        } else {
            Box::new(reader)
        };

        ben_to_ben32_lines(&mut *reader, &mut self.encoder)
    }
}

/// Encodes the assignment vector of one JSON record into the binary
/// "ben32" format.
///
/// The ben32 format serves as an intermediate format that allows for
/// efficient compression of BEN files using LZMA2 compression methods.
/// Every run of identical values becomes one big-endian `u32` whose upper
/// 16 bits hold the value and whose lower 16 bits hold the run length. The
/// sample is terminated by four zero bytes.
///
/// A run longer than `u16::MAX` does not fit in the 16-bit length field, so
/// it is emitted as several consecutive runs of the same value instead of
/// overflowing the counter.
///
/// # Arguments
///
/// * `data` - A JSON object containing an assignment vector and a sample number
///
/// # Returns
///
/// A vector of bytes containing the ben32 encoded assignment vector
///
/// # Panics
///
/// Panics if `data["assignment"]` is not an array, or if an entry is not an
/// integer in `0..=65535`.
fn encode_ben32_line(data: Value) -> Vec<u8> {
    let assign_vec = data["assignment"]
        .as_array()
        .expect("JSON value must contain an \"assignment\" array");

    // Packs one run as <value: u16><length: u16>, big-endian.
    let pack = |value: u16, count: u16| ((u32::from(value) << 16) | u32::from(count)).to_be_bytes();

    let mut ret = Vec::new();
    // (value, length) of the run currently being accumulated, if any.
    let mut run: Option<(u16, u16)> = None;

    for assignment in assign_vec {
        let raw = assignment
            .as_u64()
            .expect("assignment entries must be non-negative integers");
        assert!(
            raw <= u64::from(u16::MAX),
            "assignment entry {} does not fit in 16 bits",
            raw
        );
        let assign = raw as u16;

        run = match run {
            // Same value and the length field still has room: extend the run.
            Some((value, count)) if value == assign && count < u16::MAX => {
                Some((value, count + 1))
            }
            // Value changed, length field is full, or this is the first entry.
            finished => {
                if let Some((value, count)) = finished {
                    ret.extend(&pack(value, count));
                }
                Some((assign, 1))
            }
        };
    }

    // Handle the last run
    if let Some((value, count)) = run {
        ret.extend(&pack(value, count));
    }

    // Sample terminator
    ret.extend([0, 0, 0, 0]);
    ret
}

/// This function takes a JSONL file and compresses it to the
/// XBEN format.
///
/// The JSONL file is assumed to be formatted in the standard
///
/// ```json
/// {"assignment": [...], "sample": #}
/// ```
///
/// format. While the BEN format is
/// a simple bit-packed (streamable!) run-length encoded assignment
/// vector, the XBEN format uses LZMA2 dictionary compression on
/// the byte level to achieve better compression ratios. In order
/// to use XBEN files, the `decode_xben_to_ben` function must be
/// used to decode the file back into a BEN format.
pub fn jsonl_encode_xben<R: BufRead, W: Write>(reader: R, writer: W) -> Result<()> {
    let encoder = XzEncoder::new(writer, 9);
    let mut ben_encoder = XBenEncoder::new(encoder);

    let mut line_num = 1;

    for line_result in reader.lines() {
        log!("Encoding line: {}\r", line_num);
        line_num += 1;
        let line = line_result?;
        let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line");

        ben_encoder.write_json_value(data)?;
    }

    logln!();
    logln!("Done!");

    Ok(())
}

/// This is a convenience function that applies level 9 LZMA2 compression
/// to a general file.
///
/// # Arguments
///
/// * `reader` - A buffered reader for the input file
/// * `writer` - A writer for the output file
///
/// # Returns
///
/// A Result type that contains the result of the operation
///
/// ```
/// use ben::encode::xz_compress;
/// use lipsum::lipsum;
/// use std::io::{BufReader, BufWriter};
///
/// let input = lipsum(100);
/// let reader = BufReader::new(input.as_bytes());
///
/// let mut output_buffer = Vec::new();
/// let writer = BufWriter::new(&mut output_buffer);
///
/// xz_compress(reader, writer).unwrap();
///
/// println!("{:?}", output_buffer);
/// ```
pub fn xz_compress<R: BufRead, W: Write>(mut reader: R, writer: W) -> Result<()> {
    let mut buff = [0; 4096];
    let mut encoder = XzEncoder::new(writer, 9);

    while let Ok(count) = reader.read(&mut buff) {
        if count == 0 {
            break;
        }
        encoder.write_all(&buff[..count])?;
    }
    drop(encoder); // Make sure to flush and finish compression
    Ok(())
}

/// Encodes a plain assignment vector as a bit-packed BEN sample.
///
/// The vector is first collapsed into `(value, run length)` pairs and the
/// resulting runs are then bit-packed.
///
/// # Arguments
///
/// * `assign_vec` - A vector of u16 values representing the assignment vector
///
/// # Returns
///
/// A vector of bytes containing the bit-packed ben encoded assignment vector
pub fn encode_ben_vec_from_assign(assign_vec: Vec<u16>) -> Vec<u8> {
    encode_ben_vec_from_rle(assign_to_rle(assign_vec))
}

/// Bit-packs a run-length encoded assignment vector into a BEN sample.
///
/// The output starts with a six-byte header: the bit width used for values,
/// the bit width used for run lengths, and the payload size in bytes as a
/// big-endian `u32`. The payload follows, with every `(value, length)` pair
/// written back-to-back, most significant bit first. The final byte is
/// zero-padded on the right when the payload is not a whole number of bytes.
///
/// # Arguments
///
/// * `rle_vec` - A vector of tuples containing the value and length of each run
///
/// # Returns
///
/// A vector of bytes containing the bit-packed ben encoded assignment vector
///
/// # Panics
///
/// Panics if `rle_vec` is empty.
fn encode_ben_vec_from_rle(rle_vec: Vec<(u16, u16)>) -> Vec<u8> {
    let largest_val: u16 = rle_vec.iter().map(|run| run.0).max().unwrap();
    let largest_len: u16 = rle_vec.iter().map(|run| run.1).max().unwrap();

    // Values always get at least one bit, even when every value is zero.
    let val_width: u8 = (16 - largest_val.leading_zeros() as u8).max(1);
    let len_width: u8 = 16 - largest_len.leading_zeros() as u8;

    // Payload size, rounded up to whole bytes.
    let total_bits: u32 = (val_width + len_width) as u32 * rle_vec.len() as u32;
    let n_bytes: u32 = total_bits / 8 + u32::from(total_bits % 8 != 0);

    let mut packed: Vec<u8> = vec![val_width, len_width];
    packed.extend_from_slice(&n_bytes.to_be_bytes());

    // `acc` holds the bits that have not yet filled a whole byte,
    // right-aligned; `pending` is how many of them are valid.
    let mut acc: u32 = 0;
    let mut pending: u8 = 0;

    for (val, len) in rle_vec {
        for (field, width) in [(val, val_width), (len, len_width)] {
            acc = (acc << width) | u32::from(field);
            pending += width;

            // Emit every complete byte, keeping only the leftover low bits.
            while pending >= 8 {
                pending -= 8;
                packed.push((acc >> pending) as u8);
                acc &= !(u32::MAX << pending);
            }
        }
    }

    // Left-align whatever is left and pad the tail with zero bits.
    if pending > 0 {
        packed.push((acc << (8 - pending)) as u8);
    }

    packed
}

/// This function takes a JSONL file and compresses it into
/// the BEN format.
///
/// The JSONL file is assumed to be formatted in the standard
///
/// ```json
/// {"assignment": [...], "sample": #}
/// ```
///
/// format.
///
/// # Arguments
///
/// * `reader` - A buffered reader for the input file
/// * `writer` - A writer for the output file
///
/// # Returns
///
/// A Result type that contains the result of the operation
///
/// # Example
///
/// ```
/// use std::io::{BufReader, BufWriter};
/// use serde_json::json;
/// use ben::encode::jsonl_encode_ben;
///
/// let input = r#"{"assignment": [1,1,1,2,2,2], "sample": 1}"#.to_string()
///     + "\n"
///     + r#"{"assignment": [1,1,2,2,1,2], "sample": 2}"#;
///
/// let reader = BufReader::new(input.as_bytes());
/// let mut write_buffer = Vec::new();
/// let mut writer = BufWriter::new(&mut write_buffer);
///
/// jsonl_encode_ben(reader, writer).unwrap();
///
/// println!("{:?}", write_buffer);
/// // This will output
/// // [83, 84, 65, 78, 68, 65, 82, 68, 32,
/// //  66, 69, 78, 32, 70, 73, 76, 69, 2,
/// //  2, 0, 0, 0, 1, 123, 2, 2, 0, 0, 0,
/// //  2, 106, 89]
/// ```
///
pub fn jsonl_encode_ben<R: BufRead, W: Write>(reader: R, writer: W) -> Result<()> {
    let mut line_num = 1;
    let mut ben_encoder = BenEncoder::new(writer);
    for line_result in reader.lines() {
        log!("Encoding line: {}\r", line_num);
        line_num += 1;
        let line = line_result?; // Handle potential I/O errors for each line
        let data: Value = serde_json::from_str(&line).expect("Error parsing JSON from line");

        ben_encoder.write_json_value(data)?;
    }
    logln!();
    logln!("Done!"); // Print newline after progress bar
    Ok(())
}

/// This function takes a BEN file and encodes it into an XBEN
/// file using bit-to-byte decompression followed by LZMA2 compression.
///
/// # Arguments
///
/// * `reader` - A buffered reader for the input file
/// * `writer` - A writer for the output file
///
/// # Returns
///
/// A Result type that contains the result of the operation
pub fn ben_encode_xben<R: BufRead, W: Write>(mut reader: R, writer: W) -> Result<()> {
    let mut check_buffer = [0u8; 17];
    reader.read_exact(&mut check_buffer)?;

    if &check_buffer != b"STANDARD BEN FILE" {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "Invalid file format",
        ));
    }

    let encoder = XzEncoder::new(writer, 9);
    let mut ben_encoder = XBenEncoder::new(encoder);

    ben_encoder.write_ben_file(reader)?;

    Ok(())
}

// The unit tests are kept in a separate file and pulled in textually, so
// they are compiled inside this `tests` module and only for test builds.
#[cfg(test)]
mod tests {
    include!("tests/encode_tests.rs");
}