use std::fs::File;
use std::io::{BufWriter, Read, Write};
use std::path::PathBuf;
use clap::Parser;
use xet_data::deduplication::Chunker;
use xet_data::deduplication::constants::TARGET_CHUNK_SIZE;
// Command-line arguments for the chunker example, parsed via clap's derive API.
// NOTE(review): plain `//` comments are used here deliberately — clap's derive
// turns `///` doc comments into help text, which would alter `--help` output.
#[derive(Debug, Parser)]
#[command(
version,
about,
long_about = "Example of using the chunker. Splits the input file or stdin into chunks and writes to stdout or the specified file the chunk hash in string format and the chunk size on a new line for each chunk in order in the file"
)]
struct ChunkArgs {
// Path of the file to chunk (`-i` / `--input`); when absent, `main` reads stdin.
#[arg(short, long)]
input: Option<PathBuf>,
// Path of the file to write results to (`-o` / `--output`); when absent, `main`
// writes to stdout.
#[arg(short, long)]
output: Option<PathBuf>,
}
fn main() {
let args = ChunkArgs::parse();
let mut input: Box<dyn Read> = if let Some(file_path) = args.input {
Box::new(File::open(file_path).unwrap())
} else {
Box::new(std::io::stdin())
};
let mut output: Box<dyn Write> = if let Some(save) = args.output {
Box::new(BufWriter::new(File::create(save).unwrap()))
} else {
Box::new(std::io::stdout())
};
let mut chunker = Chunker::new(*TARGET_CHUNK_SIZE);
const INGESTION_BLOCK_SIZE: usize = 8 * 1024 * 1024; let mut buf = vec![0u8; INGESTION_BLOCK_SIZE];
loop {
let num_read = input.read(&mut buf).unwrap();
if num_read == 0 {
break;
}
let chunks = chunker.next_block(&buf[..num_read], false);
for chunk in chunks {
output
.write_all(format!("{} {}\n", chunk.hash, chunk.data.len()).as_bytes())
.unwrap();
}
}
if let Some(chunk) = chunker.finish() {
output
.write_all(format!("{} {}\n", chunk.hash, chunk.data.len()).as_bytes())
.unwrap();
}
output.flush().unwrap();
}