use std::fs::File;
use std::path::PathBuf;
use zstd::Decoder;
use std::io::{BufRead, BufReader, Write, BufWriter};
use clap::{command, Parser};
use std::collections::HashMap;
use indicatif::{ProgressBar, ProgressStyle};
// Command-line arguments for the reddit-search tool, parsed by clap's derive API.
// NOTE: plain `//` comments are used on purpose; `///` doc comments on a clap
// derive struct are picked up as help text and would change the CLI output.
#[derive(Parser, Debug)]
#[command(name = "reddit-search")]
#[command(author = "Luc Aggett (luc@aggett.com)")]
#[command(version = "1.0")]
#[command(about = "utility to search and filter reddit dumps", long_about = None)]
struct Args {
// Path of the input file (`-i` / `--input`). `main` reads it through a zstd decoder.
// NOTE(review): the default is an absolute path on the author's machine and will
// not exist elsewhere — callers should normally pass this explicitly.
#[arg(short, long, default_value = "/Users/luc/Documents/Personal/Code/reddit-search/RC_2005-12.zst")]
input: PathBuf,
// Path of the output file (`-o` / `--output`). `main` creates it and writes one
// matching input line per output line.
#[arg(short, long, default_value = "output.json")]
output: PathBuf,
// Zero or more positional filters. `main` splits each one on ':' into a
// field name and the value that field must have.
#[arg(num_args(0..))]
fields: Vec<String>,
// Verbosity flag (`-v` / `--verbose`).
// NOTE(review): parsed but not read anywhere in the visible code.
#[arg(short, long, default_value = "false")]
verbose: bool,
}
/// Returns `true` when `needle` occurs in `line` and the occurrence ends at a
/// JSON value boundary: end of line, `,`, `}`, `]` or whitespace.
///
/// A plain substring test is not enough for unquoted values, because
/// `"score":1` is a prefix of `"score":10`. Every occurrence is checked, so a
/// later exact match is still found after an earlier prefix-only match.
fn contains_bare_value(line: &str, needle: &str) -> bool {
    line.match_indices(needle).any(|(start, _)| {
        // `start + needle.len()` is the end of a match, hence a char boundary.
        match line[start + needle.len()..].chars().next() {
            None => true,
            Some(next) => matches!(next, ',' | '}' | ']') || next.is_whitespace(),
        }
    })
}

/// Returns the line unchanged if it satisfies every `field -> value` filter in
/// `field_map`, and `None` otherwise.
///
/// The line is treated as compact JSON text and is not parsed. A filter matches
/// when the line contains either the quoted form `"field":"value"` or the
/// unquoted form `"field":value` terminated by a value boundary (so a filter of
/// `score -> 1` does not match `"score":10`).
///
/// An empty `field_map` matches every line. An empty value only matches the
/// quoted empty string (`"field":""`).
fn process_line(line: String, field_map: &HashMap<String, String>) -> Option<String> {
    // `all` already covers the single-filter case, so no special branch is needed.
    let matches_all = field_map.iter().all(|(field, value)| {
        line.contains(&format!("\"{}\":\"{}\"", field, value))
            || contains_bare_value(&line, &format!("\"{}\":{}", field, value))
    });
    matches_all.then_some(line)
}
/// Entry point: streams a zstd-compressed, newline-delimited dump, writes every
/// line accepted by `process_line` to the output file, and prints the total and
/// matched line counts when done.
///
/// # Panics
///
/// Panics if the input cannot be opened, decoded or read as UTF-8 lines, or if
/// the output cannot be created, written or flushed.
fn main() {
    let args = Args::parse();

    // Build the filter map from the positional `field:value` arguments. Only the
    // first ':' separates field from value, so values may themselves contain ':'.
    let field_map: HashMap<String, String> = args
        .fields
        .iter()
        .filter_map(|field| match field.split_once(':') {
            Some((name, value)) => Some((name.to_string(), value.to_string())),
            None => {
                eprintln!("ignoring malformed filter {:?} (expected field:value)", field);
                None
            }
        })
        .collect();

    // open the input file as a stream, since the file is very large
    let input_file = File::open(&args.input).expect("failed to open input file");
    let file_size = input_file
        .metadata()
        .expect("failed to read input file metadata")
        .len();

    // create a progress bar sized to the compressed input
    let pb = ProgressBar::new(file_size);
    pb.set_style(ProgressStyle::default_bar()
        .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})").expect("Failed to load progress bar")
        .progress_chars("#>-"));

    // The bar length is in compressed bytes, so progress must be counted in the
    // same unit: wrap the raw file so every read advances the bar.
    let mut decoder =
        Decoder::new(pb.wrap_read(input_file)).expect("failed to create zstd decoder");
    // Allow window sizes up to 2^31 bytes; the default limit is lower.
    decoder
        .window_log_max(31)
        .expect("failed to set zstd window limit");
    let input_stream = BufReader::new(decoder);

    // open the output file as a stream
    let output_file = File::create(&args.output).expect("failed to create output file");
    let mut output_stream = BufWriter::new(output_file);

    // process lines
    let mut matched_lines: u64 = 0;
    let mut total_lines: u64 = 0;
    for result in input_stream.lines() {
        total_lines += 1;
        let line = result.expect("failed to read line from input");
        if let Some(obj) = process_line(line, &field_map) {
            matched_lines += 1;
            writeln!(output_stream, "{}", obj).expect("failed to write to output file");
        }
    }

    // Dropping a BufWriter discards flush errors, so flush explicitly.
    output_stream.flush().expect("failed to flush output file");

    pb.finish_with_message("Complete");
    println!("Complete : {} : {}", total_lines, matched_lines);
}