reddit-search 0.2.1

A search tool for the pushshift.io Reddit dumps.
use std::fs::File;
use std::path::PathBuf;
use zstd::Decoder;
use std::io::{BufRead, BufReader, Write, BufWriter};
use clap::{command, Parser};
use std::collections::HashMap;
use indicatif::{ProgressBar, ProgressStyle};

// Command-line arguments for the reddit-search binary, parsed by clap's derive API.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc comments
// into help text, which would change the program's `--help` output.
//
// NOTE(review): the version below is "1.0" while the file header says 0.2.1 — confirm
// which one is authoritative.
#[derive(Parser, Debug)]
#[command(name = "reddit-search")]
#[command(author = "Luc Aggett (luc@aggett.com)")]
#[command(version = "1.0")]
#[command(about = "utility to search and filter reddit dumps", long_about = None)]
struct Args {
    // Path of the input file; `main` opens it and reads it through a zstd decoder.
    // NOTE(review): the default is an absolute path on the author's machine, so running
    // without `--input` will fail anywhere else — consider making this argument required.
    #[arg(short, long, default_value = "/Users/luc/Documents/Personal/Code/reddit-search/RC_2005-12.zst")]
    input: PathBuf,
    // Path of the file that matching lines are written to; `main` creates (truncates) it.
    #[arg(short, long, default_value = "output.json")]
    output: PathBuf,
    // Zero or more positional filter specs. `main` splits each one on ':' into a
    // field name and a value to search for.
    #[arg(num_args(0..))]
    fields: Vec<String>,
    // Verbosity flag.
    // NOTE(review): nothing in the visible source reads this field.
    #[arg(short, long, default_value = "false")]
    verbose: bool,
}

/// Returns `Some(line)` when `line` satisfies every filter in `field_map`, else `None`.
///
/// Each `(field, value)` entry is checked textually against the raw JSON line (the line
/// is not parsed). An entry matches when the line contains either
///
/// * `"field":"value"` — the value as a complete JSON string, or
/// * `"field":value` — the value as a bare token (number, `true`, `null`, ...) that is
///   immediately followed by a JSON delimiter (`,`, `}`, `]`, whitespace) or the end of
///   the line. The delimiter check stops `score:1` from matching `"score":10`.
///
/// An empty `value` keeps its historical meaning of "the field is present": any line
/// containing `"field":` matches. An empty `field_map` matches every line.
fn process_line(line: String, field_map: &HashMap<String, String>) -> Option<String> {
    // `all` already covers the single-filter case (and the empty map), so no special
    // casing is needed.
    let keep = field_map
        .iter()
        .all(|(field, value)| field_matches(&line, field, value));
    if keep {
        Some(line)
    } else {
        None
    }
}

/// Checks a single `field`/`value` filter against one raw JSON line.
fn field_matches(line: &str, field: &str, value: &str) -> bool {
    // Quoted form: the closing quote makes this an exact match on the string value.
    if line.contains(&format!("\"{}\":\"{}\"", field, value)) {
        return true;
    }

    let bare = format!("\"{}\":{}", field, value);
    if value.is_empty() {
        // Presence check: `"field":` followed by anything.
        return line.contains(&bare);
    }

    // Bare form: accept a hit only when the token ends right after `value`.
    line.match_indices(bare.as_str()).any(|(start, hit)| {
        match line[start + hit.len()..].chars().next() {
            None => true,
            Some(next) => matches!(next, ',' | '}' | ']') || next.is_whitespace(),
        }
    })
}

/// Entry point: streams a zstd-compressed, line-delimited dump, keeps the lines that
/// satisfy every `FIELD:VALUE` filter given on the command line, and writes them to
/// the output file.
///
/// Progress is reported in compressed bytes read from the input file, so the bar's
/// `{bytes}/{total_bytes}` and ETA are measured against the on-disk file size.
///
/// # Panics
///
/// Panics with a descriptive message if the input cannot be opened or decoded, or if
/// the output cannot be created, written, or flushed.
fn main() {
    let args = Args::parse();

    // Build the filter map. Each spec is split at its first ':' so values may contain
    // colons themselves (URLs, timestamps). Specs without any ':' cannot be
    // interpreted; warn instead of silently widening the search.
    let field_map: HashMap<String, String> = args
        .fields
        .iter()
        .filter_map(|spec| match spec.split_once(':') {
            Some((field, value)) => Some((field.to_string(), value.to_string())),
            None => {
                eprintln!("warning: ignoring filter {:?}: expected FIELD:VALUE", spec);
                None
            }
        })
        .collect();

    // Open the input file as a stream, since the file is very large.
    let input_file = File::open(&args.input).unwrap_or_else(|err| {
        panic!("failed to open input file {}: {}", args.input.display(), err)
    });
    let file_size = input_file
        .metadata()
        .expect("failed to read input file metadata")
        .len();

    // The bar's length is the compressed size, so it must be advanced by compressed
    // bytes consumed, not by decoded lines.
    let pb = ProgressBar::new(file_size);
    pb.set_style(ProgressStyle::default_bar()
        .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})").expect("Failed to load progress bar")
        .progress_chars("#>-"));

    // `wrap_read` advances the bar by the number of bytes the decoder pulls from the file.
    let mut decoder =
        Decoder::new(pb.wrap_read(input_file)).expect("failed to create zstd decoder");
    // Raise the decoder's maximum window log to 31 (the default limit is lower).
    decoder
        .window_log_max(31)
        .expect("failed to set zstd window_log_max");
    let input_stream = BufReader::new(decoder);

    // Open the output file as a buffered stream.
    let output_file = File::create(&args.output).unwrap_or_else(|err| {
        panic!("failed to create output file {}: {}", args.output.display(), err)
    });
    let mut output_stream = BufWriter::new(output_file);

    // Process lines.
    let mut matched_lines: u64 = 0;
    let mut total_lines: u64 = 0;
    for result in input_stream.lines() {
        let line = result.expect("failed to read a line from the input stream");
        total_lines += 1;
        if let Some(obj) = process_line(line, &field_map) {
            matched_lines += 1;
            writeln!(output_stream, "{}", obj).expect("failed to write to output file");
        }
    }

    // Dropping a BufWriter swallows flush errors; flush explicitly so they surface.
    output_stream
        .flush()
        .expect("failed to flush output file");

    pb.finish_with_message("Complete");
    println!("Complete : {} : {}", total_lines, matched_lines);
}