rs-jsonl2stats 0.1.0

Create basic stats from jsonl
Documentation
use std::io;

use io::BufRead;

use io::Write;

use std::collections::BTreeMap;

use serde_json::Map;
use serde_json::Value;

/// Running summary of the `f64` values observed for a single key.
///
/// Serializable through `serde`. The derived `Default` sets every field to
/// zero.
#[derive(Default, serde::Serialize)]
pub struct RealStat {
    /// Running minimum, as maintained by `process_value`.
    pub minimum: f64,
    /// Running maximum, as maintained by `process_value`.
    pub maximum: f64,
    /// Sum of the values folded in so far.
    pub total: f64,
    /// Number of values folded in so far.
    pub count: u64,
}

impl RealStat {
    /// Folds one observation into the running statistics.
    ///
    /// When no value has been recorded yet (`count == 0`, e.g. on a
    /// `RealStat::default()`), the observation initializes both `minimum` and
    /// `maximum`, so the zeroed defaults never take part in the comparison.
    /// Every later observation only widens the `[minimum, maximum]` range.
    /// `total` and `count` are always updated.
    pub fn process_value(&mut self, val: f64) {
        if self.count == 0 {
            // First observation: the zero-initialized bounds carry no meaning.
            self.minimum = val;
            self.maximum = val;
        } else {
            self.minimum = val.min(self.minimum);
            self.maximum = val.max(self.maximum);
        }
        self.total += val;
        self.count += 1;
    }
}

/// Running summary of the `i64` values observed for a single key.
///
/// Serializable through `serde`. The derived `Default` sets every field to
/// zero.
#[derive(Default, serde::Serialize)]
pub struct IntStat {
    /// Running minimum, as maintained by `process_value`.
    pub minimum: i64,
    /// Running maximum, as maintained by `process_value`.
    pub maximum: i64,
    /// Sum of the values folded in so far.
    pub total: i64,
    /// Number of values folded in so far.
    pub count: u64,
}

impl IntStat {
    /// Folds one observation into the running statistics.
    ///
    /// When no value has been recorded yet (`count == 0`, e.g. on an
    /// `IntStat::default()`), the observation initializes both `minimum` and
    /// `maximum`, so the zeroed defaults never take part in the comparison.
    /// Every later observation only widens the `[minimum, maximum]` range.
    ///
    /// `total` saturates at the `i64` bounds instead of panicking (debug
    /// builds) or wrapping around (release builds) when the sum overflows.
    pub fn process_value(&mut self, val: i64) {
        if self.count == 0 {
            // First observation: the zero-initialized bounds carry no meaning.
            self.minimum = val;
            self.maximum = val;
        } else {
            self.minimum = val.min(self.minimum);
            self.maximum = val.max(self.maximum);
        }
        self.total = self.total.saturating_add(val);
        self.count += 1;
    }
}

/// Running summary of the string values observed for a single key.
///
/// Serializable through `serde`. The derived `Default` yields empty strings
/// and a zero count.
#[derive(Default, serde::Serialize)]
pub struct StrStat {
    /// Running minimum under `String` ordering, as maintained by
    /// `process_value`.
    pub minimum: String,
    /// Running maximum under `String` ordering, as maintained by
    /// `process_value`.
    pub maximum: String,
    /// Number of values folded in so far.
    pub count: u64,
}

impl StrStat {
    /// Folds one string observation into the running statistics.
    ///
    /// Strings are compared with the standard `String` ordering. When no value
    /// has been recorded yet (`count == 0`, e.g. on a `StrStat::default()`),
    /// the observation becomes both `minimum` and `maximum`; otherwise the
    /// empty default `minimum` could never be replaced, since no string
    /// compares below `""`.
    pub fn process_value(&mut self, val: String) {
        let is_first = self.count == 0;
        self.count += 1;

        let above_max = self.maximum < val;
        let below_min = val < self.minimum;

        match (is_first, above_max, below_min) {
            // First observation, or a value outside both bounds at once:
            // it defines the whole range.
            (true, _, _) | (false, true, true) => {
                self.maximum = val.clone();
                self.minimum = val;
            }
            (false, true, false) => self.maximum = val,
            (false, false, true) => self.minimum = val,
            (false, false, false) => {}
        }
    }
}

/// Per-key statistics, grouped by the value type each key is tracked as.
///
/// Each map is keyed by the JSON object key. `BTreeMap` keeps the keys sorted,
/// so iteration (and therefore serialization) order is deterministic.
#[derive(Default, serde::Serialize)]
pub struct BasicStats {
    /// Statistics for keys tracked as `f64` values.
    pub real: BTreeMap<String, RealStat>,
    /// Statistics for keys tracked as `i64` values.
    pub integer: BTreeMap<String, IntStat>,
    /// Statistics for keys tracked as strings.
    pub string: BTreeMap<String, StrStat>,
}

impl BasicStats {
    /// Records one `f64` observation for `key`.
    ///
    /// The first observation of a key creates its [`RealStat`] seeded with
    /// that value; later observations are folded in with
    /// `RealStat::process_value`.
    ///
    /// # Errors
    ///
    /// Never fails at present; the `Result` is kept so existing callers using
    /// `?` keep compiling.
    pub fn process_real(&mut self, key: &str, real: f64) -> Result<(), io::Error> {
        // A single lookup: the key is only copied into an owned `String` when
        // it has to be inserted.
        match self.real.get_mut(key) {
            Some(stat) => stat.process_value(real),
            None => {
                self.real.insert(
                    key.into(),
                    RealStat {
                        minimum: real,
                        maximum: real,
                        total: real,
                        count: 1,
                    },
                );
            }
        }

        Ok(())
    }

    /// Records one `i64` observation for `key`.
    ///
    /// The first observation of a key creates its [`IntStat`] seeded with
    /// that value; later observations are folded in with
    /// `IntStat::process_value`.
    ///
    /// # Errors
    ///
    /// Never fails at present; the `Result` is kept so existing callers using
    /// `?` keep compiling.
    pub fn process_int(&mut self, key: &str, integer: i64) -> Result<(), io::Error> {
        match self.integer.get_mut(key) {
            Some(stat) => stat.process_value(integer),
            None => {
                self.integer.insert(
                    key.into(),
                    IntStat {
                        minimum: integer,
                        maximum: integer,
                        total: integer,
                        count: 1,
                    },
                );
            }
        }

        Ok(())
    }

    /// Records one string observation for `key`.
    ///
    /// The first observation of a key creates its [`StrStat`] with that
    /// string as both minimum and maximum; later observations are folded in
    /// with `StrStat::process_value`.
    ///
    /// # Errors
    ///
    /// Never fails at present; the `Result` is kept so existing callers using
    /// `?` keep compiling.
    pub fn process_str(&mut self, key: &str, string: String) -> Result<(), io::Error> {
        match self.string.get_mut(key) {
            Some(stat) => stat.process_value(string),
            None => {
                self.string.insert(
                    key.into(),
                    StrStat {
                        minimum: string.clone(),
                        maximum: string,
                        count: 1,
                    },
                );
            }
        }

        Ok(())
    }
}

impl BasicStats {
    /// Builds statistics from a stream of parsed JSON objects.
    ///
    /// For every object, the keys named in `reals`, `ints` and `strs` are
    /// looked up in that order and fed to `process_real`, `process_int` and
    /// `process_str` respectively. A key that is absent, or whose value does
    /// not convert to the requested type, is skipped without error.
    ///
    /// Values are *removed* from the object as they are read, so a key that
    /// appears in more than one list is only seen by the first list that
    /// handles it.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first error yielded by `jsonl` or by one of
    /// the `process_*` calls.
    pub fn jsonl2stats<I>(
        jsonl: I,
        reals: Vec<String>,
        ints: Vec<String>,
        strs: Vec<String>,
    ) -> Result<Self, io::Error>
    where
        I: Iterator<Item = Result<Map<String, Value>, io::Error>>,
    {
        let mut acc = BasicStats::default();

        for parsed in jsonl {
            let mut record: Map<String, Value> = parsed?;

            for name in reals.iter() {
                if let Some(num) = record.remove(name).and_then(|v| v.as_f64()) {
                    acc.process_real(name, num)?;
                }
            }

            for name in ints.iter() {
                if let Some(num) = record.remove(name).and_then(|v| v.as_i64()) {
                    acc.process_int(name, num)?;
                }
            }

            for name in strs.iter() {
                // Only genuine JSON strings count; any other value is dropped.
                match record.remove(name) {
                    Some(Value::String(text)) => acc.process_str(name, text)?,
                    _ => {}
                }
            }
        }

        Ok(acc)
    }
}

impl BasicStats {
    /// Serializes the statistics as JSON into `wtr`.
    ///
    /// # Errors
    ///
    /// Any `serde_json` failure is wrapped into an [`io::Error`] via
    /// `io::Error::other`.
    pub fn to_writer<W>(&self, wtr: W) -> Result<(), io::Error>
    where
        W: Write,
    {
        match serde_json::to_writer(wtr, self) {
            Ok(()) => Ok(()),
            Err(cause) => Err(io::Error::other(cause)),
        }
    }
}

pub fn jsonl2objs<I>(jsonl: I) -> impl Iterator<Item = Result<Map<String, Value>, io::Error>>
where
    I: Iterator<Item = Result<String, io::Error>>,
{
    jsonl.map(|rline| rline.and_then(|line| serde_json::from_str(&line).map_err(io::Error::other)))
}

/// Returns an iterator over the lines of standard input.
///
/// Standard input is locked for as long as the returned iterator is alive.
pub fn stdin2jsonl() -> impl Iterator<Item = Result<String, io::Error>> {
    let locked = io::stdin().lock();
    BufRead::lines(locked)
}

/// Reads JSONL from standard input, computes statistics for the given keys
/// and writes them as JSON to standard output.
///
/// `reals`, `ints` and `strs` name the keys to track as `f64`, `i64` and
/// string values respectively.
///
/// # Errors
///
/// Returns the first read, parse, serialization or write error encountered.
pub fn stdin2jsonl2stats2stdout(
    reals: Vec<String>,
    ints: Vec<String>,
    strs: Vec<String>,
) -> Result<(), io::Error> {
    let records = jsonl2objs(stdin2jsonl());
    let summary: BasicStats = BasicStats::jsonl2stats(records, reals, ints, strs)?;

    // Lock stdout once for the whole write, then flush explicitly so a write
    // failure is reported instead of being lost on drop.
    let mut out = io::stdout().lock();
    summary.to_writer(&mut out)?;
    out.flush()
}

/// Builds a function that reads an environment variable and splits its value
/// into a list of keys.
///
/// The returned closure takes the variable name and splits the value on
/// `split_char`. Empty segments are dropped, so an unset or empty variable
/// yields an empty list (not a list holding one empty key), and stray
/// separators such as `"a,,b,"` are tolerated.
///
/// A variable that is unset or not valid Unicode is treated as empty.
pub fn env_key2val2splited(split_char: &'static str) -> impl Fn(&'static str) -> Vec<String> {
    move |env_key: &'static str| {
        std::env::var(env_key)
            .unwrap_or_default()
            .split(split_char)
            // `"".split(..)` still yields one empty item; discard those.
            .filter(|piece| !piece.is_empty())
            .map(String::from)
            .collect()
    }
}

/// Separator used by `stdin2jsonl2stats2stdout_default` to split the key
/// lists read from the environment.
pub const SPLIT_CHAR_DEFAULT: &str = ",";

/// Environment variable naming the keys to track as real (`f64`) values.
pub const ENV_KEY_REAL: &str = "ENV_REALS";

/// Environment variable naming the keys to track as integer (`i64`) values.
pub const ENV_KEY_INT: &str = "ENV_INTS";

/// Environment variable naming the keys to track as string values.
pub const ENV_KEY_STR: &str = "ENV_STRS";

/// Runs the stdin-to-stdout pipeline with the key lists taken from the
/// environment.
///
/// The keys are read from `ENV_KEY_REAL`, `ENV_KEY_INT` and `ENV_KEY_STR`,
/// each split on `SPLIT_CHAR_DEFAULT`.
///
/// # Errors
///
/// Propagates any error from `stdin2jsonl2stats2stdout`.
pub fn stdin2jsonl2stats2stdout_default() -> Result<(), io::Error> {
    let load_keys = env_key2val2splited(SPLIT_CHAR_DEFAULT);

    let reals = load_keys(ENV_KEY_REAL);
    let ints = load_keys(ENV_KEY_INT);
    let strs = load_keys(ENV_KEY_STR);

    stdin2jsonl2stats2stdout(reals, ints, strs)
}