dent 0.3.0

Library and tool for summarizing and comparing small data sets.
Documentation
#!/usr/bin/env python3
from itertools import combinations, product
import os
import shutil

import numpy as np
from numpy.random import normal
from scipy import stats


data_dir = 'data'
kat_dir = 'kat'

means = [1, 1.1, 10]
stddevs = [0.1, 1]
sizes = [100, 1000]

params = list(product(means, stddevs, sizes))


def fmt_path(m, std, n):
    return os.path.join(data_dir, '{}_{}_{}'.format(m, std, n))

def read(path):
    return np.fromfile(path, dtype=np.dtype('float64'), sep='\n')

def summarize(d):
    # Use an unbiased estimator of variance and derived values.
    ddof = 1

    # Use the percentile interpolation strategy under test.
    interpolation = 'linear'

    return {
        'n': len(d),
        'min': np.min(d),
        'max': np.max(d),
        'lower_quartile': np.percentile(d, 25.0, interpolation=interpolation),
        'upper_quartile': np.percentile(d, 75.0, interpolation=interpolation),
        'median': np.median(d),
        'mean': np.mean(d),
        'std': np.std(d, ddof=ddof),
        'sem': stats.sem(d, ddof=ddof),
        'var': np.var(d, ddof=ddof),
    }

def fmt_summary(summary):
    fmt = lambda k: '{}\t{}'.format(k, summary[k])

    return '\n'.join([fmt(k) for k in sorted(summary.keys())])


def write_summary(path, summary):
    name = os.path.basename(path)

    kat_name = 'summary_{}'.format(name)
    kat_path = os.path.join(kat_dir, kat_name)

    with open(kat_path, 'w') as f:
        f.write('src\t{}\n'.format(name))
        f.write(fmt_summary(summary))
        f.write('\n')

def t_test(x, y):
    t, _ = stats.ttest_ind(x, y, equal_var=False)

    return t

def write_t_test(path1, path2, t):
    name1 = os.path.basename(path1)
    name2 = os.path.basename(path2)

    kat_name = 'ttest-{}-{}'.format(name1, name2)
    kat_path = os.path.join(kat_dir, kat_name)

    with open(kat_path, 'w') as f:
        f.write('src1\t{}\n'.format(name1))
        f.write('src2\t{}\n'.format(name2))
        f.write('t\t{}\n'.format(t))

# See: http://www.itl.nist.gov/div898/handbook/eda/section3/eda3672.htm
def make_t_critical_value_table():
    MAX_DF = 100
    degrees_freedom = [[df] for df in range(1, MAX_DF + 1)]

    sig_levels_1_sided = [0.100, 0.050, 0.025, 0.010, 0.005, 0.001]
    sig_levels_2_sided = [a/2.0 for a in sig_levels_1_sided]

    norm_row = stats.norm.isf(sig_levels_1_sided)
    t_table = stats.t.isf(sig_levels_2_sided, degrees_freedom)

    fmt_row = lambda r: ''.join(map(str, ['    ', list(r), ',']))

    print('[')

    # The t-distribution is approximately normal as d.f. → ∞, so the normal
    # distribution's critical values will be used to fill index 0 of the
    # 2-dimensional array we will emit. If 1 <= d.f. <= MAX_DF, then the d.f.
    # value is exactly the index into the table. Otherwise, we approximate the
    # t-distribution with the normal, and just use index 0.
    print(fmt_row(norm_row))

    for row in t_table:
        print(fmt_row(row))

    print('];')

def make_data():
    shutil.rmtree(data_dir, ignore_errors=True)
    os.makedirs(data_dir, exist_ok=True)

    for ps in params:
        d = normal(*ps)
        path = fmat_path(*ps)
        d.tofile(path, sep='\n')

def make_kat():
    shutil.rmtree(kat_dir, ignore_errors=True)
    os.makedirs(kat_dir, exist_ok=True)

    paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]

    for p in paths:
        d = read(p)
        s = summarize(d)

        write_summary(p, s)

    for (p1, p2) in combinations(paths, r=2):
        d1 = read(p1)
        d2 = read(p2)
        t = t_test(d1, d2)

        write_t_test(p1, p2, t)


if __name__ == '__main__':
    make_kat()