Trait lace::OracleT

source ·
pub trait OracleT: CanOracle {
Show 40 methods // Provided methods fn state_diagnostics(&self) -> Vec<StateDiagnostics> { ... } fn shape(&self) -> (usize, usize, usize) { ... } fn is_empty(&self) -> bool { ... } fn ftype<Ix: ColumnIndex>(&self, col_ix: Ix) -> Result<FType, IndexError> { ... } fn ftypes(&self) -> Vec<FType> { ... } fn summarize_col<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<SummaryStatistics, IndexError> { ... } fn depprob<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix ) -> Result<f64, IndexError> { ... } fn depprob_pw<'x, Ix>( &self, pairs: &'x [(Ix, Ix)] ) -> Result<Vec<f64>, IndexError> where Ix: ColumnIndex, &'x [(Ix, Ix)]: IntoParallelIterator<Item = &'x (Ix, Ix)> { ... } fn _rowsim_validation( &self, row_a: usize, row_b: usize, wrt: &Option<&Vec<usize>> ) -> Result<(), RowSimError> { ... } fn rowsim<RIx: RowIndex, CIx: ColumnIndex>( &self, row_a: RIx, row_b: RIx, wrt: Option<&[CIx]>, variant: RowSimilarityVariant ) -> Result<f64, RowSimError> { ... } fn rowsim_pw<'x, RIx, CIx>( &self, pairs: &'x [(RIx, RIx)], wrt: Option<&[CIx]>, variant: RowSimilarityVariant ) -> Result<Vec<f64>, RowSimError> where RIx: RowIndex, CIx: ColumnIndex + Sync, &'x [(RIx, RIx)]: IntoParallelIterator<Item = &'x (RIx, RIx)> { ... } fn novelty<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, wrt: Option<&[CIx]> ) -> Result<f64, IndexError> { ... } fn mi<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix, n: usize, mi_type: MiType ) -> Result<f64, MiError> { ... } fn mi_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, mi_type: MiType ) -> Result<Vec<f64>, MiError> { ... } fn entropy<Ix: ColumnIndex>( &self, col_ixs: &[Ix], n: usize ) -> Result<f64, EntropyError> { ... } fn predictor_search<Ix: ColumnIndex>( &self, cols_t: &[Ix], max_predictors: usize, n_qmc_samples: usize ) -> Result<Vec<(usize, f64)>, IndexError> { ... } fn info_prop<IxT: ColumnIndex, IxX: ColumnIndex>( &self, cols_t: &[IxT], cols_x: &[IxX], n: usize ) -> Result<f64, InfoPropError> { ... 
} fn conditional_entropy<IxT: ColumnIndex, IxX: ColumnIndex>( &self, col_t: IxT, cols_x: &[IxX], n: usize ) -> Result<f64, ConditionalEntropyError> { ... } fn conditional_entropy_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, kind: ConditionalEntropyType ) -> Result<Vec<f64>, ConditionalEntropyError> { ... } fn surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, x: &Datum, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError> { ... } fn self_surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError> { ... } fn datum<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx ) -> Result<Datum, IndexError> { ... } fn logp<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ixs: &[Ix], vals: &[Vec<Datum>], given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Vec<f64>, LogpError> { ... } fn logp_scaled<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ixs: &[Ix], vals: &[Vec<Datum>], given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Vec<f64>, LogpError> where Self: Sized { ... } fn draw<RIx: RowIndex, CIx: ColumnIndex, R: Rng>( &self, row_ix: RIx, col_ix: CIx, n: usize, rng: &mut R ) -> Result<Vec<Datum>, IndexError> { ... } fn simulate<Ix: ColumnIndex, GIx: ColumnIndex, R: Rng>( &self, col_ixs: &[Ix], given: &Given<GIx>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Result<Vec<Vec<Datum>>, SimulateError> { ... } fn impute<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, with_uncertainty: bool ) -> Result<(Datum, Option<f64>), IndexError> { ... } fn predict<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, with_uncertainty: bool, state_ixs_opt: Option<&[usize]> ) -> Result<(Datum, Option<f64>), PredictError> { ... 
} fn variability<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Variability, VariabilityError> { ... } fn feature_error<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<(f64, f64), IndexError> { ... } fn _logp_unchecked( &self, col_ixs: &[usize], vals: &[Vec<Datum>], given: &Given<usize>, state_ixs_opt: Option<&[usize]>, scaled: bool ) -> Vec<f64> { ... } fn _simulate_unchecked<R: Rng>( &self, col_ixs: &[usize], given: &Given<usize>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Vec<Vec<Datum>> { ... } fn _surprisal_unchecked( &self, x: &Datum, row_ix: usize, col_ix: usize, state_ixs_opt: Option<Vec<usize>> ) -> Option<f64> { ... } fn _dual_entropy(&self, col_a: usize, col_b: usize, n: usize) -> f64 { ... } fn _mi_components( &self, col_a: usize, col_b: usize, n: usize ) -> MiComponents { ... } fn _sobol_joint_entropy(&self, col_ixs: &[usize], n: usize) -> f64 { ... } fn _mc_joint_entropy<R: Rng>( &self, col_ixs: &[usize], n: usize, rng: &mut R ) -> f64 { ... } fn _entropy_unchecked(&self, col_ixs: &[usize], n: usize) -> f64 { ... } fn _impute_uncertainty(&self, row_ix: usize, col_ix: usize) -> f64 { ... } fn _predict_uncertainty( &self, col_ix: usize, given: &Given<usize>, state_ixs_opt: Option<&[usize]> ) -> f64 { ... }
}

Provided Methods§

source

fn state_diagnostics(&self) -> Vec<StateDiagnostics>

Returns the diagnostics for each state

source

fn shape(&self) -> (usize, usize, usize)

Returns a tuple containing the number of rows, the number of columns, and the number of states

§Example
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();
let shape = oracle.shape();

assert_eq!(shape, (50, 85, 16));
source

fn is_empty(&self) -> bool

Returns true if the object is empty, having no structure to analyze.

source

fn ftype<Ix: ColumnIndex>(&self, col_ix: Ix) -> Result<FType, IndexError>

Return the FType of the column col_ix

§Example
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let ftype = oracle.ftype("swims").unwrap();

assert_eq!(ftype, FType::Categorical);
source

fn ftypes(&self) -> Vec<FType>

Returns a vector of the feature types of each column

§Example
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();
let ftypes = oracle.ftypes();

assert!(ftypes.iter().all(|ftype| ftype.is_categorical()));
source

fn summarize_col<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<SummaryStatistics, IndexError>

Return a summary of the data in the column

§Example
use lace::OracleT;
use lace_data::SummaryStatistics;

let oracle = Example::Animals.oracle().unwrap();

let swims_summary = oracle.summarize_col("swims").unwrap();

match swims_summary {
    SummaryStatistics::Categorical { min, max, mode } => {
        assert_eq!(min, 0);
        assert_eq!(max, 1);
        assert_eq!(mode, vec![0]);
    }
    _ => panic!("should be categorical")
}
source

fn depprob<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix ) -> Result<f64, IndexError>

Estimated dependence probability between col_a and col_b

§Example
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let depprob_flippers = oracle.depprob(
    "swims",
    "flippers",
).unwrap();

let depprob_fast = oracle.depprob(
    "swims",
    "fast",
).unwrap();

assert!(depprob_flippers > depprob_fast);
source

fn depprob_pw<'x, Ix>( &self, pairs: &'x [(Ix, Ix)] ) -> Result<Vec<f64>, IndexError>
where Ix: ColumnIndex, &'x [(Ix, Ix)]: IntoParallelIterator<Item = &'x (Ix, Ix)>,

Compute dependence probability for a list of column pairs.

§Example
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();
let depprobs = oracle.depprob_pw(&vec![(1, 12), (3, 2)]).unwrap();

assert_eq!(depprobs.len(), 2);
assert_eq!(depprobs[0], oracle.depprob(1, 12).unwrap());
assert_eq!(depprobs[1], oracle.depprob(3, 2).unwrap());
source

fn _rowsim_validation( &self, row_a: usize, row_b: usize, wrt: &Option<&Vec<usize>> ) -> Result<(), RowSimError>

source

fn rowsim<RIx: RowIndex, CIx: ColumnIndex>( &self, row_a: RIx, row_b: RIx, wrt: Option<&[CIx]>, variant: RowSimilarityVariant ) -> Result<f64, RowSimError>

Estimated row similarity between row_a and row_b

§Arguments
  • row_a: the first row index
  • row_b: the second row index
  • wrt: an optional vector of column indices to constrain the similarity. Only the view to which the columns in wrt are assigned will be considered in the similarity calculation
  • variant: The type of row similarity to compute
§Example
use lace::RowSimilarityVariant;
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();
let wrt: Option<&[usize]> = None;
let rowsim = oracle.rowsim(
    "wolf",
    "collie",
    wrt,
    RowSimilarityVariant::ViewWeighted,
).unwrap();

assert!(rowsim >= 0.0 && rowsim <= 1.0);

Adding context with wrt (with respect to):


let rowsim_wrt = oracle.rowsim(
    "wolf",
    "collie",
    Some(&["swims"]),
    RowSimilarityVariant::ViewWeighted,
).unwrap();

assert_ne!(rowsim, rowsim_wrt);
source

fn rowsim_pw<'x, RIx, CIx>( &self, pairs: &'x [(RIx, RIx)], wrt: Option<&[CIx]>, variant: RowSimilarityVariant ) -> Result<Vec<f64>, RowSimError>
where RIx: RowIndex, CIx: ColumnIndex + Sync, &'x [(RIx, RIx)]: IntoParallelIterator<Item = &'x (RIx, RIx)>,

Compute row similarity for pairs of rows

§Example
use lace::RowSimilarityVariant;
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();
let wrt: Option<&[usize]> = None;
let rowsims = oracle.rowsim_pw(
    &[
        ("gorilla", "spider+monkey"),
        ("gorilla", "skunk"),
    ],
    wrt,
    RowSimilarityVariant::ViewWeighted,
).unwrap();

assert!(rowsims.iter().all(|&rowsim| 0.0 <= rowsim && rowsim <= 1.0));
source

fn novelty<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, wrt: Option<&[CIx]> ) -> Result<f64, IndexError>

Determine the relative novelty of a row.

Novelty is defined as the complement (one minus) of the mean size of the categories (as a proportion of the total number of data) to which the row belongs. If a row is in smaller categories, it will have a higher novelty.

§Notes

Novelty is contextual; it must be compared to the novelty of all other rows. The mean novelty score will increase as the data become more divided. For example, if there is one view with two even categories, each row’s novelty will be 0.5; if there are four even categories, the mean novelty score will be 0.75.

§Example

Dolphins are more novel than rats

use lace::OracleT;

let wrt: Option<&[usize]> = None;
let oracle = Example::Animals.oracle().unwrap();
let novelty_dolphin = oracle.novelty("dolphin", wrt).unwrap();
let novelty_rat = oracle.novelty("rat", wrt).unwrap();

assert!(novelty_rat < novelty_dolphin);

Dolphins are more novel than rats with respect to their swimming.


let wrt = vec!["swims"];

let novelty_rat = oracle.novelty("rat", Some(&wrt)).unwrap();
let novelty_dolphin = oracle.novelty("dolphin", Some(&wrt)).unwrap();

assert!(novelty_dolphin > novelty_rat);
source

fn mi<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix, n: usize, mi_type: MiType ) -> Result<f64, MiError>

Estimate the mutual information between col_a and col_b using Monte Carlo integration

Note: If both columns are categorical, the mutual information will be computed exactly.

§Arguments
  • col_a: the first column index
  • col_b: the second column index
  • n: the number of samples for the Monte Carlo integral
  • mi_type: the type of mutual information to return.
§Example
use lace::OracleT;
use lace::MiType;

let oracle = Example::Animals.oracle().unwrap();

let mi_flippers = oracle.mi(
    "swims",
    "flippers",
    1000,
    MiType::Iqr,
).unwrap();

let mi_fast = oracle.mi(
    "swims",
    "fast",
    1000,
    MiType::Iqr,
).unwrap();

assert!(mi_flippers > mi_fast);

The IQR normalized variant is normalized between 0 and 1

let mi_self = oracle.mi(
    "swims",
    "swims",
    1000,
    MiType::Iqr,
).unwrap();

assert_eq!(mi_self, 1.0);

Mutual information is not as well behaved for continuous variables since differential (continuous) entropy can be negative. The Linfoot MiType can help. Linfoot is a transformed mutual information variant that will be in the interval (0, 1).

let oracle = Example::Satellites.oracle().unwrap();

let linfoot = oracle.mi(
    "longitude_radians_of_geo",
    "Eccentricity",
    1000,
    MiType::Linfoot,
).unwrap();

assert!(0.0 < linfoot && linfoot < 1.0);
source

fn mi_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, mi_type: MiType ) -> Result<Vec<f64>, MiError>

Compute mutual information over pairs of columns

§Notes

This function has special optimizations over computing oracle::mi for pairs manually.

source

fn entropy<Ix: ColumnIndex>( &self, col_ixs: &[Ix], n: usize ) -> Result<f64, EntropyError>

Estimate joint entropy

§Notes

The computation is exact under certain circumstances, otherwise the quantity is approximated via Monte Carlo integration.

  • All columns are categorical, in which case the exact answer is computed via enumeration. The user should be aware of the combinatorial expansion of the terms in the summation.
  • There is only one index in col_ixs and that column is categorical, gaussian, or labeler.
  • There are two columns and one is categorical and the other is gaussian
§Arguments
  • col_ixs: vector of column indices
  • n: number of samples for the Monte Carlo integral.
§Examples

There is more information in the swims column than in the blue column

use lace::OracleT;
use lace::MiType;

let oracle = Example::Animals.oracle().unwrap();

// Close to uniformly distributed -> high entropy
let h_swims = oracle.entropy(
    &["swims"],
    10_000,
).unwrap();

// Close to deterministic -> low entropy
let h_blue = oracle.entropy(
    &["blue"],
    10_000,
).unwrap();

assert!(h_blue < h_swims);

The n argument isn’t used for a single categorical column because the exact computation is used.

let h_swims_10k = oracle.entropy(
    &["swims"],
    10_000,
).unwrap();

let h_swims_0 = oracle.entropy(
    &["swims"],
    1,
).unwrap();

assert!((h_swims_10k - h_swims_0).abs() < 1E-12);

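source

fn predictor_search<Ix: ColumnIndex>( &self, cols_t: &[Ix], max_predictors: usize, n_qmc_samples: usize ) -> Result<Vec<(usize, f64)>, IndexError>

Determine the set of predictors that most efficiently account for the most information in a set of target columns.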

§Notes

The estimates will be bad if the number of samples is too low to fill the space. This will be particularly apparent with large numbers of categorical variables, where not filling the space means missing out on entire classes. If you notice large jumps in the running info_prop (it should be roughly log(n)), then the approximation error is large and you will need to increase the number of samples. The max recommended number of predictors plus targets is 10.

§Arguments
  • cols_t: The target column indices. The ones you want to predict.
  • max_predictors: The max number of predictors to search.
  • n_qmc_samples: The number of QMC samples to use for entropy estimation
§Returns

A Vec of (col_ix, info_prop). The first column index is the column that is the single best predictor of the targets. The additional columns in the sequence are the columns added to the predictor set that maximizes the prediction. The information proportions are the proportions of information accounted for by the predictors with that column added to the set.

§Example

Which four columns should I choose to best predict whether an animal swims?

use lace::examples::animals::Column;
use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let predictors = oracle.predictor_search(
    &vec!["swims"],
    4,
    10_000
).unwrap();

// We asked for four predictors, so we get four.
assert_eq!(predictors.len(), 4);

// Whether something lives in water is the single best predictor of
// whether something swims.
let water: usize = Column::Water.into();
assert_eq!(predictors[0].0, water);

// All information proportions, without runaway approximation error,
// should be in [0, 1].
for (_col_ix, info_prop) in &predictors {
    assert!(0.0 < *info_prop && *info_prop < 1.0)
}

// As we add predictors, the information proportions increase
// monotonically
for i in 1..4 {
    assert!(predictors[i-1].1 < predictors[i].1);
}
source

fn info_prop<IxT: ColumnIndex, IxX: ColumnIndex>( &self, cols_t: &[IxT], cols_x: &[IxX], n: usize ) -> Result<f64, InfoPropError>

Compute the proportion of information in cols_t accounted for by cols_x.

§Arguments
  • cols_t: The target columns. Typically the target of a prediction.
  • cols_x: The predictor columns.
  • n: the number of samples for the Monte Carlo integral. Make n high enough to integrate a function with as many dimensions as there are total columns in cols_t and cols_x.
§Notes

If all variables are discrete, the information proportion should be in [0, 1] (with minor deviations due to approximation error); the behavior is less predictable when one or more of the variables are continuous because entropy is a murky concept in continuous space.

§Example

Flippers tells us more about swimming than an animal’s being fast.

use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let ip_flippers = oracle.info_prop(
    &["swims"],
    &["flippers"],
    1000,
).unwrap();

let ip_fast = oracle.info_prop(
    &["swims"],
    &["fast"],
    1000,
).unwrap();

assert!(ip_flippers > ip_fast);

assert!(ip_flippers >= 0.0);
assert!(ip_flippers <= 1.0);

assert!(ip_fast >= 0.0);
assert!(ip_fast <= 1.0);

Adding more predictor columns increases the information proportion monotonically.

let ip_flippers_coastal = oracle.info_prop(
    &["swims"],
    &["flippers", "coastal"],
    1000,
).unwrap();

assert!(ip_flippers < ip_flippers_coastal);
assert!(ip_flippers_coastal <= 1.0);

let ip_flippers_coastal_fast = oracle.info_prop(
    &["swims"],
    &["flippers", "coastal", "fast"],
    1000,
).unwrap();

assert!(ip_flippers_coastal < ip_flippers_coastal_fast);
assert!(ip_flippers_coastal_fast <= 1.0);
source

fn conditional_entropy<IxT: ColumnIndex, IxX: ColumnIndex>( &self, col_t: IxT, cols_x: &[IxX], n: usize ) -> Result<f64, ConditionalEntropyError>

Conditional entropy H(T|X) where X is a list of column indices

§Arguments
  • col_t: the target column index
  • cols_x: the observed column indices
  • n: the number of samples for the Monte Carlo integral
§Example

Knowing whether something has flippers leaves less information to account for WRT its swimming than does knowing whether it is fast and has a tail.

use lace::OracleT;
use lace::examples::animals::Column;

let oracle = Example::Animals.oracle().unwrap();

let mi_flippers = oracle.conditional_entropy(
    "swims",
    &["flippers"],
    1000,
).unwrap();

let mi_fast_tail = oracle.conditional_entropy(
    "swims",
    &["fast", "tail"],
    1000,
).unwrap();

assert!(mi_flippers < mi_fast_tail);
source

fn conditional_entropy_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, kind: ConditionalEntropyType ) -> Result<Vec<f64>, ConditionalEntropyError>

Pairwise computation of conditional entropy or information proportion

§Example
use lace::OracleT;
use lace::ConditionalEntropyType;
use lace::examples::animals::Column;

let oracle = Example::Animals.oracle().unwrap();

let col_pairs: Vec<(usize, usize)> = vec![
    (Column::Swims.into(), Column::Flippers.into()),
    (Column::Swims.into(), Column::Fast.into()),
];

let ce = oracle.conditional_entropy_pw(
    &col_pairs,
    1000,
    ConditionalEntropyType::UnNormed
).unwrap();

assert_eq!(ce.len(), 2);
assert!(ce[0] < ce[1]);

… and specifying information proportion instead of un-normalized conditional entropy changes the relationships.

let info_prop = oracle.conditional_entropy_pw(
    &col_pairs,
    1000,
    ConditionalEntropyType::InfoProp
).unwrap();

assert_eq!(info_prop.len(), 2);
assert!(info_prop[0] > info_prop[1]);
source

fn surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, x: &Datum, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError>

Negative log PDF/PMF of a datum, x, in a specific cell of the table at position row_ix, col_ix.

surprisal is different from logp in that it works only on cells that exist in the table. logp works on hypothetical data that have not been inserted into the table. Because the data in a cell is modeled as a result of the running of the inference algorithm, the likelihood of any cell is implicitly conditioned on all other cells in the table, therefore surprisal does not accept conditions.

§Notes

To compute surprisal of non-inserted data, use -logp(..).

§Arguments
  • x: the value of which to compute the surprisal
  • row_ix: The row index of x
  • col_ix: column index of x
  • state_ixs: The optional state indices over which to compute surprisal. If None, use all states.
§Returns

None if x is Missing, otherwise returns Some(value)

§Example

A pig being fierce is more surprising than a lion being fierce.

use lace::OracleT;
use lace_data::Datum;
use lace::examples::animals::{Column, Row};

let oracle = Example::Animals.oracle().unwrap();

let present = Datum::Categorical(1_u8.into());

let s_pig = oracle.surprisal(
    &present,
    "pig",
    "fierce",
    None,
).unwrap();

let s_lion = oracle.surprisal(
    &present,
    "lion",
    "fierce",
    None,
).unwrap();

assert!(s_pig > s_lion);
source

fn self_surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError>

Get the surprisal of the datum in a cell.

§Arguments
  • row_ix: The hypothetical row index of the cell.
  • col_ix: The hypothetical column index of the cell.
  • state_ixs: The optional state indices over which to compute surprisal. If None, use all states.
§Example

A pig is fierce, which is more surprising than a lion being fierce.

use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let s_pig = oracle.self_surprisal(
    "pig",
    "fierce",
    None,
).unwrap();

let s_lion = oracle.self_surprisal(
    "lion",
    "fierce",
    None,
).unwrap();

assert!(s_pig > s_lion);
source

fn datum<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx ) -> Result<Datum, IndexError>

Get the datum at an index

§Example
use lace::OracleT;
use lace_data::Datum;

let oracle = Example::Animals.oracle().unwrap();

let x = oracle.datum("pig", "fierce").unwrap();

assert_eq!(x, Datum::Categorical(1_u8.into()));

Getting data from the satellites dataset

let oracle = Example::Satellites.oracle().unwrap();

let x = oracle.datum(
    "International Space Station (ISS [first element Zarya])",
    "Class_of_Orbit"
).unwrap();

assert_eq!(x, Datum::Categorical("LEO".into()));

let y = oracle.datum(
    "International Space Station (ISS [first element Zarya])",
    "Period_minutes",
).unwrap();

assert_eq!(y, Datum::Continuous(92.8));
source

fn logp<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ixs: &[Ix], vals: &[Vec<Datum>], given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Vec<f64>, LogpError>

Compute the log PDF/PMF of a set of values possibly conditioned on the values of other columns

§Arguments
  • col_ixs: A d-length vector of the indices of the columns comprising the data.
  • vals: An n-length vector of d-length vectors. The joint probability of each of the n entries will be computed.
  • given: an optional set of observations on which to condition the PMF/PDF
  • state_ixs_opt: An optional vector of the state indices to use for the logp computation. If None, all states are used.
§Returns

A vector, p, where p[i] is the log PDF/PMF corresponding to the data in vals[i].

§Example

The probability that an animal swims is lower than the probability that it swims given that it has flippers.

use lace::OracleT;
use lace_data::Datum;
use lace::Given;

let oracle = Example::Animals.oracle().unwrap();

let logp_swims = oracle.logp(
    &["swims"],
    &[vec![Datum::Categorical(0_u8.into())], vec![Datum::Categorical(1_u8.into())]],
    &Given::<usize>::Nothing,
    None,
).unwrap();

let logp_swims_given_flippers = oracle.logp(
    &["swims"],
    &[
        vec![Datum::Categorical(0_u8.into())],
        vec![Datum::Categorical(1_u8.into())]
    ],
    &Given::Conditions(
        vec![("flippers", Datum::Categorical(1_u8.into()))]
    ),
    None,
).unwrap();

// Also: exhaustive probabilities should sum to one.
assert!(logp_swims[1] < logp_swims_given_flippers[1]);

let sum_p = logp_swims
    .iter()
    .map(|lp| lp.exp())
    .sum::<f64>();

assert!((sum_p - 1.0).abs() < 1E-10);

let sum_p_given = logp_swims_given_flippers
    .iter()
    .map(|lp| lp.exp())
    .sum::<f64>();

assert!((sum_p_given - 1.0).abs() < 1E-10);

For missing not at random columns, you can ask about the likelihood of missing values.

let oracle = Example::Satellites.oracle().unwrap();

let logps = oracle.logp(
    &["longitude_radians_of_geo", "Type_of_Orbit", "Period_minutes"],
    &[
        vec![Datum::Missing, Datum::Missing, Datum::Continuous(70.0)],
        vec![Datum::Missing, Datum::Categorical("Polar".into()), Datum::Continuous(70.0)],
        vec![Datum::Continuous(1.2), Datum::Missing, Datum::Continuous(70.0)],
    ],
    &Given::<usize>::Nothing,
    None,
).unwrap();

assert!(logps[0] > logps[1]);
assert!(logps[0] > logps[2]);

And you can condition on missingness

let oracle = Example::Satellites.oracle().unwrap();

let logp = oracle.logp(
    &["Period_minutes"],
    &[
        vec![Datum::Continuous(70.0)],   // ~LEO
        vec![Datum::Continuous(300.0)],  // ~MEO
        vec![Datum::Continuous(1440.0)], // ~GEO
    ],
    &Given::<usize>::Nothing,
    None,
).unwrap();

let logp_missing = oracle.logp(
    &["Period_minutes"],
    &[
        vec![Datum::Continuous(70.0)],   // ~LEO
        vec![Datum::Continuous(300.0)],  // ~MEO
        vec![Datum::Continuous(1440.0)], // ~GEO
    ],
    &Given::Conditions(vec![
        ("longitude_radians_of_geo", Datum::Missing)
    ]),
    None,
).unwrap();

// LEO is more likely if no 'longitude_radians_of_geo' was given
assert!(logp_missing[0] > logp[0]); // p LEO goes up w/ missing
// GEO is less likely if no 'longitude_radians_of_geo' was given
assert!(logp_missing[2] < logp[2]); // p GEO goes down
source

fn logp_scaled<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ixs: &[Ix], vals: &[Vec<Datum>], given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Vec<f64>, LogpError>
where Self: Sized,

A version of logp where the likelihoods are scaled by the column modes.

The goal of this function is to create a notion of logp that is more standardized across rare variants. For example, if there is a class, A, of object with a highly variable distribution, we would have a hard time comparing the surprisal of class A variant to the surprisal of variants under a non-variable class B.

That’s a long way of saying that this is a hack and there’s not any mathematical rigor behind it.

§Notes
  • The contribution of each column is individually scaled based on the max likelihood of the individual mixture model, then the geometric mean is computed over column contributions.
  • Generating the cache is expensive, so if you plan on calling the function with the same col_ixs and given, then you should really pre-generate the cache.
§Arguments
  • col_ixs: A d-length vector of the indices of the columns comprising the data.
  • vals: An n-length vector of d-length vectors. The joint probability of each of the n entries will be computed.
  • given: an optional set of observations on which to condition the PMF/PDF
  • state_ixs_opt: An optional vector of the state indices to use for the logp computation. If None, all states are used.
§Example
use lace::{OracleT, Datum, Given};

let oracle = Example::Animals.oracle().unwrap();

let logp_scaled = oracle.logp_scaled(
   &["swims"],
   &[vec![Datum::Categorical(0_u8.into())]],
   &Given::<usize>::Nothing,
   None,
).unwrap()[0];
source

fn draw<RIx: RowIndex, CIx: ColumnIndex, R: Rng>( &self, row_ix: RIx, col_ix: CIx, n: usize, rng: &mut R ) -> Result<Vec<Datum>, IndexError>

Draw n samples from the cell at [row_ix, col_ix].

§Arguments
  • row_ix: the row index
  • col_ix: the column index
  • n: the number of draws to collect
§Example

Draw 12 values of a Pig’s fierceness.

use lace::OracleT;

let oracle = Example::Animals.oracle().unwrap();

let mut rng = rand::thread_rng();
let xs = oracle.draw("pig", "fierce", 12, &mut rng).unwrap();

assert_eq!(xs.len(), 12);
assert!(xs.iter().all(|x| x.is_categorical()));
source

fn simulate<Ix: ColumnIndex, GIx: ColumnIndex, R: Rng>( &self, col_ixs: &[Ix], given: &Given<GIx>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Result<Vec<Vec<Datum>>, SimulateError>

Simulate values from joint or conditional distribution

§Arguments
  • col_ixs: a d-length vector containing the column indices to simulate
  • given: optional observations by which to constrain the simulation, i.e., simulate from p(col_ixs|given)
  • n: the number of simulations
  • state_ixs_opt: The indices of the states from which to simulate. If None, simulate from all states.
§Returns

An n-by-d vector of vectors, x, where x[i][j] is the jth dimension of the ith simulation.

§Example

Simulate the appearance of a hypothetical animal that is fierce and fast.

use lace::OracleT;
use lace::Given;
use lace_data::Datum;

let oracle = Example::Animals.oracle().unwrap();

let mut rng = rand::thread_rng();

let given = Given::Conditions(
    vec![
        ("fierce", Datum::Categorical(1_u8.into())),
        ("fast", Datum::Categorical(1_u8.into())),
    ]
);

let xs = oracle.simulate(
    &["black", "tail"],
    &given,
    10,
    None,
    &mut rng,
).unwrap();

assert_eq!(xs.len(), 10);
assert!(xs.iter().all(|x| x.len() == 2));
source

fn impute<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, with_uncertainty: bool ) -> Result<(Datum, Option<f64>), IndexError>

Return the most likely value for a cell in the table along with the confidence in that imputation.

Imputation can be done on non-missing cells and will re-predict the value of the cell rather than returning the existing value. To get the current value of a cell, use OracleT::datum.

Impute uncertainty is the mean total variation distance between each state’s impute distribution and the average impute distribution.

§Arguments
  • row_ix: the row index of the cell to impute
  • col_ix: the column index of the cell to impute
  • with_uncertainty: if true compute and return the uncertainty
§Returns

A (value, uncertainty_option) tuple.

§Example

Impute the value of swims for a dolphin and a polar bear.

use lace::OracleT;
use lace_data::Datum;

let oracle = Example::Animals.oracle().unwrap();

let dolphin_swims = oracle.impute(
    "dolphin",
    "swims",
    true,
).unwrap();

let bear_swims = oracle.impute(
    "polar+bear",
    "swims",
    true,
).unwrap();

assert_eq!(dolphin_swims.0, Datum::Categorical(1_u8.into()));
assert_eq!(bear_swims.0, Datum::Categorical(1_u8.into()));

let dolphin_swims_unc = dolphin_swims.1.unwrap();
let bear_swims_unc = bear_swims.1.unwrap();

// Given that a polar bear is a furry, footed mammal, it's harder to
// model  why we know it swims.
assert!(bear_swims_unc > dolphin_swims_unc);

Imputing a missing-not-at-random value will still return a value

let oracle = Example::Satellites.oracle().unwrap();

let (imp, _) = oracle.impute(
    "X-Sat",
    "Type_of_Orbit",
    true,
).unwrap();

assert_eq!(imp, Datum::Categorical("Sun-Synchronous".into()));
let (imp, _) = oracle.impute(
    "X-Sat",
    "longitude_radians_of_geo",
    true,
).unwrap();

assert!((imp.to_f64_opt().unwrap() - 0.18514237733859296).abs() < 1e-10);
source

fn predict<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, with_uncertainty: bool, state_ixs_opt: Option<&[usize]> ) -> Result<(Datum, Option<f64>), PredictError>

Return the most likely value for a column given a set of conditions along with the confidence in that prediction.

§Arguments
  • col_ix: the index of the column to predict
  • given: optional observations by which to constrain the prediction
  • with_uncertainty: if true, compute and return uncertainty
  • state_ixs_opt: Optional vector of state indices from which to predict, if None, use all states.
§Returns

A (value, uncertainty_option) Tuple

§Examples

Predict the most likely class of orbit for a given longitude of geosynchronous orbit.

use lace::examples::Example;
use lace::prelude::*;

let oracle = Example::Satellites.oracle().unwrap();

let (pred, _) = oracle.predict(
    "Class_of_Orbit",
    &Given::Conditions(vec![
        ("longitude_radians_of_geo", Datum::Continuous(1.0))
    ]),
    false,
    None,
).unwrap();

assert_eq!(pred, Datum::Categorical("GEO".into()));

Predict the most likely class of orbit given the longitude_radians_of_geo field is missing. Note: this requires the column to be missing-not-at-random.

let (pred_long_missing, _) = oracle.predict(
    "Class_of_Orbit",
    &Given::Conditions(vec![
        ("longitude_radians_of_geo", Datum::Missing)
    ]),
    false,
    None,
).unwrap();

assert_eq!(pred_long_missing, Datum::Categorical("LEO".into()));

Predict a categorical value that is missing not at random

let (pred_type, _) = oracle.predict(
    "Type_of_Orbit",
    &Given::Conditions(vec![(
        "Class_of_Orbit", Datum::Categorical("MEO".into()))
    ]),
    false,
    None,
).unwrap();

assert_eq!(pred_type, Datum::Missing);

Predict a continuous value that is missing not at random and is missing most of the time

let (pred_type, _) = oracle.predict(
    "longitude_radians_of_geo",
    &Given::<usize>::Nothing,
    false,
    None,
).unwrap();

assert_eq!(pred_type, Datum::Missing);

Note that the uncertainty when the prediction is missing is the uncertainty only of the missing prediction. For example, the longitude_radians_of_geo value is only present for geosynchronous satellites, which have an orbital period of around 1440 minutes. We can see the uncertainty drop as we condition on periods farther away from 1440 minutes.

let (pred_close, unc_close) = oracle.predict(
    "longitude_radians_of_geo",
    &Given::Conditions(vec![
        ("Period_minutes", Datum::Continuous(1200.0))
    ]),
    true,
    None,
).unwrap();

assert_eq!(pred_close, Datum::Missing);

let (pred_far, unc_far) = oracle.predict(
    "longitude_radians_of_geo",
    &Given::Conditions(vec![
        ("Period_minutes", Datum::Continuous(1000.0))
    ]),
    true,
    None,
).unwrap();

assert_eq!(pred_far, Datum::Missing);
dbg!(&unc_far, &unc_close);
assert!(unc_far.unwrap() < unc_close.unwrap());
source

fn variability<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Variability, VariabilityError>

Compute the variability of a conditional distribution

§Notes
  • Returns variance for Continuous and Count columns
  • Returns Entropy for Categorical columns
§Arguments
  • col_ix: the index of the column for which to compute the variability
  • given: optional observations by which to constrain the prediction
  • state_ixs_opt: Optional vector of state indices from which to compute, if None, use all states.
source

fn feature_error<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<(f64, f64), IndexError>

Compute the error between the observed data in a feature and the feature model.

§Returns

An (error, centroid) tuple where error is a float in [0, 1], and the centroid is the centroid of the error. For continuous features, the error is derived from the probability integral transform, and for discrete variables the error is the error between the inferred and empirical CDFs.

source

fn _logp_unchecked( &self, col_ixs: &[usize], vals: &[Vec<Datum>], given: &Given<usize>, state_ixs_opt: Option<&[usize]>, scaled: bool ) -> Vec<f64>

source

fn _simulate_unchecked<R: Rng>( &self, col_ixs: &[usize], given: &Given<usize>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Vec<Vec<Datum>>

source

fn _surprisal_unchecked( &self, x: &Datum, row_ix: usize, col_ix: usize, state_ixs_opt: Option<Vec<usize>> ) -> Option<f64>

source

fn _dual_entropy(&self, col_a: usize, col_b: usize, n: usize) -> f64

Specialization for column pairs. If a specialization is not found for the specific column types, will fall back to MC approximation.

source

fn _mi_components(&self, col_a: usize, col_b: usize, n: usize) -> MiComponents

Get the components of mutual information between two columns

source

fn _sobol_joint_entropy(&self, col_ixs: &[usize], n: usize) -> f64

Use a Sobol QMC sequence to approximate joint entropy

§Notes

This thing is shit. Don’t use it.

source

fn _mc_joint_entropy<R: Rng>( &self, col_ixs: &[usize], n: usize, rng: &mut R ) -> f64

source

fn _entropy_unchecked(&self, col_ixs: &[usize], n: usize) -> f64

source

fn _impute_uncertainty(&self, row_ix: usize, col_ix: usize) -> f64

Computes the predictive uncertainty for the datum at (row_ix, col_ix) as the mean of the pairwise KL divergence between the components to which the datum is assigned.

§Notes

Impute uncertainty applies only to impute operations where we want to recover a specific missing (or not missing) entry. There is no special handling of non-missing entries.

§Arguments
  • row_ix: the row index
  • col_ix: the column index
source

fn _predict_uncertainty( &self, col_ix: usize, given: &Given<usize>, state_ixs_opt: Option<&[usize]> ) -> f64

Computes the uncertainty associated with predicting the value of a feature with optional given conditions. Uses Jensen-Shannon divergence computed on the mixture of mixtures.

§Notes

Predict uncertainty applies only to prediction of hypothetical values, and not to imputation of in-table values.

§Arguments
  • col_ix: the column index
  • given: an optional list of (column index, value) tuples designating other observations on which to condition the prediction

Object Safety§

This trait is not object safe.

Implementors§

source§

impl<T: CanOracle> OracleT for T