pub trait OracleT: CanOracle {
Show 40 methods
// Provided methods
fn state_diagnostics(&self) -> Vec<StateDiagnostics> { ... }
fn shape(&self) -> (usize, usize, usize) { ... }
fn is_empty(&self) -> bool { ... }
fn ftype<Ix: ColumnIndex>(&self, col_ix: Ix) -> Result<FType, IndexError> { ... }
fn ftypes(&self) -> Vec<FType> { ... }
fn summarize_col<Ix: ColumnIndex>(
&self,
col_ix: Ix
) -> Result<SummaryStatistics, IndexError> { ... }
fn depprob<Ix: ColumnIndex>(
&self,
col_a: Ix,
col_b: Ix
) -> Result<f64, IndexError> { ... }
fn depprob_pw<'x, Ix>(
&self,
pairs: &'x [(Ix, Ix)]
) -> Result<Vec<f64>, IndexError>
where Ix: ColumnIndex,
&'x [(Ix, Ix)]: IntoParallelIterator<Item = &'x (Ix, Ix)> { ... }
fn _rowsim_validation(
&self,
row_a: usize,
row_b: usize,
wrt: &Option<&Vec<usize>>
) -> Result<(), RowSimError> { ... }
fn rowsim<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_a: RIx,
row_b: RIx,
wrt: Option<&[CIx]>,
variant: RowSimilarityVariant
) -> Result<f64, RowSimError> { ... }
fn rowsim_pw<'x, RIx, CIx>(
&self,
pairs: &'x [(RIx, RIx)],
wrt: Option<&[CIx]>,
variant: RowSimilarityVariant
) -> Result<Vec<f64>, RowSimError>
where RIx: RowIndex,
CIx: ColumnIndex + Sync,
&'x [(RIx, RIx)]: IntoParallelIterator<Item = &'x (RIx, RIx)> { ... }
fn novelty<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
wrt: Option<&[CIx]>
) -> Result<f64, IndexError> { ... }
fn mi<Ix: ColumnIndex>(
&self,
col_a: Ix,
col_b: Ix,
n: usize,
mi_type: MiType
) -> Result<f64, MiError> { ... }
fn mi_pw<Ix: ColumnIndex>(
&self,
col_pairs: &[(Ix, Ix)],
n: usize,
mi_type: MiType
) -> Result<Vec<f64>, MiError> { ... }
fn entropy<Ix: ColumnIndex>(
&self,
col_ixs: &[Ix],
n: usize
) -> Result<f64, EntropyError> { ... }
fn predictor_search<Ix: ColumnIndex>(
&self,
cols_t: &[Ix],
max_predictors: usize,
n_qmc_samples: usize
) -> Result<Vec<(usize, f64)>, IndexError> { ... }
fn info_prop<IxT: ColumnIndex, IxX: ColumnIndex>(
&self,
cols_t: &[IxT],
cols_x: &[IxX],
n: usize
) -> Result<f64, InfoPropError> { ... }
fn conditional_entropy<IxT: ColumnIndex, IxX: ColumnIndex>(
&self,
col_t: IxT,
cols_x: &[IxX],
n: usize
) -> Result<f64, ConditionalEntropyError> { ... }
fn conditional_entropy_pw<Ix: ColumnIndex>(
&self,
col_pairs: &[(Ix, Ix)],
n: usize,
kind: ConditionalEntropyType
) -> Result<Vec<f64>, ConditionalEntropyError> { ... }
fn surprisal<RIx: RowIndex, CIx: ColumnIndex>(
&self,
x: &Datum,
row_ix: RIx,
col_ix: CIx,
state_ixs: Option<Vec<usize>>
) -> Result<Option<f64>, SurprisalError> { ... }
fn self_surprisal<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx,
state_ixs: Option<Vec<usize>>
) -> Result<Option<f64>, SurprisalError> { ... }
fn datum<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx
) -> Result<Datum, IndexError> { ... }
fn logp<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ixs: &[Ix],
vals: &[Vec<Datum>],
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Vec<f64>, LogpError> { ... }
fn logp_scaled<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ixs: &[Ix],
vals: &[Vec<Datum>],
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Vec<f64>, LogpError>
where Self: Sized { ... }
fn draw<RIx: RowIndex, CIx: ColumnIndex, R: Rng>(
&self,
row_ix: RIx,
col_ix: CIx,
n: usize,
rng: &mut R
) -> Result<Vec<Datum>, IndexError> { ... }
fn simulate<Ix: ColumnIndex, GIx: ColumnIndex, R: Rng>(
&self,
col_ixs: &[Ix],
given: &Given<GIx>,
n: usize,
state_ixs_opt: Option<Vec<usize>>,
rng: &mut R
) -> Result<Vec<Vec<Datum>>, SimulateError> { ... }
fn impute<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx,
with_uncertainty: bool
) -> Result<(Datum, Option<f64>), IndexError> { ... }
fn predict<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ix: Ix,
given: &Given<GIx>,
with_uncertainty: bool,
state_ixs_opt: Option<&[usize]>
) -> Result<(Datum, Option<f64>), PredictError> { ... }
fn variability<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ix: Ix,
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Variability, VariabilityError> { ... }
fn feature_error<Ix: ColumnIndex>(
&self,
col_ix: Ix
) -> Result<(f64, f64), IndexError> { ... }
fn _logp_unchecked(
&self,
col_ixs: &[usize],
vals: &[Vec<Datum>],
given: &Given<usize>,
state_ixs_opt: Option<&[usize]>,
scaled: bool
) -> Vec<f64> { ... }
fn _simulate_unchecked<R: Rng>(
&self,
col_ixs: &[usize],
given: &Given<usize>,
n: usize,
state_ixs_opt: Option<Vec<usize>>,
rng: &mut R
) -> Vec<Vec<Datum>> { ... }
fn _surprisal_unchecked(
&self,
x: &Datum,
row_ix: usize,
col_ix: usize,
state_ixs_opt: Option<Vec<usize>>
) -> Option<f64> { ... }
fn _dual_entropy(&self, col_a: usize, col_b: usize, n: usize) -> f64 { ... }
fn _mi_components(
&self,
col_a: usize,
col_b: usize,
n: usize
) -> MiComponents { ... }
fn _sobol_joint_entropy(&self, col_ixs: &[usize], n: usize) -> f64 { ... }
fn _mc_joint_entropy<R: Rng>(
&self,
col_ixs: &[usize],
n: usize,
rng: &mut R
) -> f64 { ... }
fn _entropy_unchecked(&self, col_ixs: &[usize], n: usize) -> f64 { ... }
fn _impute_uncertainty(&self, row_ix: usize, col_ix: usize) -> f64 { ... }
fn _predict_uncertainty(
&self,
col_ix: usize,
given: &Given<usize>,
state_ixs_opt: Option<&[usize]>
) -> f64 { ... }
}
Provided Methods§
sourcefn state_diagnostics(&self) -> Vec<StateDiagnostics>
fn state_diagnostics(&self) -> Vec<StateDiagnostics>
Returns the diagnostics for each state
sourcefn shape(&self) -> (usize, usize, usize)
fn shape(&self) -> (usize, usize, usize)
Returns a tuple containing the number of rows, the number of columns, and the number of states
§Example
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let shape = oracle.shape();
assert_eq!(shape, (50, 85, 16));
sourcefn is_empty(&self) -> bool
fn is_empty(&self) -> bool
Returns true if the object is empty, having no structure to analyze.
sourcefn ftype<Ix: ColumnIndex>(&self, col_ix: Ix) -> Result<FType, IndexError>
fn ftype<Ix: ColumnIndex>(&self, col_ix: Ix) -> Result<FType, IndexError>
Return the FType of the column col_ix
§Example
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let ftype = oracle.ftype("swims").unwrap();
assert_eq!(ftype, FType::Categorical);
sourcefn ftypes(&self) -> Vec<FType>
fn ftypes(&self) -> Vec<FType>
Returns a vector of the feature types of each column
§Example
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let ftypes = oracle.ftypes();
assert!(ftypes.iter().all(|ftype| ftype.is_categorical()));
sourcefn summarize_col<Ix: ColumnIndex>(
&self,
col_ix: Ix
) -> Result<SummaryStatistics, IndexError>
fn summarize_col<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<SummaryStatistics, IndexError>
Return a summary of the data in the column
§Example
use lace::OracleT;
use lace_data::SummaryStatistics;
let oracle = Example::Animals.oracle().unwrap();
let swims_summary = oracle.summarize_col("swims").unwrap();
match swims_summary {
SummaryStatistics::Categorical { min, max, mode } => {
assert_eq!(min, 0);
assert_eq!(max, 1);
assert_eq!(mode, vec![0]);
}
_ => panic!("should be categorical")
}
sourcefn depprob<Ix: ColumnIndex>(
&self,
col_a: Ix,
col_b: Ix
) -> Result<f64, IndexError>
fn depprob<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix ) -> Result<f64, IndexError>
Estimated dependence probability between col_a
and col_b
§Example
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let depprob_flippers = oracle.depprob(
"swims",
"flippers",
).unwrap();
let depprob_fast = oracle.depprob(
"swims",
"fast",
).unwrap();
assert!(depprob_flippers > depprob_fast);
sourcefn depprob_pw<'x, Ix>(
&self,
pairs: &'x [(Ix, Ix)]
) -> Result<Vec<f64>, IndexError>
fn depprob_pw<'x, Ix>( &self, pairs: &'x [(Ix, Ix)] ) -> Result<Vec<f64>, IndexError>
Compute dependence probability for a list of column pairs.
§Example
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let depprobs = oracle.depprob_pw(&vec![(1, 12), (3, 2)]).unwrap();
assert_eq!(depprobs.len(), 2);
assert_eq!(depprobs[0], oracle.depprob(1, 12).unwrap());
assert_eq!(depprobs[1], oracle.depprob(3, 2).unwrap());
fn _rowsim_validation( &self, row_a: usize, row_b: usize, wrt: &Option<&Vec<usize>> ) -> Result<(), RowSimError>
sourcefn rowsim<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_a: RIx,
row_b: RIx,
wrt: Option<&[CIx]>,
variant: RowSimilarityVariant
) -> Result<f64, RowSimError>
fn rowsim<RIx: RowIndex, CIx: ColumnIndex>( &self, row_a: RIx, row_b: RIx, wrt: Option<&[CIx]>, variant: RowSimilarityVariant ) -> Result<f64, RowSimError>
Estimated row similarity between row_a
and row_b
§Arguments
- row_a: the first row index
- row_b: the second row index
- wrt: an optional vector of column indices to constrain the similarity.
Only the view to which the columns in
wrt
are assigned will be considered in the similarity calculation
- variant: The type of row similarity to compute
§Example
use lace::RowSimilarityVariant;
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let wrt: Option<&[usize]> = None;
let rowsim = oracle.rowsim(
"wolf",
"collie",
wrt,
RowSimilarityVariant::ViewWeighted,
).unwrap();
assert!(rowsim >= 0.0 && rowsim <= 1.0);
Adding context with wrt
(with respect to):
let rowsim_wrt = oracle.rowsim(
"wolf",
"collie",
Some(&["swims"]),
RowSimilarityVariant::ViewWeighted,
).unwrap();
assert_ne!(rowsim, rowsim_wrt);
sourcefn rowsim_pw<'x, RIx, CIx>(
&self,
pairs: &'x [(RIx, RIx)],
wrt: Option<&[CIx]>,
variant: RowSimilarityVariant
) -> Result<Vec<f64>, RowSimError>where
RIx: RowIndex,
CIx: ColumnIndex + Sync,
&'x [(RIx, RIx)]: IntoParallelIterator<Item = &'x (RIx, RIx)>,
fn rowsim_pw<'x, RIx, CIx>(
&self,
pairs: &'x [(RIx, RIx)],
wrt: Option<&[CIx]>,
variant: RowSimilarityVariant
) -> Result<Vec<f64>, RowSimError>where
RIx: RowIndex,
CIx: ColumnIndex + Sync,
&'x [(RIx, RIx)]: IntoParallelIterator<Item = &'x (RIx, RIx)>,
Compute row similarity for pairs of rows
§Example
use lace::RowSimilarityVariant;
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let wrt: Option<&[usize]> = None;
let rowsims = oracle.rowsim_pw(
&[
("gorilla", "spider+monkey"),
("gorilla", "skunk"),
],
wrt,
RowSimilarityVariant::ViewWeighted,
).unwrap();
assert!(rowsims.iter().all(|&rowsim| 0.0 <= rowsim && rowsim <= 1.0));
sourcefn novelty<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
wrt: Option<&[CIx]>
) -> Result<f64, IndexError>
fn novelty<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, wrt: Option<&[CIx]> ) -> Result<f64, IndexError>
Determine the relative novelty of a row.
Novelty is defined as the complement (one minus) of the mean size of categories (as a proportion of the total number of data) to which the row belongs. If a row is in smaller categories, it will have a higher novelty.
§Notes
Novelty is contextual; it must be compared to the novelty of all other rows. The mean novelty score will increase as the data become more divided. For example, if there is one view with two even categories, each row’s novelty will be 0.5; if there are four even categories, the mean novelty score will be 0.75.
§Example
Dolphins are more novel than rats
use lace::OracleT;
let wrt: Option<&[usize]> = None;
let oracle = Example::Animals.oracle().unwrap();
let novelty_dolphin = oracle.novelty("dolphin", wrt).unwrap();
let novelty_rat = oracle.novelty("rat", wrt).unwrap();
assert!(novelty_rat < novelty_dolphin);
Dolphins are more novel than rats with respect to their swimming.
let wrt = vec!["swims"];
let novelty_rat = oracle.novelty("rat", Some(&wrt)).unwrap();
let novelty_dolphin = oracle.novelty("dolphin", Some(&wrt)).unwrap();
assert!(novelty_dolphin > novelty_rat);
sourcefn mi<Ix: ColumnIndex>(
&self,
col_a: Ix,
col_b: Ix,
n: usize,
mi_type: MiType
) -> Result<f64, MiError>
fn mi<Ix: ColumnIndex>( &self, col_a: Ix, col_b: Ix, n: usize, mi_type: MiType ) -> Result<f64, MiError>
Estimate the mutual information between col_a
and col_b
using Monte
Carlo integration
Note: If both columns are categorical, the mutual information will be computed exactly.
§Arguments
- col_a: the first column index
- col_b: the second column index
- n: the number of samples for the Monte Carlo integral
- mi_type: the type of mutual information to return.
§Example
use lace::OracleT;
use lace::MiType;
let oracle = Example::Animals.oracle().unwrap();
let mi_flippers = oracle.mi(
"swims",
"flippers",
1000,
MiType::Iqr,
).unwrap();
let mi_fast = oracle.mi(
"swims",
"fast",
1000,
MiType::Iqr,
).unwrap();
assert!(mi_flippers > mi_fast);
The IQR normalized variant is normalized between 0 and 1
let mi_self = oracle.mi(
"swims",
"swims",
1000,
MiType::Iqr,
).unwrap();
assert_eq!(mi_self, 1.0);
Mutual information is not as well behaved for continuous variables since
differential (continuous) entropy can be negative. The Linfoot
MiType
can help. Linfoot is a transformed mutual information variant
that will be in the interval (0, 1).
let oracle = Example::Satellites.oracle().unwrap();
let linfoot = oracle.mi(
"longitude_radians_of_geo",
"Eccentricity",
1000,
MiType::Linfoot,
).unwrap();
assert!(0.0 < linfoot && linfoot < 1.0);
sourcefn mi_pw<Ix: ColumnIndex>(
&self,
col_pairs: &[(Ix, Ix)],
n: usize,
mi_type: MiType
) -> Result<Vec<f64>, MiError>
fn mi_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, mi_type: MiType ) -> Result<Vec<f64>, MiError>
Compute mutual information over pairs of columns
§Notes
This function has special optimizations over computing oracle::mi for pairs manually.
sourcefn entropy<Ix: ColumnIndex>(
&self,
col_ixs: &[Ix],
n: usize
) -> Result<f64, EntropyError>
fn entropy<Ix: ColumnIndex>( &self, col_ixs: &[Ix], n: usize ) -> Result<f64, EntropyError>
Estimate joint entropy
§Notes
The computation is exact under certain circumstances, otherwise the quantity is approximated via Monte Carlo integration.
- All columns are categorical, in which case the exact answer is computed via enumeration. The user should be aware of the combinatorial expansion of the terms in the summation.
- There is only one index in col_ixs and that column is categorical, gaussian, or labeler.
- There are two columns and one is categorical and the other is gaussian
§Arguments
- col_ixs: vector of column indices
- n: number of samples for the Monte Carlo integral.
§Examples
There is more information in the swims column than in the blue column
use lace::OracleT;
use lace::MiType;
let oracle = Example::Animals.oracle().unwrap();
// Close to uniformly distributed -> high entropy
let h_swims = oracle.entropy(
&["swims"],
10_000,
).unwrap();
// Close to deterministic -> low entropy
let h_blue = oracle.entropy(
&["blue"],
10_000,
).unwrap();
assert!(h_blue < h_swims);
The n
argument isn’t used for a single categorical column because
the exact computation is used.
let h_swims_10k = oracle.entropy(
&["swims"],
10_000,
).unwrap();
let h_swims_0 = oracle.entropy(
&["swims"],
1,
).unwrap();
assert!((h_swims_10k - h_swims_0).abs() < 1E-12);
sourcefn predictor_search<Ix: ColumnIndex>(
&self,
cols_t: &[Ix],
max_predictors: usize,
n_qmc_samples: usize
) -> Result<Vec<(usize, f64)>, IndexError>
fn predictor_search<Ix: ColumnIndex>( &self, cols_t: &[Ix], max_predictors: usize, n_qmc_samples: usize ) -> Result<Vec<(usize, f64)>, IndexError>
Determine the set of predictors that most efficiently account for the most information in a set of target columns.
§Notes
The estimates will be bad if the number of samples is too low to fill
the space. This will be particularly apparent in large numbers of
categorical variables where not filling the space means missing out on
entire classes. If you notice large jumps in the running info_prop (it
should be roughly log(n)
), then the approximation error is large and you will
need to increase the number of samples. The max recommended number of
predictors plus targets is 10.
§Arguments
- cols_t: The target column indices. The ones you want to predict.
- max_predictors: The max number of predictors to search.
- n_qmc_samples: The number of QMC samples to use for entropy estimation
§Returns
A Vec of (col_ix, info_prop). The first column index is the column that is the single best predictor of the targets. The additional columns in the sequence are the columns added to the predictor set that maximizes the prediction. The information proportions are the proportions of information accounted for by the predictors with that column added to the set.
§Example
Which four columns should I choose to best predict whether an animal swims?
use lace::examples::animals::Column;
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let predictors = oracle.predictor_search(
&vec!["swims"],
4,
10_000
).unwrap();
// We asked for four predictors, so we get four.
assert_eq!(predictors.len(), 4);
// Whether something lives in water is the single best predictor of
// whether something swims.
let water: usize = Column::Water.into();
assert_eq!(predictors[0].0, water);
// All information proportions, without runaway approximation error,
// should be in [0, 1].
for (_col_ix, info_prop) in &predictors {
assert!(0.0 < *info_prop && *info_prop < 1.0)
}
// As we add predictors, the information proportions increase
// monotonically
for i in 1..4 {
assert!(predictors[i-1].1 < predictors[i].1);
}
sourcefn info_prop<IxT: ColumnIndex, IxX: ColumnIndex>(
&self,
cols_t: &[IxT],
cols_x: &[IxX],
n: usize
) -> Result<f64, InfoPropError>
fn info_prop<IxT: ColumnIndex, IxX: ColumnIndex>( &self, cols_t: &[IxT], cols_x: &[IxX], n: usize ) -> Result<f64, InfoPropError>
Compute the proportion of information in cols_t
accounted for by
cols_x
.
§Arguments
- cols_t: The target columns. Typically the target of a prediction.
- cols_x: The predictor columns.
- n: the number of samples for the Monte Carlo integral. Make n high
enough to integrate a function with as many dimensions as there
are total columns in
cols_t
and cols_x
.
§Notes
If all variables are discrete, the information proportion should be in [0, 1] (with minor deviations due to approximation error); the behavior is less predictable when one or more of the variables are continuous because entropy is a murky concept in continuous space.
§Example
Flippers tells us more about swimming than an animal’s being fast.
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let ip_flippers = oracle.info_prop(
&["swims"],
&["flippers"],
1000,
).unwrap();
let ip_fast = oracle.info_prop(
&["swims"],
&["fast"],
1000,
).unwrap();
assert!(ip_flippers > ip_fast);
assert!(ip_flippers >= 0.0);
assert!(ip_flippers <= 1.0);
assert!(ip_fast >= 0.0);
assert!(ip_fast <= 1.0);
Adding more predictor columns increases the information proportion monotonically.
let ip_flippers_coastal = oracle.info_prop(
&["swims"],
&["flippers", "coastal"],
1000,
).unwrap();
assert!(ip_flippers < ip_flippers_coastal);
assert!(ip_flippers_coastal <= 1.0);
let ip_flippers_coastal_fast = oracle.info_prop(
&["swims"],
&["flippers", "coastal", "fast"],
1000,
).unwrap();
assert!(ip_flippers_coastal < ip_flippers_coastal_fast);
assert!(ip_flippers_coastal_fast <= 1.0);
sourcefn conditional_entropy<IxT: ColumnIndex, IxX: ColumnIndex>(
&self,
col_t: IxT,
cols_x: &[IxX],
n: usize
) -> Result<f64, ConditionalEntropyError>
fn conditional_entropy<IxT: ColumnIndex, IxX: ColumnIndex>( &self, col_t: IxT, cols_x: &[IxX], n: usize ) -> Result<f64, ConditionalEntropyError>
Conditional entropy H(T|X) where X is a list of column indices
§Arguments
- col_t: the target column index
- cols_x: the observed column indices
- n: the number of samples for the Monte Carlo integral
§Example
Knowing whether something has flippers leaves less information to account for WRT its swimming than does knowing whether it is fast and has a tail.
use lace::OracleT;
use lace::examples::animals::Column;
let oracle = Example::Animals.oracle().unwrap();
let mi_flippers = oracle.conditional_entropy(
"swims",
&["flippers"],
1000,
).unwrap();
let mi_fast_tail = oracle.conditional_entropy(
"swims",
&["fast", "tail"],
1000,
).unwrap();
assert!(mi_flippers < mi_fast_tail);
sourcefn conditional_entropy_pw<Ix: ColumnIndex>(
&self,
col_pairs: &[(Ix, Ix)],
n: usize,
kind: ConditionalEntropyType
) -> Result<Vec<f64>, ConditionalEntropyError>
fn conditional_entropy_pw<Ix: ColumnIndex>( &self, col_pairs: &[(Ix, Ix)], n: usize, kind: ConditionalEntropyType ) -> Result<Vec<f64>, ConditionalEntropyError>
Pairwise computation of conditional entropy or information proportion
§Example
use lace::OracleT;
use lace::ConditionalEntropyType;
use lace::examples::animals::Column;
let oracle = Example::Animals.oracle().unwrap();
let col_pairs: Vec<(usize, usize)> = vec![
(Column::Swims.into(), Column::Flippers.into()),
(Column::Swims.into(), Column::Fast.into()),
];
let ce = oracle.conditional_entropy_pw(
&col_pairs,
1000,
ConditionalEntropyType::UnNormed
).unwrap();
assert_eq!(ce.len(), 2);
assert!(ce[0] < ce[1]);
… and specifying information proportion instead of un-normalized conditional entropy changes the relationships.
let info_prop = oracle.conditional_entropy_pw(
&col_pairs,
1000,
ConditionalEntropyType::InfoProp
).unwrap();
assert_eq!(info_prop.len(), 2);
assert!(info_prop[0] > info_prop[1]);
sourcefn surprisal<RIx: RowIndex, CIx: ColumnIndex>(
&self,
x: &Datum,
row_ix: RIx,
col_ix: CIx,
state_ixs: Option<Vec<usize>>
) -> Result<Option<f64>, SurprisalError>
fn surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, x: &Datum, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError>
Negative log PDF/PMF of a datum, x, in a specific cell of the table at position row_ix, col_ix.
surprisal
is different from logp
in that it works only on cells
that exist in the table. logp
works on hypothetical data that have
not been inserted into the table. Because the data in a cell is modeled
as a result of the running of the inference algorithm, the likelihood of
any cell is implicitly conditioned on all other cells in the table,
therefore surprisal
does not accept conditions.
§Notes
To compute surprisal of non-inserted data, use -logp(..)
.
§Arguments
- x: the value of which to compute the surprisal
- row_ix: The row index of
x
- col_ix: column index of
x
- state_ixs: The optional state indices over which to compute
surprisal. If
None
, use all states.
§Returns
None
if x is Missing
, otherwise returns Some(value)
§Example
A pig being fierce is more surprising than a lion being fierce.
use lace::OracleT;
use lace_data::Datum;
use lace::examples::animals::{Column, Row};
let oracle = Example::Animals.oracle().unwrap();
let present = Datum::Categorical(1_u8.into());
let s_pig = oracle.surprisal(
&present,
"pig",
"fierce",
None,
).unwrap();
let s_lion = oracle.surprisal(
&present,
"lion",
"fierce",
None,
).unwrap();
assert!(s_pig > s_lion);
sourcefn self_surprisal<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx,
state_ixs: Option<Vec<usize>>
) -> Result<Option<f64>, SurprisalError>
fn self_surprisal<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, state_ixs: Option<Vec<usize>> ) -> Result<Option<f64>, SurprisalError>
Get the surprisal of the datum in a cell.
§Arguments
- row_ix: The row index of the cell.
- col_ix: The column index of the cell.
- state_ixs: The optional state indices over which to compute
surprisal. If
None
, use all states.
§Example
A pig is fierce, which is more surprising than a lion being fierce.
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let s_pig = oracle.self_surprisal(
"pig",
"fierce",
None,
).unwrap();
let s_lion = oracle.self_surprisal(
"lion",
"fierce",
None,
).unwrap();
assert!(s_pig > s_lion);
sourcefn datum<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx
) -> Result<Datum, IndexError>
fn datum<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx ) -> Result<Datum, IndexError>
Get the datum at an index
§Example
use lace::OracleT;
use lace_data::Datum;
let oracle = Example::Animals.oracle().unwrap();
let x = oracle.datum("pig", "fierce").unwrap();
assert_eq!(x, Datum::Categorical(1_u8.into()));
Getting data from the satellites dataset
let oracle = Example::Satellites.oracle().unwrap();
let x = oracle.datum(
"International Space Station (ISS [first element Zarya])",
"Class_of_Orbit"
).unwrap();
assert_eq!(x, Datum::Categorical("LEO".into()));
let y = oracle.datum(
"International Space Station (ISS [first element Zarya])",
"Period_minutes",
).unwrap();
assert_eq!(y, Datum::Continuous(92.8));
sourcefn logp<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ixs: &[Ix],
vals: &[Vec<Datum>],
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Vec<f64>, LogpError>
fn logp<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ixs: &[Ix], vals: &[Vec<Datum>], given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Vec<f64>, LogpError>
Compute the log PDF/PMF of a set of values possibly conditioned on the values of other columns
§Arguments
- col_ixs: A d-length vector of the indices of the columns comprising the data.
- vals: An n-length vector of d-length vectors. The joint probability of each of the n entries will be computed.
- given: an optional set of observations on which to condition the PMF/PDF
- state_ixs_opt: An optional vector of the state indices to use for the
logp computation. If
None
, all states are used.
§Returns
A vector, p
, where p[i]
is the log PDF/PMF corresponding to the data
in vals[i]
.
§Example
The probability that an animal swims is lower than the probability that it swims given that it has flippers.
use lace::OracleT;
use lace_data::Datum;
use lace::Given;
let oracle = Example::Animals.oracle().unwrap();
let logp_swims = oracle.logp(
&["swims"],
&[vec![Datum::Categorical(0_u8.into())], vec![Datum::Categorical(1_u8.into())]],
&Given::<usize>::Nothing,
None,
).unwrap();
let logp_swims_given_flippers = oracle.logp(
&["swims"],
&[
vec![Datum::Categorical(0_u8.into())],
vec![Datum::Categorical(1_u8.into())]
],
&Given::Conditions(
vec![("flippers", Datum::Categorical(1_u8.into()))]
),
None,
).unwrap();
// Also: exhaustive probabilities should sum to one.
assert!(logp_swims[1] < logp_swims_given_flippers[1]);
let sum_p = logp_swims
.iter()
.map(|lp| lp.exp())
.sum::<f64>();
assert!((sum_p - 1.0).abs() < 1E-10);
let sum_p_given = logp_swims_given_flippers
.iter()
.map(|lp| lp.exp())
.sum::<f64>();
assert!((sum_p_given - 1.0).abs() < 1E-10);
For missing-not-at-random columns, you can ask about the likelihood of missing values.
let oracle = Example::Satellites.oracle().unwrap();
let logps = oracle.logp(
&["longitude_radians_of_geo", "Type_of_Orbit", "Period_minutes"],
&[
vec![Datum::Missing, Datum::Missing, Datum::Continuous(70.0)],
vec![Datum::Missing, Datum::Categorical("Polar".into()), Datum::Continuous(70.0)],
vec![Datum::Continuous(1.2), Datum::Missing, Datum::Continuous(70.0)],
],
&Given::<usize>::Nothing,
None,
).unwrap();
assert!(logps[0] > logps[1]);
assert!(logps[0] > logps[2]);
And you can condition on missingness
let oracle = Example::Satellites.oracle().unwrap();
let logp = oracle.logp(
&["Period_minutes"],
&[
vec![Datum::Continuous(70.0)], // ~LEO
vec![Datum::Continuous(300.0)], // ~MEO
vec![Datum::Continuous(1440.0)], // ~GEO
],
&Given::<usize>::Nothing,
None,
).unwrap();
let logp_missing = oracle.logp(
&["Period_minutes"],
&[
vec![Datum::Continuous(70.0)], // ~LEO
vec![Datum::Continuous(300.0)], // ~MEO
vec![Datum::Continuous(1440.0)], // ~GEO
],
&Given::Conditions(vec![
("longitude_radians_of_geo", Datum::Missing)
]),
None,
).unwrap();
// LEO is more likely if no 'longitude_radians_of_geo' was given
assert!(logp_missing[0] > logp[0]); // p LEO goes up w/ missing
// GEO is less likely if no 'longitude_radians_of_geo' was given
assert!(logp_missing[2] < logp[2]); // p GEO goes down
sourcefn logp_scaled<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ixs: &[Ix],
vals: &[Vec<Datum>],
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Vec<f64>, LogpError>where
Self: Sized,
fn logp_scaled<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ixs: &[Ix],
vals: &[Vec<Datum>],
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Vec<f64>, LogpError>where
Self: Sized,
A version of logp
where the likelihoods are scaled by the column modes.
The goal of this function is to create a notion of logp that is more standardized across rare variants. For example, if there is a class, A, of objects with a highly variable distribution, we would have a hard time comparing the surprisal of a class A variant to the surprisal of variants under a non-variable class B.
That’s a long way of saying that this is a hack and there’s not any mathematical rigor behind it.
§Notes
- The contribution of each column is individually scaled based on the max likelihood of the individual mixture model, then the geometric mean is computed over column contributions.
- Generating the cache is expensive, so if you plan on calling the
function with the same
col_ixs
and given
, then you should really pre-generate the cache.
§Arguments
- col_ixs: A d-length vector of the indices of the columns comprising the data.
- vals: An n-length vector of d-length vectors. The joint probability of each of the n entries will be computed.
- given: an optional set of observations on which to condition the PMF/PDF
- state_ixs_opt: An optional vector of the state indices to use for the
logp computation. If
None
, all states are used.
§Example
use lace::{OracleT, Datum, Given};
let oracle = Example::Animals.oracle().unwrap();
let logp_scaled = oracle.logp_scaled(
&["swims"],
&[vec![Datum::Categorical(0_u8.into())]],
&Given::<usize>::Nothing,
None,
).unwrap()[0];
sourcefn draw<RIx: RowIndex, CIx: ColumnIndex, R: Rng>(
&self,
row_ix: RIx,
col_ix: CIx,
n: usize,
rng: &mut R
) -> Result<Vec<Datum>, IndexError>
fn draw<RIx: RowIndex, CIx: ColumnIndex, R: Rng>( &self, row_ix: RIx, col_ix: CIx, n: usize, rng: &mut R ) -> Result<Vec<Datum>, IndexError>
Draw n
samples from the cell at [row_ix, col_ix]
.
§Arguments
- row_ix: the row index
- col_ix: the column index
- n: the number of draws to collect
§Example
Draw 12 values of a Pig’s fierceness.
use lace::OracleT;
let oracle = Example::Animals.oracle().unwrap();
let mut rng = rand::thread_rng();
let xs = oracle.draw("pig", "fierce", 12, &mut rng).unwrap();
assert_eq!(xs.len(), 12);
assert!(xs.iter().all(|x| x.is_categorical()));
sourcefn simulate<Ix: ColumnIndex, GIx: ColumnIndex, R: Rng>(
&self,
col_ixs: &[Ix],
given: &Given<GIx>,
n: usize,
state_ixs_opt: Option<Vec<usize>>,
rng: &mut R
) -> Result<Vec<Vec<Datum>>, SimulateError>
fn simulate<Ix: ColumnIndex, GIx: ColumnIndex, R: Rng>( &self, col_ixs: &[Ix], given: &Given<GIx>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Result<Vec<Vec<Datum>>, SimulateError>
Simulate values from joint or conditional distribution
§Arguments
- col_ixs: a d-length vector containing the column indices to simulate
- given: optional observations by which to constrain the simulation, i.e., simulate from p(col_ixs|given)
- n: the number of simulations
- state_ixs_opt: The indices of the states from which to simulate. If
None
, simulate from all states.
§Returns
An n-by-d vector of vectors, x
, where x[i][j]
is the
jth dimension of the ith simulation.
§Example
Simulate the appearance of a hypothetical animal that is fierce and fast.
use lace::OracleT;
use lace::Given;
use lace_data::Datum;
let oracle = Example::Animals.oracle().unwrap();
let mut rng = rand::thread_rng();
let given = Given::Conditions(
vec![
("fierce", Datum::Categorical(1_u8.into())),
("fast", Datum::Categorical(1_u8.into())),
]
);
let xs = oracle.simulate(
&["black", "tail"],
&given,
10,
None,
&mut rng,
).unwrap();
assert_eq!(xs.len(), 10);
assert!(xs.iter().all(|x| x.len() == 2));
sourcefn impute<RIx: RowIndex, CIx: ColumnIndex>(
&self,
row_ix: RIx,
col_ix: CIx,
with_uncertainty: bool
) -> Result<(Datum, Option<f64>), IndexError>
fn impute<RIx: RowIndex, CIx: ColumnIndex>( &self, row_ix: RIx, col_ix: CIx, with_uncertainty: bool ) -> Result<(Datum, Option<f64>), IndexError>
Return the most likely value for a cell in the table along with the confidence in that imputation.
Imputation can be done on non-missing cells and will re-predict the
value of the cell rather than returning the existing value. To get
the current value of a cell, use Oracle::data.
Impute uncertainty is the mean total variation distance between each state’s impute distribution and the average impute distribution.
§Arguments
- row_ix: the row index of the cell to impute
- col_ix: the column index of the cell to impute
- with_uncertainty: if true, compute and return the uncertainty
§Returns
A (value, uncertainty_option) tuple.
§Example
Impute the value of swims for a dolphin and a polar bear.
use lace::OracleT;
use lace_data::Datum;
let oracle = Example::Animals.oracle().unwrap();
let dolphin_swims = oracle.impute(
"dolphin",
"swims",
true,
).unwrap();
let bear_swims = oracle.impute(
"polar+bear",
"swims",
true,
).unwrap();
assert_eq!(dolphin_swims.0, Datum::Categorical(1_u8.into()));
assert_eq!(bear_swims.0, Datum::Categorical(1_u8.into()));
let dolphin_swims_unc = dolphin_swims.1.unwrap();
let bear_swims_unc = bear_swims.1.unwrap();
// Given that a polar bear is a furry, footed mammal, it's harder to
// model why we know it swims.
assert!(bear_swims_unc > dolphin_swims_unc);
Imputing a missing-not-at-random value will still return a value
let oracle = Example::Satellites.oracle().unwrap();
let (imp, _) = oracle.impute(
"X-Sat",
"Type_of_Orbit",
true,
).unwrap();
assert_eq!(imp, Datum::Categorical("Sun-Synchronous".into()));
let (imp, _) = oracle.impute(
"X-Sat",
"longitude_radians_of_geo",
true,
).unwrap();
assert!((imp.to_f64_opt().unwrap() - 0.18514237733859296).abs() < 1e-10);
sourcefn predict<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ix: Ix,
given: &Given<GIx>,
with_uncertainty: bool,
state_ixs_opt: Option<&[usize]>
) -> Result<(Datum, Option<f64>), PredictError>
fn predict<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, with_uncertainty: bool, state_ixs_opt: Option<&[usize]> ) -> Result<(Datum, Option<f64>), PredictError>
Return the most likely value for a column given a set of conditions along with the confidence in that prediction.
§Arguments
- col_ix: the index of the column to predict
- given: optional observations by which to constrain the prediction
- with_uncertainty: if true, compute and return uncertainty
- state_ixs_opt: Optional vector of state indices from which to predict, if None, use all states.
§Returns
A (value, uncertainty_option) tuple.
§Examples
Predict the most likely class of orbit for a given longitude of Geosynchronous orbit.
use lace::examples::Example;
use lace::prelude::*;
let oracle = Example::Satellites.oracle().unwrap();
let (pred, _) = oracle.predict(
"Class_of_Orbit",
&Given::Conditions(vec![
("longitude_radians_of_geo", Datum::Continuous(1.0))
]),
false,
None,
).unwrap();
assert_eq!(pred, Datum::Categorical("GEO".into()));
Predict the most likely class of orbit given that the longitude_radians_of_geo field is missing. Note: this requires the column to be missing-not-at-random.
let (pred_long_missing, _) = oracle.predict(
"Class_of_Orbit",
&Given::Conditions(vec![
("longitude_radians_of_geo", Datum::Missing)
]),
false,
None,
).unwrap();
assert_eq!(pred_long_missing, Datum::Categorical("LEO".into()));
Predict a categorical value that is missing not at random
let (pred_type, _) = oracle.predict(
"Type_of_Orbit",
&Given::Conditions(vec![(
"Class_of_Orbit", Datum::Categorical("MEO".into()))
]),
false,
None,
).unwrap();
assert_eq!(pred_type, Datum::Missing);
Predict a continuous value that is missing not at random and is missing most of the time
let (pred_type, _) = oracle.predict(
"longitude_radians_of_geo",
&Given::<usize>::Nothing,
false,
None,
).unwrap();
assert_eq!(pred_type, Datum::Missing);
Note that the uncertainty when the prediction is missing is the
uncertainty only of the missing prediction. For example, the
longitude_radians_of_geo value is only present for geosynchronous
satellites, which have an orbital period of around 1440 minutes. We can
see the uncertainty drop as we condition on periods farther away from
1440 minutes.
let (pred_close, unc_close) = oracle.predict(
"longitude_radians_of_geo",
&Given::Conditions(vec![
("Period_minutes", Datum::Continuous(1200.0))
]),
true,
None,
).unwrap();
assert_eq!(pred_close, Datum::Missing);
let (pred_far, unc_far) = oracle.predict(
"longitude_radians_of_geo",
&Given::Conditions(vec![
("Period_minutes", Datum::Continuous(1000.0))
]),
true,
None,
).unwrap();
assert_eq!(pred_far, Datum::Missing);
dbg!(&unc_far, &unc_close);
assert!(unc_far.unwrap() < unc_close.unwrap());
sourcefn variability<Ix: ColumnIndex, GIx: ColumnIndex>(
&self,
col_ix: Ix,
given: &Given<GIx>,
state_ixs_opt: Option<&[usize]>
) -> Result<Variability, VariabilityError>
fn variability<Ix: ColumnIndex, GIx: ColumnIndex>( &self, col_ix: Ix, given: &Given<GIx>, state_ixs_opt: Option<&[usize]> ) -> Result<Variability, VariabilityError>
Compute the variability of a conditional distribution
§Notes
- Returns variance for Continuous and Count columns
- Returns Entropy for Categorical columns
§Arguments
- col_ix: the index of the column for which to compute the variability
- given: optional observations by which to constrain the prediction
- state_ixs_opt: Optional vector of state indices from which to compute, if None, use all states.
sourcefn feature_error<Ix: ColumnIndex>(
&self,
col_ix: Ix
) -> Result<(f64, f64), IndexError>
fn feature_error<Ix: ColumnIndex>( &self, col_ix: Ix ) -> Result<(f64, f64), IndexError>
Compute the error between the observed data in a feature and the feature model.
§Returns
An (error, centroid) tuple where error is a float in [0, 1], and the
centroid is the centroid of the error. For continuous features, the
error is derived from the probability integral transform, and for
discrete variables the error is the error between the inferred and
empirical CDFs.
fn _logp_unchecked( &self, col_ixs: &[usize], vals: &[Vec<Datum>], given: &Given<usize>, state_ixs_opt: Option<&[usize]>, scaled: bool ) -> Vec<f64>
fn _simulate_unchecked<R: Rng>( &self, col_ixs: &[usize], given: &Given<usize>, n: usize, state_ixs_opt: Option<Vec<usize>>, rng: &mut R ) -> Vec<Vec<Datum>>
fn _surprisal_unchecked( &self, x: &Datum, row_ix: usize, col_ix: usize, state_ixs_opt: Option<Vec<usize>> ) -> Option<f64>
sourcefn _dual_entropy(&self, col_a: usize, col_b: usize, n: usize) -> f64
fn _dual_entropy(&self, col_a: usize, col_b: usize, n: usize) -> f64
Specialization for column pairs. If a specialization is not found for the specific column types, falls back to MC approximation.
sourcefn _mi_components(&self, col_a: usize, col_b: usize, n: usize) -> MiComponents
fn _mi_components(&self, col_a: usize, col_b: usize, n: usize) -> MiComponents
Get the components of mutual information between two columns
sourcefn _sobol_joint_entropy(&self, col_ixs: &[usize], n: usize) -> f64
fn _sobol_joint_entropy(&self, col_ixs: &[usize], n: usize) -> f64
fn _mc_joint_entropy<R: Rng>( &self, col_ixs: &[usize], n: usize, rng: &mut R ) -> f64
fn _entropy_unchecked(&self, col_ixs: &[usize], n: usize) -> f64
sourcefn _impute_uncertainty(&self, row_ix: usize, col_ix: usize) -> f64
fn _impute_uncertainty(&self, row_ix: usize, col_ix: usize) -> f64
Computes the predictive uncertainty for the datum at (row_ix, col_ix) as the mean pairwise KL divergence between the components to which the datum is assigned.
§Notes
Impute uncertainty applies only to impute operations where we want to recover a specific missing (or not missing) entry. There is no special handling of non-missing entries.
§Arguments
- row_ix: the row index
- col_ix: the column index
sourcefn _predict_uncertainty(
&self,
col_ix: usize,
given: &Given<usize>,
state_ixs_opt: Option<&[usize]>
) -> f64
fn _predict_uncertainty( &self, col_ix: usize, given: &Given<usize>, state_ixs_opt: Option<&[usize]> ) -> f64
Computes the uncertainty associated with predicting the value of a feature with optional given conditions. Uses Jensen-Shannon divergence computed on the mixture of mixtures.
§Notes
Predict uncertainty applies only to prediction of hypothetical values, and not to imputation of in-table values.
§Arguments
- col_ix: the column index
- given: optional (column index, value) conditions designating other observations on which to condition the prediction