Struct analiticcl::VariantModel[][src]

pub struct VariantModel {
    pub decoder: VocabDecoder,
    pub encoder: VocabEncoder,
    pub alphabet: Alphabet,
    pub index: AnaIndex,
    pub sortedindex: BTreeMap<u16, Vec<AnaValue>>,
    pub have_freq: bool,
    pub freq_sum: usize,
    pub weights: Weights,
    pub lexicons: Vec<String>,
    pub confusables: Vec<Confusable>,
    pub confusables_before_pruning: bool,
    pub variantclusters: VariantClusterMap,
    pub debug: bool,
}

The VariantModel is the most high-level model of analiticcl; it holds all data required for variant matching.

Fields

decoder: VocabDecoder

Maps Vocabulary IDs to their textual strings and other related properties

encoder: VocabEncoder

Map strings to vocabulary IDs

alphabet: Alphabet

Defines the alphabet used for the variant model

index: AnaIndex

The main index, mapping anagrams to instances

sortedindex: BTreeMap<u16, Vec<AnaValue>>

A secondary sorted index. Indices of the outer vector correspond to the length of an anagram (in chars) - 1. The inner vector is always sorted.

have_freq: bool

Does the model have frequency information?

freq_sum: usize

Total sum of all frequencies in the lexicon

weights: Weights

Weights used in scoring

lexicons: Vec<String>

Stores the names of the loaded lexicons; they will be referenced by index from individual items for provenance reasons.

confusables: Vec<Confusable>

Holds weighted confusable recipes that can be used in scoring and ranking

confusables_before_pruning: bool

Process confusables before pruning by max_matches

variantclusters: VariantClusterMap

Groups clusters of variants (either from explicitly loaded variant files or in a later stage perhaps also computed)

debug: bool

Implementations

impl VariantModel[src]

pub fn new(alphabet_file: &str, weights: Weights, debug: bool) -> VariantModel[src]

Instantiate a new variant model

pub fn new_with_alphabet(
    alphabet: Alphabet,
    weights: Weights,
    debug: bool
) -> VariantModel
[src]

Instantiate a new variant model, explicitly passing an alphabet rather than loading one from file.

pub fn set_confusables_before_pruning(&mut self)[src]

Configure the model to match against known confusables prior to pruning on maximum weight. This may lead to better results but may have a significant performance impact.

pub fn alphabet_size(&self) -> CharIndexType[src]

Returns the size of the alphabet. This is typically one more than the number of entries in the alphabet file, as it also includes the UNKNOWN symbol.

pub fn get_or_create_index<'a, 'b>(
    &'a mut self,
    anahash: &'b AnaValue
) -> &'a mut AnaIndexNode
[src]

Get an item from the index or insert it if it doesn’t exist yet

pub fn build(&mut self)[src]

Build the anagram index (and secondary index) so the model is ready for variant matching

pub fn contains_key(&self, key: &AnaValue) -> bool[src]

Tests if the anagram value exists in the index

pub fn get_anagram_instances(&self, text: &str) -> Vec<&VocabValue>[src]

Get all anagram instances for a specific entry

pub fn get(&self, text: &str) -> Option<&VocabValue>[src]

Get an exact item in the lexicon (if it exists)

pub fn has(&self, text: &str) -> bool[src]

Tests if the lexicon has a specific entry, by text

pub fn get_vocab(&self, vocab_id: VocabId) -> Option<&VocabValue>[src]

Resolves a vocabulary ID

pub fn read_alphabet(&mut self, filename: &str) -> Result<(), Error>[src]

Read the alphabet from a TSV file. The file contains one alphabet entry per line, but a line may consist of multiple tab-separated alphabet entries, which will be treated as identical. The alphabet is not limited to single characters but may consist of longer strings; a greedy matching approach will be used, so order matters (but only for this).

pub fn read_confusablelist(&mut self, filename: &str) -> Result<(), Error>[src]

Read a confusable list from a TSV file. Contains edit scripts in the first column (formatted in sesdiff style) and optionally a weight in the second column. Favourable confusables have a weight > 1.0, unfavourable ones have a weight < 1.0 (penalties). Weight values should be relatively close to 1.0 as they are applied to the entire score.

pub fn add_to_confusables(
    &mut self,
    editscript: &str,
    weight: f64
) -> Result<(), Error>
[src]

Add a confusable

pub fn read_vocabulary(
    &mut self,
    filename: &str,
    params: &VocabParams,
    lexicon_weight: f32
) -> Result<(), Error>
[src]

Read vocabulary (a lexicon or corpus-derived lexicon) from a TSV file. May contain frequency information. The parameters define what value can be read from what column.

pub fn read_variants(
    &mut self,
    filename: &str,
    lexicon_weight: f32
) -> Result<(), Error>
[src]

Read a list of equally weighted variants from a TSV file. Each line simply contains tab-separated variants, and all entries on a single line are considered variants. Consumes much less memory than weighted variants.

pub fn read_weighted_variants(
    &mut self,
    filename: &str,
    lexicon_weight: f32,
    intermediate: bool
) -> Result<(), Error>
[src]

Read a weighted variant list from a TSV file. Contains a canonical/reference form in the first column, and variants with score (two columns) in the following columns. Consumes much more memory than equally weighted variants.

pub fn add_to_vocabulary(
    &mut self,
    text: &str,
    frequency: Option<u32>,
    lexicon_weight: Option<f32>,
    lexicon_index: u8
) -> VocabId
[src]

Adds an entry in the vocabulary

pub fn find_variants(
    &self,
    input: &str,
    max_anagram_distance: u8,
    max_edit_distance: u8,
    max_matches: usize,
    score_threshold: f64,
    stop_criterion: StopCriterion,
    cache: Option<&mut Cache>
) -> Vec<(VocabId, f64)>
[src]

Find variants in the vocabulary for a given string (in its totality); returns a vector of vocabulary ID and score pairs. The resulting vocabulary IDs can be resolved through get_vocab().

pub fn find_nearest_anahashes<'a>(
    &'a self,
    focus: &AnaValue,
    normstring: &Vec<u8>,
    max_distance: u8,
    stop_criterion: StopCriterion,
    cache: Option<&mut HashSet<AnaValue>>
) -> HashSet<&'a AnaValue>
[src]

Find the nearest anahashes that exist in the model (computing anahashes in the neighbourhood if needed).

pub fn gather_instances(
    &self,
    nearest_anagrams: &HashSet<&AnaValue>,
    querystring: &[u8],
    query: &str,
    max_edit_distance: u8
) -> Vec<(VocabId, Distance)>
[src]

Gather instances and their edit distances, given a search string (normalised to the alphabet) and anagram hashes

pub fn score_and_rank(
    &self,
    instances: Vec<(VocabId, Distance)>,
    input: &str,
    max_matches: usize,
    score_threshold: f64
) -> Vec<(VocabId, f64)>
[src]

Rank and score all variants

pub fn rescore_confusables(
    &self,
    results: &mut Vec<(VocabId, f64)>,
    input: &str
)
[src]

Rescores the scored variants by testing against known confusables

pub fn compute_confusable_weight(&self, input: &str, candidate: VocabId) -> f64[src]

Compute the weight over known confusables. Should return 1.0 when there are no known confusables, < 1.0 when there are unfavourable confusables, and > 1.0 when there are favourable confusables.

pub fn add_to_reverse_index(
    &self,
    reverseindex: &mut ReverseIndex,
    input: &str,
    matched_vocab_id: VocabId,
    score: f64
)
[src]

Adds the input item to the reverse index, as instantiation of the given vocabulary id

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.