Struct caribon::Parser [−] [src]

#[repr(C)]
pub struct Parser { /* fields omitted */ }

Parser which can load a string, detect repetitions in it, and output an HTML file.

Methods

`impl Parser`
[src]

`fn list_languages() -> Vec<&'static str>`

Returns a vector containing all languages that are implemented.

These values are correct values to give to Parser::new.

`fn get_ignored_from_string(list: &str) -> Vec<String>`

Returns a vector of ignored words from a string.

Arguments

list – A space or comma separated string, containing words that should be ignored (i.e., don't count repetitions on them).

Example

let v = caribon::Parser::get_ignored_from_string("some, words; to ignore");
assert_eq!(v.len(), 4);

`fn get_ignored_from_lang(lang: &str) -> Vec<String>`

Returns a vector containing the default ignored words for this language.

`fn new(lang: &str) -> Result<Parser>`

Returns Ok(Parser) if the language is supported, Err(Error) otherwise.

Arguments

lang – The input text language. This will be used to create the stemmer; it also determines what list of ignored words to use. If lang == "no_stemmer", stemming is disabled

Example

let result = caribon::Parser::new("english");
assert!(result.is_ok());

let result = caribon::Parser::new("incorrect language");
assert!(result.is_err());

let result = caribon::Parser::new("no_stemmer");
assert!(result.is_ok());

`fn with_fuzzy(self, fuzzy: Option<f32>) -> Parser`

Sets fuzzy string matching (default None)

If set to Some(x), instead of just using equality to compare strings, the Parser will use the Levenshtein distance.

Arguments

fuzzy – None to deactivate fuzzy matching, or Some(x) to activate it. x must be between 0.0 and 1.0 as it corresponds to the relative distance, e.g. "Caribon" has a length of 7, so if fuzzy is set to Some(0.5), it will require a maximal distance of 3 (actually 3.5, but the distance is an integer).

Example

let mut parser = caribon::Parser::new("english").unwrap()
                                            .with_fuzzy(Some(0.25));
let mut ast = parser.tokenize("trust Rust").unwrap();
parser.detect_local(&mut ast, 1.9);
let result = parser.ast_to_markdown(&ast); // not the best output format, but easy to debug
assert_eq!(&result, "**trust** **Rust**"); // these two words do have some letters in common

`fn with_max_distance(self, max_dist: u32) -> Parser`

Sets max distance for repetitions (default 50).

Arguments

max_dist – A number corresponding to a number of words. If two occurrences of the same word are separated by more than this distance, they will not be counted as a repetition.

Examples

let mut parser = caribon::Parser::new("english").unwrap()
                                            .with_max_distance(20);
let mut ast = parser.tokenize("This word is repeated in a few words").unwrap();
parser.detect_local(&mut ast, 1.9);
let result = parser.ast_to_markdown(&ast); // not the best output format, but easy to debug
assert_eq!(&result, "This **word** is repeated in a few **words**"); //repetition detected

let mut parser = caribon::Parser::new("english").unwrap()
                                            .with_max_distance(2);
let mut ast = parser.tokenize("This word is repeated in a few words").unwrap();
parser.detect_local(&mut ast, 1.9);
let result = parser.ast_to_markdown(&ast); // not the best output format, but easy to debug
assert_eq!(&result, "This word is repeated in a few words"); // repetition not detected because of
                                                             // excessively low max_distance

`fn with_html(self, html: bool) -> Parser`

Sets HTML detection in input (default true).

You should set it to false if the input is plain text, and to true if it contains HTML.

`fn with_ignore_proper(self, proper: bool) -> Parser`

Sets whether repetition detection should ignore proper nouns (default false).

Basically, if set to true, words that start with a capital and are not at the beginning of a sentence won't be counted for repetitions. Currently, they are still counted if they are at the beginning of a sentence, but with most texts it won't be enough to highlight them as repetitions.

`fn with_ignored(self, list: &str) -> Parser`

Sets the ignored list with a list of words contained in the argument string.

This method replaces the default list of ignored words. If you want to add ignored words to the default list of a language, use with_more_ignored instead.

Arguments

list – A comma or whitespace separated list of words that should be ignored.

`fn with_more_ignored(self, list: &str) -> Parser`

Appends a list of words contained in the argument string to the list of ignored words

Arguments

list – A comma or whitespace separated list of words that should be ignored.

`fn tokenize(&mut self, s: &str) -> Result<Ast>`

Tokenize a string into a list of words.

This is the step that converts a string to some inner representation.

Arguments

s – The string to tokenize.

`fn detect_local(&mut self, ast: &mut Ast, threshold: f32)`

Detect the local number of repetitions.

For each word, the repetition value is set to the total number of occurrences of this word since there has been at least self.max_distance between two occurrences.

It is the default algorithm, and probably the one you want to use.

Arguments

ast – A mutable reference to an internal data structure returned by tokenize
threshold – The threshold to consider a repetition (e.g. 1.9)

Example

let mut parser = caribon::Parser::new("english").unwrap();
let mut ast = parser.tokenize("Testing whether this repetition detector works or does not work").unwrap();
parser.detect_local(&mut ast, 1.9);
let result = parser.ast_to_markdown(&ast); // not the most useful output format, but the easiest to debug
assert_eq!(&result, "Testing whether this repetition detector **works** or does not **work**");

`fn words_stats(&self, ast: &Ast) -> (HashMap<String, f32>, u32)`

Returns stats about the words

Arguments

ast – A reference to an Ast containing the list of words

Returns

This method returns a tuple:
* the first element is a hashmap between stemmed strings and the number of occurrences of this word
* the second element is the total number of (valid) words in the list (not counting whitespace, HTML tags...)

`fn detect_global(&self, ast: &mut Ast, threshold: f32)`

Detect the global number of repetitions.

For each word, the repetition value is set to the total number of occurrences of this word in the whole text, divided by the total number of words in the text.

Arguments

ast – A mutable reference to an Ast (the list of words), as returned by tokenize.
threshold – A threshold to highlight repetitions (e.g. 0.01)

`fn ast_to_terminal(&self, ast: &Ast) -> String`

Display the words to terminal, highlighting the repetitions.

Use terminal colour codes to highlight the repetitions

Arguments

ast – A reference to Ast, returned by tokenize and modified by detect_*

`fn ast_to_repetitions(&self, ast: &Ast) -> Vec<Repetition>`

Returns a list of repetitions found in the AST.

`fn ast_to_ispell(&self, ast: &Ast, list: bool, offset: usize) -> String`

Display repetitions in an ispell-compatible manner

This is used if you want to run caribon from a text editor, making it pretend to be ispell.

Will print as errors words whose repetition value is above threshold

Arguments

ast: the ast that must be printed
list: only display error (--list option)
offset: the offset to beginning of the line

`fn ast_to_markdown(&self, ast: &Ast) -> String`

Display the Ast to markdown, emphasizing the repetitions.

This is more limited than HTML or even terminal output, as it completely discards the colour information that has been set by the detect_* methods, but it might be useful if e.g. you want to generate some files later with Pandoc (or any other program).

Arguments

ast – An Ast containing repetitions.

`fn ast_to_html(&self, ast: &mut Ast, standalone: bool) -> String`

Display the Ast to HTML, highlighting the repetitions.

Uses some basic CSS/JS for underlining repetitions and highlighting the other occurrences of the word under the mouse.

Arguments

ast – An Ast containing repetitions.
standalone – If true, generate a standalone HTML file, else just an HTML fragment

Struct caribon::Parser [−] [src]

Methods

impl Parser[src]

fn list_languages() -> Vec<&'static str>

fn get_ignored_from_string(list: &str) -> Vec<String>

fn get_ignored_from_lang(lang: &str) -> Vec<String>

fn new(lang: &str) -> Result<Parser>

fn with_fuzzy(self, fuzzy: Option<f32>) -> Parser

fn with_max_distance(self, max_dist: u32) -> Parser

fn with_html(self, html: bool) -> Parser

fn with_ignore_proper(self, proper: bool) -> Parser

fn with_ignored(self, list: &str) -> Parser

fn with_more_ignored(self, list: &str) -> Parser

fn tokenize(&mut self, s: &str) -> Result<Ast>

fn detect_local(&mut self, ast: &mut Ast, threshold: f32)

fn words_stats(&self, ast: &Ast) -> (HashMap<String, f32>, u32)

fn detect_global(&self, ast: &mut Ast, threshold: f32)

fn ast_to_terminal(&self, ast: &Ast) -> String

fn ast_to_repetitions(&self, ast: &Ast) -> Vec<Repetition>

fn ast_to_ispell(&self, ast: &Ast, list: bool, offset: usize) -> String

fn ast_to_markdown(&self, ast: &Ast) -> String

fn ast_to_html(&self, ast: &mut Ast, standalone: bool) -> String

`impl Parser`
[src]

`fn list_languages() -> Vec<&'static str>`

`fn get_ignored_from_string(list: &str) -> Vec<String>`

`fn get_ignored_from_lang(lang: &str) -> Vec<String>`

`fn new(lang: &str) -> Result<Parser>`

`fn with_fuzzy(self, fuzzy: Option<f32>) -> Parser`

`fn with_max_distance(self, max_dist: u32) -> Parser`

`fn with_html(self, html: bool) -> Parser`

`fn with_ignore_proper(self, proper: bool) -> Parser`

`fn with_ignored(self, list: &str) -> Parser`

`fn with_more_ignored(self, list: &str) -> Parser`

`fn tokenize(&mut self, s: &str) -> Result<Ast>`

`fn detect_local(&mut self, ast: &mut Ast, threshold: f32)`

`fn words_stats(&self, ast: &Ast) -> (HashMap<String, f32>, u32)`

`fn detect_global(&self, ast: &mut Ast, threshold: f32)`

`fn ast_to_terminal(&self, ast: &Ast) -> String`

`fn ast_to_repetitions(&self, ast: &Ast) -> Vec<Repetition>`

`fn ast_to_ispell(&self, ast: &Ast, list: bool, offset: usize) -> String`

`fn ast_to_markdown(&self, ast: &Ast) -> String`

`fn ast_to_html(&self, ast: &mut Ast, standalone: bool) -> String`