1use crate::constants::*;
5use crate::util::{find_first, Label};
6use clap::ValueEnum;
7use derive_more::Display;
8use dotenvy::dotenv;
9use fancy_regex::Regex;
10use lazy_static::lazy_static;
11use std::collections::HashMap;
12use tracing::debug;
13use tracing::warn;
14
15lazy_static! {
16 pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
18 pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
20 pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
22 pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
24 pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
26 pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
28 pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
30 pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
32 pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
34 pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
36 pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
38 pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
40 pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
42 ("child", "children"),
43 ("cow", "cattle"),
44 ("foot", "feet"),
45 ("goose", "geese"),
46 ("man", "men"),
47 ("move", "moves"),
48 ("person", "people"),
49 ("radius", "radii"),
50 ("sex", "sexes"),
51 ("tooth", "teeth"),
52 ("woman", "women"),
53 ].into_iter().collect();
54 pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
58 pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
62 ("ayo", 2),
63 ("australian", 3),
64 ("dionysius", 5),
65 ("disbursement", 3),
66 ("discouragement", 4),
67 ("disenfranchisement", 5),
68 ("disengagement", 4),
69 ("disgraceful", 3),
70 ("diskette", 2),
71 ("displacement", 3),
72 ("distasteful", 3),
73 ("distinctiveness", 4),
74 ("distraction", 3),
75 ("geoffrion", 4),
76 ("mcquaid", 2),
77 ("mcquaide", 2),
78 ("mcquaig", 2),
79 ("mcquain", 2),
80 ("nonbusiness", 3),
81 ("nonetheless", 3),
82 ("nonmanagement", 4),
83 ("outplacement", 3),
84 ("outrageously", 4),
85 ("postponement", 3),
86 ("preemption", 3),
87 ("preignition", 4),
88 ("preinvasion", 4),
89 ("preisler", 3),
90 ("preoccupation", 5),
91 ("prevette", 2),
92 ("probusiness", 3),
93 ("procurement", 3),
94 ("pronouncement", 3),
95 ("sidewater", 3),
96 ("sidewinder", 3),
97 ("ungerer", 3),
98 ].into_iter().collect();
99 pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
101 ("abalone", 4),
102 ("abare", 3),
103 ("abbruzzese", 4),
104 ("abed", 2),
105 ("aborigine", 5),
106 ("abruzzese", 4),
107 ("acreage", 3),
108 ("adame", 3),
109 ("adieu", 2),
110 ("adobe", 3),
111 ("anemone", 4),
112 ("anyone", 3),
113 ("apache", 3),
114 ("aphrodite", 4),
115 ("apostrophe", 4),
116 ("ariadne", 4),
117 ("cafe", 2),
118 ("café", 2),
119 ("calliope", 4),
120 ("catastrophe", 4),
121 ("chile", 2),
122 ("chloe", 2),
123 ("circe", 2),
124 ("cliche", 2),
125 ("cliché", 2),
126 ("contrariety", 4),
127 ("coyote", 3),
128 ("daphne", 2),
129 ("epitome", 4),
130 ("eurydice", 4),
131 ("euterpe", 3),
132 ("every", 2),
133 ("everywhere", 3),
134 ("forever", 3),
135 ("gethsemane", 4),
136 ("guacamole", 4),
137 ("hermione", 4),
138 ("hyperbole", 4),
139 ("jesse", 2),
140 ("jukebox", 2),
141 ("karate", 3),
142 ("machete", 3),
143 ("maybe", 2),
144 ("naive", 2),
145 ("newlywed", 3),
146 ("ninety", 2),
147 ("penelope", 4),
148 ("people", 2),
149 ("persephone", 4),
150 ("phoebe", 2),
151 ("pulse", 1),
152 ("queue", 1),
153 ("recipe", 3),
154 ("reptilian", 4),
155 ("resumé", 2),
156 ("riverbed", 3),
157 ("scotia", 3),
158 ("sesame", 3),
159 ("shoreline", 2),
160 ("simile", 3),
161 ("snuffleupagus", 5),
162 ("sometimes", 2),
163 ("syncope", 3),
164 ("tamale", 3),
165 ("waterbed", 3),
166 ("wednesday", 2),
167 ("viceroyship", 3),
168 ("yosemite", 4),
169 ("zoë", 2),
170 ].into_iter().collect();
171}
/// Ordered (pattern, replacement) suffix rules mapping an English plural to
/// its singular. The FIRST matching pattern wins, so specific rules must
/// precede the generic trailing-`s` rule at the end.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    // FIX: was `([m|l])ice` — inside a character class `|` is a literal, so
    // the old class also matched "|ice". Only m/l are intended
    // ("mice" -> "mouse", "lice" -> "louse").
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    // Identity rewrites: these stop the generic `s$` rule from firing.
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    // Fallback: strip a plain trailing `s`.
    (r#"s$"#, r#""#),
];
/// Nouns whose singular and plural forms coincide (or that are treated as
/// uncountable); `singular_form` returns these unchanged.
///
/// NOTE(review): entries are only loosely alphabetized; lookups use a linear
/// `contains`, so ordering does not affect behavior.
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Supported readability metrics, selectable on the CLI via `ValueEnum`.
/// `Display` renders the short lowercase abbreviation given in `#[display]`.
// Plain `//` comments are used on variants so clap's generated help text
// (which picks up `///` docs on ValueEnum variants) is not changed.
#[derive(Clone, Copy, Debug, Default, Display, PartialEq, ValueEnum)]
pub enum ReadabilityType {
    // Automated Readability Index — the default metric.
    #[default]
    #[display("ari")]
    ARI,
    // Coleman-Liau Index.
    #[display("cli")]
    CLI,
    // Flesch-Kincaid Grade Level.
    #[display("fkgl")]
    FKGL,
    // Flesch Reading Ease Score.
    #[display("fres")]
    FRES,
    // Gunning Fog Index.
    #[display("gfi")]
    GFI,
    // Läsbarhetsindex.
    #[display("lix")]
    Lix,
    // Simple Measure of Gobbledygook.
    #[display("smog")]
    SMOG,
}
342impl ReadabilityType {
343 pub fn calculate(self, text: &str) -> f64 {
345 match self {
346 | ReadabilityType::ARI => automated_readability_index(text),
347 | ReadabilityType::CLI => coleman_liau_index(text),
348 | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
349 | ReadabilityType::FRES => flesch_reading_ease_score(text),
350 | ReadabilityType::GFI => gunning_fog_index(text),
351 | ReadabilityType::Lix => lix(text),
352 | ReadabilityType::SMOG => smog(text),
353 }
354 }
355 pub fn from_string(value: &str) -> ReadabilityType {
357 match value.to_lowercase().replace("-", " ").as_str() {
358 | "ari" | "automated readability index" => ReadabilityType::ARI,
359 | "cli" | "coleman liau index" => ReadabilityType::CLI,
360 | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
361 | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
362 | "gfi" | "gunning fog index" => ReadabilityType::GFI,
363 | "lix" => ReadabilityType::Lix,
364 | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
365 | _ => {
366 warn!(value, "=> {} Unknown Readability Type", Label::using());
367 ReadabilityType::default()
368 }
369 }
370 }
371 pub fn maximum_allowed(self) -> f64 {
373 match dotenv() {
374 | Ok(_) => {
375 let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
376 let pair = match self {
377 | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
378 | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
379 | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
380 | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
381 | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
382 | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
383 | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
384 };
385 match pair {
386 | Some((_, value)) => value.parse::<f64>().unwrap(),
387 | None => MAX_ALLOWED_ARI,
388 }
389 }
390 | Err(_) => match self {
391 | ReadabilityType::ARI => MAX_ALLOWED_ARI,
392 | ReadabilityType::CLI => MAX_ALLOWED_CLI,
393 | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
394 | ReadabilityType::FRES => MAX_ALLOWED_FRES,
395 | ReadabilityType::GFI => MAX_ALLOWED_GFI,
396 | ReadabilityType::Lix => MAX_ALLOWED_LIX,
397 | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
398 },
399 }
400 }
401}
402pub fn complex_word_count(text: &str) -> u32 {
406 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
407}
/// Number of ASCII letters (`a-z`, `A-Z`) in `text`.
///
/// The previous version ran the `NON_ALPHABETIC` regex against every single
/// character (and redundantly re-tested whitespace, which the regex already
/// excludes); `char::is_ascii_alphabetic` is exactly the `[a-zA-Z]` test.
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Number of words in `text` longer than six bytes (LIX "long words").
pub fn long_word_count(text: &str) -> u32 {
    let long = text.split_whitespace().filter(|token| token.len() > 6);
    long.count() as u32
}
/// Number of sentences in `text`, where `.`, `!` and `?` all terminate a
/// sentence.
///
/// The previous version split only on `.`, so "Hi! How are you?" counted as
/// a single sentence and skewed every metric; it also counted whitespace-only
/// fragments (e.g. between ". .") as sentences.
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c| matches!(c, '.' | '!' | '?'))
        .filter(|fragment| !fragment.trim().is_empty())
        .count() as u32
}
/// Splits `text` on Unicode whitespace into owned word tokens.
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(ToOwned::to_owned).collect()
}
/// Number of whitespace-separated words in `text`.
pub fn word_count(text: &str) -> u32 {
    // Counting the splits directly avoids materializing the word Vec.
    text.split_whitespace().count() as u32
}
436pub fn automated_readability_index(text: &str) -> f64 {
445 let letters = letter_count(text);
446 let words = word_count(text);
447 let sentences = sentence_count(text);
448 debug!(letters, words, sentences, "=> {}", Label::using());
449 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
450 format!("{score:.2}").parse().unwrap()
451}
452pub fn coleman_liau_index(text: &str) -> f64 {
456 let letters = letter_count(text);
457 let words = word_count(text);
458 let sentences = sentence_count(text);
459 debug!(letters, words, sentences, "=> {}", Label::using());
460 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
461 format!("{score:.2}").parse().unwrap()
462}
463pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
472 let words = word_count(text);
473 let sentences = sentence_count(text);
474 let syllables = syllable_count(text);
475 debug!(words, sentences, syllables, "=> {}", Label::using());
476 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
477 format!("{score:.2}").parse().unwrap()
478}
479pub fn flesch_reading_ease_score(text: &str) -> f64 {
487 let words = word_count(text);
488 let sentences = sentence_count(text);
489 let syllables = syllable_count(text);
490 debug!(words, sentences, syllables, "=> {}", Label::using());
491 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
492 format!("{score:.2}").parse().unwrap()
493}
494pub fn gunning_fog_index(text: &str) -> f64 {
502 let words = word_count(text);
503 let complex_words = complex_word_count(text);
504 let sentences = sentence_count(text);
505 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
506 format!("{score:.2}").parse().unwrap()
507}
508pub fn lix(text: &str) -> f64 {
518 let words = word_count(text);
519 let sentences = sentence_count(text);
520 let long_words = long_word_count(text);
521 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
522 format!("{score:.2}").parse().unwrap()
523}
524pub fn smog(text: &str) -> f64 {
534 let sentences = sentence_count(text);
535 let complex_words = complex_word_count(text);
536 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
537 format!("{score:.2}").parse().unwrap()
538}
539pub fn singular_form(word: &str) -> String {
543 match word.to_lowercase().as_str() {
544 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
545 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
546 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
547 | Some(value) => value.to_string(),
548 | None => value.to_string(),
549 },
550 | value => {
551 let pair = PLURAL_TO_SINGULAR
552 .iter()
553 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
554 | Ok(true) => true,
555 | Ok(false) | Err(_) => false,
556 });
557 match pair {
558 | Some((pattern, replacement)) => {
559 debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
560 let re = Regex::new(pattern).unwrap();
561 re.replace_all(value, *replacement).to_string()
562 }
563 | None => value.to_string(),
564 }
565 }
566 }
567}
/// Total syllables across all whitespace-separated, tokenized words in
/// `text`: exact lookup tables are consulted first, then a regex heuristic.
pub fn syllable_count(text: &str) -> usize {
    // Syllable count for one token (already lower-cased by `tokenize`).
    fn syllables(word: String) -> usize {
        // The singular form is tried as a fallback lookup key below.
        let singular = singular_form(&word);
        match word.as_str() {
            | "" => 0,
            // NOTE(review): `len()` is bytes, not chars — assumes tokens are
            // ASCII by this point (tokenize strips non-ASCII); confirm.
            | value if value.len() < 3 => 1,
            // Hard overrides, by the word itself and then by its singular.
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Known heuristic miscounts, same two-step lookup.
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Heuristic path: credit and strip known affixes, apply
            // correction patterns, then count the remaining vowel groups.
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // Affixes are credited up front and removed so the vowel
                // scan below does not double-count them.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Patterns the vowel-group rule overcounts (subtract) or
                // undercounts (add back).
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // One syllable per remaining vowel group: VOWEL matches the
                // consonant runs, so splitting on it leaves vowel clusters.
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
/// Splits one whitespace-delimited chunk into lower-case, ASCII-letter-only
/// tokens. `é`/`ë` are rewritten to a separate `e` token (so e.g. "café"
/// yields ["caf", "e"]), and existing hyphens also act as token boundaries.
pub fn tokenize(value: &str) -> Vec<String> {
    let normalized = value.replace("é", "-e").replace("ë", "-e");
    normalized
        .split('-')
        .map(|part| {
            // Keep only a-z / A-Z, then lower-case the remainder.
            let letters: String = part.chars().filter(char::is_ascii_alphabetic).collect();
            letters.to_lowercase()
        })
        .collect()
}