1use crate::constants::*;
5use crate::util::{find_first, Label};
6use clap::ValueEnum;
7use derive_more::Display;
8use dotenvy::dotenv;
9use fancy_regex::Regex;
10use lazy_static::lazy_static;
11use std::collections::HashMap;
12use tracing::debug;
13use tracing::warn;
14
15lazy_static! {
16 pub static ref APOSTROPHE: Regex = Regex::new(r#"['’]"#).unwrap();
18 pub static ref NON_ALPHABETIC: Regex = Regex::new(r#"[^a-zA-Z]"#).unwrap();
20 pub static ref VOWEL: Regex = Regex::new(r#"[^aeiouy]+"#).unwrap();
22 pub static ref SINGLE: Regex = Regex::new(r#"^(?:un|fore|ware|none?|out|post|sub|pre|pro|dis|side|some)|(?:ly|less|some|ful|ers?|ness|cians?|ments?|ettes?|villes?|ships?|sides?|ports?|shires?|[gnst]ion(?:ed|s)?)$"#).unwrap();
24 pub static ref DOUBLE: Regex = Regex::new(r#"^(?:above|anti|ante|counter|hyper|afore|agri|infra|intra|inter|over|semi|ultra|under|extra|dia|micro|mega|kilo|pico|nano|macro|somer)|(?:fully|berry|woman|women|edly|union|((?:[bcdfghjklmnpqrstvwxz])|[aeiou])ye?ing)$"#).unwrap();
26 pub static ref TRIPLE: Regex = Regex::new(r#"(creations?|ology|ologist|onomy|onomist)$"#).unwrap();
28 pub static ref SINGLE_SYLLABIC_ONE : Regex = Regex::new(r#"awe($|d|so)|cia(?:l|$)|tia|cius|cious|[^aeiou]giu|[aeiouy][^aeiouy]ion|iou|sia$|eous$|[oa]gue$|.[^aeiuoycgltdb]{2,}ed$|.ely$|^jua|uai|eau|^busi$|(?:[aeiouy](?:[bcfgklmnprsvwxyz]|ch|dg|g[hn]|lch|l[lv]|mm|nch|n[cgn]|r[bcnsv]|squ|s[chkls]|th)ed$)|(?:[aeiouy](?:[bdfklmnprstvy]|ch|g[hn]|lch|l[lv]|mm|nch|nn|r[nsv]|squ|s[cklst]|th)es$)"#).unwrap();
30 pub static ref SINGLE_SYLLABIC_TWO : Regex = Regex::new(r#"[aeiouy](?:[bcdfgklmnprstvyz]|ch|dg|g[hn]|l[lv]|mm|n[cgns]|r[cnsv]|squ|s[cklst]|th)e$"#).unwrap();
32 pub static ref DOUBLE_SYLLABIC_ONE: Regex = Regex::new(r#"(?:([^aeiouy])\\1l|[^aeiouy]ie(?:r|s?t)|[aeiouym]bl|eo|ism|asm|thm|dnt|snt|uity|dea|gean|oa|ua|react?|orbed|shred|eings?|[aeiouy]sh?e[rs])$"#).unwrap();
34 pub static ref DOUBLE_SYLLABIC_TWO: Regex = Regex::new(r#"creat(?!u)|[^gq]ua[^auieo]|[aeiou]{3}|^(?:ia|mc|coa[dglx].)|^re(app|es|im|us)|(th|d)eist"#).unwrap();
36 pub static ref DOUBLE_SYLLABIC_THREE: Regex = Regex::new(r#"[^aeiou]y[ae]|[^l]lien|riet|dien|iu|io|ii|uen|[aeilotu]real|real[aeilotu]|iell|eo[^aeiou]|[aeiou]y[aeiou]"#).unwrap();
38 pub static ref DOUBLE_SYLLABIC_FOUR: Regex = Regex::new(r#"[^s]ia"#).unwrap();
40 pub static ref IRREGULAR_NOUNS: HashMap<&'static str, &'static str> = vec![
42 ("child", "children"),
43 ("cow", "cattle"),
44 ("foot", "feet"),
45 ("goose", "geese"),
46 ("man", "men"),
47 ("move", "moves"),
48 ("person", "people"),
49 ("radius", "radii"),
50 ("sex", "sexes"),
51 ("tooth", "teeth"),
52 ("woman", "women"),
53 ].into_iter().collect();
54 pub static ref IRREGULAR_NOUNS_INVERTED: HashMap<&'static str, &'static str> = IRREGULAR_NOUNS.clone().into_iter().map(|(k, v)| (v, k)).collect();
58 pub static ref NEED_TO_BE_FIXED: HashMap<&'static str, usize> = vec![
62 ("ayo", 2),
63 ("australian", 3),
64 ("dionysius", 5),
65 ("disbursement", 3),
66 ("discouragement", 4),
67 ("disenfranchisement", 5),
68 ("disengagement", 4),
69 ("disgraceful", 3),
70 ("diskette", 2),
71 ("displacement", 3),
72 ("distasteful", 3),
73 ("distinctiveness", 4),
74 ("distraction", 3),
75 ("geoffrion", 4),
76 ("mcquaid", 2),
77 ("mcquaide", 2),
78 ("mcquaig", 2),
79 ("mcquain", 2),
80 ("nonbusiness", 3),
81 ("nonetheless", 3),
82 ("nonmanagement", 4),
83 ("outplacement", 3),
84 ("outrageously", 4),
85 ("postponement", 3),
86 ("preemption", 3),
87 ("preignition", 4),
88 ("preinvasion", 4),
89 ("preisler", 3),
90 ("preoccupation", 5),
91 ("prevette", 2),
92 ("probusiness", 3),
93 ("procurement", 3),
94 ("pronouncement", 3),
95 ("sidewater", 3),
96 ("sidewinder", 3),
97 ("ungerer", 3),
98 ].into_iter().collect();
99 pub static ref PROBLEMATIC_WORDS: HashMap<&'static str, usize> = vec![
101 ("abalone", 4),
102 ("abare", 3),
103 ("abbruzzese", 4),
104 ("abed", 2),
105 ("aborigine", 5),
106 ("abruzzese", 4),
107 ("acreage", 3),
108 ("adame", 3),
109 ("adieu", 2),
110 ("adobe", 3),
111 ("anemone", 4),
112 ("anyone", 3),
113 ("apache", 3),
114 ("aphrodite", 4),
115 ("apostrophe", 4),
116 ("ariadne", 4),
117 ("cafe", 2),
118 ("café", 2),
119 ("calliope", 4),
120 ("catastrophe", 4),
121 ("chile", 2),
122 ("chloe", 2),
123 ("circe", 2),
124 ("cliche", 2),
125 ("cliché", 2),
126 ("contrariety", 4),
127 ("coyote", 3),
128 ("daphne", 2),
129 ("epitome", 4),
130 ("eurydice", 4),
131 ("euterpe", 3),
132 ("every", 2),
133 ("everywhere", 3),
134 ("forever", 3),
135 ("gethsemane", 4),
136 ("guacamole", 4),
137 ("hermione", 4),
138 ("hyperbole", 4),
139 ("jesse", 2),
140 ("jukebox", 2),
141 ("karate", 3),
142 ("machete", 3),
143 ("maybe", 2),
144 ("naive", 2),
145 ("newlywed", 3),
146 ("ninety", 2),
147 ("penelope", 4),
148 ("people", 2),
149 ("persephone", 4),
150 ("phoebe", 2),
151 ("pulse", 1),
152 ("queue", 1),
153 ("recipe", 3),
154 ("reptilian", 4),
155 ("resumé", 2),
156 ("riverbed", 3),
157 ("scotia", 3),
158 ("sesame", 3),
159 ("shoreline", 2),
160 ("simile", 3),
161 ("snuffleupagus", 5),
162 ("sometimes", 2),
163 ("syncope", 3),
164 ("tamale", 3),
165 ("waterbed", 3),
166 ("wednesday", 2),
167 ("viceroyship", 3),
168 ("yosemite", 4),
169 ("zoë", 2),
170 ].into_iter().collect();
171}
/// Ordered (pattern, replacement) suffix rules mapping an English plural to
/// its singular. The FIRST matching pattern wins, so specific rules must
/// precede the generic trailing-`s` rule at the end.
const PLURAL_TO_SINGULAR: [(&str, &str); 28] = [
    (r#"(quiz)zes$"#, r#"${1}"#),
    (r#"(matr)ices$"#, r#"${1}ix"#),
    (r#"(vert|ind)ices$"#, r#"${1}ex"#),
    (r#"^(ox)en$"#, r#"${1}"#),
    (r#"(alias)es$"#, r#"${1}"#),
    (r#"(octop|vir)i$"#, r#"${1}us"#),
    (r#"(cris|ax|test)es$"#, r#"${1}is"#),
    (r#"(shoe)s$"#, r#"${1}"#),
    (r#"(o)es$"#, r#"${1}"#),
    (r#"(bus)es$"#, r#"${1}"#),
    // FIX: was `([m|l])ice` — inside a character class `|` is a literal, so
    // the old class also matched "|ice". Only m/l are intended
    // ("mice" -> "mouse", "lice" -> "louse").
    (r#"([ml])ice$"#, r#"${1}ouse"#),
    (r#"(x|ch|ss|sh)es$"#, r#"${1}"#),
    (r#"(m)ovies$"#, r#"${1}ovie"#),
    // Identity rewrites: these stop the generic `s$` rule from firing.
    (r#"(s)eries$"#, r#"${1}eries"#),
    (r#"([^aeiouy]|qu)ies$"#, r#"${1}y"#),
    (r#"([lr])ves$"#, r#"${1}f"#),
    (r#"(tive)s$"#, r#"${1}"#),
    (r#"(hive)s$"#, r#"${1}"#),
    (r#"(li|wi|kni)ves$"#, r#"${1}fe"#),
    (r#"(shea|loa|lea|thie)ves$"#, r#"${1}f"#),
    (r#"(^analy)ses$"#, r#"${1}sis"#),
    (r#"((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"#, r#"${1}${2}sis"#),
    (r#"([ti])a$"#, r#"${1}um"#),
    (r#"(n)ews$"#, r#"${1}ews"#),
    (r#"(h|bl)ouses$"#, r#"${1}ouse"#),
    (r#"(corpse)s$"#, r#"${1}"#),
    (r#"(us)es$"#, r#"${1}"#),
    // Fallback: strip a plain trailing `s`.
    (r#"s$"#, r#""#),
];
/// Nouns whose singular and plural forms coincide (or that are treated as
/// uncountable); `singular_form` returns these unchanged.
///
/// NOTE(review): entries are only loosely alphabetized; lookups use a linear
/// `contains`, so ordering does not affect behavior.
pub const SAME_SINGULAR_PLURAL: [&str; 110] = [
    "accommodation",
    "advice",
    "alms",
    "aircraft",
    "aluminum",
    "barracks",
    "bison",
    "binoculars",
    "bourgeois",
    "breadfruit",
    "buffalo",
    "cannon",
    "caribou",
    "chalk",
    "chassis",
    "chinos",
    "clippers",
    "clothing",
    "cod",
    "concrete",
    "corps",
    "correspondence",
    "crossroads",
    "data",
    "deer",
    "doldrums",
    "dungarees",
    "education",
    "eggfruit",
    "elk",
    "equipment",
    "eyeglasses",
    "fish",
    "flares",
    "flour",
    "food",
    "fruit",
    "furniture",
    "gallows",
    "goldfish",
    "grapefruit",
    "greenfly",
    "grouse",
    "haddock",
    "halibut",
    "head",
    "headquarters",
    "help",
    "homework",
    "hovercraft",
    "ides",
    "information",
    "insignia",
    "jackfruit",
    "jeans",
    "knickers",
    "knowledge",
    "kudos",
    "leggings",
    "lego",
    "luggage",
    "mathematics",
    "money",
    "moose",
    "monkfish",
    "mullet",
    "nailclippers",
    "news",
    "nitrogen",
    "offspring",
    "oxygen",
    "pants",
    "pyjamas",
    "passionfruit",
    "pike",
    "pliers",
    "police",
    "premises",
    "reindeer",
    "rendezvous",
    "rice",
    "salmon",
    "scissors",
    "series",
    "shambles",
    "sheep",
    "shellfish",
    "shorts",
    "shrimp",
    "smithereens",
    "spacecraft",
    "species",
    "squid",
    "staff",
    "starfruit",
    "statistics",
    "stone",
    "sugar",
    "swine",
    "tights",
    "tongs",
    "traffic",
    "trousers",
    "trout",
    "tuna",
    "tweezers",
    "wheat",
    "whitebait",
    "wood",
    "you",
];
/// Supported readability metrics, selectable on the CLI via `ValueEnum`.
/// `Display` renders the short lowercase abbreviation given in `#[display]`.
// Plain `//` comments are used on variants so clap's generated help text
// (which picks up `///` docs on ValueEnum variants) is not changed.
#[derive(Clone, Copy, Debug, Default, Display, PartialEq, ValueEnum)]
pub enum ReadabilityType {
    // Automated Readability Index — the default metric.
    #[default]
    #[display("ari")]
    ARI,
    // Coleman-Liau Index.
    #[display("cli")]
    CLI,
    // Flesch-Kincaid Grade Level.
    #[display("fkgl")]
    FKGL,
    // Flesch Reading Ease Score.
    #[display("fres")]
    FRES,
    // Gunning Fog Index.
    #[display("gfi")]
    GFI,
    // Läsbarhetsindex.
    #[display("lix")]
    Lix,
    // Simple Measure of Gobbledygook.
    #[display("smog")]
    SMOG,
}
342impl ReadabilityType {
343 pub fn calculate(self, text: &str) -> f64 {
345 match self {
346 | ReadabilityType::ARI => automated_readability_index(text),
347 | ReadabilityType::CLI => coleman_liau_index(text),
348 | ReadabilityType::FKGL => flesch_kincaid_grade_level(text),
349 | ReadabilityType::FRES => flesch_reading_ease_score(text),
350 | ReadabilityType::GFI => gunning_fog_index(text),
351 | ReadabilityType::Lix => lix(text),
352 | ReadabilityType::SMOG => smog(text),
353 }
354 }
355 pub fn from_string(value: &str) -> ReadabilityType {
357 match value.to_lowercase().replace("-", " ").as_str() {
358 | "ari" | "automated readability index" => ReadabilityType::ARI,
359 | "cli" | "coleman liau index" => ReadabilityType::CLI,
360 | "fkgl" | "flesch kincaid grade level" => ReadabilityType::FKGL,
361 | "fres" | "flesch reading ease score" => ReadabilityType::FRES,
362 | "gfi" | "gunning fog index" => ReadabilityType::GFI,
363 | "lix" => ReadabilityType::Lix,
364 | "smog" | "simple measure of gobbledygook" => ReadabilityType::SMOG,
365 | _ => {
366 warn!(value, "=> {} Unknown Readability Type", Label::using());
367 ReadabilityType::default()
368 }
369 }
370 }
371 pub fn maximum_allowed(self) -> f64 {
373 match dotenv() {
374 | Ok(_) => {
375 let variables = dotenvy::vars().collect::<Vec<(String, String)>>();
376 let pair = match self {
377 | ReadabilityType::ARI => find_first(variables, "MAX_ALLOWED_ARI"),
378 | ReadabilityType::CLI => find_first(variables, "MAX_ALLOWED_CLI"),
379 | ReadabilityType::FKGL => find_first(variables, "MAX_ALLOWED_FKGL"),
380 | ReadabilityType::FRES => find_first(variables, "MAX_ALLOWED_FRES"),
381 | ReadabilityType::GFI => find_first(variables, "MAX_ALLOWED_GFI"),
382 | ReadabilityType::Lix => find_first(variables, "MAX_ALLOWED_LIX"),
383 | ReadabilityType::SMOG => find_first(variables, "MAX_ALLOWED_SMOG"),
384 };
385 match pair {
386 | Some((_, value)) => value.parse::<f64>().unwrap(),
387 | None => MAX_ALLOWED_ARI,
388 }
389 }
390 | Err(_) => match self {
391 | ReadabilityType::ARI => MAX_ALLOWED_ARI,
392 | ReadabilityType::CLI => MAX_ALLOWED_CLI,
393 | ReadabilityType::FKGL => MAX_ALLOWED_FKGL,
394 | ReadabilityType::FRES => MAX_ALLOWED_FRES,
395 | ReadabilityType::GFI => MAX_ALLOWED_GFI,
396 | ReadabilityType::Lix => MAX_ALLOWED_LIX,
397 | ReadabilityType::SMOG => MAX_ALLOWED_SMOG,
398 },
399 }
400 }
401}
402pub fn complex_word_count(text: &str) -> u32 {
406 words(text).iter().filter(|word| syllable_count(word) > 2).count() as u32
407}
/// Number of ASCII letters (`a-z`, `A-Z`) in `text`.
///
/// The previous version ran the `NON_ALPHABETIC` regex against every single
/// character (and redundantly re-tested whitespace, which the regex already
/// excludes); `char::is_ascii_alphabetic` is exactly the `[a-zA-Z]` test.
pub fn letter_count(text: &str) -> u32 {
    text.chars().filter(char::is_ascii_alphabetic).count() as u32
}
/// Number of words in `text` longer than six bytes (LIX "long words").
pub fn long_word_count(text: &str) -> u32 {
    let long = text.split_whitespace().filter(|token| token.len() > 6);
    long.count() as u32
}
/// Number of sentences in `text`, where `.`, `!` and `?` all terminate a
/// sentence.
///
/// The previous version split only on `.`, so "Hi! How are you?" counted as
/// a single sentence and skewed every metric; it also counted whitespace-only
/// fragments (e.g. between ". .") as sentences.
pub fn sentence_count(text: &str) -> u32 {
    text.split(|c| matches!(c, '.' | '!' | '?'))
        .filter(|fragment| !fragment.trim().is_empty())
        .count() as u32
}
/// Splits `text` on Unicode whitespace into owned word tokens.
pub fn words(text: &str) -> Vec<String> {
    text.split_whitespace().map(ToOwned::to_owned).collect()
}
/// Number of whitespace-separated words in `text`.
pub fn word_count(text: &str) -> u32 {
    // Counting the splits directly avoids materializing the word Vec.
    text.split_whitespace().count() as u32
}
436pub fn automated_readability_index(text: &str) -> f64 {
445 let letters = letter_count(text);
446 let words = word_count(text);
447 let sentences = sentence_count(text);
448 debug!(letters, words, sentences, "=> {}", Label::using());
449 let score = 4.71 * (letters as f64 / words as f64) + 0.5 * (words as f64 / sentences as f64) - 21.43;
450 format!("{score:.2}").parse().unwrap()
451}
452pub fn coleman_liau_index(text: &str) -> f64 {
456 let letters = letter_count(text);
457 let words = word_count(text);
458 let sentences = sentence_count(text);
459 debug!(letters, words, sentences, "=> {}", Label::using());
460 let score = (0.0588 * 100.0 * (letters as f64 / words as f64)) - (0.296 * 100.0 * (sentences as f64 / words as f64)) - 15.8;
461 format!("{score:.2}").parse().unwrap()
462}
463pub fn flesch_kincaid_grade_level(text: &str) -> f64 {
472 let words = word_count(text);
473 let sentences = sentence_count(text);
474 let syllables = syllable_count(text);
475 debug!(words, sentences, syllables, "=> {}", Label::using());
476 let score = 0.39 * (words as f64 / sentences as f64) + 11.8 * (syllables as f64 / words as f64) - 15.59;
477 format!("{score:.2}").parse().unwrap()
478}
479pub fn flesch_reading_ease_score(text: &str) -> f64 {
487 let words = word_count(text);
488 let sentences = sentence_count(text);
489 let syllables = syllable_count(text);
490 debug!(words, sentences, syllables, "=> {}", Label::using());
491 let score = 206.835 - (1.015 * words as f64 / sentences as f64) - (84.6 * syllables as f64 / words as f64);
492 format!("{score:.2}").parse().unwrap()
493}
494pub fn gunning_fog_index(text: &str) -> f64 {
502 let words = word_count(text);
503 let complex_words = complex_word_count(text);
504 let sentences = sentence_count(text);
505 let score = 0.4 * ((words as f64 / sentences as f64) + (100.0 * (complex_words as f64 / words as f64)));
506 format!("{score:.2}").parse().unwrap()
507}
508pub fn lix(text: &str) -> f64 {
518 let words = word_count(text);
519 let sentences = sentence_count(text);
520 let long_words = long_word_count(text);
521 let score = (words as f64 / sentences as f64) + 100.0 * (long_words as f64 / words as f64);
522 format!("{score:.2}").parse().unwrap()
523}
524pub fn smog(text: &str) -> f64 {
534 let sentences = sentence_count(text);
535 let complex_words = complex_word_count(text);
536 let score = 1.0430 * (30.0 * (complex_words as f64 / sentences as f64)).sqrt() + 3.1291;
537 format!("{score:.2}").parse().unwrap()
538}
539pub fn singular_form(word: &str) -> String {
543 match word.to_lowercase().as_str() {
544 | value if SAME_SINGULAR_PLURAL.contains(&value) => value.to_string(),
545 | value if IRREGULAR_NOUNS.contains_key(&value) => value.to_string(),
546 | value if IRREGULAR_NOUNS_INVERTED.contains_key(&value) => match IRREGULAR_NOUNS_INVERTED.get(value) {
547 | Some(value) => value.to_string(),
548 | None => value.to_string(),
549 },
550 | value => {
551 let pair = PLURAL_TO_SINGULAR
552 .iter()
553 .find(|(pattern, _)| match Regex::new(pattern).unwrap().is_match(value) {
554 | Ok(true) => true,
555 | Ok(false) | Err(_) => false,
556 });
557 match pair {
558 | Some((pattern, replacement)) => {
559 debug!(pattern, replacement, value, "=> {} Singular form conversion", Label::using());
560 let re = Regex::new(pattern).unwrap();
561 re.replace_all(value, *replacement).to_string()
562 }
563 | None => value.to_string(),
564 }
565 }
566 }
567}
/// Total syllables across all whitespace-separated, tokenized words in
/// `text`: exact lookup tables are consulted first, then a regex heuristic.
pub fn syllable_count(text: &str) -> usize {
    // Syllable count for one token (already lower-cased by `tokenize`).
    fn syllables(word: String) -> usize {
        // The singular form is tried as a fallback lookup key below.
        let singular = singular_form(&word);
        match word.as_str() {
            | "" => 0,
            // NOTE(review): `len()` is bytes, not chars — assumes tokens are
            // ASCII by this point (tokenize strips non-ASCII); confirm.
            | value if value.len() < 3 => 1,
            // Hard overrides, by the word itself and then by its singular.
            | value if PROBLEMATIC_WORDS.contains_key(value) => match PROBLEMATIC_WORDS.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if PROBLEMATIC_WORDS.contains_key(&singular.as_str()) => match PROBLEMATIC_WORDS.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Known heuristic miscounts, same two-step lookup.
            | value if NEED_TO_BE_FIXED.contains_key(value) => match NEED_TO_BE_FIXED.get(value) {
                | Some(x) => *x,
                | None => 0,
            },
            | _ if NEED_TO_BE_FIXED.contains_key(&singular.as_str()) => match NEED_TO_BE_FIXED.get(singular.as_str()) {
                | Some(x) => *x,
                | None => 0,
            },
            // Heuristic path: credit and strip known affixes, apply
            // correction patterns, then count the remaining vowel groups.
            | _ => {
                let mut input = word;
                let mut count: isize = 0;
                // Affixes are credited up front and removed so the vowel
                // scan below does not double-count them.
                count += 3 * TRIPLE.find_iter(&input).count() as isize;
                input = TRIPLE.replace_all(&input, "").to_string();
                count += 2 * DOUBLE.find_iter(&input).count() as isize;
                input = DOUBLE.replace_all(&input, "").to_string();
                count += SINGLE.find_iter(&input).count() as isize;
                input = SINGLE.replace_all(&input, "").to_string();
                // Patterns the vowel-group rule overcounts (subtract) or
                // undercounts (add back).
                count -= SINGLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count -= SINGLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_ONE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_TWO.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_THREE.find_iter(&input).count() as isize;
                count += DOUBLE_SYLLABIC_FOUR.find_iter(&input).count() as isize;
                // One syllable per remaining vowel group: VOWEL matches the
                // consonant runs, so splitting on it leaves vowel clusters.
                count += VOWEL.split(&input).filter(|x| !x.as_ref().unwrap().is_empty()).count() as isize;
                count as usize
            }
        }
    }
    let tokens = text.split_whitespace().flat_map(tokenize).collect::<Vec<String>>();
    tokens.into_iter().map(syllables).sum()
}
/// Splits one whitespace-delimited chunk into lower-case, ASCII-letter-only
/// tokens. `é`/`ë` are rewritten to a separate `e` token (so e.g. "café"
/// yields ["caf", "e"]), and existing hyphens also act as token boundaries.
pub fn tokenize(value: &str) -> Vec<String> {
    let normalized = value.replace("é", "-e").replace("ë", "-e");
    normalized
        .split('-')
        .map(|part| {
            // Keep only a-z / A-Z, then lower-case the remainder.
            let letters: String = part.chars().filter(char::is_ascii_alphabetic).collect();
            letters.to_lowercase()
        })
        .collect()
}