use pretty_assertions::assert_eq;
use super::*;
/// Shared driver for the keyword-extraction tests in this file.
///
/// Builds a `Yake` extractor from the predefined stop words of `lang` and the given `cfg`,
/// asks it for the `n_best` keywords of `text`, rounds every score to four decimal places
/// and asserts that the outcome equals `expected`.
///
/// Each `expected` entry looks like `(raw text, lowercased keyword, score)`
/// (NOTE(review): inferred from the test data; confirm against the result item type).
///
/// # Panics
///
/// Panics via `unwrap` when `StopWords::predefined` fails for `lang`, and via `assert_eq!`
/// when the extracted keywords differ from `expected`.
fn test<const T: usize>(text: &str, lang: &str, cfg: Config, n_best: usize, expected: [(&str, &str, f64); T]) {
    let stopwords = StopWords::predefined(lang).unwrap();
    let mut keywords = Yake::new(stopwords, cfg).get_n_best(text, n_best);

    // Round to four decimal places so the expectations can be written as short literals.
    for item in keywords.iter_mut() {
        item.score = (item.score * 10_000.).round() / 10_000.;
    }

    assert_eq!(keywords, expected);
}
/// An empty text is expected to produce no keywords.
#[test]
fn empty_text() {
    let no_keywords: [(&str, &str, f64); 0] = [];
    test("", "en", Config::default(), 1, no_keywords);
}
/// With `ngrams` set to 0, no keywords are expected even for a non-empty text.
#[test]
fn zero_size_ngram() {
    let cfg = Config { ngrams: 0, ..Config::default() };
    let no_keywords: [(&str, &str, f64); 0] = [];
    test("happy new year", "en", cfg, 1, no_keywords);
}
/// A short sentence with the default configuration: the single best keyword is `keyword`.
#[test]
fn short() {
    let expected = [("keyword", "keyword", 0.1583)];
    test("this is a keyword", "en", Config::default(), 1, expected);
}
/// Two single-word keywords with the same rounded score are expected in the order
/// `Machine`, `learning`. Three keywords are requested, only two are expected.
#[test]
fn keywords_order_is_preserved() {
    let cfg = Config { ngrams: 1, ..Config::default() };
    let expected = [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)];

    test("Machine learning", "en", cfg, 3, expected);
}
/// Single-word extraction from a short question: `Apple` is expected before `laptop`.
#[test]
fn laptop() {
    let cfg = Config { ngrams: 1, ..Config::default() };
    let expected = [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)];

    test("Do you need an Apple laptop?", "en", cfg, 2, expected);
}
/// Single-word extraction from three sentences that each mention `headphones`;
/// `headphones` is expected first, followed by `Saturday` and `Starting`.
#[test]
fn headphones() {
    let text = "Do you like headphones? \
                Starting this Saturday, we will be kicking off a huge sale of headphones! \
                If you need headphones, we've got you covered!";
    let cfg = Config { ngrams: 1, ..Config::default() };
    let expected =
        [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)];

    test(text, "en", cfg, 3, expected);
}
/// With `ngrams: 2`, the best keyword of the sentence is expected to be the
/// two-word phrase `great deal`.
#[test]
fn multi_ngram() {
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [("great deal", "great deal", 0.0257)];

    test("I will give you a great deal if you just read this!", "en", cfg, 1, expected);
}
/// `phone` appears twice in the same (singular) form; `smartwatch` is still expected
/// to rank ahead of it.
#[test]
fn singular() {
    let cfg = Config { ngrams: 1, ..Config::default() };
    let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)];

    test("One smartwatch. One phone. Many phone.", "en", cfg, 2, expected);
}
/// `phone` and `phones` are expected as two separate keywords with the same score.
#[test]
fn plural() {
    let cfg = Config { ngrams: 1, ..Config::default() };
    let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)];

    test("One smartwatch. One phone. Many phones.", "en", cfg, 3, expected);
}
/// With `ngrams: 2`, the space-separated `high tech` is expected as the best keyword.
#[test]
fn non_hyphenated() {
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [("high tech", "high tech", 0.0494)];

    test("Truly high tech!", "en", cfg, 1, expected);
}
/// With `ngrams: 2`, the hyphenated `high-tech` is expected to come back as one keyword.
#[test]
fn hyphenated() {
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [("high-tech", "high-tech", 0.1583)];

    test("Truly high-tech!", "en", cfg, 1, expected);
}
/// One sentence with `ngrams: 2`: the two-word phrase is expected ahead of its
/// individual words.
#[test]
fn weekly_newsletter_short() {
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0494),
        ("newsletter", "newsletter", 0.1583),
        ("weekly", "weekly", 0.2974),
    ];

    test("This is your weekly newsletter!", "en", cfg, 3, expected);
}
/// Two sentences with `ngrams: 2`; the five best keywords mix two-word phrases
/// and single words.
#[test]
fn weekly_newsletter_long() {
    let text = "This is your weekly newsletter! \
                Hundreds of great deals - everything from men's fashion \
                to high-tech drones!";
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0780),
        ("newsletter", "newsletter", 0.2005),
        ("weekly", "weekly", 0.3607),
        ("great deals", "great deals", 0.4456),
        ("high-tech drones", "high-tech drones", 0.4456),
    ];

    test(text, "en", cfg, 5, expected);
}
/// Newsletter text whose sentences are separated by newlines, a tab and extra spaces,
/// extracted with `ngrams: 2`.
#[test]
fn weekly_newsletter_long_with_paragraphs() {
    // The `\n` and `\t` escapes are part of the input; only the literal line breaks
    // (and the indentation following them) are swallowed by the trailing backslashes.
    let text = "This is your weekly newsletter!\n\n \
                \tHundreds of great deals - everything from men's fashion \n\
                to high-tech drones!";
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0780),
        ("newsletter", "newsletter", 0.2005),
        ("weekly", "weekly", 0.3607),
        ("great deals", "great deals", 0.4456),
        ("high-tech drones", "high-tech drones", 0.4456),
    ];

    test(text, "en", cfg, 5, expected);
}
/// Recurring words (`machine`, `learning`) in a text processed with `ngrams: 2`
/// and `window_size: 2`.
#[test]
fn composite_recurring_words_and_bigger_window() {
    let text = "Machine learning is a growing field. Few research fields grow as much as machine learning grows.";
    let cfg = Config { ngrams: 2, window_size: 2, ..Config::default() };
    let expected = [
        ("Machine learning", "machine learning", 0.1346),
        ("growing field", "growing field", 0.1672),
        ("learning", "learning", 0.2265),
        ("Machine", "machine", 0.2341),
        ("growing", "growing", 0.2799),
    ];

    test(text, "en", cfg, 5, expected);
}
/// A recurring word (`bananas`) next to numeric tokens (`100`, `5`), with `ngrams: 2`;
/// none of the expected keywords contains a number.
#[test]
fn composite_recurring_words_near_numbers() {
    let text = "I buy 100 yellow bananas every day. Every night I eat bananas - all but 5 bananas.";
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)];

    test(text, "en", cfg, 3, expected);
}
/// A recurring word (`bananas`) next to spelled-out numbers (`a hundred`, `five`),
/// with `ngrams: 2`; here `hundred yellow` is expected as the best keyword.
#[test]
fn composite_recurring_words_near_spelled_out_numbers() {
    let text = "I buy a hundred yellow bananas every day. Every night I eat bananas - all but five bananas.";
    let cfg = Config { ngrams: 2, ..Config::default() };
    let expected = [
        ("hundred yellow", "hundred yellow", 0.0446),
        ("yellow bananas", "yellow bananas", 0.1017),
        ("day", "day", 0.1428),
    ];

    test(text, "en", cfg, 3, expected);
}
/// With `remove_duplicates` switched off, the whole phrase `Game of Thrones` is expected
/// as the best keyword (as the test name indicates, `of` is the stop word in the middle).
#[test]
fn with_stopword_in_the_middle() {
    let cfg = Config { remove_duplicates: false, ..Config::default() };
    let expected = [("Game of Thrones", "game of thrones", 0.01380)];

    test("Game of Thrones", "en", cfg, 1, expected);
}
/// Regression tests over the sample texts stored in the `samples/` directory.
///
/// Every test feeds one sample file to the shared `test` helper and pins the best
/// keywords together with their scores rounded to four decimal places.
/// NOTE(review): judging by the module name, the samples and scores presumably
/// originate from the LIAAD `yake` project; confirm the source of the figures.
mod liaad_yake_samples {
    use super::*;

    /// Google sample, restricted to single-word keywords (`ngrams: 1`).
    #[test]
    fn google_sample_single_ngram() {
        let cfg = Config { ngrams: 1, ..Config::default() };
        let expected = [
            ("Google", "google", 0.0251),
            ("Kaggle", "kaggle", 0.0273),
            ("data", "data", 0.08),
            ("science", "science", 0.0983),
            ("platform", "platform", 0.124),
            ("service", "service", 0.1316),
            ("acquiring", "acquiring", 0.1511),
            ("learning", "learning", 0.1621),
            ("Goldbloom", "goldbloom", 0.1625),
            ("machine", "machine", 0.1672),
        ];

        test(include_str!("samples/test_google.txt"), "en", cfg, 10, expected);
    }

    /// Google sample with the default configuration.
    #[test]
    fn google_sample_defaults() {
        let expected = [
            ("Google", "google", 0.0251),
            ("Kaggle", "kaggle", 0.0273),
            ("CEO Anthony Goldbloom", "ceo anthony goldbloom", 0.0483),
            ("data science", "data science", 0.055),
            ("acquiring data science", "acquiring data science", 0.0603),
            ("Google Cloud Platform", "google cloud platform", 0.0746),
            ("data", "data", 0.08),
            ("San Francisco", "san francisco", 0.0914),
            ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974),
            ("science", "science", 0.0983),
        ];

        test(include_str!("samples/test_google.txt"), "en", Config::default(), 10, expected);
    }

    /// Gitter sample with the default configuration.
    #[test]
    fn gitter_sample_defaults() {
        let expected = [
            ("Gitter", "gitter", 0.0190),
            ("GitLab", "gitlab", 0.0478),
            ("acquires software chat", "acquires software chat", 0.0479),
            ("chat startup Gitter", "chat startup gitter", 0.0512),
            ("software chat startup", "software chat startup", 0.0612),
            ("Gitter chat", "gitter chat", 0.0684),
            ("GitLab acquires software", "gitlab acquires software", 0.0685),
            ("startup", "startup", 0.0783),
            ("software", "software", 0.0879),
            ("code", "code", 0.0879),
        ];

        test(include_str!("samples/test_gitter.txt"), "en", Config::default(), 10, expected);
    }

    /// Genius sample with the default configuration.
    #[test]
    fn genius_sample_defaults() {
        let expected = [
            ("Genius", "genius", 0.0261),
            ("company", "company", 0.0263),
            ("Genius quietly laid", "genius quietly laid", 0.027),
            ("company quietly laid", "company quietly laid", 0.0392),
            ("media company", "media company", 0.0404),
            ("Lehman", "lehman", 0.0412),
            ("quietly laid", "quietly laid", 0.0583),
            ("Tom Lehman told", "tom lehman told", 0.0603),
            ("video", "video", 0.0650),
            ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669),
        ];

        test(include_str!("samples/test_genius.txt"), "en", Config::default(), 10, expected);
    }

    /// German sample (`de` stop words) with the default configuration.
    #[test]
    fn german_sample_defaults() {
        let expected = [
            ("Vereinigten Staaten", "vereinigten staaten", 0.0152),
            ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182),
            ("Donald Trump", "donald trump", 0.0211),
            ("trifft Donald Trump", "trifft donald trump", 0.0231),
            ("Trump", "trump", 0.0240),
            ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243),
            ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275),
            ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316),
            ("Merkel trifft Donald", "merkel trifft donald", 0.0353),
            ("Exportnation Deutschland", "exportnation deutschland", 0.038),
        ];

        test(include_str!("samples/test_german.txt"), "de", Config::default(), 10, expected);
    }

    /// Dutch sample (`nl` stop words) with the default configuration.
    #[test]
    fn dutch_sample_defaults() {
        let expected = [
            ("Vincent van Gogh", "vincent van gogh", 0.0111),
            ("Gogh Museum", "gogh museum", 0.0125),
            ("Gogh", "gogh", 0.0150),
            ("Museum", "museum", 0.0438),
            ("brieven", "brieven", 0.0635),
            ("Vincent", "vincent", 0.0643),
            ("Goghs schilderijen", "goghs schilderijen", 0.1009),
            ("Gogh verging", "gogh verging", 0.1215),
            ("Goghs", "goghs", 0.1651),
            ("schrijven", "schrijven", 0.1704),
        ];

        test(include_str!("samples/test_nl.txt"), "nl", Config::default(), 10, expected);
    }

    /// Finnish sample (`fi` stop words) with the default configuration.
    #[test]
    fn finnish_sample_defaults() {
        let expected = [
            ("Mobile Networks", "mobile networks", 0.0043),
            ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061),
            ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065),
            ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088),
            ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088),
            ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092),
            ("Networks", "networks", 0.0102),
            ("Networks and Applications", "networks and applications", 0.0113),
            ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127),
            ("siirtyy Mobile Networks", "siirtyy mobile networks", 0.0130),
        ];

        test(include_str!("samples/test_fi.txt"), "fi", Config::default(), 10, expected);
    }

    /// Italian sample (`it` stop words) with the default configuration; top five only.
    #[test]
    fn italian_sample_defaults() {
        let expected = [
            ("Champions League", "champions league", 0.0390),
            ("Quarti", "quarti", 0.0520),
            ("Atlético Madrid", "atlético madrid", 0.0592),
            ("Ottavi di finale", "ottavi di finale", 0.0646),
            ("Real Madrid", "real madrid", 0.0701),
        ];

        test(include_str!("samples/test_it.txt"), "it", Config::default(), 5, expected);
    }

    /// French sample (`fr` stop words) with the default configuration.
    #[test]
    fn french_sample_defaults() {
        let expected = [
            ("dégrade en France", "dégrade en france", 0.0254),
            ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504),
            ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554),
            ("religion se dégrade", "religion se dégrade", 0.091),
            ("France", "france", 0.0941),
            ("l'extrême droite", "l'extrême droite", 0.0997),
            ("sondage Ifop", "sondage ifop", 0.101),
            ("Islam", "islam", 0.1021),
            ("musulmane en France", "musulmane en france", 0.1078),
            ("Allemagne", "allemagne", 0.1086),
        ];

        test(include_str!("samples/test_fr.txt"), "fr", Config::default(), 10, expected);
    }

    /// First Portuguese sample (`test_pt_1.txt`, `pt` stop words) with the default configuration.
    #[test]
    fn portuguese_sport_sample_defaults() {
        let expected = [
            ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072),
            ("seleção brasileira", "seleção brasileira", 0.0100),
            ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192),
            ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344),
            ("brasileira treinará", "brasileira treinará", 0.0373),
            ("Renato Augusto", "renato augusto", 0.0376),
            ("Copa da Rússia", "copa da rússia", 0.0407),
            ("seleção", "seleção", 0.0454),
            ("brasileira", "brasileira", 0.0528),
            ("meia Renato Augusto", "meia renato augusto", 0.0623),
        ];

        test(include_str!("samples/test_pt_1.txt"), "pt", Config::default(), 10, expected);
    }

    /// Second Portuguese sample (`test_pt_2.txt`, `pt` stop words) with the default configuration.
    #[test]
    fn portuguese_tourism_sample_defaults() {
        let expected = [
            ("Alvor", "alvor", 0.0165),
            ("Rio Alvor", "rio alvor", 0.0336),
            ("Ria de Alvor", "ria de alvor", 0.0488),
            ("encantadora vila", "encantadora vila", 0.0575),
            ("Algarve", "algarve", 0.0774),
            ("impressionantes de Portugal", "impressionantes de portugal", 0.0844),
            ("estuário do Rio", "estuário do rio", 0.0907),
            ("vila", "vila", 0.1017),
            ("Ria", "ria", 0.1053),
            ("Oceano Atlântico", "oceano atlântico", 0.1357),
        ];

        test(include_str!("samples/test_pt_2.txt"), "pt", Config::default(), 10, expected);
    }

    /// Spanish sample (`es` stop words) with the default configuration.
    #[test]
    fn spanish_sample_defaults() {
        let expected = [
            ("Guerra Civil Española", "guerra civil española", 0.0032),
            ("Guerra Civil", "guerra civil", 0.0130),
            ("Civil Española", "civil española", 0.0153),
            ("Partido Socialista Obrero", "partido socialista obrero", 0.0283),
            ("empezó la Guerra", "empezó la guerra", 0.0333),
            ("Socialista Obrero Español", "socialista obrero español", 0.0411),
            ("José Castillo", "josé castillo", 0.0426),
            ("Española", "española", 0.0566),
            ("José Antonio Primo", "josé antonio primo", 0.0589),
            ("José Calvo Sotelo", "josé calvo sotelo", 0.0596),
        ];

        test(include_str!("samples/test_es.txt"), "es", Config::default(), 10, expected);
    }

    /// Polish sample (`pl` stop words) with the default configuration.
    #[test]
    fn polish_sample_defaults() {
        let expected = [
            ("franka", "franka", 0.0328),
            ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346),
            ("Geerta Wildersa", "geerta wildersa", 0.0399),
            ("kurs franka", "kurs franka", 0.0486),
            ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675),
            ("proc", "proc", 0.0692),
            ("mld", "mld", 0.0724),
            ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728),
            ("kurs franka poniżej", "kurs franka poniżej", 0.0758),
            ("Wildersa", "wildersa", 0.0765),
        ];

        test(include_str!("samples/test_pl.txt"), "pl", Config::default(), 10, expected);
    }

    /// Turkish sample (`tr` stop words) with the default configuration.
    #[test]
    fn turkish_sample_defaults() {
        let expected = [
            ("OECD", "oecd", 0.0178),
            ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0236),
            ("eğitim", "eğitim", 0.0278),
            ("OECD eğitim endeksi", "oecd eğitim endeksi", 0.0323),
            ("OECD ortalamasının", "oecd ortalamasının", 0.0383),
            ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.045),
            ("Tek Bakışta", "tek bakışta", 0.045),
            ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468),
            ("Türkiye'de", "türkiye'de", 0.0480),
            ("yüksek", "yüksek", 0.0513),
        ];

        test(include_str!("samples/test_tr.txt"), "tr", Config::default(), 10, expected);
    }

    /// Arabic sample (`ar` stop words) with the default configuration.
    #[test]
    fn arabic_sample_defaults() {
        let expected = [
            ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105),
            ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139),
            ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142),
            ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177),
            ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183),
            ("السلام العجيلي", "السلام العجيلي", 0.0198),
            ("اللغة العربية", "اللغة العربية", 0.0207),
            ("مرآة النقد", "مرآة النقد", 0.0255),
            ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261),
            ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281),
        ];

        test(include_str!("samples/test_ar.txt"), "ar", Config::default(), 10, expected);
    }

    /// Dataset text 1, processed with `pt` stop words and the default configuration.
    #[test]
    fn dataset_text_1_defaults() {
        let expected = [
            ("Médio Oriente continua", "médio oriente continua", 0.0008),
            ("Médio Oriente", "médio oriente", 0.0045),
            ("Oriente continua", "oriente continua", 0.0117),
            ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178),
            ("Faixa de Gaza", "faixa de gaza", 0.0268),
            ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311),
            ("voltaram a registar-se", "voltaram a registar-se", 0.0311),
            ("registar-se violentos", "registar-se violentos", 0.0311),
            ("Exército israelita", "exército israelita", 0.0368),
            ("Exército israelita voltou", "exército israelita voltou", 0.0639),
        ];

        test(include_str!("samples/test_data_1.txt"), "pt", Config::default(), 10, expected);
    }

    /// Dataset text 2 with the default configuration; top five only.
    #[test]
    fn dataset_text_2_defaults() {
        let expected = [
            ("highly radioactive water", "highly radioactive water", 0.0006),
            ("crippled nuclear plant", "crippled nuclear plant", 0.0006),
            ("ocean Japan official", "ocean japan official", 0.0031),
            ("Japan official", "japan official", 0.0046),
            ("official says highly", "official says highly", 0.0050),
        ];

        test(include_str!("samples/test_data_2.txt"), "en", Config::default(), 5, expected);
    }

    /// Dataset text 3 with the default configuration; top five only.
    #[test]
    fn dataset_text_3_defaults() {
        let expected = [
            ("Global Crossing", "global crossing", 0.0034),
            ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053),
            ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072),
            ("Singapore Technologies", "singapore technologies", 0.0072),
            ("Technologies take control", "technologies take control", 0.0157),
        ];

        test(include_str!("samples/test_data_3.txt"), "en", Config::default(), 5, expected);
    }

    /// Dataset text 4 with the default configuration.
    #[test]
    fn dataset_text_4_defaults() {
        let expected = [
            ("annual revenues increasing", "annual revenues increasing", 0.0018),
            ("retail inventory management", "retail inventory management", 0.0023),
            ("Dollar General", "dollar general", 0.0034),
            ("inventory management", "inventory management", 0.0112),
            ("perpetual progress", "perpetual progress", 0.0133),
            ("revenues increasing", "revenues increasing", 0.0133),
            ("fast track", "fast track", 0.0133),
            ("road to perpetual", "road to perpetual", 0.0159),
            ("annual revenues", "annual revenues", 0.0168),
            ("stores opened", "stores opened", 0.0168),
        ];

        test(include_str!("samples/test_data_4.txt"), "en", Config::default(), 10, expected);
    }

    /// Dataset text 5 with the default configuration.
    #[test]
    fn dataset_text_5_defaults() {
        let expected = [
            ("Handoff Trigger Table", "handoff trigger table", 0.0007),
            ("Handoff", "handoff", 0.0010),
            ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019),
            ("Vertical handoff", "vertical handoff", 0.0020),
            ("Handoff Trigger", "handoff trigger", 0.0021),
            ("proactive handoff scheme", "proactive handoff scheme", 0.0021),
            ("HTT Method Figure", "htt method figure", 0.0022),
            ("WLAN", "wlan", 0.0023),
            ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030),
            ("traditional handoff scheme", "traditional handoff scheme", 0.0033),
        ];

        test(include_str!("samples/test_data_5.txt"), "en", Config::default(), 10, expected);
    }

    /// Dataset text 6 with the default configuration.
    #[test]
    fn dataset_text_6_defaults() {
        let expected = [
            ("MRSA", "mrsa", 0.0047),
            ("TSN Database", "tsn database", 0.0107),
            ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116),
            ("rates of MRSA", "rates of mrsa", 0.0145),
            ("Staphylococcus aureus", "staphylococcus aureus", 0.0167),
            ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177),
            ("prevalence of MRSA", "prevalence of mrsa", 0.0201),
            ("MRSA infections", "mrsa infections", 0.0218),
            ("MRSA infections detected", "mrsa infections detected", 0.0223),
            ("TSN", "tsn", 0.0250),
        ];

        test(include_str!("samples/test_data_6.txt"), "en", Config::default(), 10, expected);
    }

    /// Dataset text 7 with the default configuration.
    #[test]
    fn dataset_text_7_defaults() {
        let expected = [
            ("Environment Design Level", "environment design level", 0.0008),
            ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009),
            ("Dynamics Based Control", "dynamics based control", 0.0011),
            ("system dynamics", "system dynamics", 0.0017),
            ("DBC", "dbc", 0.0019),
            ("target system dynamics", "target system dynamics", 0.0019),
            ("target dynamics", "target dynamics", 0.0023),
            ("Science Bar Ilan", "science bar ilan", 0.0025),
            ("EMT", "emt", 0.0026),
            ("Dynamics", "dynamics", 0.0026),
        ];

        test(include_str!("samples/test_data_7.txt"), "en", Config::default(), 10, expected);
    }
}