yake-rust 1.0.3

Yake (Yet Another Keyword Extractor) in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
use pretty_assertions::assert_eq;

use super::*;

/// Shared driver for the tests in this file.
///
/// Loads the predefined stopwords for `lang` (panicking if there are none), extracts the
/// `n_best` keywords from `text` with the given `cfg`, rounds every score to four decimal
/// places and asserts that the outcome equals `expected`.
fn test<const T: usize>(text: &str, lang: &str, cfg: Config, n_best: usize, expected: [(&str, &str, f64); T]) {
    let stopwords = StopWords::predefined(lang).unwrap();
    let mut actual = Yake::new(stopwords, cfg).get_n_best(text, n_best);
    // Keep only 4 decimal digits so the expectations can be written as short literals.
    for item in actual.iter_mut() {
        item.score = (item.score * 10_000.).round() / 10_000.;
    }
    assert_eq!(actual, expected);
}

/// An empty document is expected to produce no keywords.
#[test]
fn empty_text() {
    let no_keywords: [(&str, &str, f64); 0] = [];
    test("", "en", Config::default(), 1, no_keywords);
}

/// With `ngrams: 0` no keyword is expected, even for non-empty text.
#[test]
fn zero_size_ngram() {
    let cfg = Config { ngrams: 0, ..Default::default() };
    let no_keywords: [(&str, &str, f64); 0] = [];
    test("happy new year", "en", cfg, 1, no_keywords);
}

/// Minimal sentence with a single expected keyword.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn short() {
    let expected = [("keyword", "keyword", 0.1583)];
    test("this is a keyword", "en", Config::default(), 1, expected);
}

/// Both expected keywords carry the same score, so the assertion only holds when the
/// extraction returns them in a stable order; otherwise this test becomes unstable.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn keywords_order_is_preserved() {
    let cfg = Config { ngrams: 1, ..Default::default() };
    let expected = [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)];
    test("Machine learning", "en", cfg, 3, expected);
}

/// Single-word keywords from a short question.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn laptop() {
    let cfg = Config { ngrams: 1, ..Default::default() };
    let expected = [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)];
    test("Do you need an Apple laptop?", "en", cfg, 2, expected);
}

/// Single-word keywords from a three-sentence text in which "headphones" recurs.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn headphones() {
    let text = "Do you like headphones? \
        Starting this Saturday, we will be kicking off a huge sale of headphones! \
        If you need headphones, we've got you covered!";
    let cfg = Config { ngrams: 1, ..Default::default() };
    let expected =
        [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)];
    test(text, "en", cfg, 3, expected);
}

/// With `ngrams: 2` the best candidate is expected to be a two-word keyword.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn multi_ngram() {
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [("great deal", "great deal", 0.0257)];
    test("I will give you a great deal if you just read this!", "en", cfg, 1, expected);
}

/// Counterpart of the `plural` test: the grammar is deliberately weird ("Many phone")
/// so that the two results can be compared.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn singular() {
    let text = "One smartwatch. One phone. Many phone.";
    let cfg = Config { ngrams: 1, ..Default::default() };
    let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)];
    test(text, "en", cfg, 2, expected);
}

/// Same text as the `singular` test, but ending in "phones": "phone" and "phones" are
/// expected as separate keywords with equal scores.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn plural() {
    let text = "One smartwatch. One phone. Many phones.";
    let cfg = Config { ngrams: 1, ..Default::default() };
    let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)];
    test(text, "en", cfg, 3, expected);
}

/// Baseline for comparison with the `hyphenated` test: "high tech" written as two words.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn non_hyphenated() {
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [("high tech", "high tech", 0.0494)];
    test("Truly high tech!", "en", cfg, 1, expected);
}

/// "high-tech" written with a hyphen is expected back as one hyphenated keyword.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn hyphenated() {
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [("high-tech", "high-tech", 0.1583)];
    test("Truly high-tech!", "en", cfg, 1, expected);
}

/// One-sentence newsletter greeting, up to two-word keywords.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn weekly_newsletter_short() {
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0494),
        ("newsletter", "newsletter", 0.1583),
        ("weekly", "weekly", 0.2974),
    ];
    test("This is your weekly newsletter!", "en", cfg, 3, expected);
}

/// Newsletter greeting followed by a second sentence, all on a single line.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn weekly_newsletter_long() {
    let text = "This is your weekly newsletter! \
        Hundreds of great deals - everything from men's fashion \
        to high-tech drones!";
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0780),
        ("newsletter", "newsletter", 0.2005),
        ("weekly", "weekly", 0.3607),
        ("great deals", "great deals", 0.4456),
        ("high-tech drones", "high-tech drones", 0.4456),
    ];
    test(text, "en", cfg, 5, expected);
}

/// Same wording as `weekly_newsletter_long`, but with newlines and a tab between the
/// parts; the expected keywords and scores are identical.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn weekly_newsletter_long_with_paragraphs() {
    let text = "This is your weekly newsletter!\n\n \
        \tHundreds of great deals - everything from men's fashion \n\
        to high-tech drones!";
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [
        ("weekly newsletter", "weekly newsletter", 0.0780),
        ("newsletter", "newsletter", 0.2005),
        ("weekly", "weekly", 0.3607),
        ("great deals", "great deals", 0.4456),
        ("high-tech drones", "high-tech drones", 0.4456),
    ];
    test(text, "en", cfg, 5, expected);
}

/// Recurring words inside multi-word keywords, run with `window_size: 2`.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn composite_recurring_words_and_bigger_window() {
    let text = "Machine learning is a growing field. Few research fields grow as much as machine learning grows.";
    let cfg = Config { ngrams: 2, window_size: 2, ..Default::default() };
    let expected = [
        ("Machine learning", "machine learning", 0.1346),
        ("growing field", "growing field", 0.1672),
        ("learning", "learning", 0.2265),
        ("Machine", "machine", 0.2341),
        ("growing", "growing", 0.2799),
    ];
    test(text, "en", cfg, 5, expected);
}

/// Recurring words next to numbers written as digits ("100", "5").
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn composite_recurring_words_near_numbers() {
    let text = "I buy 100 yellow bananas every day. Every night I eat bananas - all but 5 bananas.";
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)];
    test(text, "en", cfg, 3, expected);
}

/// Counterpart of `composite_recurring_words_near_numbers` with the numbers spelled out
/// ("a hundred", "five"), so that the two results can be compared.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn composite_recurring_words_near_spelled_out_numbers() {
    let text = "I buy a hundred yellow bananas every day. Every night I eat bananas - all but five bananas.";
    let cfg = Config { ngrams: 2, ..Default::default() };
    let expected = [
        ("hundred yellow", "hundred yellow", 0.0446),
        ("yellow bananas", "yellow bananas", 0.1017),
        ("day", "day", 0.1428),
    ];
    test(text, "en", cfg, 3, expected);
}

/// A keyword with "of" in its middle is expected back as a whole; the test runs with
/// `remove_duplicates: false`.
///
/// Results agree with reference implementation LIAAD/yake.
#[test]
fn with_stopword_in_the_middle() {
    let cfg = Config { remove_duplicates: false, ..Config::default() };
    let expected = [("Game of Thrones", "game of thrones", 0.01380)];
    test("Game of Thrones", "en", cfg, 1, expected);
}

mod liaad_yake_samples {
    use super::*;

    /// LIAAD/yake sample text (`test_google.txt`), single-word keywords only.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn google_sample_single_ngram() {
        let text = include_str!("samples/test_google.txt");
        let cfg = Config { ngrams: 1, ..Default::default() };
        let expected = [
            ("Google", "google", 0.0251),
            ("Kaggle", "kaggle", 0.0273),
            ("data", "data", 0.08),
            ("science", "science", 0.0983),
            ("platform", "platform", 0.124),
            ("service", "service", 0.1316),
            ("acquiring", "acquiring", 0.1511),
            ("learning", "learning", 0.1621),
            ("Goldbloom", "goldbloom", 0.1625),
            ("machine", "machine", 0.1672),
        ];
        test(text, "en", cfg, 10, expected);
    }

    /// LIAAD/yake sample text (`test_google.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn google_sample_defaults() {
        let text = include_str!("samples/test_google.txt");
        let expected = [
            ("Google", "google", 0.0251),
            ("Kaggle", "kaggle", 0.0273),
            ("CEO Anthony Goldbloom", "ceo anthony goldbloom", 0.0483),
            ("data science", "data science", 0.055),
            ("acquiring data science", "acquiring data science", 0.0603),
            ("Google Cloud Platform", "google cloud platform", 0.0746),
            ("data", "data", 0.08),
            ("San Francisco", "san francisco", 0.0914),
            ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974),
            ("science", "science", 0.0983),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_gitter.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn gitter_sample_defaults() {
        let text = include_str!("samples/test_gitter.txt");
        let expected = [
            ("Gitter", "gitter", 0.0190),
            ("GitLab", "gitlab", 0.0478),
            ("acquires software chat", "acquires software chat", 0.0479),
            ("chat startup Gitter", "chat startup gitter", 0.0512),
            ("software chat startup", "software chat startup", 0.0612),
            ("Gitter chat", "gitter chat", 0.0684),
            ("GitLab acquires software", "gitlab acquires software", 0.0685),
            ("startup", "startup", 0.0783),
            ("software", "software", 0.0879),
            ("code", "code", 0.0879),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_genius.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn genius_sample_defaults() {
        let text = include_str!("samples/test_genius.txt");
        let expected = [
            ("Genius", "genius", 0.0261),
            ("company", "company", 0.0263),
            ("Genius quietly laid", "genius quietly laid", 0.027),
            ("company quietly laid", "company quietly laid", 0.0392),
            ("media company", "media company", 0.0404),
            ("Lehman", "lehman", 0.0412),
            ("quietly laid", "quietly laid", 0.0583),
            ("Tom Lehman told", "tom lehman told", 0.0603),
            ("video", "video", 0.0650),
            ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_german.txt`) with the default configuration.
    ///
    /// Some scores differ slightly from the reference implementation; the reference value
    /// is noted next to each affected entry and the reason is explained in the body.
    #[test]
    fn german_sample_defaults() {
        let text = include_str!("samples/test_german.txt");
        let expected = [
            ("Vereinigten Staaten", "vereinigten staaten", 0.0152), // LIAAD REFERENCE: 0.0151
            ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182),
            ("Donald Trump", "donald trump", 0.0211), // LIAAD REFERENCE: 0.021
            ("trifft Donald Trump", "trifft donald trump", 0.0231), // LIAAD REFERENCE: 0.023
            ("Trump", "trump", 0.0240),
            ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243),
            ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275), // LIAAD REFERENCE: 0.0273
            ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316), // LIAAD REFERENCE: 0.0314
            ("Merkel trifft Donald", "merkel trifft donald", 0.0353), // LIAAD REFERENCE: 0.0351
            ("Exportnation Deutschland", "exportnation deutschland", 0.038), // LIAAD REFERENCE: 0.0379
        ];
        test(text, "de", Config::default(), 10, expected);
        // REASONS FOR DISCREPANCY:
        // - The text contains both "bereit" ("ready") and "bereits" ("already").
        //   "bereits" is a stopword, "bereit" is not.
        //   LIAAD/yake remembers whether a term is a stopword in a key-value mapping
        //   whose key is the lowercased, plural-normalized term.
        //   (Note that the plural normalization technique used is rarely effective in German.)
        //   "bereits" occurs before "bereit" in the text, so LIAAD/yake sees it first,
        //   recognizes it as a stopword and stores it under the key "bereit". When it
        //   later encounters "bereit" (NOT a stopword), the key is already present in the
        //   mapping, so the lookup reports it as a stopword (which it is not).
        //   yake-rust has no such key-value store, so it correctly treats "bereits" as a
        //   stopword and "bereit" as a non-stopword. Having "bereit" among the
        //   non-stopwords changes the TF statistics and thereby the frequency
        //   contribution to the weights, which leads to slightly different scores.
        //
        //   This is technically a bug in the reference implementation caused by the plural
        //   normalization, so the small discrepancy is acceptable.
    }

    /// LIAAD/yake sample text (`test_nl.txt`) with Dutch stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dutch_sample_defaults() {
        let text = include_str!("samples/test_nl.txt");
        let expected = [
            ("Vincent van Gogh", "vincent van gogh", 0.0111),
            ("Gogh Museum", "gogh museum", 0.0125),
            ("Gogh", "gogh", 0.0150),
            ("Museum", "museum", 0.0438),
            ("brieven", "brieven", 0.0635),
            ("Vincent", "vincent", 0.0643),
            ("Goghs schilderijen", "goghs schilderijen", 0.1009),
            ("Gogh verging", "gogh verging", 0.1215),
            ("Goghs", "goghs", 0.1651),
            ("schrijven", "schrijven", 0.1704),
        ];
        test(text, "nl", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_fi.txt`) with Finnish stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn finnish_sample_defaults() {
        let text = include_str!("samples/test_fi.txt");
        let expected = [
            ("Mobile Networks", "mobile networks", 0.0043),
            ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061),
            ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065),
            ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088),
            ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088),
            ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092),
            ("Networks", "networks", 0.0102),
            ("Networks and Applications", "networks and applications", 0.0113),
            ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127),
            ("siirtyy Mobile Networks", "siirtyy mobile networks", 0.0130),
        ];
        test(text, "fi", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_it.txt`) with Italian stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn italian_sample_defaults() {
        let text = include_str!("samples/test_it.txt");
        let expected = [
            ("Champions League", "champions league", 0.0390),
            ("Quarti", "quarti", 0.0520),
            ("Atlético Madrid", "atlético madrid", 0.0592),
            ("Ottavi di finale", "ottavi di finale", 0.0646),
            ("Real Madrid", "real madrid", 0.0701),
        ];
        test(text, "it", Config::default(), 5, expected);
    }

    /// LIAAD/yake sample text (`test_fr.txt`) with French stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn french_sample_defaults() {
        let text = include_str!("samples/test_fr.txt");
        let expected = [
            ("dégrade en France", "dégrade en france", 0.0254),
            ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504),
            ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554),
            ("religion se dégrade", "religion se dégrade", 0.091),
            ("France", "france", 0.0941),
            ("l'extrême droite", "l'extrême droite", 0.0997),
            ("sondage Ifop", "sondage ifop", 0.101),
            ("Islam", "islam", 0.1021),
            ("musulmane en France", "musulmane en france", 0.1078),
            ("Allemagne", "allemagne", 0.1086),
        ];
        test(text, "fr", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_pt_1.txt`) with Portuguese stopwords and the default
    /// configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn portuguese_sport_sample_defaults() {
        let text = include_str!("samples/test_pt_1.txt");
        let expected = [
            ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072),
            ("seleção brasileira", "seleção brasileira", 0.0100),
            ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192),
            ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344),
            ("brasileira treinará", "brasileira treinará", 0.0373),
            ("Renato Augusto", "renato augusto", 0.0376),
            ("Copa da Rússia", "copa da rússia", 0.0407),
            ("seleção", "seleção", 0.0454),
            ("brasileira", "brasileira", 0.0528),
            ("meia Renato Augusto", "meia renato augusto", 0.0623),
        ];
        test(text, "pt", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_pt_2.txt`) with Portuguese stopwords and the default
    /// configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn portuguese_tourism_sample_defaults() {
        let text = include_str!("samples/test_pt_2.txt");
        let expected = [
            ("Alvor", "alvor", 0.0165),
            ("Rio Alvor", "rio alvor", 0.0336),
            ("Ria de Alvor", "ria de alvor", 0.0488),
            ("encantadora vila", "encantadora vila", 0.0575),
            ("Algarve", "algarve", 0.0774),
            ("impressionantes de Portugal", "impressionantes de portugal", 0.0844),
            ("estuário do Rio", "estuário do rio", 0.0907),
            ("vila", "vila", 0.1017),
            ("Ria", "ria", 0.1053),
            ("Oceano Atlântico", "oceano atlântico", 0.1357),
        ];
        test(text, "pt", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_es.txt`) with Spanish stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn spanish_sample_defaults() {
        let text = include_str!("samples/test_es.txt");
        let expected = [
            ("Guerra Civil Española", "guerra civil española", 0.0032),
            ("Guerra Civil", "guerra civil", 0.0130),
            ("Civil Española", "civil española", 0.0153),
            ("Partido Socialista Obrero", "partido socialista obrero", 0.0283),
            ("empezó la Guerra", "empezó la guerra", 0.0333),
            ("Socialista Obrero Español", "socialista obrero español", 0.0411),
            ("José Castillo", "josé castillo", 0.0426),
            ("Española", "española", 0.0566),
            ("José Antonio Primo", "josé antonio primo", 0.0589),
            ("José Calvo Sotelo", "josé calvo sotelo", 0.0596),
        ];
        test(text, "es", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_pl.txt`) with Polish stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn polish_sample_defaults() {
        let text = include_str!("samples/test_pl.txt");
        let expected = [
            ("franka", "franka", 0.0328),
            ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346),
            ("Geerta Wildersa", "geerta wildersa", 0.0399),
            ("kurs franka", "kurs franka", 0.0486),
            ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675),
            ("proc", "proc", 0.0692),
            ("mld", "mld", 0.0724),
            ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728),
            ("kurs franka poniżej", "kurs franka poniżej", 0.0758),
            ("Wildersa", "wildersa", 0.0765),
        ];
        test(text, "pl", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_tr.txt`) with Turkish stopwords and the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn turkish_sample_defaults() {
        let text = include_str!("samples/test_tr.txt");
        let expected = [
            ("OECD", "oecd", 0.0178),
            ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0236),
            ("eğitim", "eğitim", 0.0278),
            ("OECD eğitim endeksi", "oecd eğitim endeksi", 0.0323),
            ("OECD ortalamasının", "oecd ortalamasının", 0.0383),
            ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.045),
            ("Tek Bakışta", "tek bakışta", 0.045),
            ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468),
            ("Türkiye'de", "türkiye'de", 0.0480),
            ("yüksek", "yüksek", 0.0513),
        ];
        test(text, "tr", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_ar.txt`) with Arabic stopwords and the default configuration.
    ///
    /// Two scores differ slightly from the reference implementation; the reference value is
    /// noted next to each affected entry.
    #[test]
    fn arabic_sample_defaults() {
        let text = include_str!("samples/test_ar.txt");
        let expected = [
            ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105),
            ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139),
            ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142),
            ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177),
            ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183), // LIAAD REFERENCE: 0.018
            ("السلام العجيلي", "السلام العجيلي", 0.0198),
            ("اللغة العربية", "اللغة العربية", 0.0207),
            ("مرآة النقد", "مرآة النقد", 0.0255), // LIAAD REFERENCE: 0.025
            ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261),
            ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281),
        ];
        test(text, "ar", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_data_1.txt`) with Portuguese stopwords and the default
    /// configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_1_defaults() {
        let text = include_str!("samples/test_data_1.txt");
        let expected = [
            ("Médio Oriente continua", "médio oriente continua", 0.0008),
            ("Médio Oriente", "médio oriente", 0.0045),
            ("Oriente continua", "oriente continua", 0.0117),
            ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178),
            ("Faixa de Gaza", "faixa de gaza", 0.0268),
            ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311),
            ("voltaram a registar-se", "voltaram a registar-se", 0.0311),
            ("registar-se violentos", "registar-se violentos", 0.0311),
            ("Exército israelita", "exército israelita", 0.0368),
            ("Exército israelita voltou", "exército israelita voltou", 0.0639),
        ];
        test(text, "pt", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_data_2.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_2_defaults() {
        let text = include_str!("samples/test_data_2.txt");
        let expected = [
            ("highly radioactive water", "highly radioactive water", 0.0006),
            ("crippled nuclear plant", "crippled nuclear plant", 0.0006),
            ("ocean Japan official", "ocean japan official", 0.0031),
            ("Japan official", "japan official", 0.0046),
            ("official says highly", "official says highly", 0.0050),
        ];
        test(text, "en", Config::default(), 5, expected);
    }

    /// LIAAD/yake sample text (`test_data_3.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_3_defaults() {
        let text = include_str!("samples/test_data_3.txt");
        let expected = [
            ("Global Crossing", "global crossing", 0.0034),
            ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053),
            ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072),
            ("Singapore Technologies", "singapore technologies", 0.0072),
            ("Technologies take control", "technologies take control", 0.0157),
        ];
        test(text, "en", Config::default(), 5, expected);
    }

    /// LIAAD/yake sample text (`test_data_4.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_4_defaults() {
        let text = include_str!("samples/test_data_4.txt");
        let expected = [
            ("annual revenues increasing", "annual revenues increasing", 0.0018),
            ("retail inventory management", "retail inventory management", 0.0023),
            ("Dollar General", "dollar general", 0.0034),
            ("inventory management", "inventory management", 0.0112),
            ("perpetual progress", "perpetual progress", 0.0133),
            ("revenues increasing", "revenues increasing", 0.0133),
            ("fast track", "fast track", 0.0133),
            ("road to perpetual", "road to perpetual", 0.0159),
            ("annual revenues", "annual revenues", 0.0168),
            ("stores opened", "stores opened", 0.0168),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_data_5.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_5_defaults() {
        let text = include_str!("samples/test_data_5.txt");
        let expected = [
            ("Handoff Trigger Table", "handoff trigger table", 0.0007),
            ("Handoff", "handoff", 0.0010),
            ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019),
            ("Vertical handoff", "vertical handoff", 0.0020),
            ("Handoff Trigger", "handoff trigger", 0.0021),
            ("proactive handoff scheme", "proactive handoff scheme", 0.0021),
            ("HTT Method Figure", "htt method figure", 0.0022),
            ("WLAN", "wlan", 0.0023),
            ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030),
            ("traditional handoff scheme", "traditional handoff scheme", 0.0033),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_data_6.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_6_defaults() {
        let text = include_str!("samples/test_data_6.txt");
        let expected = [
            ("MRSA", "mrsa", 0.0047),
            ("TSN Database", "tsn database", 0.0107),
            ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116),
            ("rates of MRSA", "rates of mrsa", 0.0145),
            ("Staphylococcus aureus", "staphylococcus aureus", 0.0167),
            ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177),
            ("prevalence of MRSA", "prevalence of mrsa", 0.0201),
            ("MRSA infections", "mrsa infections", 0.0218),
            ("MRSA infections detected", "mrsa infections detected", 0.0223),
            ("TSN", "tsn", 0.0250),
        ];
        test(text, "en", Config::default(), 10, expected);
    }

    /// LIAAD/yake sample text (`test_data_7.txt`) with the default configuration.
    ///
    /// Results agree with reference implementation LIAAD/yake.
    #[test]
    fn dataset_text_7_defaults() {
        let text = include_str!("samples/test_data_7.txt");
        let expected = [
            ("Environment Design Level", "environment design level", 0.0008),
            ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009),
            ("Dynamics Based Control", "dynamics based control", 0.0011),
            ("system dynamics", "system dynamics", 0.0017),
            ("DBC", "dbc", 0.0019),
            ("target system dynamics", "target system dynamics", 0.0019),
            ("target dynamics", "target dynamics", 0.0023),
            ("Science Bar Ilan", "science bar ilan", 0.0025),
            ("EMT", "emt", 0.0026),
            ("Dynamics", "dynamics", 0.0026),
        ];
        test(text, "en", Config::default(), 10, expected);
    }
}