gaoya/text/
tokenizers.rs

1use shingles::Shingles;
2use itertools::Itertools;
3use crate::text::multi_shingles::MultiShingles;
4
/// Splits `text` into non-empty tokens.
///
/// Despite the name, both ASCII whitespace and ASCII punctuation act as
/// separators. Runs of consecutive separators produce no empty tokens.
/// Non-ASCII characters (including non-ASCII whitespace) are never treated
/// as separators and stay inside the surrounding token.
///
/// The returned iterator is lazy and borrows from `text`.
pub fn whitespace_split<'a>(text: &'a str) -> impl Iterator<Item = &'a str> {
    /// Returns `true` for characters that terminate a token.
    fn is_separator(c: char) -> bool {
        c.is_ascii_whitespace() || c.is_ascii_punctuation()
    }

    text.split(is_separator).filter(|token| !token.is_empty())
}
10
/// Boxed variant of the whitespace/punctuation tokenizer.
///
/// Splits `text` on every ASCII whitespace or ASCII punctuation character,
/// drops the empty fragments that appear between adjacent separators, and
/// returns the remaining tokens behind a `Box<dyn Iterator>` so the result
/// can be stored or passed where a nameable type is required.
///
/// The boxed iterator is lazy and borrows from `text`.
pub fn whitespace_split_boxed<'a>(text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
    let tokens = text
        .split(|c: char| c.is_ascii_whitespace() || c.is_ascii_punctuation())
        .filter(|token| !token.is_empty());
    Box::new(tokens)
}
16
17pub fn shingle_text<'a>(text: &'a str, size: usize) -> impl Iterator<Item = &'a str> {
18    Shingles::new(text, size)
19}
20
21
22pub fn shingle_text_range<'a>(text: &'a str, from: usize, to: usize) -> impl Iterator<Item = &'a str> {
23    MultiShingles::new(text, from, to)
24}
25
26pub fn shingle_text_boxed<'a>(text: &'a str, size: usize) -> Box<dyn Iterator<Item = &'a str> + 'a> {
27    Box::new(Shingles::new(text, size))
28}
29
30
31pub fn shingle_tokens<'a>(tokens: &'a Vec<&'a str>, size: usize) -> impl Iterator<Item = String> {
32    Shingles::new(tokens.as_slice(), size)
33        .into_iter().map(|tokens| tokens.join(""))
34        .collect_vec()
35        .into_iter()
36}
37
38
39
40