creature_feature/
n_gram.rs

1use crate::accum_ftzr::{Ftzr, IterFtzr, LinearFixed};
2#[cfg(feature = "serde")]
3use serde::{Deserialize, Serialize};
4use std::convert::{TryFrom, TryInto};
5
/// Marker type selecting fixed-length n-grams of width `N` over copied data.
///
/// The struct carries no fields; the window width lives entirely in the const
/// parameter `N`. Obtain a value with `n_gram::<N>()`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NGram<const N: usize>();
10
/// The associated iterator for `<NGram<N> as IterFtzr<T>>::Iter`.
///
/// Slides a window of `N` consecutive elements over `data`, advancing one
/// element per step, and yields each window converted into an owned `[T; N]`.
#[derive(Hash, Copy, Clone, PartialEq, Ord, PartialOrd, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NGramIter<'a, T, const N: usize> {
    /// Start index of the next window to yield.
    idx: usize,
    /// The borrowed sequence being scanned.
    data: &'a [T],
}

impl<'a, T: 'a, const N: usize> Iterator for NGramIter<'a, T, N>
where
    [T; N]: TryFrom<&'a [T]>,
{
    type Item = [T; N];

    /// Returns the next window of `N` elements, or `None` once fewer than `N`
    /// elements remain past the current position.
    ///
    /// # Panics
    ///
    /// Panics if the `TryFrom<&[T]>` conversion rejects a slice of exactly
    /// `N` elements.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the reference out so the window borrows for `'a`, not for `&mut self`.
        let data: &'a [T] = self.data;
        // `checked_add` + `get` end the iteration instead of overflowing or
        // indexing out of bounds when `idx + N` does not fit in the slice.
        let end = self.idx.checked_add(N)?;
        let window = data.get(self.idx..end)?;
        // The error type is not required to be `Debug`, so it is erased first.
        let gram = <[T; N]>::try_from(window)
            .map_err(|_| ())
            .expect("Error converting from slice to [T;N]");
        self.idx += 1;
        Some(gram)
    }

    /// Reports the exact number of windows still to be yielded, which lets
    /// consumers such as `collect` reserve space up front.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `slack` is how many elements lie beyond the end of the next window;
        // `None` means the next window no longer fits.
        let slack = self
            .idx
            .checked_add(N)
            .and_then(|end| self.data.len().checked_sub(end));
        match slack {
            Some(extra) => (extra.saturating_add(1), extra.checked_add(1)),
            None => (0, Some(0)),
        }
    }
}
40
41impl<'a, T, const N: usize> IterFtzr<&'a [T]> for NGram<N>
42where
43    [T; N]: TryFrom<&'a [T]>,
44{
45    type TokenGroup = [T; N];
46    type Iter = NGramIter<'a, T, N>;
47
48    fn iterate_features(&self, origin: &'a [T]) -> Self::Iter {
49        NGramIter {
50            idx: 0,
51            data: origin,
52        }
53    }
54}
55
56impl<const N: usize> LinearFixed for NGram<N> {
57    fn chunk_size(&self) -> usize {
58        N
59    }
60}
61
62impl<'a, T, const N: usize> IterFtzr<&'a Vec<T>> for NGram<N>
63where
64    [T; N]: TryFrom<&'a [T]>,
65{
66    type TokenGroup = [T; N];
67    type Iter = NGramIter<'a, T, N>;
68
69    fn iterate_features(&self, origin: &'a Vec<T>) -> Self::Iter {
70        NGramIter {
71            idx: 0,
72            data: origin.as_slice(),
73        }
74    }
75}
76
77impl<'a, const N: usize> IterFtzr<&'a str> for NGram<N>
78where
79    [u8; N]: TryFrom<&'a [u8]>,
80{
81    type TokenGroup = [u8; N];
82    type Iter = NGramIter<'a, u8, N>;
83
84    fn iterate_features(&self, origin: &'a str) -> Self::Iter {
85        NGramIter {
86            idx: 0,
87            data: origin.as_bytes(),
88        }
89    }
90}
91
92impl<'a, const N: usize> IterFtzr<&'a String> for NGram<N>
93where
94    [u8; N]: TryFrom<&'a [u8]>,
95{
96    type TokenGroup = [u8; N];
97    type Iter = NGramIter<'a, u8, N>;
98
99    fn iterate_features(&self, origin: &'a String) -> Self::Iter {
100        self.iterate_features(origin.as_str())
101    }
102}
103
104impl<'a, T, const N: usize, const M: usize> IterFtzr<&'a [T; M]> for NGram<N>
105where
106    [T; N]: TryFrom<&'a [T]>,
107{
108    type TokenGroup = [T; N];
109    type Iter = NGramIter<'a, T, N>;
110
111    fn iterate_features(&self, origin: &'a [T; M]) -> Self::Iter {
112        NGramIter {
113            idx: 0,
114            data: &origin[..],
115        }
116    }
117}
118
119/// general n-grams over copied data, produces owned data (like String) or multiple `[T; N]`. (Compare to `n_slice`)
120/// ```
121/// use creature_feature::ftzrs::n_gram;
122///
123/// let my_ftzr = n_gram::<7>();
124///
125/// let feats: Vec<[T; 7]> = my_ftzr.featurize(my_data);
126/// let feats: Vec<String> = my_ftzr.featurize(my_other_data);
127/// ```
128pub fn n_gram<const N: usize>() -> NGram<N> {
129    NGram::<N>()
130}
131
132/// bigrams over copied data, produces owned data (like String) or multiple `[T; 2]`. (Compare to `bislice`)
133pub fn bigram() -> NGram<2> {
134    NGram::<2>()
135}
136
137/// trigrams over copied data, produces owned data (like String) or multiple `[T; 3]`. (Compare to `trislice`)
138pub fn trigram() -> NGram<3> {
139    NGram::<3>()
140}
141
142impl<Origin, const N: usize> Ftzr<Origin> for NGram<N>
143where
144    Self: IterFtzr<Origin>,
145{
146    type TokenGroup = <Self as IterFtzr<Origin>>::TokenGroup;
147    fn push_tokens<Push>(&self, origin: Origin, push: &mut Push)
148    where
149        Push: FnMut(Self::TokenGroup) -> (),
150    {
151        for t in self.iterate_features(origin) {
152            push(t)
153        }
154    }
155}