creature_feature/
n_gram.rs1use crate::accum_ftzr::{Ftzr, IterFtzr, LinearFixed};
2#[cfg(feature = "serde")]
3use serde::{Deserialize, Serialize};
4use std::convert::{TryFrom, TryInto};
5
/// Zero-sized featurizer marker whose const parameter `N` is the n-gram width.
///
/// The width is encoded purely in the type, so values of this struct carry no
/// runtime data; construct one with `NGram::<N>()` or `Default::default()`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NGram<const N: usize>();
10
/// Iterator over every window of `N` consecutive elements of a borrowed slice.
///
/// Windows overlap: each call to `next` advances the start position by one
/// element, so a slice of length `len >= N` yields `len - N + 1` arrays.
// NOTE: field order is significant for the derived `Ord`/`PartialOrd`/`Hash`.
#[derive(Hash, Copy, Clone, PartialEq, Ord, PartialOrd, Eq, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NGramIter<'a, T, const N: usize> {
    /// Start offset of the next window to yield.
    idx: usize,
    /// The slice being scanned.
    data: &'a [T],
}

impl<'a, T: 'a, const N: usize> Iterator for NGramIter<'a, T, N>
where
    [T; N]: TryFrom<&'a [T]>,
{
    type Item = [T; N];

    /// Yields the window starting at the current offset, or `None` once fewer
    /// than `N` elements remain.
    ///
    /// # Panics
    ///
    /// Panics if the `TryFrom` conversion of an exactly-`N`-long slice fails.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the reference out so the window borrows for `'a`, not for `&mut self`.
        let data: &'a [T] = self.data;
        // `checked_add` + `get` turn "window runs past the end" (including
        // arithmetic overflow of the end offset) into a plain `None`.
        let end = self.idx.checked_add(N)?;
        let window = data.get(self.idx..end)?;
        // The conversion error type is not known to be `Debug`, so it is erased
        // before `expect`.
        let group = <[T; N]>::try_from(window)
            .map_err(|_| ())
            .expect("Error converting from slice to [T;N]");
        self.idx += 1;
        Some(group)
    }

    /// Reports the number of windows still to be yielded, letting consumers
    /// such as `collect` preallocate.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `slack` is how many further start positions exist beyond the current one.
        let slack = self
            .data
            .len()
            .checked_sub(self.idx)
            .and_then(|tail| tail.checked_sub(N));
        match slack {
            None => (0, Some(0)),
            Some(slack) => match slack.checked_add(1) {
                Some(remaining) => (remaining, Some(remaining)),
                // More than `usize::MAX` windows remain; no finite upper bound fits.
                None => (usize::MAX, None),
            },
        }
    }
}
40
41impl<'a, T, const N: usize> IterFtzr<&'a [T]> for NGram<N>
42where
43 [T; N]: TryFrom<&'a [T]>,
44{
45 type TokenGroup = [T; N];
46 type Iter = NGramIter<'a, T, N>;
47
48 fn iterate_features(&self, origin: &'a [T]) -> Self::Iter {
49 NGramIter {
50 idx: 0,
51 data: origin,
52 }
53 }
54}
55
56impl<const N: usize> LinearFixed for NGram<N> {
57 fn chunk_size(&self) -> usize {
58 N
59 }
60}
61
62impl<'a, T, const N: usize> IterFtzr<&'a Vec<T>> for NGram<N>
63where
64 [T; N]: TryFrom<&'a [T]>,
65{
66 type TokenGroup = [T; N];
67 type Iter = NGramIter<'a, T, N>;
68
69 fn iterate_features(&self, origin: &'a Vec<T>) -> Self::Iter {
70 NGramIter {
71 idx: 0,
72 data: origin.as_slice(),
73 }
74 }
75}
76
77impl<'a, const N: usize> IterFtzr<&'a str> for NGram<N>
78where
79 [u8; N]: TryFrom<&'a [u8]>,
80{
81 type TokenGroup = [u8; N];
82 type Iter = NGramIter<'a, u8, N>;
83
84 fn iterate_features(&self, origin: &'a str) -> Self::Iter {
85 NGramIter {
86 idx: 0,
87 data: origin.as_bytes(),
88 }
89 }
90}
91
92impl<'a, const N: usize> IterFtzr<&'a String> for NGram<N>
93where
94 [u8; N]: TryFrom<&'a [u8]>,
95{
96 type TokenGroup = [u8; N];
97 type Iter = NGramIter<'a, u8, N>;
98
99 fn iterate_features(&self, origin: &'a String) -> Self::Iter {
100 self.iterate_features(origin.as_str())
101 }
102}
103
104impl<'a, T, const N: usize, const M: usize> IterFtzr<&'a [T; M]> for NGram<N>
105where
106 [T; N]: TryFrom<&'a [T]>,
107{
108 type TokenGroup = [T; N];
109 type Iter = NGramIter<'a, T, N>;
110
111 fn iterate_features(&self, origin: &'a [T; M]) -> Self::Iter {
112 NGramIter {
113 idx: 0,
114 data: &origin[..],
115 }
116 }
117}
118
119pub fn n_gram<const N: usize>() -> NGram<N> {
129 NGram::<N>()
130}
131
132pub fn bigram() -> NGram<2> {
134 NGram::<2>()
135}
136
137pub fn trigram() -> NGram<3> {
139 NGram::<3>()
140}
141
142impl<Origin, const N: usize> Ftzr<Origin> for NGram<N>
143where
144 Self: IterFtzr<Origin>,
145{
146 type TokenGroup = <Self as IterFtzr<Origin>>::TokenGroup;
147 fn push_tokens<Push>(&self, origin: Origin, push: &mut Push)
148 where
149 Push: FnMut(Self::TokenGroup) -> (),
150 {
151 for t in self.iterate_features(origin) {
152 push(t)
153 }
154 }
155}