gliner/model/input/
tokenized.rs1use composable::Composable;
2use crate::util::result::Result;
3use crate::text::token::Token;
4use crate::text::splitter::Splitter;
5use super::text::TextInput;
6
7
/// A batch of input texts that have been split into tokens,
/// together with the entity labels to look for.
pub struct TokenizedInput {
    /// Tokens per input text — one inner `Vec<Token>` for each entry in `texts`.
    pub tokens: Vec<Vec<Token>>,
    /// The original input texts, carried over unchanged from `TextInput`.
    pub texts: Vec<String>,
    /// Entity labels, carried over unchanged from `TextInput`.
    pub entities: Vec<String>,
}
17
18
19impl TokenizedInput {
20
21 pub fn from(input: TextInput, splitter: &impl Splitter, max_length: Option<usize>) -> Result<Self> {
22 let mut tokens = Vec::with_capacity(input.texts.len());
24 for s in &input.texts {
25 tokens.push(splitter.split(s, max_length)?);
26 }
27
28 Ok(Self {
29 tokens,
30 texts: input.texts,
31 entities: input.entities,
32 })
33 }
34}
35
/// Pipeline step that converts a raw `TextInput` into a `TokenizedInput`
/// by tokenizing each text with a borrowed splitter.
pub struct RawToTokenized<'a, S> {
    /// Splitter used to tokenize each input text.
    splitter: &'a S,
    /// Forwarded to the splitter on every `split` call.
    max_length: Option<usize>
}
41
42impl<'a, S> RawToTokenized<'a, S> {
43 pub fn new(splitter: &'a S, max_length: Option<usize>) -> Self {
44 Self {
45 splitter,
46 max_length
47 }
48 }
49}
50
impl<S: Splitter> Composable<TextInput, TokenizedInput> for RawToTokenized<'_, S> {
    /// Tokenizes `input` by delegating to [`TokenizedInput::from`] with the
    /// stored splitter and `max_length`.
    fn apply(&self, input: TextInput) -> Result<TokenizedInput> {
        TokenizedInput::from(input, self.splitter, self.max_length)
    }
}
56
57
58
#[cfg(test)]
mod tests {
    use super::*;

    /// Splits a two-sentence batch with the default regex splitter and checks
    /// token counts, texts, and character offsets.
    #[test]
    fn test() -> Result<()> {
        let splitter = crate::text::splitter::RegexSplitter::default();
        let batch = ["This is a text", "This is another one"];
        let entities = ["person", "place"];
        let input = TextInput::from_str(&batch, &entities)?;
        let tokenized = TokenizedInput::from(input, &splitter, None)?;
        assert_eq!(tokenized.tokens.len(), 2);
        // First sentence: four tokens, starting with "This" at offset 0.
        assert_eq!(tokenized.tokens[0].len(), 4);
        assert_eq!(tokenized.tokens[0][0].text(), "This");
        assert_eq!(tokenized.tokens[0][0].start(), 0);
        // Second sentence: four tokens, ending with "one" flush at the end.
        assert_eq!(tokenized.tokens[1].len(), 4);
        assert_eq!(tokenized.tokens[1][3].text(), "one");
        assert_eq!(tokenized.tokens[1][3].end(), batch[1].len());
        Ok(())
    }
}