//! gliner/model/input/tokenized.rs

1use composable::Composable;
2use crate::util::result::Result;
3use crate::text::token::Token;
4use crate::text::splitter::Splitter;
5use super::text::TextInput;
6
7
/// Represents the output of the word-level segmentation.
///
/// Carries the per-sequence tokens alongside the original texts and
/// entity labels they were produced from.
pub struct TokenizedInput {
    /// Tokens, one inner `Vec` per input sequence (parallel to `texts`)
    pub tokens: Vec<Vec<Token>>,
    /// Original sequences
    pub texts: Vec<String>,
    /// Original entities
    pub entities: Vec<String>,
}
17
18
19impl TokenizedInput {
20
21    pub fn from(input: TextInput, splitter: &impl Splitter, max_length: Option<usize>) -> Result<Self> {
22        // leverage the given `Splitter` to tokenize each input sequence
23        let mut tokens = Vec::with_capacity(input.texts.len());
24        for s in &input.texts {
25            tokens.push(splitter.split(s, max_length)?);
26        }
27
28        Ok(Self {
29            tokens,
30            texts: input.texts,            
31            entities: input.entities,
32        })
33    }
34}
35
/// Composable step: converts a raw [`TextInput`] into a [`TokenizedInput`].
///
/// Borrows the splitter, so a single splitter instance can be shared
/// by several pipeline stages.
pub struct RawToTokenized<'a, S> {
    /// Word-level tokenizer used for the conversion
    splitter: &'a S,
    /// Optional maximum length forwarded to the splitter
    max_length: Option<usize>
}
41
42impl<'a, S> RawToTokenized<'a, S> {
43    pub fn new(splitter: &'a S, max_length: Option<usize>) -> Self {
44        Self { 
45            splitter, 
46            max_length
47        }
48    }
49}
50
impl<S: Splitter> Composable<TextInput, TokenizedInput> for RawToTokenized<'_, S> {
    /// Applies the tokenization step by delegating to [`TokenizedInput::from`].
    fn apply(&self, input: TextInput) -> Result<TokenizedInput> {
        TokenizedInput::from(input, self.splitter, self.max_length)
    }
}
56
57
58
/// Unit tests
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test() -> Result<()> {
        // Silence some clippy warnings for unit tests
        #![allow(clippy::get_first)]
        #![allow(clippy::unwrap_used)]
        // Tokenize a small two-sentence batch with the default regex splitter
        let splitter = crate::text::splitter::RegexSplitter::default();
        let batch = ["This is a text", "This is another one"];
        let entities = ["person", "place"];
        let input = TextInput::from_str(&batch, &entities)?;
        let tokenized = TokenizedInput::from(input, &splitter, None)?;
        // One token vector per input sequence
        assert_eq!(tokenized.tokens.len(), 2);
        // First sequence: 4 tokens, starting with "This" at offset 0
        assert_eq!(tokenized.tokens.get(0).unwrap().len(), 4);
        assert_eq!(tokenized.tokens.get(0).unwrap().get(0).unwrap().text(), "This");
        assert_eq!(tokenized.tokens.get(0).unwrap().get(0).unwrap().start(), 0);
        // Second sequence: 4 tokens, ending with "one" at the end of the text
        assert_eq!(tokenized.tokens.get(1).unwrap().len(), 4);
        assert_eq!(tokenized.tokens.get(1).unwrap().get(3).unwrap().text(), "one");
        assert_eq!(tokenized.tokens.get(1).unwrap().get(3).unwrap().end(), batch[1].len());
        // Everything rules
        Ok(())
    }
}