// layered_nlp/create_tokens.rs

1mod get_word_tag;
2
3use crate::ll_line::{LLLine, LLToken, LToken, TextTag};
4use crate::type_bucket::AnyAttribute;
5use unicode_segmentation::UnicodeSegmentation;
6
/// A single unit of input used to build an `LLLine`.
pub enum InputToken {
    /// A run of text that will be segmented into multiple tokens.
    Text {
        /// This text will be split up and TextTags will be added to its pieces
        text: String,
        /// Custom attributes
        attrs: Vec<AnyAttribute>,
    },
    /// A non-text token whose positional size is supplied directly
    /// (it becomes a single token; it is never split).
    Custom {
        /// Position relative size
        size: usize,
        /// Custom attributes
        attrs: Vec<AnyAttribute>,
    },
}
21
22impl InputToken {
23    pub fn text(text: String, attrs: Vec<AnyAttribute>) -> Self {
24        InputToken::Text { text, attrs }
25    }
26
27    pub fn custom(size: usize, attrs: Vec<AnyAttribute>) -> Self {
28        InputToken::Custom { size, attrs }
29    }
30
31    pub fn add_attr<T: 'static + std::fmt::Debug>(&mut self, value: T) {
32        match self {
33            InputToken::Text { attrs, .. } => attrs.push(AnyAttribute::new(value)),
34            InputToken::Custom { attrs, .. } => attrs.push(AnyAttribute::new(value)),
35        }
36    }
37}
38
/// Splits the `InputToken`s and generates `TextTag`s.
#[deprecated = "Use create_line_from_input_tokens"]
pub fn create_tokens<F>(input: Vec<InputToken>, get_text_size: F) -> LLLine
where
    F: Fn(&str) -> usize,
{
    // Thin forwarding shim kept for backwards compatibility with older callers.
    create_line_from_input_tokens(input, get_text_size)
}
47
48pub fn create_line_from_input_tokens<F>(input: Vec<InputToken>, get_text_size: F) -> LLLine
49where
50    F: Fn(&str) -> usize,
51{
52    let mut start_idx_end_idx_attributes: Vec<(usize, usize, Vec<AnyAttribute>)> = Vec::new();
53    let mut lltokens: Vec<LLToken> = Vec::new();
54    let mut current_size = 0;
55
56    for (ltokens, attrs) in input.into_iter().map(|input_token| match input_token {
57        InputToken::Text { text, attrs } => {
58            (create_tokens_for_string(&text, &get_text_size), attrs)
59        }
60        InputToken::Custom { size, attrs } => (vec![(LToken::Value, size)], attrs),
61    }) {
62        assert!(
63            !ltokens.is_empty(),
64            "Cannot create a LLLine from empty String"
65        );
66
67        let from_idx = lltokens.len();
68        for (ltoken, size) in ltokens {
69            let next_size = current_size + size;
70            lltokens.push(LLToken {
71                token_idx: lltokens.len(),
72                pos_starts_at: current_size,
73                pos_ends_at: next_size,
74                token: ltoken,
75            });
76
77            current_size = next_size;
78        }
79        let to_idx = lltokens.len() - 1;
80
81        start_idx_end_idx_attributes.push((from_idx, to_idx, attrs));
82    }
83
84    let mut ll_line = LLLine::new(lltokens);
85    for (start_idx, end_idx, attributes) in start_idx_end_idx_attributes {
86        ll_line.add_any_attrs(start_idx, end_idx, attributes);
87    }
88
89    ll_line
90}
91
92// helper for [create_tokens]
93fn create_tokens_for_string<F>(input: &str, get_text_size: F) -> Vec<(LToken, usize)>
94where
95    F: Fn(&str) -> usize,
96{
97    // `fold` because we end up splitting more than just unicode word boundaries
98    input
99        .split_word_bounds()
100        .fold(Vec::new(), |mut ltokens, unicode_word| {
101            // Split apart digit word boundaries, because unicode `split_word_bounds` will group digits and commas and points together
102            // such as "12,3" and "10.0". We need these to be split up further into ["12", ",", "3"] and ["10", ".", "0"] respectively.
103            // http://www.unicode.org/reports/tr29/#Word_Boundaries
104            // if \d+[,\.a-zA-Z]\d+ or more repeats (3 is minimum)
105            if unicode_word.chars().next().unwrap().is_ascii_digit() {
106                // make assumtions about the length of every char being 1
107                assert!(
108                    unicode_word.is_ascii(),
109                    "Unexpected non-ascii digit word boundary: {}",
110                    unicode_word
111                );
112
113                let mut collected_digits = String::new();
114                let mut collected_letters = String::new();
115
116                // using a macro since pulling this out into a closure or function would be very verbose
117                // as you'd have to pass references to collected_digits, get_text_size, ltokens
118                macro_rules! insert_collected_digits {
119                    () => {
120                        if collected_digits.len() > 0 {
121                            let size = get_text_size(&collected_digits);
122                            ltokens.push((
123                                LToken::Text(std::mem::take(&mut collected_digits), TextTag::NATN),
124                                size,
125                            ));
126                        }
127                    };
128                }
129
130                macro_rules! insert_collected_letters {
131                    () => {
132                        if collected_letters.len() > 0 {
133                            let size = get_text_size(&collected_letters);
134                            ltokens.push((
135                                LToken::Text(std::mem::take(&mut collected_letters), TextTag::WORD),
136                                size,
137                            ));
138                        }
139                    };
140                }
141
142                for ch in unicode_word.chars() {
143                    if ch.is_ascii_digit() {
144                        insert_collected_letters!();
145                        collected_digits.push(ch);
146                    } else if ch.is_alphabetic() {
147                        insert_collected_digits!();
148                        collected_letters.push(ch);
149                    } else {
150                        insert_collected_letters!();
151                        insert_collected_digits!();
152                        let text = String::from(ch);
153                        let size = get_text_size(&text);
154                        ltokens.push((LToken::Text(text, TextTag::PUNC), size));
155                    }
156                }
157
158                insert_collected_letters!();
159                insert_collected_digits!();
160            } else {
161                ltokens.push((
162                    LToken::Text(
163                        unicode_word.to_string(),
164                        get_word_tag::get_unicode_word_tag(unicode_word),
165                    ),
166                    get_text_size(unicode_word),
167                ));
168            }
169
170            ltokens
171        })
172}
173
#[cfg(test)]
mod test {
    // Call the non-deprecated entry point directly so the test module
    // builds clean under `-D warnings` (create_tokens is #[deprecated]).
    use super::{create_line_from_input_tokens, InputToken};
    use crate::ll_line::LLLineDisplay;
    use crate::type_bucket::AnyAttribute;

    /// Sample formatting attribute used to exercise attribute spans.
    #[derive(Debug, Clone)]
    enum MarkKind {
        Italic,
        Bold,
    }

    /// Sample attribute carrying a payload, to check Debug rendering in snapshots.
    #[derive(Debug, Clone)]
    struct Link {
        href: String,
    }

    #[test]
    fn test_create_tokens() {
        // Three text runs; attributes on the middle run must span only its tokens.
        let ll_line = create_line_from_input_tokens(
            vec![
                InputToken::Text {
                    text: String::from("Hello, "),
                    attrs: Vec::new(),
                },
                InputToken::Text {
                    text: String::from("World"),
                    attrs: vec![
                        AnyAttribute::new(MarkKind::Bold),
                        AnyAttribute::new(MarkKind::Italic),
                    ],
                },
                InputToken::Text {
                    text: String::from("!"),
                    attrs: vec![],
                },
            ],
            |text| text.len(),
        );

        let mut ll_line_display = LLLineDisplay::new(&ll_line);
        ll_line_display.include::<MarkKind>();

        insta::assert_display_snapshot!(ll_line_display, @r###"
        Hello  ,     World  !
                     ╰───╯Italic
                     ╰───╯Bold
        "###);
    }

    #[test]
    fn test_create_tokens_email() {
        // "name@example.com" is one input token; its attributes must span all pieces.
        let ll_line = create_line_from_input_tokens(
            vec![InputToken::Text {
                text: String::from("name@example.com"),
                attrs: vec![
                    AnyAttribute::new(MarkKind::Italic),
                    AnyAttribute::new(Link {
                        href: String::from("mailto:name@example.com"),
                    }),
                ],
            }],
            |text| text.len(),
        );

        // display insta test
        let mut ll_line_display = LLLineDisplay::new(&ll_line);
        ll_line_display.include::<MarkKind>();
        ll_line_display.include::<Link>();

        insta::assert_display_snapshot!(ll_line_display, @r###"
        name  @  example.com
        ╰──────────────────╯Italic
        ╰──────────────────╯Link { href: "mailto:name@example.com" }
        "###);
    }
}