layered_nlp/
ll_line.rs

1mod display;
2mod finish_with;
3mod ll_selection;
4pub mod x;
5
6pub use finish_with::FinishWith;
7pub use ll_selection::LLSelection;
8
9use crate::type_bucket::{self, AnyAttribute};
10use crate::type_id_to_many::TypeIdToMany;
11pub use display::LLLineDisplay;
12use std::fmt::{self, Write};
13use std::iter::FromIterator;
14use std::{collections::HashMap, rc::Rc};
15pub use x::{Attr, AttrEq};
16use x::{XForwards, XMatch};
17
/// [TextTag] classifies a single piece of text within a line.
///
/// Each piece of a line is sort of "tokenized" and each token is assigned a [TextTag] attribute.
///
/// The tag is a plain field-less enum, so it also implements [Eq] and
/// [std::hash::Hash]; this lets it be used as a set member or map key.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum TextTag {
    /// Natural number like `0`, `1200`, `0004`
    NATN,
    /// English sentence punctuation symbols.
    /// `,`, `.`, `!`, `;`, `:`, `?`, `'`, `"`
    PUNC,
    /// Any other symbol or emoji
    SYMB,
    /// A combination of unicode whitespaces
    SPACE,
    /// A word as identified by unicode word recognition rules.
    ///
    /// For example: `yello`, `Paris`, `don't`, `should've`
    WORD,
}
37
38#[derive(Debug)]
39pub enum LToken {
40    Text(String, TextTag),
41    /// TODO: something more interesting
42    Value,
43}
44
45#[derive(Debug)]
46pub struct LLToken {
47    pub(crate) token_idx: usize,
48    // token span position (not token index)
49    pub(crate) pos_starts_at: usize,
50    // token span position (not token index)
51    pub(crate) pos_ends_at: usize,
52    pub(crate) token: LToken,
53}
54
/// Pair of token indexes, in the order (starts at, ends at).
type LRange = (usize, usize);
/// Pair of token positions, in the order (starts at, ends at).
type PositionRange = (usize, usize);
59
60/// Top-level
61struct LLLineAttrs {
62    // "bi-map" / "tri-map"
63    ranges: TypeIdToMany<LRange>,
64    /// match_forwards uses [LLSelection::end_idx]
65    starts_at: Vec<TypeIdToMany<LRange>>,
66    /// match_backwards uses [LLSelection::start_idx]
67    ends_at: Vec<TypeIdToMany<LRange>>,
68    values: HashMap<LRange, type_bucket::TypeBucket>,
69}
70
/// A single match: the value that was found plus the start and end
/// positions it covers.
pub struct LLLineFind<'l, Found> {
    /// Position at which the match starts.
    start_pos_at: usize,
    /// Position at which the match ends.
    end_pos_at: usize,
    /// The matched value.
    found: Found,
    /// Carries the `'l` lifetime without storing a reference.
    _phantom: std::marker::PhantomData<&'l ()>,
}
77
78impl<'l, Found: fmt::Debug> fmt::Debug for LLLineFind<'l, Found> {
79    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80        f.debug_struct("LLLineFind")
81            .field("start", &self.start_pos_at)
82            .field("end", &self.end_pos_at)
83            .field("found", &self.found)
84            .finish()
85    }
86}
87
88impl<'l, Found> LLLineFind<'l, Found> {
89    pub fn range(&self) -> PositionRange {
90        (self.start_pos_at, self.end_pos_at)
91    }
92    pub fn attr(&self) -> &Found {
93        &self.found
94    }
95}
96
97/// Create using [layered_nlp::create_tokens] function.
98pub struct LLLine {
99    // how much do we actually need of the original Vec if much of the data is put into the bi-map?
100    ll_tokens: Vec<LLToken>,
101    attrs: LLLineAttrs,
102} 
103
104impl LLLine {
105    pub(crate) fn new(ll_tokens: Vec<LLToken>) -> Self {
106        let starts_at: Vec<TypeIdToMany<LRange>> =
107            (0..ll_tokens.len()).map(|_| Default::default()).collect();
108        let ends_at: Vec<TypeIdToMany<LRange>> =
109            (0..ll_tokens.len()).map(|_| Default::default()).collect();
110
111        let mut attrs = LLLineAttrs {
112            ranges: Default::default(),
113            starts_at,
114            ends_at,
115            values: Default::default(),
116        };
117
118        for (token_idx, ll_token) in ll_tokens.iter().enumerate() {
119            match &ll_token.token {
120                LToken::Text(text, tag) => {
121                    if text.chars().count() == 1 {
122                        // insert char automatically if just one char
123                        attrs.insert((token_idx, token_idx), text.chars().next().unwrap());
124                    }
125                    // insert TextTag automatically
126                    attrs.insert((token_idx, token_idx), tag.clone());
127                }
128                LToken::Value => {
129                    // nothing to do...
130                }
131            }
132        }
133
134        LLLine { ll_tokens, attrs }
135    }
136
137    pub fn run<R>(mut self, recognizer: &R) -> Self
138    where
139        R: Resolver,
140    {
141        // Empty line can't recognize anything since they can't create `LLSelection`
142        if self.ll_tokens.is_empty() {
143            return self;
144        }
145
146        let ll_line = Rc::new(self);
147
148        let assignments = recognizer.go(LLSelection {
149            ll_line: ll_line.clone(),
150            start_idx: 0,
151            end_idx: ll_line.ll_tokens().len() - 1,
152        });
153
154        self = Rc::try_unwrap(ll_line)
155            .map_err(drop)
156            .expect("there is no other Rc currently");
157
158        // store new attributes generated by the resolver
159        for LLCursorAssignment {
160            start_idx,
161            end_idx,
162            value,
163        } in assignments
164        {
165            self.attrs.insert((start_idx, end_idx), value);
166        }
167
168        self
169    }
170    pub(crate) fn add_any_attrs(
171        &mut self,
172        start_idx: usize,
173        end_idx: usize,
174        attrs: Vec<AnyAttribute>,
175    ) {
176        let range = (start_idx, end_idx);
177
178        for attr in attrs {
179            self.attrs
180                .starts_at
181                .get_mut(start_idx)
182                .expect("has initial starts_at value in bounds")
183                .insert_any_distinct(attr.type_id(), range);
184            self.attrs
185                .ends_at
186                .get_mut(end_idx)
187                .expect("has initial ends_at value in bounds")
188                .insert_any_distinct(attr.type_id(), range);
189            self.attrs.ranges.insert_any_distinct(attr.type_id(), range);
190            self.attrs
191                .values
192                .entry(range)
193                .or_default()
194                .insert_any_attribute(attr);
195        }
196    }
197
198    /// Get a reference to the ll line's ll tokens.
199    pub fn ll_tokens(&self) -> &[LLToken] {
200        &self.ll_tokens
201    }
202
203    /// Returns Attributes' information outside `LLLine`
204    /// "find"
205    pub fn find<'l, M: XMatch<'l>>(&'l self, matcher: &M) -> Vec<LLLineFind<'l, M::Out>> {
206        (0..self.ll_tokens.len())
207            .flat_map(|i| {
208                let forwards = XForwards { from_idx: i };
209
210                matcher
211                    .go(&forwards, &self)
212                    .into_iter()
213                    .map(move |(out, next_idx)| LLLineFind {
214                        start_pos_at: self.pos_start_at(i),
215                        end_pos_at: self.pos_end_at(next_idx.0),
216                        found: out,
217                        _phantom: std::marker::PhantomData,
218                    })
219            })
220            .collect()
221    }
222
223    fn pos_end_at(&self, idx: usize) -> usize {
224        self.ll_tokens
225            .get(idx)
226            .expect("pos_end_at in bounds")
227            .pos_ends_at
228    }
229    fn pos_start_at(&self, idx: usize) -> usize {
230        self.ll_tokens
231            .get(idx)
232            .expect("pos_start_at in bounds")
233            .pos_starts_at
234    }
235
236    /// Returns Attributes' information outside `LLLine`
237    pub fn query<'a, T: 'static>(&'a self) -> Vec<(LRange, String, Vec<&T>)> {
238        self.attrs
239            .ranges
240            .get::<T>()
241            .iter()
242            .map(|range| {
243                let text =
244                    String::from_iter(self.ll_tokens[range.0..=range.1].iter().map(|token| {
245                        match &token.token {
246                            LToken::Text(text, _) => text,
247                            LToken::Value => "",
248                        }
249                    }));
250
251                (
252                    *range,
253                    text,
254                    self.attrs.values[range].get::<T>().iter().collect(),
255                )
256            })
257            .collect()
258    }
259}
260
261impl LLLineAttrs {
262    fn insert<T: 'static + std::fmt::Debug>(&mut self, range: LRange, value: T) {
263        self.starts_at
264            .get_mut(range.0)
265            .expect("has initial starts_at value in bounds")
266            .insert_distinct::<T>(range);
267        self.ends_at
268            .get_mut(range.1)
269            .expect("has initial ends_at value in bounds")
270            .insert_distinct::<T>(range);
271        self.ranges.insert_distinct::<T>(range);
272        self.values.entry(range).or_default().insert(value);
273    }
274}
275
276#[track_caller]
277fn assert_ll_lines_equals(first: &Rc<LLLine>, second: &Rc<LLLine>) {
278    if !Rc::ptr_eq(first, second) {
279        panic!("Two different lines used")
280    }
281}
282
// TODO rename
/// A value paired with the token range it is to be attached to.
#[derive(Debug)]
pub struct LLCursorAssignment<Attr> {
    // private
    /// Token index at which the range starts.
    start_idx: usize,
    /// Token index at which the range ends.
    end_idx: usize,
    // provided from resolver
    /// The value to attach to the range.
    value: Attr,
}
292
293pub trait Resolver {
294    /// The kind of value that this resolver will assign into the LLLine.
295    ///
296    /// It is constrained to [std::fmt::Debug] in order to ensure that it's easy
297    /// to debug with [layered_nlp::LLLineDisplay].
298    type Attr: std::fmt::Debug + 'static;
299    /// How to perform the assignments.
300    fn go(&self, selection: LLSelection) -> Vec<LLCursorAssignment<Self::Attr>>;
301}