stam/api/
text.rs

1/*
2    STAM Library (Stand-off Text Annotation Model)
3        by Maarten van Gompel <proycon@anaproy.nl>
4        Digital Infrastucture, KNAW Humanities Cluster
5
6        Licensed under the GNU General Public License v3
7
8        https://github.com/annotation/stam-rust
9*/
10
11//! This module contains the high-level API for finding text. The `FindText` trait embodies this API and is implemented on
12//! [`ResultItem<TextResource>`] and [`ResultTextSelection`]. It builds upon the lower-level [`Text`] trait.
13
14use crate::annotationstore::AnnotationStore;
15use crate::config::Configurable;
16use crate::error::StamError;
17use crate::resources::{TextResource, TextResourceHandle};
18use crate::selector::Offset;
19use crate::store::*;
20use crate::text::Text;
21use crate::textselection::ResultTextSelection;
22use crate::textselection::TextSelection;
23use crate::types::*;
24use regex::{Regex, RegexSet};
25use smallvec::{smallvec, SmallVec};
26
27/// This trait provides text-searching methods that operate on structures that hold or represent text content. It builds upon the lower-level [`Text`] trait.
28pub trait FindText<'store, 'slf>: Text<'store, 'slf>
29where
30    'store: 'slf,
31{
32    /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
33    /// is held by the [`FindRegexMatch`] struct.
34    ///
35    /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
36    /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
37    /// the entire expression is returned.
38    ///
39    /// The `allow_overlap` parameter determines if the matching expressions are allowed to
40    /// overlap. It you are doing some form of tokenisation, you also likely want this set to
41    /// false. All of this only matters if you supply multiple regular expressions.
42    ///
43    /// Results are returned in the exact order they are found in the text
44    fn find_text_regex<'regex>(
45        &'slf self,
46        expressions: &'regex [Regex],
47        precompiledset: Option<&RegexSet>,
48        allow_overlap: bool,
49    ) -> Result<FindRegexIter<'store, 'regex>, StamError>;
50
51    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
52    /// The iterator returns encapsulated [`TextSelection`] items as [`ResultTextSelection`].
53    ///
54    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
55    ///
56    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
57    /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
58    fn find_text<'fragment>(
59        &'slf self,
60        fragment: &'fragment str,
61    ) -> FindTextIter<'store, 'fragment>;
62
63    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
64    /// The iterator returns [`TextSelection`] items wrapped as [`ResultTextSelection`].
65    ///
66    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
67    ///
68    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
69    /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
70    fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store>;
71
72    /// Searches for the multiple text fragment in sequence. Returns a vector with
73    /// [`TextSelection`] instances wrapped as [`ResultTextSelection`].
74    ///
75    /// Matches must appear in the exact order specified, but *may* have other intermittent text,
76    /// determined by the `allow_skip_char` closure. A recommended closure for natural language
77    /// text is: `|c| !c.is_alphabetic()`
78    ///
79    /// The `case_sensitive` parameter determines if the search is case sensitive or not, case insensitive searches have a performance penalty.
80    fn find_text_sequence<'fragment, F>(
81        &'slf self,
82        fragments: &'fragment [&'fragment str],
83        allow_skip_char: F,
84        case_sensitive: bool,
85    ) -> Option<Vec<ResultTextSelection<'store>>>
86    where
87        F: Fn(char) -> bool,
88    {
89        let mut results: Vec<ResultTextSelection<'store>> = Vec::with_capacity(fragments.len());
90        let mut begin: usize = 0;
91        let mut textselectionresult = self.textselection(&Offset::whole());
92        for fragment in fragments {
93            if let Ok(searchtext) = textselectionresult {
94                if let Some(m) = if case_sensitive {
95                    searchtext.find_text(fragment).next()
96                } else {
97                    searchtext.find_text_nocase(fragment).next()
98                } {
99                    if m.begin() > begin {
100                        //we skipped some text since last match, check the characters in between matches
101                        let skipped_text = self
102                            .textselection(&Offset::simple(begin, m.begin()))
103                            .expect("textselection must succeed")
104                            .text();
105                        for c in skipped_text.chars() {
106                            if !allow_skip_char(c) {
107                                return None;
108                            }
109                        }
110                    }
111                    begin = m.end();
112                    results.push(m);
113                } else {
114                    return None;
115                }
116                //slice (shorten) new text for next test
117                textselectionresult = searchtext.textselection(&Offset::new(
118                    Cursor::BeginAligned(begin - searchtext.begin()), //offset must be relative
119                    Cursor::EndAligned(0),
120                ));
121            } else {
122                return None;
123            }
124        }
125
126        Some(results)
127    }
128
129    /// Returns an iterator of [`TextSelection`] instances that represent partitions
130    /// of the text given the specified delimiter. No text is modified.
131    ///
132    /// The iterator returns [`TextSelection`] items as a fat pointer [`ResultTextSelection`]).
133    fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b>;
134
135    /// Trims all occurrences of any character in `chars` from both the beginning and end of the text,
136    /// returning a smaller [`TextSelection`] (as a fat pointer [`ResultTextSelection`]). No text is modified.
137    fn trim_text(&'slf self, chars: &[char]) -> Result<ResultTextSelection<'store>, StamError> {
138        let mut trimbegin = 0;
139        let mut trimend = 0;
140        for c in self.text().chars() {
141            if chars.contains(&c) {
142                trimbegin += 1;
143            } else {
144                break;
145            }
146        }
147        for c in self.text().chars().rev() {
148            if chars.contains(&c) {
149                trimend -= 1;
150            } else {
151                break;
152            }
153        }
154        self.textselection(&Offset::new(
155            Cursor::BeginAligned(trimbegin),
156            Cursor::EndAligned(trimend),
157        ))
158    }
159
160    /// Trims all occurrences of any character `chars` that pass the supplied function, from both the beginning and end of the text,
161    /// returning a smaller [`TextSelection`] (as a fat pointer [`ResultTextSelection`]). No text is modified.
162    fn trim_text_with<F>(&'slf self, f: F) -> Result<ResultTextSelection<'store>, StamError>
163    where
164        F: Fn(char) -> bool,
165    {
166        let mut trimbegin = 0;
167        let mut trimend = 0;
168        for c in self.text().chars() {
169            if f(c) {
170                trimbegin += 1;
171            } else {
172                break;
173            }
174        }
175        for c in self.text().chars().rev() {
176            if f(c) {
177                trimend -= 1;
178            } else {
179                break;
180            }
181        }
182        self.textselection(&Offset::new(
183            Cursor::BeginAligned(trimbegin),
184            Cursor::EndAligned(trimend),
185        ))
186    }
187
188    /// Returns a [`TextSelection`] that corresponds to the offset. If the TextSelection
189    /// exists, the existing one will be returned (as a copy, but it will have a [`TextSelection::handle()`].
190    /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
191
192    /// The [`TextSelection`] is returned as in a fat pointer ([`ResultTextSelection`]) that also contains reference to the underlying store.
193    ///
194    /// Use [`TextResource::known_textselection()`] instead if you want to limit to existing text selections on resources.
195    fn textselection(&'slf self, offset: &Offset)
196        -> Result<ResultTextSelection<'store>, StamError>;
197}
198
199/// this implementation mostly defers directly to the wrapped item, documentation is found on the trait and not repeated here
200impl<'store, 'slf> Text<'store, 'slf> for ResultItem<'store, TextResource>
201where
202    'store: 'slf,
203{
204    fn textlen(&self) -> usize {
205        self.as_ref().textlen()
206    }
207
208    fn text(&'slf self) -> &'store str {
209        self.as_ref().text()
210    }
211
212    fn text_by_offset(&self, offset: &Offset) -> Result<&'store str, StamError> {
213        self.as_ref().text_by_offset(offset)
214    }
215
216    fn absolute_cursor(&self, cursor: usize) -> usize {
217        cursor
218    }
219
220    fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
221        self.as_ref().utf8byte(abscursor)
222    }
223
224    fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
225        self.as_ref().utf8byte_to_charpos(bytecursor)
226    }
227}
228
229impl<'store, 'slf> FindText<'store, 'slf> for ResultItem<'store, TextResource>
230where
231    'store: 'slf,
232{
233    /// Returns a [`TextSelection`] that corresponds to the offset. If the TextSelection
234    /// exists, the existing one will be returned.
235    /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
236    ///
237    /// The [`TextSelection`] is returned in a fat pointer ([`ResultTextSelection`]) that also contains reference to the underlying store.
238    fn textselection(&self, offset: &Offset) -> Result<ResultTextSelection<'store>, StamError> {
239        match self.as_ref().known_textselection(offset) {
240            Ok(Some(handle)) => {
241                //existing textselection
242                let textselection: &TextSelection = self.as_ref().get(handle)?; //shouldn't fail here anymore
243                let wrapped = textselection.as_resultitem(self.as_ref(), self.rootstore());
244                Ok(ResultTextSelection::Bound(wrapped))
245            }
246            Ok(None) => {
247                let textselection: TextSelection =
248                    self.as_ref().textselection_by_offset_unchecked(offset)?;
249                Ok(ResultTextSelection::Unbound(
250                    self.rootstore(),
251                    self.as_ref(),
252                    textselection,
253                ))
254            }
255            Err(err) => Err(err), //an error occured, propagate
256        }
257    }
258
259    /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
260    /// is held by the [`FindRegexMatch`] struct.
261    ///
262    /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
263    /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
264    /// the entire expression is returned.
265    ///
266    /// The `allow_overlap` parameter determines if the matching expressions are allowed to
267    /// overlap. It you are doing some form of tokenisation, you also likely want this set to
268    /// false. All of this only matters if you supply multiple regular expressions.
269    ///
270    /// Results are returned in the exact order they are found in the text
271    fn find_text_regex<'regex>(
272        &self,
273        expressions: &'regex [Regex],
274        precompiledset: Option<&RegexSet>,
275        allow_overlap: bool,
276    ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
277        debug(self.as_ref().config(), || {
278            format!("find_text_regex: expressions={:?}", expressions)
279        });
280        let selectexpressions =
281            find_text_regex_select_expressions(self.text(), expressions, precompiledset)?;
282        //Returns an iterator that does the remainder of the actual searching
283        Ok(FindRegexIter {
284            resource: self.clone(),
285            expressions,
286            selectexpressions,
287            matchiters: Vec::new(),
288            nextmatches: Vec::new(),
289            text: self.text(),
290            begincharpos: 0,
291            beginbytepos: 0,
292            allow_overlap,
293        })
294    }
295
296    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
297    /// The iterator returns [`TextSelection`] items.
298    ///
299    /// This search is case sensitive, use [`FindText::find_text_nocase()`] to search case insensitive.
300    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
301    ///
302    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
303    /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
304    fn find_text<'fragment>(&self, fragment: &'fragment str) -> FindTextIter<'store, 'fragment> {
305        FindTextIter {
306            store: self.rootstore(),
307            resources: smallvec!(self.handle()),
308            resourcecursor: 0,
309            fragment,
310            offset: Offset::whole(),
311        }
312    }
313
314    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
315    /// The iterator returns [`TextSelection`] items.
316    ///
317    /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
318    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
319    ///
320    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
321    /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
322    fn find_text_nocase(&self, fragment: &str) -> FindNoCaseTextIter<'store> {
323        FindNoCaseTextIter {
324            store: self.rootstore(),
325            resources: smallvec!(self.handle()),
326            resourcecursor: 0,
327            fragment: fragment.to_lowercase(),
328            offset: Offset::whole(),
329        }
330    }
331
332    /// Splits the text of this resource given a delimiter, the resulting iterator yields [`TextSelection`] items (as [`ResultTextSelection`]).
333    fn split_text<'b>(&self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
334        SplitTextIter {
335            resource: self.clone(),
336            iter: self.text().split(delimiter),
337            byteoffset: 0,
338        }
339    }
340}
341
342impl<'store, 'slf> Text<'store, 'slf> for ResultItem<'store, TextSelection>
343where
344    'store: 'slf,
345{
346    fn text(&'slf self) -> &'store str {
347        let resource = self.store(); //courtesy of ResultItem
348        let beginbyte = resource
349            .utf8byte(self.begin())
350            .expect("utf8byte conversion should succeed");
351        let endbyte = resource
352            .utf8byte(self.end())
353            .expect("utf8byte conversion should succeed");
354        &resource.text()[beginbyte..endbyte]
355    }
356
357    fn textlen(&self) -> usize {
358        self.end() - self.begin()
359    }
360
361    /// Returns a string reference to a slice of text as specified by the offset
362    fn text_by_offset(&self, offset: &Offset) -> Result<&'store str, StamError> {
363        let beginbyte =
364            self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.begin)?))?;
365        let endbyte =
366            self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.end)?))?;
367        if endbyte < beginbyte {
368            Err(StamError::InvalidOffset(
369                Cursor::BeginAligned(beginbyte),
370                Cursor::BeginAligned(endbyte),
371                "End must be greater than or equal to begin. (Cursor should be interpreted as UTF-8 bytes in this error context only)",
372            ))
373        } else {
374            Ok(&self.text()[beginbyte..endbyte])
375        }
376    }
377
378    /// This converts a unicode point to utf-8 byte, all in *relative* offsets to this textselection
379    fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
380        //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
381        //and can just call this same method on [`TextResource`](crate::TextResource), which has the proper indices for this
382        let beginbyte = self
383            .store()
384            .subslice_utf8_offset(self.text())
385            .expect("subslice should succeed");
386        Ok(self.store().utf8byte(self.absolute_cursor(abscursor))? - beginbyte)
387    }
388
389    /// This converts utf-8 byte to charpos, all in *relative* offsets to this textselection
390    fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
391        //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
392        //and can just call this same method on [`TextResource`](crate::TextResource), which has the proper indices for this
393        let beginbyte = self
394            .store()
395            .subslice_utf8_offset(self.text())
396            .expect("subslice should succeed");
397        Ok(self
398            .store()
399            .utf8byte_to_charpos(self.absolute_cursor(beginbyte + bytecursor))?
400            - self.begin())
401    }
402
403    fn absolute_cursor(&self, cursor: usize) -> usize {
404        self.begin() + cursor
405    }
406}
407
408impl<'store, 'slf> FindText<'store, 'slf> for ResultItem<'store, TextSelection>
409where
410    'store: 'slf,
411{
412    /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
413    /// is held by the [`FindRegexMatch`] struct.
414    ///
415    /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
416    /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
417    /// the entire expression is returned.
418    ///
419    /// An `offset` can be specified to work on a sub-part rather than the entire text (like an existing TextSelection).
420    ///
421    /// The `allow_overlap` parameter determines if the matching expressions are allowed to
422    /// overlap. It you are doing some form of tokenisation, you also likely want this set to
423    /// false. All of this only matters if you supply multiple regular expressions.
424    ///
425    /// Results are returned in the exact order they are found in the text
426    fn find_text_regex<'regex>(
427        &'slf self,
428        expressions: &'regex [Regex],
429        precompiledset: Option<&RegexSet>,
430        allow_overlap: bool,
431    ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
432        debug(self.store().config(), || {
433            format!(
434                "TextSelection::find_text_regex: expressions={:?}",
435                expressions
436            )
437        });
438        let text = self.text();
439        let selectexpressions =
440            find_text_regex_select_expressions(text, expressions, precompiledset)?;
441        //Returns an iterator that does the remainder of the actual searching
442        Ok(FindRegexIter {
443            resource: self.resource(),
444            expressions,
445            selectexpressions,
446            matchiters: Vec::new(),
447            nextmatches: Vec::new(),
448            text: self.text(),
449            begincharpos: self.begin(),
450            beginbytepos: self
451                .store()
452                .subslice_utf8_offset(text)
453                .expect("Subslice must be found"),
454            allow_overlap,
455        })
456    }
457
458    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
459    /// The iterator returns [`TextSelection`] items.
460    ///
461    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
462    ///
463    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first and then run [`FindText::find_text()`] on that instead.
464    fn find_text<'fragment>(
465        &'slf self,
466        fragment: &'fragment str,
467    ) -> FindTextIter<'store, 'fragment> {
468        FindTextIter {
469            store: self.rootstore(),
470            resources: smallvec!(self.resource().handle()),
471            resourcecursor: 0,
472            fragment,
473            offset: Offset::from(self),
474        }
475    }
476
477    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
478    /// The iterator returns [`TextSelection`] items.
479    ///
480    /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
481    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
482    ///
483    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
484    /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
485    fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store> {
486        FindNoCaseTextIter {
487            store: self.rootstore(),
488            resources: smallvec!(self.resource().handle()),
489            resourcecursor: 0,
490            fragment: fragment.to_lowercase(),
491            offset: Offset::from(self),
492        }
493    }
494
495    fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
496        SplitTextIter {
497            resource: self.resource(),
498            iter: self.store().text().split(delimiter),
499            byteoffset: self
500                .subslice_utf8_offset(self.text())
501                .expect("subslice must succeed for split_text"),
502        }
503    }
504
505    /// Returns a [`TextSelection`] that corresponds to the offset **WITHIN** the textselection.
506    /// This returns a [`TextSelection`] with absolute coordinates in the resource.
507    ///
508    /// If the textselection is known (i.e. it has associated annotations), it will be returned as such with a handle (borrowed).
509    /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
510    ///
511    /// The [`TextSelection`] is returned as in a far pointer (`ResultItem`) that also contains reference to the underlying store (the [`TextResource`]).
512    fn textselection(
513        &'slf self,
514        offset: &Offset,
515    ) -> Result<ResultTextSelection<'store>, StamError> {
516        let resource = self.resource(); //courtesy of ResultItem
517        let offset = self.absolute_offset(&offset)?; //turns the relative offset into an absolute one (i.e. offsets in TextResource)
518        resource.textselection(&offset)
519    }
520}
521
522impl<'store, 'slf> Text<'store, 'slf> for ResultTextSelection<'store>
523where
524    'store: 'slf,
525{
526    fn text(&'slf self) -> &'store str {
527        let resource = self.store();
528        let beginbyte = resource
529            .utf8byte(self.begin())
530            .expect("utf8byte conversion should succeed");
531        let endbyte = resource
532            .utf8byte(self.end())
533            .expect("utf8byte conversion should succeed");
534        &resource.text()[beginbyte..endbyte]
535    }
536
537    fn textlen(&self) -> usize {
538        self.end() - self.begin()
539    }
540
541    /// Returns a string reference to a slice of text as specified by the offset
542    fn text_by_offset(&'slf self, offset: &Offset) -> Result<&'store str, StamError> {
543        let beginbyte =
544            self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.begin)?))?;
545        let endbyte =
546            self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.end)?))?;
547        if endbyte < beginbyte {
548            Err(StamError::InvalidOffset(
549                Cursor::BeginAligned(beginbyte),
550                Cursor::BeginAligned(endbyte),
551                "End must be greater than or equal to begin. (Cursor should be interpreted as UTF-8 bytes in this error context only)",
552            ))
553        } else {
554            Ok(&self.text()[beginbyte..endbyte])
555        }
556    }
557
558    /// Finds the utf-8 byte position where the specified text subslice begins
559    /// The returned offset is relative to the TextSelection
560    fn subslice_utf8_offset(&self, subslice: &str) -> Option<usize> {
561        let self_begin = self.text().as_ptr() as usize;
562        let sub_begin = subslice.as_ptr() as usize;
563        if sub_begin < self_begin || sub_begin > self_begin.wrapping_add(self.text().len()) {
564            None
565        } else {
566            Some(sub_begin.wrapping_sub(self_begin))
567        }
568    }
569
570    /// This converts a unicode point to utf-8 byte, all in *relative* offsets to this textselection
571    fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
572        //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
573        //and can just call this same method on TextResource, which has the proper indices for this
574        let beginbyte = self
575            .store()
576            .subslice_utf8_offset(self.text())
577            .expect("subslice should succeed");
578        Ok(self.store().utf8byte(self.absolute_cursor(abscursor))? - beginbyte)
579    }
580
581    /// This converts utf-8 byte to charpos, all in *relative* offsets to this textselection
582    fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
583        //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
584        //and can just call this same method on TextResource, which has the proper indices for this
585        let beginbyte = self
586            .store()
587            .subslice_utf8_offset(self.text())
588            .expect("subslice should succeed");
589        Ok(self
590            .store()
591            .utf8byte_to_charpos(self.absolute_cursor(beginbyte + bytecursor))?
592            - self.begin())
593    }
594
595    fn absolute_cursor(&self, cursor: usize) -> usize {
596        self.begin() + cursor
597    }
598}
599
600impl<'store, 'slf> FindText<'store, 'slf> for ResultTextSelection<'store>
601where
602    'store: 'slf,
603{
604    /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
605    /// is held by the [`FindRegexMatch`] struct.
606    ///
607    /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
608    /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
609    /// the entire expression is returned.
610    ///
611    /// An `offset` can be specified to work on a sub-part rather than the entire text (like an existing TextSelection).
612    ///
613    /// The `allow_overlap` parameter determines if the matching expressions are allowed to
614    /// overlap. It you are doing some form of tokenisation, you also likely want this set to
615    /// false. All of this only matters if you supply multiple regular expressions.
616    ///
617    /// Results are returned in the exact order they are found in the text
618    fn find_text_regex<'regex>(
619        &'slf self,
620        expressions: &'regex [Regex],
621        precompiledset: Option<&RegexSet>,
622        allow_overlap: bool,
623    ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
624        debug(self.store().config(), || {
625            format!(
626                "TextSelection::find_text_regex: expressions={:?}",
627                expressions
628            )
629        });
630        let text = self.text();
631        let selectexpressions =
632            find_text_regex_select_expressions(text, expressions, precompiledset)?;
633        //Returns an iterator that does the remainder of the actual searching
634        Ok(FindRegexIter {
635            resource: self.resource(),
636            expressions,
637            selectexpressions,
638            matchiters: Vec::new(),
639            nextmatches: Vec::new(),
640            text: self.text(),
641            begincharpos: self.begin(),
642            beginbytepos: self
643                .store()
644                .subslice_utf8_offset(text)
645                .expect("Subslice must be found"),
646            allow_overlap,
647        })
648    }
649
650    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
651    /// The iterator returns [`TextSelection`] items.
652    ///
653    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
654    ///
655    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first and then run [`self.find_text()`] on that instead.
656    fn find_text<'fragment>(
657        &'slf self,
658        fragment: &'fragment str,
659    ) -> FindTextIter<'store, 'fragment> {
660        FindTextIter {
661            store: self.rootstore(),
662            resources: smallvec!(self.resource().handle()),
663            resourcecursor: 0,
664            fragment,
665            offset: Offset::from(self),
666        }
667    }
668
669    /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
670    /// The iterator returns [`TextSelection`] items.
671    ///
672    /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
673    /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
674    ///
675    /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
676    /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
677    fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store> {
678        FindNoCaseTextIter {
679            store: self.rootstore(),
680            resources: smallvec!(self.resource().handle()),
681            resourcecursor: 0,
682            fragment: fragment.to_lowercase(),
683            offset: Offset::from(self),
684        }
685    }
686
687    fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
688        SplitTextIter {
689            resource: self.resource(),
690            iter: self.store().text().split(delimiter),
691            byteoffset: self
692                .subslice_utf8_offset(self.text())
693                .expect("subslice must succeed for split_text"),
694        }
695    }
696
697    /// Returns a [`TextSelection`] that corresponds to the offset **WITHIN** the textselection.
698    /// This returns a [`TextSelection`] with absolute coordinates in the resource.
699    ///
700    /// If the textselection is known (i.e. it has associated annotations), it will be returned as such with a handle (borrowed).
701    /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
702    ///
703    /// The [`TextSelection`] is returned as in a far pointer ([`ResultTextSelection`]) that also contains reference to the underlying store (the [`TextResource`]).
704    fn textselection(
705        &'slf self,
706        offset: &Offset,
707    ) -> Result<ResultTextSelection<'store>, StamError> {
708        let offset = self.absolute_offset(&offset)?; //turns the relative offset into an absolute one (i.e. offsets in TextResource)
709        self.resource().textselection(&offset)
710    }
711}
712
713impl AnnotationStore {
714    /// Searches for text in all resources using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
715    /// See [`FindText::find_text_regex()`].
716    /// Note that this method, unlike its counterpart [`FindText::find_text_regex()`], silently ignores any deeper errors that might occur.
717    pub fn find_text_regex<'store, 'r>(
718        &'store self,
719        expressions: &'r [Regex],
720        precompiledset: &'r Option<RegexSet>,
721        allow_overlap: bool,
722    ) -> impl Iterator<Item = FindRegexMatch<'store, 'r>> {
723        self.resources()
724            .filter_map(move |resource: ResultItem<'store, TextResource>| {
725                //      ^-- the move is only needed to move the bool in, otherwise we had to make it &'r bool and that'd be weird
726                resource
727                    .find_text_regex(expressions, precompiledset.as_ref(), allow_overlap)
728                    .ok() //ignore errors!
729            })
730            .flatten()
731    }
732
733    pub fn find_text<'store, 'fragment>(
734        &'store self,
735        fragment: &'fragment str,
736    ) -> FindTextIter<'store, 'fragment> {
737        FindTextIter {
738            store: self,
739            resources: self
740                .resources
741                .iter()
742                .filter_map(|x| x.as_ref().map(|res| res.handle().unwrap()))
743                .collect(),
744            resourcecursor: 0,
745            fragment,
746            offset: Offset::whole(),
747        }
748    }
749
750    pub fn find_text_nocase<'store>(&'store self, fragment: &str) -> FindNoCaseTextIter<'store> {
751        FindNoCaseTextIter {
752            store: self,
753            resources: self
754                .resources
755                .iter()
756                .filter_map(|x| x.as_ref().map(|res| res.handle().unwrap()))
757                .collect(),
758            resourcecursor: 0,
759            fragment: fragment.to_lowercase(),
760            offset: Offset::whole(),
761        }
762    }
763}
764
765/// Auxiliary function used by find_text_regex(). This method does, if needed, a single initial pass
766/// over the regular expression set, identifying which regular expressions match and are to be searched
767/// for in subsequent passes to find WHERE they match.
768pub(crate) fn find_text_regex_select_expressions<'a, 'b>(
769    text: &'a str,
770    expressions: &'b [Regex],
771    precompiledset: Option<&RegexSet>,
772) -> Result<Vec<usize>, StamError> {
773    Ok(if expressions.len() > 2 {
774        //we have multiple expressions, first we do a pass to see WHICH of the regular expression matche (taking them all into account in a single pass!).
775        //then afterwards we find for each of the matching expressions WHERE they are found
776        let foundexpressions: Vec<_> = if let Some(regexset) = precompiledset {
777            regexset.matches(text).into_iter().collect()
778        } else {
779            RegexSet::new(expressions.iter().map(|x| x.as_str()))
780                .map_err(|e| {
781                    StamError::RegexError(e, "Parsing regular expressions in search_text()")
782                })?
783                .matches(text)
784                .into_iter()
785                .collect()
786        };
787        foundexpressions
788    } else {
789        match expressions.len() {
790            1 => vec![0],
791            2 => vec![0, 1],
792            _ => unreachable!("Expected 1 or 2 expressions"),
793        }
794    })
795}
796
/// Wrapper over the two kinds of match iterators the regex crate offers (`regex::Matches` and
/// `regex::CaptureMatches`), so both can be stored and iterated through a single type.
/// Iterating yields [`Match`] items of the corresponding variant.
pub(crate) enum Matches<'r, 'store> {
    /// Iterator over plain matches (no capture group information)
    NoCapture(regex::Matches<'r, 'store>),
    /// Iterator over matches including their capture groups
    WithCapture(regex::CaptureMatches<'r, 'store>),
}
802
/// Wrapper over a single regex `Match` or `Captures` (as returned by the [`Matches`] iterator)
pub(crate) enum Match<'store> {
    /// A plain match (no capture group information)
    NoCapture(regex::Match<'store>),
    /// A match including its capture groups
    WithCapture(regex::Captures<'store>),
}
808
809impl<'t> Match<'t> {
810    /// Return the begin offset of the match (in utf-8 bytes)
811    fn begin(&self) -> usize {
812        match self {
813            Self::NoCapture(m) => m.start(),
814            Self::WithCapture(m) => {
815                let mut begin = None;
816                for group in m.iter() {
817                    if let Some(group) = group {
818                        if begin.is_none() || begin.unwrap() < group.start() {
819                            begin = Some(group.start());
820                        }
821                    }
822                }
823                begin.expect("there must be at least one capture group that was found")
824            }
825        }
826    }
827
828    /// Return the end offset of the match (in utf-8 bytes)
829    fn end(&self) -> usize {
830        match self {
831            Self::NoCapture(m) => m.end(),
832            Self::WithCapture(m) => {
833                let mut end = None;
834                for group in m.iter() {
835                    if let Some(group) = group {
836                        if end.is_none() || end.unwrap() < group.start() {
837                            end = Some(group.start());
838                        }
839                    }
840                }
841                end.expect("there must be at least one capture group that was found")
842            }
843        }
844    }
845}
846
847impl<'r, 't> Iterator for Matches<'r, 't> {
848    type Item = Match<'t>;
849
850    fn next(&mut self) -> Option<Self::Item> {
851        match self {
852            Self::NoCapture(iter) => {
853                if let Some(m) = iter.next() {
854                    Some(Match::NoCapture(m))
855                } else {
856                    None
857                }
858            }
859            Self::WithCapture(iter) => {
860                if let Some(m) = iter.next() {
861                    Some(Match::WithCapture(m))
862                } else {
863                    None
864                }
865            }
866        }
867    }
868}
869
/// This match structure is returned by the [`FindRegexIter`] iterator, which is in turn produced
/// by [`FindText::find_text_regex()`] and searches a text based on regular expressions. This
/// structure represents a single regular-expression match of the iterator on the text.
pub struct FindRegexMatch<'store, 'r> {
    /// The regular expression that matched
    expression: &'r Regex,
    /// The index of the regular expression that matched, in the slice of expressions that was searched with
    expression_index: usize,
    /// The text selection(s) of this match: one for the whole match if the expression has no capture groups,
    /// otherwise one for each capture group that took part in the match
    textselections: SmallVec<[ResultTextSelection<'store>; 2]>,
    /// Records the numbers of the capture groups that matched (1-indexed), in the same order as `textselections`.
    /// Left empty if the expression has no capture groups.
    capturegroups: SmallVec<[usize; 2]>,
    /// The resource in which the match was found
    resource: ResultItem<'store, TextResource>,
}
881
882impl<'store, 'r> FindRegexMatch<'store, 'r> {
883    /// Does this match return multiple text selections?
884    /// Multiple text selections are returned only when the expression contains multiple capture groups.
885    pub fn multi(&self) -> bool {
886        self.textselections.len() > 1
887    }
888
889    /// Returns the regular expression that matched
890    pub fn expression(&self) -> &'r Regex {
891        self.expression
892    }
893
894    /// Returns the index of regular expression that matched
895    pub fn expression_index(&self) -> usize {
896        self.expression_index
897    }
898
899    pub fn textselections(&self) -> &[ResultTextSelection<'store>] {
900        &self.textselections
901    }
902
903    pub fn resource(&self) -> &ResultItem<'store, TextResource> {
904        &self.resource
905    }
906
907    /// Records the number of the capture groups (1-indexed!) that match.
908    /// This array has the same length as textselections and identifies precisely
909    /// which textselection corresponds with which capture group.
910    pub fn capturegroups(&self) -> &[usize] {
911        &self.capturegroups
912    }
913
914    /// Return the text of the match, this only works
915    /// if the regular expression targets a single
916    /// consecutive text, i.e. by not using multiple capture groups.
917    pub fn as_str(&self) -> Option<&'store str> {
918        if self.multi() {
919            None
920        } else {
921            self.textselections
922                .first()
923                .map(|textselection| textselection.text())
924        }
925    }
926
927    /// This returns a vector of texts and is mainly useful in case multiple
928    /// patterns were captured.
929    /// Use [`Self::as_str()`] instead if you expect only a single text item.
930    pub fn text(&self) -> Vec<&str> {
931        self.textselections
932            .iter()
933            .map(|textselection| textselection.text())
934            .collect()
935    }
936}
937
/// This iterator is produced by [`FindText::find_text_regex()`] and searches a text based on regular expressions.
/// It yields [`FindRegexMatch`] items, ordered by the begin position of the match.
pub struct FindRegexIter<'store, 'regex> {
    /// The resource that holds the text being searched
    pub(crate) resource: ResultItem<'store, TextResource>,
    pub(crate) expressions: &'regex [Regex], // allows keeping all of the regular expressions external and borrowing them, even if only a subset is searched for
    pub(crate) selectexpressions: Vec<usize>, // indices into `expressions`: the expressions that are actually searched for
    pub(crate) matchiters: Vec<Matches<'regex, 'store>>, // each expression (from selectexpressions) has its own iterator (same length as above vec), instantiated on the first call to next()
    pub(crate) nextmatches: Vec<Option<Match<'store>>>, // this buffers the next match for each expression (from selectexpressions, same length as above vec)
    /// The text that is searched
    pub(crate) text: &'store str,
    /// Character position that is added to the converted character positions of each match
    pub(crate) begincharpos: usize,
    /// Byte position that is added to the byte offsets of each match (which are relative to `text`)
    pub(crate) beginbytepos: usize,
    /// Are matches of different expressions allowed to overlap?
    pub(crate) allow_overlap: bool,
}
950
951impl<'store, 'regex> Iterator for FindRegexIter<'store, 'regex> {
952    type Item = FindRegexMatch<'store, 'regex>;
953    fn next(&mut self) -> Option<Self::Item> {
954        if self.matchiters.is_empty() {
955            //instantiate the iterators for the expressions and retrieve the first item for each
956            //this is only called once when the iterator first starts
957            for i in self.selectexpressions.iter() {
958                let re = &self.expressions[*i];
959                let mut iter = if re.captures_len() > 1 {
960                    Matches::WithCapture(re.captures_iter(self.text))
961                } else {
962                    Matches::NoCapture(re.find_iter(self.text))
963                };
964                self.nextmatches.push(iter.next());
965                self.matchiters.push(iter);
966            }
967        }
968
969        //find the best next match (the single one next in line amongst all the iterators)
970        let mut bestnextmatch: Option<&Match<'store>> = None;
971        let mut bestmatchindex = None;
972        for (i, m) in self.nextmatches.iter().enumerate() {
973            if let Some(m) = m {
974                if bestnextmatch.is_none() || m.begin() < bestnextmatch.unwrap().begin() {
975                    bestnextmatch = Some(m);
976                    bestmatchindex = Some(i);
977                }
978            }
979        }
980
981        if let Some(i) = bestmatchindex {
982            // this match will be the result, convert it to the proper structure
983            let m = self.nextmatches[i].take().unwrap();
984
985            // iterate any buffers than overlap with this result, discarding those matces in the process
986            if !self.allow_overlap {
987                for (j, m2) in self.nextmatches.iter_mut().enumerate() {
988                    if j != i && m2.is_some() {
989                        if m2.as_ref().unwrap().begin() >= m.begin()
990                            && m2.as_ref().unwrap().begin() < m.end()
991                        {
992                            //(note: no need to check whether m2.end in range m.begin-m.end)
993                            *m2 = self.matchiters[j].next();
994                        }
995                    }
996                }
997            }
998
999            let result = self.match_to_result(m, i);
1000
1001            // iterate the iterator for this one and buffer the next match for next round
1002            self.nextmatches[i] = self.matchiters[i].next();
1003
1004            Some(result)
1005        } else {
1006            //nothing found, we are all done
1007            None
1008        }
1009    }
1010}
1011
1012impl<'store, 'regex> FindRegexIter<'store, 'regex> {
1013    /// Build the final match structure we return
1014    fn match_to_result(
1015        &self,
1016        m: Match<'store>,
1017        selectexpression_index: usize,
1018    ) -> FindRegexMatch<'store, 'regex> {
1019        let expression_index = self.selectexpressions[selectexpression_index];
1020        match m {
1021            Match::NoCapture(m) => {
1022                let textselection = self
1023                    .resource
1024                    .textselection(&Offset::simple(
1025                        self.begincharpos
1026                            + self
1027                                .resource
1028                                .utf8byte_to_charpos(self.beginbytepos + m.start())
1029                                .expect("byte to pos conversion must succeed"),
1030                        self.begincharpos
1031                            + self
1032                                .resource
1033                                .utf8byte_to_charpos(self.beginbytepos + m.end())
1034                                .expect("byte to pos conversion must succeed"),
1035                    ))
1036                    .expect("textselection from offset must succeed");
1037                FindRegexMatch {
1038                    expression: &self.expressions[expression_index],
1039                    expression_index,
1040                    resource: self.resource.clone(),
1041                    textselections: smallvec!(textselection),
1042                    capturegroups: smallvec!(),
1043                }
1044            }
1045            Match::WithCapture(m) => {
1046                let mut groupiter = m.iter();
1047                groupiter.next(); //The first match always corresponds to the overall match of the regex, we can ignore it
1048                let mut textselections: SmallVec<_> = SmallVec::new();
1049                let mut capturegroups: SmallVec<[usize; 2]> = SmallVec::new();
1050                for (i, group) in groupiter.enumerate() {
1051                    if let Some(group) = group {
1052                        capturegroups.push(i + 1); //1-indexed
1053                        textselections.push(
1054                            self.resource
1055                                .textselection(&Offset::simple(
1056                                    self.begincharpos
1057                                        + self
1058                                            .resource
1059                                            .utf8byte_to_charpos(self.beginbytepos + group.start())
1060                                            .expect("byte to pos conversion must succeed"),
1061                                    self.begincharpos
1062                                        + self
1063                                            .resource
1064                                            .utf8byte_to_charpos(self.beginbytepos + group.end())
1065                                            .expect("byte to pos conversion must succeed"),
1066                                ))
1067                                .expect("textselection from offset must succeed"),
1068                        )
1069                    }
1070                }
1071                FindRegexMatch {
1072                    expression: &self.expressions[expression_index],
1073                    expression_index,
1074                    resource: self.resource.clone(),
1075                    textselections,
1076                    capturegroups,
1077                }
1078            }
1079        }
1080    }
1081}
1082
/// This iterator is produced by [`FindText::find_text()`] and searches a text for a single fragment. The search is case sensitive. See [`FindNoCaseTextIter`] for a case-insensitive variant.
/// The iterator yields [`ResultTextSelection`] items (which encapsulates [`TextSelection`]).
pub struct FindTextIter<'a, 'b> {
    /// The store that holds the resources to search
    pub(crate) store: &'a AnnotationStore,
    /// Handles of the resources to search, they are searched one after another
    pub(crate) resources: SmallVec<[TextResourceHandle; 1]>,
    /// The text fragment to search for
    pub(crate) fragment: &'b str,
    /// Index (in `resources`) of the resource that is currently being searched
    pub(crate) resourcecursor: usize,
    /// The part of the current resource that still has to be searched; its begin moves to the
    /// end of each match found, and it is reset to the whole text when moving to the next resource
    pub(crate) offset: Offset,
}
1092
1093impl<'a, 'b> Iterator for FindTextIter<'a, 'b> {
1094    type Item = ResultTextSelection<'a>;
1095    fn next(&mut self) -> Option<Self::Item> {
1096        loop {
1097            if let Some(resourcehandle) = self.resources.get(self.resourcecursor).copied() {
1098                let resource = self
1099                    .store
1100                    .resource(resourcehandle)
1101                    .expect("resource must exist");
1102                if let Some(text) = resource.text_by_offset(&self.offset).ok() {
1103                    let beginbytepos = resource
1104                        .subslice_utf8_offset(text)
1105                        .expect("bytepos must be valid");
1106                    if let Some(foundbytepos) = text.find(self.fragment) {
1107                        let endbytepos = foundbytepos + self.fragment.len();
1108                        let newbegin = resource
1109                            .utf8byte_to_charpos(beginbytepos + foundbytepos)
1110                            .expect("utf-8 byte must resolve to valid charpos");
1111                        let newend = resource
1112                            .utf8byte_to_charpos(beginbytepos + endbytepos)
1113                            .expect("utf-8 byte must resolve to valid charpos");
1114                        //set offset for next run
1115                        self.offset = Offset {
1116                            begin: Cursor::BeginAligned(newend),
1117                            end: self.offset.end,
1118                        };
1119                        match resource.textselection(&Offset::simple(newbegin, newend)) {
1120                            Ok(textselection) => return Some(textselection),
1121                            Err(e) => {
1122                                eprintln!("WARNING: FindTextIter ended prematurely: {}", e);
1123                                return None;
1124                            }
1125                        }
1126                    } else {
1127                        self.resourcecursor += 1;
1128                        self.offset = Offset::whole();
1129                    }
1130                } else {
1131                    self.resourcecursor += 1;
1132                    self.offset = Offset::whole();
1133                }
1134            } else {
1135                return None;
1136            }
1137        }
1138    }
1139}
/// This iterator is produced by [`FindText::find_text_nocase()`] and searches a text for a single fragment, without regard for casing.
/// It has more overhead than the exact (case sensitive) variant [`FindTextIter`].
/// The iterator yields [`ResultTextSelection`] items.
pub struct FindNoCaseTextIter<'a> {
    /// The store that holds the resources to search
    pub(crate) store: &'a AnnotationStore,
    /// Handles of the resources to search, they are searched one after another
    pub(crate) resources: SmallVec<[TextResourceHandle; 1]>,

    /// The text fragment to search for. Fragment must be lowercase
    pub(crate) fragment: String,
    /// Index (in `resources`) of the resource that is currently being searched
    pub(crate) resourcecursor: usize,
    /// The part of the current resource that still has to be searched; its begin moves to the
    /// end of each match found, and it is reset to the whole text when moving to the next resource
    pub(crate) offset: Offset,
}
1151
1152impl<'a> Iterator for FindNoCaseTextIter<'a> {
1153    type Item = ResultTextSelection<'a>;
1154    fn next(&mut self) -> Option<Self::Item> {
1155        loop {
1156            if let Some(resourcehandle) = self.resources.get(self.resourcecursor).copied() {
1157                let resource = self
1158                    .store
1159                    .resource(resourcehandle)
1160                    .expect("resource must exist");
1161                if let Some(text) = resource.text_by_offset(&self.offset).ok() {
1162                    let beginbytepos = resource
1163                        .subslice_utf8_offset(text)
1164                        .expect("bytepos must be valid");
1165                    let text = text.to_lowercase();
1166                    if let Some(foundbytepos) = text.find(self.fragment.as_str()) {
1167                        let endbytepos = foundbytepos + self.fragment.len(); //MAYBE TODO: possible issue if uppercase and lowercase variants have different byte length!
1168                        let newbegin = resource
1169                            .utf8byte_to_charpos(beginbytepos + foundbytepos)
1170                            .expect("utf-8 byte must resolve to valid charpos");
1171                        let newend = resource
1172                            .utf8byte_to_charpos(beginbytepos + endbytepos)
1173                            .expect("utf-8 byte must resolve to valid charpos");
1174                        //set offset for next run
1175                        self.offset = Offset {
1176                            begin: Cursor::BeginAligned(newend),
1177                            end: self.offset.end,
1178                        };
1179                        match resource.textselection(&Offset::simple(newbegin, newend)) {
1180                            Ok(textselection) => return Some(textselection),
1181                            Err(e) => {
1182                                eprintln!("WARNING: FindTextIter ended prematurely: {}", e);
1183                                return None;
1184                            }
1185                        }
1186                    } else {
1187                        self.resourcecursor += 1;
1188                        self.offset = Offset::whole();
1189                    }
1190                } else {
1191                    self.resourcecursor += 1;
1192                    self.offset = Offset::whole();
1193                }
1194            } else {
1195                return None;
1196            }
1197        }
1198    }
1199}
1200
/// This iterator is produced by [`FindText::split_text()`] and splits a text based on a delimiter.
/// The iterator yields [`ResultTextSelection`] (which encapsulates [`TextSelection`]).
pub struct SplitTextIter<'store, 'b> {
    /// The resource on which the resulting text selections are obtained
    pub(crate) resource: ResultItem<'store, TextResource>,
    /// The underlying iterator over the parts of the text, split on the delimiter
    pub(crate) iter: std::str::Split<'store, &'b str>,
    /// Byte offset that is subtracted from the byte position of each part
    pub(crate) byteoffset: usize,
}
1208
1209impl<'store, 'b> Iterator for SplitTextIter<'store, 'b> {
1210    type Item = ResultTextSelection<'store>;
1211    fn next(&mut self) -> Option<Self::Item> {
1212        if let Some(matchstr) = self.iter.next() {
1213            let beginbyte = self
1214                .resource
1215                .subslice_utf8_offset(matchstr)
1216                .expect("match must be found")
1217                - self.byteoffset;
1218            let endbyte = (beginbyte + matchstr.len()) - self.byteoffset;
1219            Some(
1220                self.resource
1221                    .textselection(&Offset::simple(
1222                        self.resource
1223                            .utf8byte_to_charpos(beginbyte)
1224                            .expect("utf-8 byte must resolve to char pos"),
1225                        self.resource
1226                            .utf8byte_to_charpos(endbyte)
1227                            .expect("utf-8 byte must resolve to char pos"),
1228                    ))
1229                    .expect("text selection must succeed"),
1230            )
1231        } else {
1232            None
1233        }
1234    }
1235}
stam/api/text.rs

stam/api/
text.rs