stam/api/text.rs
1/*
2 STAM Library (Stand-off Text Annotation Model)
3 by Maarten van Gompel <proycon@anaproy.nl>
4 Digital Infrastucture, KNAW Humanities Cluster
5
6 Licensed under the GNU General Public License v3
7
8 https://github.com/annotation/stam-rust
9*/
10
11//! This module contains the high-level API for finding text. The `FindText` trait embodies this API and is implemented on
12//! [`ResultItem<TextResource>`] and [`ResultTextSelection`]. It builds upon the lower-level [`Text`] trait.
13
14use crate::annotationstore::AnnotationStore;
15use crate::config::Configurable;
16use crate::error::StamError;
17use crate::resources::{TextResource, TextResourceHandle};
18use crate::selector::Offset;
19use crate::store::*;
20use crate::text::Text;
21use crate::textselection::ResultTextSelection;
22use crate::textselection::TextSelection;
23use crate::types::*;
24use regex::{Regex, RegexSet};
25use smallvec::{smallvec, SmallVec};
26
27/// This trait provides text-searching methods that operate on structures that hold or represent text content. It builds upon the lower-level [`Text`] trait.
28pub trait FindText<'store, 'slf>: Text<'store, 'slf>
29where
30 'store: 'slf,
31{
32 /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
33 /// is held by the [`FindRegexMatch`] struct.
34 ///
35 /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
36 /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
37 /// the entire expression is returned.
38 ///
39 /// The `allow_overlap` parameter determines if the matching expressions are allowed to
40 /// overlap. It you are doing some form of tokenisation, you also likely want this set to
41 /// false. All of this only matters if you supply multiple regular expressions.
42 ///
43 /// Results are returned in the exact order they are found in the text
44 fn find_text_regex<'regex>(
45 &'slf self,
46 expressions: &'regex [Regex],
47 precompiledset: Option<&RegexSet>,
48 allow_overlap: bool,
49 ) -> Result<FindRegexIter<'store, 'regex>, StamError>;
50
51 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
52 /// The iterator returns encapsulated [`TextSelection`] items as [`ResultTextSelection`].
53 ///
54 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
55 ///
56 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
57 /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
58 fn find_text<'fragment>(
59 &'slf self,
60 fragment: &'fragment str,
61 ) -> FindTextIter<'store, 'fragment>;
62
63 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
64 /// The iterator returns [`TextSelection`] items wrapped as [`ResultTextSelection`].
65 ///
66 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
67 ///
68 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
69 /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
70 fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store>;
71
72 /// Searches for the multiple text fragment in sequence. Returns a vector with
73 /// [`TextSelection`] instances wrapped as [`ResultTextSelection`].
74 ///
75 /// Matches must appear in the exact order specified, but *may* have other intermittent text,
76 /// determined by the `allow_skip_char` closure. A recommended closure for natural language
77 /// text is: `|c| !c.is_alphabetic()`
78 ///
79 /// The `case_sensitive` parameter determines if the search is case sensitive or not, case insensitive searches have a performance penalty.
80 fn find_text_sequence<'fragment, F>(
81 &'slf self,
82 fragments: &'fragment [&'fragment str],
83 allow_skip_char: F,
84 case_sensitive: bool,
85 ) -> Option<Vec<ResultTextSelection<'store>>>
86 where
87 F: Fn(char) -> bool,
88 {
89 let mut results: Vec<ResultTextSelection<'store>> = Vec::with_capacity(fragments.len());
90 let mut begin: usize = 0;
91 let mut textselectionresult = self.textselection(&Offset::whole());
92 for fragment in fragments {
93 if let Ok(searchtext) = textselectionresult {
94 if let Some(m) = if case_sensitive {
95 searchtext.find_text(fragment).next()
96 } else {
97 searchtext.find_text_nocase(fragment).next()
98 } {
99 if m.begin() > begin {
100 //we skipped some text since last match, check the characters in between matches
101 let skipped_text = self
102 .textselection(&Offset::simple(begin, m.begin()))
103 .expect("textselection must succeed")
104 .text();
105 for c in skipped_text.chars() {
106 if !allow_skip_char(c) {
107 return None;
108 }
109 }
110 }
111 begin = m.end();
112 results.push(m);
113 } else {
114 return None;
115 }
116 //slice (shorten) new text for next test
117 textselectionresult = searchtext.textselection(&Offset::new(
118 Cursor::BeginAligned(begin - searchtext.begin()), //offset must be relative
119 Cursor::EndAligned(0),
120 ));
121 } else {
122 return None;
123 }
124 }
125
126 Some(results)
127 }
128
129 /// Returns an iterator of [`TextSelection`] instances that represent partitions
130 /// of the text given the specified delimiter. No text is modified.
131 ///
132 /// The iterator returns [`TextSelection`] items as a fat pointer [`ResultTextSelection`]).
133 fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b>;
134
135 /// Trims all occurrences of any character in `chars` from both the beginning and end of the text,
136 /// returning a smaller [`TextSelection`] (as a fat pointer [`ResultTextSelection`]). No text is modified.
137 fn trim_text(&'slf self, chars: &[char]) -> Result<ResultTextSelection<'store>, StamError> {
138 let mut trimbegin = 0;
139 let mut trimend = 0;
140 for c in self.text().chars() {
141 if chars.contains(&c) {
142 trimbegin += 1;
143 } else {
144 break;
145 }
146 }
147 for c in self.text().chars().rev() {
148 if chars.contains(&c) {
149 trimend -= 1;
150 } else {
151 break;
152 }
153 }
154 self.textselection(&Offset::new(
155 Cursor::BeginAligned(trimbegin),
156 Cursor::EndAligned(trimend),
157 ))
158 }
159
160 /// Trims all occurrences of any character `chars` that pass the supplied function, from both the beginning and end of the text,
161 /// returning a smaller [`TextSelection`] (as a fat pointer [`ResultTextSelection`]). No text is modified.
162 fn trim_text_with<F>(&'slf self, f: F) -> Result<ResultTextSelection<'store>, StamError>
163 where
164 F: Fn(char) -> bool,
165 {
166 let mut trimbegin = 0;
167 let mut trimend = 0;
168 for c in self.text().chars() {
169 if f(c) {
170 trimbegin += 1;
171 } else {
172 break;
173 }
174 }
175 for c in self.text().chars().rev() {
176 if f(c) {
177 trimend -= 1;
178 } else {
179 break;
180 }
181 }
182 self.textselection(&Offset::new(
183 Cursor::BeginAligned(trimbegin),
184 Cursor::EndAligned(trimend),
185 ))
186 }
187
188 /// Returns a [`TextSelection`] that corresponds to the offset. If the TextSelection
189 /// exists, the existing one will be returned (as a copy, but it will have a [`TextSelection::handle()`].
190 /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
191
192 /// The [`TextSelection`] is returned as in a fat pointer ([`ResultTextSelection`]) that also contains reference to the underlying store.
193 ///
194 /// Use [`TextResource::known_textselection()`] instead if you want to limit to existing text selections on resources.
195 fn textselection(&'slf self, offset: &Offset)
196 -> Result<ResultTextSelection<'store>, StamError>;
197}
198
199/// this implementation mostly defers directly to the wrapped item, documentation is found on the trait and not repeated here
200impl<'store, 'slf> Text<'store, 'slf> for ResultItem<'store, TextResource>
201where
202 'store: 'slf,
203{
204 fn textlen(&self) -> usize {
205 self.as_ref().textlen()
206 }
207
208 fn text(&'slf self) -> &'store str {
209 self.as_ref().text()
210 }
211
212 fn text_by_offset(&self, offset: &Offset) -> Result<&'store str, StamError> {
213 self.as_ref().text_by_offset(offset)
214 }
215
216 fn absolute_cursor(&self, cursor: usize) -> usize {
217 cursor
218 }
219
220 fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
221 self.as_ref().utf8byte(abscursor)
222 }
223
224 fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
225 self.as_ref().utf8byte_to_charpos(bytecursor)
226 }
227}
228
229impl<'store, 'slf> FindText<'store, 'slf> for ResultItem<'store, TextResource>
230where
231 'store: 'slf,
232{
233 /// Returns a [`TextSelection`] that corresponds to the offset. If the TextSelection
234 /// exists, the existing one will be returned.
235 /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
236 ///
237 /// The [`TextSelection`] is returned in a fat pointer ([`ResultTextSelection`]) that also contains reference to the underlying store.
238 fn textselection(&self, offset: &Offset) -> Result<ResultTextSelection<'store>, StamError> {
239 match self.as_ref().known_textselection(offset) {
240 Ok(Some(handle)) => {
241 //existing textselection
242 let textselection: &TextSelection = self.as_ref().get(handle)?; //shouldn't fail here anymore
243 let wrapped = textselection.as_resultitem(self.as_ref(), self.rootstore());
244 Ok(ResultTextSelection::Bound(wrapped))
245 }
246 Ok(None) => {
247 let textselection: TextSelection =
248 self.as_ref().textselection_by_offset_unchecked(offset)?;
249 Ok(ResultTextSelection::Unbound(
250 self.rootstore(),
251 self.as_ref(),
252 textselection,
253 ))
254 }
255 Err(err) => Err(err), //an error occured, propagate
256 }
257 }
258
259 /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
260 /// is held by the [`FindRegexMatch`] struct.
261 ///
262 /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
263 /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
264 /// the entire expression is returned.
265 ///
266 /// The `allow_overlap` parameter determines if the matching expressions are allowed to
267 /// overlap. It you are doing some form of tokenisation, you also likely want this set to
268 /// false. All of this only matters if you supply multiple regular expressions.
269 ///
270 /// Results are returned in the exact order they are found in the text
271 fn find_text_regex<'regex>(
272 &self,
273 expressions: &'regex [Regex],
274 precompiledset: Option<&RegexSet>,
275 allow_overlap: bool,
276 ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
277 debug(self.as_ref().config(), || {
278 format!("find_text_regex: expressions={:?}", expressions)
279 });
280 let selectexpressions =
281 find_text_regex_select_expressions(self.text(), expressions, precompiledset)?;
282 //Returns an iterator that does the remainder of the actual searching
283 Ok(FindRegexIter {
284 resource: self.clone(),
285 expressions,
286 selectexpressions,
287 matchiters: Vec::new(),
288 nextmatches: Vec::new(),
289 text: self.text(),
290 begincharpos: 0,
291 beginbytepos: 0,
292 allow_overlap,
293 })
294 }
295
296 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
297 /// The iterator returns [`TextSelection`] items.
298 ///
299 /// This search is case sensitive, use [`FindText::find_text_nocase()`] to search case insensitive.
300 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
301 ///
302 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
303 /// [`FindText::textselection()`] and then run [`FindText::find_text()`] on that instead.
304 fn find_text<'fragment>(&self, fragment: &'fragment str) -> FindTextIter<'store, 'fragment> {
305 FindTextIter {
306 store: self.rootstore(),
307 resources: smallvec!(self.handle()),
308 resourcecursor: 0,
309 fragment,
310 offset: Offset::whole(),
311 }
312 }
313
314 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
315 /// The iterator returns [`TextSelection`] items.
316 ///
317 /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
318 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
319 ///
320 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
321 /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
322 fn find_text_nocase(&self, fragment: &str) -> FindNoCaseTextIter<'store> {
323 FindNoCaseTextIter {
324 store: self.rootstore(),
325 resources: smallvec!(self.handle()),
326 resourcecursor: 0,
327 fragment: fragment.to_lowercase(),
328 offset: Offset::whole(),
329 }
330 }
331
332 /// Splits the text of this resource given a delimiter, the resulting iterator yields [`TextSelection`] items (as [`ResultTextSelection`]).
333 fn split_text<'b>(&self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
334 SplitTextIter {
335 resource: self.clone(),
336 iter: self.text().split(delimiter),
337 byteoffset: 0,
338 }
339 }
340}
341
342impl<'store, 'slf> Text<'store, 'slf> for ResultItem<'store, TextSelection>
343where
344 'store: 'slf,
345{
346 fn text(&'slf self) -> &'store str {
347 let resource = self.store(); //courtesy of ResultItem
348 let beginbyte = resource
349 .utf8byte(self.begin())
350 .expect("utf8byte conversion should succeed");
351 let endbyte = resource
352 .utf8byte(self.end())
353 .expect("utf8byte conversion should succeed");
354 &resource.text()[beginbyte..endbyte]
355 }
356
357 fn textlen(&self) -> usize {
358 self.end() - self.begin()
359 }
360
361 /// Returns a string reference to a slice of text as specified by the offset
362 fn text_by_offset(&self, offset: &Offset) -> Result<&'store str, StamError> {
363 let beginbyte =
364 self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.begin)?))?;
365 let endbyte =
366 self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.end)?))?;
367 if endbyte < beginbyte {
368 Err(StamError::InvalidOffset(
369 Cursor::BeginAligned(beginbyte),
370 Cursor::BeginAligned(endbyte),
371 "End must be greater than or equal to begin. (Cursor should be interpreted as UTF-8 bytes in this error context only)",
372 ))
373 } else {
374 Ok(&self.text()[beginbyte..endbyte])
375 }
376 }
377
378 /// This converts a unicode point to utf-8 byte, all in *relative* offsets to this textselection
379 fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
380 //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
381 //and can just call this same method on [`TextResource`](crate::TextResource), which has the proper indices for this
382 let beginbyte = self
383 .store()
384 .subslice_utf8_offset(self.text())
385 .expect("subslice should succeed");
386 Ok(self.store().utf8byte(self.absolute_cursor(abscursor))? - beginbyte)
387 }
388
389 /// This converts utf-8 byte to charpos, all in *relative* offsets to this textselection
390 fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
391 //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
392 //and can just call this same method on [`TextResource`](crate::TextResource), which has the proper indices for this
393 let beginbyte = self
394 .store()
395 .subslice_utf8_offset(self.text())
396 .expect("subslice should succeed");
397 Ok(self
398 .store()
399 .utf8byte_to_charpos(self.absolute_cursor(beginbyte + bytecursor))?
400 - self.begin())
401 }
402
403 fn absolute_cursor(&self, cursor: usize) -> usize {
404 self.begin() + cursor
405 }
406}
407
408impl<'store, 'slf> FindText<'store, 'slf> for ResultItem<'store, TextSelection>
409where
410 'store: 'slf,
411{
412 /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
413 /// is held by the [`FindRegexMatch`] struct.
414 ///
415 /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
416 /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
417 /// the entire expression is returned.
418 ///
419 /// An `offset` can be specified to work on a sub-part rather than the entire text (like an existing TextSelection).
420 ///
421 /// The `allow_overlap` parameter determines if the matching expressions are allowed to
422 /// overlap. It you are doing some form of tokenisation, you also likely want this set to
423 /// false. All of this only matters if you supply multiple regular expressions.
424 ///
425 /// Results are returned in the exact order they are found in the text
426 fn find_text_regex<'regex>(
427 &'slf self,
428 expressions: &'regex [Regex],
429 precompiledset: Option<&RegexSet>,
430 allow_overlap: bool,
431 ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
432 debug(self.store().config(), || {
433 format!(
434 "TextSelection::find_text_regex: expressions={:?}",
435 expressions
436 )
437 });
438 let text = self.text();
439 let selectexpressions =
440 find_text_regex_select_expressions(text, expressions, precompiledset)?;
441 //Returns an iterator that does the remainder of the actual searching
442 Ok(FindRegexIter {
443 resource: self.resource(),
444 expressions,
445 selectexpressions,
446 matchiters: Vec::new(),
447 nextmatches: Vec::new(),
448 text: self.text(),
449 begincharpos: self.begin(),
450 beginbytepos: self
451 .store()
452 .subslice_utf8_offset(text)
453 .expect("Subslice must be found"),
454 allow_overlap,
455 })
456 }
457
458 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
459 /// The iterator returns [`TextSelection`] items.
460 ///
461 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
462 ///
463 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first and then run [`FindText::find_text()`] on that instead.
464 fn find_text<'fragment>(
465 &'slf self,
466 fragment: &'fragment str,
467 ) -> FindTextIter<'store, 'fragment> {
468 FindTextIter {
469 store: self.rootstore(),
470 resources: smallvec!(self.resource().handle()),
471 resourcecursor: 0,
472 fragment,
473 offset: Offset::from(self),
474 }
475 }
476
477 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
478 /// The iterator returns [`TextSelection`] items.
479 ///
480 /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
481 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
482 ///
483 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
484 /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
485 fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store> {
486 FindNoCaseTextIter {
487 store: self.rootstore(),
488 resources: smallvec!(self.resource().handle()),
489 resourcecursor: 0,
490 fragment: fragment.to_lowercase(),
491 offset: Offset::from(self),
492 }
493 }
494
495 fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
496 SplitTextIter {
497 resource: self.resource(),
498 iter: self.store().text().split(delimiter),
499 byteoffset: self
500 .subslice_utf8_offset(self.text())
501 .expect("subslice must succeed for split_text"),
502 }
503 }
504
505 /// Returns a [`TextSelection`] that corresponds to the offset **WITHIN** the textselection.
506 /// This returns a [`TextSelection`] with absolute coordinates in the resource.
507 ///
508 /// If the textselection is known (i.e. it has associated annotations), it will be returned as such with a handle (borrowed).
509 /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
510 ///
511 /// The [`TextSelection`] is returned as in a far pointer (`ResultItem`) that also contains reference to the underlying store (the [`TextResource`]).
512 fn textselection(
513 &'slf self,
514 offset: &Offset,
515 ) -> Result<ResultTextSelection<'store>, StamError> {
516 let resource = self.resource(); //courtesy of ResultItem
517 let offset = self.absolute_offset(&offset)?; //turns the relative offset into an absolute one (i.e. offsets in TextResource)
518 resource.textselection(&offset)
519 }
520}
521
522impl<'store, 'slf> Text<'store, 'slf> for ResultTextSelection<'store>
523where
524 'store: 'slf,
525{
526 fn text(&'slf self) -> &'store str {
527 let resource = self.store();
528 let beginbyte = resource
529 .utf8byte(self.begin())
530 .expect("utf8byte conversion should succeed");
531 let endbyte = resource
532 .utf8byte(self.end())
533 .expect("utf8byte conversion should succeed");
534 &resource.text()[beginbyte..endbyte]
535 }
536
537 fn textlen(&self) -> usize {
538 self.end() - self.begin()
539 }
540
541 /// Returns a string reference to a slice of text as specified by the offset
542 fn text_by_offset(&'slf self, offset: &Offset) -> Result<&'store str, StamError> {
543 let beginbyte =
544 self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.begin)?))?;
545 let endbyte =
546 self.utf8byte(self.absolute_cursor(self.beginaligned_cursor(&offset.end)?))?;
547 if endbyte < beginbyte {
548 Err(StamError::InvalidOffset(
549 Cursor::BeginAligned(beginbyte),
550 Cursor::BeginAligned(endbyte),
551 "End must be greater than or equal to begin. (Cursor should be interpreted as UTF-8 bytes in this error context only)",
552 ))
553 } else {
554 Ok(&self.text()[beginbyte..endbyte])
555 }
556 }
557
558 /// Finds the utf-8 byte position where the specified text subslice begins
559 /// The returned offset is relative to the TextSelection
560 fn subslice_utf8_offset(&self, subslice: &str) -> Option<usize> {
561 let self_begin = self.text().as_ptr() as usize;
562 let sub_begin = subslice.as_ptr() as usize;
563 if sub_begin < self_begin || sub_begin > self_begin.wrapping_add(self.text().len()) {
564 None
565 } else {
566 Some(sub_begin.wrapping_sub(self_begin))
567 }
568 }
569
570 /// This converts a unicode point to utf-8 byte, all in *relative* offsets to this textselection
571 fn utf8byte(&self, abscursor: usize) -> Result<usize, StamError> {
572 //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
573 //and can just call this same method on TextResource, which has the proper indices for this
574 let beginbyte = self
575 .store()
576 .subslice_utf8_offset(self.text())
577 .expect("subslice should succeed");
578 Ok(self.store().utf8byte(self.absolute_cursor(abscursor))? - beginbyte)
579 }
580
581 /// This converts utf-8 byte to charpos, all in *relative* offsets to this textselection
582 fn utf8byte_to_charpos(&self, bytecursor: usize) -> Result<usize, StamError> {
583 //Convert from and to absolute coordinates so we don't have to reimplemented all the logic
584 //and can just call this same method on TextResource, which has the proper indices for this
585 let beginbyte = self
586 .store()
587 .subslice_utf8_offset(self.text())
588 .expect("subslice should succeed");
589 Ok(self
590 .store()
591 .utf8byte_to_charpos(self.absolute_cursor(beginbyte + bytecursor))?
592 - self.begin())
593 }
594
595 fn absolute_cursor(&self, cursor: usize) -> usize {
596 self.begin() + cursor
597 }
598}
599
600impl<'store, 'slf> FindText<'store, 'slf> for ResultTextSelection<'store>
601where
602 'store: 'slf,
603{
604 /// Searches the text using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
605 /// is held by the [`FindRegexMatch`] struct.
606 ///
607 /// Passing multiple regular expressions at once is more efficient than calling this function anew for each one.
608 /// If capture groups are used in the regular expression, only those parts will be returned (the rest is context). If none are used,
609 /// the entire expression is returned.
610 ///
611 /// An `offset` can be specified to work on a sub-part rather than the entire text (like an existing TextSelection).
612 ///
613 /// The `allow_overlap` parameter determines if the matching expressions are allowed to
614 /// overlap. It you are doing some form of tokenisation, you also likely want this set to
615 /// false. All of this only matters if you supply multiple regular expressions.
616 ///
617 /// Results are returned in the exact order they are found in the text
618 fn find_text_regex<'regex>(
619 &'slf self,
620 expressions: &'regex [Regex],
621 precompiledset: Option<&RegexSet>,
622 allow_overlap: bool,
623 ) -> Result<FindRegexIter<'store, 'regex>, StamError> {
624 debug(self.store().config(), || {
625 format!(
626 "TextSelection::find_text_regex: expressions={:?}",
627 expressions
628 )
629 });
630 let text = self.text();
631 let selectexpressions =
632 find_text_regex_select_expressions(text, expressions, precompiledset)?;
633 //Returns an iterator that does the remainder of the actual searching
634 Ok(FindRegexIter {
635 resource: self.resource(),
636 expressions,
637 selectexpressions,
638 matchiters: Vec::new(),
639 nextmatches: Vec::new(),
640 text: self.text(),
641 begincharpos: self.begin(),
642 beginbytepos: self
643 .store()
644 .subslice_utf8_offset(text)
645 .expect("Subslice must be found"),
646 allow_overlap,
647 })
648 }
649
650 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
651 /// The iterator returns [`TextSelection`] items.
652 ///
653 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
654 ///
655 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first and then run [`self.find_text()`] on that instead.
656 fn find_text<'fragment>(
657 &'slf self,
658 fragment: &'fragment str,
659 ) -> FindTextIter<'store, 'fragment> {
660 FindTextIter {
661 store: self.rootstore(),
662 resources: smallvec!(self.resource().handle()),
663 resourcecursor: 0,
664 fragment,
665 offset: Offset::from(self),
666 }
667 }
668
669 /// Searches for the specified text fragment. Returns an iterator to iterate over all matches in the text.
670 /// The iterator returns [`TextSelection`] items.
671 ///
672 /// This search is case insensitive, use [`FindText::find_text()`] to search case sensitive. This variant is slightly less performant than the exact variant.
673 /// For more complex and powerful searching use [`FindText::find_text_regex()`] instead
674 ///
675 /// If you want to search only a subpart of the text, extract a [`TextSelection`] first with
676 /// [`FindText::textselection()`] and then run [`FindText::find_text_nocase()`] on that instead.
677 fn find_text_nocase(&'slf self, fragment: &str) -> FindNoCaseTextIter<'store> {
678 FindNoCaseTextIter {
679 store: self.rootstore(),
680 resources: smallvec!(self.resource().handle()),
681 resourcecursor: 0,
682 fragment: fragment.to_lowercase(),
683 offset: Offset::from(self),
684 }
685 }
686
687 fn split_text<'b>(&'slf self, delimiter: &'b str) -> SplitTextIter<'store, 'b> {
688 SplitTextIter {
689 resource: self.resource(),
690 iter: self.store().text().split(delimiter),
691 byteoffset: self
692 .subslice_utf8_offset(self.text())
693 .expect("subslice must succeed for split_text"),
694 }
695 }
696
697 /// Returns a [`TextSelection`] that corresponds to the offset **WITHIN** the textselection.
698 /// This returns a [`TextSelection`] with absolute coordinates in the resource.
699 ///
700 /// If the textselection is known (i.e. it has associated annotations), it will be returned as such with a handle (borrowed).
701 /// If it doesn't exist yet, a new one will be returned, and it won't have a handle, nor will it be added to the store automatically.
702 ///
703 /// The [`TextSelection`] is returned as in a far pointer ([`ResultTextSelection`]) that also contains reference to the underlying store (the [`TextResource`]).
704 fn textselection(
705 &'slf self,
706 offset: &Offset,
707 ) -> Result<ResultTextSelection<'store>, StamError> {
708 let offset = self.absolute_offset(&offset)?; //turns the relative offset into an absolute one (i.e. offsets in TextResource)
709 self.resource().textselection(&offset)
710 }
711}
712
713impl AnnotationStore {
714 /// Searches for text in all resources using one or more regular expressions, returns an iterator over TextSelections along with the matching expression, this
715 /// See [`FindText::find_text_regex()`].
716 /// Note that this method, unlike its counterpart [`FindText::find_text_regex()`], silently ignores any deeper errors that might occur.
717 pub fn find_text_regex<'store, 'r>(
718 &'store self,
719 expressions: &'r [Regex],
720 precompiledset: &'r Option<RegexSet>,
721 allow_overlap: bool,
722 ) -> impl Iterator<Item = FindRegexMatch<'store, 'r>> {
723 self.resources()
724 .filter_map(move |resource: ResultItem<'store, TextResource>| {
725 // ^-- the move is only needed to move the bool in, otherwise we had to make it &'r bool and that'd be weird
726 resource
727 .find_text_regex(expressions, precompiledset.as_ref(), allow_overlap)
728 .ok() //ignore errors!
729 })
730 .flatten()
731 }
732
733 pub fn find_text<'store, 'fragment>(
734 &'store self,
735 fragment: &'fragment str,
736 ) -> FindTextIter<'store, 'fragment> {
737 FindTextIter {
738 store: self,
739 resources: self
740 .resources
741 .iter()
742 .filter_map(|x| x.as_ref().map(|res| res.handle().unwrap()))
743 .collect(),
744 resourcecursor: 0,
745 fragment,
746 offset: Offset::whole(),
747 }
748 }
749
750 pub fn find_text_nocase<'store>(&'store self, fragment: &str) -> FindNoCaseTextIter<'store> {
751 FindNoCaseTextIter {
752 store: self,
753 resources: self
754 .resources
755 .iter()
756 .filter_map(|x| x.as_ref().map(|res| res.handle().unwrap()))
757 .collect(),
758 resourcecursor: 0,
759 fragment: fragment.to_lowercase(),
760 offset: Offset::whole(),
761 }
762 }
763}
764
765/// Auxiliary function used by find_text_regex(). This method does, if needed, a single initial pass
766/// over the regular expression set, identifying which regular expressions match and are to be searched
767/// for in subsequent passes to find WHERE they match.
768pub(crate) fn find_text_regex_select_expressions<'a, 'b>(
769 text: &'a str,
770 expressions: &'b [Regex],
771 precompiledset: Option<&RegexSet>,
772) -> Result<Vec<usize>, StamError> {
773 Ok(if expressions.len() > 2 {
774 //we have multiple expressions, first we do a pass to see WHICH of the regular expression matche (taking them all into account in a single pass!).
775 //then afterwards we find for each of the matching expressions WHERE they are found
776 let foundexpressions: Vec<_> = if let Some(regexset) = precompiledset {
777 regexset.matches(text).into_iter().collect()
778 } else {
779 RegexSet::new(expressions.iter().map(|x| x.as_str()))
780 .map_err(|e| {
781 StamError::RegexError(e, "Parsing regular expressions in search_text()")
782 })?
783 .matches(text)
784 .into_iter()
785 .collect()
786 };
787 foundexpressions
788 } else {
789 match expressions.len() {
790 1 => vec![0],
791 2 => vec![0, 1],
792 _ => unreachable!("Expected 1 or 2 expressions"),
793 }
794 })
795}
796
/// Wrapper over the two match iterators of the regex crate (`Matches` and `CaptureMatches`), so
/// that expressions with and without capture groups can be driven through one iterator type.
/// Lifetime `'r` is that of the compiled regular expression, `'store` that of the text being searched.
pub(crate) enum Matches<'r, 'store> {
    /// Iterates over plain matches
    NoCapture(regex::Matches<'r, 'store>),
    /// Iterates over matches along with their capture groups
    WithCapture(regex::CaptureMatches<'r, 'store>),
}
802
803/// Wrapper over regex Match or Captures (as returned by the iterator)
804pub(crate) enum Match<'store> {
805 NoCapture(regex::Match<'store>),
806 WithCapture(regex::Captures<'store>),
807}
808
809impl<'t> Match<'t> {
810 /// Return the begin offset of the match (in utf-8 bytes)
811 fn begin(&self) -> usize {
812 match self {
813 Self::NoCapture(m) => m.start(),
814 Self::WithCapture(m) => {
815 let mut begin = None;
816 for group in m.iter() {
817 if let Some(group) = group {
818 if begin.is_none() || begin.unwrap() < group.start() {
819 begin = Some(group.start());
820 }
821 }
822 }
823 begin.expect("there must be at least one capture group that was found")
824 }
825 }
826 }
827
828 /// Return the end offset of the match (in utf-8 bytes)
829 fn end(&self) -> usize {
830 match self {
831 Self::NoCapture(m) => m.end(),
832 Self::WithCapture(m) => {
833 let mut end = None;
834 for group in m.iter() {
835 if let Some(group) = group {
836 if end.is_none() || end.unwrap() < group.start() {
837 end = Some(group.start());
838 }
839 }
840 }
841 end.expect("there must be at least one capture group that was found")
842 }
843 }
844 }
845}
846
847impl<'r, 't> Iterator for Matches<'r, 't> {
848 type Item = Match<'t>;
849
850 fn next(&mut self) -> Option<Self::Item> {
851 match self {
852 Self::NoCapture(iter) => {
853 if let Some(m) = iter.next() {
854 Some(Match::NoCapture(m))
855 } else {
856 None
857 }
858 }
859 Self::WithCapture(iter) => {
860 if let Some(m) = iter.next() {
861 Some(Match::WithCapture(m))
862 } else {
863 None
864 }
865 }
866 }
867 }
868}
869
870/// This match structure is returned by the [`FindRegexIter`] iterator, which is in turn produced
871/// by [`FindText::find_text_regex()`] and searches a text based on regular expressions. This
872/// structure represents a single regular-expression match of the iterator on the text.
873pub struct FindRegexMatch<'store, 'r> {
874 expression: &'r Regex,
875 expression_index: usize,
876 textselections: SmallVec<[ResultTextSelection<'store>; 2]>,
877 //Records the numbers of the capture that match (1-indexed)
878 capturegroups: SmallVec<[usize; 2]>,
879 resource: ResultItem<'store, TextResource>,
880}
881
882impl<'store, 'r> FindRegexMatch<'store, 'r> {
883 /// Does this match return multiple text selections?
884 /// Multiple text selections are returned only when the expression contains multiple capture groups.
885 pub fn multi(&self) -> bool {
886 self.textselections.len() > 1
887 }
888
889 /// Returns the regular expression that matched
890 pub fn expression(&self) -> &'r Regex {
891 self.expression
892 }
893
894 /// Returns the index of regular expression that matched
895 pub fn expression_index(&self) -> usize {
896 self.expression_index
897 }
898
899 pub fn textselections(&self) -> &[ResultTextSelection<'store>] {
900 &self.textselections
901 }
902
903 pub fn resource(&self) -> &ResultItem<'store, TextResource> {
904 &self.resource
905 }
906
907 /// Records the number of the capture groups (1-indexed!) that match.
908 /// This array has the same length as textselections and identifies precisely
909 /// which textselection corresponds with which capture group.
910 pub fn capturegroups(&self) -> &[usize] {
911 &self.capturegroups
912 }
913
914 /// Return the text of the match, this only works
915 /// if the regular expression targets a single
916 /// consecutive text, i.e. by not using multiple capture groups.
917 pub fn as_str(&self) -> Option<&'store str> {
918 if self.multi() {
919 None
920 } else {
921 self.textselections
922 .first()
923 .map(|textselection| textselection.text())
924 }
925 }
926
927 /// This returns a vector of texts and is mainly useful in case multiple
928 /// patterns were captured.
929 /// Use [`Self::as_str()`] instead if you expect only a single text item.
930 pub fn text(&self) -> Vec<&str> {
931 self.textselections
932 .iter()
933 .map(|textselection| textselection.text())
934 .collect()
935 }
936}
937
938/// This iterator is produced by [`FindText::find_text_regex()`] and searches a text based on regular expressions.
939pub struct FindRegexIter<'store, 'regex> {
940 pub(crate) resource: ResultItem<'store, TextResource>,
941 pub(crate) expressions: &'regex [Regex], // allows keeping all of the regular expressions external and borrow it, even if only a subset is found (subset is detected in prior pass by search_by_text())
942 pub(crate) selectexpressions: Vec<usize>, //points at an expression, not used directly but via selectionexpression() method
943 pub(crate) matchiters: Vec<Matches<'regex, 'store>>, //each expression (from selectexpressions) has its own interator (same length as above vec)
944 pub(crate) nextmatches: Vec<Option<Match<'store>>>, //this buffers the next match for each expression (from selectexpressions, same length as above vec)
945 pub(crate) text: &'store str,
946 pub(crate) begincharpos: usize,
947 pub(crate) beginbytepos: usize,
948 pub(crate) allow_overlap: bool,
949}
950
951impl<'store, 'regex> Iterator for FindRegexIter<'store, 'regex> {
952 type Item = FindRegexMatch<'store, 'regex>;
953 fn next(&mut self) -> Option<Self::Item> {
954 if self.matchiters.is_empty() {
955 //instantiate the iterators for the expressions and retrieve the first item for each
956 //this is only called once when the iterator first starts
957 for i in self.selectexpressions.iter() {
958 let re = &self.expressions[*i];
959 let mut iter = if re.captures_len() > 1 {
960 Matches::WithCapture(re.captures_iter(self.text))
961 } else {
962 Matches::NoCapture(re.find_iter(self.text))
963 };
964 self.nextmatches.push(iter.next());
965 self.matchiters.push(iter);
966 }
967 }
968
969 //find the best next match (the single one next in line amongst all the iterators)
970 let mut bestnextmatch: Option<&Match<'store>> = None;
971 let mut bestmatchindex = None;
972 for (i, m) in self.nextmatches.iter().enumerate() {
973 if let Some(m) = m {
974 if bestnextmatch.is_none() || m.begin() < bestnextmatch.unwrap().begin() {
975 bestnextmatch = Some(m);
976 bestmatchindex = Some(i);
977 }
978 }
979 }
980
981 if let Some(i) = bestmatchindex {
982 // this match will be the result, convert it to the proper structure
983 let m = self.nextmatches[i].take().unwrap();
984
985 // iterate any buffers than overlap with this result, discarding those matces in the process
986 if !self.allow_overlap {
987 for (j, m2) in self.nextmatches.iter_mut().enumerate() {
988 if j != i && m2.is_some() {
989 if m2.as_ref().unwrap().begin() >= m.begin()
990 && m2.as_ref().unwrap().begin() < m.end()
991 {
992 //(note: no need to check whether m2.end in range m.begin-m.end)
993 *m2 = self.matchiters[j].next();
994 }
995 }
996 }
997 }
998
999 let result = self.match_to_result(m, i);
1000
1001 // iterate the iterator for this one and buffer the next match for next round
1002 self.nextmatches[i] = self.matchiters[i].next();
1003
1004 Some(result)
1005 } else {
1006 //nothing found, we are all done
1007 None
1008 }
1009 }
1010}
1011
1012impl<'store, 'regex> FindRegexIter<'store, 'regex> {
1013 /// Build the final match structure we return
1014 fn match_to_result(
1015 &self,
1016 m: Match<'store>,
1017 selectexpression_index: usize,
1018 ) -> FindRegexMatch<'store, 'regex> {
1019 let expression_index = self.selectexpressions[selectexpression_index];
1020 match m {
1021 Match::NoCapture(m) => {
1022 let textselection = self
1023 .resource
1024 .textselection(&Offset::simple(
1025 self.begincharpos
1026 + self
1027 .resource
1028 .utf8byte_to_charpos(self.beginbytepos + m.start())
1029 .expect("byte to pos conversion must succeed"),
1030 self.begincharpos
1031 + self
1032 .resource
1033 .utf8byte_to_charpos(self.beginbytepos + m.end())
1034 .expect("byte to pos conversion must succeed"),
1035 ))
1036 .expect("textselection from offset must succeed");
1037 FindRegexMatch {
1038 expression: &self.expressions[expression_index],
1039 expression_index,
1040 resource: self.resource.clone(),
1041 textselections: smallvec!(textselection),
1042 capturegroups: smallvec!(),
1043 }
1044 }
1045 Match::WithCapture(m) => {
1046 let mut groupiter = m.iter();
1047 groupiter.next(); //The first match always corresponds to the overall match of the regex, we can ignore it
1048 let mut textselections: SmallVec<_> = SmallVec::new();
1049 let mut capturegroups: SmallVec<[usize; 2]> = SmallVec::new();
1050 for (i, group) in groupiter.enumerate() {
1051 if let Some(group) = group {
1052 capturegroups.push(i + 1); //1-indexed
1053 textselections.push(
1054 self.resource
1055 .textselection(&Offset::simple(
1056 self.begincharpos
1057 + self
1058 .resource
1059 .utf8byte_to_charpos(self.beginbytepos + group.start())
1060 .expect("byte to pos conversion must succeed"),
1061 self.begincharpos
1062 + self
1063 .resource
1064 .utf8byte_to_charpos(self.beginbytepos + group.end())
1065 .expect("byte to pos conversion must succeed"),
1066 ))
1067 .expect("textselection from offset must succeed"),
1068 )
1069 }
1070 }
1071 FindRegexMatch {
1072 expression: &self.expressions[expression_index],
1073 expression_index,
1074 resource: self.resource.clone(),
1075 textselections,
1076 capturegroups,
1077 }
1078 }
1079 }
1080 }
1081}
1082
1083/// This iterator is produced by [`FindText::find_text()`] and searches a text for a single fragment. The search is case sensitive. See [`FindNoCaseTextIter`] for a case-insensitive variant.
1084/// The iterator yields [`ResultTextSelection`] items (which encapsulates [`TextSelection`]).
1085pub struct FindTextIter<'a, 'b> {
1086 pub(crate) store: &'a AnnotationStore,
1087 pub(crate) resources: SmallVec<[TextResourceHandle; 1]>,
1088 pub(crate) fragment: &'b str,
1089 pub(crate) resourcecursor: usize,
1090 pub(crate) offset: Offset,
1091}
1092
1093impl<'a, 'b> Iterator for FindTextIter<'a, 'b> {
1094 type Item = ResultTextSelection<'a>;
1095 fn next(&mut self) -> Option<Self::Item> {
1096 loop {
1097 if let Some(resourcehandle) = self.resources.get(self.resourcecursor).copied() {
1098 let resource = self
1099 .store
1100 .resource(resourcehandle)
1101 .expect("resource must exist");
1102 if let Some(text) = resource.text_by_offset(&self.offset).ok() {
1103 let beginbytepos = resource
1104 .subslice_utf8_offset(text)
1105 .expect("bytepos must be valid");
1106 if let Some(foundbytepos) = text.find(self.fragment) {
1107 let endbytepos = foundbytepos + self.fragment.len();
1108 let newbegin = resource
1109 .utf8byte_to_charpos(beginbytepos + foundbytepos)
1110 .expect("utf-8 byte must resolve to valid charpos");
1111 let newend = resource
1112 .utf8byte_to_charpos(beginbytepos + endbytepos)
1113 .expect("utf-8 byte must resolve to valid charpos");
1114 //set offset for next run
1115 self.offset = Offset {
1116 begin: Cursor::BeginAligned(newend),
1117 end: self.offset.end,
1118 };
1119 match resource.textselection(&Offset::simple(newbegin, newend)) {
1120 Ok(textselection) => return Some(textselection),
1121 Err(e) => {
1122 eprintln!("WARNING: FindTextIter ended prematurely: {}", e);
1123 return None;
1124 }
1125 }
1126 } else {
1127 self.resourcecursor += 1;
1128 self.offset = Offset::whole();
1129 }
1130 } else {
1131 self.resourcecursor += 1;
1132 self.offset = Offset::whole();
1133 }
1134 } else {
1135 return None;
1136 }
1137 }
1138 }
1139}
1140/// This iterator is produced by [`FindText::find_text_nocase()`] and searches a text for a single fragment, without regard for casing.
1141/// It has more overhead than the exact (case sensitive) variant [`FindTextIter`].
1142pub struct FindNoCaseTextIter<'a> {
1143 pub(crate) store: &'a AnnotationStore,
1144 pub(crate) resources: SmallVec<[TextResourceHandle; 1]>,
1145
1146 /// Fragment must be lowercase
1147 pub(crate) fragment: String,
1148 pub(crate) resourcecursor: usize,
1149 pub(crate) offset: Offset,
1150}
1151
1152impl<'a> Iterator for FindNoCaseTextIter<'a> {
1153 type Item = ResultTextSelection<'a>;
1154 fn next(&mut self) -> Option<Self::Item> {
1155 loop {
1156 if let Some(resourcehandle) = self.resources.get(self.resourcecursor).copied() {
1157 let resource = self
1158 .store
1159 .resource(resourcehandle)
1160 .expect("resource must exist");
1161 if let Some(text) = resource.text_by_offset(&self.offset).ok() {
1162 let beginbytepos = resource
1163 .subslice_utf8_offset(text)
1164 .expect("bytepos must be valid");
1165 let text = text.to_lowercase();
1166 if let Some(foundbytepos) = text.find(self.fragment.as_str()) {
1167 let endbytepos = foundbytepos + self.fragment.len(); //MAYBE TODO: possible issue if uppercase and lowercase variants have different byte length!
1168 let newbegin = resource
1169 .utf8byte_to_charpos(beginbytepos + foundbytepos)
1170 .expect("utf-8 byte must resolve to valid charpos");
1171 let newend = resource
1172 .utf8byte_to_charpos(beginbytepos + endbytepos)
1173 .expect("utf-8 byte must resolve to valid charpos");
1174 //set offset for next run
1175 self.offset = Offset {
1176 begin: Cursor::BeginAligned(newend),
1177 end: self.offset.end,
1178 };
1179 match resource.textselection(&Offset::simple(newbegin, newend)) {
1180 Ok(textselection) => return Some(textselection),
1181 Err(e) => {
1182 eprintln!("WARNING: FindTextIter ended prematurely: {}", e);
1183 return None;
1184 }
1185 }
1186 } else {
1187 self.resourcecursor += 1;
1188 self.offset = Offset::whole();
1189 }
1190 } else {
1191 self.resourcecursor += 1;
1192 self.offset = Offset::whole();
1193 }
1194 } else {
1195 return None;
1196 }
1197 }
1198 }
1199}
1200
1201/// This iterator is produced by [`FindText::split_text()`] and splits a text based on a delimiter.
1202/// The iterator yields [`ResultTextSelection`] (which encapsulates [`TextSelection`]).
1203pub struct SplitTextIter<'store, 'b> {
1204 pub(crate) resource: ResultItem<'store, TextResource>,
1205 pub(crate) iter: std::str::Split<'store, &'b str>,
1206 pub(crate) byteoffset: usize,
1207}
1208
1209impl<'store, 'b> Iterator for SplitTextIter<'store, 'b> {
1210 type Item = ResultTextSelection<'store>;
1211 fn next(&mut self) -> Option<Self::Item> {
1212 if let Some(matchstr) = self.iter.next() {
1213 let beginbyte = self
1214 .resource
1215 .subslice_utf8_offset(matchstr)
1216 .expect("match must be found")
1217 - self.byteoffset;
1218 let endbyte = (beginbyte + matchstr.len()) - self.byteoffset;
1219 Some(
1220 self.resource
1221 .textselection(&Offset::simple(
1222 self.resource
1223 .utf8byte_to_charpos(beginbyte)
1224 .expect("utf-8 byte must resolve to char pos"),
1225 self.resource
1226 .utf8byte_to_charpos(endbyte)
1227 .expect("utf-8 byte must resolve to char pos"),
1228 ))
1229 .expect("text selection must succeed"),
1230 )
1231 } else {
1232 None
1233 }
1234 }
1235}