use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

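/// A lexed and parsed document, storing the source text alongside the tokens
/// derived from it.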
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
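    /// Locate the indices of all tokens whose spans intersect the provided
    /// character [`Span`].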
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

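    /// Locate all tokens whose spans intersect the provided character
    /// [`Span`], copying them out as [`FatToken`]s.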
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

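    /// Lex and parse a provided document with a given parser and dictionary.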
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

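    /// Lex and parse a provided document with a given parser and the curated
    /// dictionary.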
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

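    /// Lex and parse a document from an existing shared character buffer,
    /// avoiding a copy of the source text.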
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

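    /// Lex and parse a plain-English document with the curated dictionary.
    ///
    /// Illustrative usage, assuming the crate is consumed as `harper_core`:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```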
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

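    /// Run the post-lexing passes over the token stream: condense multi-token
    /// constructs (spaces, newlines, contractions, dotted initialisms, number
    /// suffixes, ellipses, and Latin abbreviations), pair quotation marks, and
    /// attach dictionary metadata plus part-of-speech and noun-phrase tags to
    /// each word token.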
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        let mut i = 0;

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i];
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

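    /// Convert runs of two or more newlines into paragraph breaks.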
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

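    /// Condense a stretch of `stretch_len` tokens into one, for each of the
    /// provided start indices, merging the spans of the condensed tokens.
    /// The indices are expected to be sorted and their stretches disjoint.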
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

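    /// Find the token that contains the provided character index, using a
    /// binary search over the token spans.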
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

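    /// Iterate over the noun-phrase chunks identified by the chunker, with
    /// leading and trailing whitespace trimmed from each slice.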
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

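    /// Get the word adjacent to the whitespace token at `base + offset`,
    /// stepping one further token in the direction of `offset`. Returns
    /// `None` if the token at the offset is not whitespace or its neighbor
    /// is not a word.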
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

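    /// Link quotation marks to their twins, pairing them up in the order they
    /// appear in the document.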
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

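    /// Find number tokens immediately followed by a word that is an ordinal
    /// suffix (like "1" + "st") and condense them into a single number token
    /// carrying the suffix.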
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

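    /// Condense runs of adjacent space tokens into single tokens, summing
    /// their counts and merging their spans.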
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            // The rest of the pass only applies if the current token is a space.
            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                // Absorb every subsequent space token that is contiguous in
                // the source text.
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Tokens separated by other content belong to different runs.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<LongestMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<LongestMatchOf> {
        Lrc::new(LongestMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

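    /// Condense each match of the given expression down to its first token,
    /// widening that token's span to cover the whole match and applying
    /// `edit` to it before the remaining matched tokens are removed.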
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            // The rest of the pass only applies if the current token is a newline.
            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                // Absorb every subsequent newline token into the first one.
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

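    /// Condense dotted initialisms like "N.S.A." (single-letter words, each
    /// followed by a period) into a single word token.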
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {});
    }
}

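/// Generate the boilerplate `first_*`, `last_*`, and `iter_*` members of
/// [`TokenStringExt`] for [`Document`] by delegating to the underlying token
/// vector.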
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [< iter_ $thing _indices >](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
632
633impl TokenStringExt for Document {
634 create_fns_on_doc!(adjective);
635 create_fns_on_doc!(apostrophe);
636 create_fns_on_doc!(at);
637 create_fns_on_doc!(chunk_terminator);
638 create_fns_on_doc!(comma);
639 create_fns_on_doc!(conjunction);
640 create_fns_on_doc!(currency);
641 create_fns_on_doc!(ellipsis);
642 create_fns_on_doc!(hostname);
643 create_fns_on_doc!(likely_homograph);
644 create_fns_on_doc!(noun);
645 create_fns_on_doc!(number);
646 create_fns_on_doc!(paragraph_break);
647 create_fns_on_doc!(pipe);
648 create_fns_on_doc!(preposition);
649 create_fns_on_doc!(punctuation);
650 create_fns_on_doc!(quote);
651 create_fns_on_doc!(sentence_terminator);
652 create_fns_on_doc!(space);
653 create_fns_on_doc!(unlintable);
654 create_fns_on_doc!(verb);
655 create_fns_on_doc!(word);
656 create_fns_on_doc!(word_like);
657
658 fn first_sentence_word(&self) -> Option<&Token> {
659 self.tokens.first_sentence_word()
660 }
661
662 fn first_non_whitespace(&self) -> Option<&Token> {
663 self.tokens.first_non_whitespace()
664 }
665
666 fn span(&self) -> Option<Span> {
667 self.tokens.span()
668 }
669
670 fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
671 self.tokens.iter_linking_verb_indices()
672 }
673
674 fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
675 self.tokens.iter_linking_verbs()
676 }
677
678 fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
679 self.tokens.iter_chunks()
680 }
681
682 fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
683 self.tokens.iter_paragraphs()
684 }
685
686 fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
687 self.tokens.iter_sentences()
688 }
689}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}
876}