use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

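/// A document containing lexed and parsed English text, along with the
/// source characters it was parsed from.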
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
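    /// Get the index of every token whose span intersects the provided span.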
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

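    /// Get a [`FatToken`] for every token whose span intersects the provided
    /// span.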
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

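    /// Lex and parse the provided text using the given parser and
    /// dictionary.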
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

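    /// Lex and parse the provided text using the given parser, backed by the
    /// curated dictionary.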
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

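    /// Lex and parse a shared character buffer directly, avoiding a copy of
    /// the source text.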
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

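    /// Parse the provided text as plain English, backed by the curated
    /// dictionary.
    ///
    /// A minimal usage sketch (assuming the crate-root re-export
    /// `harper_core::Document`):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello there!");
    /// assert_eq!(doc.get_full_string(), "Hello there!");
    /// ```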
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

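    /// Parse the provided text as plain English using a custom dictionary.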
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

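    /// Parse the provided text as Markdown, backed by the curated
    /// dictionary.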
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

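    /// Parse the provided text as Markdown with default options, backed by
    /// the curated dictionary.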
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

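    /// Parse the provided text as Markdown using a custom dictionary.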
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

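    /// Parse the provided text as Markdown with default options, using a
    /// custom dictionary.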
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

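    /// Run the post-lexing pipeline: condense multi-token constructs, pair
    /// quotes, tag each word with its part of speech and noun-phrase
    /// membership, and attach dictionary metadata.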
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        let mut i = 0;

        for token in self.tokens.iter_mut() {
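            // Only non-whitespace tokens were fed to the tagger, so `i` must
            // advance past exactly those tokens to stay aligned with
            // `token_tags` and `np_flags`.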
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i];
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

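    /// Convert every newline token representing two or more newlines into a
    /// paragraph break.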
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

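    /// Condense each stretch of `stretch_len` tokens beginning at the given
    /// indices into a single token. Assumes the indices are sorted and that
    /// the stretches do not overlap.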
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

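    /// Find the token whose span contains the provided character index,
    /// using a binary search over the sorted token spans.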
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

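    /// Get the token at the provided index, if it exists.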
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

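    /// Get the token at `base` offset by the provided (possibly negative)
    /// amount, if it exists.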
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

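    /// Get an iterator over all the tokens contained in the document.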
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

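    /// Iterate over the noun phrases in the document, as flagged by the
    /// chunker. Each phrase is yielded as a slice of tokens with surrounding
    /// whitespace trimmed away.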
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

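    /// Get an iterator over the document's tokens as owned [`FatToken`]s,
    /// each carrying a copy of its source text.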
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

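    /// Get the word token adjacent to the token at `base`, where the token
    /// at `base + offset` must be whitespace and the token one step further
    /// in the same direction must be a word. Returns `None` otherwise.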
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum())?;
        word_token.kind.is_word().then_some(word_token)
    }

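    /// Same as [`Self::fat_tokens`], but yielding [`FatStringToken`]s.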
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

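    /// Link quotation marks to their partners, storing the partner's index
    /// in each quote's `twin_loc`. Quotes are paired in the order they
    /// appear.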
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

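    /// Find number tokens followed by an ordinal suffix (as in "1st") and
    /// condense the pair into a single number token carrying that suffix.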
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

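    /// Searches for sequences of adjacent space tokens and condenses them
    /// into one.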
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<LongestMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<LongestMatchOf> {
        Lrc::new(LongestMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

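    /// Condense every match of the given expression down to a single token,
    /// merging the matched tokens' spans and applying `edit` to the
    /// surviving token.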
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

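    /// Searches for sequences of newline tokens and condenses them into
    /// one.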
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

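    /// Searches for dotted initialisms (like "N.S.A.") and condenses them
    /// into a single word token.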
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

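    /// Searches for contractions (like "isn't") and condenses them into a
    /// single word token.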
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {});
    }
}

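/// Generates the `first_*`, `last_*`, and `iter_*` accessors required by
/// [`TokenStringExt`], each forwarding to the same method on the token
/// vector.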
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [< iter_ $thing _indices >](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}