1#![doc = include_str!("../README.md")]
2
3pub mod contracts;
4pub mod operations;
5pub mod surface;
6
7pub use contracts::{
8 AsTextSegmentContract, IntoTextDocumentContract, TextAnnotationSpan, TextDocumentContract,
9 TextProvenance, TextSegmentContract, TextSourceRef, TimebaseContract, TimestampContract,
10};
11
12use std::collections::BTreeMap;
13
14use serde::{Deserialize, Serialize};
15use unicode_normalization::UnicodeNormalization;
16use unicode_segmentation::UnicodeSegmentation;
17use video_analysis_core::{OwnedTextSegment, TextSegment, Timestamp};
18
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
20pub struct TextDocument<'a> {
22 pub id: &'a str,
24 pub text: &'a str,
26 pub language: Option<&'a str>,
28 pub timestamp: Option<Timestamp>,
30}
31
32impl<'a> TextDocument<'a> {
33 pub fn new(id: &'a str, text: &'a str) -> Self {
35 Self {
36 id,
37 text,
38 language: None,
39 timestamp: None,
40 }
41 }
42
43 pub fn from_segment_id(id: &'a str, segment: &TextSegment<'a>) -> Self {
45 Self {
46 id,
47 text: segment.text,
48 language: segment.language,
49 timestamp: segment.timestamp,
50 }
51 }
52
53 pub fn from_stream_segment(stream_id: &str, segment: &TextSegment<'_>) -> OwnedTextDocument {
55 OwnedTextDocument::from_stream_segment(stream_id, segment)
56 }
57
58 pub fn from_segment(stream_id: &'a str, segment: &TextSegment<'a>) -> Self {
64 Self {
65 id: stream_id,
66 text: segment.text,
67 language: segment.language,
68 timestamp: segment.timestamp,
69 }
70 }
71}
72
73#[derive(Debug, Clone, PartialEq, Eq)]
74pub struct OwnedTextDocument {
76 pub id: String,
78 pub text: String,
80 pub language: Option<String>,
82 pub timestamp: Option<Timestamp>,
84}
85
86impl OwnedTextDocument {
87 pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
89 Self {
90 id: id.into(),
91 text: text.into(),
92 language: None,
93 timestamp: None,
94 }
95 }
96
97 pub fn language(mut self, language: impl Into<String>) -> Self {
99 self.language = Some(language.into());
100 self
101 }
102
103 pub fn timestamp(mut self, timestamp: Timestamp) -> Self {
105 self.timestamp = Some(timestamp);
106 self
107 }
108
109 pub fn from_segment_id(id: impl Into<String>, segment: &OwnedTextSegment) -> Self {
111 let segment = segment.as_segment();
112 Self {
113 id: id.into(),
114 text: segment.text.to_string(),
115 language: segment.language.map(ToString::to_string),
116 timestamp: segment.timestamp,
117 }
118 }
119
120 pub fn from_stream_segment(stream_id: &str, segment: &TextSegment<'_>) -> Self {
122 Self {
123 id: segment_document_id(stream_id, segment.segment_index),
124 text: segment.text.to_string(),
125 language: segment.language.map(ToString::to_string),
126 timestamp: segment.timestamp,
127 }
128 }
129
130 pub fn from_owned_stream_segment(stream_id: &str, segment: &OwnedTextSegment) -> Self {
132 Self::from_stream_segment(stream_id, &segment.as_segment())
133 }
134
135 pub fn from_segment(stream_id: impl Into<String>, segment: &OwnedTextSegment) -> Self {
141 let segment = segment.as_segment();
142 Self {
143 id: stream_id.into(),
144 text: segment.text.to_string(),
145 language: segment.language.map(ToString::to_string),
146 timestamp: segment.timestamp,
147 }
148 }
149
150 pub fn as_document(&self) -> TextDocument<'_> {
152 TextDocument {
153 id: &self.id,
154 text: &self.text,
155 language: self.language.as_deref(),
156 timestamp: self.timestamp,
157 }
158 }
159}
160
/// Builds the document id for one segment of a stream: `"<stream_id>:<segment_index>"`.
pub fn segment_document_id(stream_id: &str, segment_index: u64) -> String {
    let index = segment_index.to_string();
    let mut id = String::with_capacity(stream_id.len() + 1 + index.len());
    id.push_str(stream_id);
    id.push(':');
    id.push_str(&index);
    id
}
165
/// Basic size counters for a piece of text, as produced by `text_stats`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct TextStats {
    /// Length of the text in bytes.
    pub bytes: usize,
    /// Number of `char`s (Unicode scalar values).
    pub chars: usize,
    /// Number of entries returned by `tokenize_words`.
    pub words: usize,
    /// Number of lines as counted by `str::lines`.
    pub lines: usize,
    /// Number of entries returned by `split_sentences`.
    pub sentences: usize,
}
180
/// Location of a slice of text, expressed both in bytes and in `char`s.
///
/// Spans built in this file are half-open: they describe `text[byte_start..byte_end]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextSpan {
    /// Byte offset of the first byte of the slice.
    pub byte_start: usize,
    /// Byte offset one past the last byte of the slice.
    pub byte_end: usize,
    /// Number of `char`s that precede `byte_start`.
    pub char_start: usize,
    /// Number of `char`s that precede `byte_end`.
    pub char_end: usize,
}
193
/// Numeric identifier of an annotation (token, sentence or paragraph) inside a graph.
///
/// `build_annotation_graph_from_parts` numbers tokens first, then sentences, then paragraphs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct AnnotationId(pub usize);
197
198#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
199pub struct AnnotationConfidence(f32);
201
202impl AnnotationConfidence {
203 pub fn new(value: f32) -> Self {
205 let value = if value.is_finite() {
206 value.clamp(0.0, 1.0)
207 } else {
208 0.0
209 };
210 Self(value)
211 }
212
213 pub fn get(self) -> f32 {
215 self.0
216 }
217}
218
219impl Default for AnnotationConfidence {
220 fn default() -> Self {
221 Self(1.0)
222 }
223}
224
225impl From<f32> for AnnotationConfidence {
226 fn from(value: f32) -> Self {
227 Self::new(value)
228 }
229}
230
/// Records what kind of component produced a set of annotations.
///
/// Only `Tokenizer` is assigned in this file (by `build_annotation_graph_from_parts`).
/// NOTE(review): the per-variant notes below are inferred from the names only; confirm
/// against the producers that set them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AnnotationProvenance {
    /// Presumably rule-based / heuristic annotators.
    Heuristic,
    /// Annotations derived from the tokenizer and splitters in this file.
    Tokenizer,
    /// Presumably an ONNX-backed model.
    Onnx,
    /// Presumably a Candle-backed model.
    Candle,
    /// Presumably a CUDA-backed backend named "CudaOxide".
    CudaOxide,
    /// Presumably annotations supplied by an external system.
    External,
    /// Presumably annotations computed from other annotations.
    Derived,
}
249
/// A [`TextSpan`] tagged with the id of the annotation it belongs to.
///
/// In graphs built by `build_annotation_graph_from_parts`, `id` equals the id of the
/// token, sentence or paragraph that owns the span.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextSpanRef {
    /// Id of the owning annotation.
    pub id: AnnotationId,
    /// Location of the annotation in the source text.
    pub span: TextSpan,
}
258
259#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
260pub struct TextBoundaryOptions {
262 pub lowercase: bool,
264 pub normalize_unicode: bool,
266 pub include_numbers: bool,
268 pub include_punctuation: bool,
270 pub min_chars: usize,
272}
273
274impl Default for TextBoundaryOptions {
275 fn default() -> Self {
276 Self {
277 lowercase: true,
278 normalize_unicode: true,
279 include_numbers: true,
280 include_punctuation: false,
281 min_chars: 1,
282 }
283 }
284}
285
/// One word-level segment produced by `segment_words`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct WordSegment {
    /// The segment exactly as it appears in the source text.
    pub text: String,
    /// `text` after `normalize_text` with the caller's options.
    pub normalized: String,
    /// Location of the segment in the source text.
    pub span: TextSpan,
    /// Classification assigned to the segment.
    pub kind: TokenKind,
}
298
/// One grapheme cluster produced by `segment_graphemes`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GraphemeSpan {
    /// The grapheme cluster as it appears in the source text.
    pub text: String,
    /// Location of the cluster in the source text.
    pub span: TextSpan,
}
307
/// Per-character script statistics produced by `detect_script_profile`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScriptProfile {
    /// Number of characters seen per script name (for example `"Latin"`, `"Han"`).
    pub scripts: BTreeMap<String, usize>,
    /// Number of characters for which `char::is_numeric` is true.
    pub digits: usize,
    /// Number of whitespace characters.
    pub whitespace: usize,
    /// Number of punctuation characters.
    pub punctuation: usize,
    /// Number of characters that fit none of the other buckets.
    pub other: usize,
    /// Script with the highest count; ties go to the lexicographically smaller name.
    /// `None` when no character was attributed to a script.
    pub dominant_script: Option<String>,
    /// True when more than one script was seen.
    pub is_mixed: bool,
}
326
/// One token produced by `tokenize`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
    /// The token exactly as it appears in the source text.
    pub text: String,
    /// `text` after `normalize_text` with the tokenizer options.
    pub normalized: String,
    /// Location of the token in the source text.
    pub span: TextSpan,
    /// Classification assigned to the token.
    pub kind: TokenKind,
}
339
/// Classification of a token or word segment.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TokenKind {
    /// A run of word characters.
    Word,
    /// A numeric token; `tokenize` starts one at an ASCII digit.
    Number,
    /// Text starting with `http://`, `https://` or `www.`.
    Url,
    /// Text accepted by the e-mail check (`local@domain.tld` shape).
    Email,
    /// `@` followed by at least one word character.
    Mention,
    /// `#` followed by at least one word character.
    Hashtag,
    /// A punctuation character.
    Punctuation,
    /// Anything that fits none of the other kinds.
    Other,
}
360
/// One sentence produced by `split_sentence_spans`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Sentence {
    /// Sentence text with surrounding whitespace trimmed.
    pub text: String,
    /// Location of the trimmed sentence in the source text.
    pub span: TextSpan,
    /// Number of tokens `tokenize` yields for `text` with the splitter's options.
    pub token_count: usize,
}
371
/// One paragraph produced by `split_paragraphs`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Paragraph {
    /// Paragraph text, from its first to its last non-blank character.
    pub text: String,
    /// Location of the paragraph in the source text.
    pub span: TextSpan,
    /// Number of sentences found in `text` using the default processing options.
    pub sentence_count: usize,
}
382
/// A token inside a [`TextAnnotationGraph`], with links to its sentence and paragraph.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CanonicalToken {
    /// Id of this token.
    pub id: AnnotationId,
    /// Location of the token, tagged with the token's id.
    pub span: TextSpanRef,
    /// The token exactly as it appears in the source text.
    pub text: String,
    /// Normalized form of `text`.
    pub normalized: String,
    /// Classification of the token.
    pub kind: TokenKind,
    /// Id of the first sentence whose span contains the token. The graph builder falls
    /// back to the first sentence id when no sentence contains it.
    pub sentence_id: AnnotationId,
    /// Id of the first paragraph whose span contains the token, if any.
    pub paragraph_id: Option<AnnotationId>,
}
401
/// A sentence inside a [`TextAnnotationGraph`], with links to its tokens and paragraph.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnnotatedSentence {
    /// Id of this sentence.
    pub id: AnnotationId,
    /// Location of the sentence, tagged with the sentence's id.
    pub span: TextSpanRef,
    /// Sentence text.
    pub text: String,
    /// Index (into the graph's token list) of the first contained token; the graph
    /// builder stores the token count here when the sentence contains none.
    pub token_start: usize,
    /// One past the index of the last contained token; the graph builder stores the
    /// token count here when the sentence contains none.
    pub token_end: usize,
    /// Ids of the tokens whose spans lie inside the sentence.
    pub token_ids: Vec<AnnotationId>,
    /// Id of the first paragraph whose span contains the sentence, if any.
    pub paragraph_id: Option<AnnotationId>,
}
420
/// A paragraph inside a [`TextAnnotationGraph`], with links to its sentences.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AnnotatedParagraph {
    /// Id of this paragraph.
    pub id: AnnotationId,
    /// Location of the paragraph, tagged with the paragraph's id.
    pub span: TextSpanRef,
    /// Paragraph text.
    pub text: String,
    /// Index (into the graph's sentence list) of the first contained sentence; the graph
    /// builder stores the sentence count here when the paragraph contains none.
    pub sentence_start: usize,
    /// One past the index of the last contained sentence; equals `sentence_start` when
    /// the paragraph contains none.
    pub sentence_end: usize,
    /// Ids of the sentences whose spans lie inside the paragraph.
    pub sentence_ids: Vec<AnnotationId>,
}
437
438#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
439pub struct TextAnnotationGraph {
441 pub text: String,
443 pub provenance: AnnotationProvenance,
445 pub confidence: AnnotationConfidence,
447 pub tokens: Vec<CanonicalToken>,
449 pub sentences: Vec<AnnotatedSentence>,
451 pub paragraphs: Vec<AnnotatedParagraph>,
453}
454
455impl TextAnnotationGraph {
456 pub fn token(&self, id: AnnotationId) -> Option<&CanonicalToken> {
458 self.tokens.iter().find(|token| token.id == id)
459 }
460
461 pub fn sentence(&self, id: AnnotationId) -> Option<&AnnotatedSentence> {
463 self.sentences.iter().find(|sentence| sentence.id == id)
464 }
465
466 pub fn paragraph(&self, id: AnnotationId) -> Option<&AnnotatedParagraph> {
468 self.paragraphs.iter().find(|paragraph| paragraph.id == id)
469 }
470}
471
472#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
473pub struct TextProcessingOptions {
475 pub language: Option<String>,
477 pub lowercase: bool,
479 pub normalize_unicode: bool,
481 pub keep_apostrophes: bool,
483 pub include_punctuation: bool,
485}
486
487impl Default for TextProcessingOptions {
488 fn default() -> Self {
489 Self {
490 language: None,
491 lowercase: true,
492 normalize_unicode: true,
493 keep_apostrophes: true,
494 include_punctuation: false,
495 }
496 }
497}
498
/// Extended statistics produced by `detailed_text_stats`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct DetailedTextStats {
    /// The basic counters from `text_stats`.
    pub basic: TextStats,
    /// Number of paragraphs found by `split_paragraphs`.
    pub paragraphs: usize,
    /// Number of tokens produced by `tokenize` with the caller's options.
    pub tokens: usize,
    /// Number of distinct normalized forms among the non-punctuation tokens.
    pub unique_tokens: usize,
    /// `basic.words / basic.sentences`, or `0.0` when there are no sentences.
    pub average_words_per_sentence: f32,
    /// Average number of characters per word, or `0.0` when there are no words.
    pub average_chars_per_word: f32,
}
515
516pub fn text_stats(text: &str) -> TextStats {
518 TextStats {
519 bytes: text.len(),
520 chars: text.chars().count(),
521 words: tokenize_words(text).len(),
522 lines: text.lines().count(),
523 sentences: split_sentences(text).len(),
524 }
525}
526
/// Collapses every run of whitespace into a single space and trims both ends.
pub fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // `split_whitespace` never yields empty items, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
531
532pub fn normalize_text(text: &str, options: &TextProcessingOptions) -> String {
534 let normalized = if options.normalize_unicode {
535 text.nfkc().collect::<String>()
536 } else {
537 text.to_string()
538 };
539 if options.lowercase {
540 normalized.to_lowercase()
541 } else {
542 normalized
543 }
544}
545
546pub fn tokenize_words(text: &str) -> Vec<String> {
548 let options = TextProcessingOptions::default();
549 tokenize(text, &options)
550 .into_iter()
551 .filter(|token| {
552 matches!(
553 token.kind,
554 TokenKind::Word
555 | TokenKind::Number
556 | TokenKind::Url
557 | TokenKind::Email
558 | TokenKind::Mention
559 | TokenKind::Hashtag
560 )
561 })
562 .map(|token| token.normalized)
563 .collect()
564}
565
566pub fn word_counts(text: &str) -> BTreeMap<String, usize> {
568 let mut counts = BTreeMap::new();
569 for token in tokenize_words(text) {
570 *counts.entry(token).or_insert(0) += 1;
571 }
572 counts
573}
574
575pub fn split_sentences(text: &str) -> Vec<String> {
577 split_sentence_spans(text, &TextProcessingOptions::default())
578 .into_iter()
579 .map(|sentence| normalize_whitespace(&sentence.text))
580 .collect()
581}
582
583pub fn segment_words(text: &str, options: &TextBoundaryOptions) -> Vec<WordSegment> {
585 let processing = TextProcessingOptions {
586 lowercase: options.lowercase,
587 normalize_unicode: options.normalize_unicode,
588 include_punctuation: options.include_punctuation,
589 ..TextProcessingOptions::default()
590 };
591 let mut segments = Vec::<WordSegment>::new();
592 for (byte_start, segment) in UnicodeSegmentation::split_word_bound_indices(text) {
593 if segment.chars().all(char::is_whitespace) {
594 continue;
595 }
596 let kind = classify_word_segment(segment);
597 let keep = match kind {
598 TokenKind::Word
599 | TokenKind::Url
600 | TokenKind::Email
601 | TokenKind::Mention
602 | TokenKind::Hashtag => true,
603 TokenKind::Number => options.include_numbers,
604 TokenKind::Punctuation => options.include_punctuation,
605 TokenKind::Other => segment.chars().any(char::is_alphanumeric),
606 };
607 if !keep || segment.chars().count() < options.min_chars {
608 continue;
609 }
610 let byte_end = byte_start + segment.len();
611 let current = WordSegment {
612 text: segment.to_string(),
613 normalized: normalize_text(segment, &processing),
614 span: span_for(text, byte_start, byte_end),
615 kind,
616 };
617 if let Some(previous) = segments.last_mut() {
618 if previous.kind == TokenKind::Word
619 && current.kind == TokenKind::Word
620 && previous.span.byte_end == current.span.byte_start
621 {
622 previous.text.push_str(¤t.text);
623 previous.normalized = normalize_text(&previous.text, &processing);
624 previous.span.byte_end = current.span.byte_end;
625 previous.span.char_end = current.span.char_end;
626 continue;
627 }
628 }
629 segments.push(current);
630 }
631 segments
632}
633
634pub fn segment_graphemes(text: &str) -> Vec<GraphemeSpan> {
636 UnicodeSegmentation::grapheme_indices(text, true)
637 .map(|(byte_start, grapheme)| {
638 let byte_end = byte_start + grapheme.len();
639 GraphemeSpan {
640 text: grapheme.to_string(),
641 span: span_for(text, byte_start, byte_end),
642 }
643 })
644 .collect()
645}
646
647pub fn detect_script_profile(text: &str) -> ScriptProfile {
649 let mut scripts = BTreeMap::<String, usize>::new();
650 let mut digits = 0;
651 let mut whitespace = 0;
652 let mut punctuation = 0;
653 let mut other = 0;
654
655 for ch in text.chars() {
656 if ch.is_whitespace() {
657 whitespace += 1;
658 } else if ch.is_numeric() {
659 digits += 1;
660 } else if is_sentence_or_symbol_punctuation(ch) || ch.is_ascii_punctuation() {
661 punctuation += 1;
662 } else if let Some(script) = script_name(ch) {
663 *scripts.entry(script.to_string()).or_insert(0) += 1;
664 } else {
665 other += 1;
666 }
667 }
668
669 let dominant_script = scripts
670 .iter()
671 .max_by(|left, right| left.1.cmp(right.1).then_with(|| right.0.cmp(left.0)))
672 .map(|(script, _)| script.clone());
673 let is_mixed = scripts.len() > 1;
674
675 ScriptProfile {
676 scripts,
677 digits,
678 whitespace,
679 punctuation,
680 other,
681 dominant_script,
682 is_mixed,
683 }
684}
685
686pub fn tokenize(text: &str, options: &TextProcessingOptions) -> Vec<Token> {
688 let mut tokens = Vec::new();
689 let mut byte_index = 0;
690
691 while byte_index < text.len() {
692 let ch = next_char(text, byte_index);
693 if ch.is_whitespace() {
694 byte_index += ch.len_utf8();
695 continue;
696 }
697
698 let (byte_end, kind) = if starts_url(text, byte_index) {
699 (consume_until_whitespace(text, byte_index), TokenKind::Url)
700 } else if ch == '@' {
701 let end = consume_prefixed_word(text, byte_index);
702 if end > byte_index + ch.len_utf8() {
703 (end, TokenKind::Mention)
704 } else {
705 (byte_index + ch.len_utf8(), TokenKind::Other)
706 }
707 } else if ch == '#' {
708 let end = consume_prefixed_word(text, byte_index);
709 if end > byte_index + ch.len_utf8() {
710 (end, TokenKind::Hashtag)
711 } else {
712 (byte_index + ch.len_utf8(), TokenKind::Other)
713 }
714 } else if ch.is_ascii_digit() {
715 (consume_number(text, byte_index), TokenKind::Number)
716 } else if is_word_char(ch, options.keep_apostrophes) {
717 let mut end = consume_word_like(text, byte_index, options.keep_apostrophes);
718 let candidate_end =
719 trim_trailing_token_punctuation(text, byte_index, end, TokenKind::Email);
720 let candidate = &text[byte_index..candidate_end];
721 let kind = if is_email(candidate) {
722 end = candidate_end;
723 TokenKind::Email
724 } else {
725 end = consume_plain_word(text, byte_index, options.keep_apostrophes);
726 TokenKind::Word
727 };
728 (end, kind)
729 } else if is_sentence_or_symbol_punctuation(ch) {
730 (byte_index + ch.len_utf8(), TokenKind::Punctuation)
731 } else {
732 (byte_index + ch.len_utf8(), TokenKind::Other)
733 };
734
735 let byte_end = trim_trailing_token_punctuation(text, byte_index, byte_end, kind);
736 if byte_end == byte_index {
737 byte_index += ch.len_utf8();
738 continue;
739 }
740 if kind != TokenKind::Punctuation || options.include_punctuation {
741 let raw = &text[byte_index..byte_end];
742 tokens.push(Token {
743 text: raw.to_string(),
744 normalized: normalize_text(raw, options),
745 span: span_for(text, byte_index, byte_end),
746 kind,
747 });
748 }
749 byte_index = byte_end;
750 }
751
752 tokens
753}
754
755pub fn split_sentence_spans(text: &str, options: &TextProcessingOptions) -> Vec<Sentence> {
757 let mut sentences = Vec::new();
758 let mut start = 0;
759 let chars = text.char_indices().collect::<Vec<_>>();
760
761 for (position, (byte_index, ch)) in chars.iter().copied().enumerate() {
762 if !is_sentence_terminator(ch) {
763 continue;
764 }
765 if ch == '.' && is_abbreviation_boundary(text, byte_index) {
766 continue;
767 }
768 if ch == '.'
769 && previous_char(&chars, position).is_some_and(|value| value.is_ascii_digit())
770 && next_char_from_indices(&chars, position).is_some_and(|value| value.is_ascii_digit())
771 {
772 continue;
773 }
774 if next_char_from_indices(&chars, position).is_some_and(is_sentence_terminator) {
775 continue;
776 }
777
778 let end = byte_index + ch.len_utf8();
779 push_sentence(text, start, end, options, &mut sentences);
780 start = end;
781 }
782
783 push_sentence(text, start, text.len(), options, &mut sentences);
784 sentences
785}
786
787pub fn split_paragraphs(text: &str) -> Vec<Paragraph> {
789 let mut paragraphs = Vec::new();
790 let mut paragraph_start = None;
791 let mut last_non_blank_end = 0;
792 let mut line_start = 0;
793
794 for line in text.split_inclusive('\n') {
795 let line_end = line_start + line.len();
796 let line_without_newline = line.trim_end_matches(['\r', '\n']);
797 if line_without_newline.trim().is_empty() {
798 if let Some(start) = paragraph_start.take() {
799 push_paragraph(text, start, last_non_blank_end, &mut paragraphs);
800 }
801 } else {
802 let content_start =
803 line_start + (line_without_newline.len() - line_without_newline.trim_start().len());
804 paragraph_start.get_or_insert(content_start);
805 last_non_blank_end = line_start + line_without_newline.trim_end().len();
806 }
807 line_start = line_end;
808 }
809
810 if let Some(start) = paragraph_start {
811 push_paragraph(text, start, last_non_blank_end, &mut paragraphs);
812 }
813
814 paragraphs
815}
816
817pub fn build_annotation_graph(text: &str, options: &TextProcessingOptions) -> TextAnnotationGraph {
819 let tokens = tokenize(text, options);
820 let sentences = split_sentence_spans(text, options);
821 let paragraphs = split_paragraphs(text);
822 build_annotation_graph_from_parts(text, &tokens, &sentences, ¶graphs)
823}
824
825pub fn build_annotation_graph_from_parts(
827 text: &str,
828 tokens: &[Token],
829 sentences: &[Sentence],
830 paragraphs: &[Paragraph],
831) -> TextAnnotationGraph {
832 let token_id_offset = 0;
833 let sentence_id_offset = tokens.len();
834 let paragraph_id_offset = tokens.len() + sentences.len();
835
836 let annotated_paragraphs = paragraphs
837 .iter()
838 .enumerate()
839 .map(|(index, paragraph)| {
840 let id = AnnotationId(paragraph_id_offset + index);
841 let sentence_indices = sentences
842 .iter()
843 .enumerate()
844 .filter(|(_, sentence)| span_is_inside(sentence.span, paragraph.span))
845 .map(|(sentence_index, _)| sentence_index)
846 .collect::<Vec<_>>();
847 let sentence_ids = sentence_indices
848 .iter()
849 .map(|sentence_index| AnnotationId(sentence_id_offset + sentence_index))
850 .collect::<Vec<_>>();
851 let sentence_start = sentence_indices.first().copied().unwrap_or(sentences.len());
852 let sentence_end = sentence_indices
853 .last()
854 .map(|index| index + 1)
855 .unwrap_or(sentence_start);
856
857 AnnotatedParagraph {
858 id,
859 span: TextSpanRef {
860 id,
861 span: paragraph.span,
862 },
863 text: paragraph.text.clone(),
864 sentence_start,
865 sentence_end,
866 sentence_ids,
867 }
868 })
869 .collect::<Vec<_>>();
870
871 let annotated_sentences = sentences
872 .iter()
873 .enumerate()
874 .map(|(sentence_index, sentence)| {
875 let id = AnnotationId(sentence_id_offset + sentence_index);
876 let token_indices = tokens
877 .iter()
878 .enumerate()
879 .filter(|(_, token)| span_is_inside(token.span, sentence.span))
880 .map(|(index, _)| index)
881 .collect::<Vec<_>>();
882 let paragraph_id = annotated_paragraphs
883 .iter()
884 .find(|paragraph| span_is_inside(sentence.span, paragraph.span.span))
885 .map(|paragraph| paragraph.id);
886 AnnotatedSentence {
887 id,
888 span: TextSpanRef {
889 id,
890 span: sentence.span,
891 },
892 text: sentence.text.clone(),
893 token_start: token_indices.first().copied().unwrap_or(tokens.len()),
894 token_end: token_indices
895 .last()
896 .map(|index| index + 1)
897 .unwrap_or(tokens.len()),
898 token_ids: token_indices
899 .into_iter()
900 .map(|index| AnnotationId(token_id_offset + index))
901 .collect(),
902 paragraph_id,
903 }
904 })
905 .collect::<Vec<_>>();
906
907 let annotated_tokens = tokens
908 .iter()
909 .enumerate()
910 .map(|(token_index, token)| {
911 let id = AnnotationId(token_id_offset + token_index);
912 let sentence_id = annotated_sentences
913 .iter()
914 .find(|sentence| span_is_inside(token.span, sentence.span.span))
915 .map(|sentence| sentence.id)
916 .unwrap_or(AnnotationId(sentence_id_offset));
917 let paragraph_id = annotated_paragraphs
918 .iter()
919 .find(|paragraph| span_is_inside(token.span, paragraph.span.span))
920 .map(|paragraph| paragraph.id);
921 CanonicalToken {
922 id,
923 span: TextSpanRef {
924 id,
925 span: token.span,
926 },
927 text: token.text.clone(),
928 normalized: token.normalized.clone(),
929 kind: token.kind,
930 sentence_id,
931 paragraph_id,
932 }
933 })
934 .collect();
935
936 TextAnnotationGraph {
937 text: text.to_string(),
938 provenance: AnnotationProvenance::Tokenizer,
939 confidence: AnnotationConfidence::default(),
940 tokens: annotated_tokens,
941 sentences: annotated_sentences,
942 paragraphs: annotated_paragraphs,
943 }
944}
945
946pub fn detailed_text_stats(text: &str, options: &TextProcessingOptions) -> DetailedTextStats {
948 let basic = text_stats(text);
949 let paragraphs = split_paragraphs(text).len();
950 let tokens = tokenize(text, options);
951 let unique_tokens = tokens
952 .iter()
953 .filter(|token| token.kind != TokenKind::Punctuation)
954 .map(|token| token.normalized.clone())
955 .collect::<std::collections::BTreeSet<_>>()
956 .len();
957 let chars_in_words = tokens
958 .iter()
959 .filter(|token| matches!(token.kind, TokenKind::Word | TokenKind::Number))
960 .map(|token| token.text.chars().count())
961 .sum::<usize>();
962 DetailedTextStats {
963 basic,
964 paragraphs,
965 tokens: tokens.len(),
966 unique_tokens,
967 average_words_per_sentence: if basic.sentences == 0 {
968 0.0
969 } else {
970 basic.words as f32 / basic.sentences as f32
971 },
972 average_chars_per_word: if basic.words == 0 {
973 0.0
974 } else {
975 chars_in_words as f32 / basic.words as f32
976 },
977 }
978}
979
/// Returns the char that starts at `byte_index`.
///
/// # Panics
///
/// Panics when `byte_index` is not a char boundary of `text` or is at/after its end.
fn next_char(text: &str, byte_index: usize) -> char {
    let tail = &text[byte_index..];
    match tail.chars().next() {
        Some(ch) => ch,
        None => panic!("byte_index must be inside text"),
    }
}
986
987fn span_for(text: &str, byte_start: usize, byte_end: usize) -> TextSpan {
988 TextSpan {
989 byte_start,
990 byte_end,
991 char_start: text[..byte_start].chars().count(),
992 char_end: text[..byte_end].chars().count(),
993 }
994}
995
996fn span_is_inside(inner: TextSpan, outer: TextSpan) -> bool {
997 inner.byte_start >= outer.byte_start && inner.byte_end <= outer.byte_end
998}
999
/// Returns true when the text at `byte_index` begins with `http://`, `https://` or `www.`.
fn starts_url(text: &str, byte_index: usize) -> bool {
    const URL_PREFIXES: [&str; 3] = ["http://", "https://", "www."];
    let tail = &text[byte_index..];
    URL_PREFIXES.iter().any(|&prefix| tail.starts_with(prefix))
}
1004
1005fn classify_word_segment(segment: &str) -> TokenKind {
1006 if starts_url(segment, 0) {
1007 TokenKind::Url
1008 } else if is_email(segment) {
1009 TokenKind::Email
1010 } else if segment.starts_with('@') && segment[1..].chars().any(char::is_alphanumeric) {
1011 TokenKind::Mention
1012 } else if segment.starts_with('#') && segment[1..].chars().any(char::is_alphanumeric) {
1013 TokenKind::Hashtag
1014 } else if segment
1015 .chars()
1016 .all(|ch| ch.is_numeric() || matches!(ch, '.' | ',' | ':' | '/' | '-'))
1017 {
1018 TokenKind::Number
1019 } else if segment.chars().all(is_sentence_or_symbol_punctuation) {
1020 TokenKind::Punctuation
1021 } else if segment.chars().any(char::is_alphanumeric) {
1022 TokenKind::Word
1023 } else {
1024 TokenKind::Other
1025 }
1026}
1027
/// Returns the byte offset of the first whitespace char at or after `byte_start`, or
/// `text.len()` when there is none.
fn consume_until_whitespace(text: &str, byte_start: usize) -> usize {
    match text[byte_start..].find(char::is_whitespace) {
        Some(offset) => byte_start + offset,
        None => text.len(),
    }
}
1038
1039fn consume_prefixed_word(text: &str, byte_start: usize) -> usize {
1040 let base = byte_start + next_char(text, byte_start).len_utf8();
1041 let mut end = base;
1042 for (offset, ch) in text[base..].char_indices() {
1043 if ch.is_alphanumeric() || ch == '_' || ch == '-' {
1044 end = base + offset + ch.len_utf8();
1045 } else {
1046 break;
1047 }
1048 }
1049 end
1050}
1051
/// Returns the end byte offset of the run of ASCII digits and `.`, `,`, `:`, `/`, `-`
/// separators that starts at `byte_start`.
fn consume_number(text: &str, byte_start: usize) -> usize {
    let tail = &text[byte_start..];
    let run_len = tail
        .find(|ch: char| !(ch.is_ascii_digit() || matches!(ch, '.' | ',' | ':' | '/' | '-')))
        .unwrap_or(tail.len());
    byte_start + run_len
}
1063
1064fn consume_word_like(text: &str, byte_start: usize, keep_apostrophes: bool) -> usize {
1065 let mut end = byte_start;
1066 for (offset, ch) in text[byte_start..].char_indices() {
1067 if is_word_char(ch, keep_apostrophes) || matches!(ch, '@' | '.' | '_' | '-' | '+') {
1068 end = byte_start + offset + ch.len_utf8();
1069 } else {
1070 break;
1071 }
1072 }
1073 end
1074}
1075
1076fn consume_plain_word(text: &str, byte_start: usize, keep_apostrophes: bool) -> usize {
1077 let mut end = byte_start;
1078 for (offset, ch) in text[byte_start..].char_indices() {
1079 if is_word_char(ch, keep_apostrophes) {
1080 end = byte_start + offset + ch.len_utf8();
1081 } else {
1082 break;
1083 }
1084 }
1085 end
1086}
1087
1088fn is_word_char(ch: char, keep_apostrophes: bool) -> bool {
1089 ch.is_alphanumeric() || (keep_apostrophes && is_apostrophe(ch))
1090}
1091
/// Returns true for the ASCII apostrophe and the right single quotation mark (U+2019).
fn is_apostrophe(ch: char) -> bool {
    ch == '\'' || ch == '\u{2019}'
}
1095
/// Returns true when `candidate` has the shape `local@label.label[.label…]`.
///
/// Requirements: a non-empty local part, exactly one `@`, and a domain made of at least two
/// non-empty dot-separated labels. That rules out a leading or trailing dot, consecutive
/// dots, and a second `@` inside the domain.
fn is_email(candidate: &str) -> bool {
    let Some((local, domain)) = candidate.split_once('@') else {
        return false;
    };
    if local.is_empty() || domain.contains('@') {
        return false;
    }
    domain.contains('.') && domain.split('.').all(|label| !label.is_empty())
}
1102
/// Returns true for ASCII punctuation and for a fixed set of non-ASCII punctuation marks.
///
/// The non-ASCII marks are written as escapes so the set survives any re-encoding of the
/// source. The full-width `!` and `?` are listed explicitly because their ASCII counterparts
/// are already covered by `is_ascii_punctuation`.
fn is_sentence_or_symbol_punctuation(ch: char) -> bool {
    ch.is_ascii_punctuation()
        || matches!(
            ch,
            '\u{2026}' // horizontal ellipsis
                | '\u{3002}' // ideographic full stop
                | '\u{FF01}' // full-width exclamation mark
                | '\u{FF1F}' // full-width question mark
                | '\u{060C}' // Arabic comma
                | '\u{061B}' // Arabic semicolon
                | '\u{00BF}' // inverted question mark
                | '\u{00A1}' // inverted exclamation mark
                | '\u{00AB}' // left guillemet
                | '\u{00BB}' // right guillemet
        )
}
1110
1111fn trim_trailing_token_punctuation(
1112 text: &str,
1113 byte_start: usize,
1114 mut byte_end: usize,
1115 kind: TokenKind,
1116) -> usize {
1117 if !matches!(kind, TokenKind::Url | TokenKind::Email | TokenKind::Number) {
1118 return byte_end;
1119 }
1120 while byte_end > byte_start {
1121 let Some(ch) = text[..byte_end].chars().next_back() else {
1122 break;
1123 };
1124 if matches!(ch, '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}') {
1125 byte_end -= ch.len_utf8();
1126 } else {
1127 break;
1128 }
1129 }
1130 byte_end
1131}
1132
/// Returns true for characters that can end a sentence: `.`, `?`, `!`, the horizontal
/// ellipsis, the ideographic full stop, and the full-width `!` and `?`.
///
/// The non-ASCII marks are written as escapes so they cannot be silently folded into their
/// ASCII look-alikes, which would leave duplicate, unreachable patterns.
fn is_sentence_terminator(ch: char) -> bool {
    matches!(
        ch,
        '.' | '?'
            | '!'
            | '\u{2026}' // horizontal ellipsis
            | '\u{3002}' // ideographic full stop
            | '\u{FF01}' // full-width exclamation mark
            | '\u{FF1F}' // full-width question mark
    )
}
1136
/// Returns true when the period at `period_byte_index` belongs to an abbreviation and
/// therefore must not end a sentence.
///
/// Three cases are recognised:
/// * the alphabetic run right before the period is a known abbreviation (`Mr`, `Dr`,
///   `etc`, …), compared ASCII-case-insensitively;
/// * that run is a single ASCII uppercase letter (an initial, as in `J. Smith`);
/// * the period is part of a dotted abbreviation (`e.g.`, `i.e.`, `U.S.`, `U.K.`). For this
///   case the whole run of letters and periods around the period is inspected, because the
///   alphabetic run alone can never contain the inner dot.
///
/// # Panics
///
/// Panics when `period_byte_index` is out of range or not on a char boundary.
fn is_abbreviation_boundary(text: &str, period_byte_index: usize) -> bool {
    let prefix = &text[..period_byte_index];

    // Alphabetic run immediately before the period.
    let word_start = prefix
        .char_indices()
        .rev()
        .find_map(|(index, ch)| (!ch.is_alphabetic()).then_some(index + ch.len_utf8()))
        .unwrap_or(0);
    let word = &prefix[word_start..];
    if word.is_empty() {
        return false;
    }

    let is_known_abbreviation = matches!(
        word.to_ascii_lowercase().as_str(),
        "mr" | "mrs" | "ms" | "dr" | "prof" | "sr" | "jr" | "st" | "vs" | "etc"
    );
    let mut letters = word.chars();
    let is_initial = matches!(
        (letters.next(), letters.next()),
        (Some(only), None) if only.is_ascii_uppercase()
    );
    if is_known_abbreviation || is_initial {
        return true;
    }

    // Dotted abbreviations: widen the window to the surrounding run of letters and periods,
    // drop the outer periods, and compare the rest.
    let is_dotted_part = |ch: char| ch.is_alphabetic() || ch == '.';
    let dotted_start = prefix
        .char_indices()
        .rev()
        .find_map(|(index, ch)| (!is_dotted_part(ch)).then_some(index + ch.len_utf8()))
        .unwrap_or(0);
    let dotted_end = text[period_byte_index..]
        .find(|ch: char| !is_dotted_part(ch))
        .map_or(text.len(), |offset| period_byte_index + offset);
    let dotted = text[dotted_start..dotted_end]
        .trim_matches('.')
        .to_ascii_lowercase();
    matches!(dotted.as_str(), "e.g" | "i.e" | "u.s" | "u.k")
}
1170
/// Maps `ch` to the name of its writing script, based on fixed code-point ranges.
///
/// Returns `Some("Other")` for alphabetic chars outside the listed ranges and `None` for
/// everything that is not alphabetic.
///
/// The Latin ranges cover letters only: the ASCII symbols between `Z` and `a`
/// (`[ \ ] ^ _` and the backtick) and the multiplication and division signs (U+00D7, U+00F7)
/// are excluded.
fn script_name(ch: char) -> Option<&'static str> {
    let name = match u32::from(ch) {
        0x0041..=0x005A
        | 0x0061..=0x007A
        | 0x00C0..=0x00D6
        | 0x00D8..=0x00F6
        | 0x00F8..=0x024F
        | 0x1E00..=0x1EFF => "Latin",
        0x0370..=0x03FF | 0x1F00..=0x1FFF => "Greek",
        0x0400..=0x052F | 0x2DE0..=0x2DFF | 0xA640..=0xA69F => "Cyrillic",
        0x0590..=0x05FF => "Hebrew",
        0x0600..=0x06FF | 0x0750..=0x077F | 0x08A0..=0x08FF => "Arabic",
        0x0900..=0x097F => "Devanagari",
        0x3040..=0x309F => "Hiragana",
        0x30A0..=0x30FF | 0x31F0..=0x31FF => "Katakana",
        0x3400..=0x4DBF | 0x4E00..=0x9FFF | 0xF900..=0xFAFF => "Han",
        0xAC00..=0xD7AF | 0x1100..=0x11FF | 0x3130..=0x318F => "Hangul",
        _ if ch.is_alphabetic() => "Other",
        _ => return None,
    };
    Some(name)
}
1188
/// Returns the char stored just before `position` in `chars`, or `None` when `position` is
/// zero or the preceding index is out of range.
fn previous_char(chars: &[(usize, char)], position: usize) -> Option<char> {
    let index = position.checked_sub(1)?;
    let &(_, ch) = chars.get(index)?;
    Some(ch)
}
1194
/// Returns the char stored just after `position` in `chars`, or `None` when `position` is
/// the last index or beyond.
fn next_char_from_indices(chars: &[(usize, char)], position: usize) -> Option<char> {
    let &(_, ch) = chars.get(position + 1)?;
    Some(ch)
}
1198
1199fn push_sentence(
1200 text: &str,
1201 byte_start: usize,
1202 byte_end: usize,
1203 options: &TextProcessingOptions,
1204 sentences: &mut Vec<Sentence>,
1205) {
1206 if byte_start >= byte_end {
1207 return;
1208 }
1209 let raw = &text[byte_start..byte_end];
1210 let leading = raw.len() - raw.trim_start().len();
1211 let trailing = raw.trim_end().len();
1212 let start = byte_start + leading;
1213 let end = byte_start + trailing;
1214 if start >= end {
1215 return;
1216 }
1217 let sentence_text = text[start..end].to_string();
1218 let token_count = tokenize(&sentence_text, options).len();
1219 sentences.push(Sentence {
1220 text: sentence_text,
1221 span: span_for(text, start, end),
1222 token_count,
1223 });
1224}
1225
1226fn push_paragraph(text: &str, byte_start: usize, byte_end: usize, paragraphs: &mut Vec<Paragraph>) {
1227 if byte_start >= byte_end {
1228 return;
1229 }
1230 let paragraph_text = text[byte_start..byte_end].to_string();
1231 paragraphs.push(Paragraph {
1232 sentence_count: split_sentence_spans(¶graph_text, &TextProcessingOptions::default())
1233 .len(),
1234 text: paragraph_text,
1235 span: span_for(text, byte_start, byte_end),
1236 });
1237}
1238
// Unit tests for the public text-processing helpers of this file.
#[cfg(test)]
mod tests {
    use super::*;

    // The id is the stream id and the segment index joined by a colon.
    #[test]
    fn segment_document_ids_include_stream_and_index() {
        assert_eq!(segment_document_id("subs", 7), "subs:7");
    }

    // Counting is case-insensitive and ignores punctuation.
    #[test]
    fn tokenizes_and_counts_words() {
        let counts = word_counts("Hello, hello world.");
        assert_eq!(counts.get("hello"), Some(&2));
        assert_eq!(counts.get("world"), Some(&1));
    }

    // Sentence and word counters on a two-sentence input.
    #[test]
    fn computes_text_stats() {
        let stats = text_stats("One sentence. Two words!");
        assert_eq!(stats.sentences, 2);
        assert_eq!(stats.words, 4);
    }

    // Non-ASCII words are kept whole and spans report both byte and char offsets.
    #[test]
    fn tokenizes_unicode_words_with_offsets() {
        let tokens = tokenize("Hi café 東京", &TextProcessingOptions::default());
        assert_eq!(tokens[1].text, "café");
        assert_eq!(tokens[1].span.byte_start, 3);
        assert_eq!(tokens[1].span.char_start, 3);
        assert_eq!(tokens[2].text, "東京");
    }

    // Each special token shape is recognised at least once in a mixed input.
    #[test]
    fn classifies_common_token_patterns() {
        let tokens = tokenize(
            "Mail a@b.com. #rust @team https://example.com 3.14",
            &TextProcessingOptions::default(),
        );
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Email));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Hashtag));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Mention));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Url));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Number));
    }

    // With `keep_apostrophes` a contraction is one token; without it the word is split.
    #[test]
    fn apostrophe_behavior_is_configurable() {
        let keep = TextProcessingOptions::default();
        assert_eq!(tokenize_words("Don't stop"), vec!["don't", "stop"]);

        let split = TextProcessingOptions {
            keep_apostrophes: false,
            ..TextProcessingOptions::default()
        };
        let tokens = tokenize("Don't", &split)
            .into_iter()
            .map(|token| token.normalized)
            .collect::<Vec<_>>();
        assert_eq!(tokens, vec!["don", "t"]);
        assert_eq!(tokenize("Don't", &keep)[0].normalized, "don't");
    }

    // Abbreviations and decimal points do not end a sentence; an ellipsis ends it once.
    #[test]
    fn splits_sentences_with_decimals_ellipses_and_multilingual_marks() {
        let sentences = split_sentences("Dr. Smith wrote pi is 3.14. Wait... Really? Yes!");
        assert_eq!(
            sentences,
            vec!["Dr. Smith wrote pi is 3.14.", "Wait...", "Really?", "Yes!"]
        );
    }

    // A combining sequence and an emoji with a modifier each form a single grapheme.
    #[test]
    fn segments_graphemes_with_byte_and_char_spans() {
        let graphemes = segment_graphemes("e\u{301}👍🏽a");
        assert_eq!(graphemes.len(), 3);
        assert_eq!(graphemes[0].text, "e\u{301}");
        assert_eq!(graphemes[0].span.byte_start, 0);
        assert_eq!(graphemes[0].span.byte_end, 3);
        assert_eq!(graphemes[0].span.char_start, 0);
        assert_eq!(graphemes[0].span.char_end, 2);
        assert_eq!(graphemes[1].text, "👍🏽");
        assert_eq!(graphemes[1].span.char_start, 2);
        assert_eq!(graphemes[1].span.char_end, 4);
    }

    // Default boundary options keep words and numbers and drop punctuation.
    #[test]
    fn segments_words_with_unicode_boundaries() {
        let segments = segment_words("Café 東京 42!", &TextBoundaryOptions::default());
        let texts = segments
            .into_iter()
            .map(|segment| segment.normalized)
            .collect::<Vec<_>>();
        assert_eq!(texts, vec!["café", "東京", "42"]);
    }

    // Script counts, digit count, dominant script and the mixed-script flag.
    #[test]
    fn profiles_scripts() {
        let profile = detect_script_profile("Hello 東京 123!");
        assert_eq!(profile.scripts.get("Latin"), Some(&5));
        assert_eq!(profile.scripts.get("Han"), Some(&2));
        assert_eq!(profile.digits, 3);
        assert_eq!(profile.dominant_script.as_deref(), Some("Latin"));
        assert!(profile.is_mixed);
    }

    // A single line break continues the paragraph; a blank line starts a new one.
    #[test]
    fn splits_paragraphs_on_blank_lines() {
        let paragraphs = split_paragraphs("First paragraph.\nStill first.\n\nSecond.");
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(paragraphs[0].sentence_count, 2);
        assert_eq!(paragraphs[1].text, "Second.");
    }

    // The detailed statistics carry paragraph and sentence counts and a non-zero average.
    #[test]
    fn detailed_stats_include_derived_counts() {
        let stats = detailed_text_stats("One sentence.\n\nTwo words here.", &Default::default());
        assert_eq!(stats.paragraphs, 2);
        assert_eq!(stats.basic.sentences, 2);
        assert!(stats.average_words_per_sentence > 0.0);
    }

    // Ids stored on tokens, sentences and paragraphs resolve to each other consistently.
    #[test]
    fn builds_annotation_graph_with_stable_cross_references() {
        let graph = build_annotation_graph(
            "Alice launched the API.\n\nBerlin hosted the event.",
            &TextProcessingOptions::default(),
        );
        assert_eq!(graph.tokens.len(), 8);
        assert_eq!(graph.sentences.len(), 2);
        assert_eq!(graph.paragraphs.len(), 2);
        assert_eq!(graph.provenance, AnnotationProvenance::Tokenizer);
        assert!(graph.confidence.get() > 0.0);

        let first_sentence = &graph.sentences[0];
        let first_token = graph.token(first_sentence.token_ids[0]).unwrap();
        assert_eq!(first_token.text, "Alice");
        assert_eq!(first_token.sentence_id, first_sentence.id);
        assert_eq!(first_token.paragraph_id, first_sentence.paragraph_id);

        let second_paragraph = &graph.paragraphs[1];
        assert_eq!(second_paragraph.sentence_ids.len(), 1);
        let sentence = graph.sentence(second_paragraph.sentence_ids[0]).unwrap();
        assert_eq!(sentence.text, "Berlin hosted the event.");
    }
}