text_core/
lib.rs

1#![doc = include_str!("../README.md")]
2
3pub mod contracts;
4pub mod operations;
5pub mod surface;
6
7pub use contracts::{
8    AsTextSegmentContract, IntoTextDocumentContract, TextAnnotationSpan, TextDocumentContract,
9    TextProvenance, TextSegmentContract, TextSourceRef, TimebaseContract, TimestampContract,
10};
11
12use std::collections::BTreeMap;
13
14use serde::{Deserialize, Serialize};
15use unicode_normalization::UnicodeNormalization;
16use unicode_segmentation::UnicodeSegmentation;
17use video_analysis_core::{OwnedTextSegment, TextSegment, Timestamp};
18
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
20/// Borrowed text document used as the lightweight boundary between text crates.
21pub struct TextDocument<'a> {
22    /// Stable caller-supplied document id.
23    pub id: &'a str,
24    /// UTF-8 document body.
25    pub text: &'a str,
26    /// Optional BCP-47-style language hint such as `en`.
27    pub language: Option<&'a str>,
28    /// Optional media timestamp when this document came from a timed segment.
29    pub timestamp: Option<Timestamp>,
30}
31
32impl<'a> TextDocument<'a> {
33    /// Creates a new value.
34    pub fn new(id: &'a str, text: &'a str) -> Self {
35        Self {
36            id,
37            text,
38            language: None,
39            timestamp: None,
40        }
41    }
42
43    /// Builds this value from segment identifier.
44    pub fn from_segment_id(id: &'a str, segment: &TextSegment<'a>) -> Self {
45        Self {
46            id,
47            text: segment.text,
48            language: segment.language,
49            timestamp: segment.timestamp,
50        }
51    }
52
53    /// Builds this value from stream segment.
54    pub fn from_stream_segment(stream_id: &str, segment: &TextSegment<'_>) -> OwnedTextDocument {
55        OwnedTextDocument::from_stream_segment(stream_id, segment)
56    }
57
58    /// Builds a document using `stream_id` directly as the document id.
59    ///
60    /// Prefer [`TextDocument::from_segment_id`] when the id has already been
61    /// chosen, or [`TextDocument::from_stream_segment`] when converting a
62    /// stream segment into the canonical `stream_id:segment_index` id.
63    pub fn from_segment(stream_id: &'a str, segment: &TextSegment<'a>) -> Self {
64        Self {
65            id: stream_id,
66            text: segment.text,
67            language: segment.language,
68            timestamp: segment.timestamp,
69        }
70    }
71}
72
73#[derive(Debug, Clone, PartialEq, Eq)]
74/// Owned text document for storage, serialization, and cross-thread workflows.
75pub struct OwnedTextDocument {
76    /// Stable caller-supplied document id.
77    pub id: String,
78    /// UTF-8 document body.
79    pub text: String,
80    /// Optional BCP-47-style language hint such as `en`.
81    pub language: Option<String>,
82    /// Optional media timestamp when this document came from a timed segment.
83    pub timestamp: Option<Timestamp>,
84}
85
86impl OwnedTextDocument {
87    /// Creates a new value.
88    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
89        Self {
90            id: id.into(),
91            text: text.into(),
92            language: None,
93            timestamp: None,
94        }
95    }
96
97    /// Returns language.
98    pub fn language(mut self, language: impl Into<String>) -> Self {
99        self.language = Some(language.into());
100        self
101    }
102
103    /// Returns timestamp.
104    pub fn timestamp(mut self, timestamp: Timestamp) -> Self {
105        self.timestamp = Some(timestamp);
106        self
107    }
108
109    /// Builds this value from segment identifier.
110    pub fn from_segment_id(id: impl Into<String>, segment: &OwnedTextSegment) -> Self {
111        let segment = segment.as_segment();
112        Self {
113            id: id.into(),
114            text: segment.text.to_string(),
115            language: segment.language.map(ToString::to_string),
116            timestamp: segment.timestamp,
117        }
118    }
119
120    /// Builds this value from stream segment.
121    pub fn from_stream_segment(stream_id: &str, segment: &TextSegment<'_>) -> Self {
122        Self {
123            id: segment_document_id(stream_id, segment.segment_index),
124            text: segment.text.to_string(),
125            language: segment.language.map(ToString::to_string),
126            timestamp: segment.timestamp,
127        }
128    }
129
130    /// Builds this value from owned stream segment.
131    pub fn from_owned_stream_segment(stream_id: &str, segment: &OwnedTextSegment) -> Self {
132        Self::from_stream_segment(stream_id, &segment.as_segment())
133    }
134
135    /// Builds a document using the supplied `stream_id` directly as the id.
136    ///
137    /// Prefer [`OwnedTextDocument::from_segment_id`] when the id has already
138    /// been chosen, or [`OwnedTextDocument::from_owned_stream_segment`] for the
139    /// canonical `stream_id:segment_index` id.
140    pub fn from_segment(stream_id: impl Into<String>, segment: &OwnedTextSegment) -> Self {
141        let segment = segment.as_segment();
142        Self {
143            id: stream_id.into(),
144            text: segment.text.to_string(),
145            language: segment.language.map(ToString::to_string),
146            timestamp: segment.timestamp,
147        }
148    }
149
150    /// Borrows this value as a document.
151    pub fn as_document(&self) -> TextDocument<'_> {
152        TextDocument {
153            id: &self.id,
154            text: &self.text,
155            language: self.language.as_deref(),
156            timestamp: self.timestamp,
157        }
158    }
159}
160
/// Builds the document id for a stream segment, formatted as
/// `stream_id:segment_index`.
pub fn segment_document_id(stream_id: &str, segment_index: u64) -> String {
    let index = segment_index.to_string();
    let mut id = String::with_capacity(stream_id.len() + 1 + index.len());
    id.push_str(stream_id);
    id.push(':');
    id.push_str(&index);
    id
}
165
166#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
167/// Data type for text stats.
168pub struct TextStats {
169    /// The bytes value.
170    pub bytes: usize,
171    /// The chars value.
172    pub chars: usize,
173    /// The words value.
174    pub words: usize,
175    /// The lines value.
176    pub lines: usize,
177    /// The sentences value.
178    pub sentences: usize,
179}
180
181#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
182/// Half-open byte and character span into a UTF-8 text buffer.
183pub struct TextSpan {
184    /// Inclusive byte offset.
185    pub byte_start: usize,
186    /// Exclusive byte offset.
187    pub byte_end: usize,
188    /// Inclusive Unicode scalar index.
189    pub char_start: usize,
190    /// Exclusive Unicode scalar index.
191    pub char_end: usize,
192}
193
194#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
195/// Data type for annotation identifier.
196pub struct AnnotationId(pub usize);
197
198#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
199/// Confidence score normalized into the inclusive range `0.0..=1.0`.
200pub struct AnnotationConfidence(f32);
201
202impl AnnotationConfidence {
203    /// Creates a clamped confidence score; non-finite inputs become `0.0`.
204    pub fn new(value: f32) -> Self {
205        let value = if value.is_finite() {
206            value.clamp(0.0, 1.0)
207        } else {
208            0.0
209        };
210        Self(value)
211    }
212
213    /// Returns the normalized confidence value.
214    pub fn get(self) -> f32 {
215        self.0
216    }
217}
218
219impl Default for AnnotationConfidence {
220    fn default() -> Self {
221        Self(1.0)
222    }
223}
224
225impl From<f32> for AnnotationConfidence {
226    fn from(value: f32) -> Self {
227        Self::new(value)
228    }
229}
230
231#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
232/// Variants describing annotation provenance.
233pub enum AnnotationProvenance {
234    /// The heuristic variant.
235    Heuristic,
236    /// The tokenizer variant.
237    Tokenizer,
238    /// The ONNX variant.
239    Onnx,
240    /// The candle variant.
241    Candle,
242    /// The cuda oxide variant.
243    CudaOxide,
244    /// The external variant.
245    External,
246    /// The derived variant.
247    Derived,
248}
249
250#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
251/// Data type for text span ref.
252pub struct TextSpanRef {
253    /// Identifier for this value.
254    pub id: AnnotationId,
255    /// The span value.
256    pub span: TextSpan,
257}
258
259#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
260/// Data type for text boundary options.
261pub struct TextBoundaryOptions {
262    /// The lowercase value.
263    pub lowercase: bool,
264    /// The normalize unicode value.
265    pub normalize_unicode: bool,
266    /// The include numbers value.
267    pub include_numbers: bool,
268    /// The include punctuation value.
269    pub include_punctuation: bool,
270    /// The min chars value.
271    pub min_chars: usize,
272}
273
274impl Default for TextBoundaryOptions {
275    fn default() -> Self {
276        Self {
277            lowercase: true,
278            normalize_unicode: true,
279            include_numbers: true,
280            include_punctuation: false,
281            min_chars: 1,
282        }
283    }
284}
285
286#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
287/// Data type for word segment.
288pub struct WordSegment {
289    /// Text content for this value.
290    pub text: String,
291    /// The normalized value.
292    pub normalized: String,
293    /// The span value.
294    pub span: TextSpan,
295    /// The kind value.
296    pub kind: TokenKind,
297}
298
299#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
300/// Data type for grapheme span.
301pub struct GraphemeSpan {
302    /// Text content for this value.
303    pub text: String,
304    /// The span value.
305    pub span: TextSpan,
306}
307
308#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
309/// Data type for script profile.
310pub struct ScriptProfile {
311    /// The scripts value.
312    pub scripts: BTreeMap<String, usize>,
313    /// The digits value.
314    pub digits: usize,
315    /// The whitespace value.
316    pub whitespace: usize,
317    /// The punctuation value.
318    pub punctuation: usize,
319    /// The other value.
320    pub other: usize,
321    /// The dominant script value.
322    pub dominant_script: Option<String>,
323    /// The is mixed value.
324    pub is_mixed: bool,
325}
326
327#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
328/// Data type for token.
329pub struct Token {
330    /// Text content for this value.
331    pub text: String,
332    /// The normalized value.
333    pub normalized: String,
334    /// The span value.
335    pub span: TextSpan,
336    /// The kind value.
337    pub kind: TokenKind,
338}
339
340#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
341/// Variants describing token kind.
342pub enum TokenKind {
343    /// The word variant.
344    Word,
345    /// The number variant.
346    Number,
347    /// The URL variant.
348    Url,
349    /// The email variant.
350    Email,
351    /// The mention variant.
352    Mention,
353    /// The hashtag variant.
354    Hashtag,
355    /// The punctuation variant.
356    Punctuation,
357    /// The other variant.
358    Other,
359}
360
361#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
362/// Data type for sentence.
363pub struct Sentence {
364    /// Text content for this value.
365    pub text: String,
366    /// The span value.
367    pub span: TextSpan,
368    /// The token count value.
369    pub token_count: usize,
370}
371
372#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
373/// Data type for paragraph.
374pub struct Paragraph {
375    /// Text content for this value.
376    pub text: String,
377    /// The span value.
378    pub span: TextSpan,
379    /// The sentence count value.
380    pub sentence_count: usize,
381}
382
383#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
384/// Data type for canonical token.
385pub struct CanonicalToken {
386    /// Identifier for this value.
387    pub id: AnnotationId,
388    /// The span value.
389    pub span: TextSpanRef,
390    /// Text content for this value.
391    pub text: String,
392    /// The normalized value.
393    pub normalized: String,
394    /// The kind value.
395    pub kind: TokenKind,
396    /// The sentence identifier value.
397    pub sentence_id: AnnotationId,
398    /// The paragraph identifier value.
399    pub paragraph_id: Option<AnnotationId>,
400}
401
402#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
403/// Data type for annotated sentence.
404pub struct AnnotatedSentence {
405    /// Identifier for this value.
406    pub id: AnnotationId,
407    /// The span value.
408    pub span: TextSpanRef,
409    /// Text content for this value.
410    pub text: String,
411    /// The token start value.
412    pub token_start: usize,
413    /// The token end value.
414    pub token_end: usize,
415    /// The token identifiers value.
416    pub token_ids: Vec<AnnotationId>,
417    /// The paragraph identifier value.
418    pub paragraph_id: Option<AnnotationId>,
419}
420
421#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
422/// Data type for annotated paragraph.
423pub struct AnnotatedParagraph {
424    /// Identifier for this value.
425    pub id: AnnotationId,
426    /// The span value.
427    pub span: TextSpanRef,
428    /// Text content for this value.
429    pub text: String,
430    /// The sentence start value.
431    pub sentence_start: usize,
432    /// The sentence end value.
433    pub sentence_end: usize,
434    /// The sentence identifiers value.
435    pub sentence_ids: Vec<AnnotationId>,
436}
437
438#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
439/// Data type for text annotation graph.
440pub struct TextAnnotationGraph {
441    /// Text content for this value.
442    pub text: String,
443    /// The provenance value.
444    pub provenance: AnnotationProvenance,
445    /// Confidence score for this value.
446    pub confidence: AnnotationConfidence,
447    /// The tokens value.
448    pub tokens: Vec<CanonicalToken>,
449    /// The sentences value.
450    pub sentences: Vec<AnnotatedSentence>,
451    /// The paragraphs value.
452    pub paragraphs: Vec<AnnotatedParagraph>,
453}
454
455impl TextAnnotationGraph {
456    /// Returns token.
457    pub fn token(&self, id: AnnotationId) -> Option<&CanonicalToken> {
458        self.tokens.iter().find(|token| token.id == id)
459    }
460
461    /// Returns sentence.
462    pub fn sentence(&self, id: AnnotationId) -> Option<&AnnotatedSentence> {
463        self.sentences.iter().find(|sentence| sentence.id == id)
464    }
465
466    /// Returns paragraph.
467    pub fn paragraph(&self, id: AnnotationId) -> Option<&AnnotatedParagraph> {
468        self.paragraphs.iter().find(|paragraph| paragraph.id == id)
469    }
470}
471
472#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
473/// Data type for text processing options.
474pub struct TextProcessingOptions {
475    /// Language tag for this value.
476    pub language: Option<String>,
477    /// The lowercase value.
478    pub lowercase: bool,
479    /// The normalize unicode value.
480    pub normalize_unicode: bool,
481    /// The keep apostrophes value.
482    pub keep_apostrophes: bool,
483    /// The include punctuation value.
484    pub include_punctuation: bool,
485}
486
487impl Default for TextProcessingOptions {
488    fn default() -> Self {
489        Self {
490            language: None,
491            lowercase: true,
492            normalize_unicode: true,
493            keep_apostrophes: true,
494            include_punctuation: false,
495        }
496    }
497}
498
499#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
500/// Data type for detailed text stats.
501pub struct DetailedTextStats {
502    /// The basic value.
503    pub basic: TextStats,
504    /// The paragraphs value.
505    pub paragraphs: usize,
506    /// The tokens value.
507    pub tokens: usize,
508    /// The unique tokens value.
509    pub unique_tokens: usize,
510    /// The average words per sentence value.
511    pub average_words_per_sentence: f32,
512    /// The average chars per word value.
513    pub average_chars_per_word: f32,
514}
515
516/// Returns text stats.
517pub fn text_stats(text: &str) -> TextStats {
518    TextStats {
519        bytes: text.len(),
520        chars: text.chars().count(),
521        words: tokenize_words(text).len(),
522        lines: text.lines().count(),
523        sentences: split_sentences(text).len(),
524    }
525}
526
/// Collapses every run of whitespace in `text` into a single space and removes
/// leading and trailing whitespace.
pub fn normalize_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // `split_whitespace` never yields empty pieces, so an empty buffer
        // means this is the first word and needs no separator.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
531
532/// Returns normalize text.
533pub fn normalize_text(text: &str, options: &TextProcessingOptions) -> String {
534    let normalized = if options.normalize_unicode {
535        text.nfkc().collect::<String>()
536    } else {
537        text.to_string()
538    };
539    if options.lowercase {
540        normalized.to_lowercase()
541    } else {
542        normalized
543    }
544}
545
546/// Returns tokenize words.
547pub fn tokenize_words(text: &str) -> Vec<String> {
548    let options = TextProcessingOptions::default();
549    tokenize(text, &options)
550        .into_iter()
551        .filter(|token| {
552            matches!(
553                token.kind,
554                TokenKind::Word
555                    | TokenKind::Number
556                    | TokenKind::Url
557                    | TokenKind::Email
558                    | TokenKind::Mention
559                    | TokenKind::Hashtag
560            )
561        })
562        .map(|token| token.normalized)
563        .collect()
564}
565
566/// Returns word counts.
567pub fn word_counts(text: &str) -> BTreeMap<String, usize> {
568    let mut counts = BTreeMap::new();
569    for token in tokenize_words(text) {
570        *counts.entry(token).or_insert(0) += 1;
571    }
572    counts
573}
574
575/// Returns split sentences.
576pub fn split_sentences(text: &str) -> Vec<String> {
577    split_sentence_spans(text, &TextProcessingOptions::default())
578        .into_iter()
579        .map(|sentence| normalize_whitespace(&sentence.text))
580        .collect()
581}
582
583/// Returns segment words.
584pub fn segment_words(text: &str, options: &TextBoundaryOptions) -> Vec<WordSegment> {
585    let processing = TextProcessingOptions {
586        lowercase: options.lowercase,
587        normalize_unicode: options.normalize_unicode,
588        include_punctuation: options.include_punctuation,
589        ..TextProcessingOptions::default()
590    };
591    let mut segments = Vec::<WordSegment>::new();
592    for (byte_start, segment) in UnicodeSegmentation::split_word_bound_indices(text) {
593        if segment.chars().all(char::is_whitespace) {
594            continue;
595        }
596        let kind = classify_word_segment(segment);
597        let keep = match kind {
598            TokenKind::Word
599            | TokenKind::Url
600            | TokenKind::Email
601            | TokenKind::Mention
602            | TokenKind::Hashtag => true,
603            TokenKind::Number => options.include_numbers,
604            TokenKind::Punctuation => options.include_punctuation,
605            TokenKind::Other => segment.chars().any(char::is_alphanumeric),
606        };
607        if !keep || segment.chars().count() < options.min_chars {
608            continue;
609        }
610        let byte_end = byte_start + segment.len();
611        let current = WordSegment {
612            text: segment.to_string(),
613            normalized: normalize_text(segment, &processing),
614            span: span_for(text, byte_start, byte_end),
615            kind,
616        };
617        if let Some(previous) = segments.last_mut() {
618            if previous.kind == TokenKind::Word
619                && current.kind == TokenKind::Word
620                && previous.span.byte_end == current.span.byte_start
621            {
622                previous.text.push_str(&current.text);
623                previous.normalized = normalize_text(&previous.text, &processing);
624                previous.span.byte_end = current.span.byte_end;
625                previous.span.char_end = current.span.char_end;
626                continue;
627            }
628        }
629        segments.push(current);
630    }
631    segments
632}
633
634/// Returns segment graphemes.
635pub fn segment_graphemes(text: &str) -> Vec<GraphemeSpan> {
636    UnicodeSegmentation::grapheme_indices(text, true)
637        .map(|(byte_start, grapheme)| {
638            let byte_end = byte_start + grapheme.len();
639            GraphemeSpan {
640                text: grapheme.to_string(),
641                span: span_for(text, byte_start, byte_end),
642            }
643        })
644        .collect()
645}
646
647/// Returns detect script profile.
648pub fn detect_script_profile(text: &str) -> ScriptProfile {
649    let mut scripts = BTreeMap::<String, usize>::new();
650    let mut digits = 0;
651    let mut whitespace = 0;
652    let mut punctuation = 0;
653    let mut other = 0;
654
655    for ch in text.chars() {
656        if ch.is_whitespace() {
657            whitespace += 1;
658        } else if ch.is_numeric() {
659            digits += 1;
660        } else if is_sentence_or_symbol_punctuation(ch) || ch.is_ascii_punctuation() {
661            punctuation += 1;
662        } else if let Some(script) = script_name(ch) {
663            *scripts.entry(script.to_string()).or_insert(0) += 1;
664        } else {
665            other += 1;
666        }
667    }
668
669    let dominant_script = scripts
670        .iter()
671        .max_by(|left, right| left.1.cmp(right.1).then_with(|| right.0.cmp(left.0)))
672        .map(|(script, _)| script.clone());
673    let is_mixed = scripts.len() > 1;
674
675    ScriptProfile {
676        scripts,
677        digits,
678        whitespace,
679        punctuation,
680        other,
681        dominant_script,
682        is_mixed,
683    }
684}
685
686/// Returns tokenize.
687pub fn tokenize(text: &str, options: &TextProcessingOptions) -> Vec<Token> {
688    let mut tokens = Vec::new();
689    let mut byte_index = 0;
690
691    while byte_index < text.len() {
692        let ch = next_char(text, byte_index);
693        if ch.is_whitespace() {
694            byte_index += ch.len_utf8();
695            continue;
696        }
697
698        let (byte_end, kind) = if starts_url(text, byte_index) {
699            (consume_until_whitespace(text, byte_index), TokenKind::Url)
700        } else if ch == '@' {
701            let end = consume_prefixed_word(text, byte_index);
702            if end > byte_index + ch.len_utf8() {
703                (end, TokenKind::Mention)
704            } else {
705                (byte_index + ch.len_utf8(), TokenKind::Other)
706            }
707        } else if ch == '#' {
708            let end = consume_prefixed_word(text, byte_index);
709            if end > byte_index + ch.len_utf8() {
710                (end, TokenKind::Hashtag)
711            } else {
712                (byte_index + ch.len_utf8(), TokenKind::Other)
713            }
714        } else if ch.is_ascii_digit() {
715            (consume_number(text, byte_index), TokenKind::Number)
716        } else if is_word_char(ch, options.keep_apostrophes) {
717            let mut end = consume_word_like(text, byte_index, options.keep_apostrophes);
718            let candidate_end =
719                trim_trailing_token_punctuation(text, byte_index, end, TokenKind::Email);
720            let candidate = &text[byte_index..candidate_end];
721            let kind = if is_email(candidate) {
722                end = candidate_end;
723                TokenKind::Email
724            } else {
725                end = consume_plain_word(text, byte_index, options.keep_apostrophes);
726                TokenKind::Word
727            };
728            (end, kind)
729        } else if is_sentence_or_symbol_punctuation(ch) {
730            (byte_index + ch.len_utf8(), TokenKind::Punctuation)
731        } else {
732            (byte_index + ch.len_utf8(), TokenKind::Other)
733        };
734
735        let byte_end = trim_trailing_token_punctuation(text, byte_index, byte_end, kind);
736        if byte_end == byte_index {
737            byte_index += ch.len_utf8();
738            continue;
739        }
740        if kind != TokenKind::Punctuation || options.include_punctuation {
741            let raw = &text[byte_index..byte_end];
742            tokens.push(Token {
743                text: raw.to_string(),
744                normalized: normalize_text(raw, options),
745                span: span_for(text, byte_index, byte_end),
746                kind,
747            });
748        }
749        byte_index = byte_end;
750    }
751
752    tokens
753}
754
755/// Returns split sentence spans.
756pub fn split_sentence_spans(text: &str, options: &TextProcessingOptions) -> Vec<Sentence> {
757    let mut sentences = Vec::new();
758    let mut start = 0;
759    let chars = text.char_indices().collect::<Vec<_>>();
760
761    for (position, (byte_index, ch)) in chars.iter().copied().enumerate() {
762        if !is_sentence_terminator(ch) {
763            continue;
764        }
765        if ch == '.' && is_abbreviation_boundary(text, byte_index) {
766            continue;
767        }
768        if ch == '.'
769            && previous_char(&chars, position).is_some_and(|value| value.is_ascii_digit())
770            && next_char_from_indices(&chars, position).is_some_and(|value| value.is_ascii_digit())
771        {
772            continue;
773        }
774        if next_char_from_indices(&chars, position).is_some_and(is_sentence_terminator) {
775            continue;
776        }
777
778        let end = byte_index + ch.len_utf8();
779        push_sentence(text, start, end, options, &mut sentences);
780        start = end;
781    }
782
783    push_sentence(text, start, text.len(), options, &mut sentences);
784    sentences
785}
786
787/// Returns split paragraphs.
788pub fn split_paragraphs(text: &str) -> Vec<Paragraph> {
789    let mut paragraphs = Vec::new();
790    let mut paragraph_start = None;
791    let mut last_non_blank_end = 0;
792    let mut line_start = 0;
793
794    for line in text.split_inclusive('\n') {
795        let line_end = line_start + line.len();
796        let line_without_newline = line.trim_end_matches(['\r', '\n']);
797        if line_without_newline.trim().is_empty() {
798            if let Some(start) = paragraph_start.take() {
799                push_paragraph(text, start, last_non_blank_end, &mut paragraphs);
800            }
801        } else {
802            let content_start =
803                line_start + (line_without_newline.len() - line_without_newline.trim_start().len());
804            paragraph_start.get_or_insert(content_start);
805            last_non_blank_end = line_start + line_without_newline.trim_end().len();
806        }
807        line_start = line_end;
808    }
809
810    if let Some(start) = paragraph_start {
811        push_paragraph(text, start, last_non_blank_end, &mut paragraphs);
812    }
813
814    paragraphs
815}
816
817/// Builds annotation graph.
818pub fn build_annotation_graph(text: &str, options: &TextProcessingOptions) -> TextAnnotationGraph {
819    let tokens = tokenize(text, options);
820    let sentences = split_sentence_spans(text, options);
821    let paragraphs = split_paragraphs(text);
822    build_annotation_graph_from_parts(text, &tokens, &sentences, &paragraphs)
823}
824
825/// Builds annotation graph from parts.
826pub fn build_annotation_graph_from_parts(
827    text: &str,
828    tokens: &[Token],
829    sentences: &[Sentence],
830    paragraphs: &[Paragraph],
831) -> TextAnnotationGraph {
832    let token_id_offset = 0;
833    let sentence_id_offset = tokens.len();
834    let paragraph_id_offset = tokens.len() + sentences.len();
835
836    let annotated_paragraphs = paragraphs
837        .iter()
838        .enumerate()
839        .map(|(index, paragraph)| {
840            let id = AnnotationId(paragraph_id_offset + index);
841            let sentence_indices = sentences
842                .iter()
843                .enumerate()
844                .filter(|(_, sentence)| span_is_inside(sentence.span, paragraph.span))
845                .map(|(sentence_index, _)| sentence_index)
846                .collect::<Vec<_>>();
847            let sentence_ids = sentence_indices
848                .iter()
849                .map(|sentence_index| AnnotationId(sentence_id_offset + sentence_index))
850                .collect::<Vec<_>>();
851            let sentence_start = sentence_indices.first().copied().unwrap_or(sentences.len());
852            let sentence_end = sentence_indices
853                .last()
854                .map(|index| index + 1)
855                .unwrap_or(sentence_start);
856
857            AnnotatedParagraph {
858                id,
859                span: TextSpanRef {
860                    id,
861                    span: paragraph.span,
862                },
863                text: paragraph.text.clone(),
864                sentence_start,
865                sentence_end,
866                sentence_ids,
867            }
868        })
869        .collect::<Vec<_>>();
870
871    let annotated_sentences = sentences
872        .iter()
873        .enumerate()
874        .map(|(sentence_index, sentence)| {
875            let id = AnnotationId(sentence_id_offset + sentence_index);
876            let token_indices = tokens
877                .iter()
878                .enumerate()
879                .filter(|(_, token)| span_is_inside(token.span, sentence.span))
880                .map(|(index, _)| index)
881                .collect::<Vec<_>>();
882            let paragraph_id = annotated_paragraphs
883                .iter()
884                .find(|paragraph| span_is_inside(sentence.span, paragraph.span.span))
885                .map(|paragraph| paragraph.id);
886            AnnotatedSentence {
887                id,
888                span: TextSpanRef {
889                    id,
890                    span: sentence.span,
891                },
892                text: sentence.text.clone(),
893                token_start: token_indices.first().copied().unwrap_or(tokens.len()),
894                token_end: token_indices
895                    .last()
896                    .map(|index| index + 1)
897                    .unwrap_or(tokens.len()),
898                token_ids: token_indices
899                    .into_iter()
900                    .map(|index| AnnotationId(token_id_offset + index))
901                    .collect(),
902                paragraph_id,
903            }
904        })
905        .collect::<Vec<_>>();
906
907    let annotated_tokens = tokens
908        .iter()
909        .enumerate()
910        .map(|(token_index, token)| {
911            let id = AnnotationId(token_id_offset + token_index);
912            let sentence_id = annotated_sentences
913                .iter()
914                .find(|sentence| span_is_inside(token.span, sentence.span.span))
915                .map(|sentence| sentence.id)
916                .unwrap_or(AnnotationId(sentence_id_offset));
917            let paragraph_id = annotated_paragraphs
918                .iter()
919                .find(|paragraph| span_is_inside(token.span, paragraph.span.span))
920                .map(|paragraph| paragraph.id);
921            CanonicalToken {
922                id,
923                span: TextSpanRef {
924                    id,
925                    span: token.span,
926                },
927                text: token.text.clone(),
928                normalized: token.normalized.clone(),
929                kind: token.kind,
930                sentence_id,
931                paragraph_id,
932            }
933        })
934        .collect();
935
936    TextAnnotationGraph {
937        text: text.to_string(),
938        provenance: AnnotationProvenance::Tokenizer,
939        confidence: AnnotationConfidence::default(),
940        tokens: annotated_tokens,
941        sentences: annotated_sentences,
942        paragraphs: annotated_paragraphs,
943    }
944}
945
946/// Returns detailed text stats.
947pub fn detailed_text_stats(text: &str, options: &TextProcessingOptions) -> DetailedTextStats {
948    let basic = text_stats(text);
949    let paragraphs = split_paragraphs(text).len();
950    let tokens = tokenize(text, options);
951    let unique_tokens = tokens
952        .iter()
953        .filter(|token| token.kind != TokenKind::Punctuation)
954        .map(|token| token.normalized.clone())
955        .collect::<std::collections::BTreeSet<_>>()
956        .len();
957    let chars_in_words = tokens
958        .iter()
959        .filter(|token| matches!(token.kind, TokenKind::Word | TokenKind::Number))
960        .map(|token| token.text.chars().count())
961        .sum::<usize>();
962    DetailedTextStats {
963        basic,
964        paragraphs,
965        tokens: tokens.len(),
966        unique_tokens,
967        average_words_per_sentence: if basic.sentences == 0 {
968            0.0
969        } else {
970            basic.words as f32 / basic.sentences as f32
971        },
972        average_chars_per_word: if basic.words == 0 {
973            0.0
974        } else {
975            chars_in_words as f32 / basic.words as f32
976        },
977    }
978}
979
/// Returns the character that starts at `byte_index` in `text`.
///
/// # Panics
///
/// Panics when `byte_index` is not a char boundary inside `text`, or when no
/// character follows it (for example `byte_index == text.len()`).
fn next_char(text: &str, byte_index: usize) -> char {
    let mut remaining = text[byte_index..].chars();
    let Some(ch) = remaining.next() else {
        panic!("byte_index must be inside text");
    };
    ch
}
986
987fn span_for(text: &str, byte_start: usize, byte_end: usize) -> TextSpan {
988    TextSpan {
989        byte_start,
990        byte_end,
991        char_start: text[..byte_start].chars().count(),
992        char_end: text[..byte_end].chars().count(),
993    }
994}
995
996fn span_is_inside(inner: TextSpan, outer: TextSpan) -> bool {
997    inner.byte_start >= outer.byte_start && inner.byte_end <= outer.byte_end
998}
999
/// Reports whether the text starting at `byte_index` begins with one of the
/// recognised URL prefixes (`http://`, `https://`, `www.`).
///
/// `byte_index` must be a char boundary inside `text`, otherwise the slicing
/// panics.
fn starts_url(text: &str, byte_index: usize) -> bool {
    const URL_PREFIXES: [&str; 3] = ["http://", "https://", "www."];
    let rest = &text[byte_index..];
    URL_PREFIXES.iter().any(|prefix| rest.starts_with(*prefix))
}
1004
1005fn classify_word_segment(segment: &str) -> TokenKind {
1006    if starts_url(segment, 0) {
1007        TokenKind::Url
1008    } else if is_email(segment) {
1009        TokenKind::Email
1010    } else if segment.starts_with('@') && segment[1..].chars().any(char::is_alphanumeric) {
1011        TokenKind::Mention
1012    } else if segment.starts_with('#') && segment[1..].chars().any(char::is_alphanumeric) {
1013        TokenKind::Hashtag
1014    } else if segment
1015        .chars()
1016        .all(|ch| ch.is_numeric() || matches!(ch, '.' | ',' | ':' | '/' | '-'))
1017    {
1018        TokenKind::Number
1019    } else if segment.chars().all(is_sentence_or_symbol_punctuation) {
1020        TokenKind::Punctuation
1021    } else if segment.chars().any(char::is_alphanumeric) {
1022        TokenKind::Word
1023    } else {
1024        TokenKind::Other
1025    }
1026}
1027
/// Returns the byte offset of the first whitespace character at or after
/// `byte_start`, or `text.len()` when no whitespace follows.
///
/// `byte_start` must be a char boundary inside `text`.
fn consume_until_whitespace(text: &str, byte_start: usize) -> usize {
    text[byte_start..]
        .find(char::is_whitespace)
        .map_or(text.len(), |offset| byte_start + offset)
}
1038
1039fn consume_prefixed_word(text: &str, byte_start: usize) -> usize {
1040    let base = byte_start + next_char(text, byte_start).len_utf8();
1041    let mut end = base;
1042    for (offset, ch) in text[base..].char_indices() {
1043        if ch.is_alphanumeric() || ch == '_' || ch == '-' {
1044            end = base + offset + ch.len_utf8();
1045        } else {
1046            break;
1047        }
1048    }
1049    end
1050}
1051
/// Returns the end byte offset of the run of ASCII digits and the separators
/// `. , : / -` that starts at `byte_start`.
///
/// Returns `byte_start` itself when the first character is not part of such a
/// run. `byte_start` must be a char boundary inside `text`.
fn consume_number(text: &str, byte_start: usize) -> usize {
    let is_number_char = |ch: char| ch.is_ascii_digit() || matches!(ch, '.' | ',' | ':' | '/' | '-');
    let tail = &text[byte_start..];
    let run_len = tail
        .find(|ch: char| !is_number_char(ch))
        .unwrap_or(tail.len());
    byte_start + run_len
}
1063
1064fn consume_word_like(text: &str, byte_start: usize, keep_apostrophes: bool) -> usize {
1065    let mut end = byte_start;
1066    for (offset, ch) in text[byte_start..].char_indices() {
1067        if is_word_char(ch, keep_apostrophes) || matches!(ch, '@' | '.' | '_' | '-' | '+') {
1068            end = byte_start + offset + ch.len_utf8();
1069        } else {
1070            break;
1071        }
1072    }
1073    end
1074}
1075
1076fn consume_plain_word(text: &str, byte_start: usize, keep_apostrophes: bool) -> usize {
1077    let mut end = byte_start;
1078    for (offset, ch) in text[byte_start..].char_indices() {
1079        if is_word_char(ch, keep_apostrophes) {
1080            end = byte_start + offset + ch.len_utf8();
1081        } else {
1082            break;
1083        }
1084    }
1085    end
1086}
1087
1088fn is_word_char(ch: char, keep_apostrophes: bool) -> bool {
1089    ch.is_alphanumeric() || (keep_apostrophes && is_apostrophe(ch))
1090}
1091
/// Reports whether `ch` is the ASCII apostrophe or the typographic
/// right single quotation mark.
fn is_apostrophe(ch: char) -> bool {
    ['\'', '’'].contains(&ch)
}
1095
/// Reports whether `candidate` has a minimal e-mail shape.
///
/// The text is split at the first `@`; the local part must be non-empty and
/// the remainder must contain a `.` that is neither its first nor its last
/// character. No further validation is performed.
fn is_email(candidate: &str) -> bool {
    match candidate.split_once('@') {
        None => false,
        Some((local, domain)) => {
            let dot_at_edge = domain.starts_with('.') || domain.ends_with('.');
            !local.is_empty() && domain.contains('.') && !dot_at_edge
        }
    }
}
1102
/// Reports whether `ch` is treated as punctuation: any ASCII punctuation
/// character, or one of a fixed set of non-ASCII marks (ellipsis, CJK and
/// Arabic sentence marks, inverted Spanish marks, guillemets).
fn is_sentence_or_symbol_punctuation(ch: char) -> bool {
    const NON_ASCII_MARKS: [char; 10] = ['…', '。', '！', '？', '،', '؛', '¿', '¡', '«', '»'];
    ch.is_ascii_punctuation() || NON_ASCII_MARKS.contains(&ch)
}
1110
1111fn trim_trailing_token_punctuation(
1112    text: &str,
1113    byte_start: usize,
1114    mut byte_end: usize,
1115    kind: TokenKind,
1116) -> usize {
1117    if !matches!(kind, TokenKind::Url | TokenKind::Email | TokenKind::Number) {
1118        return byte_end;
1119    }
1120    while byte_end > byte_start {
1121        let Some(ch) = text[..byte_end].chars().next_back() else {
1122            break;
1123        };
1124        if matches!(ch, '.' | ',' | ';' | ':' | '!' | '?' | ')' | ']' | '}') {
1125            byte_end -= ch.len_utf8();
1126        } else {
1127            break;
1128        }
1129    }
1130    byte_end
1131}
1132
/// Reports whether `ch` can end a sentence: `.`, `?`, `!`, the ellipsis
/// character, or the full-width CJK period, exclamation and question marks.
fn is_sentence_terminator(ch: char) -> bool {
    const TERMINATORS: [char; 7] = ['.', '?', '!', '…', '。', '！', '？'];
    TERMINATORS.contains(&ch)
}
1136
/// Reports whether the period at `period_byte_index` terminates a known
/// abbreviation rather than a sentence.
///
/// Three forms are recognised, all matched ASCII-case-insensitively except
/// the initial:
///
/// * plain abbreviations made of the letters directly before the period
///   (`Mr`, `Dr`, `etc`, ...);
/// * dotted abbreviations (`e.g`, `i.e`, `u.s`, `u.k`), matched against the
///   run of letters and periods directly before the period;
/// * a single ASCII uppercase letter (an initial such as `J.`).
///
/// Returns `false` when the period is not directly preceded by a letter.
/// `period_byte_index` must be a char boundary inside `text`.
fn is_abbreviation_boundary(text: &str, period_byte_index: usize) -> bool {
    const PLAIN: [&str; 10] = [
        "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "st", "vs", "etc",
    ];
    const DOTTED: [&str; 4] = ["e.g", "i.e", "u.s", "u.k"];

    let prefix = &text[..period_byte_index];

    // Trailing run of letters directly before the period.
    let word = &prefix[prefix.trim_end_matches(char::is_alphabetic).len()..];
    if word.is_empty() {
        return false;
    }

    // Trailing run of letters *and* periods. The letters-only `word` can never
    // contain '.', so dotted abbreviations have to be matched against this
    // wider candidate. Leading periods (e.g. from "...e.g.") are not part of
    // the abbreviation.
    let dotted = prefix[prefix
        .trim_end_matches(|ch: char| ch.is_alphabetic() || ch == '.')
        .len()..]
        .trim_start_matches('.');

    let mut letters = word.chars();
    let is_initial = matches!(
        (letters.next(), letters.next()),
        (Some(only), None) if only.is_ascii_uppercase()
    );

    is_initial
        || PLAIN.iter().any(|abbr| word.eq_ignore_ascii_case(abbr))
        || DOTTED.iter().any(|abbr| dotted.eq_ignore_ascii_case(abbr))
}
1170
/// Maps a character to a coarse script name based on its code point.
///
/// Returns `Some("Other")` for alphabetic characters outside the listed
/// ranges and `None` for everything else (digits, punctuation, symbols,
/// whitespace).
fn script_name(ch: char) -> Option<&'static str> {
    match u32::from(ch) {
        // ASCII letters only: 0x5B..=0x60 ('[', '\\', ']', '^', '_', '`') sit
        // between 'Z' and 'a' and are punctuation, not Latin letters.
        0x0041..=0x005A | 0x0061..=0x007A => Some("Latin"),
        // Latin-1 Supplement letters through Latin Extended-B, skipping the
        // math signs U+00D7 (multiplication) and U+00F7 (division).
        0x00C0..=0x00D6 | 0x00D8..=0x00F6 | 0x00F8..=0x024F | 0x1E00..=0x1EFF => Some("Latin"),
        0x0370..=0x03FF | 0x1F00..=0x1FFF => Some("Greek"),
        0x0400..=0x052F | 0x2DE0..=0x2DFF | 0xA640..=0xA69F => Some("Cyrillic"),
        0x0590..=0x05FF => Some("Hebrew"),
        0x0600..=0x06FF | 0x0750..=0x077F | 0x08A0..=0x08FF => Some("Arabic"),
        0x0900..=0x097F => Some("Devanagari"),
        0x3040..=0x309F => Some("Hiragana"),
        0x30A0..=0x30FF | 0x31F0..=0x31FF => Some("Katakana"),
        0x3400..=0x4DBF | 0x4E00..=0x9FFF | 0xF900..=0xFAFF => Some("Han"),
        0xAC00..=0xD7AF | 0x1100..=0x11FF | 0x3130..=0x318F => Some("Hangul"),
        _ if ch.is_alphabetic() => Some("Other"),
        _ => None,
    }
}
1188
/// Returns the character stored just before `position` in `chars`, or `None`
/// when `position` is `0` or `position - 1` is out of range.
fn previous_char(chars: &[(usize, char)], position: usize) -> Option<char> {
    let index = position.checked_sub(1)?;
    let &(_, ch) = chars.get(index)?;
    Some(ch)
}
1194
/// Returns the character stored just after `position` in `chars`, or `None`
/// when `position + 1` is out of range.
fn next_char_from_indices(chars: &[(usize, char)], position: usize) -> Option<char> {
    let &(_, ch) = chars.get(position + 1)?;
    Some(ch)
}
1198
1199fn push_sentence(
1200    text: &str,
1201    byte_start: usize,
1202    byte_end: usize,
1203    options: &TextProcessingOptions,
1204    sentences: &mut Vec<Sentence>,
1205) {
1206    if byte_start >= byte_end {
1207        return;
1208    }
1209    let raw = &text[byte_start..byte_end];
1210    let leading = raw.len() - raw.trim_start().len();
1211    let trailing = raw.trim_end().len();
1212    let start = byte_start + leading;
1213    let end = byte_start + trailing;
1214    if start >= end {
1215        return;
1216    }
1217    let sentence_text = text[start..end].to_string();
1218    let token_count = tokenize(&sentence_text, options).len();
1219    sentences.push(Sentence {
1220        text: sentence_text,
1221        span: span_for(text, start, end),
1222        token_count,
1223    });
1224}
1225
1226fn push_paragraph(text: &str, byte_start: usize, byte_end: usize, paragraphs: &mut Vec<Paragraph>) {
1227    if byte_start >= byte_end {
1228        return;
1229    }
1230    let paragraph_text = text[byte_start..byte_end].to_string();
1231    paragraphs.push(Paragraph {
1232        sentence_count: split_sentence_spans(&paragraph_text, &TextProcessingOptions::default())
1233            .len(),
1234        text: paragraph_text,
1235        span: span_for(text, byte_start, byte_end),
1236    });
1237}
1238
// Unit tests for the text primitives in this file: id formatting,
// tokenization, sentence/paragraph/grapheme/word segmentation, script
// profiling, derived statistics, and the annotation graph.
#[cfg(test)]
mod tests {
    use super::*;

    // Segment document ids are formatted as `<stream>:<index>`.
    #[test]
    fn segment_document_ids_include_stream_and_index() {
        assert_eq!(segment_document_id("subs", 7), "subs:7");
    }

    // Word counts are keyed by lowercase form, so "Hello" and "hello" share
    // one entry, and the punctuation does not prevent the match.
    #[test]
    fn tokenizes_and_counts_words() {
        let counts = word_counts("Hello, hello world.");
        assert_eq!(counts.get("hello"), Some(&2));
        assert_eq!(counts.get("world"), Some(&1));
    }

    // Two terminated sentences with two words each.
    #[test]
    fn computes_text_stats() {
        let stats = text_stats("One sentence. Two words!");
        assert_eq!(stats.sentences, 2);
        assert_eq!(stats.words, 4);
    }

    // Non-ASCII words are kept whole and carry both byte and char offsets.
    #[test]
    fn tokenizes_unicode_words_with_offsets() {
        let tokens = tokenize("Hi café 東京", &TextProcessingOptions::default());
        assert_eq!(tokens[1].text, "café");
        assert_eq!(tokens[1].span.byte_start, 3);
        assert_eq!(tokens[1].span.char_start, 3);
        assert_eq!(tokens[2].text, "東京");
    }

    // Each special token kind is produced at least once from a mixed input.
    #[test]
    fn classifies_common_token_patterns() {
        let tokens = tokenize(
            "Mail a@b.com. #rust @team https://example.com 3.14",
            &TextProcessingOptions::default(),
        );
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Email));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Hashtag));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Mention));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Url));
        assert!(tokens.iter().any(|token| token.kind == TokenKind::Number));
    }

    // With default options the apostrophe stays inside the word; with
    // `keep_apostrophes: false` the word is split around it.
    #[test]
    fn apostrophe_behavior_is_configurable() {
        let keep = TextProcessingOptions::default();
        assert_eq!(tokenize_words("Don't stop"), vec!["don't", "stop"]);

        let split = TextProcessingOptions {
            keep_apostrophes: false,
            ..TextProcessingOptions::default()
        };
        let tokens = tokenize("Don't", &split)
            .into_iter()
            .map(|token| token.normalized)
            .collect::<Vec<_>>();
        assert_eq!(tokens, vec!["don", "t"]);
        assert_eq!(tokenize("Don't", &keep)[0].normalized, "don't");
    }

    // "Dr." and the decimal point in "3.14" must not end a sentence, "..."
    // stays attached to its sentence, and the full-width question mark ends one.
    #[test]
    fn splits_sentences_with_decimals_ellipses_and_multilingual_marks() {
        let sentences = split_sentences("Dr. Smith wrote pi is 3.14. Wait... Really？ Yes!");
        assert_eq!(
            sentences,
            vec!["Dr. Smith wrote pi is 3.14.", "Wait...", "Really？", "Yes!"]
        );
    }

    // A combining accent and an emoji with a skin-tone modifier each form one
    // grapheme spanning several chars.
    #[test]
    fn segments_graphemes_with_byte_and_char_spans() {
        let graphemes = segment_graphemes("e\u{301}👍🏽a");
        assert_eq!(graphemes.len(), 3);
        assert_eq!(graphemes[0].text, "e\u{301}");
        assert_eq!(graphemes[0].span.byte_start, 0);
        assert_eq!(graphemes[0].span.byte_end, 3);
        assert_eq!(graphemes[0].span.char_start, 0);
        assert_eq!(graphemes[0].span.char_end, 2);
        assert_eq!(graphemes[1].text, "👍🏽");
        assert_eq!(graphemes[1].span.char_start, 2);
        assert_eq!(graphemes[1].span.char_end, 4);
    }

    // Word segments are lowercased in `normalized`; the trailing "!" yields
    // no segment.
    #[test]
    fn segments_words_with_unicode_boundaries() {
        let segments = segment_words("Café 東京 42!", &TextBoundaryOptions::default());
        let texts = segments
            .into_iter()
            .map(|segment| segment.normalized)
            .collect::<Vec<_>>();
        assert_eq!(texts, vec!["café", "東京", "42"]);
    }

    // Letters are counted per script and digits separately; two scripts make
    // the profile "mixed", and the more frequent one is dominant.
    #[test]
    fn profiles_scripts() {
        let profile = detect_script_profile("Hello 東京 123!");
        assert_eq!(profile.scripts.get("Latin"), Some(&5));
        assert_eq!(profile.scripts.get("Han"), Some(&2));
        assert_eq!(profile.digits, 3);
        assert_eq!(profile.dominant_script.as_deref(), Some("Latin"));
        assert!(profile.is_mixed);
    }

    // A single newline stays inside a paragraph; a blank line separates two.
    #[test]
    fn splits_paragraphs_on_blank_lines() {
        let paragraphs = split_paragraphs("First paragraph.\nStill first.\n\nSecond.");
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(paragraphs[0].sentence_count, 2);
        assert_eq!(paragraphs[1].text, "Second.");
    }

    // Detailed stats add the paragraph count and averages on top of the
    // basic counters.
    #[test]
    fn detailed_stats_include_derived_counts() {
        let stats = detailed_text_stats("One sentence.\n\nTwo words here.", &Default::default());
        assert_eq!(stats.paragraphs, 2);
        assert_eq!(stats.basic.sentences, 2);
        assert!(stats.average_words_per_sentence > 0.0);
    }

    // Tokens, sentences and paragraphs in the graph reference each other by
    // id, and the ids resolve through the graph's lookup methods.
    #[test]
    fn builds_annotation_graph_with_stable_cross_references() {
        let graph = build_annotation_graph(
            "Alice launched the API.\n\nBerlin hosted the event.",
            &TextProcessingOptions::default(),
        );
        assert_eq!(graph.tokens.len(), 8);
        assert_eq!(graph.sentences.len(), 2);
        assert_eq!(graph.paragraphs.len(), 2);
        assert_eq!(graph.provenance, AnnotationProvenance::Tokenizer);
        assert!(graph.confidence.get() > 0.0);

        let first_sentence = &graph.sentences[0];
        let first_token = graph.token(first_sentence.token_ids[0]).unwrap();
        assert_eq!(first_token.text, "Alice");
        assert_eq!(first_token.sentence_id, first_sentence.id);
        assert_eq!(first_token.paragraph_id, first_sentence.paragraph_id);

        let second_paragraph = &graph.paragraphs[1];
        assert_eq!(second_paragraph.sentence_ids.len(), 1);
        let sentence = graph.sentence(second_paragraph.sentence_ids[0]).unwrap();
        assert_eq!(sentence.text, "Berlin hosted the event.");
    }
}
text_core/lib.rs

text_core/
lib.rs