recoco_splitters/split/mod.rs
1// Recoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
2// Original code from CocoIndex is copyrighted by CocoIndex
3// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
4// SPDX-FileContributor: CocoIndex Contributors
5//
6// All modifications from the upstream for Recoco are copyrighted by Knitli Inc.
7// SPDX-FileCopyrightText: 2026 Knitli Inc. (Recoco)
8// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
9//
10// Both the upstream CocoIndex code and the Recoco modifications are licensed under the Apache-2.0 License.
11// SPDX-License-Identifier: Apache-2.0
12
13//! Text splitting utilities.
14//!
15//! This module provides text splitting functionality including:
16//! - Splitting by regex separators
17//! - Recursive syntax-aware chunking
18
19#[cfg(feature = "splitter-separator")]
20pub use crate::by_separators::{KeepSeparator, SeparatorSplitConfig, SeparatorSplitter};
21#[cfg(feature = "splitter-recursive")]
22pub use crate::recursive::{
23 CustomLanguageConfig, RecursiveChunkConfig, RecursiveChunker, RecursiveSplitConfig,
24};
25
/// A half-open span of text, `[start, end)`, expressed as byte offsets.
///
/// The fields are public and [`TextRange::new`] performs no validation, so a
/// range with `start > end` can be constructed. Such an inverted range is
/// treated as empty: [`TextRange::len`] returns `0` and
/// [`TextRange::is_empty`] returns `true`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextRange {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl TextRange {
    /// Create a new text range covering the bytes `start..end`.
    ///
    /// No check is made that `start <= end`.
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Get the length of the range in bytes.
    ///
    /// Returns `0` whenever `start >= end`, so the result always agrees with
    /// [`TextRange::is_empty`].
    pub fn len(&self) -> usize {
        // A plain `end - start` would panic in debug builds and wrap around in
        // release builds for an inverted range, even though `is_empty` accepts
        // such a range as empty. Saturating keeps the two methods consistent.
        self.end.saturating_sub(self.start)
    }

    /// Check if the range is empty.
    ///
    /// A range is empty when it covers no bytes, i.e. when `start >= end`.
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}
51
/// Output position information with character offset and line/column.
///
/// Describes a single location in the text. Unlike [`TextRange`], whose
/// offsets are measured in bytes, `char_offset` is measured in characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPosition {
    /// Character (not byte) offset from the start of the text.
    // NOTE(review): presumably 0-based (first character is offset 0) — confirm
    // against the code that fills this in.
    pub char_offset: usize,
    /// 1-based line number.
    pub line: u32,
    /// 1-based column number.
    // NOTE(review): the unit of `column` (characters vs. bytes) is not visible
    // from this file — confirm against the code that fills this in.
    pub column: u32,
}
62
/// A chunk of text with its range and position information.
///
/// A chunk does not own any text: it only records where the chunk lies in the
/// original string. `range` gives that location in bytes, while `start` and
/// `end` give character-offset and line/column positions.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte range in the original text. Use this to slice the original string.
    pub range: TextRange,
    /// Start position (character offset, line, column).
    pub start: OutputPosition,
    /// End position (character offset, line, column).
    // NOTE(review): presumably the position corresponding to `range.end`
    // (exclusive) — confirm against the splitter implementations.
    pub end: OutputPosition,
}
73
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `len` and `is_empty` on a populated range and on a zero-width range.
    #[test]
    fn test_text_range() {
        // Ten bytes wide: reports its width and is not empty.
        let populated = TextRange::new(0, 10);
        assert!(!populated.is_empty());
        assert_eq!(populated.len(), 10);

        // Start equals end: zero width, therefore empty.
        let zero_width = TextRange::new(5, 5);
        assert!(zero_width.is_empty());
        assert_eq!(zero_width.len(), 0);
    }
}