recoco_splitters/split/mod.rs
1// ReCoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
2// Original code from CocoIndex is copyrighted by CocoIndex
3// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
4// SPDX-FileContributor: CocoIndex Contributors
5//
6// All modifications from the upstream for ReCoco are copyrighted by Knitli Inc.
7// SPDX-FileCopyrightText: 2026 Knitli Inc. (ReCoco)
8// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
9//
10// Both the upstream CocoIndex code and the ReCoco modifications are licensed under the Apache-2.0 License.
11// SPDX-License-Identifier: Apache-2.0
12
13//! Text splitting utilities.
14//!
15//! This module provides text splitting functionality including:
16//! - Splitting by regex separators
17//! - Recursive syntax-aware chunking
18
19mod by_separators;
20mod output_positions;
21mod recursive;
22
23pub use by_separators::{KeepSeparator, SeparatorSplitConfig, SeparatorSplitter};
24pub use recursive::{
25 CustomLanguageConfig, RecursiveChunkConfig, RecursiveChunker, RecursiveSplitConfig,
26};
27
/// A half-open byte range `[start, end)` into a piece of text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextRange {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl TextRange {
    /// Create a new text range from a start (inclusive) and an end (exclusive)
    /// byte offset.
    ///
    /// No validation is performed: `start` may be greater than `end`. Such an
    /// inverted range is reported as empty by [`TextRange::is_empty`] and has a
    /// [`TextRange::len`] of `0`.
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Get the length of the range in bytes.
    ///
    /// Returns `0` when `start >= end`, so `len() == 0` holds exactly when
    /// [`TextRange::is_empty`] returns `true`.
    pub fn len(&self) -> usize {
        // A plain `end - start` would panic in debug builds (and wrap around in
        // release builds) for an inverted range; saturate at zero instead so the
        // result agrees with `is_empty`.
        self.end.saturating_sub(self.start)
    }

    /// Check if the range is empty, i.e. covers no bytes (`start >= end`).
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}
53
/// A position in the text, expressed both as a character offset and as a
/// line/column pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPosition {
    /// Offset from the start of the text, counted in characters rather than bytes.
    pub char_offset: usize,
    /// Line number, starting at 1.
    pub line: u32,
    /// Column number, starting at 1.
    pub column: u32,
}
64
65/// A chunk of text with its range and position information.
66#[derive(Debug, Clone)]
67pub struct Chunk {
68 /// Byte range in the original text. Use this to slice the original string.
69 pub range: TextRange,
70 /// Start position (character offset, line, column).
71 pub start: OutputPosition,
72 /// End position (character offset, line, column).
73 pub end: OutputPosition,
74}
75
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `len` and `is_empty` on a non-empty range and on a zero-width one.
    #[test]
    fn test_text_range() {
        // Ten bytes wide: has a length and is not empty.
        let ten_bytes = TextRange::new(0, 10);
        assert_eq!(ten_bytes.len(), 10);
        assert!(!ten_bytes.is_empty());

        // Start equals end: zero length and empty.
        let zero_width = TextRange::new(5, 5);
        assert_eq!(zero_width.len(), 0);
        assert!(zero_width.is_empty());
    }
}