recoco_splitters/split/
mod.rs

1// ReCoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
2// Original code from CocoIndex is copyrighted by CocoIndex
3// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
4// SPDX-FileContributor: CocoIndex Contributors
5//
6// All modifications from the upstream for ReCoco are copyrighted by Knitli Inc.
7// SPDX-FileCopyrightText: 2026 Knitli Inc. (ReCoco)
8// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
9//
10// Both the upstream CocoIndex code and the ReCoco modifications are licensed under the Apache-2.0 License.
11// SPDX-License-Identifier: Apache-2.0
12
13//! Text splitting utilities.
14//!
15//! This module provides text splitting functionality including:
16//! - Splitting by regex separators
17//! - Recursive syntax-aware chunking
18
19mod by_separators;
20mod output_positions;
21mod recursive;
22
23pub use by_separators::{KeepSeparator, SeparatorSplitConfig, SeparatorSplitter};
24pub use recursive::{
25    CustomLanguageConfig, RecursiveChunkConfig, RecursiveChunker, RecursiveSplitConfig,
26};
27
/// A half-open text range `[start, end)` specified by byte offsets.
///
/// The fields are public and [`TextRange::new`] performs no validation, so a
/// range with `start > end` can exist. Such an inverted range is treated as
/// empty: [`TextRange::is_empty`] returns `true` and [`TextRange::len`]
/// returns `0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextRange {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl TextRange {
    /// Create a new text range covering `start` (inclusive) to `end` (exclusive).
    ///
    /// The offsets are stored as given; no ordering check is performed.
    pub const fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Get the length of the range in bytes.
    ///
    /// Returns `0` when `start >= end`, so the result always agrees with
    /// [`TextRange::is_empty`].
    pub const fn len(&self) -> usize {
        // Saturate instead of using `end - start`: a plain subtraction would
        // panic (debug) or wrap around (release) for an inverted range, even
        // though `is_empty` already reports such a range as empty.
        self.end.saturating_sub(self.start)
    }

    /// Check if the range is empty, i.e. it covers no bytes (`start >= end`).
    pub const fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}
53
/// A location in the text, expressed both as a character offset and as a
/// line/column pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPosition {
    /// Offset from the start of the text, counted in characters rather than bytes.
    pub char_offset: usize,
    /// Line number; the first line is 1.
    pub line: u32,
    /// Column number; the first column is 1.
    pub column: u32,
}
64
65/// A chunk of text with its range and position information.
66#[derive(Debug, Clone)]
67pub struct Chunk {
68    /// Byte range in the original text. Use this to slice the original string.
69    pub range: TextRange,
70    /// Start position (character offset, line, column).
71    pub start: OutputPosition,
72    /// End position (character offset, line, column).
73    pub end: OutputPosition,
74}
75
#[cfg(test)]
mod tests {
    use super::TextRange;

    /// Checks `len` and `is_empty` on a non-empty and a zero-width range.
    #[test]
    fn test_text_range() {
        // A range spanning ten bytes.
        let ten_bytes = TextRange::new(0, 10);
        assert!(!ten_bytes.is_empty());
        assert_eq!(ten_bytes.len(), 10);

        // A range whose start and end coincide.
        let zero_width = TextRange::new(5, 5);
        assert!(zero_width.is_empty());
        assert_eq!(zero_width.len(), 0);
    }
}