Skip to main content

recoco_splitters/split/
mod.rs

// Recoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
// Original code from CocoIndex is copyrighted by CocoIndex
// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
// SPDX-FileContributor: CocoIndex Contributors
//
// All modifications from the upstream for Recoco are copyrighted by Knitli Inc.
// SPDX-FileCopyrightText: 2026 Knitli Inc. (Recoco)
// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
//
// Both the upstream CocoIndex code and the Recoco modifications are licensed under the Apache-2.0 License.
// SPDX-License-Identifier: Apache-2.0

//! Text splitting utilities.
//!
//! This module provides text splitting functionality including:
//! - Splitting by regex separators
//! - Recursive syntax-aware chunking

19#[cfg(feature = "splitter-separator")]
20pub use crate::by_separators::{KeepSeparator, SeparatorSplitConfig, SeparatorSplitter};
21#[cfg(feature = "splitter-recursive")]
22pub use crate::recursive::{
23    CustomLanguageConfig, RecursiveChunkConfig, RecursiveChunker, RecursiveSplitConfig,
24};
25
/// A half-open span of text, `[start, end)`, measured in byte offsets.
///
/// The constructor performs no validation, so `start` may be greater than
/// `end`. Such an inverted range is treated as empty: [`TextRange::len`]
/// reports `0` and [`TextRange::is_empty`] reports `true`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextRange {
    /// Start byte offset (inclusive).
    pub start: usize,
    /// End byte offset (exclusive).
    pub end: usize,
}

impl TextRange {
    /// Create a new text range covering `start` (inclusive) to `end` (exclusive).
    ///
    /// The arguments are stored as given; no ordering check is applied.
    pub fn new(start: usize, end: usize) -> Self {
        Self { start, end }
    }

    /// Get the length of the range in bytes.
    ///
    /// Returns `0` whenever `start >= end`. A plain subtraction would panic
    /// in debug builds (and wrap around in release builds) for an inverted
    /// range, so the difference saturates at zero to stay consistent with
    /// [`TextRange::is_empty`].
    pub fn len(&self) -> usize {
        self.end.saturating_sub(self.start)
    }

    /// Check if the range is empty, i.e. it covers no bytes.
    ///
    /// Both zero-width (`start == end`) and inverted (`start > end`) ranges
    /// count as empty.
    pub fn is_empty(&self) -> bool {
        self.start >= self.end
    }
}

/// A location in the text, expressed both as a character offset and as a
/// line/column pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPosition {
    /// Offset from the start of the text, counted in characters rather than bytes.
    pub char_offset: usize,
    /// Line number; numbering starts at 1.
    pub line: u32,
    /// Column number; numbering starts at 1.
    pub column: u32,
}

63/// A chunk of text with its range and position information.
64#[derive(Debug, Clone)]
65pub struct Chunk {
66    /// Byte range in the original text. Use this to slice the original string.
67    pub range: TextRange,
68    /// Start position (character offset, line, column).
69    pub start: OutputPosition,
70    /// End position (character offset, line, column).
71    pub end: OutputPosition,
72}
73
#[cfg(test)]
mod tests {
    use super::*;

    /// `len` and `is_empty` on a populated range and on a zero-width range.
    #[test]
    fn test_text_range() {
        // Ten bytes wide: non-empty, length 10.
        let populated = TextRange::new(0, 10);
        assert!(!populated.is_empty());
        assert_eq!(populated.len(), 10);

        // Start and end coincide: empty, length 0.
        let zero_width = TextRange::new(5, 5);
        assert!(zero_width.is_empty());
        assert_eq!(zero_width.len(), 0);
    }
}