swiftide_integrations/treesitter/
chunk_code.rs

//! Chunk code using tree-sitter.
2use anyhow::{Context as _, Result};
3use async_trait::async_trait;
4use derive_builder::Builder;
5
6use crate::treesitter::{ChunkSize, CodeSplitter, SupportedLanguages};
7use swiftide_core::{
8    ChunkerTransformer,
9    indexing::{IndexingStream, TextNode},
10};
11
12/// The `ChunkCode` struct is responsible for chunking code into smaller pieces
13/// based on the specified language and chunk size.
14///
15/// It uses tree-sitter under the hood, and tries to split the code into smaller, meaningful
16/// chunks.
17///
18/// # Example
19///
20/// ```no_run
21/// # use swiftide_integrations::treesitter::transformers::ChunkCode;
22/// # use swiftide_integrations::treesitter::SupportedLanguages;
23/// // Chunk rust code with a maximum chunk size of 1000 bytes.
24/// ChunkCode::try_for_language_and_chunk_size(SupportedLanguages::Rust, 1000);
25///
26/// // Chunk python code with a minimum chunk size of 500 bytes and maximum chunk size of 2048.
27/// // Smaller chunks than 500 bytes will be discarded.
28/// ChunkCode::try_for_language_and_chunk_size(SupportedLanguages::Python, 500..2048);
29/// ````
30#[derive(Debug, Clone, Builder)]
31#[builder(pattern = "owned", setter(into, strip_option))]
32pub struct ChunkCode {
33    chunker: CodeSplitter,
34    #[builder(default)]
35    concurrency: Option<usize>,
36}
37
38impl ChunkCode {
39    pub fn builder() -> ChunkCodeBuilder {
40        ChunkCodeBuilder::default()
41    }
42
43    /// Tries to create a `ChunkCode` instance for a given programming language.
44    ///
45    /// # Parameters
46    /// - `lang`: The programming language to be used for chunking. It should implement
47    ///   `TryInto<SupportedLanguages>`.
48    ///
49    /// # Returns
50    /// - `Result<Self>`: Returns an instance of `ChunkCode` if successful, otherwise returns an
51    ///   error.
52    ///
53    /// # Errors
54    /// - Returns an error if the language is not supported or if the `CodeSplitter` fails to build.
55    pub fn try_for_language(lang: impl TryInto<SupportedLanguages>) -> Result<Self> {
56        Ok(Self {
57            chunker: CodeSplitter::builder().try_language(lang)?.build()?,
58            concurrency: None,
59        })
60    }
61
62    /// Tries to create a `ChunkCode` instance for a given programming language and chunk size.
63    ///
64    /// # Parameters
65    /// - `lang`: The programming language to be used for chunking. It should implement
66    ///   `TryInto<SupportedLanguages>`.
67    /// - `chunk_size`: The size of the chunks. It should implement `Into<ChunkSize>`.
68    ///
69    /// # Returns
70    /// - `Result<Self>`: Returns an instance of `ChunkCode` if successful, otherwise returns an
71    ///   error.
72    ///
73    /// # Errors
74    /// - Returns an error if the language is not supported, if the chunk size is invalid, or if the
75    ///   `CodeSplitter` fails to build.
76    pub fn try_for_language_and_chunk_size(
77        lang: impl TryInto<SupportedLanguages>,
78        chunk_size: impl Into<ChunkSize>,
79    ) -> Result<Self> {
80        Ok(Self {
81            chunker: CodeSplitter::builder()
82                .try_language(lang)?
83                .chunk_size(chunk_size)
84                .build()?,
85            concurrency: None,
86        })
87    }
88
89    #[must_use]
90    pub fn with_concurrency(mut self, concurrency: usize) -> Self {
91        self.concurrency = Some(concurrency);
92        self
93    }
94}
95
96#[async_trait]
97impl ChunkerTransformer for ChunkCode {
98    type Input = String;
99    type Output = String;
100    /// Transforms a `TextNode` by splitting its code chunk into smaller pieces.
101    ///
102    /// # Parameters
103    /// - `node`: The `TextNode` containing the code chunk to be split.
104    ///
105    /// # Returns
106    /// - `IndexingStream`: A stream of `TextNode` instances, each containing a smaller chunk of
107    ///   code.
108    ///
109    /// # Errors
110    /// - If the code splitting fails, an error is sent downstream.
111    #[tracing::instrument(skip_all, name = "transformers.chunk_code")]
112    async fn transform_node(&self, node: TextNode) -> IndexingStream<String> {
113        let split_result = self.chunker.split(&node.chunk);
114
115        if let Ok(split) = split_result {
116            let mut offset = 0;
117
118            IndexingStream::iter(split.into_iter().map(move |chunk| {
119                let chunk_size = chunk.len();
120
121                let node = TextNode::build_from_other(&node)
122                    .chunk(chunk)
123                    .offset(offset)
124                    .build();
125
126                offset += chunk_size;
127
128                node
129            }))
130        } else {
131            // Send the error downstream
132            IndexingStream::iter(vec![Err(split_result
133                .with_context(|| format!("Failed to chunk {}", node.path.display()))
134                .unwrap_err())])
135        }
136    }
137
138    fn concurrency(&self) -> Option<usize> {
139        self.concurrency
140    }
141}