swiftide_integrations/treesitter/chunk_code.rs
1//! Chunk code using tree-sitter
2use anyhow::{Context as _, Result};
3use async_trait::async_trait;
4use derive_builder::Builder;
5
6use crate::treesitter::{ChunkSize, CodeSplitter, SupportedLanguages};
7use swiftide_core::{
8 ChunkerTransformer,
9 indexing::{IndexingStream, TextNode},
10};
11
12/// The `ChunkCode` struct is responsible for chunking code into smaller pieces
13/// based on the specified language and chunk size.
14///
15/// It uses tree-sitter under the hood, and tries to split the code into smaller, meaningful
16/// chunks.
17///
18/// # Example
19///
20/// ```no_run
21/// # use swiftide_integrations::treesitter::transformers::ChunkCode;
22/// # use swiftide_integrations::treesitter::SupportedLanguages;
23/// // Chunk rust code with a maximum chunk size of 1000 bytes.
24/// ChunkCode::try_for_language_and_chunk_size(SupportedLanguages::Rust, 1000);
25///
26/// // Chunk python code with a minimum chunk size of 500 bytes and maximum chunk size of 2048.
27/// // Smaller chunks than 500 bytes will be discarded.
28/// ChunkCode::try_for_language_and_chunk_size(SupportedLanguages::Python, 500..2048);
29/// ````
30#[derive(Debug, Clone, Builder)]
31#[builder(pattern = "owned", setter(into, strip_option))]
32pub struct ChunkCode {
33 chunker: CodeSplitter,
34 #[builder(default)]
35 concurrency: Option<usize>,
36}
37
38impl ChunkCode {
39 pub fn builder() -> ChunkCodeBuilder {
40 ChunkCodeBuilder::default()
41 }
42
43 /// Tries to create a `ChunkCode` instance for a given programming language.
44 ///
45 /// # Parameters
46 /// - `lang`: The programming language to be used for chunking. It should implement
47 /// `TryInto<SupportedLanguages>`.
48 ///
49 /// # Returns
50 /// - `Result<Self>`: Returns an instance of `ChunkCode` if successful, otherwise returns an
51 /// error.
52 ///
53 /// # Errors
54 /// - Returns an error if the language is not supported or if the `CodeSplitter` fails to build.
55 pub fn try_for_language(lang: impl TryInto<SupportedLanguages>) -> Result<Self> {
56 Ok(Self {
57 chunker: CodeSplitter::builder().try_language(lang)?.build()?,
58 concurrency: None,
59 })
60 }
61
62 /// Tries to create a `ChunkCode` instance for a given programming language and chunk size.
63 ///
64 /// # Parameters
65 /// - `lang`: The programming language to be used for chunking. It should implement
66 /// `TryInto<SupportedLanguages>`.
67 /// - `chunk_size`: The size of the chunks. It should implement `Into<ChunkSize>`.
68 ///
69 /// # Returns
70 /// - `Result<Self>`: Returns an instance of `ChunkCode` if successful, otherwise returns an
71 /// error.
72 ///
73 /// # Errors
74 /// - Returns an error if the language is not supported, if the chunk size is invalid, or if the
75 /// `CodeSplitter` fails to build.
76 pub fn try_for_language_and_chunk_size(
77 lang: impl TryInto<SupportedLanguages>,
78 chunk_size: impl Into<ChunkSize>,
79 ) -> Result<Self> {
80 Ok(Self {
81 chunker: CodeSplitter::builder()
82 .try_language(lang)?
83 .chunk_size(chunk_size)
84 .build()?,
85 concurrency: None,
86 })
87 }
88
89 #[must_use]
90 pub fn with_concurrency(mut self, concurrency: usize) -> Self {
91 self.concurrency = Some(concurrency);
92 self
93 }
94}
95
96#[async_trait]
97impl ChunkerTransformer for ChunkCode {
98 type Input = String;
99 type Output = String;
100 /// Transforms a `TextNode` by splitting its code chunk into smaller pieces.
101 ///
102 /// # Parameters
103 /// - `node`: The `TextNode` containing the code chunk to be split.
104 ///
105 /// # Returns
106 /// - `IndexingStream`: A stream of `TextNode` instances, each containing a smaller chunk of
107 /// code.
108 ///
109 /// # Errors
110 /// - If the code splitting fails, an error is sent downstream.
111 #[tracing::instrument(skip_all, name = "transformers.chunk_code")]
112 async fn transform_node(&self, node: TextNode) -> IndexingStream<String> {
113 let split_result = self.chunker.split(&node.chunk);
114
115 if let Ok(split) = split_result {
116 let mut offset = 0;
117
118 IndexingStream::iter(split.into_iter().map(move |chunk| {
119 let chunk_size = chunk.len();
120
121 let node = TextNode::build_from_other(&node)
122 .chunk(chunk)
123 .offset(offset)
124 .build();
125
126 offset += chunk_size;
127
128 node
129 }))
130 } else {
131 // Send the error downstream
132 IndexingStream::iter(vec![Err(split_result
133 .with_context(|| format!("Failed to chunk {}", node.path.display()))
134 .unwrap_err())])
135 }
136 }
137
138 fn concurrency(&self) -> Option<usize> {
139 self.concurrency
140 }
141}