1use std::{
21 collections::HashMap,
22 fmt::Debug,
23 hash::{Hash, Hasher},
24 os::unix::ffi::OsStrExt,
25 path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata};
33
/// Marker trait for the payload type stored in a [`Node`].
///
/// A chunk must be cloneable, shareable across threads, debuggable, viewable
/// as a byte slice and free of non-`'static` borrows. The trait has no methods
/// of its own; it only bundles these bounds under one name.
pub trait Chunk: Clone + Send + Sync + Debug + AsRef<[u8]> + 'static {}

/// Blanket implementation: every type that satisfies the bounds is a [`Chunk`].
impl<T: Clone + Send + Sync + Debug + AsRef<[u8]> + 'static> Chunk for T {}
41
42#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
48#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
49pub struct Node<T: Chunk> {
50 #[builder(default)]
52 pub path: PathBuf,
53 pub chunk: T,
55 #[builder(default)]
57 pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
58 #[builder(default)]
60 pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
61 #[builder(default)]
63 pub metadata: Metadata,
64 #[builder(default)]
66 pub embed_mode: EmbedMode,
67 #[builder(default)]
69 pub original_size: usize,
70 #[builder(default)]
73 pub offset: usize,
74}
75
76pub type TextNode = Node<String>;
77
78impl<T: Chunk> NodeBuilder<T> {
79 pub fn maybe_sparse_vectors(
80 &mut self,
81 sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
82 ) -> &mut Self {
83 self.sparse_vectors = Some(sparse_vectors);
84 self
85 }
86
87 pub fn maybe_vectors(
88 &mut self,
89 vectors: Option<HashMap<EmbeddedField, Embedding>>,
90 ) -> &mut Self {
91 self.vectors = Some(vectors);
92 self
93 }
94}
95
96impl<T: Chunk> Debug for Node<T> {
97 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
102 f.debug_struct("Node")
103 .field("id", &self.id())
104 .field("path", &self.path)
105 .field("chunk", &self.chunk)
106 .field("metadata", &self.metadata)
107 .field(
108 "vectors",
109 &self
110 .vectors
111 .iter()
112 .flat_map(HashMap::iter)
113 .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
114 .join(","),
115 )
116 .field(
117 "sparse_vectors",
118 &self
119 .sparse_vectors
120 .iter()
121 .flat_map(HashMap::iter)
122 .map(|(embed_type, vec)| {
123 format!(
124 "'{embed_type}': indices({}), values({})",
125 vec.indices.len(),
126 vec.values.len()
127 )
128 })
129 .join(","),
130 )
131 .field("embed_mode", &self.embed_mode)
132 .finish()
133 }
134}
135
136impl<T: Chunk> Node<T> {
137 pub fn build_from_other(node: &Node<T>) -> NodeBuilder<T> {
140 NodeBuilder::default()
141 .path(node.path.clone())
142 .chunk(node.chunk.clone())
143 .metadata(node.metadata.clone())
144 .maybe_vectors(node.vectors.clone())
145 .maybe_sparse_vectors(node.sparse_vectors.clone())
146 .embed_mode(node.embed_mode)
147 .original_size(node.original_size)
148 .offset(node.offset)
149 .to_owned()
150 }
151
152 pub fn builder<VALUE: Chunk + Clone>() -> NodeBuilder<VALUE> {
154 NodeBuilder::default()
155 }
156
157 pub fn new(chunk: impl Into<String>) -> Node<String> {
161 let chunk = chunk.into();
162 let original_size = chunk.len();
163 Node {
164 chunk,
165 original_size,
166 ..Default::default()
167 }
168 }
169
170 pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
171 self.metadata = metadata.into();
172 self
173 }
174
175 pub fn with_vectors(
176 &mut self,
177 vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
178 ) -> &mut Self {
179 self.vectors = Some(vectors.into());
180 self
181 }
182
183 pub fn with_sparse_vectors(
184 &mut self,
185 sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
186 ) -> &mut Self {
187 self.sparse_vectors = Some(sparse_vectors.into());
188 self
189 }
190
191 pub fn id(&self) -> uuid::Uuid {
198 let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_ref()].concat();
200
201 uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
202 }
203}
204
205impl Node<String> {
206 pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
212 let mut embeddables = Vec::new();
214
215 if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
216 embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
217 }
218
219 if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
220 embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
221 for (name, value) in &self.metadata {
222 let value = value
223 .as_str()
224 .map_or_else(|| value.to_string(), ToString::to_string);
225 embeddables.push((EmbeddedField::Metadata(name.clone()), value));
226 }
227 }
228
229 embeddables
230 }
231
232 fn combine_chunk_with_metadata(&self) -> String {
241 let metadata = self
243 .metadata
244 .iter()
245 .map(|(k, v)| {
246 let v = v
247 .as_str()
248 .map_or_else(|| v.to_string(), ToString::to_string);
249
250 format!("{k}: {v}")
251 })
252 .collect::<Vec<String>>()
253 .join("\n");
254
255 format!("{}\n{}", metadata, self.chunk)
256 }
257}
258
259impl Hash for Node<String> {
260 fn hash<H: Hasher>(&self, state: &mut H) {
264 self.path.hash(state);
265 self.chunk.hash(state);
266 }
267}
268
269impl<T: Into<String>> From<T> for Node<String> {
270 fn from(value: T) -> Self {
271 let value: String = value.into();
272 Node::<String>::new(value)
273 }
274}
275
276#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
278pub enum EmbedMode {
279 #[default]
280 SingleWithMetadata,
282 PerField,
284 Both,
287}
288
289#[derive(
291 Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
292)]
293pub enum EmbeddedField {
294 #[default]
295 Combined,
297 Chunk,
299 #[strum(to_string = "Metadata: {0}")]
302 Metadata(String),
303}
304
305impl EmbeddedField {
306 pub fn sparse_field_name(&self) -> String {
308 format!("{self}_sparse")
309 }
310
311 pub fn field_name(&self) -> String {
313 format!("{self}")
314 }
315}
316
317#[allow(clippy::from_over_into)]
318impl Into<String> for EmbeddedField {
319 fn into(self) -> String {
320 self.to_string()
321 }
322}
323
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Rebuilds `original` through `Node::build_from_other` and checks that
    /// the result compares equal to it.
    fn assert_rebuild_matches(original: &TextNode) {
        let rebuilt = Node::build_from_other(original).build().unwrap();

        assert_eq!(*original, rebuilt);
    }

    // Dense and sparse field names for every variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        let [dense_name, sparse_name] = expected;

        assert_eq!(embedded_field.field_name(), dense_name);
        assert_eq!(embedded_field.sparse_field_name(), sparse_name);
    }

    // Debug-formatting chunks made of multi-byte characters must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        for chunk in ["🦀".repeat(101), "Jürgen".repeat(100)] {
            let node = TextNode::from(chunk);
            let _ = format!("{node:?}");
        }
    }

    // Empty vector maps survive the round trip through the builder.
    #[test]
    fn test_build_from_other_without_vectors() {
        let mut original_node = TextNode::from("test_chunk");
        original_node
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new());

        assert_rebuild_matches(&original_node);
    }

    // Populated vector maps survive the round trip through the builder.
    #[test]
    fn test_build_from_other_with_vectors() {
        let vectors = HashMap::from([(EmbeddedField::Chunk, Embedding::default())]);
        let sparse_vectors = HashMap::from([(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: Vec::new(),
                values: Vec::new(),
            },
        )]);

        let mut original_node = TextNode::from("test_chunk");
        original_node
            .with_metadata(Metadata::default())
            .with_vectors(vectors)
            .with_sparse_vectors(sparse_vectors);

        assert_rebuild_matches(&original_node);
    }
}