1use std::{
21 collections::HashMap,
22 fmt::Debug,
23 hash::{Hash, Hasher},
24 os::unix::ffi::OsStrExt,
25 path::PathBuf,
26};
27
28use derive_builder::Builder;
29use itertools::Itertools;
30use serde::{Deserialize, Serialize};
31
32use crate::{Embedding, SparseEmbedding, metadata::Metadata, util::debug_long_utf8};
33
/// A chunk of text together with its source path, metadata and optional embeddings.
///
/// `derive_builder` generates a [`NodeBuilder`] for this type: its setters accept `Into`
/// conversions and take the inner value for `Option` fields (`strip_option`), and `build()`
/// reports failures as `anyhow::Error`.
///
/// `Debug` and `Hash` are implemented by hand rather than derived.
#[derive(Default, Clone, Serialize, Deserialize, PartialEq, Builder)]
#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
pub struct Node {
    /// Path associated with this node; combined with `chunk` to derive [`Node::id`].
    #[builder(default)]
    pub path: PathBuf,
    /// Text content of the node. This is the only field without a builder default, so it has
    /// to be set on a [`NodeBuilder`] before building.
    pub chunk: String,
    /// Embeddings keyed by the [`EmbeddedField`] they belong to; `None` by default.
    #[builder(default)]
    pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
    /// Sparse embeddings keyed by the [`EmbeddedField`] they belong to; `None` by default.
    #[builder(default)]
    pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
    /// Metadata entries attached to the node. Depending on `embed_mode` they are merged into
    /// the combined embeddable text and/or emitted as separate embeddables.
    #[builder(default)]
    pub metadata: Metadata,
    /// Controls which strings [`Node::as_embeddables`] produces.
    #[builder(default)]
    pub embed_mode: EmbedMode,
    /// [`Node::new`] sets this to the byte length of the chunk it is given.
    #[builder(default)]
    pub original_size: usize,
    /// NOTE(review): presumably the position of this chunk within its source; this file only
    /// copies the value in [`Node::chunking_from`] -- confirm against callers.
    #[builder(default)]
    pub offset: usize,
    /// Id of the node this one was derived from; [`Node::chunking_from`] sets it to the
    /// source node's id. `None` by default.
    #[builder(default)]
    pub parent_id: Option<uuid::Uuid>,
}
70
71impl NodeBuilder {
72 pub fn maybe_sparse_vectors(
73 &mut self,
74 sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
75 ) -> &mut Self {
76 self.sparse_vectors = Some(sparse_vectors);
77 self
78 }
79
80 pub fn maybe_vectors(
81 &mut self,
82 vectors: Option<HashMap<EmbeddedField, Embedding>>,
83 ) -> &mut Self {
84 self.vectors = Some(vectors);
85 self
86 }
87}
88
89impl Debug for Node {
90 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
95 f.debug_struct("Node")
96 .field("id", &self.id())
97 .field("path", &self.path)
98 .field("chunk", &debug_long_utf8(&self.chunk, 100))
99 .field("metadata", &self.metadata)
100 .field(
101 "vectors",
102 &self
103 .vectors
104 .iter()
105 .flat_map(HashMap::iter)
106 .map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
107 .join(","),
108 )
109 .field(
110 "sparse_vectors",
111 &self
112 .sparse_vectors
113 .iter()
114 .flat_map(HashMap::iter)
115 .map(|(embed_type, vec)| {
116 format!(
117 "'{embed_type}': indices({}), values({})",
118 vec.indices.len(),
119 vec.values.len()
120 )
121 })
122 .join(","),
123 )
124 .field("embed_mode", &self.embed_mode)
125 .finish()
126 }
127}
128
129impl Node {
130 pub fn chunking_from(node: &Node) -> NodeBuilder {
133 NodeBuilder::default()
134 .path(node.path.clone())
135 .chunk(node.chunk.clone())
136 .metadata(node.metadata.clone())
137 .maybe_vectors(node.vectors.clone())
138 .maybe_sparse_vectors(node.sparse_vectors.clone())
139 .embed_mode(node.embed_mode)
140 .original_size(node.original_size)
141 .offset(node.offset)
142 .parent_id(node.id())
143 .to_owned()
144 }
145
146 pub fn builder() -> NodeBuilder {
148 NodeBuilder::default()
149 }
150
151 pub fn new(chunk: impl Into<String>) -> Node {
155 let chunk = chunk.into();
156 let original_size = chunk.len();
157 Node {
158 chunk,
159 original_size,
160 ..Default::default()
161 }
162 }
163
164 pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
165 self.metadata = metadata.into();
166 self
167 }
168
169 pub fn with_vectors(
170 &mut self,
171 vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
172 ) -> &mut Self {
173 self.vectors = Some(vectors.into());
174 self
175 }
176
177 pub fn with_sparse_vectors(
178 &mut self,
179 sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
180 ) -> &mut Self {
181 self.sparse_vectors = Some(sparse_vectors.into());
182 self
183 }
184
185 pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
191 let mut embeddables = Vec::new();
193
194 if self.embed_mode == EmbedMode::SingleWithMetadata || self.embed_mode == EmbedMode::Both {
195 embeddables.push((EmbeddedField::Combined, self.combine_chunk_with_metadata()));
196 }
197
198 if self.embed_mode == EmbedMode::PerField || self.embed_mode == EmbedMode::Both {
199 embeddables.push((EmbeddedField::Chunk, self.chunk.clone()));
200 for (name, value) in &self.metadata {
201 let value = value
202 .as_str()
203 .map_or_else(|| value.to_string(), ToString::to_string);
204 embeddables.push((EmbeddedField::Metadata(name.clone()), value));
205 }
206 }
207
208 embeddables
209 }
210
211 fn combine_chunk_with_metadata(&self) -> String {
220 let metadata = self
222 .metadata
223 .iter()
224 .map(|(k, v)| {
225 let v = v
226 .as_str()
227 .map_or_else(|| v.to_string(), ToString::to_string);
228
229 format!("{k}: {v}")
230 })
231 .collect::<Vec<String>>()
232 .join("\n");
233
234 format!("{}\n{}", metadata, self.chunk)
235 }
236
237 pub fn id(&self) -> uuid::Uuid {
244 let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
245
246 uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
247 }
248
249 pub fn parent_id(&self) -> Option<uuid::Uuid> {
250 self.parent_id
251 }
252}
253
254impl Hash for Node {
255 fn hash<H: Hasher>(&self, state: &mut H) {
259 self.path.hash(state);
260 self.chunk.hash(state);
261 }
262}
263
264impl<T: Into<String>> From<T> for Node {
265 fn from(value: T) -> Self {
266 Node::new(value)
267 }
268}
269
/// Selects which strings [`Node::as_embeddables`] produces for a node.
#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
pub enum EmbedMode {
    /// Default. Produces a single [`EmbeddedField::Combined`] entry: the metadata lines
    /// followed by the chunk.
    #[default]
    SingleWithMetadata,
    /// Produces one [`EmbeddedField::Chunk`] entry for the chunk plus one
    /// [`EmbeddedField::Metadata`] entry per metadata item.
    PerField,
    /// Produces everything from both `SingleWithMetadata` and `PerField`.
    Both,
}
282
/// Identifies which part of a node an embedding belongs to.
///
/// The `Display` implementation (derived with `strum_macros::Display`) supplies the names
/// returned by [`EmbeddedField::field_name`] and [`EmbeddedField::sparse_field_name`].
#[derive(
    Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
)]
pub enum EmbeddedField {
    /// Default. The chunk combined with its metadata; displays as `Combined`.
    #[default]
    Combined,
    /// The chunk text on its own; displays as `Chunk`.
    Chunk,
    /// A single metadata entry identified by its name; displays as `Metadata: <name>`.
    #[strum(to_string = "Metadata: {0}")]
    Metadata(String),
}
298
299impl EmbeddedField {
300 pub fn sparse_field_name(&self) -> String {
302 format!("{self}_sparse")
303 }
304
305 pub fn field_name(&self) -> String {
307 format!("{self}")
308 }
309}
310
311#[allow(clippy::from_over_into)]
312impl Into<String> for EmbeddedField {
313 fn into(self) -> String {
314 self.to_string()
315 }
316}
317
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Checks the dense and sparse field names derived from each `EmbeddedField` variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // 4-byte characters: byte offset 100 falls exactly on a character boundary.
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // "Jürgen" is 7 bytes, so byte offset 100 lands inside the 2-byte 'ü'.
        // The new node has to be bound; otherwise the first node is formatted a second time
        // and this case is never exercised.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }

    /// `chunking_from` copies every field and records the source node as parent
    /// (embedding maps present but empty).
    #[test]
    fn test_build_from_other_without_vectors() {
        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(HashMap::new())
            .with_sparse_vectors(HashMap::new())
            .to_owned();

        let builder = Node::chunking_from(&original_node);
        let chunked_node = builder.build().unwrap();

        assert_eq!(chunked_node.parent_id(), Some(original_node.id()));

        assert_eq!(original_node.parent_id(), None);

        assert_eq!(original_node.chunk, chunked_node.chunk);
        assert_eq!(original_node.path, chunked_node.path);
        assert_eq!(original_node.metadata, chunked_node.metadata);
        assert_eq!(original_node.vectors, chunked_node.vectors);
        assert_eq!(original_node.sparse_vectors, chunked_node.sparse_vectors);
        assert_eq!(original_node.embed_mode, chunked_node.embed_mode);
        assert_eq!(original_node.original_size, chunked_node.original_size);
        assert_eq!(original_node.offset, chunked_node.offset);
    }

    /// `chunking_from` copies every field and records the source node as parent
    /// (embedding maps populated).
    #[test]
    fn test_build_from_other_with_vectors() {
        let mut vectors = HashMap::new();
        vectors.insert(EmbeddedField::Chunk, Embedding::default());

        let mut sparse_vectors = HashMap::new();
        sparse_vectors.insert(
            EmbeddedField::Chunk,
            SparseEmbedding {
                indices: vec![],
                values: vec![],
            },
        );

        let original_node = Node::new("test_chunk")
            .with_metadata(Metadata::default())
            .with_vectors(vectors.clone())
            .with_sparse_vectors(sparse_vectors.clone())
            .to_owned();

        let builder = Node::chunking_from(&original_node);
        let chunked_node = builder.build().unwrap();

        assert_eq!(chunked_node.parent_id(), Some(original_node.id()));

        assert_eq!(original_node.parent_id(), None);

        assert_eq!(original_node.chunk, chunked_node.chunk);
        assert_eq!(original_node.path, chunked_node.path);
        assert_eq!(original_node.metadata, chunked_node.metadata);
        assert_eq!(original_node.vectors, chunked_node.vectors);
        assert_eq!(original_node.sparse_vectors, chunked_node.sparse_vectors);
        assert_eq!(original_node.embed_mode, chunked_node.embed_mode);
        assert_eq!(original_node.original_size, chunked_node.original_size);
        assert_eq!(original_node.offset, chunked_node.offset);
    }
}