use std::{
collections::HashMap,
fmt::Debug,
hash::{Hash, Hasher},
os::unix::ffi::OsStrExt,
path::PathBuf,
};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::{metadata::Metadata, util::debug_long_utf8, Embedding, SparseEmbedding};
/// A chunk of text together with the path it is associated with, its
/// metadata, and optional dense and sparse embeddings.
///
/// `Debug` is implemented by hand for this type rather than derived.
#[derive(Default, Clone, Serialize, Deserialize, PartialEq)]
pub struct Node {
/// Explicitly assigned identifier. When `None`, [`Node::id`] computes one
/// from `path` and `chunk` on every call.
pub id: Option<uuid::Uuid>,
/// Path associated with the node; part of the computed id and of the hash.
pub path: PathBuf,
/// The text content of the node; part of the computed id and of the hash.
pub chunk: String,
/// Dense embeddings keyed by the field they were computed for, if any.
pub vectors: Option<HashMap<EmbeddedField, Embedding>>,
/// Sparse embeddings keyed by the field they were computed for, if any.
pub sparse_vectors: Option<HashMap<EmbeddedField, SparseEmbedding>>,
/// Key/value metadata attached to the node.
pub metadata: Metadata,
/// Selects which texts [`Node::as_embeddables`] produces.
pub embed_mode: EmbedMode,
/// Set by [`Node::new`] to the byte length of the chunk the node was created with.
pub original_size: usize,
/// NOTE(review): presumably the position of the chunk within its source;
/// nothing in this file reads or writes it — confirm against callers.
pub offset: usize,
}
impl Debug for Node {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Node")
.field("id", &self.id)
.field("path", &self.path)
.field("chunk", &debug_long_utf8(&self.chunk, 100))
.field("metadata", &self.metadata)
.field(
"vectors",
&self
.vectors
.iter()
.flat_map(HashMap::iter)
.map(|(embed_type, vec)| format!("'{embed_type}': {}", vec.len()))
.join(","),
)
.field(
"sparse_vectors",
&self
.sparse_vectors
.iter()
.flat_map(HashMap::iter)
.map(|(embed_type, vec)| {
format!(
"'{embed_type}': indices({}), values({})",
vec.indices.len(),
vec.values.len()
)
})
.join(","),
)
.field("embed_mode", &self.embed_mode)
.finish()
}
}
impl Node {
    /// Creates a node holding `chunk`, with every other field at its default.
    ///
    /// `original_size` is set to the byte length of the chunk.
    pub fn new(chunk: impl Into<String>) -> Node {
        let chunk: String = chunk.into();
        Node {
            original_size: chunk.len(),
            chunk,
            ..Default::default()
        }
    }

    /// Replaces the node's metadata and returns `self` for chaining.
    pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
        let metadata: Metadata = metadata.into();
        self.metadata = metadata;
        self
    }

    /// Replaces the node's dense embeddings and returns `self` for chaining.
    pub fn with_vectors(
        &mut self,
        vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
    ) -> &mut Self {
        let vectors: HashMap<EmbeddedField, Embedding> = vectors.into();
        self.vectors = Some(vectors);
        self
    }

    /// Replaces the node's sparse embeddings and returns `self` for chaining.
    pub fn with_sparse_vectors(
        &mut self,
        sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
    ) -> &mut Self {
        let sparse_vectors: HashMap<EmbeddedField, SparseEmbedding> = sparse_vectors.into();
        self.sparse_vectors = Some(sparse_vectors);
        self
    }

    /// Lists the texts to embed for this node, as selected by `embed_mode`.
    ///
    /// The combined entry (if any) comes first, followed by the chunk and one
    /// entry per metadata item (if per-field embedding is enabled).
    pub fn as_embeddables(&self) -> Vec<(EmbeddedField, String)> {
        let (wants_combined, wants_per_field) = match self.embed_mode {
            EmbedMode::SingleWithMetadata => (true, false),
            EmbedMode::PerField => (false, true),
            EmbedMode::Both => (true, true),
        };

        let mut out = Vec::new();

        if wants_combined {
            let combined = self.combine_chunk_with_metadata();
            out.push((EmbeddedField::Combined, combined));
        }

        if wants_per_field {
            out.push((EmbeddedField::Chunk, self.chunk.clone()));
            for (key, raw) in &self.metadata {
                // String values are used verbatim; anything else is rendered
                // through its `to_string` representation.
                let text = match raw.as_str() {
                    Some(s) => s.to_string(),
                    None => raw.to_string(),
                };
                out.push((EmbeddedField::Metadata(key.clone()), text));
            }
        }

        out
    }

    /// Renders the metadata as `key: value` lines, followed by a newline and
    /// the chunk. With no metadata the result is a newline plus the chunk.
    fn combine_chunk_with_metadata(&self) -> String {
        let lines: Vec<String> = self
            .metadata
            .iter()
            .map(|(key, raw)| {
                let text = match raw.as_str() {
                    Some(s) => s.to_string(),
                    None => raw.to_string(),
                };
                format!("{key}: {text}")
            })
            .collect();
        let header = lines.join("\n");
        format!("{header}\n{}", self.chunk)
    }

    /// Returns the node's identifier.
    ///
    /// An explicitly stored `id` wins. Otherwise a version-3 UUID in the OID
    /// namespace is computed from the path bytes followed by the chunk bytes;
    /// the result is not stored (see [`Node::update_id`]).
    pub fn id(&self) -> uuid::Uuid {
        match self.id {
            Some(existing) => existing,
            None => {
                let mut bytes = self.path.as_os_str().as_bytes().to_vec();
                bytes.extend_from_slice(self.chunk.as_bytes());
                uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
            }
        }
    }

    /// Discards any stored id and stores the one computed from the current
    /// `path` and `chunk`.
    pub fn update_id(&mut self) {
        // Clear first so `id()` takes the computing branch.
        self.id = None;
        let recomputed = self.id();
        self.id = Some(recomputed);
    }
}
impl Hash for Node {
fn hash<H: Hasher>(&self, state: &mut H) {
self.path.hash(state);
self.chunk.hash(state);
}
}
impl<T: Into<String>> From<T> for Node {
fn from(value: T) -> Self {
Node::new(value)
}
}
/// Selects which texts [`Node::as_embeddables`] produces for a node.
#[derive(Copy, Debug, Default, Clone, Serialize, Deserialize, PartialEq)]
pub enum EmbedMode {
/// Produce a single [`EmbeddedField::Combined`] entry: the metadata lines
/// followed by the chunk. This is the default.
#[default]
SingleWithMetadata,
/// Produce an [`EmbeddedField::Chunk`] entry for the chunk plus one
/// [`EmbeddedField::Metadata`] entry per metadata item.
PerField,
/// Produce the entries of both `SingleWithMetadata` and `PerField`.
Both,
}
/// Identifies which text of a node an embedding was (or will be) computed
/// for; used as the key of `Node::vectors` and `Node::sparse_vectors`.
///
/// The `Display` output is what `field_name` and `sparse_field_name` build on.
#[derive(
Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash, strum_macros::Display, Debug,
)]
pub enum EmbeddedField {
/// The chunk combined with its metadata. This is the default; displays as `Combined`.
#[default]
Combined,
/// The chunk text on its own; displays as `Chunk`.
Chunk,
/// A single metadata entry, identified by its key; displays as `Metadata: <key>`.
#[strum(to_string = "Metadata: {0}")]
Metadata(String),
}
impl EmbeddedField {
    /// Name used for the sparse variant of this field: the regular field name
    /// with `_sparse` appended.
    pub fn sparse_field_name(&self) -> String {
        let mut name = self.to_string();
        name.push_str("_sparse");
        name
    }

    /// Name used for this field: its `Display` representation.
    pub fn field_name(&self) -> String {
        self.to_string()
    }
}
#[allow(clippy::from_over_into)]
impl Into<String> for EmbeddedField {
fn into(self) -> String {
self.to_string()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Checks the dense and sparse field names derived from each variant.
    #[test_case(&EmbeddedField::Combined, ["Combined", "Combined_sparse"])]
    #[test_case(&EmbeddedField::Chunk, ["Chunk", "Chunk_sparse"])]
    #[test_case(&EmbeddedField::Metadata("test".into()), ["Metadata: test", "Metadata: test_sparse"])]
    fn field_name_tests(embedded_field: &EmbeddedField, expected: [&str; 2]) {
        assert_eq!(embedded_field.field_name(), expected[0]);
        assert_eq!(embedded_field.sparse_field_name(), expected[1]);
    }

    /// Formatting a node with a long multi-byte chunk must not panic.
    #[test]
    fn test_debugging_node_with_utf8_char_boundary() {
        // Four-byte characters only: byte offset 100 is a char boundary.
        let node = Node::new("🦀".repeat(101));
        let _ = format!("{node:?}");

        // Mixed widths: "Jürgen" is 7 bytes, so byte offset 100 falls inside
        // the two-byte `ü`. The node must be rebound here; previously it was
        // created and dropped, and the crab node was formatted a second time.
        let node = Node::new("Jürgen".repeat(100));
        let _ = format!("{node:?}");
    }
}