use std::sync::Arc;
use chrono::{DateTime, Utc};
use entelix_memory::Namespace;
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct DocumentId(Arc<str>);
impl DocumentId {
    /// Creates an identifier from any value convertible into a `String`.
    ///
    /// # Panics
    ///
    /// Panics with `"DocumentId must not be empty"` when the converted string
    /// is empty.
    #[must_use]
    pub fn new(id: impl Into<String>) -> Self {
        let text = Into::<String>::into(id);
        assert!(!text.is_empty(), "DocumentId must not be empty");
        Self(Arc::<str>::from(text))
    }

    /// Borrows the identifier as a string slice.
    #[must_use]
    pub fn as_str(&self) -> &str {
        &*self.0
    }
}
impl std::fmt::Display for DocumentId {
    /// Writes the identifier text, honouring the formatter's width, fill,
    /// alignment and precision flags exactly as a plain `str` would.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` applies the formatting flags; `write_str` would silently
        // ignore them (e.g. `{:>10}` would produce no padding).
        f.pad(&self.0)
    }
}
impl From<String> for DocumentId {
fn from(s: String) -> Self {
Self::new(s)
}
}
impl From<&str> for DocumentId {
fn from(s: &str) -> Self {
Self::new(s)
}
}
/// Record of where a document's content came from.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal; use [`Source::now`] and [`Source::with_etag`] instead.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Source {
/// Location string supplied by the caller (the tests in this file use
/// `file://` and `https://` URIs).
pub uri: String,
/// Caller-supplied loader name; presumably identifies the component that
/// fetched the content — confirm against the loaders that set it.
pub loader: String,
/// UTC timestamp for the fetch; [`Source::now`] sets it to the current time.
pub fetched_at: DateTime<Utc>,
/// Optional entity tag; `None` unless set through [`Source::with_etag`].
pub etag: Option<String>,
}
impl Source {
    /// Builds a source stamped with the current UTC time and no entity tag.
    #[must_use]
    pub fn now(uri: impl Into<String>, loader: impl Into<String>) -> Self {
        let uri = uri.into();
        let loader = loader.into();
        let fetched_at = Utc::now();
        Self {
            uri,
            loader,
            fetched_at,
            etag: None,
        }
    }

    /// Returns the source with its entity tag replaced by `etag`; every other
    /// field is carried over unchanged.
    #[must_use]
    pub fn with_etag(self, etag: impl Into<String>) -> Self {
        Self {
            etag: Some(etag.into()),
            ..self
        }
    }
}
/// Describes how a chunk document was derived from a parent document.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal; use [`Lineage::from_split`].
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Lineage {
/// Identifier of the document the chunk was split from.
pub parent_id: DocumentId,
/// Index of this chunk; `Document::child` appends it to the parent ID to
/// form the child ID.
pub chunk_index: u32,
/// Caller-supplied chunk count; presumably the number of chunks the parent
/// was split into. Nothing in this file checks it against `chunk_index`.
pub total_chunks: u32,
/// Caller-supplied name of the splitter.
pub splitter: String,
/// Names recorded by [`Lineage::push_chunker`], in call order; starts empty
/// when built with [`Lineage::from_split`].
pub chunker_chain: Vec<String>,
}
impl Lineage {
    /// Creates the lineage for one chunk of `parent_id`, with an empty
    /// chunker chain.
    ///
    /// The index and count are stored as given; no range check is performed.
    #[must_use]
    pub fn from_split(
        parent_id: DocumentId,
        chunk_index: u32,
        total_chunks: u32,
        splitter: impl Into<String>,
    ) -> Self {
        let splitter = splitter.into();
        Self {
            parent_id,
            chunk_index,
            total_chunks,
            splitter,
            chunker_chain: vec![],
        }
    }

    /// Appends `chunker` to the end of the chunker chain.
    pub fn push_chunker(&mut self, chunker: impl Into<String>) {
        let name = chunker.into();
        self.chunker_chain.push(name);
    }
}
/// A piece of content together with its identity, metadata, provenance,
/// optional chunk lineage and namespace.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal; use [`Document::root`] or [`Document::child`].
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub struct Document {
/// Identifier of this document.
pub id: DocumentId,
/// The document text.
pub content: String,
/// Free-form JSON metadata. Omitted from serialized output when it is
/// `null`, and defaults to `null` when absent on deserialization.
#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
pub metadata: serde_json::Value,
/// Where the content came from.
pub source: Source,
/// Chunk lineage; `None` for documents built with [`Document::root`].
/// Omitted from serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub lineage: Option<Lineage>,
/// Namespace the document belongs to; children copy it from their parent.
pub namespace: Namespace,
}
impl Document {
    /// Builds a top-level document: metadata is JSON `null` and there is no
    /// lineage.
    ///
    /// # Panics
    ///
    /// May panic if converting `id` into a [`DocumentId`] panics (the `From`
    /// conversions in this file do so for an empty string).
    #[must_use]
    pub fn root(
        id: impl Into<DocumentId>,
        content: impl Into<String>,
        source: Source,
        namespace: Namespace,
    ) -> Self {
        let id = id.into();
        let content = content.into();
        Self {
            id,
            content,
            metadata: serde_json::Value::Null,
            source,
            lineage: None,
            namespace,
        }
    }

    /// Returns the document with its metadata replaced by `metadata`.
    #[must_use]
    pub fn with_metadata(self, metadata: serde_json::Value) -> Self {
        Self { metadata, ..self }
    }

    /// Derives a chunk document from this one.
    ///
    /// The child's ID is `"<this id>:<lineage.chunk_index>"`. Metadata, source
    /// and namespace are cloned from `self`; `content` and `lineage` come from
    /// the arguments.
    #[must_use]
    pub fn child(&self, content: impl Into<String>, lineage: Lineage) -> Self {
        let index = lineage.chunk_index;
        let id = DocumentId::new(format!("{}:{}", self.id, index));
        let content = content.into();
        Self {
            id,
            content,
            metadata: self.metadata.clone(),
            source: self.source.clone(),
            lineage: Some(lineage),
            namespace: self.namespace.clone(),
        }
    }
}
#[cfg(test)]
// Tests unwrap freely: a failed unwrap is simply a failed test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Namespace fixture for a fixed tenant.
    fn ns() -> Namespace {
        Namespace::new(entelix_core::TenantId::new("acme"))
    }

    /// Source fixture pointing at a local file URI.
    fn src() -> Source {
        Source::now("file:///tmp/doc.md", "test")
    }

    // `should_panic` is the harness-native way to expect a panic and also
    // pins the message, which `catch_unwind` + `is_err` did not check.
    #[test]
    #[should_panic(expected = "DocumentId must not be empty")]
    fn document_id_rejects_empty() {
        let _ = DocumentId::new("");
    }

    #[test]
    fn document_id_clone_shares_arc() {
        let id = DocumentId::new("doc-1");
        let cloned = id.clone();
        assert!(
            Arc::ptr_eq(&id.0, &cloned.0),
            "clone must share the original allocation"
        );
    }

    #[test]
    fn child_id_suffixes_with_chunk_index() {
        let root = Document::root("paper", "full text", src(), ns());
        let lineage = Lineage::from_split(root.id.clone(), 3, 10, "recursive");
        let child = root.child("slice", lineage);
        assert_eq!(child.id.as_str(), "paper:3");
        assert_eq!(child.lineage.as_ref().unwrap().chunk_index, 3);
        assert_eq!(child.lineage.as_ref().unwrap().total_chunks, 10);
        assert_eq!(child.source.uri, root.source.uri);
        assert_eq!(child.namespace, root.namespace);
    }

    #[test]
    fn lineage_push_chunker_records_chain_order() {
        let mut lineage = Lineage::from_split(DocumentId::new("d"), 0, 1, "recursive");
        lineage.push_chunker("contextual");
        lineage.push_chunker("hyde");
        assert_eq!(lineage.chunker_chain, vec!["contextual", "hyde"]);
    }

    #[test]
    fn source_with_etag_preserves_other_fields() {
        let s = Source::now("https://example.com/p", "web").with_etag("W/\"abc\"");
        assert_eq!(s.etag.as_deref(), Some("W/\"abc\""));
        assert_eq!(s.loader, "web");
    }

    #[test]
    fn document_round_trips_through_serde() {
        let doc = Document::root("d", "hello", src(), ns())
            .with_metadata(serde_json::json!({"locale": "en"}));
        let json = serde_json::to_string(&doc).unwrap();
        let back: Document = serde_json::from_str(&json).unwrap();
        assert_eq!(doc, back);
    }
}