Skip to main content

ucm_core/
id.rs

1//! Block ID generation with 96-bit collision resistance.
2//!
3//! Block IDs are deterministic, content-addressed identifiers derived from:
4//! - Content type
5//! - Semantic role (optional)
6//! - Normalized content
7//! - Namespace (optional, for multi-tenant scenarios)
8//!
9//! Using 96 bits of entropy ensures collision probability < 10⁻¹⁵ at 10M blocks.
10
11use crate::content::Content;
12use crate::error::{Error, ErrorCode, Result};
13use crate::normalize::normalize_content;
14use serde::{Deserialize, Serialize};
15use sha2::{Digest, Sha256};
16use std::fmt;
17use std::str::FromStr;
18
19/// Block identifier with 96 bits of entropy (12 bytes).
20///
21/// Format: `blk_<24 hex characters>`
22///
23/// # Example
24/// ```
25/// use ucm_core::BlockId;
26///
27/// let id = BlockId::from_bytes([0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
28///                               0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c]);
29/// assert!(id.to_string().starts_with("blk_"));
30/// assert_eq!(id.to_string().len(), 28); // "blk_" + 24 hex chars
31/// ```
32#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
33pub struct BlockId(#[serde(with = "hex_array")] pub [u8; 12]);
34
35impl BlockId {
36    /// Create a BlockId from raw bytes
37    pub fn from_bytes(bytes: [u8; 12]) -> Self {
38        Self(bytes)
39    }
40
41    /// Get the raw bytes
42    pub fn as_bytes(&self) -> &[u8; 12] {
43        &self.0
44    }
45
46    /// Generate a root block ID (all zeros with marker)
47    pub fn root() -> Self {
48        let mut bytes = [0u8; 12];
49        bytes[0] = 0xFF; // Marker for root
50        Self(bytes)
51    }
52
53    /// Check if this is a root block ID
54    pub fn is_root(&self) -> bool {
55        self.0[0] == 0xFF && self.0[1..].iter().all(|&b| b == 0)
56    }
57
58    /// Create a BlockId from hex string (12 hex chars = 6 bytes, padded to 12)
59    pub fn from_hex(s: &str) -> Result<Self> {
60        let bytes = hex::decode(s).map_err(|_| {
61            Error::new(
62                ErrorCode::E002InvalidBlockId,
63                format!("Invalid hex string: {}", s),
64            )
65        })?;
66        if bytes.len() > 12 {
67            return Err(Error::new(
68                ErrorCode::E002InvalidBlockId,
69                "Hex string too long",
70            ));
71        }
72        let mut arr = [0u8; 12];
73        let start = 12 - bytes.len();
74        arr[start..].copy_from_slice(&bytes);
75        Ok(Self(arr))
76    }
77}
78
79impl fmt::Debug for BlockId {
80    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81        write!(f, "BlockId({})", self)
82    }
83}
84
85impl fmt::Display for BlockId {
86    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
87        write!(f, "blk_{}", hex::encode(self.0))
88    }
89}
90
91impl FromStr for BlockId {
92    type Err = Error;
93
94    fn from_str(s: &str) -> Result<Self> {
95        let hex_part = s
96            .strip_prefix("blk_")
97            .ok_or_else(|| Error::InvalidBlockId(format!("missing 'blk_' prefix: {}", s)))?;
98
99        if hex_part.len() != 24 {
100            return Err(Error::InvalidBlockId(format!(
101                "expected 24 hex characters, got {}",
102                hex_part.len()
103            )));
104        }
105
106        let bytes = hex::decode(hex_part)
107            .map_err(|e| Error::InvalidBlockId(format!("invalid hex: {}", e)))?;
108
109        if bytes.len() != 12 {
110            return Err(Error::InvalidBlockId(format!(
111                "expected 12 bytes, got {}",
112                bytes.len()
113            )));
114        }
115
116        let mut arr = [0u8; 12];
117        arr.copy_from_slice(&bytes);
118        Ok(BlockId(arr))
119    }
120}
121
122/// Content hash (full SHA256)
123#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
124pub struct ContentHash(#[serde(with = "hex_array_32")] pub [u8; 32]);
125
126impl ContentHash {
127    pub fn from_bytes(bytes: [u8; 32]) -> Self {
128        Self(bytes)
129    }
130
131    pub fn as_bytes(&self) -> &[u8; 32] {
132        &self.0
133    }
134}
135
136impl fmt::Debug for ContentHash {
137    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138        write!(f, "ContentHash({})", hex::encode(&self.0[..8]))
139    }
140}
141
142impl fmt::Display for ContentHash {
143    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
144        write!(f, "{}", hex::encode(self.0))
145    }
146}
147
/// Configuration for ID generation.
///
/// The derived `Default` leaves the namespace unset and
/// `include_semantic_role` at `false`.
#[derive(Debug, Clone, Default)]
pub struct IdGeneratorConfig {
    /// Namespace for multi-tenant scenarios
    pub namespace: Option<String>,
    /// Whether to include semantic role in hash
    pub include_semantic_role: bool,
}

impl IdGeneratorConfig {
    /// Creates a configuration with default settings.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the configuration with its namespace set to `namespace`,
    /// leaving every other field as it was.
    pub fn with_namespace(self, namespace: impl Into<String>) -> Self {
        IdGeneratorConfig {
            namespace: Some(namespace.into()),
            ..self
        }
    }

    /// Returns the configuration with `include_semantic_role` set to
    /// `include`, leaving every other field as it was.
    pub fn with_semantic_role(self, include: bool) -> Self {
        IdGeneratorConfig {
            include_semantic_role: include,
            ..self
        }
    }
}
172
173/// ID generator with configurable options
174#[derive(Debug, Clone)]
175pub struct IdGenerator {
176    config: IdGeneratorConfig,
177}
178
179impl IdGenerator {
180    pub fn new(config: IdGeneratorConfig) -> Self {
181        Self { config }
182    }
183
184    pub fn with_defaults() -> Self {
185        Self::new(IdGeneratorConfig::default())
186    }
187
188    /// Generate a block ID from content
189    pub fn generate(&self, content: &Content, semantic_role: Option<&str>) -> BlockId {
190        generate_block_id(content, semantic_role, self.config.namespace.as_deref())
191    }
192
193    /// Generate a content hash
194    pub fn content_hash(&self, content: &Content) -> ContentHash {
195        compute_content_hash(content)
196    }
197}
198
199impl Default for IdGenerator {
200    fn default() -> Self {
201        Self::with_defaults()
202    }
203}
204
205/// Generate a deterministic block ID from content.
206///
207/// The ID is derived from:
208/// 1. Optional namespace (for multi-tenant isolation)
209/// 2. Content type discriminant
210/// 3. Optional semantic role
211/// 4. Normalized content
212///
213/// # Arguments
214/// * `content` - The block content
215/// * `semantic_role` - Optional semantic role (e.g., "intro.hook")
216/// * `namespace` - Optional namespace for multi-tenant scenarios
217///
218/// # Example
219/// ```
220/// use ucm_core::Content;
221/// use ucm_core::id::generate_block_id;
222///
223/// let content = Content::text("Hello, world!");
224///
225/// let id1 = generate_block_id(&content, Some("intro"), None);
226/// let id2 = generate_block_id(&content, Some("intro"), None);
227/// assert_eq!(id1, id2); // Deterministic
228///
229/// let id3 = generate_block_id(&content, Some("conclusion"), None);
230/// assert_ne!(id1, id3); // Different role = different ID
231/// ```
232pub fn generate_block_id(
233    content: &Content,
234    semantic_role: Option<&str>,
235    namespace: Option<&str>,
236) -> BlockId {
237    let mut hasher = Sha256::new();
238
239    // Add namespace if present
240    if let Some(ns) = namespace {
241        hasher.update(ns.as_bytes());
242        hasher.update(b":");
243    }
244
245    // Add content type discriminant
246    hasher.update(content.type_tag().as_bytes());
247    hasher.update(b":");
248
249    // Add semantic role
250    if let Some(role) = semantic_role {
251        hasher.update(role.as_bytes());
252    }
253    hasher.update(b":");
254
255    // Add normalized content
256    let normalized = normalize_content(content);
257    hasher.update(normalized.as_bytes());
258
259    // Extract 96 bits (12 bytes) from the 256-bit hash
260    let hash = hasher.finalize();
261    let mut id_bytes = [0u8; 12];
262    id_bytes.copy_from_slice(&hash[..12]);
263
264    BlockId(id_bytes)
265}
266
267/// Compute the full content hash (SHA256)
268pub fn compute_content_hash(content: &Content) -> ContentHash {
269    let mut hasher = Sha256::new();
270    let normalized = normalize_content(content);
271    hasher.update(normalized.as_bytes());
272    let hash = hasher.finalize();
273    let mut hash_bytes = [0u8; 32];
274    hash_bytes.copy_from_slice(&hash);
275    ContentHash(hash_bytes)
276}
277
278// Serde helpers for hex encoding
279mod hex_array {
280    use serde::{self, Deserialize, Deserializer, Serializer};
281
282    pub fn serialize<S>(bytes: &[u8; 12], serializer: S) -> Result<S::Ok, S::Error>
283    where
284        S: Serializer,
285    {
286        serializer.serialize_str(&hex::encode(bytes))
287    }
288
289    pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 12], D::Error>
290    where
291        D: Deserializer<'de>,
292    {
293        let s = String::deserialize(deserializer)?;
294        let bytes = hex::decode(&s).map_err(serde::de::Error::custom)?;
295        if bytes.len() != 12 {
296            return Err(serde::de::Error::custom("expected 12 bytes"));
297        }
298        let mut arr = [0u8; 12];
299        arr.copy_from_slice(&bytes);
300        Ok(arr)
301    }
302}
303
304mod hex_array_32 {
305    use serde::{self, Deserialize, Deserializer, Serializer};
306
307    pub fn serialize<S>(bytes: &[u8; 32], serializer: S) -> Result<S::Ok, S::Error>
308    where
309        S: Serializer,
310    {
311        serializer.serialize_str(&hex::encode(bytes))
312    }
313
314    pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 32], D::Error>
315    where
316        D: Deserializer<'de>,
317    {
318        let s = String::deserialize(deserializer)?;
319        let bytes = hex::decode(&s).map_err(serde::de::Error::custom)?;
320        if bytes.len() != 32 {
321            return Err(serde::de::Error::custom("expected 32 bytes"));
322        }
323        let mut arr = [0u8; 32];
324        arr.copy_from_slice(&bytes);
325        Ok(arr)
326    }
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Byte pattern shared by the formatting and parsing tests.
    const SAMPLE_BYTES: [u8; 12] = [
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
    ];
    /// Canonical text form of `SAMPLE_BYTES`.
    const SAMPLE_TEXT: &str = "blk_0102030405060708090a0b0c";

    /// `Display` emits the `blk_` prefix followed by lowercase hex.
    #[test]
    fn test_block_id_display() {
        let rendered = BlockId::from_bytes(SAMPLE_BYTES).to_string();
        assert_eq!(rendered, SAMPLE_TEXT);
    }

    /// Parsing and re-rendering a canonical ID is lossless.
    #[test]
    fn test_block_id_parse() {
        let parsed: BlockId = SAMPLE_TEXT.parse().unwrap();
        assert_eq!(parsed.to_string(), SAMPLE_TEXT);
    }

    /// Malformed inputs are rejected by `FromStr`.
    #[test]
    fn test_block_id_parse_invalid() {
        let rejected = [
            "invalid",     // no `blk_` prefix
            "blk_invalid", // prefix present, payload is not a valid ID
            "blk_0102",    // Too short
        ];
        for candidate in rejected.iter() {
            assert!(candidate.parse::<BlockId>().is_err());
        }
    }

    /// Identical inputs always hash to the same ID.
    #[test]
    fn test_deterministic_id_generation() {
        let content = Content::text("Hello, world!");
        let make = || generate_block_id(&content, Some("intro"), None);

        assert_eq!(make(), make());
    }

    /// Changing only the semantic role changes the ID.
    #[test]
    fn test_different_role_different_id() {
        let content = Content::text("Hello, world!");
        let id_for = |role| generate_block_id(&content, Some(role), None);

        assert_ne!(id_for("intro"), id_for("conclusion"));
    }

    /// Changing only the namespace changes the ID.
    #[test]
    fn test_namespace_isolation() {
        let content = Content::text("Hello");
        let id_in = |tenant| generate_block_id(&content, None, Some(tenant));

        assert_ne!(id_in("tenant-a"), id_in("tenant-b"));
    }

    /// `root()` is recognised by `is_root()`; an arbitrary ID is not.
    #[test]
    fn test_root_block_id() {
        assert!(BlockId::root().is_root());

        let ordinary = BlockId::from_bytes([0x01; 12]);
        assert!(!ordinary.is_root());
    }

    /// Hashing the same content twice gives equal hashes.
    #[test]
    fn test_content_hash() {
        let content = Content::text("Hello");

        let first = compute_content_hash(&content);
        let second = compute_content_hash(&content);
        assert_eq!(first, second);
    }

    /// A namespaced generator produces a non-root ID.
    #[test]
    fn test_id_generator() {
        let config = IdGeneratorConfig::new().with_namespace("test");
        let generator = IdGenerator::new(config);
        let content = Content::text("Hello");

        let id = generator.generate(&content, Some("intro"));
        assert!(!id.is_root());
    }
}