// ucm_core/metadata.rs

//! Block metadata for search, display, and LLM optimization.
2
3use crate::content::Content;
4use crate::id::ContentHash;
5use crate::normalize::is_cjk_character;
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::error::Error as StdError;
10use std::fmt;
11use std::str::FromStr;
12
13/// Block metadata
14#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
15pub struct BlockMetadata {
16    /// Semantic role in document structure
17    #[serde(skip_serializing_if = "Option::is_none")]
18    pub semantic_role: Option<SemanticRole>,
19
20    /// Human-readable label
21    #[serde(skip_serializing_if = "Option::is_none")]
22    pub label: Option<String>,
23
24    /// Searchable tags
25    #[serde(default, skip_serializing_if = "Vec::is_empty")]
26    pub tags: Vec<String>,
27
28    /// Pre-computed summary for folding/context management
29    #[serde(skip_serializing_if = "Option::is_none")]
30    pub summary: Option<String>,
31
32    /// Estimated token count (computed lazily)
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub token_estimate: Option<TokenEstimate>,
35
36    /// Content hash for change detection
37    pub content_hash: ContentHash,
38
39    /// Creation timestamp
40    pub created_at: DateTime<Utc>,
41
42    /// Last modification timestamp
43    pub modified_at: DateTime<Utc>,
44
45    /// Custom key-value metadata
46    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
47    pub custom: HashMap<String, serde_json::Value>,
48}
49
50impl BlockMetadata {
51    /// Create new metadata with current timestamp
52    pub fn new(content_hash: ContentHash) -> Self {
53        let now = Utc::now();
54        Self {
55            semantic_role: None,
56            label: None,
57            tags: Vec::new(),
58            summary: None,
59            token_estimate: None,
60            content_hash,
61            created_at: now,
62            modified_at: now,
63            custom: HashMap::new(),
64        }
65    }
66
67    /// Set semantic role
68    pub fn with_role(mut self, role: SemanticRole) -> Self {
69        self.semantic_role = Some(role);
70        self
71    }
72
73    /// Set label
74    pub fn with_label(mut self, label: impl Into<String>) -> Self {
75        self.label = Some(label.into());
76        self
77    }
78
79    /// Add a tag
80    pub fn with_tag(mut self, tag: impl Into<String>) -> Self {
81        self.tags.push(tag.into());
82        self
83    }
84
85    /// Add multiple tags
86    pub fn with_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
87        self.tags.extend(tags.into_iter().map(|t| t.into()));
88        self
89    }
90
91    /// Set summary
92    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
93        self.summary = Some(summary.into());
94        self
95    }
96
97    /// Set custom metadata
98    pub fn with_custom(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
99        self.custom.insert(key.into(), value);
100        self
101    }
102
103    /// Update modification timestamp
104    pub fn touch(&mut self) {
105        self.modified_at = Utc::now();
106    }
107
108    /// Check if block has a specific tag
109    pub fn has_tag(&self, tag: &str) -> bool {
110        self.tags.iter().any(|t| t == tag)
111    }
112}
113
114impl Default for BlockMetadata {
115    fn default() -> Self {
116        Self::new(ContentHash::from_bytes([0u8; 32]))
117    }
118}
119
120/// Semantic role in document structure
121#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
122pub struct SemanticRole {
123    /// Primary category
124    pub category: RoleCategory,
125    /// Subcategory (optional)
126    #[serde(skip_serializing_if = "Option::is_none")]
127    pub subcategory: Option<String>,
128    /// Custom qualifier
129    #[serde(skip_serializing_if = "Option::is_none")]
130    pub qualifier: Option<String>,
131}
132
133impl SemanticRole {
134    pub fn new(category: RoleCategory) -> Self {
135        Self {
136            category,
137            subcategory: None,
138            qualifier: None,
139        }
140    }
141
142    pub fn with_subcategory(mut self, sub: impl Into<String>) -> Self {
143        self.subcategory = Some(sub.into());
144        self
145    }
146
147    pub fn with_qualifier(mut self, qual: impl Into<String>) -> Self {
148        self.qualifier = Some(qual.into());
149        self
150    }
151
152    /// Parse from string format (e.g., "intro.hook")
153    pub fn parse(s: &str) -> Option<Self> {
154        let parts: Vec<&str> = s.split('.').collect();
155        if parts.is_empty() {
156            return None;
157        }
158
159        let category = RoleCategory::from_str(parts[0]).ok()?;
160        let subcategory = parts.get(1).map(|s| s.to_string());
161        let qualifier = parts.get(2).map(|s| s.to_string());
162
163        Some(Self {
164            category,
165            subcategory,
166            qualifier,
167        })
168    }
169}
170
171impl std::fmt::Display for SemanticRole {
172    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173        write!(f, "{}", self.category.as_str())?;
174        if let Some(ref sub) = self.subcategory {
175            write!(f, ".{}", sub)?;
176        }
177        if let Some(ref qual) = self.qualifier {
178            write!(f, ".{}", qual)?;
179        }
180        Ok(())
181    }
182}
183
184/// Semantic role categories
185#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
186#[serde(rename_all = "snake_case")]
187pub enum RoleCategory {
188    // Document structure
189    Title,
190    Subtitle,
191    Abstract,
192    TableOfContents,
193
194    // Headings (H1-H6)
195    Heading1,
196    Heading2,
197    Heading3,
198    Heading4,
199    Heading5,
200    Heading6,
201
202    // Paragraphs and lists
203    Paragraph,
204    List,
205
206    // Introduction
207    Intro,
208    IntroHook,
209    IntroContext,
210    IntroThesis,
211
212    // Body
213    Body,
214    BodyArgument,
215    BodyEvidence,
216    BodyExample,
217    BodyCounterargument,
218    BodyTransition,
219
220    // Conclusion
221    Conclusion,
222    ConclusionSummary,
223    ConclusionImplication,
224    ConclusionCallToAction,
225
226    // Special sections
227    Sidebar,
228    Callout,
229    Warning,
230    Note,
231    Quote,
232
233    // Technical
234    Definition,
235    Theorem,
236    Proof,
237    Algorithm,
238    Code,
239
240    // Meta
241    Metadata,
242    Citation,
243    Footnote,
244    Appendix,
245    Reference,
246
247    // Custom
248    Custom,
249}
250
/// Error produced when a string does not name any role category.
///
/// The wrapped `String` is the text that failed to parse.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RoleCategoryParseError(pub String);

impl fmt::Display for RoleCategoryParseError {
    /// Writes `unknown role category '<input>'`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let RoleCategoryParseError(rejected) = self;
        f.write_str("unknown role category '")?;
        f.write_str(rejected)?;
        f.write_str("'")
    }
}

impl StdError for RoleCategoryParseError {}
261
262impl RoleCategory {
263    pub fn as_str(&self) -> &'static str {
264        match self {
265            Self::Title => "title",
266            Self::Subtitle => "subtitle",
267            Self::Abstract => "abstract",
268            Self::TableOfContents => "toc",
269            Self::Heading1 => "heading1",
270            Self::Heading2 => "heading2",
271            Self::Heading3 => "heading3",
272            Self::Heading4 => "heading4",
273            Self::Heading5 => "heading5",
274            Self::Heading6 => "heading6",
275            Self::Paragraph => "paragraph",
276            Self::List => "list",
277            Self::Intro => "intro",
278            Self::IntroHook => "intro_hook",
279            Self::IntroContext => "intro_context",
280            Self::IntroThesis => "intro_thesis",
281            Self::Body => "body",
282            Self::BodyArgument => "body_argument",
283            Self::BodyEvidence => "body_evidence",
284            Self::BodyExample => "body_example",
285            Self::BodyCounterargument => "body_counterargument",
286            Self::BodyTransition => "body_transition",
287            Self::Conclusion => "conclusion",
288            Self::ConclusionSummary => "conclusion_summary",
289            Self::ConclusionImplication => "conclusion_implication",
290            Self::ConclusionCallToAction => "conclusion_cta",
291            Self::Sidebar => "sidebar",
292            Self::Callout => "callout",
293            Self::Warning => "warning",
294            Self::Note => "note",
295            Self::Quote => "quote",
296            Self::Definition => "definition",
297            Self::Theorem => "theorem",
298            Self::Proof => "proof",
299            Self::Algorithm => "algorithm",
300            Self::Code => "code",
301            Self::Metadata => "metadata",
302            Self::Citation => "citation",
303            Self::Footnote => "footnote",
304            Self::Appendix => "appendix",
305            Self::Reference => "reference",
306            Self::Custom => "custom",
307        }
308    }
309}
310
311impl FromStr for RoleCategory {
312    type Err = RoleCategoryParseError;
313
314    fn from_str(s: &str) -> Result<Self, Self::Err> {
315        match s.to_lowercase().as_str() {
316            "title" => Ok(Self::Title),
317            "subtitle" => Ok(Self::Subtitle),
318            "abstract" => Ok(Self::Abstract),
319            "toc" | "table_of_contents" => Ok(Self::TableOfContents),
320            "heading1" | "h1" => Ok(Self::Heading1),
321            "heading2" | "h2" => Ok(Self::Heading2),
322            "heading3" | "h3" => Ok(Self::Heading3),
323            "heading4" | "h4" => Ok(Self::Heading4),
324            "heading5" | "h5" => Ok(Self::Heading5),
325            "heading6" | "h6" => Ok(Self::Heading6),
326            "paragraph" | "para" | "p" => Ok(Self::Paragraph),
327            "list" | "ul" | "ol" => Ok(Self::List),
328            "intro" | "introduction" => Ok(Self::Intro),
329            "intro_hook" | "hook" => Ok(Self::IntroHook),
330            "intro_context" | "context" => Ok(Self::IntroContext),
331            "intro_thesis" | "thesis" => Ok(Self::IntroThesis),
332            "body" => Ok(Self::Body),
333            "body_argument" | "argument" => Ok(Self::BodyArgument),
334            "body_evidence" | "evidence" => Ok(Self::BodyEvidence),
335            "body_example" | "example" => Ok(Self::BodyExample),
336            "body_counterargument" | "counterargument" => Ok(Self::BodyCounterargument),
337            "body_transition" | "transition" => Ok(Self::BodyTransition),
338            "conclusion" => Ok(Self::Conclusion),
339            "conclusion_summary" | "summary" => Ok(Self::ConclusionSummary),
340            "conclusion_implication" | "implication" => Ok(Self::ConclusionImplication),
341            "conclusion_cta" | "cta" | "call_to_action" => Ok(Self::ConclusionCallToAction),
342            "sidebar" => Ok(Self::Sidebar),
343            "callout" => Ok(Self::Callout),
344            "warning" => Ok(Self::Warning),
345            "note" => Ok(Self::Note),
346            "quote" | "blockquote" => Ok(Self::Quote),
347            "definition" => Ok(Self::Definition),
348            "theorem" => Ok(Self::Theorem),
349            "proof" => Ok(Self::Proof),
350            "algorithm" => Ok(Self::Algorithm),
351            "code" => Ok(Self::Code),
352            "metadata" | "meta" => Ok(Self::Metadata),
353            "citation" | "cite" => Ok(Self::Citation),
354            "footnote" => Ok(Self::Footnote),
355            "appendix" => Ok(Self::Appendix),
356            "reference" | "ref" => Ok(Self::Reference),
357            "custom" => Ok(Self::Custom),
358            _ => Err(RoleCategoryParseError(s.to_string())),
359        }
360    }
361}
362
363/// Token estimation with model awareness
364#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
365pub struct TokenEstimate {
366    /// Estimated tokens for GPT-4 tokenizer
367    pub gpt4: u32,
368    /// Estimated tokens for Claude tokenizer
369    pub claude: u32,
370    /// Estimated tokens for Llama tokenizer
371    pub llama: u32,
372    /// Generic estimate (average)
373    pub generic: u32,
374}
375
376impl TokenEstimate {
377    /// Create a new token estimate with all models
378    pub fn new(gpt4: u32, claude: u32, llama: u32) -> Self {
379        let generic = (gpt4 + claude + llama) / 3;
380        Self {
381            gpt4,
382            claude,
383            llama,
384            generic,
385        }
386    }
387
388    /// Compute token estimate from content
389    pub fn compute(content: &Content) -> Self {
390        match content {
391            Content::Text(text) => Self::estimate_text(&text.text),
392            Content::Code(code) => Self::estimate_code(&code.source, &code.language),
393            Content::Table(table) => Self::estimate_table(&table.columns, &table.rows),
394            Content::Json { value, .. } => Self::estimate_json(value),
395            Content::Math(math) => Self::estimate_text(&math.expression),
396            _ => Self::default_estimate(),
397        }
398    }
399
400    /// Get estimate for a specific model
401    pub fn for_model(&self, model: TokenModel) -> u32 {
402        match model {
403            TokenModel::GPT4 => self.gpt4,
404            TokenModel::Claude => self.claude,
405            TokenModel::Llama => self.llama,
406            TokenModel::Generic => self.generic,
407        }
408    }
409
410    fn estimate_text(text: &str) -> Self {
411        let char_count = text.chars().count();
412        let word_count = text.split_whitespace().count();
413
414        // Detect script type for better estimation
415        let cjk_count = text.chars().filter(|c| is_cjk_character(*c)).count();
416        let cjk_ratio = cjk_count as f32 / char_count.max(1) as f32;
417
418        // CJK characters are ~1-2 tokens each, Latin ~4 chars per token
419        let base_estimate = if cjk_ratio > 0.5 {
420            (char_count as f32 * 1.5) as u32
421        } else {
422            (word_count as f32 * 1.3 + char_count as f32 / 4.0) as u32 / 2
423        };
424
425        Self {
426            gpt4: base_estimate,
427            claude: (base_estimate as f32 * 1.1) as u32,
428            llama: (base_estimate as f32 * 0.95) as u32,
429            generic: base_estimate,
430        }
431    }
432
433    fn estimate_code(source: &str, language: &str) -> Self {
434        let line_count = source.lines().count();
435        let char_count = source.len();
436
437        // Code typically has more tokens due to punctuation
438        let base = (char_count / 3 + line_count * 2) as u32;
439
440        // Language-specific adjustments
441        let factor = match language.to_lowercase().as_str() {
442            "rust" | "cpp" | "c" | "c++" => 1.2,
443            "python" => 0.9,
444            "javascript" | "typescript" | "js" | "ts" => 1.1,
445            "go" | "golang" => 1.0,
446            "java" => 1.15,
447            _ => 1.0,
448        };
449
450        let adjusted = (base as f32 * factor) as u32;
451
452        Self {
453            gpt4: adjusted,
454            claude: (adjusted as f32 * 1.05) as u32,
455            llama: (adjusted as f32 * 0.95) as u32,
456            generic: adjusted,
457        }
458    }
459
460    fn estimate_table(columns: &[crate::content::Column], rows: &[crate::content::Row]) -> Self {
461        let cell_count = columns.len() * rows.len();
462        let header_tokens = columns.len() * 5; // ~5 tokens per header
463        let cell_tokens = cell_count * 3; // ~3 tokens per cell average
464        let structure_tokens = rows.len() * 2; // Row separators
465
466        let total = (header_tokens + cell_tokens + structure_tokens) as u32;
467
468        Self {
469            gpt4: total,
470            claude: (total as f32 * 1.1) as u32,
471            llama: total,
472            generic: total,
473        }
474    }
475
476    fn estimate_json(value: &serde_json::Value) -> Self {
477        let serialized = serde_json::to_string(value).unwrap_or_default();
478        Self::estimate_text(&serialized)
479    }
480
481    fn default_estimate() -> Self {
482        Self {
483            gpt4: 100,
484            claude: 110,
485            llama: 95,
486            generic: 100,
487        }
488    }
489}
490
491impl Default for TokenEstimate {
492    fn default() -> Self {
493        Self::default_estimate()
494    }
495}
496
/// Selects which field of a token estimate to read.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenModel {
    /// The GPT-4 tokenizer estimate.
    GPT4,
    /// The Claude tokenizer estimate.
    Claude,
    /// The Llama tokenizer estimate.
    Llama,
    /// The model-agnostic estimate.
    Generic,
}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Every category, so the string conversions can be checked exhaustively.
    const ALL_CATEGORIES: &[RoleCategory] = &[
        RoleCategory::Title,
        RoleCategory::Subtitle,
        RoleCategory::Abstract,
        RoleCategory::TableOfContents,
        RoleCategory::Heading1,
        RoleCategory::Heading2,
        RoleCategory::Heading3,
        RoleCategory::Heading4,
        RoleCategory::Heading5,
        RoleCategory::Heading6,
        RoleCategory::Paragraph,
        RoleCategory::List,
        RoleCategory::Intro,
        RoleCategory::IntroHook,
        RoleCategory::IntroContext,
        RoleCategory::IntroThesis,
        RoleCategory::Body,
        RoleCategory::BodyArgument,
        RoleCategory::BodyEvidence,
        RoleCategory::BodyExample,
        RoleCategory::BodyCounterargument,
        RoleCategory::BodyTransition,
        RoleCategory::Conclusion,
        RoleCategory::ConclusionSummary,
        RoleCategory::ConclusionImplication,
        RoleCategory::ConclusionCallToAction,
        RoleCategory::Sidebar,
        RoleCategory::Callout,
        RoleCategory::Warning,
        RoleCategory::Note,
        RoleCategory::Quote,
        RoleCategory::Definition,
        RoleCategory::Theorem,
        RoleCategory::Proof,
        RoleCategory::Algorithm,
        RoleCategory::Code,
        RoleCategory::Metadata,
        RoleCategory::Citation,
        RoleCategory::Footnote,
        RoleCategory::Appendix,
        RoleCategory::Reference,
        RoleCategory::Custom,
    ];

    #[test]
    fn test_semantic_role_parse() {
        let role = SemanticRole::parse("intro.hook").unwrap();
        assert_eq!(role.category, RoleCategory::Intro);
        assert_eq!(role.subcategory, Some("hook".to_string()));
        assert_eq!(role.qualifier, None);

        let qualified = SemanticRole::parse("intro.hook.v2").unwrap();
        assert_eq!(qualified.qualifier, Some("v2".to_string()));

        // An unknown leading segment means the whole role is rejected.
        assert_eq!(SemanticRole::parse("bogus.hook"), None);
    }

    #[test]
    fn test_semantic_role_display() {
        let role = SemanticRole::new(RoleCategory::Intro)
            .with_subcategory("hook")
            .with_qualifier("v2");
        assert_eq!(role.to_string(), "intro.hook.v2");

        // The rendered form parses back to the same role.
        assert_eq!(SemanticRole::parse(&role.to_string()), Some(role));
    }

    #[test]
    fn test_role_category_roundtrip() {
        // Guards against `as_str` and `from_str` drifting apart for any variant.
        for &category in ALL_CATEGORIES {
            let parsed = RoleCategory::from_str(category.as_str());
            assert_eq!(parsed, Ok(category), "round-trip failed for {:?}", category);
        }
    }

    #[test]
    fn test_role_category_aliases_and_errors() {
        assert_eq!(RoleCategory::from_str("H1"), Ok(RoleCategory::Heading1));
        assert_eq!(
            RoleCategory::from_str("table_of_contents"),
            Ok(RoleCategory::TableOfContents)
        );
        // The error keeps the input exactly as it was given.
        assert_eq!(
            RoleCategory::from_str("Bogus"),
            Err(RoleCategoryParseError("Bogus".to_string()))
        );
    }

    #[test]
    fn test_token_estimate_text() {
        let estimate = TokenEstimate::estimate_text("Hello, world! This is a test.");
        assert!(estimate.gpt4 > 0);
        assert!(estimate.claude > 0);
    }

    #[test]
    fn test_token_estimate_cjk() {
        let text = "你好世界";
        let estimate = TokenEstimate::estimate_text(text);
        // CJK text is counted at more than one token per character.
        assert!(estimate.gpt4 as usize >= text.chars().count());
    }

    #[test]
    fn test_metadata_builder() {
        let hash = ContentHash::from_bytes([1u8; 32]);
        let metadata = BlockMetadata::new(hash)
            .with_label("Test Block")
            .with_tags(["important", "draft"])
            .with_role(SemanticRole::new(RoleCategory::Intro));

        assert_eq!(metadata.label, Some("Test Block".to_string()));
        assert!(metadata.has_tag("important"));
        assert!(metadata.has_tag("draft"));
        assert!(!metadata.has_tag("missing"));
        assert_eq!(
            metadata.semantic_role,
            Some(SemanticRole::new(RoleCategory::Intro))
        );
        assert_eq!(metadata.created_at, metadata.modified_at);
    }
}