1use crate::content::Content;
4use crate::id::ContentHash;
5use crate::normalize::is_cjk_character;
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::error::Error as StdError;
10use std::fmt;
11use std::str::FromStr;
12
/// Metadata record carried alongside a block's content.
///
/// Optional fields that are `None` and collections that are empty are left
/// out of the serialized form; `tags` and `custom` fall back to empty
/// collections when absent on deserialization.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BlockMetadata {
    /// Semantic role assigned to the block, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub semantic_role: Option<SemanticRole>,

    /// Optional label for the block.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub label: Option<String>,

    /// Free-form tags; duplicates are not prevented by this type.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tags: Vec<String>,

    /// Optional summary text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub summary: Option<String>,

    /// Optional token-count estimate for the block.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_estimate: Option<TokenEstimate>,

    /// Content hash supplied when the metadata is constructed.
    pub content_hash: ContentHash,

    /// Creation timestamp (UTC).
    pub created_at: DateTime<Utc>,

    /// Last-modification timestamp (UTC).
    pub modified_at: DateTime<Utc>,

    /// Arbitrary JSON values keyed by string.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub custom: HashMap<String, serde_json::Value>,
}
49
50impl BlockMetadata {
51 pub fn new(content_hash: ContentHash) -> Self {
53 let now = Utc::now();
54 Self {
55 semantic_role: None,
56 label: None,
57 tags: Vec::new(),
58 summary: None,
59 token_estimate: None,
60 content_hash,
61 created_at: now,
62 modified_at: now,
63 custom: HashMap::new(),
64 }
65 }
66
67 pub fn with_role(mut self, role: SemanticRole) -> Self {
69 self.semantic_role = Some(role);
70 self
71 }
72
73 pub fn with_label(mut self, label: impl Into<String>) -> Self {
75 self.label = Some(label.into());
76 self
77 }
78
79 pub fn with_tag(mut self, tag: impl Into<String>) -> Self {
81 self.tags.push(tag.into());
82 self
83 }
84
85 pub fn with_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
87 self.tags.extend(tags.into_iter().map(|t| t.into()));
88 self
89 }
90
91 pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
93 self.summary = Some(summary.into());
94 self
95 }
96
97 pub fn with_custom(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
99 self.custom.insert(key.into(), value);
100 self
101 }
102
103 pub fn touch(&mut self) {
105 self.modified_at = Utc::now();
106 }
107
108 pub fn has_tag(&self, tag: &str) -> bool {
110 self.tags.iter().any(|t| t == tag)
111 }
112}
113
114impl Default for BlockMetadata {
115 fn default() -> Self {
116 Self::new(ContentHash::from_bytes([0u8; 32]))
117 }
118}
119
/// A role made of a category plus up to two optional refinements.
///
/// Its `Display` form is `category[.subcategory][.qualifier]`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SemanticRole {
    /// Top-level category of the role.
    pub category: RoleCategory,
    /// Optional second-level refinement; omitted from serialization when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub subcategory: Option<String>,
    /// Optional third-level refinement; omitted from serialization when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub qualifier: Option<String>,
}
132
133impl SemanticRole {
134 pub fn new(category: RoleCategory) -> Self {
135 Self {
136 category,
137 subcategory: None,
138 qualifier: None,
139 }
140 }
141
142 pub fn with_subcategory(mut self, sub: impl Into<String>) -> Self {
143 self.subcategory = Some(sub.into());
144 self
145 }
146
147 pub fn with_qualifier(mut self, qual: impl Into<String>) -> Self {
148 self.qualifier = Some(qual.into());
149 self
150 }
151
152 pub fn parse(s: &str) -> Option<Self> {
154 let parts: Vec<&str> = s.split('.').collect();
155 if parts.is_empty() {
156 return None;
157 }
158
159 let category = RoleCategory::from_str(parts[0]).ok()?;
160 let subcategory = parts.get(1).map(|s| s.to_string());
161 let qualifier = parts.get(2).map(|s| s.to_string());
162
163 Some(Self {
164 category,
165 subcategory,
166 qualifier,
167 })
168 }
169}
170
171impl std::fmt::Display for SemanticRole {
172 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
173 write!(f, "{}", self.category.as_str())?;
174 if let Some(ref sub) = self.subcategory {
175 write!(f, ".{}", sub)?;
176 }
177 if let Some(ref qual) = self.qualifier {
178 write!(f, ".{}", qual)?;
179 }
180 Ok(())
181 }
182}
183
/// Top-level category of a [`SemanticRole`].
///
/// Serialized by serde using the snake_case form of each variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RoleCategory {
    // Document-level structure.
    Title,
    Subtitle,
    Abstract,
    TableOfContents,

    // Heading levels.
    Heading1,
    Heading2,
    Heading3,
    Heading4,
    Heading5,
    Heading6,

    // Generic content.
    Paragraph,
    List,

    // Introduction and its parts.
    Intro,
    IntroHook,
    IntroContext,
    IntroThesis,

    // Body and its parts.
    Body,
    BodyArgument,
    BodyEvidence,
    BodyExample,
    BodyCounterargument,
    BodyTransition,

    // Conclusion and its parts.
    Conclusion,
    ConclusionSummary,
    ConclusionImplication,
    ConclusionCallToAction,

    // Asides.
    Sidebar,
    Callout,
    Warning,
    Note,
    Quote,

    // Technical content.
    Definition,
    Theorem,
    Proof,
    Algorithm,
    Code,

    // Supporting material.
    Metadata,
    Citation,
    Footnote,
    Appendix,
    Reference,

    // Catch-all category.
    Custom,
}
250
/// Error produced when a string does not name a known role category.
///
/// The wrapped `String` is echoed verbatim in the `Display` message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RoleCategoryParseError(pub String);

impl fmt::Display for RoleCategoryParseError {
    /// Formats as `unknown role category '<input>'`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(rejected) = self;
        f.write_fmt(format_args!("unknown role category '{}'", rejected))
    }
}

// No underlying cause to report, so the default `Error` methods suffice.
impl StdError for RoleCategoryParseError {}
261
262impl RoleCategory {
263 pub fn as_str(&self) -> &'static str {
264 match self {
265 Self::Title => "title",
266 Self::Subtitle => "subtitle",
267 Self::Abstract => "abstract",
268 Self::TableOfContents => "toc",
269 Self::Heading1 => "heading1",
270 Self::Heading2 => "heading2",
271 Self::Heading3 => "heading3",
272 Self::Heading4 => "heading4",
273 Self::Heading5 => "heading5",
274 Self::Heading6 => "heading6",
275 Self::Paragraph => "paragraph",
276 Self::List => "list",
277 Self::Intro => "intro",
278 Self::IntroHook => "intro_hook",
279 Self::IntroContext => "intro_context",
280 Self::IntroThesis => "intro_thesis",
281 Self::Body => "body",
282 Self::BodyArgument => "body_argument",
283 Self::BodyEvidence => "body_evidence",
284 Self::BodyExample => "body_example",
285 Self::BodyCounterargument => "body_counterargument",
286 Self::BodyTransition => "body_transition",
287 Self::Conclusion => "conclusion",
288 Self::ConclusionSummary => "conclusion_summary",
289 Self::ConclusionImplication => "conclusion_implication",
290 Self::ConclusionCallToAction => "conclusion_cta",
291 Self::Sidebar => "sidebar",
292 Self::Callout => "callout",
293 Self::Warning => "warning",
294 Self::Note => "note",
295 Self::Quote => "quote",
296 Self::Definition => "definition",
297 Self::Theorem => "theorem",
298 Self::Proof => "proof",
299 Self::Algorithm => "algorithm",
300 Self::Code => "code",
301 Self::Metadata => "metadata",
302 Self::Citation => "citation",
303 Self::Footnote => "footnote",
304 Self::Appendix => "appendix",
305 Self::Reference => "reference",
306 Self::Custom => "custom",
307 }
308 }
309}
310
311impl FromStr for RoleCategory {
312 type Err = RoleCategoryParseError;
313
314 fn from_str(s: &str) -> Result<Self, Self::Err> {
315 match s.to_lowercase().as_str() {
316 "title" => Ok(Self::Title),
317 "subtitle" => Ok(Self::Subtitle),
318 "abstract" => Ok(Self::Abstract),
319 "toc" | "table_of_contents" => Ok(Self::TableOfContents),
320 "heading1" | "h1" => Ok(Self::Heading1),
321 "heading2" | "h2" => Ok(Self::Heading2),
322 "heading3" | "h3" => Ok(Self::Heading3),
323 "heading4" | "h4" => Ok(Self::Heading4),
324 "heading5" | "h5" => Ok(Self::Heading5),
325 "heading6" | "h6" => Ok(Self::Heading6),
326 "paragraph" | "para" | "p" => Ok(Self::Paragraph),
327 "list" | "ul" | "ol" => Ok(Self::List),
328 "intro" | "introduction" => Ok(Self::Intro),
329 "intro_hook" | "hook" => Ok(Self::IntroHook),
330 "intro_context" | "context" => Ok(Self::IntroContext),
331 "intro_thesis" | "thesis" => Ok(Self::IntroThesis),
332 "body" => Ok(Self::Body),
333 "body_argument" | "argument" => Ok(Self::BodyArgument),
334 "body_evidence" | "evidence" => Ok(Self::BodyEvidence),
335 "body_example" | "example" => Ok(Self::BodyExample),
336 "body_counterargument" | "counterargument" => Ok(Self::BodyCounterargument),
337 "body_transition" | "transition" => Ok(Self::BodyTransition),
338 "conclusion" => Ok(Self::Conclusion),
339 "conclusion_summary" | "summary" => Ok(Self::ConclusionSummary),
340 "conclusion_implication" | "implication" => Ok(Self::ConclusionImplication),
341 "conclusion_cta" | "cta" | "call_to_action" => Ok(Self::ConclusionCallToAction),
342 "sidebar" => Ok(Self::Sidebar),
343 "callout" => Ok(Self::Callout),
344 "warning" => Ok(Self::Warning),
345 "note" => Ok(Self::Note),
346 "quote" | "blockquote" => Ok(Self::Quote),
347 "definition" => Ok(Self::Definition),
348 "theorem" => Ok(Self::Theorem),
349 "proof" => Ok(Self::Proof),
350 "algorithm" => Ok(Self::Algorithm),
351 "code" => Ok(Self::Code),
352 "metadata" | "meta" => Ok(Self::Metadata),
353 "citation" | "cite" => Ok(Self::Citation),
354 "footnote" => Ok(Self::Footnote),
355 "appendix" => Ok(Self::Appendix),
356 "reference" | "ref" => Ok(Self::Reference),
357 "custom" => Ok(Self::Custom),
358 _ => Err(RoleCategoryParseError(s.to_string())),
359 }
360 }
361}
362
/// Heuristic token counts, one per [`TokenModel`] variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TokenEstimate {
    /// Count returned for `TokenModel::GPT4`.
    pub gpt4: u32,
    /// Count returned for `TokenModel::Claude`.
    pub claude: u32,
    /// Count returned for `TokenModel::Llama`.
    pub llama: u32,
    /// Count returned for `TokenModel::Generic`.
    pub generic: u32,
}
375
376impl TokenEstimate {
377 pub fn new(gpt4: u32, claude: u32, llama: u32) -> Self {
379 let generic = (gpt4 + claude + llama) / 3;
380 Self {
381 gpt4,
382 claude,
383 llama,
384 generic,
385 }
386 }
387
388 pub fn compute(content: &Content) -> Self {
390 match content {
391 Content::Text(text) => Self::estimate_text(&text.text),
392 Content::Code(code) => Self::estimate_code(&code.source, &code.language),
393 Content::Table(table) => Self::estimate_table(&table.columns, &table.rows),
394 Content::Json { value, .. } => Self::estimate_json(value),
395 Content::Math(math) => Self::estimate_text(&math.expression),
396 _ => Self::default_estimate(),
397 }
398 }
399
400 pub fn for_model(&self, model: TokenModel) -> u32 {
402 match model {
403 TokenModel::GPT4 => self.gpt4,
404 TokenModel::Claude => self.claude,
405 TokenModel::Llama => self.llama,
406 TokenModel::Generic => self.generic,
407 }
408 }
409
410 fn estimate_text(text: &str) -> Self {
411 let char_count = text.chars().count();
412 let word_count = text.split_whitespace().count();
413
414 let cjk_count = text.chars().filter(|c| is_cjk_character(*c)).count();
416 let cjk_ratio = cjk_count as f32 / char_count.max(1) as f32;
417
418 let base_estimate = if cjk_ratio > 0.5 {
420 (char_count as f32 * 1.5) as u32
421 } else {
422 (word_count as f32 * 1.3 + char_count as f32 / 4.0) as u32 / 2
423 };
424
425 Self {
426 gpt4: base_estimate,
427 claude: (base_estimate as f32 * 1.1) as u32,
428 llama: (base_estimate as f32 * 0.95) as u32,
429 generic: base_estimate,
430 }
431 }
432
433 fn estimate_code(source: &str, language: &str) -> Self {
434 let line_count = source.lines().count();
435 let char_count = source.len();
436
437 let base = (char_count / 3 + line_count * 2) as u32;
439
440 let factor = match language.to_lowercase().as_str() {
442 "rust" | "cpp" | "c" | "c++" => 1.2,
443 "python" => 0.9,
444 "javascript" | "typescript" | "js" | "ts" => 1.1,
445 "go" | "golang" => 1.0,
446 "java" => 1.15,
447 _ => 1.0,
448 };
449
450 let adjusted = (base as f32 * factor) as u32;
451
452 Self {
453 gpt4: adjusted,
454 claude: (adjusted as f32 * 1.05) as u32,
455 llama: (adjusted as f32 * 0.95) as u32,
456 generic: adjusted,
457 }
458 }
459
460 fn estimate_table(columns: &[crate::content::Column], rows: &[crate::content::Row]) -> Self {
461 let cell_count = columns.len() * rows.len();
462 let header_tokens = columns.len() * 5; let cell_tokens = cell_count * 3; let structure_tokens = rows.len() * 2; let total = (header_tokens + cell_tokens + structure_tokens) as u32;
467
468 Self {
469 gpt4: total,
470 claude: (total as f32 * 1.1) as u32,
471 llama: total,
472 generic: total,
473 }
474 }
475
476 fn estimate_json(value: &serde_json::Value) -> Self {
477 let serialized = serde_json::to_string(value).unwrap_or_default();
478 Self::estimate_text(&serialized)
479 }
480
481 fn default_estimate() -> Self {
482 Self {
483 gpt4: 100,
484 claude: 110,
485 llama: 95,
486 generic: 100,
487 }
488 }
489}
490
491impl Default for TokenEstimate {
492 fn default() -> Self {
493 Self::default_estimate()
494 }
495}
496
/// Selects which field of a [`TokenEstimate`] `for_model` returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenModel {
    /// Selects `TokenEstimate::gpt4`.
    GPT4,
    /// Selects `TokenEstimate::claude`.
    Claude,
    /// Selects `TokenEstimate::llama`.
    Llama,
    /// Selects `TokenEstimate::generic`.
    Generic,
}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-segment role string yields a category and a subcategory.
    #[test]
    fn test_semantic_role_parse() {
        let parsed = SemanticRole::parse("intro.hook").expect("role string should parse");
        assert_eq!(parsed.category, RoleCategory::Intro);
        assert_eq!(parsed.subcategory.as_deref(), Some("hook"));
    }

    /// Display joins category, subcategory and qualifier with dots.
    #[test]
    fn test_semantic_role_display() {
        let rendered = SemanticRole::new(RoleCategory::Intro)
            .with_subcategory("hook")
            .with_qualifier("v2")
            .to_string();
        assert_eq!(rendered, "intro.hook.v2");
    }

    /// `as_str` output parses back to the same category.
    #[test]
    fn test_role_category_roundtrip() {
        let original = RoleCategory::BodyEvidence;
        let reparsed: RoleCategory = original
            .as_str()
            .parse()
            .expect("canonical name should parse");
        assert_eq!(reparsed, original);
    }

    /// Non-empty Latin text produces non-zero estimates.
    #[test]
    fn test_token_estimate_text() {
        let sample = "Hello, world! This is a test.";
        let result = TokenEstimate::estimate_text(sample);
        assert!(result.gpt4 > 0);
        assert!(result.claude > 0);
    }

    /// CJK-dominant text produces a non-zero estimate.
    #[test]
    fn test_token_estimate_cjk() {
        let result = TokenEstimate::estimate_text("你好世界");
        assert!(result.gpt4 > 0);
    }

    /// Builder methods set the label and accumulate tags.
    #[test]
    fn test_metadata_builder() {
        let built = BlockMetadata::new(ContentHash::from_bytes([1u8; 32]))
            .with_label("Test Block")
            .with_tags(["important", "draft"])
            .with_role(SemanticRole::new(RoleCategory::Intro));

        assert_eq!(built.label.as_deref(), Some("Test Block"));
        for expected in ["important", "draft"] {
            assert!(built.has_tag(expected));
        }
    }
}