trace_share_core/
chunk.rs1use crate::models::{
2 CanonicalEvent, ChunkDocument, ChunkMetadata, content_hash, doc_id, normalize_text,
3};
4
5const CHUNK_CHAR_LIMIT: usize = 3200;
6
7pub fn chunk_events(
8 events: &[CanonicalEvent],
9 policy_version: &str,
10 sanitizer_version: &str,
11) -> Vec<ChunkDocument> {
12 if events.is_empty() {
13 return Vec::new();
14 }
15
16 let mut docs = Vec::new();
17 let mut bucket: Vec<&CanonicalEvent> = Vec::new();
18 let mut bucket_size = 0usize;
19 let mut chunk_index = 0usize;
20
21 for event in events {
22 let piece = format!("[{}][{}] {}", event.ts.to_rfc3339(), event.kind, event.text);
23 if !bucket.is_empty() && bucket_size + piece.len() > CHUNK_CHAR_LIMIT {
24 docs.push(make_doc(
25 &bucket,
26 chunk_index,
27 policy_version,
28 sanitizer_version,
29 ));
30 bucket.clear();
31 bucket_size = 0;
32 chunk_index += 1;
33 }
34 bucket_size += piece.len();
35 bucket.push(event);
36 }
37
38 if !bucket.is_empty() {
39 docs.push(make_doc(
40 &bucket,
41 chunk_index,
42 policy_version,
43 sanitizer_version,
44 ));
45 }
46
47 docs
48}
49
50fn make_doc(
51 events: &[&CanonicalEvent],
52 chunk_index: usize,
53 policy_version: &str,
54 sanitizer_version: &str,
55) -> ChunkDocument {
56 let source = events[0].source.clone();
57 let session_id = events[0].session_id.clone();
58 let ts_start = events[0].ts.to_rfc3339();
59 let ts_end = events[events.len() - 1].ts.to_rfc3339();
60
61 let text = events
62 .iter()
63 .map(|e| format!("[{}][{}] {}", e.ts.to_rfc3339(), e.kind, e.text))
64 .collect::<Vec<_>>()
65 .join("\n");
66
67 let tool_names = events
68 .iter()
69 .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
70 .collect::<Vec<_>>();
71
72 let error_types = events
73 .iter()
74 .filter(|e| e.kind == "error")
75 .map(|e| e.kind.clone())
76 .collect::<Vec<_>>();
77
78 let normalized = normalize_text(&text);
79 let c_hash = content_hash(&normalized);
80 let id = doc_id(&source, &session_id, chunk_index, &c_hash);
81
82 ChunkDocument {
83 id,
84 text,
85 metadata: ChunkMetadata {
86 source,
87 session_id,
88 chunk_index,
89 ts_start,
90 ts_end,
91 tool_names,
92 error_types,
93 repo_fingerprint: None,
94 language: None,
95 policy_version: policy_version.to_string(),
96 sanitizer_version: sanitizer_version.to_string(),
97 content_hash: c_hash,
98 },
99 }
100}