Skip to main content

trace_share_core/
chunk.rs

1use crate::models::{
2    CanonicalEvent, ChunkDocument, ChunkMetadata, content_hash, doc_id, normalize_text,
3};
4
5const CHUNK_CHAR_LIMIT: usize = 3200;
6
7pub fn chunk_events(
8    events: &[CanonicalEvent],
9    policy_version: &str,
10    sanitizer_version: &str,
11) -> Vec<ChunkDocument> {
12    if events.is_empty() {
13        return Vec::new();
14    }
15
16    let mut docs = Vec::new();
17    let mut bucket: Vec<&CanonicalEvent> = Vec::new();
18    let mut bucket_size = 0usize;
19    let mut chunk_index = 0usize;
20
21    for event in events {
22        let piece = format!("[{}][{}] {}", event.ts.to_rfc3339(), event.kind, event.text);
23        if !bucket.is_empty() && bucket_size + piece.len() > CHUNK_CHAR_LIMIT {
24            docs.push(make_doc(
25                &bucket,
26                chunk_index,
27                policy_version,
28                sanitizer_version,
29            ));
30            bucket.clear();
31            bucket_size = 0;
32            chunk_index += 1;
33        }
34        bucket_size += piece.len();
35        bucket.push(event);
36    }
37
38    if !bucket.is_empty() {
39        docs.push(make_doc(
40            &bucket,
41            chunk_index,
42            policy_version,
43            sanitizer_version,
44        ));
45    }
46
47    docs
48}
49
50fn make_doc(
51    events: &[&CanonicalEvent],
52    chunk_index: usize,
53    policy_version: &str,
54    sanitizer_version: &str,
55) -> ChunkDocument {
56    let source = events[0].source.clone();
57    let session_id = events[0].session_id.clone();
58    let ts_start = events[0].ts.to_rfc3339();
59    let ts_end = events[events.len() - 1].ts.to_rfc3339();
60
61    let text = events
62        .iter()
63        .map(|e| format!("[{}][{}] {}", e.ts.to_rfc3339(), e.kind, e.text))
64        .collect::<Vec<_>>()
65        .join("\n");
66
67    let tool_names = events
68        .iter()
69        .filter_map(|e| e.tool.as_ref().map(|t| t.name.clone()))
70        .collect::<Vec<_>>();
71
72    let error_types = events
73        .iter()
74        .filter(|e| e.kind == "error")
75        .map(|e| e.kind.clone())
76        .collect::<Vec<_>>();
77
78    let normalized = normalize_text(&text);
79    let c_hash = content_hash(&normalized);
80    let id = doc_id(&source, &session_id, chunk_index, &c_hash);
81
82    ChunkDocument {
83        id,
84        text,
85        metadata: ChunkMetadata {
86            source,
87            session_id,
88            chunk_index,
89            ts_start,
90            ts_end,
91            tool_names,
92            error_types,
93            repo_fingerprint: None,
94            language: None,
95            policy_version: policy_version.to_string(),
96            sanitizer_version: sanitizer_version.to_string(),
97            content_hash: c_hash,
98        },
99    }
100}