1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
//! Prompt truncation tests (UTF-8 safety)
use super::*;
use crate::router::{Importance, RouteMetadata, TaskType};
#[test]
fn test_build_router_prompt_truncates_long_prompt_safely() {
    // Baseline case: 1000 ASCII characters, so every byte offset is also a
    // char boundary and truncation cannot land inside a character.
    let metadata = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };
    let oversized = "a".repeat(1000);

    let rendered = LlmBasedRouter::build_router_prompt(&oversized, &metadata);

    // Getting here means the call did not panic; the output must flag the cut.
    assert!(rendered.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_handles_multibyte_chars_at_boundary() {
    // "世" encodes to 3 bytes in UTF-8 (0xE4 0xB8 0x96). Placed after 498 ASCII
    // bytes it occupies byte offsets 498..=500, so offset 500 is not a char
    // boundary and a raw `&prompt[..500]` byte slice would panic.
    let mut prompt = "a".repeat(498);
    prompt.push_str("世界test");

    let metadata = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };

    // The call returning at all is the no-panic check.
    let rendered = LlmBasedRouter::build_router_prompt(&prompt, &metadata);

    // The prompt is longer than the limit, so the output must flag the cut.
    assert!(rendered.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_handles_emoji_at_boundary() {
    // "🦑" is a 4-byte UTF-8 sequence. After 497 ASCII bytes it covers byte
    // offsets 497..=500, so byte offset 500 sits inside the emoji.
    let prompt = ["a".repeat(497).as_str(), "🦑test"].concat();

    let metadata = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };

    // Must return normally instead of panicking on the split emoji.
    let rendered = LlmBasedRouter::build_router_prompt(&prompt, &metadata);

    // The input exceeds the limit, so the truncation marker has to be present.
    assert!(rendered.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_preserves_short_multibyte_prompt() {
    let metadata = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };

    // Well under the truncation limit even though it contains multi-byte
    // (CJK) characters, so it must pass through untouched.
    let short_prompt = "Explain quantum entanglement in Chinese: 量子纠缠";

    let rendered = LlmBasedRouter::build_router_prompt(short_prompt, &metadata);

    // The prompt appears verbatim in the output ...
    assert!(rendered.contains(short_prompt));
    // ... and nothing was marked as cut off.
    assert!(!rendered.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_handles_zwj_emoji_at_boundary() {
    // GAP #7: ZWJ (Zero-Width Joiner) emoji truncation.
    //
    // The "family" emoji is one rendered glyph built from seven code points:
    // man + ZWJ + woman + ZWJ + girl + ZWJ + boy. In UTF-8 that is 25 bytes
    // (four 4-byte emoji plus three 3-byte U+200D joiners).
    //
    // The sequence is spelled with escapes on purpose: U+200D is invisible, so
    // a raw literal can silently lose its joiners in an editor or copy/paste,
    // which would leave four independent emoji that end before position 500
    // and no longer straddle the boundary this test is about.
    const FAMILY: &str = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";

    // 480 ASCII chars + "Family emoji: " (14 chars) = 494, so the sequence
    // spans char indices 494..=500 and byte offsets 494..519. Position 500
    // falls inside it whether the limit is counted in chars or in bytes.
    let ascii_prefix = "a".repeat(480);
    let prompt = format!("{}Family emoji: {} test", ascii_prefix, FAMILY);

    // Guard the fixture itself: byte offset 500 must be inside a character,
    // otherwise the test would pass without exercising anything.
    assert!(
        !prompt.is_char_boundary(500),
        "fixture error: byte offset 500 should fall inside the ZWJ sequence"
    );

    let meta = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };

    // Returning at all means truncation did not panic on the split sequence.
    let result = LlmBasedRouter::build_router_prompt(&prompt, &meta);

    // No lossy re-decoding happened anywhere along the way.
    assert!(
        !result.contains('\u{FFFD}'),
        "Truncated output should not contain replacement character (�)"
    );
    // The prompt is longer than every sibling prompt that is expected to be
    // truncated, so the marker must be present here too. (This replaces an
    // `is_char_boundary(result.len())` check, which is true for every String.)
    assert!(result.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_handles_rtl_text_at_boundary() {
    // GAP #7: RTL (right-to-left) text truncation.
    //
    // 490 ASCII chars plus the "Arabic: " label put the start of the Arabic
    // phrase at position 498, so the cut lands among multi-byte Arabic letters.
    let metadata = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };
    let prompt = format!(
        "{}Arabic: مرحبا بك في عالم الذكاء الاصطناعي test",
        "a".repeat(490)
    );

    let rendered = LlmBasedRouter::build_router_prompt(&prompt, &metadata);

    // The output must not contain U+FFFD, i.e. no character was mangled.
    assert!(
        !rendered.contains('\u{FFFD}'),
        "Truncated output should not contain replacement character (�)"
    );
    // The prompt is well over 500 characters, so it has to be marked as cut.
    assert!(rendered.contains("[truncated]"));
}
#[test]
fn test_build_router_prompt_handles_combining_diacritics_at_boundary() {
    // GAP #7: combining diacritics truncation.
    //
    // A combining diacritic is its own code point that modifies the preceding
    // base character: "é" can be written as 'e' (U+0065) followed by the
    // combining acute accent (U+0301, two bytes in UTF-8).
    //
    // The accent is written as an escape so the text is decomposed no matter
    // how the source file is normalized. A literal "é" may be stored
    // precomposed (U+00E9), in which case byte offset 500 would be a clean
    // boundary and the test would exercise nothing.
    let ascii_prefix = "a".repeat(495);
    let decomposed_text = "cafe\u{0301} resume";
    let prompt = format!("{}{}", ascii_prefix, decomposed_text);

    // Layout: "cafe" fills byte offsets 495..=498 and U+0301 fills 499..=500,
    // so byte offset 500 is the second byte of the combining mark.
    assert!(
        !prompt.is_char_boundary(500),
        "fixture error: byte offset 500 should fall inside the combining mark"
    );

    let meta = RouteMetadata {
        token_estimate: 100,
        importance: Importance::Normal,
        task_type: TaskType::CasualChat,
    };

    // Returning at all means truncation did not panic on the split code point.
    let result = LlmBasedRouter::build_router_prompt(&prompt, &meta);

    // Only encoding safety is checked here; whether the base letter and its
    // accent stay together after the cut is not asserted.
    assert!(
        !result.contains('\u{FFFD}'),
        "Truncated output should not contain replacement character (�)"
    );
    // The prompt exceeds the limit, so the truncation marker must be present.
    assert!(result.contains("[truncated]"));
}