1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
//! Regression tests for the Qwen 3 / DeepSeek SentencePiece-style
//! tool-call marker leak.
//!
//! Observed: when a Qwen 3 model (via custom OpenAI-compatible
//! endpoint) is asked to call a tool, it emits the proper
//! structured `tool_calls` field AND a human-readable token-marker
//! echo in `content`:
//!
//! ```text
//! <|tool▁calls_section_begin|>
//! <|tool▁call_begin|>analyze_video<|tool▁calls_section_end|>
//! <|tool▁calls_section_end|> ... (hundreds of repeats)
//! ```
//!
//! The streaming filter at `custom_openai_compatible.rs` only knew
//! `<tool_call>` and `<function=` — Qwen's `<|tool▁...|>` family
//! was unknown, so every chunk streamed straight to the user's
//! channel as text. The repetition guard at `helpers.rs:320` did
//! eventually kill the stream, but only after ~7 KB of garbage had
//! already shipped to Telegram and the TUI and been persisted as
//! the assistant turn.
//!
//! Two surgical fixes address this:
//!
//! 1. `TOOL_MARKERS` (streaming) — add the `<|tool▁calls_section_begin|>`
//! and `<|tool▁call_begin|>` openers so the filter routes the
//! whole section into the tool-capture buffer instead of display.
//! 2. `strip_llm_artifacts` (post-stream) — sweep any
//! `<|tool<sep>...|>` token that survived the streaming filter,
//! where `<sep>` is U+2581 OR ASCII `_` (some quantizations emit
//! the ASCII form).
//!
//! These tests pin both ends.
use crate::utils::sanitize::strip_llm_artifacts;
/// SentencePiece word-boundary character (U+2581, `▁`) emitted by Qwen 3.
/// The tests in this file splice it into `<|tool▁...|>` marker strings via
/// `{SEP}` so the non-ASCII separator is spelled out in exactly one place.
const SEP: &str = "\u{2581}";
/// A complete U+2581-flavoured tool-call section appended to prose must lose
/// its wrappers while the prose and the tool name stay.
#[test]
fn strips_qwen_section_open_and_close_with_u2581() {
    // The input carries all four marker kinds (section begin/end, call
    // begin/end). Only the wrappers are expected to go; the tool name sitting
    // between `call_begin` and `call_end` is intentionally kept, which
    // `preserves_tool_name_payload_between_markers` pins on its own.
    let leaked = format!(
        "Hello<|tool{SEP}calls_section_begin|><|tool{SEP}call_begin|>analyze_video<|tool{SEP}call_end|><|tool{SEP}calls_section_end|>"
    );
    let out = strip_llm_artifacts(&leaked);

    let marker_survived = out.contains("<|tool");
    assert!(!marker_survived, "no marker fragment must survive: {out:?}");
    assert!(out.starts_with("Hello"), "prose prefix preserved: {out:?}");
    assert!(out.contains("analyze_video"), "payload preserved: {out:?}");
}
/// Reproduces the shape of the incident: one opened call followed by a long
/// run of section-end markers, sandwiched between ordinary prose.
#[test]
fn strips_repeated_section_end_storm() {
    // One `call_begin` + `analyze_video`, then the same `calls_section_end`
    // marker repeated 50 times to stand in for the model's runaway loop.
    let mut leaked = String::from("prefix");
    leaked += &format!("<|tool{SEP}calls_section_begin|>");
    leaked += &format!("<|tool{SEP}call_begin|>");
    leaked += "analyze_video";
    leaked += &format!("<|tool{SEP}calls_section_end|>").repeat(50);
    leaked += "suffix";

    let out = strip_llm_artifacts(&leaked);

    assert_eq!(
        out, "prefixanalyze_videosuffix",
        "every marker in the storm must go, leaving only the literal tool name + prose"
    );
    let marker_survived = out.contains("<|tool");
    assert!(
        !marker_survived,
        "no marker fragment should survive: {out:?}"
    );
}
/// Markers spelled with a plain ASCII `_` after `tool` must be stripped the
/// same way as the U+2581 spelling.
#[test]
fn strips_ascii_underscore_variant() {
    // Some Qwen quantizations / forks reportedly emit `_` where the reference
    // tokenizer emits U+2581; pinning this form guards against a regression
    // when another such build shows up.
    let leaked =
        "before<|tool_calls_section_begin|><|tool_call_begin|>grep<|tool_calls_section_end|>after";

    let out = strip_llm_artifacts(leaked);

    assert_eq!(out, "beforegrepafter");
}
/// Marker names this code base has never seen must still be removed, as long
/// as they follow the `<|tool<sep>...|>` shape.
#[test]
fn strips_unknown_future_variants_via_catchall() {
    // The sanitizer's pattern is described as `<\|tool[<sep>][^|]*\|>`: it
    // keys on the prefix and separator, not on a list of known names, so a
    // hypothetical `<|tool▁call_argument_begin|>` needs no code change.
    let leaked = format!("a<|tool{SEP}call_argument_begin|>b<|tool{SEP}call_metadata|>c");

    let out = strip_llm_artifacts(&leaked);

    assert_eq!(
        out, "abc",
        "future marker variants must be swept too: {out:?}"
    );
}
/// Text that merely resembles a marker, with `<|tool` followed by a space
/// instead of a separator, must pass through untouched.
#[test]
fn preserves_prose_that_only_looks_like_a_marker() {
    // Requiring U+2581 or `_` directly after `<|tool` is what keeps ordinary
    // prose from matching; here the next character is a space.
    let prose = "the <|tool of choice|> for this is rg";

    let out = strip_llm_artifacts(prose);

    assert_eq!(
        out, prose,
        "must NOT strip prose that lacks the SP/underscore separator"
    );
}
/// Only the marker wrappers are removed; the text they enclose is kept.
#[test]
fn preserves_tool_name_payload_between_markers() {
    // `analyze_video` is the model's stated intent. Downstream sanitizers or
    // displays may want to surface it, so the sanitizer is expected to drop
    // the `call_begin` / `call_end` wrappers and nothing else.
    let wrapped = format!("<|tool{SEP}call_begin|>analyze_video<|tool{SEP}call_end|>");

    let out = strip_llm_artifacts(&wrapped);

    assert_eq!(out, "analyze_video");
}
/// Input without any `<|tool` fragment must come back unchanged.
#[test]
fn handles_input_with_no_markers_fast_path() {
    // `strip_llm_artifacts` is described as gating its regex behind a
    // `result.contains("<|tool")` check so marker-free text (the common case)
    // skips the scan. That implementation is not visible from this file; the
    // test pins only the observable result.
    let plain = "regular agent response with no Qwen tokens at all";

    let out = strip_llm_artifacts(plain);

    assert_eq!(out, plain);
}