tinyjuice 0.2.1

Pluggable token compression for OpenHuman.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
//! Additional unit tests for the `tokenjuice::text` sub-modules.
//!
//! Focuses on coverage gaps identified in test-map.md Pick 5
//! ("TokenJuice Rust port for tool-output compaction"):
//! - `strip_ansi` with multi-byte / emoji text (grapheme safety).
//! - `dedupe_adjacent` additional edge cases.
//! - `clamp_text_middle` grapheme-safe split — never breaks inside a
//!   multi-byte codepoint or multi-scalar grapheme cluster.
//! - 3-layer overlay precedence: project > user > builtin.
//! - Rule loader gracefully handles invalid regex (diagnostic, no panic).

use crate::rules::loader::{LoadRuleOptions, load_rules};
use crate::text::width::{count_text_chars, graphemes};
use crate::text::{clamp_text_middle, dedupe_adjacent, strip_ansi};
use crate::types::RuleOrigin;

// ── strip_ansi — multi-byte / emoji safety ───────────────────────────────────

#[test]
fn strip_ansi_leaves_multibyte_cjk_intact() {
    // ANSI-wrapped CJK text: only the escape sequences may be removed, the
    // multi-byte characters themselves must come through untouched.
    let wrapped = "\x1b[32m中文\x1b[0m";
    let stripped = strip_ansi(wrapped);
    assert_eq!(stripped, "中文");
}

#[test]
fn strip_ansi_leaves_emoji_intact() {
    // An emoji placed directly after an escape sequence must not be damaged
    // when the sequence is removed.
    let wrapped = "\x1b[1m😀 hello\x1b[0m";
    let stripped = strip_ansi(wrapped);
    assert_eq!(stripped, "😀 hello");
}

#[test]
fn strip_ansi_multi_byte_only_no_escapes() {
    // Input that contains no escape sequence at all must come back exactly
    // as it went in, even when every character is multi-byte.
    let plain = "こんにちは";
    let stripped = strip_ansi(plain);
    assert_eq!(stripped, plain);
}

#[test]
fn strip_ansi_zwj_emoji_sequence_preserved() {
    // Three emoji code points glued together with zero-width joiners; the
    // joiners must not be mistaken for something to strip.
    let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";
    let wrapped = ["\x1b[31m", family, "\x1b[0m"].concat();
    assert_eq!(
        strip_ansi(&wrapped),
        family,
        "ZWJ sequence must survive ANSI stripping"
    );
}

#[test]
fn strip_ansi_mixed_scripts_preserved() {
    // One ANSI-wrapped string mixing Arabic, CJK, Latin and an emoji; after
    // stripping, only the text between the wrappers must remain.
    let wrapped = "\x1b[33mعربي 中文 hello 🌍\x1b[0m";
    let expected = "عربي 中文 hello 🌍";
    assert_eq!(strip_ansi(wrapped), expected);
}

#[test]
fn strip_ansi_empty_string() {
    // Degenerate input: with nothing to strip the result must also be empty.
    let empty = "";
    assert_eq!(strip_ansi(empty), empty);
}

#[test]
fn strip_ansi_only_escape_sequences() {
    // Three back-to-back escape sequences and no payload: stripping must
    // leave nothing behind.
    let escapes_only = concat!("\x1b[0m", "\x1b[1m", "\x1b[31m");
    let stripped = strip_ansi(escapes_only);
    assert_eq!(stripped, "");
}

// ── dedupe_adjacent ───────────────────────────────────────────────────────────

/// Test helper: turns a slice of string literals into owned `String`s so
/// inputs and expectations can be written tersely.
fn strs(v: &[&str]) -> Vec<String> {
    let mut owned = Vec::with_capacity(v.len());
    for &item in v {
        owned.push(String::from(item));
    }
    owned
}

#[test]
fn dedupe_adjacent_collapses_run_of_identical_lines() {
    // A run of three identical lines followed by a different one: the run
    // must shrink to a single entry.
    let input = strs(&["a", "a", "a", "b"]);
    let expected = strs(&["a", "b"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

#[test]
fn dedupe_adjacent_preserves_non_adjacent_duplicates() {
    // "a" appears twice but is separated by "b", so the two occurrences are
    // not adjacent and both must be kept.
    let input = strs(&["a", "b", "a"]);
    let expected = strs(&["a", "b", "a"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

#[test]
fn dedupe_adjacent_single_element_is_unchanged() {
    // With a single line there is nothing to compare against; the input must
    // be returned as-is.
    let input = strs(&["only"]);
    let expected = strs(&["only"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

#[test]
fn dedupe_adjacent_all_identical_collapses_to_one() {
    // Five copies of the same line form one run, which must reduce to a
    // single entry.
    let input = strs(&["x", "x", "x", "x", "x"]);
    let expected = strs(&["x"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

#[test]
fn dedupe_adjacent_empty_lines_are_deduplicated() {
    // Blank lines get no special treatment: the doubled blank collapses to
    // one, while the later lone blank (separated by "b") stays.
    let input = strs(&["a", "", "", "b", "", "c"]);
    let expected = strs(&["a", "", "b", "", "c"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

#[test]
fn dedupe_adjacent_multibyte_lines_collapsed() {
    // Same collapse rule, exercised with multi-byte (Japanese) lines.
    let input = strs(&["日本語", "日本語", "日本語"]);
    let expected = strs(&["日本語"]);
    let actual = dedupe_adjacent(&input);
    assert_eq!(actual, expected);
}

// ── clamp_text_middle — grapheme-safe middle truncation ───────────────────────

/// Checks that clamping a long CJK string only ever emits whole graphemes of
/// the source.
///
/// A Rust `String` is valid UTF-8 by construction, so re-validating the bytes
/// would prove nothing. Instead, every grapheme of the clamped output that is
/// not marker-like (see the filter below) must be a member of the set of
/// graphemes of the input.
#[test]
fn clamp_text_middle_output_is_valid_utf8() {
    // 350 characters in total; each CJK character occupies 3 bytes in UTF-8.
    let source = "中文字符测试!".repeat(50);
    let clamped = clamp_text_middle(&source, 30);

    let whole_graphemes: std::collections::HashSet<&str> =
        graphemes(&source).into_iter().collect();

    // Content the clamp may add on its own (the omission marker): a middle
    // dot, or a grapheme made up purely of ASCII punctuation, whitespace or
    // alphanumerics. Such graphemes are exempt from the membership check.
    let is_marker_like = |g: &str| -> bool {
        g == "·"
            || g.chars().all(|c| {
                c.is_ascii_punctuation() || c.is_ascii_whitespace() || c.is_ascii_alphanumeric()
            })
    };

    for g in graphemes(&clamped) {
        if is_marker_like(g) {
            continue;
        }
        assert!(
            whole_graphemes.contains(g),
            "grapheme {g:?} in clamp output is not a whole grapheme of the source"
        );
    }
}

#[test]
fn clamp_text_middle_does_not_split_emoji_grapheme() {
    // The source is 100 copies of one 4-byte emoji, so a cut at an arbitrary
    // byte offset would land inside a codepoint. Since the source has exactly
    // one distinct grapheme, every grapheme of the clamped output must be
    // either that emoji or plain ASCII (the omission marker).
    let source = "😀".repeat(100);
    let clamped = clamp_text_middle(&source, 20);

    for cluster in graphemes(&clamped) {
        let is_whole_emoji = cluster == "😀";
        let is_marker_text = cluster.is_ascii();
        assert!(
            is_whole_emoji || is_marker_text,
            "unexpected grapheme {cluster:?} — partial emoji split detected"
        );
    }
}

#[test]
fn clamp_text_middle_short_text_is_passthrough() {
    // The input is far below the 200-character budget, so clamping must hand
    // it back verbatim.
    let original = "hello 世界 🌍";
    assert_eq!(clamp_text_middle(original, 200), original);
}

#[test]
fn clamp_text_middle_inserts_omission_marker() {
    // 200 five-character lines against a 100-character budget forces a clamp,
    // and the clamped text must then mention what was "omitted".
    let long_text = "line\n".repeat(200);
    let clamped = clamp_text_middle(&long_text, 100);
    assert!(
        clamped.contains("omitted"),
        "middle clamp must contain omission marker, got: {}",
        // Build the preview from characters rather than a byte range: slicing
        // `&clamped[..120]` panics when byte 120 is not a char boundary, which
        // would replace this diagnostic with an unrelated slicing panic.
        clamped.chars().take(120).collect::<String>()
    );
}

#[test]
fn clamp_text_middle_grapheme_count_respects_limit() {
    // The omission marker adds characters on top of the requested budget, so
    // the bound is deliberately lenient: the clamped text may be up to twice
    // `limit` characters long rather than exactly `limit`.
    let limit: usize = 100;
    let source = "あいうえお\n".repeat(200); // multi-byte lines
    let clamped = clamp_text_middle(&source, limit);

    let grapheme_count = count_text_chars(&clamped);
    let ceiling = limit * 2;
    assert!(
        grapheme_count <= ceiling,
        "clamped grapheme count {grapheme_count} exceeds 2×max ({limit})"
    );
}

#[test]
fn clamp_text_middle_zwj_sequence_not_split() {
    // Source: 100 lines, each holding one ZWJ family sequence (three 4-byte
    // emoji joined by 3-byte zero-width joiners). Any grapheme of the clamped
    // output that contains a joiner or one of the three member emoji must be
    // the complete family sequence; anything less is a fragment.
    let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";
    let line = format!("{family}\n");
    let source = line.repeat(100);
    let clamped = clamp_text_middle(&source, 30);

    for cluster in graphemes(&clamped) {
        let has_joiner = cluster.contains('\u{200D}');
        let has_member = cluster
            .chars()
            .any(|c| matches!(c, '\u{1F468}' | '\u{1F469}' | '\u{1F467}'));
        if !(has_joiner || has_member) {
            continue;
        }
        assert_eq!(
            cluster, family,
            "clamp produced partial ZWJ cluster {cluster:?}; expected the full family unit"
        );
    }
}

// ── grapheme helper round-trip ────────────────────────────────────────────────

#[test]
fn graphemes_clusters_match_count_text_chars() {
    // The two helpers must agree: the number of clusters returned by
    // `graphemes` equals what `count_text_chars` reports for the same text.
    let sample = "hello 中文 😀 emoji";
    let clusters = graphemes(sample);
    let reported = count_text_chars(sample);
    assert_eq!(clusters.len(), reported);
}

// ── 3-layer overlay precedence ────────────────────────────────────────────────

#[test]
fn three_layer_overlay_project_beats_user_beats_builtin() {
    // One scratch directory per overlay layer.
    let user_layer = tempfile::tempdir().expect("user tempdir");
    let project_layer = tempfile::tempdir().expect("project tempdir");

    // Both layers redefine the `git/status` rule id and differ only in the
    // `family` value, which reveals which layer won.
    let user_rule_file = user_layer.path().join("gs.json");
    std::fs::write(
        user_rule_file,
        r#"{"id":"git/status","family":"user-family","match":{}}"#,
    )
    .unwrap();

    let project_rule_file = project_layer.path().join("gs.json");
    std::fs::write(
        project_rule_file,
        r#"{"id":"git/status","family":"project-family","match":{}}"#,
    )
    .unwrap();

    let opts = LoadRuleOptions {
        user_rules_dir: Some(user_layer.path().to_owned()),
        project_rules_dir: Some(project_layer.path().to_owned()),
        ..Default::default()
    };
    let loaded = load_rules(&opts);
    let winner = loaded
        .iter()
        .find(|entry| entry.rule.id == "git/status")
        .expect("git/status rule must be present");

    // The project definition has the highest priority of the three layers.
    assert_eq!(
        winner.rule.family, "project-family",
        "project layer must override user and builtin layers"
    );
    assert_eq!(
        winner.source,
        RuleOrigin::Project,
        "winning rule must be sourced from Project"
    );
}

#[test]
fn two_layer_overlay_user_beats_builtin() {
    // Only a user layer is supplied (the project layer is excluded), and it
    // redefines the `git/status` rule id with a recognisable `family`.
    let user_layer = tempfile::tempdir().expect("user tempdir");
    let user_rule_file = user_layer.path().join("gs.json");
    std::fs::write(
        user_rule_file,
        r#"{"id":"git/status","family":"user-only","match":{}}"#,
    )
    .unwrap();

    let opts = LoadRuleOptions {
        user_rules_dir: Some(user_layer.path().to_owned()),
        exclude_project: true,
        ..Default::default()
    };
    let loaded = load_rules(&opts);
    let winner = loaded
        .iter()
        .find(|entry| entry.rule.id == "git/status")
        .expect("git/status rule");

    // The user definition must be the one that is reported.
    assert_eq!(winner.rule.family, "user-only");
    assert_eq!(winner.source, RuleOrigin::User);
}

#[test]
fn builtin_rule_present_when_no_overrides() {
    // With both overlay layers excluded, only builtin rules can be loaded.
    let opts = LoadRuleOptions {
        exclude_user: true,
        exclude_project: true,
        ..Default::default()
    };
    let rules = load_rules(&opts);

    // `expect` replaces the former `is_some()` assertion followed by
    // `unwrap()`: one lookup, same failure message, and the same shape as the
    // other overlay tests in this file.
    let gs = rules
        .iter()
        .find(|r| r.rule.id == "git/status")
        .expect("builtin git/status rule must be present");
    assert_eq!(
        gs.source,
        RuleOrigin::Builtin,
        "with no overlay layers, source must be Builtin"
    );
}

// ── rule loader — invalid regex gracefully handled ────────────────────────────

#[test]
fn invalid_regex_in_skip_patterns_does_not_panic() {
    use crate::rules::compiler::compile_rule;
    use crate::types::{JsonRule, RuleFilters, RuleMatch};

    // A deliberately malformed pattern followed by a well-formed one, so the
    // test can tell "bad pattern dropped" apart from "everything dropped".
    let skip_patterns = vec![
        String::from("[invalid-regex"),
        String::from("valid_pattern"),
    ];
    let filters = RuleFilters {
        skip_patterns: Some(skip_patterns),
        keep_patterns: None,
    };
    let rule = JsonRule {
        id: String::from("test/bad-skip"),
        family: String::from("test"),
        description: None,
        priority: None,
        on_empty: None,
        match_output: None,
        counter_source: None,
        r#match: RuleMatch::default(),
        filters: Some(filters),
        transforms: None,
        summarize: None,
        counters: None,
        failure: None,
    };

    // Compilation itself must not panic on the malformed pattern.
    let compiled = compile_rule(
        rule,
        RuleOrigin::Builtin,
        String::from("builtin:test/bad-skip"),
    );

    // Exactly one compiled skip pattern must remain: the valid one.
    let retained = compiled.compiled.skip_patterns.len();
    assert_eq!(
        retained, 1,
        "invalid regex must be dropped; valid pattern must be retained"
    );
}

#[test]
fn all_invalid_regex_in_skip_patterns_leaves_empty_vec() {
    use crate::rules::compiler::compile_rule;
    use crate::types::{JsonRule, RuleFilters, RuleMatch};

    // Three patterns, each left unterminated so that none of them compiles.
    let skip_patterns = vec![
        String::from("[bad1"),
        String::from("(bad2"),
        String::from("{bad3"),
    ];
    let filters = RuleFilters {
        skip_patterns: Some(skip_patterns),
        keep_patterns: None,
    };
    let rule = JsonRule {
        id: String::from("test/all-bad"),
        family: String::from("test"),
        description: None,
        priority: None,
        on_empty: None,
        match_output: None,
        counter_source: None,
        r#match: RuleMatch::default(),
        filters: Some(filters),
        transforms: None,
        summarize: None,
        counters: None,
        failure: None,
    };

    let compiled = compile_rule(
        rule,
        RuleOrigin::Builtin,
        String::from("builtin:test/all-bad"),
    );

    // With every pattern rejected, no compiled skip pattern may remain.
    let remaining = &compiled.compiled.skip_patterns;
    assert!(
        remaining.is_empty(),
        "all invalid skip patterns must produce an empty vec"
    );
}

#[test]
fn invalid_regex_loaded_from_disk_is_skipped_not_fatal() {
    // A temporary project rules directory holding two rule files: one whose
    // only skip pattern is malformed, and one plain rule that proves loading
    // carries on past the bad one.
    let project_layer = tempfile::tempdir().expect("tempdir");

    let bad_rule_json = r#"{
        "id": "test/disk-bad-regex",
        "family": "test",
        "match": {},
        "filters": {
            "skipPatterns": ["[invalid"]
        }
    }"#;
    let bad_rule_file = project_layer.path().join("bad_regex.json");
    std::fs::write(bad_rule_file, bad_rule_json).unwrap();

    let good_rule_json = r#"{"id":"test/disk-good","family":"test","match":{}}"#;
    let good_rule_file = project_layer.path().join("good.json");
    std::fs::write(good_rule_file, good_rule_json).unwrap();

    let opts = LoadRuleOptions {
        project_rules_dir: Some(project_layer.path().to_owned()),
        exclude_user: true,
        ..Default::default()
    };

    // Loading must complete without panicking despite the malformed pattern.
    let loaded = load_rules(&opts);

    let good_rule_loaded = loaded.iter().any(|entry| entry.rule.id == "test/disk-good");
    assert!(
        good_rule_loaded,
        "valid rule must still load alongside the bad-regex rule"
    );

    // The bad-regex rule must itself be present, only without its invalid
    // pattern. Requiring presence guards against a false positive where the
    // whole rule is silently discarded.
    let bad_rule = loaded
        .iter()
        .find(|entry| entry.rule.id == "test/disk-bad-regex")
        .expect("bad-regex rule must still load");
    assert!(
        bad_rule.compiled.skip_patterns.is_empty(),
        "bad-regex rule must have empty compiled skip_patterns"
    );
}