1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
//! SECURE-UTF8-TOLERANCE-V1 — uniform non-UTF-8 file tolerance
//!
//! The prior `luau-utf8-tolerance-v1` milestone (commit 4c61af8) wired
//! the [`tldr_core::fs::read_to_string_tolerant`] helper into the surface
//! command (`crates/tldr-core/src/surface/{luau,lua}.rs`). But other
//! directory-scanning commands kept their own strict
//! `std::fs::read_to_string` paths, so a single non-UTF-8 file in a
//! scanned tree still aborted the whole command.
//!
//! Concrete repro pre-fix:
//!
//! ```text
//! $ tldr secure --lang luau /tmp/repos/luau-luau
//! Error: IO error: stream did not contain valid UTF-8
//! ```
//!
//! ...because `tests/conformance/literals.luau` (and `pm.luau`,
//! `sort.luau`) intentionally embed raw 0xFF/0xFE bytes as parser-test
//! corpus. `secure` scanned ~3 files, hit the bad one, and bailed.
//!
//! These tests synthesise the same shape (one valid + one invalid-UTF-8
//! file in a tempdir) and assert the dir-scanning commands continue
//! instead of aborting.
use assert_cmd::Command;
use std::fs;
use tempfile::tempdir;
fn tldr_cmd() -> Command {
Command::new(assert_cmd::cargo::cargo_bin!("tldr"))
}
/// Write a "1 valid + 1 non-UTF-8 .py" tempdir. Returns the tempdir
/// guard so the caller controls lifetime.
fn make_mixed_utf8_dir() -> tempfile::TempDir {
let dir = tempdir().unwrap();
fs::write(dir.path().join("good.py"), b"def safe():\n return 1\n").unwrap();
// Synthetic bad file: valid prefix + raw 0xFF/0xFE (never legal as
// a UTF-8 leading byte) + valid suffix. Same shape as the real
// luau-luau parser-test fixtures.
let mut bytes: Vec<u8> = Vec::new();
bytes.extend_from_slice(b"# valid prefix\n");
bytes.extend_from_slice(&[0xFFu8, 0xFEu8]);
bytes.extend_from_slice(b"\n# valid suffix\n");
fs::write(dir.path().join("bad.py"), bytes).unwrap();
dir
}
/// `tldr smells` MUST complete and emit valid JSON when a directory
/// contains a non-UTF-8 source file alongside valid ones.
///
/// Smells already used `std::fs::read_to_string(path).unwrap_or_default()`
/// which avoided the abort but silently substituted an empty source.
/// This test guards the post-fix behavior: scan completes, JSON is
/// well-formed, total exit code is 0.
#[test]
fn test_smells_continues_after_bad_file_in_dir() {
let dir = make_mixed_utf8_dir();
let output = tldr_cmd()
.arg("smells")
.arg(dir.path().to_str().unwrap())
.arg("-f")
.arg("json")
.output()
.expect("smells must execute");
assert!(
output.status.success(),
"smells must NOT abort on non-UTF-8 input (status: {:?}, stderr: {})",
output.status,
String::from_utf8_lossy(&output.stderr),
);
let stdout = String::from_utf8_lossy(&output.stdout);
let _: serde_json::Value = serde_json::from_str(&stdout)
.unwrap_or_else(|e| panic!("smells stdout must be valid JSON; err: {}; stdout: {}", e, stdout));
}
/// `tldr structure` MUST complete and emit valid JSON when a directory
/// contains a non-UTF-8 source file alongside valid ones.
///
/// Structure routes through `tldr_core::ast::parser::parse_file_with_lang`,
/// which already uses `String::from_utf8_lossy` (M2 mitigation). This
/// test pins that behavior so a future refactor can't regress it.
#[test]
fn test_structure_continues_after_bad_file_in_dir() {
let dir = make_mixed_utf8_dir();
let output = tldr_cmd()
.arg("structure")
.arg(dir.path().to_str().unwrap())
.arg("-f")
.arg("json")
.output()
.expect("structure must execute");
assert!(
output.status.success(),
"structure must NOT abort on non-UTF-8 input (status: {:?}, stderr: {})",
output.status,
String::from_utf8_lossy(&output.stderr),
);
let stdout = String::from_utf8_lossy(&output.stdout);
let _: serde_json::Value = serde_json::from_str(&stdout)
.unwrap_or_else(|e| panic!("structure stdout must be valid JSON; err: {}; stdout: {}", e, stdout));
}
/// `tldr vuln` MUST complete and surface skipped non-UTF-8 files in
/// `files_skipped` + `warnings`. Pre-fix vuln silently dropped them
/// via an `if let Ok(..)` guard — coverage degraded with no signal.
#[test]
fn test_vuln_continues_after_bad_file_in_dir() {
let dir = make_mixed_utf8_dir();
let output = tldr_cmd()
.arg("vuln")
.arg(dir.path().to_str().unwrap())
.arg("--lang")
.arg("python")
.arg("-f")
.arg("json")
.output()
.expect("vuln must execute");
// Exit code 0 (no findings) or 2 (findings detected) are both
// success cases; only a hard error (exit 1) would indicate the
// tolerance regressed.
let code = output.status.code();
assert!(
matches!(code, Some(0) | Some(2)),
"vuln must NOT abort on non-UTF-8 input (status: {:?}, stderr: {})",
output.status,
String::from_utf8_lossy(&output.stderr),
);
let stdout = String::from_utf8_lossy(&output.stdout);
let report: serde_json::Value = serde_json::from_str(&stdout)
.unwrap_or_else(|e| panic!("vuln stdout must be valid JSON; err: {}; stdout: {}", e, stdout));
let files_skipped = report
.get("files_skipped")
.and_then(|v| v.as_u64())
.unwrap_or(0);
assert_eq!(
files_skipped, 1,
"vuln must report files_skipped=1 for the synthetic bad file; report={}",
report
);
let warnings = report
.get("warnings")
.and_then(|v| v.as_array())
.cloned()
.unwrap_or_default();
assert_eq!(
warnings.len(),
1,
"vuln must emit exactly one warning for the bad file; warnings={:?}",
warnings
);
let warning = warnings[0].as_str().unwrap_or("");
assert!(
warning.contains("bad.py"),
"warning must reference the skipped file path; got: {}",
warning
);
assert!(
warning.contains("invalid UTF-8") || warning.contains("byte"),
"warning must describe the UTF-8 failure with a byte offset; got: {}",
warning
);
}