Skip to main content

contextual_encoder/
java.rs

//! java string literal encoder.
//!
//! encodes untrusted strings for safe embedding in java string literals.
//!
//! - [`for_java`] — safe for java string and char literal contexts
//!
//! # encoding rules
//!
//! - named escapes: `\b`, `\t`, `\n`, `\f`, `\r`, `\"`, `\'`, `\\`
//! - other C0 controls and DEL → octal escapes (shortest form, or 3-digit
//!   when the next character is an octal digit to avoid ambiguity)
//! - U+2028, U+2029 → `\u2028`, `\u2029` (java line terminators)
//! - supplementary plane characters (U+10000+) → UTF-16 surrogate pairs
//!   (`\uHHHH\uHHHH`)
//! - unicode non-characters → space
16
17use std::fmt;
18
19use crate::engine::{encode_loop, is_unicode_noncharacter};
20
21/// encodes `input` for safe embedding in a java string literal.
22///
23/// produces output suitable for embedding between double quotes in java
24/// source code. also safe for char literals (single quotes are escaped).
25///
26/// # encoding rules
27///
28/// | input | output |
29/// |-------|--------|
30/// | C0 named (`\b`, `\t`, `\n`, `\f`, `\r`) | named escape |
31/// | `"`, `'`, `\` | `\"`, `\'`, `\\` |
32/// | other C0 controls, DEL | octal escape |
33/// | U+2028, U+2029 | `\u2028`, `\u2029` |
34/// | supplementary plane (U+10000+) | surrogate pair `\uHHHH\uHHHH` |
35/// | unicode non-characters | space |
36///
37/// octal escapes use the shortest form (`\0` for NUL) unless the next
38/// character is an octal digit, in which case the 3-digit form is used
39/// (`\000`) to prevent ambiguity.
40///
41/// # examples
42///
43/// ```
44/// use contextual_encoder::for_java;
45///
46/// assert_eq!(for_java(r#"he said "hello""#), r#"he said \"hello\""#);
47/// assert_eq!(for_java("line\nbreak"), r"line\nbreak");
48/// assert_eq!(for_java("null\x00byte"), r"null\0byte");
49/// assert_eq!(for_java("\x007"), r"\0007");
50/// ```
51pub fn for_java(input: &str) -> String {
52    let mut out = String::with_capacity(input.len());
53    write_java(&mut out, input).expect("writing to string cannot fail");
54    out
55}
56
57/// writes the java-encoded form of `input` to `out`.
58///
59/// see [`for_java`] for encoding rules.
60pub fn write_java<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
61    encode_loop(out, input, needs_java_encoding, write_java_encoded)
62}
63
64fn needs_java_encoding(c: char) -> bool {
65    match c {
66        '\x00'..='\x1F' | '\x7F' | '"' | '\'' | '\\' | '\u{2028}' | '\u{2029}' => true,
67        c if (c as u32) >= 0x10000 => true,
68        c if is_unicode_noncharacter(c as u32) => true,
69        _ => false,
70    }
71}
72
73fn write_java_encoded<W: fmt::Write>(out: &mut W, c: char, next: Option<char>) -> fmt::Result {
74    match c {
75        '\x08' => out.write_str("\\b"),
76        '\t' => out.write_str("\\t"),
77        '\n' => out.write_str("\\n"),
78        '\x0C' => out.write_str("\\f"),
79        '\r' => out.write_str("\\r"),
80        '"' => out.write_str("\\\""),
81        '\'' => out.write_str("\\'"),
82        '\\' => out.write_str("\\\\"),
83        '\u{2028}' => out.write_str("\\u2028"),
84        '\u{2029}' => out.write_str("\\u2029"),
85        c if is_unicode_noncharacter(c as u32) => out.write_char(' '),
86        // supplementary plane → UTF-16 surrogate pair
87        c if (c as u32) >= 0x10000 => {
88            let cp = c as u32 - 0x10000;
89            let high = 0xD800 + (cp >> 10);
90            let low = 0xDC00 + (cp & 0x3FF);
91            write!(out, "\\u{high:04x}\\u{low:04x}")
92        }
93        // C0 controls (without named escapes) and DEL → octal
94        c => {
95            let val = c as u32;
96            let next_is_octal = next.is_some_and(|n| ('0'..='7').contains(&n));
97            if next_is_octal {
98                write!(out, "\\{val:03o}")
99            } else {
100                write!(out, "\\{val:o}")
101            }
102        }
103    }
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    /// runs [`for_java`] on every `(input, expected)` pair and asserts that
    /// the encoded output equals `expected`.
    fn assert_encodes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(for_java(input), expected);
        }
    }

    #[test]
    fn passthrough() {
        assert_encodes(&[
            ("hello world", "hello world"),
            ("", ""),
            ("café", "café"),
        ]);
    }

    #[test]
    fn named_escapes() {
        assert_encodes(&[
            ("\x08", "\\b"),
            ("\t", "\\t"),
            ("\n", "\\n"),
            ("\x0C", "\\f"),
            ("\r", "\\r"),
        ]);
    }

    #[test]
    fn quotes_and_backslash() {
        assert_encodes(&[
            (r#"a"b"#, r#"a\"b"#),
            ("a'b", r"a\'b"),
            (r"a\b", r"a\\b"),
        ]);
    }

    #[test]
    fn octal_shortest_form() {
        // each control character is followed by a non-octal character, so
        // the escape keeps its shortest form.
        assert_encodes(&[
            // NUL
            ("\x00a", "\\0a"),
            // SOH
            ("\x01a", "\\1a"),
            // BEL
            ("\x07a", "\\7a"),
            // VT (0x0B = 0o13)
            ("\x0Ba", "\\13a"),
            // DEL (0x7F = 0o177)
            ("\x7Fa", "\\177a"),
        ]);
    }

    #[test]
    fn octal_three_digit_before_octal_char() {
        // a control character followed by an octal digit is padded to the
        // 3-digit form.
        assert_encodes(&[
            ("\x000", "\\0000"),
            ("\x007", "\\0007"),
            ("\x015", "\\0015"),
        ]);
    }

    #[test]
    fn octal_at_end_of_input() {
        // nothing follows the control character → shortest form.
        assert_encodes(&[("\x00", "\\0"), ("\x07", "\\7"), ("\x7F", "\\177")]);
    }

    #[test]
    fn line_separators() {
        assert_encodes(&[("\u{2028}", "\\u2028"), ("\u{2029}", "\\u2029")]);
    }

    #[test]
    fn supplementary_plane_surrogate_pairs() {
        assert_encodes(&[
            // U+1F600 (GRINNING FACE): 0x1F600 - 0x10000 = 0xF600
            // high = 0xD800 + (0xF600 >> 10) = 0xD800 + 0x3D = 0xD83D
            // low  = 0xDC00 + (0xF600 & 0x3FF) = 0xDC00 + 0x200 = 0xDE00
            ("\u{1F600}", "\\ud83d\\ude00"),
            // U+10000 (LINEAR B SYLLABLE B008 A)
            // high = 0xD800, low = 0xDC00
            ("\u{10000}", "\\ud800\\udc00"),
            // U+10FFFD (last non-char-adjacent codepoint)
            // 0x10FFFD - 0x10000 = 0xFFFD
            // high = 0xD800 + (0xFFFD >> 10) = 0xD800 + 0x3FF = 0xDBFF
            // low  = 0xDC00 + (0xFFFD & 0x3FF) = 0xDC00 + 0x3FD = 0xDFFD
            ("\u{10FFFD}", "\\udbff\\udffd"),
        ]);
    }

    #[test]
    fn noncharacters_replaced_with_space() {
        assert_encodes(&[("\u{FDD0}", " "), ("\u{FFFE}", " ")]);
    }

    #[test]
    fn mixed_input() {
        assert_encodes(&[(
            "he said \"hello\"\nnew line",
            "he said \\\"hello\\\"\\nnew line",
        )]);
    }

    #[test]
    fn writer_matches_string() {
        // the writer-based entry point must produce exactly what the
        // string-returning one does.
        let input = "test\x00\"\\\u{1F600}";
        let mut via_writer = String::new();
        write_java(&mut via_writer, input).unwrap();
        assert_eq!(for_java(input), via_writer);
    }
}