contextual_encoder/lib.rs
1#![forbid(unsafe_code)]
2
3//! contextual output encoding for XSS defense and safe literal embedding.
4//!
5//! this crate provides context-aware encoding functions inspired by the
6//! [OWASP Java Encoder](https://owasp.org/owasp-java-encoder/). each function
7//! encodes input for safe embedding in a specific output context — web contexts
//! (HTML, XML, JavaScript, CSS, URI) and literal contexts (JSON, Java, Go,
//! Rust, Ruby, Python, SQL).
10//!
11//! **disclaimer:** contextual-encoder is an independent Rust crate. its API and security model
12//! are inspired by the OWASP Java Encoder, but this project is not affiliated with,
13//! endorsed by, or maintained by the OWASP Foundation.
14//!
15//! # quick start
16//!
17//! ```
18//! use contextual_encoder::{for_html, for_javascript, for_css_string, for_uri_component};
19//!
20//! let user_input = "<script>alert('xss')</script>";
21//!
22//! // safe for HTML text content and quoted attributes
23//! let html_safe = for_html(user_input);
//! assert!(html_safe.contains("&lt;script&gt;"));
25//!
26//! // safe for javascript string literals (universal)
27//! let js_safe = for_javascript(user_input);
28//! assert!(js_safe.contains(r"<\/script>"));
29//!
30//! // safe for quoted CSS string values
31//! let css_safe = for_css_string(user_input);
32//! assert!(css_safe.contains(r"\3c"));
33//!
34//! // safe as a URI query parameter value
35//! let uri_safe = for_uri_component(user_input);
36//! assert!(uri_safe.contains("%3C"));
37//! ```
38//!
39//! # available contexts
40//!
41//! ## HTML
42//!
43//! | function | safe for |
44//! |----------|----------|
45//! | [`for_html`] | text content + quoted attributes |
46//! | [`for_html_content`] | text content only |
47//! | [`for_html_attribute`] | quoted attributes only |
48//! | [`for_html_unquoted_attribute`] | unquoted attribute values |
49//!
50//! ## XML
51//!
52//! | function | safe for |
53//! |----------|----------|
54//! | [`for_xml`] | XML text content + quoted attributes (alias for `for_html`) |
55//! | [`for_xml_content`] | XML text content only (alias for `for_html_content`) |
56//! | [`for_xml_attribute`] | quoted XML attributes only (alias for `for_html_attribute`) |
57//! | [`for_xml_comment`] | XML comment content |
58//! | [`for_cdata`] | CDATA section content |
59//!
60//! ## XML 1.1
61//!
62//! | function | safe for |
63//! |----------|----------|
64//! | [`for_xml11`] | XML 1.1 content + quoted attributes |
65//! | [`for_xml11_content`] | XML 1.1 content only |
66//! | [`for_xml11_attribute`] | XML 1.1 quoted attributes only |
67//!
68//! ## JavaScript
69//!
70//! | function | safe for |
71//! |----------|----------|
72//! | [`for_javascript`] | general JS string contexts |
73//! | [`for_javascript_attribute`] | HTML event attributes |
74//! | [`for_javascript_block`] | `<script>` blocks |
75//! | [`for_javascript_source`] | standalone .js files |
76//! | [`for_js_template`] | ES6 template literal content (`` `...` ``) |
77//!
78//! ## CSS
79//!
80//! | function | safe for |
81//! |----------|----------|
82//! | [`for_css_string`] | quoted CSS string values |
83//! | [`for_css_url`] | CSS `url()` values |
84//!
85//! ## URI
86//!
87//! | function | safe for |
88//! |----------|----------|
89//! | [`for_uri_component`] | URI components (query params, path segments) |
90//!
91//! ## additional literal contexts
92//!
93//! these encoders are not part of the OWASP Java Encoder's scope. they encode
94//! untrusted strings for safe embedding in source code literals.
95//!
96//! | function | safe for |
97//! |----------|----------|
98//! | [`for_json`] | JSON string values |
99//! | [`for_java`] | Java string / char literals |
100//! | [`for_go_string`] | Go interpreted string literals (`"..."`) |
101//! | [`for_go_char`] | Go rune literals (`'...'`) |
102//! | [`for_go_byte_string`] | Go byte-explicit string literals (`[]byte("...")`) |
103//! | [`for_rust_string`] | Rust string literals (`"..."`) |
104//! | [`for_rust_char`] | Rust char literals (`'...'`) |
105//! | [`for_rust_byte_string`] | Rust byte string literals (`b"..."`) |
106//! | [`for_ruby_string`] | Ruby double-quoted string literals (`"..."`) |
107//! | [`for_python_string`] | Python string literals (`"..."` or `'...'`) |
108//! | [`for_python_bytes`] | Python bytes literals (`b"..."` or `b'...'`) |
109//! | [`for_python_raw_string`] | Python raw string literals (`r"..."` or `r'...'`) |
110//! | [`for_sql`] | Standard SQL string literals (`'...'`) |
111//! | [`for_sql_backslash`] | MySQL/MariaDB string literals with backslash escaping (`'...'`) |
112//!
113//! # security model
114//!
115//! this is a **contextual output encoder**, not a sanitizer. it prevents
116//! cross-site scripting by encoding output for specific contexts, but it
117//! does not validate or sanitize input.
118//!
119//! **important caveats:**
120//!
//! - **encoding is not sanitization.** encoding `<script>` as `&lt;script&gt;`
122//! makes it display safely in HTML, but does not remove it. if you need to
123//! allow a subset of HTML, use a dedicated sanitizer.
124//! - **context matters.** using the wrong encoder for a context can leave
125//! you vulnerable. `for_html_content` output is not safe in attributes.
126//! - **tag and attribute names cannot be encoded.** never pass untrusted data
127//! as a tag name, attribute name, or event handler name. validate these
128//! against a whitelist.
129//! - **full URLs must be validated separately.** `for_uri_component` encodes
130//! a component, not a full URL. to embed an untrusted URL, validate its
131//! scheme and structure first, then encode for the final sink.
132//! - **template literals.** the string literal JavaScript encoders do not
133//! encode backticks. use [`for_js_template`] to embed data directly in
134//! ES2015+ template literals.
135//! - **grave accent.** unpatched Internet Explorer treats `` ` `` as an
136//! attribute delimiter. `for_html_unquoted_attribute` encodes it, but
137//! numeric entities decode back to the original character, so this is
138//! not a complete fix. avoid unquoted attributes.
139//! - **HTML comments.** no HTML comment encoder is provided because HTML
140//! comments have vendor-specific extensions (e.g., conditional comments)
141//! that make safe encoding impractical. [`for_xml_comment`] is for XML
142//! comments only.
143//!
144//! # writer-based API
145//!
146//! every `for_*` function has a corresponding `write_*` function that writes
147//! to any `std::fmt::Write` implementor, avoiding allocation when writing to
148//! an existing buffer:
149//!
150//! ```
151//! use contextual_encoder::write_html;
152//!
153//! let mut buf = String::new();
154//! write_html(&mut buf, "safe & sound").unwrap();
//! assert_eq!(buf, "safe &amp; sound");
156//! ```
157//!
158//! # display wrappers
159//!
160//! every `for_*` function also has a corresponding `display_*` function that
161//! returns a zero-allocation [`Display`](std::fmt::Display) wrapper. use these
162//! when embedding encoded output inline in `format!` or `write!`:
163//!
164//! ```
165//! use contextual_encoder::display_html;
166//!
167//! let user_input = "<script>alert('xss')</script>";
168//! // one allocation (the final String), zero intermediate allocations
169//! let safe = format!("<p>{}</p>", display_html(user_input));
//! assert!(safe.contains("&lt;script&gt;"));
171//! ```
172
173pub mod css;
174pub mod display;
175pub mod go;
176pub mod html;
177pub mod java;
178pub mod javascript;
179pub mod json;
180pub mod python;
181pub mod ruby;
182pub mod rust;
183pub mod sql;
184pub mod uri;
185pub mod xml;
186
187mod engine;
188
189// convenience re-exports — users can `use contextual_encoder::for_html` directly
190pub use css::{for_css_string, for_css_url, write_css_string, write_css_url};
191pub use display::{
192 display_cdata, display_css_string, display_css_url, display_go_byte_string, display_go_char,
193 display_go_string, display_html, display_html_attribute, display_html_content,
194 display_html_unquoted_attribute, display_java, display_javascript,
195 display_javascript_attribute, display_javascript_block, display_javascript_source,
196 display_js_template, display_json, display_python_bytes, display_python_raw_string,
197 display_python_string, display_ruby_string, display_rust_byte_string, display_rust_char,
198 display_rust_string, display_sql, display_sql_backslash, display_uri_component, display_xml,
199 display_xml11, display_xml11_attribute, display_xml11_content, display_xml_attribute,
200 display_xml_comment, display_xml_content,
201};
202pub use go::{
203 for_go_byte_string, for_go_char, for_go_string, write_go_byte_string, write_go_char,
204 write_go_string,
205};
206pub use html::{
207 for_html, for_html_attribute, for_html_content, for_html_unquoted_attribute, write_html,
208 write_html_attribute, write_html_content, write_html_unquoted_attribute,
209};
210pub use java::{for_java, write_java};
211pub use javascript::{
212 for_javascript, for_javascript_attribute, for_javascript_block, for_javascript_source,
213 for_js_template, write_javascript, write_javascript_attribute, write_javascript_block,
214 write_javascript_source, write_js_template,
215};
216pub use json::{for_json, write_json};
217pub use python::{
218 for_python_bytes, for_python_raw_string, for_python_string, write_python_bytes,
219 write_python_raw_string, write_python_string,
220};
221pub use ruby::{for_ruby_string, write_ruby_string};
222pub use rust::{
223 for_rust_byte_string, for_rust_char, for_rust_string, write_rust_byte_string, write_rust_char,
224 write_rust_string,
225};
226pub use sql::{for_sql, for_sql_backslash, write_sql, write_sql_backslash};
227pub use uri::{for_uri_component, write_uri_component};
228pub use xml::{
229 for_cdata, for_xml, for_xml11, for_xml11_attribute, for_xml11_content, for_xml_attribute,
230 for_xml_comment, for_xml_content, write_cdata, write_xml, write_xml11, write_xml11_attribute,
231 write_xml11_content, write_xml_attribute, write_xml_comment, write_xml_content,
232};
233
// crate-root smoke tests. every call goes through the convenience re-exports
// above, so these tests also confirm that the re-exported names resolve. two
// cross-cutting properties are checked: empty input produces empty output,
// and multibyte UTF-8 input is handled as each context requires.
#[cfg(test)]
mod tests {
    use super::*;

    // every `for_*` encoder must map the empty string to the empty string.
    #[test]
    fn empty_string_returns_empty() {
        assert_eq!(for_html(""), "");
        assert_eq!(for_html_content(""), "");
        assert_eq!(for_html_attribute(""), "");
        assert_eq!(for_html_unquoted_attribute(""), "");
        assert_eq!(for_javascript(""), "");
        assert_eq!(for_javascript_attribute(""), "");
        assert_eq!(for_javascript_block(""), "");
        assert_eq!(for_javascript_source(""), "");
        assert_eq!(for_css_string(""), "");
        assert_eq!(for_css_url(""), "");
        assert_eq!(for_uri_component(""), "");
        assert_eq!(for_xml(""), "");
        assert_eq!(for_xml_content(""), "");
        assert_eq!(for_xml_attribute(""), "");
        assert_eq!(for_xml_comment(""), "");
        assert_eq!(for_cdata(""), "");
        assert_eq!(for_xml11(""), "");
        assert_eq!(for_xml11_content(""), "");
        assert_eq!(for_xml11_attribute(""), "");
        assert_eq!(for_java(""), "");
        assert_eq!(for_json(""), "");
        assert_eq!(for_go_string(""), "");
        assert_eq!(for_go_char(""), "");
        assert_eq!(for_go_byte_string(""), "");
        assert_eq!(for_rust_string(""), "");
        assert_eq!(for_rust_char(""), "");
        assert_eq!(for_rust_byte_string(""), "");
        assert_eq!(for_ruby_string(""), "");
        assert_eq!(for_python_string(""), "");
        assert_eq!(for_python_bytes(""), "");
        assert_eq!(for_python_raw_string(""), "");
        assert_eq!(for_js_template(""), "");
        assert_eq!(for_sql(""), "");
        assert_eq!(for_sql_backslash(""), "");
    }

    // a sample of the `write_*` variants must leave the buffer empty when
    // given empty input. the buffer is cleared between calls so each check
    // is independent.
    #[test]
    fn empty_string_writer_variants() {
        let mut buf = String::new();
        write_html(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_javascript(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_css_string(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_uri_component(&mut buf, "").unwrap();
        assert_eq!(buf, "");
    }

    // sample characters used by the multibyte tests below:
    // two-byte: é (U+00E9), ñ (U+00F1)
    // three-byte: 世 (U+4E16), € (U+20AC)
    // four-byte: 😀 (U+1F600), 𐍈 (U+10348)

    #[test]
    fn multibyte_utf8_html() {
        // non-ASCII characters pass through unchanged
        assert_eq!(for_html("café"), "café");
        assert_eq!(for_html("世界"), "世界");
        assert_eq!(for_html("😀"), "😀");
        // markup-significant ASCII is still entity-encoded when it sits
        // between multibyte characters
        assert_eq!(for_html("é<世>&😀"), "é&lt;世&gt;&amp;😀");
    }

    #[test]
    fn multibyte_utf8_javascript() {
        assert_eq!(for_javascript("café"), "café");
        assert_eq!(for_javascript("世界"), "世界");
        assert_eq!(for_javascript("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_css_string() {
        assert_eq!(for_css_string("café"), "café");
        assert_eq!(for_css_string("世界"), "世界");
        assert_eq!(for_css_string("😀"), "😀");
    }

    // URI components percent-encode each UTF-8 byte of a non-ASCII character.
    #[test]
    fn multibyte_utf8_uri_component() {
        assert_eq!(for_uri_component("é"), "%C3%A9");
        assert_eq!(for_uri_component("世"), "%E4%B8%96");
        assert_eq!(for_uri_component("😀"), "%F0%9F%98%80");
        assert_eq!(for_uri_component("café"), "caf%C3%A9");
    }

    #[test]
    fn multibyte_utf8_go_string_passthrough() {
        assert_eq!(for_go_string("caf\u{00e9}"), "caf\u{00e9}");
        assert_eq!(for_go_string("\u{4e16}\u{754c}"), "\u{4e16}\u{754c}");
        assert_eq!(for_go_string("\u{1F600}"), "\u{1F600}");
    }

    // byte-oriented literal contexts emit one `\xNN` escape per UTF-8 byte.
    #[test]
    fn multibyte_utf8_go_byte_string() {
        assert_eq!(for_go_byte_string("\u{00e9}"), r"\xc3\xa9");
        assert_eq!(for_go_byte_string("\u{4e16}"), r"\xe4\xb8\x96");
        assert_eq!(for_go_byte_string("\u{1F600}"), r"\xf0\x9f\x98\x80");
    }

    #[test]
    fn multibyte_utf8_rust_byte_string() {
        assert_eq!(for_rust_byte_string("é"), r"\xc3\xa9");
        assert_eq!(for_rust_byte_string("世"), r"\xe4\xb8\x96");
        assert_eq!(for_rust_byte_string("😀"), r"\xf0\x9f\x98\x80");
    }

    #[test]
    fn multibyte_utf8_rust_string_passthrough() {
        assert_eq!(for_rust_string("café"), "café");
        assert_eq!(for_rust_string("世界"), "世界");
        assert_eq!(for_rust_string("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_ruby_string_passthrough() {
        assert_eq!(for_ruby_string("café"), "café");
        assert_eq!(for_ruby_string("世界"), "世界");
        assert_eq!(for_ruby_string("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_python_string_passthrough() {
        assert_eq!(for_python_string("café"), "café");
        assert_eq!(for_python_string("世界"), "世界");
        assert_eq!(for_python_string("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_python_bytes() {
        assert_eq!(for_python_bytes("\u{00e9}"), r"\xc3\xa9");
        assert_eq!(for_python_bytes("\u{4e16}"), r"\xe4\xb8\x96");
        assert_eq!(for_python_bytes("\u{1F600}"), r"\xf0\x9f\x98\x80");
    }

    #[test]
    fn multibyte_utf8_python_raw_string_passthrough() {
        assert_eq!(for_python_raw_string("café"), "café");
        assert_eq!(for_python_raw_string("世界"), "世界");
        assert_eq!(for_python_raw_string("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_json() {
        assert_eq!(for_json("café"), "café");
        assert_eq!(for_json("世界"), "世界");
        assert_eq!(for_json("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_java() {
        assert_eq!(for_java("café"), "café");
        assert_eq!(for_java("世界"), "世界");
        // U+1F600 is outside the BMP; the expected output is its UTF-16
        // surrogate pair written as two `\uXXXX` escapes
        assert_eq!(for_java("😀"), "\\ud83d\\ude00");
    }

    #[test]
    fn multibyte_utf8_sql() {
        assert_eq!(for_sql("café"), "café");
        assert_eq!(for_sql("世界"), "世界");
        assert_eq!(for_sql("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_sql_backslash() {
        assert_eq!(for_sql_backslash("café"), "café");
        assert_eq!(for_sql_backslash("世界"), "世界");
        assert_eq!(for_sql_backslash("😀"), "😀");
    }

    #[test]
    fn multibyte_utf8_xml() {
        assert_eq!(for_xml("café"), "café");
        assert_eq!(for_xml("世界"), "世界");
        assert_eq!(for_xml("😀"), "😀");
    }
}