contextual_encoder/lib.rs
1#![forbid(unsafe_code)]
2
3//! contextual output encoding for XSS defense and safe literal embedding.
4//!
5//! this crate provides context-aware encoding functions inspired by the
6//! [OWASP Java Encoder](https://owasp.org/owasp-java-encoder/). each function
7//! encodes input for safe embedding in a specific output context — web contexts
8//! (HTML, XML, JavaScript, CSS, URI) and source literal contexts (Java, Rust,
9//! Ruby).
10//!
11//! **disclaimer:** contextual-encoder is an independent Rust crate. its API and security model
12//! are inspired by the OWASP Java Encoder, but this project is not affiliated with,
13//! endorsed by, or maintained by the OWASP Foundation.
14//!
15//! # quick start
16//!
17//! ```
18//! use contextual_encoder::{for_html, for_javascript, for_css_string, for_uri_component};
19//!
20//! let user_input = "<script>alert('xss')</script>";
21//!
22//! // safe for HTML text content and quoted attributes
23//! let html_safe = for_html(user_input);
//! assert!(html_safe.contains("&lt;script&gt;"));
25//!
26//! // safe for javascript string literals (universal)
27//! let js_safe = for_javascript(user_input);
28//! assert!(js_safe.contains(r"<\/script>"));
29//!
30//! // safe for quoted CSS string values
31//! let css_safe = for_css_string(user_input);
32//! assert!(css_safe.contains(r"\3c"));
33//!
34//! // safe as a URI query parameter value
35//! let uri_safe = for_uri_component(user_input);
36//! assert!(uri_safe.contains("%3C"));
37//! ```
38//!
39//! # available contexts
40//!
41//! ## HTML
42//!
43//! | function | safe for |
44//! |----------|----------|
45//! | [`for_html`] | text content + quoted attributes |
46//! | [`for_html_content`] | text content only |
47//! | [`for_html_attribute`] | quoted attributes only |
48//! | [`for_html_unquoted_attribute`] | unquoted attribute values |
49//!
50//! ## XML
51//!
52//! | function | safe for |
53//! |----------|----------|
54//! | [`for_xml`] | XML text content + quoted attributes (alias for `for_html`) |
55//! | [`for_xml_content`] | XML text content only (alias for `for_html_content`) |
56//! | [`for_xml_attribute`] | quoted XML attributes only (alias for `for_html_attribute`) |
57//! | [`for_xml_comment`] | XML comment content |
58//! | [`for_cdata`] | CDATA section content |
59//!
60//! ## XML 1.1
61//!
62//! | function | safe for |
63//! |----------|----------|
64//! | [`for_xml11`] | XML 1.1 content + quoted attributes |
65//! | [`for_xml11_content`] | XML 1.1 content only |
66//! | [`for_xml11_attribute`] | XML 1.1 quoted attributes only |
67//!
68//! ## JavaScript
69//!
70//! | function | safe for |
71//! |----------|----------|
72//! | [`for_javascript`] | general JS string contexts |
73//! | [`for_javascript_attribute`] | HTML event attributes |
74//! | [`for_javascript_block`] | `<script>` blocks |
75//! | [`for_javascript_source`] | standalone .js files |
76//! | [`for_js_template`] | ES6 template literal content (`` `...` ``) |
77//!
78//! ## CSS
79//!
80//! | function | safe for |
81//! |----------|----------|
82//! | [`for_css_string`] | quoted CSS string values |
83//! | [`for_css_url`] | CSS `url()` values |
84//!
85//! ## URI
86//!
87//! | function | safe for |
88//! |----------|----------|
89//! | [`for_uri_component`] | URI components (query params, path segments) |
90//!
91//! ## additional literal contexts
92//!
93//! these encoders are not part of the OWASP Java Encoder's scope. they encode
94//! untrusted strings for safe embedding in source code literals.
95//!
96//! | function | safe for |
97//! |----------|----------|
98//! | [`for_json`] | JSON string values |
99//! | [`for_java`] | Java string / char literals |
100//! | [`for_go_string`] | Go interpreted string literals (`"..."`) |
101//! | [`for_go_char`] | Go rune literals (`'...'`) |
102//! | [`for_go_byte_string`] | Go byte-explicit string literals (`[]byte("...")`) |
103//! | [`for_rust_string`] | Rust string literals (`"..."`) |
104//! | [`for_rust_char`] | Rust char literals (`'...'`) |
105//! | [`for_rust_byte_string`] | Rust byte string literals (`b"..."`) |
106//! | [`for_ruby_string`] | Ruby double-quoted string literals (`"..."`) |
107//! | [`for_python_string`] | Python string literals (`"..."` or `'...'`) |
108//! | [`for_python_bytes`] | Python bytes literals (`b"..."` or `b'...'`) |
109//! | [`for_python_raw_string`] | Python raw string literals (`r"..."` or `r'...'`) |
110//! | [`for_sql`] | Standard SQL string literals (`'...'`) |
111//! | [`for_sql_backslash`] | MySQL/MariaDB string literals with backslash escaping (`'...'`) |
112//!
113//! # security model
114//!
115//! this is a **contextual output encoder**, not a sanitizer. it prevents
116//! cross-site scripting by encoding output for specific contexts, but it
117//! does not validate or sanitize input.
118//!
119//! **important caveats:**
120//!
//! - **encoding is not sanitization.** encoding `<script>` as `&lt;script&gt;`
122//! makes it display safely in HTML, but does not remove it. if you need to
123//! allow a subset of HTML, use a dedicated sanitizer.
124//! - **context matters.** using the wrong encoder for a context can leave
125//! you vulnerable. `for_html_content` output is not safe in attributes.
126//! - **tag and attribute names cannot be encoded.** never pass untrusted data
127//! as a tag name, attribute name, or event handler name. validate these
128//! against a whitelist.
129//! - **full URLs must be validated separately.** `for_uri_component` encodes
130//! a component, not a full URL. to embed an untrusted URL, validate its
131//! scheme and structure first, then encode for the final sink.
132//! - **template literals.** the string literal JavaScript encoders do not
133//! encode backticks. use [`for_js_template`] to embed data directly in
134//! ES2015+ template literals.
135//! - **grave accent.** unpatched Internet Explorer treats `` ` `` as an
136//! attribute delimiter. `for_html_unquoted_attribute` encodes it, but
137//! numeric entities decode back to the original character, so this is
138//! not a complete fix. avoid unquoted attributes.
139//! - **HTML comments.** no HTML comment encoder is provided because HTML
140//! comments have vendor-specific extensions (e.g., conditional comments)
141//! that make safe encoding impractical. [`for_xml_comment`] is for XML
142//! comments only.
143//!
144//! # writer-based API
145//!
146//! every `for_*` function has a corresponding `write_*` function that writes
147//! to any `std::fmt::Write` implementor, avoiding allocation when writing to
148//! an existing buffer:
149//!
150//! ```
151//! use contextual_encoder::write_html;
152//!
153//! let mut buf = String::new();
154//! write_html(&mut buf, "safe & sound").unwrap();
//! assert_eq!(buf, "safe &amp; sound");
156//! ```
157
158pub mod css;
159pub mod go;
160pub mod html;
161pub mod java;
162pub mod javascript;
163pub mod json;
164pub mod python;
165pub mod ruby;
166pub mod rust;
167pub mod sql;
168pub mod uri;
169pub mod xml;
170
171mod engine;
172
173// convenience re-exports — users can `use contextual_encoder::for_html` directly
174pub use css::{for_css_string, for_css_url, write_css_string, write_css_url};
175pub use go::{
176 for_go_byte_string, for_go_char, for_go_string, write_go_byte_string, write_go_char,
177 write_go_string,
178};
179pub use html::{
180 for_html, for_html_attribute, for_html_content, for_html_unquoted_attribute, write_html,
181 write_html_attribute, write_html_content, write_html_unquoted_attribute,
182};
183pub use java::{for_java, write_java};
184pub use javascript::{
185 for_javascript, for_javascript_attribute, for_javascript_block, for_javascript_source,
186 for_js_template, write_javascript, write_javascript_attribute, write_javascript_block,
187 write_javascript_source, write_js_template,
188};
189pub use json::{for_json, write_json};
190pub use python::{
191 for_python_bytes, for_python_raw_string, for_python_string, write_python_bytes,
192 write_python_raw_string, write_python_string,
193};
194pub use ruby::{for_ruby_string, write_ruby_string};
195pub use rust::{
196 for_rust_byte_string, for_rust_char, for_rust_string, write_rust_byte_string, write_rust_char,
197 write_rust_string,
198};
199pub use sql::{for_sql, for_sql_backslash, write_sql, write_sql_backslash};
200pub use uri::{for_uri_component, write_uri_component};
201pub use xml::{
202 for_cdata, for_xml, for_xml11, for_xml11_attribute, for_xml11_content, for_xml_attribute,
203 for_xml_comment, for_xml_content, write_cdata, write_xml, write_xml11, write_xml11_attribute,
204 write_xml11_content, write_xml_attribute, write_xml_comment, write_xml_content,
205};
206
/// crate-level smoke tests exercising the encoders through the root
/// re-exports: empty input handling and multi-byte UTF-8 handling.
#[cfg(test)]
mod tests {
    use super::*;

    /// every string-returning encoder re-exported at the crate root maps the
    /// empty string to the empty string.
    #[test]
    fn empty_string_returns_empty() {
        assert_eq!(for_html(""), "");
        assert_eq!(for_html_content(""), "");
        assert_eq!(for_html_attribute(""), "");
        assert_eq!(for_html_unquoted_attribute(""), "");
        assert_eq!(for_javascript(""), "");
        assert_eq!(for_javascript_attribute(""), "");
        assert_eq!(for_javascript_block(""), "");
        assert_eq!(for_javascript_source(""), "");
        assert_eq!(for_css_string(""), "");
        assert_eq!(for_css_url(""), "");
        assert_eq!(for_uri_component(""), "");
        assert_eq!(for_xml(""), "");
        assert_eq!(for_xml_content(""), "");
        assert_eq!(for_xml_attribute(""), "");
        assert_eq!(for_xml_comment(""), "");
        assert_eq!(for_cdata(""), "");
        assert_eq!(for_xml11(""), "");
        assert_eq!(for_xml11_content(""), "");
        assert_eq!(for_xml11_attribute(""), "");
        assert_eq!(for_java(""), "");
        assert_eq!(for_json(""), "");
        assert_eq!(for_go_string(""), "");
        assert_eq!(for_go_char(""), "");
        assert_eq!(for_go_byte_string(""), "");
        assert_eq!(for_rust_string(""), "");
        assert_eq!(for_rust_char(""), "");
        assert_eq!(for_rust_byte_string(""), "");
        assert_eq!(for_ruby_string(""), "");
        assert_eq!(for_python_string(""), "");
        assert_eq!(for_python_bytes(""), "");
        assert_eq!(for_python_raw_string(""), "");
        assert_eq!(for_js_template(""), "");
        assert_eq!(for_sql(""), "");
        assert_eq!(for_sql_backslash(""), "");
    }

    /// a sample of the `write_*` variants leaves the target buffer empty when
    /// given empty input. the buffer is cleared between calls so each
    /// assertion checks a single writer in isolation.
    #[test]
    fn empty_string_writer_variants() {
        let mut buf = String::new();
        write_html(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_javascript(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_css_string(&mut buf, "").unwrap();
        assert_eq!(buf, "");

        buf.clear();
        write_uri_component(&mut buf, "").unwrap();
        assert_eq!(buf, "");
    }

    // sample characters used by the multi-byte tests below:
    // two-byte: é (U+00E9), ñ (U+00F1)
    // three-byte: 世 (U+4E16), € (U+20AC)
    // four-byte: 😀 (U+1F600), 𐍈 (U+10348)

    /// multi-byte characters pass through the HTML encoder untouched, while
    /// markup-significant ASCII characters mixed in with them are still
    /// replaced by entities.
    #[test]
    fn multibyte_utf8_html() {
        assert_eq!(for_html("café"), "café");
        assert_eq!(for_html("世界"), "世界");
        assert_eq!(for_html("😀"), "😀");
        // `<`, `>` and `&` must come out as entities; expecting the raw input
        // back here would contradict the encoder's documented contract.
        assert_eq!(for_html("é<世>&😀"), "é&lt;世&gt;&amp;😀");
    }

    /// multi-byte characters pass through the JavaScript encoder unchanged.
    #[test]
    fn multibyte_utf8_javascript() {
        assert_eq!(for_javascript("café"), "café");
        assert_eq!(for_javascript("世界"), "世界");
        assert_eq!(for_javascript("😀"), "😀");
    }

    /// multi-byte characters pass through the CSS string encoder unchanged.
    #[test]
    fn multibyte_utf8_css_string() {
        assert_eq!(for_css_string("café"), "café");
        assert_eq!(for_css_string("世界"), "世界");
        assert_eq!(for_css_string("😀"), "😀");
    }

    /// the URI component encoder percent-encodes each UTF-8 byte of a
    /// multi-byte character and leaves unreserved ASCII letters alone.
    #[test]
    fn multibyte_utf8_uri_component() {
        assert_eq!(for_uri_component("é"), "%C3%A9");
        assert_eq!(for_uri_component("世"), "%E4%B8%96");
        assert_eq!(for_uri_component("😀"), "%F0%9F%98%80");
        assert_eq!(for_uri_component("café"), "caf%C3%A9");
    }

    /// multi-byte characters pass through the Go string encoder unchanged.
    #[test]
    fn multibyte_utf8_go_string_passthrough() {
        assert_eq!(for_go_string("caf\u{00e9}"), "caf\u{00e9}");
        assert_eq!(for_go_string("\u{4e16}\u{754c}"), "\u{4e16}\u{754c}");
        assert_eq!(for_go_string("\u{1F600}"), "\u{1F600}");
    }

    /// the Go byte-string encoder emits one lowercase `\xNN` escape per
    /// UTF-8 byte of a multi-byte character.
    #[test]
    fn multibyte_utf8_go_byte_string() {
        assert_eq!(for_go_byte_string("\u{00e9}"), r"\xc3\xa9");
        assert_eq!(for_go_byte_string("\u{4e16}"), r"\xe4\xb8\x96");
        assert_eq!(for_go_byte_string("\u{1F600}"), r"\xf0\x9f\x98\x80");
    }

    /// the Rust byte-string encoder emits one lowercase `\xNN` escape per
    /// UTF-8 byte of a multi-byte character.
    #[test]
    fn multibyte_utf8_rust_byte_string() {
        assert_eq!(for_rust_byte_string("é"), r"\xc3\xa9");
        assert_eq!(for_rust_byte_string("世"), r"\xe4\xb8\x96");
        assert_eq!(for_rust_byte_string("😀"), r"\xf0\x9f\x98\x80");
    }

    /// multi-byte characters pass through the Rust string encoder unchanged.
    #[test]
    fn multibyte_utf8_rust_string_passthrough() {
        assert_eq!(for_rust_string("café"), "café");
        assert_eq!(for_rust_string("世界"), "世界");
        assert_eq!(for_rust_string("😀"), "😀");
    }

    /// multi-byte characters pass through the Ruby string encoder unchanged.
    #[test]
    fn multibyte_utf8_ruby_string_passthrough() {
        assert_eq!(for_ruby_string("café"), "café");
        assert_eq!(for_ruby_string("世界"), "世界");
        assert_eq!(for_ruby_string("😀"), "😀");
    }

    /// multi-byte characters pass through the Python string encoder unchanged.
    #[test]
    fn multibyte_utf8_python_string_passthrough() {
        assert_eq!(for_python_string("café"), "café");
        assert_eq!(for_python_string("世界"), "世界");
        assert_eq!(for_python_string("😀"), "😀");
    }

    /// the Python bytes encoder emits one lowercase `\xNN` escape per UTF-8
    /// byte of a multi-byte character.
    #[test]
    fn multibyte_utf8_python_bytes() {
        assert_eq!(for_python_bytes("\u{00e9}"), r"\xc3\xa9");
        assert_eq!(for_python_bytes("\u{4e16}"), r"\xe4\xb8\x96");
        assert_eq!(for_python_bytes("\u{1F600}"), r"\xf0\x9f\x98\x80");
    }

    /// multi-byte characters pass through the Python raw-string encoder
    /// unchanged.
    #[test]
    fn multibyte_utf8_python_raw_string_passthrough() {
        assert_eq!(for_python_raw_string("café"), "café");
        assert_eq!(for_python_raw_string("世界"), "世界");
        assert_eq!(for_python_raw_string("😀"), "😀");
    }

    /// multi-byte characters pass through the JSON encoder unchanged.
    #[test]
    fn multibyte_utf8_json() {
        assert_eq!(for_json("café"), "café");
        assert_eq!(for_json("世界"), "世界");
        assert_eq!(for_json("😀"), "😀");
    }

    /// the Java encoder passes BMP characters through but writes a
    /// supplementary-plane character as a pair of `\uXXXX` surrogate escapes.
    #[test]
    fn multibyte_utf8_java() {
        assert_eq!(for_java("café"), "café");
        assert_eq!(for_java("世界"), "世界");
        assert_eq!(for_java("😀"), "\\ud83d\\ude00");
    }

    /// multi-byte characters pass through the standard SQL encoder unchanged.
    #[test]
    fn multibyte_utf8_sql() {
        assert_eq!(for_sql("café"), "café");
        assert_eq!(for_sql("世界"), "世界");
        assert_eq!(for_sql("😀"), "😀");
    }

    /// multi-byte characters pass through the backslash-escaping SQL encoder
    /// unchanged.
    #[test]
    fn multibyte_utf8_sql_backslash() {
        assert_eq!(for_sql_backslash("café"), "café");
        assert_eq!(for_sql_backslash("世界"), "世界");
        assert_eq!(for_sql_backslash("😀"), "😀");
    }

    /// multi-byte characters pass through the XML encoder unchanged.
    #[test]
    fn multibyte_utf8_xml() {
        assert_eq!(for_xml("café"), "café");
        assert_eq!(for_xml("世界"), "世界");
        assert_eq!(for_xml("😀"), "😀");
    }
}