Skip to main content

contextual_encoder/
lib.rs

1#![forbid(unsafe_code)]
2
3//! contextual output encoding for XSS defense and safe literal embedding.
4//!
5//! this crate provides context-aware encoding functions inspired by the
6//! [OWASP Java Encoder](https://owasp.org/owasp-java-encoder/). each function
//! encodes input for safe embedding in a specific output context — web contexts
//! (HTML, XML, JavaScript, CSS, URI) and source literal contexts (JSON, Java, Go,
//! Rust, Ruby, Python, SQL).
10//!
11//! **disclaimer:** contextual-encoder is an independent Rust crate. its API and security model
12//! are inspired by the OWASP Java Encoder, but this project is not affiliated with,
13//! endorsed by, or maintained by the OWASP Foundation.
14//!
15//! # quick start
16//!
17//! ```
18//! use contextual_encoder::{for_html, for_javascript, for_css_string, for_uri_component};
19//!
20//! let user_input = "<script>alert('xss')</script>";
21//!
22//! // safe for HTML text content and quoted attributes
23//! let html_safe = for_html(user_input);
24//! assert!(html_safe.contains("&lt;script&gt;"));
25//!
26//! // safe for javascript string literals (universal)
27//! let js_safe = for_javascript(user_input);
28//! assert!(js_safe.contains(r"<\/script>"));
29//!
30//! // safe for quoted CSS string values
31//! let css_safe = for_css_string(user_input);
32//! assert!(css_safe.contains(r"\3c"));
33//!
34//! // safe as a URI query parameter value
35//! let uri_safe = for_uri_component(user_input);
36//! assert!(uri_safe.contains("%3C"));
37//! ```
38//!
39//! # available contexts
40//!
41//! ## HTML
42//!
43//! | function | safe for |
44//! |----------|----------|
45//! | [`for_html`] | text content + quoted attributes |
46//! | [`for_html_content`] | text content only |
47//! | [`for_html_attribute`] | quoted attributes only |
48//! | [`for_html_unquoted_attribute`] | unquoted attribute values |
49//!
50//! ## XML
51//!
52//! | function | safe for |
53//! |----------|----------|
54//! | [`for_xml`] | XML text content + quoted attributes (alias for `for_html`) |
55//! | [`for_xml_content`] | XML text content only (alias for `for_html_content`) |
56//! | [`for_xml_attribute`] | quoted XML attributes only (alias for `for_html_attribute`) |
57//! | [`for_xml_comment`] | XML comment content |
58//! | [`for_cdata`] | CDATA section content |
59//!
60//! ## XML 1.1
61//!
62//! | function | safe for |
63//! |----------|----------|
64//! | [`for_xml11`] | XML 1.1 content + quoted attributes |
65//! | [`for_xml11_content`] | XML 1.1 content only |
66//! | [`for_xml11_attribute`] | XML 1.1 quoted attributes only |
67//!
68//! ## JavaScript
69//!
70//! | function | safe for |
71//! |----------|----------|
72//! | [`for_javascript`] | general JS string contexts |
73//! | [`for_javascript_attribute`] | HTML event attributes |
74//! | [`for_javascript_block`] | `<script>` blocks |
75//! | [`for_javascript_source`] | standalone .js files |
76//! | [`for_js_template`] | ES6 template literal content (`` `...` ``) |
77//!
78//! ## CSS
79//!
80//! | function | safe for |
81//! |----------|----------|
82//! | [`for_css_string`] | quoted CSS string values |
83//! | [`for_css_url`] | CSS `url()` values |
84//!
85//! ## URI
86//!
87//! | function | safe for |
88//! |----------|----------|
89//! | [`for_uri_component`] | URI components (query params, path segments) |
90//!
91//! ## additional literal contexts
92//!
93//! these encoders are not part of the OWASP Java Encoder's scope. they encode
94//! untrusted strings for safe embedding in source code literals.
95//!
96//! | function | safe for |
97//! |----------|----------|
98//! | [`for_json`] | JSON string values |
99//! | [`for_java`] | Java string / char literals |
100//! | [`for_go_string`] | Go interpreted string literals (`"..."`) |
101//! | [`for_go_char`] | Go rune literals (`'...'`) |
102//! | [`for_go_byte_string`] | Go byte-explicit string literals (`[]byte("...")`) |
103//! | [`for_rust_string`] | Rust string literals (`"..."`) |
104//! | [`for_rust_char`] | Rust char literals (`'...'`) |
105//! | [`for_rust_byte_string`] | Rust byte string literals (`b"..."`) |
106//! | [`for_ruby_string`] | Ruby double-quoted string literals (`"..."`) |
107//! | [`for_python_string`] | Python string literals (`"..."` or `'...'`) |
108//! | [`for_python_bytes`] | Python bytes literals (`b"..."` or `b'...'`) |
109//! | [`for_python_raw_string`] | Python raw string literals (`r"..."` or `r'...'`) |
110//! | [`for_sql`] | Standard SQL string literals (`'...'`) |
111//! | [`for_sql_backslash`] | MySQL/MariaDB string literals with backslash escaping (`'...'`) |
112//!
113//! # security model
114//!
115//! this is a **contextual output encoder**, not a sanitizer. it prevents
116//! cross-site scripting by encoding output for specific contexts, but it
117//! does not validate or sanitize input.
118//!
119//! **important caveats:**
120//!
121//! - **encoding is not sanitization.** encoding `<script>` as `&lt;script&gt;`
122//!   makes it display safely in HTML, but does not remove it. if you need to
123//!   allow a subset of HTML, use a dedicated sanitizer.
124//! - **context matters.** using the wrong encoder for a context can leave
125//!   you vulnerable. `for_html_content` output is not safe in attributes.
126//! - **tag and attribute names cannot be encoded.** never pass untrusted data
127//!   as a tag name, attribute name, or event handler name. validate these
128//!   against a whitelist.
129//! - **full URLs must be validated separately.** `for_uri_component` encodes
130//!   a component, not a full URL. to embed an untrusted URL, validate its
131//!   scheme and structure first, then encode for the final sink.
132//! - **template literals.** the string literal JavaScript encoders do not
133//!   encode backticks. use [`for_js_template`] to embed data directly in
134//!   ES2015+ template literals.
135//! - **grave accent.** unpatched Internet Explorer treats `` ` `` as an
136//!   attribute delimiter. `for_html_unquoted_attribute` encodes it, but
137//!   numeric entities decode back to the original character, so this is
138//!   not a complete fix. avoid unquoted attributes.
139//! - **HTML comments.** no HTML comment encoder is provided because HTML
140//!   comments have vendor-specific extensions (e.g., conditional comments)
141//!   that make safe encoding impractical. [`for_xml_comment`] is for XML
142//!   comments only.
143//!
144//! # writer-based API
145//!
146//! every `for_*` function has a corresponding `write_*` function that writes
147//! to any `std::fmt::Write` implementor, avoiding allocation when writing to
148//! an existing buffer:
149//!
150//! ```
151//! use contextual_encoder::write_html;
152//!
153//! let mut buf = String::new();
154//! write_html(&mut buf, "safe & sound").unwrap();
155//! assert_eq!(buf, "safe &amp; sound");
156//! ```
157
158pub mod css;
159pub mod go;
160pub mod html;
161pub mod java;
162pub mod javascript;
163pub mod json;
164pub mod python;
165pub mod ruby;
166pub mod rust;
167pub mod sql;
168pub mod uri;
169pub mod xml;
170
171mod engine;
172
173// convenience re-exports — users can `use contextual_encoder::for_html` directly
174pub use css::{for_css_string, for_css_url, write_css_string, write_css_url};
175pub use go::{
176    for_go_byte_string, for_go_char, for_go_string, write_go_byte_string, write_go_char,
177    write_go_string,
178};
179pub use html::{
180    for_html, for_html_attribute, for_html_content, for_html_unquoted_attribute, write_html,
181    write_html_attribute, write_html_content, write_html_unquoted_attribute,
182};
183pub use java::{for_java, write_java};
184pub use javascript::{
185    for_javascript, for_javascript_attribute, for_javascript_block, for_javascript_source,
186    for_js_template, write_javascript, write_javascript_attribute, write_javascript_block,
187    write_javascript_source, write_js_template,
188};
189pub use json::{for_json, write_json};
190pub use python::{
191    for_python_bytes, for_python_raw_string, for_python_string, write_python_bytes,
192    write_python_raw_string, write_python_string,
193};
194pub use ruby::{for_ruby_string, write_ruby_string};
195pub use rust::{
196    for_rust_byte_string, for_rust_char, for_rust_string, write_rust_byte_string, write_rust_char,
197    write_rust_string,
198};
199pub use sql::{for_sql, for_sql_backslash, write_sql, write_sql_backslash};
200pub use uri::{for_uri_component, write_uri_component};
201pub use xml::{
202    for_cdata, for_xml, for_xml11, for_xml11_attribute, for_xml11_content, for_xml_attribute,
203    for_xml_comment, for_xml_content, write_cdata, write_xml, write_xml11, write_xml11_attribute,
204    write_xml11_content, write_xml_attribute, write_xml_comment, write_xml_content,
205};
206
#[cfg(test)]
mod tests {
    //! crate-level smoke tests run through the root re-exports: empty input
    //! and multi-byte UTF-8 input for each encoder family.

    use super::*;

    /// asserts that each listed encoder returns `""` when given `""`.
    macro_rules! assert_empty_stays_empty {
        ($($encoder:ident),+ $(,)?) => {
            $(
                assert_eq!($encoder(""), "", "{}(\"\")", stringify!($encoder));
            )+
        };
    }

    /// asserts that each listed writer leaves a `String` buffer empty when
    /// given `""`. one buffer is shared and cleared before every writer.
    macro_rules! assert_writes_nothing {
        ($($writer:ident),+ $(,)?) => {{
            let mut sink = String::new();
            $(
                sink.clear();
                $writer(&mut sink, "").unwrap();
                assert_eq!(sink, "", "{}(\"\")", stringify!($writer));
            )+
        }};
    }

    /// asserts `encoder(input) == expected` for every `input => expected` pair.
    macro_rules! assert_encodes {
        ($encoder:ident: $($input:expr => $expected:expr),+ $(,)?) => {
            $(
                assert_eq!(
                    $encoder($input),
                    $expected,
                    "{}({:?})",
                    stringify!($encoder),
                    $input
                );
            )+
        };
    }

    /// asserts that the encoder returns every listed input unchanged.
    macro_rules! assert_unchanged {
        ($encoder:ident: $($text:expr),+ $(,)?) => {
            $(
                assert_eq!($encoder($text), $text, "{}({:?})", stringify!($encoder), $text);
            )+
        };
    }

    #[test]
    fn empty_string_returns_empty() {
        assert_empty_stays_empty!(
            for_html,
            for_html_content,
            for_html_attribute,
            for_html_unquoted_attribute,
            for_javascript,
            for_javascript_attribute,
            for_javascript_block,
            for_javascript_source,
            for_css_string,
            for_css_url,
            for_uri_component,
            for_xml,
            for_xml_content,
            for_xml_attribute,
            for_xml_comment,
            for_cdata,
            for_xml11,
            for_xml11_content,
            for_xml11_attribute,
            for_java,
            for_json,
            for_go_string,
            for_go_char,
            for_go_byte_string,
            for_rust_string,
            for_rust_char,
            for_rust_byte_string,
            for_ruby_string,
            for_python_string,
            for_python_bytes,
            for_python_raw_string,
            for_js_template,
            for_sql,
            for_sql_backslash,
        );
    }

    #[test]
    fn empty_string_writer_variants() {
        assert_writes_nothing!(
            write_html,
            write_javascript,
            write_css_string,
            write_uri_component,
        );
    }

    // sample characters used below, grouped by UTF-8 encoded width:
    //   two bytes:   é (U+00E9)
    //   three bytes: 世 (U+4E16), 界 (U+754C)
    //   four bytes:  😀 (U+1F600)

    #[test]
    fn multibyte_utf8_html() {
        assert_unchanged!(for_html: "café", "世界", "😀");
        // only the markup-significant ASCII characters are replaced
        assert_encodes!(for_html: "é<世>&😀" => "é&lt;世&gt;&amp;😀");
    }

    #[test]
    fn multibyte_utf8_javascript() {
        assert_unchanged!(for_javascript: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_css_string() {
        assert_unchanged!(for_css_string: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_uri_component() {
        // each UTF-8 byte of a non-ASCII character is percent-encoded
        assert_encodes!(for_uri_component:
            "é" => "%C3%A9",
            "世" => "%E4%B8%96",
            "😀" => "%F0%9F%98%80",
            "café" => "caf%C3%A9",
        );
    }

    #[test]
    fn multibyte_utf8_go_string_passthrough() {
        assert_unchanged!(for_go_string: "caf\u{00e9}", "\u{4e16}\u{754c}", "\u{1F600}");
    }

    #[test]
    fn multibyte_utf8_go_byte_string() {
        // byte-explicit output: one `\x` escape per UTF-8 byte
        assert_encodes!(for_go_byte_string:
            "\u{00e9}" => r"\xc3\xa9",
            "\u{4e16}" => r"\xe4\xb8\x96",
            "\u{1F600}" => r"\xf0\x9f\x98\x80",
        );
    }

    #[test]
    fn multibyte_utf8_rust_byte_string() {
        // byte-explicit output: one `\x` escape per UTF-8 byte
        assert_encodes!(for_rust_byte_string:
            "é" => r"\xc3\xa9",
            "世" => r"\xe4\xb8\x96",
            "😀" => r"\xf0\x9f\x98\x80",
        );
    }

    #[test]
    fn multibyte_utf8_rust_string_passthrough() {
        assert_unchanged!(for_rust_string: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_ruby_string_passthrough() {
        assert_unchanged!(for_ruby_string: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_python_string_passthrough() {
        assert_unchanged!(for_python_string: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_python_bytes() {
        // byte-explicit output: one `\x` escape per UTF-8 byte
        assert_encodes!(for_python_bytes:
            "\u{00e9}" => r"\xc3\xa9",
            "\u{4e16}" => r"\xe4\xb8\x96",
            "\u{1F600}" => r"\xf0\x9f\x98\x80",
        );
    }

    #[test]
    fn multibyte_utf8_python_raw_string_passthrough() {
        assert_unchanged!(for_python_raw_string: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_json() {
        assert_unchanged!(for_json: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_java() {
        assert_unchanged!(for_java: "café", "世界");
        // the four-byte character comes back as a pair of `\u` escapes
        assert_encodes!(for_java: "😀" => "\\ud83d\\ude00");
    }

    #[test]
    fn multibyte_utf8_sql() {
        assert_unchanged!(for_sql: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_sql_backslash() {
        assert_unchanged!(for_sql_backslash: "café", "世界", "😀");
    }

    #[test]
    fn multibyte_utf8_xml() {
        assert_unchanged!(for_xml: "café", "世界", "😀");
    }
}