ironpress 1.2.1

Pure Rust HTML/CSS/Markdown to PDF converter with layout engine, tables, images, custom fonts, and streaming output. No browser, no system dependencies.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
//! SVG sanitizer — strips dangerous elements and attributes before parsing.

/// Maximum number of SVG elements allowed.
///
/// [`sanitize_svg`] returns an empty `<svg></svg>` document when the sanitized
/// markup contains more opening tags than this.
pub const MAX_SVG_ELEMENTS: usize = 10_000;

/// Maximum SVG nesting depth.
///
/// NOTE(review): nothing in this file reads this constant; presumably the SVG
/// parser enforces it — confirm against its users.
pub const MAX_SVG_DEPTH: usize = 50;

/// Allowlisted SVG elements (everything else is stripped, content preserved if safe).
///
/// Entries are lowercase: tag names are ASCII-lowercased before they are looked
/// up here, which makes the check case-insensitive.
const ALLOWED_ELEMENTS: &[&str] = &[
    "svg", "g", "path", "rect", "circle", "ellipse", "line", "polyline", "polygon", "title",
    "desc", "defs",
];

/// Blocklisted elements (removed WITH their content — these are dangerous).
///
/// Entries must be lowercase (`foreignobject`, not `foreignObject`): the markup
/// is ASCII-lowercased before each name is searched for. The search looks for
/// the prefix `<name`, so an entry also matches longer tag names that start
/// with it — e.g. `a` matches every tag whose name begins with `a`.
const BLOCKED_ELEMENTS: &[&str] = &[
    "script",
    "foreignobject",
    "use",
    "image",
    "a",
    "animate",
    "set",
    "animatemotion",
    "animatetransform",
    "iframe",
    "embed",
    "object",
    "style",
    "handler",
    "listener",
];

/// Sanitize SVG markup string. Returns cleaned SVG.
///
/// One sanitizing pass performs, in order:
/// 1. removal of blocklisted elements together with their content,
/// 2. removal of `on*` event-handler, `href` and `xlink:href` attributes,
/// 3. removal of `javascript:` from the remaining markup,
/// 4. stripping of tags that are not allowlisted (their text content is kept).
///
/// Deleting text in a later step can splice together something an earlier step
/// was supposed to catch: `ojavascript:nclick="…"` passes step 2 and only
/// becomes `onclick="…"` once step 3 has run. The pass is therefore repeated
/// until it no longer shortens the markup.
///
/// Returns an empty `<svg></svg>` document when the markup does not settle
/// within a small fixed number of passes, or when the sanitized markup has more
/// than [`MAX_SVG_ELEMENTS`] opening tags.
pub fn sanitize_svg(svg: &str) -> String {
    // Ordinary input settles after one or two passes; only deliberately nested
    // payloads need more, so running out of passes is treated as hostile input.
    const MAX_PASSES: usize = 16;

    let mut result = svg.to_string();
    let mut settled = false;

    for _ in 0..MAX_PASSES {
        let len_before = result.len();

        // 1. Remove blocked elements and their content
        for tag in BLOCKED_ELEMENTS {
            result = remove_tag_with_content(&result, tag);
        }

        // 2. Remove event handler attributes and dangerous href attributes
        result = remove_dangerous_attributes(&result);

        // 3. Remove javascript: in attribute values
        result = remove_javascript_urls(&result);

        // 4. Strip unknown elements (keep content)
        result = strip_unknown_elements(&result);

        // The steps are meant to only delete text, so a pass that did not
        // shorten the markup is taken to have found nothing left to remove.
        if result.len() >= len_before {
            settled = true;
            break;
        }
    }

    // 5. Fail closed: unsettled markup or too many elements yields an empty SVG
    if !settled || count_elements(&result) > MAX_SVG_ELEMENTS {
        return String::from("<svg></svg>");
    }

    result
}

/// Remove a tag and all its content (case-insensitive).
///
/// `tag` is expected in lowercase. Everything from the first `<tag` up to and
/// including the next `</tag>` *after it* is deleted. When no closing tag
/// follows (self-closing or unclosed element), only the opening tag up to its
/// `>` is deleted. An opening tag that has neither a closing tag nor a `>`
/// after it is left in place.
///
/// The scan restarts from the beginning after every deletion, because joining
/// the text on both sides of a deleted span can form a new opening tag
/// (`<scr<script></script>ipt>` becomes `<script>`).
fn remove_tag_with_content(input: &str, tag: &str) -> String {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);

    let mut result = input.to_string();
    // ASCII lowercasing never changes byte offsets, so `lower` and `result`
    // share indices and are edited in lockstep.
    let mut lower = result.to_ascii_lowercase();

    while let Some(start) = lower.find(&open) {
        // Look for the closing tag only AFTER the opening tag. A stray closing
        // tag earlier in the input must not be paired with it: that used to
        // produce an end offset before `start` and a loop that never finished.
        let end = match lower[start..].find(&close) {
            Some(rel) => start + rel + close.len(),
            None => match lower[start..].find('>') {
                // Self-closing or unclosed — remove from start to end of tag
                Some(gt) => start + gt + 1,
                None => break,
            },
        };

        // `start` is the position of `<` and `end` is just past a `>`, so both
        // are char boundaries; every iteration removes at least one byte.
        result.replace_range(start..end, "");
        lower.replace_range(start..end, "");
    }

    result
}

/// Remove on* event handler attributes, href, and xlink:href inside tags.
///
/// Scans the markup byte by byte, tracking whether the position is between a
/// `<` and the next `>`. Inside a tag, an attribute is dropped (name and value)
/// when it starts right after whitespace and its name begins with `on` plus a
/// letter, with `href`, or with `xlink:href`; `href` is also dropped right
/// after a `:`. Matching is ASCII case-insensitive. Everything else, including
/// non-ASCII text, is copied through unchanged.
///
/// NOTE(review): a `>` inside a quoted attribute value ends tag tracking, so
/// attributes that follow it in the same tag are not inspected.
fn remove_dangerous_attributes(html: &str) -> String {
    let bytes = html.as_bytes();
    let mut result = String::with_capacity(html.len());
    // Start of the input that has not been copied into `result` yet. Copying
    // whole `&str` slices (instead of single bytes) keeps multi-byte UTF-8
    // sequences intact.
    let mut pending = 0;
    let mut i = 0;
    let mut in_tag = false;

    while i < bytes.len() {
        match bytes[i] {
            b'<' if !in_tag => in_tag = true,
            b'>' => in_tag = false,
            _ if in_tag && starts_dangerous_attribute(bytes, i) => {
                // `i` sits on an ASCII letter and `skip_attribute` stops on an
                // ASCII byte or at the end, so both are char boundaries.
                result.push_str(&html[pending..i]);
                i = skip_attribute(bytes, i);
                pending = i;
                continue;
            }
            _ => {}
        }
        i += 1;
    }

    result.push_str(&html[pending..]);
    result
}

/// Report whether a dangerous attribute name (`on*`, `href`, `xlink:href`)
/// starts at `bytes[i]`, taking the preceding byte into account.
fn starts_dangerous_attribute(bytes: &[u8], i: usize) -> bool {
    fn has_prefix(haystack: &[u8], prefix: &[u8]) -> bool {
        haystack.len() >= prefix.len() && haystack[..prefix.len()].eq_ignore_ascii_case(prefix)
    }

    let rest = &bytes[i..];
    // The start of the input counts as a separator.
    let prev = if i > 0 { bytes[i - 1] } else { b' ' };
    // XML whitespace: space, tab, line feed and carriage return.
    let after_whitespace = matches!(prev, b' ' | b'\t' | b'\n' | b'\r');

    let is_event_handler =
        has_prefix(rest, b"on") && rest.get(2).map_or(false, u8::is_ascii_alphabetic);
    let is_xlink_href = has_prefix(rest, b"xlink:href");
    let is_href = has_prefix(rest, b"href");

    (after_whitespace && (is_event_handler || is_xlink_href))
        // `href` right after `:` covers namespaced forms such as `xlink:href`.
        || ((after_whitespace || prev == b':') && is_href)
}

/// Skip past an attribute name and its value (name="value" or name='value' or name=value).
///
/// `start` is the index of the first byte of the attribute name. Whitespace is
/// allowed on both sides of `=`. Returns the index of the first byte after the
/// attribute; for an attribute without a value that is the byte right after
/// the name. A quoted value with no closing quote extends to the end of input.
fn skip_attribute(bytes: &[u8], start: usize) -> usize {
    let len = bytes.len();
    let mut j = start;

    // Skip attribute name
    while j < len && bytes[j] != b'=' && bytes[j] != b'>' && !bytes[j].is_ascii_whitespace() {
        j += 1;
    }
    let name_end = j;

    // Optional whitespace between the name and `=`
    while j < len && bytes[j].is_ascii_whitespace() {
        j += 1;
    }
    if j >= len || bytes[j] != b'=' {
        // No value: leave the whitespace after the name in place.
        return name_end;
    }
    j += 1;

    // Optional whitespace between `=` and the value
    while j < len && bytes[j].is_ascii_whitespace() {
        j += 1;
    }

    if j < len && (bytes[j] == b'"' || bytes[j] == b'\'') {
        let quote = bytes[j];
        j += 1;
        while j < len && bytes[j] != quote {
            j += 1;
        }
        if j < len {
            j += 1; // skip closing quote
        }
    } else {
        // Unquoted — skip to whitespace or >
        while j < len && bytes[j] != b'>' && !bytes[j].is_ascii_whitespace() {
            j += 1;
        }
    }
    j
}

/// Remove javascript: from attribute values.
///
/// Every occurrence of the literal text `javascript:` (ASCII case-insensitive)
/// is deleted, wherever it appears in `html`. Occurrences that only come into
/// existence because an inner one was deleted — `javajavascript:script:` — are
/// deleted as well, so the returned string never contains `javascript:`.
///
/// Only the contiguous spelling is matched; no entity decoding or whitespace
/// normalization is performed.
fn remove_javascript_urls(html: &str) -> String {
    const TARGET: &[u8] = b"javascript:";

    let mut result = String::with_capacity(html.len());
    for ch in html.chars() {
        result.push(ch);

        // `:` appears only as the last byte of the target, so a match can only
        // be completed when a `:` has just been appended.
        if ch != ':' {
            continue;
        }

        let len = result.len();
        if len >= TARGET.len()
            && result.as_bytes()[len - TARGET.len()..].eq_ignore_ascii_case(TARGET)
        {
            // The matched tail is pure ASCII, so the cut is on a char boundary.
            // Dropping it from the output (rather than from the input) lets
            // text before and after the match join up and be re-checked.
            result.truncate(len - TARGET.len());
        }
    }
    result
}

/// Strip tags that are not in the allowlist, but keep their text content.
///
/// Every `<…>` span is treated as a tag: it is copied to the output only when
/// its name, ASCII-lowercased, is an allowed element; otherwise the span is
/// dropped. Text between tags is always copied. A `<` with no `>` after it is
/// not a tag, so it and the rest of the input are copied through unchanged.
///
/// Text is copied as `&str` slices, so non-ASCII characters survive intact.
fn strip_unknown_elements(html: &str) -> String {
    let mut result = String::with_capacity(html.len());
    let mut rest = html;

    while let Some(lt) = rest.find('<') {
        // Text before the tag is kept as-is.
        result.push_str(&rest[..lt]);
        rest = &rest[lt..];

        // Find end of tag
        let gt = match rest.find('>') {
            Some(gt) => gt,
            // Unterminated `<`: the remainder is copied after the loop.
            None => break,
        };

        let tag_name = extract_tag_name(&rest[1..gt]);
        if is_allowed_element(&tag_name.to_ascii_lowercase()) {
            // Keep the whole tag
            result.push_str(&rest[..=gt]);
        }
        // else: skip the tag (content will be kept by subsequent iterations)
        rest = &rest[gt + 1..];
    }

    result.push_str(rest);
    result
}

/// Extract the tag name from the content between < and >.
///
/// Leading `/` characters (closing tags) and surrounding whitespace are
/// ignored; the name is whatever precedes the first whitespace or `/`.
fn extract_tag_name(tag_content: &str) -> &str {
    let trimmed = tag_content.trim_start_matches('/').trim();
    let is_name_end = |ch: char| ch == '/' || ch.is_whitespace();

    match trimmed.char_indices().find(|&(_, ch)| is_name_end(ch)) {
        Some((name_len, _)) => &trimmed[..name_len],
        // No delimiter: the whole remainder is the name
        None => trimmed,
    }
}

/// Check if a tag name (lowercase) is in the allowlist.
///
/// The comparison is exact, so the caller must lowercase the name first.
fn is_allowed_element(tag_lower: &str) -> bool {
    ALLOWED_ELEMENTS.iter().any(|&allowed| allowed == tag_lower)
}

/// Count the number of opening tags in the markup.
///
/// Counts every `<` that is immediately followed by a byte other than `/`.
/// A `<` that is the very last byte is not counted.
fn count_elements(html: &str) -> usize {
    html.as_bytes()
        .windows(2)
        .filter(|pair| pair[0] == b'<' && pair[1] != b'/')
        .count()
}

// Unit tests for the sanitizer. Every test drives the public entry point,
// `sanitize_svg`, and checks the returned markup.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sanitize_strips_script() {
        let input = r#"<svg><rect width="10" height="10"/><script>alert(1)</script></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("script"));
        assert!(!result.contains("alert"));
    }

    #[test]
    fn sanitize_strips_foreignobject() {
        let input = r#"<svg><foreignObject><div>evil</div></foreignObject><rect/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("foreignObject"));
        assert!(!result.contains("foreignobject"));
        assert!(!result.contains("evil"));
    }

    #[test]
    fn sanitize_strips_use_element() {
        let input = r##"<svg><use href="#evil"/></svg>"##;
        let result = sanitize_svg(input);
        assert!(!result.contains("use"));
    }

    #[test]
    fn sanitize_strips_event_handlers() {
        let input = r#"<svg><rect onclick="alert(1)" width="10" height="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
        assert!(!result.contains("alert"));
        assert!(result.contains("rect"));
    }

    #[test]
    fn sanitize_preserves_basic_shapes() {
        let input = r#"<svg><rect width="10" height="10"/><circle cx="5" cy="5" r="3"/><path d="M0 0L10 10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(result.contains("rect"));
        assert!(result.contains("circle"));
        assert!(result.contains("path"));
    }

    #[test]
    fn sanitize_strips_href() {
        let input = r#"<svg><rect href="http://evil.com" width="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("href"));
        assert!(!result.contains("evil.com"));
    }

    #[test]
    fn sanitize_exceeds_max_elements_returns_empty_svg() {
        // Build an SVG with more than MAX_SVG_ELEMENTS opening tags
        let mut input = String::from("<svg>");
        for _ in 0..MAX_SVG_ELEMENTS + 1 {
            input.push_str("<rect/>");
        }
        input.push_str("</svg>");
        let result = sanitize_svg(&input);
        assert_eq!(result, "<svg></svg>");
    }

    #[test]
    fn sanitize_nested_blocked_elements() {
        let input = "<svg><script><script>inner</script></script><rect/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("script"));
        assert!(!result.contains("inner"));
        assert!(result.contains("rect"));
    }

    #[test]
    fn sanitize_unclosed_blocked_element_no_gt() {
        // A blocked tag with no closing tag and no '>' — exercises the `break` in
        // `remove_tag_with_content`. The tag persists because there's no '>' to close it.
        let input = "<svg><rect/><script";
        let result = sanitize_svg(input);
        // The unclosed fragment remains since there is no '>' to find
        assert!(result.contains("<script"));
    }

    #[test]
    fn sanitize_self_closing_blocked_element() {
        // A blocked tag with an opening but no closing tag, terminated by '>' —
        // only the opening tag itself has to be removed.
        let input = "<svg><image src='evil.png'/><rect/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("image"));
        assert!(!result.contains("evil.png"));
        assert!(result.contains("rect"));
    }

    #[test]
    fn sanitize_strips_xlink_href() {
        let input = r#"<svg><rect xlink:href="http://evil.com" width="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("xlink:href"));
        assert!(!result.contains("evil.com"));
    }

    #[test]
    fn sanitize_strips_xlink_href_mixed_case() {
        let input = r#"<svg><rect Xlink:Href="http://evil.com" width="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("Xlink:Href"));
        assert!(!result.contains("evil.com"));
    }

    #[test]
    fn sanitize_event_handler_mixed_case() {
        let input = r#"<svg><rect OnClick="alert(1)" width="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("OnClick"));
        assert!(!result.contains("alert"));
        assert!(result.contains("rect"));
    }

    #[test]
    fn sanitize_event_handler_with_tab_separator() {
        let input = "<svg><rect\tonclick=\"alert(1)\" width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
    }

    #[test]
    fn sanitize_event_handler_with_newline_separator() {
        let input = "<svg><rect\nonclick=\"alert(1)\" width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
    }

    #[test]
    fn sanitize_removes_javascript_urls() {
        let input = r#"<svg><rect fill="javascript:void(0)"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("javascript:"));
        // The "void(0)" part remains since only the "javascript:" prefix is stripped
    }

    #[test]
    fn sanitize_removes_javascript_urls_mixed_case() {
        let input = r#"<svg><rect fill="JavaScript:void(0)"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("JavaScript:"));
        assert!(!result.contains("javascript:"));
    }

    #[test]
    fn sanitize_removes_multiple_javascript_urls() {
        let input = r#"<svg><rect fill="javascript:x" stroke="javascript:y"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("javascript:"));
    }

    #[test]
    fn sanitize_strips_unknown_elements_keeps_text() {
        let input = "<svg><div>hello</div><rect/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("<div"));
        assert!(!result.contains("</div>"));
        assert!(result.contains("hello"));
        assert!(result.contains("rect"));
    }

    #[test]
    fn sanitize_empty_svg() {
        let result = sanitize_svg("");
        assert_eq!(result, "");
    }

    #[test]
    fn sanitize_text_only_content() {
        let result = sanitize_svg("just plain text");
        assert_eq!(result, "just plain text");
    }

    #[test]
    fn sanitize_unclosed_tag_in_strip_unknown() {
        // An unclosed '<' with no '>' is not treated as a tag by strip_unknown_elements
        let input = "<svg><rect/></svg><broken";
        let result = sanitize_svg(input);
        // The '<' is preserved as-is since there's no closing '>'
        assert!(result.contains("<broken"));
    }

    #[test]
    fn sanitize_unquoted_attribute_value() {
        // An event handler with an unquoted value exercises the unquoted branch in skip_attribute
        let input = "<svg><rect onclick=alert width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
        assert!(!result.contains("alert"));
    }

    #[test]
    fn sanitize_attribute_with_whitespace_after_equals() {
        // Whitespace between = and the quoted value exercises that branch in skip_attribute
        let input = "<svg><rect onclick= \"alert(1)\" width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
        assert!(!result.contains("alert"));
    }

    #[test]
    fn sanitize_attribute_with_single_quotes() {
        let input = "<svg><rect onclick='alert(1)' width='10'/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("onclick"));
        assert!(!result.contains("alert"));
    }

    #[test]
    fn sanitize_at_element_limit_passes() {
        // Exactly MAX_SVG_ELEMENTS should be allowed
        // (one <svg> plus MAX_SVG_ELEMENTS - 1 <rect/> opening tags)
        let mut input = String::from("<svg>");
        for _ in 0..MAX_SVG_ELEMENTS - 1 {
            input.push_str("<rect/>");
        }
        input.push_str("</svg>");
        let result = sanitize_svg(&input);
        assert!(result.contains("rect"));
        assert_ne!(result, "<svg></svg>");
    }

    #[test]
    fn sanitize_xlink_href_with_tab_prefix() {
        let input = "<svg><rect\txlink:href=\"http://evil.com\" width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("xlink:href"));
    }

    #[test]
    fn sanitize_xlink_href_with_newline_prefix() {
        let input = "<svg><rect\nxlink:href=\"http://evil.com\" width=\"10\"/></svg>";
        let result = sanitize_svg(input);
        assert!(!result.contains("xlink:href"));
    }

    #[test]
    fn sanitize_href_after_colon() {
        // href preceded by ':' (like in xlink:href) should also be stripped
        let input = r#"<svg><rect xlink:href="http://evil.com" width="10"/></svg>"#;
        let result = sanitize_svg(input);
        assert!(!result.contains("href"));
    }
}