1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
use percent_encoding::percent_decode_str;
/// Parse filename from Content-Disposition header
/// Prioritizes filename* parameter if present, otherwise uses filename parameter
pub fn parse_filename_from_content_disposition(content_disposition: &str) -> Option<String> {
let parts: Vec<&str> = content_disposition
.split(';')
.map(|part| part.trim())
.collect();
// First try to find filename* parameter
for part in parts.iter() {
if let Some(value) = part.strip_prefix("filename*=") {
if let Some(filename) = parse_encoded_filename(value) {
return Some(filename);
}
}
}
// If filename* is not found or parsing failed, try regular filename parameter
for part in parts {
if let Some(value) = part.strip_prefix("filename=") {
return parse_regular_filename(value);
}
}
None
}
/// Interprets the value of a plain `filename` parameter.
///
/// Accepts both the quoted form (`"name.txt"`) and the bare token form (`name.txt`).
/// One pair of surrounding double quotes is removed when both are present; any other
/// input is taken verbatim. Returns `None` if nothing is left after unquoting.
fn parse_regular_filename(filename: &str) -> Option<String> {
    // Backslash escapes inside the quoted string (for example `\"`) are deliberately
    // left as-is: they are rare in real-world headers, and filenames with special
    // characters are expected to be sent through the `filename*` parameter, e.g.
    //
    //   Content-Disposition: attachment; filename*=UTF-8''file%20with%20quotes.txt
    let unquoted = filename
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(filename);

    if unquoted.is_empty() {
        None
    } else {
        Some(unquoted.to_owned())
    }
}
/// Decodes the value of an RFC 5987 extended parameter (`filename*`).
///
/// `content` is the text that follows `filename*=` and is expected to have the shape
/// `charset'language'percent-encoded-value`. The language field is ignored.
///
/// Returns `None` when the value does not contain all three apostrophe-separated
/// fields, or when the percent-decoded bytes cannot be decoded with the charset
/// that is applied to them.
fn parse_encoded_filename(content: &str) -> Option<String> {
    // Only the first two apostrophes act as separators; later ones stay in the value.
    let mut fields = content.splitn(3, '\'');
    let charset = fields.next()?;
    let _language = fields.next()?;
    let encoded_value = fields.next()?;

    // Undo the percent-encoding first; the result is raw bytes in `charset`.
    let raw_bytes: Vec<u8> = percent_decode_str(encoded_value).collect();

    if charset.eq_ignore_ascii_case("ISO-8859-1") {
        // RFC 5987 specifies ISO/IEC 8859-1:1998 here, but Firefox and Chromium
        // decode %99 as ™, which means they really apply Windows-1252. That mix-up
        // is common on the web. The two only differ in 0x80-0x9F: control characters
        // in ISO 8859-1, mostly printable characters in Windows-1252. Filenames
        // should not contain control characters, so Windows-1252 is used.
        return encoding_rs::WINDOWS_1252
            .decode_without_bom_handling_and_without_replacement(&raw_bytes)
            .map(|text| text.into_owned());
    }

    // Either the charset is UTF-8, or it is unknown and UTF-8 is tried as a fallback.
    // Firefox does the same for unknown charsets; Chromium instead makes up its own
    // filename (even if `filename=` is present).
    String::from_utf8(raw_bytes).ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that parsing `header` yields exactly `expected`.
    fn assert_parsed(header: &str, expected: Option<&str>) {
        assert_eq!(
            parse_filename_from_content_disposition(header),
            expected.map(String::from)
        );
    }

    #[test]
    fn test_simple_filename() {
        assert_parsed(r#"attachment; filename="example.pdf""#, Some("example.pdf"));
    }

    #[test]
    fn test_filename_without_quotes() {
        assert_parsed("attachment; filename=example.pdf", Some("example.pdf"));
    }

    #[test]
    fn test_encoded_filename() {
        // Percent-encoded UTF-8 bytes of the Chinese filename "测试.pdf".
        assert_parsed(
            "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95.pdf",
            Some("测试.pdf"),
        );
    }

    #[test]
    fn test_both_filenames() {
        // With both parameters present, `filename*` takes precedence over `filename`.
        assert_parsed(
            r#"attachment; filename="fallback.pdf"; filename*=UTF-8''%E6%B5%8B%E8%AF%95.pdf"#,
            Some("测试.pdf"),
        );
    }

    #[test]
    fn test_decode_with_windows_1252() {
        // Byte 0x99 under an `iso-8859-1` label is expected to decode as ™.
        assert_parsed(
            "content-disposition: attachment; filename*=iso-8859-1'en'a%99b",
            Some("a™b"),
        );
    }

    #[test]
    fn test_both_filenames_with_bad_format() {
        // A malformed `filename*` value must not hide a usable plain `filename`.
        assert_parsed(
            r#"attachment; filename="fallback.pdf"; filename*=UTF-8'bad_format.pdf"#,
            Some("fallback.pdf"),
        );
    }

    #[test]
    fn test_no_filename() {
        assert_parsed("attachment", None);
    }

    #[test]
    fn test_iso_8859_1() {
        assert_parsed(
            "attachment;filename*=iso-8859-1'en'%A3%20rates",
            Some("£ rates"),
        );
    }

    #[test]
    fn test_bad_encoding_fallback_to_utf8() {
        // An unrecognised charset label falls back to UTF-8 decoding.
        assert_parsed(
            "attachment;filename*=UTF-16''%E6%B5%8B%E8%AF%95.pdf",
            Some("测试.pdf"),
        );
    }
}