1use crate::rest::ContentType;
25
/// Upper bound, in bytes, on inputs that get inspected (64 KiB, i.e. 2^16).
/// `detect` reports anything longer as plain text without looking at it.
const MAX_CLASSIFY_BYTES: usize = 1 << 16;
/// Symbol-to-alphanumeric ratio that, once exceeded and combined with at least
/// one code bigram, makes `detect_str` classify the input as code.
const SYMBOL_RATIO_THRESHOLD: f32 = 0.2;
28
/// URL schemes accepted by `detect_str` when deciding whether a
/// whitespace-free input is a URL. Any other scheme is not reported as a URL.
const ALLOWED_URL_SCHEMES: &[&str] = &[
    // Web and file transfer.
    "http", "https", "ftp", "ftps",
    // Remote shell and secure copy.
    "ssh", "sftp",
    // Mail, local files, and WebSockets.
    "mailto", "file", "ws", "wss",
];
32
/// Prefixes that mark a line as source code when the line, after its leading
/// whitespace is removed, starts with one of them.
///
/// Most keyword entries end in a space or `(` so that a word merely beginning
/// with the same letters (e.g. "letter" for `let `) does not match.
const CODE_LINE_OPENERS: &[&str] = &[
    // Function definitions.
    "fn ", "def ", "function ", "function(",
    // Type-level declarations.
    "class ", "interface ", "trait ", "impl ", "struct ", "enum ", "type ",
    // Imports, exports, and module structure.
    "import ", "from ", "export ", "module ", "package ", "use ", "namespace ",
    // Bindings and modifiers.
    "const ", "let ", "var ", "pub ", "static ",
    // Statement keywords.
    "async ", "await ", "return ", "yield ", "throw ",
    // Control flow in the `keyword (` style.
    "if (", "for (", "while (", "switch (", "catch (",
    // Preprocessor directives and shebang lines.
    "#include", "#define", "#!/",
];
74
/// Short operator sequences whose presence hints at source code.
///
/// `scan` counts how many distinct entries occur in the input. Every entry is
/// two characters long except the final `...`.
const CODE_BIGRAMS: &[&str] = &[
    // Arrows and path separators.
    "=>", "->", "::",
    // Comparison and logical operators.
    "!=", "==", "&&", "||",
    // Markup tag closers.
    "</", "/>",
    // Comment delimiters.
    "//", "/*", "*/",
    // Increment, decrement, ordering, and shifts.
    "++", "--", ">=", "<=", ">>", "<<",
    // Spread / ellipsis.
    "...",
];
80
81pub fn detect(content: &[u8]) -> ContentType {
89 if content.len() > MAX_CLASSIFY_BYTES {
92 return ContentType::Text;
93 }
94 let s = match std::str::from_utf8(content) {
97 Ok(s) => s,
98 Err(_) => return ContentType::Text,
99 };
100 detect_str(s)
101}
102
103fn detect_str(content: &str) -> ContentType {
104 let s = content.trim();
105 if s.is_empty() {
106 return ContentType::Text;
107 }
108
109 if s.starts_with("#!/") {
110 return ContentType::Code;
111 }
112
113 if !s.chars().any(char::is_whitespace) {
114 if let Ok(url) = url::Url::parse(s) {
115 if ALLOWED_URL_SCHEMES.contains(&url.scheme()) {
116 return ContentType::Url;
117 }
118 }
119 }
120
121 let bytes = s.as_bytes();
122 let first = bytes[0];
123 let last = bytes[bytes.len() - 1];
124 if ((first == b'{' && last == b'}') || (first == b'[' && last == b']'))
125 && serde_json::from_str::<serde_json::Value>(s).is_ok()
126 {
127 return ContentType::Code;
128 }
129
130 for line in s.lines() {
131 let trimmed = line.trim_start();
132 if CODE_LINE_OPENERS.iter().any(|kw| trimmed.starts_with(kw)) {
133 return ContentType::Code;
134 }
135 }
136
137 let scan = scan(s);
138 if scan.symbol_ratio > SYMBOL_RATIO_THRESHOLD && scan.bigram_count >= 1 {
139 return ContentType::Code;
140 }
141 if scan.bigram_count >= 2 {
142 return ContentType::Code;
143 }
144 if scan.indented_lines >= 1 && scan.bigram_count >= 1 {
145 return ContentType::Code;
146 }
147
148 ContentType::Text
149}
150
/// Lexical statistics produced by `scan` and consumed by `detect_str`.
///
/// Derives `Debug`, `Clone`, `Copy`, and `PartialEq` so values can be printed
/// and compared in diagnostics and tests.
#[derive(Debug, Clone, Copy, PartialEq)]
struct ScanResult {
    /// Code symbols divided by alphanumerics; `0.0` when there are no
    /// alphanumerics.
    symbol_ratio: f32,
    /// Number of distinct `CODE_BIGRAMS` entries found in the input.
    bigram_count: usize,
    /// Number of lines that begin with a tab or with two spaces.
    indented_lines: usize,
}
156
157fn scan(s: &str) -> ScanResult {
158 let bytes = s.as_bytes();
159 let mut symbol_count: usize = 0;
160 let mut alnum_count: usize = 0;
161 let mut indented_lines: usize = 0;
162
163 if is_indent_at(bytes, 0) {
164 indented_lines += 1;
165 }
166 for (i, &b) in bytes.iter().enumerate() {
167 if is_code_symbol(b) {
168 symbol_count += 1;
169 } else if b.is_ascii_alphanumeric() {
170 alnum_count += 1;
171 } else if b == b'\n' && is_indent_at(bytes, i + 1) {
172 indented_lines += 1;
173 }
174 }
175
176 let bigram_count = CODE_BIGRAMS.iter().filter(|p| s.contains(*p)).count();
177 let symbol_ratio = if alnum_count == 0 {
178 0.0
179 } else {
180 symbol_count as f32 / alnum_count as f32
181 };
182
183 ScanResult {
184 symbol_ratio,
185 bigram_count,
186 indented_lines,
187 }
188}
189
/// Reports whether `b` is one of the ASCII punctuation bytes counted as a
/// "code symbol" by `scan`.
///
/// Characters that are common in prose — `-`, `.`, `,`, `!`, `?`, quotes — are
/// not in the set.
const fn is_code_symbol(b: u8) -> bool {
    matches!(
        b,
        // Brackets, braces, and parentheses.
        b'(' | b')' | b'[' | b']' | b'{' | b'}'
        // Statement and type punctuation.
        | b';' | b':' | b'='
        // Comparison, arithmetic, and bitwise operators.
        | b'<' | b'>' | b'+' | b'*' | b'&' | b'|'
        // Slashes.
        | b'/' | b'\\'
    )
}
211
/// Reports whether the bytes beginning at index `i` start with indentation:
/// either a single tab or two consecutive spaces.
///
/// An index at or beyond the end of `bytes` yields `false`.
fn is_indent_at(bytes: &[u8], i: usize) -> bool {
    // Out-of-range indices behave like an empty remainder.
    let rest = bytes.get(i..).unwrap_or(&[]);
    rest.starts_with(b"\t") || rest.starts_with(b"  ")
}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the byte-oriented `super::detect` on a string slice.
    fn detect(s: &str) -> ContentType {
        super::detect(s.as_bytes())
    }

    /// Asserts that every input in `inputs` is classified as `expected`.
    fn assert_all(expected: &ContentType, inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(&detect(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn empty_is_text() {
        assert_all(&ContentType::Text, &["", " \n\t "]);
    }

    #[test]
    fn short_prose_is_text() {
        assert_all(
            &ContentType::Text,
            &["Hello world", "This is a normal sentence with a period."],
        );
    }

    #[test]
    fn korean_prose_is_text() {
        assert_all(&ContentType::Text, &["안녕하세요. 오늘 회의는 3시입니다."]);
    }

    #[test]
    fn long_prose_is_text() {
        let paragraph = concat!(
            "The quick brown fox jumps over the lazy dog. ",
            "This is a longer paragraph designed to test that prose, ",
            "even with occasional punctuation like commas, periods, and ",
            "apostrophes, does not cross the code threshold.",
        );
        assert_all(&ContentType::Text, &[paragraph]);
    }

    #[test]
    fn https_url() {
        assert_all(
            &ContentType::Url,
            &[
                "https://example.com",
                "https://example.com/path?q=1&r=2#frag",
                // Surrounding whitespace is trimmed before the URL check.
                " https://example.com ",
            ],
        );
    }

    #[test]
    fn other_schemes_url() {
        assert_all(
            &ContentType::Url,
            &[
                "http://localhost:8080",
                "mailto:foo@bar.com",
                "ssh://user@host.com",
                "file:///tmp/x",
                "wss://relay.example.com/v1/stream",
            ],
        );
    }

    #[test]
    fn url_with_whitespace_is_not_url() {
        assert_all(
            &ContentType::Text,
            &["check out https://example.com today"],
        );
    }

    #[test]
    fn bare_hostname_is_not_url() {
        assert_all(&ContentType::Text, &["example.com", "foo.bar.baz"]);
    }

    #[test]
    fn windows_path_is_not_url() {
        assert_all(&ContentType::Text, &["c:\\users\\me\\file.txt"]);
    }

    #[test]
    fn shebang_is_code() {
        assert_all(
            &ContentType::Code,
            &["#!/usr/bin/env bash\necho hello", "#!/bin/sh"],
        );
    }

    #[test]
    fn json_object_is_code() {
        assert_all(
            &ContentType::Code,
            &[
                r#"{"key": "value"}"#,
                r#"{"nested": {"deep": [1, 2, 3]}, "ok": true}"#,
            ],
        );
    }

    #[test]
    fn json_array_is_code() {
        assert_all(&ContentType::Code, &["[1, 2, 3]"]);
    }

    #[test]
    fn json_shaped_but_invalid_is_text() {
        assert_all(&ContentType::Text, &["{not really json}"]);
    }

    #[test]
    fn rust_snippet_is_code() {
        assert_all(
            &ContentType::Code,
            &["fn main() {\n let x = 42;\n println!(\"{}\", x);\n}"],
        );
    }

    #[test]
    fn python_snippet_is_code() {
        assert_all(
            &ContentType::Code,
            &["def greet(name):\n return f\"hello, {name}\"\n\nprint(greet(\"world\"))"],
        );
    }

    #[test]
    fn typescript_snippet_is_code() {
        assert_all(
            &ContentType::Code,
            &["const add = (a: number, b: number) => a + b;\nexport { add };"],
        );
    }

    #[test]
    fn javascript_one_liner_is_code() {
        assert_all(&ContentType::Code, &["const foo = bar.map(x => x * 2);"]);
    }

    #[test]
    fn import_statement_is_code() {
        assert_all(&ContentType::Code, &["import { useState } from 'react';"]);
    }

    #[test]
    fn html_snippet_is_code() {
        assert_all(
            &ContentType::Code,
            &["<div class=\"foo\">\n <span>hi</span>\n</div>"],
        );
    }

    #[test]
    fn c_include_is_code() {
        assert_all(&ContentType::Code, &["#include <stdio.h>"]);
    }

    #[test]
    fn huge_input_skips_classification() {
        // One byte over the limit.
        let huge: String = std::iter::repeat('a')
            .take(MAX_CLASSIFY_BYTES + 1)
            .collect();
        assert_all(&ContentType::Text, &[huge.as_str()]);
    }

    #[test]
    fn prose_with_arrow_is_text() {
        assert_all(
            &ContentType::Text,
            &["Move the cursor -> click submit -> wait"],
        );
    }

    #[test]
    fn prose_with_let_is_text() {
        assert_all(
            &ContentType::Text,
            &["Why don't you let me know when you're free."],
        );
    }

    #[test]
    fn non_utf8_bytes_is_text() {
        let invalid: [&[u8]; 2] = [&[0xC3, 0x28], &[0xFF, 0xFE, 0xFD]];
        for bytes in invalid.iter() {
            assert_eq!(super::detect(bytes), ContentType::Text);
        }
    }

    #[test]
    fn oversize_bytes_skip_utf8_scan() {
        // Starts like code, but is padded past the size limit, so the size
        // check must return text before any content is examined.
        let mut huge = b"fn main() { println!(\"x\"); }\n".to_vec();
        huge.resize(MAX_CLASSIFY_BYTES + 1, b'a');
        assert_eq!(super::detect(&huge), ContentType::Text);
    }
}