1use crate::rest::ContentType;
18
/// Upper bound, in bytes, on the trimmed input that `detect` will inspect (64 KiB).
/// Anything longer is reported as plain text without further analysis.
const MAX_CLASSIFY_BYTES: usize = 1 << 16;

/// Symbol-to-alphanumeric ratio (as computed by `scan`) that must be exceeded,
/// together with at least one code bigram, for input to be treated as code.
const SYMBOL_RATIO_THRESHOLD: f32 = 0.2;
21
/// URL schemes that `detect` accepts when deciding whether a whitespace-free
/// input is a URL. Input that parses as a URL with any other scheme is not
/// reported as a URL.
const ALLOWED_URL_SCHEMES: &[&str] = &[
    // Web.
    "http", "https",
    // File transfer and remote shells.
    "ftp", "ftps", "ssh", "sftp",
    // Mail and local files.
    "mailto", "file",
    // WebSockets.
    "ws", "wss",
];
25
/// Prefixes that mark a line as source code.
///
/// `detect` strips leading whitespace from every line and reports code as soon
/// as one line starts with any of these. Most keywords carry a trailing space
/// (or an opening parenthesis) so that ordinary words sharing the same leading
/// letters are not matched.
const CODE_LINE_OPENERS: &[&str] = &[
    // Function and type definitions.
    "fn ", "def ", "function ", "function(",
    "class ", "interface ", "trait ", "impl ", "struct ", "enum ", "type ",
    // Imports, exports and module structure.
    "import ", "from ", "export ", "module ", "package ", "use ", "namespace ",
    // Bindings and modifiers.
    "const ", "let ", "var ", "pub ", "static ",
    // Statement keywords.
    "async ", "await ", "return ", "yield ", "throw ",
    // C-style control flow with an opening parenthesis.
    "if (", "for (", "while (", "switch (", "catch (",
    // Preprocessor directives and shebang lines.
    "#include", "#define", "#!/",
];
67
/// Short operator-like sequences that are common in code and rare in prose.
///
/// `scan` counts how many *distinct* entries occur anywhere in the input, not
/// how many times each one occurs. Most entries are two characters long; the
/// final `...` is the one three-character exception.
const CODE_BIGRAMS: &[&str] = &[
    // Arrows, paths and comparisons.
    "=>", "->", "::", "!=", "==",
    // Logical operators.
    "&&", "||",
    // Markup tag endings.
    "</", "/>",
    // Comment delimiters.
    "//", "/*", "*/",
    // Increment / decrement.
    "++", "--",
    // Ordering comparisons and shifts.
    ">=", "<=", ">>", "<<",
    // Spread / ellipsis.
    "...",
];
73
74pub fn detect(content: &str) -> ContentType {
76 let s = content.trim();
77 if s.is_empty() || s.len() > MAX_CLASSIFY_BYTES {
78 return ContentType::Text;
79 }
80
81 if s.starts_with("#!/") {
82 return ContentType::Code;
83 }
84
85 if !s.chars().any(char::is_whitespace) {
86 if let Ok(url) = url::Url::parse(s) {
87 if ALLOWED_URL_SCHEMES.contains(&url.scheme()) {
88 return ContentType::Url;
89 }
90 }
91 }
92
93 let bytes = s.as_bytes();
94 let first = bytes[0];
95 let last = bytes[bytes.len() - 1];
96 if ((first == b'{' && last == b'}') || (first == b'[' && last == b']'))
97 && serde_json::from_str::<serde_json::Value>(s).is_ok()
98 {
99 return ContentType::Code;
100 }
101
102 for line in s.lines() {
103 let trimmed = line.trim_start();
104 if CODE_LINE_OPENERS.iter().any(|kw| trimmed.starts_with(kw)) {
105 return ContentType::Code;
106 }
107 }
108
109 let scan = scan(s);
110 if scan.symbol_ratio > SYMBOL_RATIO_THRESHOLD && scan.bigram_count >= 1 {
111 return ContentType::Code;
112 }
113 if scan.bigram_count >= 2 {
114 return ContentType::Code;
115 }
116 if scan.indented_lines >= 1 && scan.bigram_count >= 1 {
117 return ContentType::Code;
118 }
119
120 ContentType::Text
121}
122
/// Statistics gathered by `scan` in a single pass over the input.
#[derive(Debug, Clone, Copy, PartialEq)]
struct ScanResult {
    /// Number of code-symbol bytes divided by the number of alphanumeric
    /// characters; `0.0` when the input has no alphanumeric characters.
    symbol_ratio: f32,
    /// Number of distinct `CODE_BIGRAMS` entries found in the input.
    bigram_count: usize,
    /// Number of lines that begin with a tab or with at least two spaces.
    indented_lines: usize,
}
128
129fn scan(s: &str) -> ScanResult {
130 let bytes = s.as_bytes();
131 let mut symbol_count: usize = 0;
132 let mut alnum_count: usize = 0;
133 let mut indented_lines: usize = 0;
134
135 if is_indent_at(bytes, 0) {
136 indented_lines += 1;
137 }
138 for (i, &b) in bytes.iter().enumerate() {
139 if is_code_symbol(b) {
140 symbol_count += 1;
141 } else if b.is_ascii_alphanumeric() {
142 alnum_count += 1;
143 } else if b == b'\n' && is_indent_at(bytes, i + 1) {
144 indented_lines += 1;
145 }
146 }
147
148 let bigram_count = CODE_BIGRAMS.iter().filter(|p| s.contains(*p)).count();
149 let symbol_ratio = if alnum_count == 0 {
150 0.0
151 } else {
152 symbol_count as f32 / alnum_count as f32
153 };
154
155 ScanResult {
156 symbol_ratio,
157 bigram_count,
158 indented_lines,
159 }
160}
161
/// Returns `true` when `b` is one of the ASCII punctuation bytes treated as a
/// code symbol: brackets (`{}()[]`), `;`, `:`, `=`, `<`, `>`, `/`, `\`, `|`,
/// `&`, `*` and `+`.
const fn is_code_symbol(b: u8) -> bool {
    const SYMBOLS: &[u8] = b"{}()[];=<>/\\|&*+:";

    // Manual loop: slice search helpers are not callable from a `const fn`.
    let mut idx = 0;
    while idx < SYMBOLS.len() {
        if SYMBOLS[idx] == b {
            return true;
        }
        idx += 1;
    }
    false
}
183
/// Returns `true` when the bytes starting at offset `i` begin with indentation:
/// either a single tab or two consecutive spaces. An offset past the end of
/// `bytes` yields `false`.
fn is_indent_at(bytes: &[u8], i: usize) -> bool {
    matches!(bytes.get(i..), Some([b'\t', ..] | [b' ', b' ', ..]))
}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is classified as plain text.
    #[track_caller]
    fn expect_text(input: &str) {
        assert_eq!(detect(input), ContentType::Text);
    }

    /// Asserts that `input` is classified as a URL.
    #[track_caller]
    fn expect_url(input: &str) {
        assert_eq!(detect(input), ContentType::Url);
    }

    /// Asserts that `input` is classified as code.
    #[track_caller]
    fn expect_code(input: &str) {
        assert_eq!(detect(input), ContentType::Code);
    }

    // ---- Plain text -------------------------------------------------------

    #[test]
    fn empty_is_text() {
        expect_text("");
        expect_text(" \n\t ");
    }

    #[test]
    fn short_prose_is_text() {
        expect_text("Hello world");
        expect_text("This is a normal sentence with a period.");
    }

    #[test]
    fn korean_prose_is_text() {
        expect_text("안녕하세요. 오늘 회의는 3시입니다.");
    }

    #[test]
    fn long_prose_is_text() {
        let paragraph = "The quick brown fox jumps over the lazy dog. \
                         This is a longer paragraph designed to test that prose, \
                         even with occasional punctuation like commas, periods, and \
                         apostrophes, does not cross the code threshold.";
        expect_text(paragraph);
    }

    // ---- URLs -------------------------------------------------------------

    #[test]
    fn https_url() {
        expect_url("https://example.com");
        expect_url("https://example.com/path?q=1&r=2#frag");
        // Surrounding whitespace is trimmed before classification.
        expect_url(" https://example.com ");
    }

    #[test]
    fn other_schemes_url() {
        expect_url("http://localhost:8080");
        expect_url("mailto:foo@bar.com");
        expect_url("ssh://user@host.com");
        expect_url("file:///tmp/x");
        expect_url("wss://relay.example.com/v1/stream");
    }

    #[test]
    fn url_with_whitespace_is_not_url() {
        expect_text("check out https://example.com today");
    }

    #[test]
    fn bare_hostname_is_not_url() {
        expect_text("example.com");
        expect_text("foo.bar.baz");
    }

    #[test]
    fn windows_path_is_not_url() {
        expect_text("c:\\users\\me\\file.txt");
    }

    // ---- Code -------------------------------------------------------------

    #[test]
    fn shebang_is_code() {
        expect_code("#!/usr/bin/env bash\necho hello");
        expect_code("#!/bin/sh");
    }

    #[test]
    fn json_object_is_code() {
        expect_code(r#"{"key": "value"}"#);
        expect_code(r#"{"nested": {"deep": [1, 2, 3]}, "ok": true}"#);
    }

    #[test]
    fn json_array_is_code() {
        expect_code("[1, 2, 3]");
    }

    #[test]
    fn json_shaped_but_invalid_is_text() {
        expect_text("{not really json}");
    }

    #[test]
    fn rust_snippet_is_code() {
        expect_code("fn main() {\n let x = 42;\n println!(\"{}\", x);\n}");
    }

    #[test]
    fn python_snippet_is_code() {
        expect_code("def greet(name):\n return f\"hello, {name}\"\n\nprint(greet(\"world\"))");
    }

    #[test]
    fn typescript_snippet_is_code() {
        expect_code("const add = (a: number, b: number) => a + b;\nexport { add };");
    }

    #[test]
    fn javascript_one_liner_is_code() {
        // A single line is enough when it starts with a code opener (`const `).
        expect_code("const foo = bar.map(x => x * 2);");
    }

    #[test]
    fn import_statement_is_code() {
        expect_code("import { useState } from 'react';");
    }

    #[test]
    fn html_snippet_is_code() {
        expect_code("<div class=\"foo\">\n <span>hi</span>\n</div>");
    }

    #[test]
    fn c_include_is_code() {
        expect_code("#include <stdio.h>");
    }

    // ---- Guard rails ------------------------------------------------------

    #[test]
    fn huge_input_skips_classification() {
        let oversized = "a".repeat(MAX_CLASSIFY_BYTES + 1);
        expect_text(&oversized);
    }

    #[test]
    fn prose_with_arrow_is_text() {
        // Repeating one bigram still counts as a single distinct bigram, and the
        // symbol ratio stays low, so this must not be classified as code.
        expect_text("Move the cursor -> click submit -> wait");
    }

    #[test]
    fn prose_with_let_is_text() {
        // "let " appears mid-sentence, not at the start of a line, so the
        // line-opener check must not fire.
        expect_text("Why don't you let me know when you're free.");
    }
}