Skip to main content

client_core/
classify.rs

1//! Content classification for plain-text clipboard input.
2//!
3//! Returns `Text`, `Url`, or `Code`. The caller must have already ruled out
4//! image bytes via magic-byte sniffing — this module never returns `Image`.
5//!
6//! Decision order (first match wins):
7//!   1. trim; empty / > 64 KB → Text
8//!   2. shebang `#!/...` → Code
9//!   3. whole-string URL parse with scheme allow-list → Url
10//!   4. `{...}` / `[...]` shape + valid JSON → Code
11//!   5. any line starts with a code-opener keyword → Code
12//!   6. symbol-to-alphanumeric ratio > 0.20 with at least one code bigram → Code
13//!   7. ≥ 2 distinct code bigrams → Code
14//!   8. indented line(s) with a code bigram → Code
15//!   9. otherwise → Text
16
17use crate::rest::ContentType;
18
/// Upper bound, in bytes, on the trimmed input that `detect` will inspect;
/// anything longer is returned as `Text` without running any further checks.
const MAX_CLASSIFY_BYTES: usize = 64 * 1024;
/// Symbol-to-alphanumeric ratio that must be exceeded (together with at least
/// one code bigram) for the ratio rule in `detect` to report `Code`.
const SYMBOL_RATIO_THRESHOLD: f32 = 0.20;

/// URL schemes for which `detect` reports `Url`. A whitespace-free input that
/// parses as a URL with any other scheme falls through to the later rules.
const ALLOWED_URL_SCHEMES: &[&str] = &[
    "http", "https", "ftp", "ftps", "ssh", "sftp", "mailto", "file", "ws", "wss",
];
25
/// Tokens that mark the input as code when any line begins with one of them
/// (after the line's leading whitespace is stripped).
///
/// The trailing space / paren keeps a keyword from matching as the prefix of a
/// longer word: "use " does not match "useful" and "let " does not match
/// "letter". It does NOT protect prose lines that begin with the keyword as a
/// whole word — "use this" or "let me know" at the start of a line still
/// match, because the comparison is a plain `starts_with`.
const CODE_LINE_OPENERS: &[&str] = &[
    "fn ",
    "def ",
    "function ",
    "function(",
    "class ",
    "interface ",
    "trait ",
    "impl ",
    "struct ",
    "enum ",
    "type ",
    "import ",
    "from ",
    "export ",
    "module ",
    "package ",
    "use ",
    "namespace ",
    "const ",
    "let ",
    "var ",
    "pub ",
    "static ",
    "async ",
    "await ",
    "return ",
    "yield ",
    "throw ",
    // Control-flow keywords only count when followed by " (".
    "if (",
    "for (",
    "while (",
    "switch (",
    "catch (",
    // These entries carry no trailing delimiter.
    "#include",
    "#define",
    "#!/",
];
67
/// Short operator / punctuation sequences treated as code signals.
///
/// Most entries are two characters; `...` is three. `scan` counts how many
/// distinct entries occur anywhere in the input, not how many times each one
/// occurs. The entries are intended to be rare in natural-language prose, but
/// some (`...`, `--`, `//`) do appear in prose and URLs, so a single distinct
/// hit never decides the classification on its own.
const CODE_BIGRAMS: &[&str] = &[
    "=>", "->", "::", "!=", "==", "&&", "||", "</", "/>", "//", "/*", "*/", "++", "--", ">=", "<=",
    ">>", "<<", "...",
];
73
74/// Classify a plain-text clip. Never returns `Image`.
75pub fn detect(content: &str) -> ContentType {
76    let s = content.trim();
77    if s.is_empty() || s.len() > MAX_CLASSIFY_BYTES {
78        return ContentType::Text;
79    }
80
81    if s.starts_with("#!/") {
82        return ContentType::Code;
83    }
84
85    if !s.chars().any(char::is_whitespace) {
86        if let Ok(url) = url::Url::parse(s) {
87            if ALLOWED_URL_SCHEMES.contains(&url.scheme()) {
88                return ContentType::Url;
89            }
90        }
91    }
92
93    let bytes = s.as_bytes();
94    let first = bytes[0];
95    let last = bytes[bytes.len() - 1];
96    if ((first == b'{' && last == b'}') || (first == b'[' && last == b']'))
97        && serde_json::from_str::<serde_json::Value>(s).is_ok()
98    {
99        return ContentType::Code;
100    }
101
102    for line in s.lines() {
103        let trimmed = line.trim_start();
104        if CODE_LINE_OPENERS.iter().any(|kw| trimmed.starts_with(kw)) {
105            return ContentType::Code;
106        }
107    }
108
109    let scan = scan(s);
110    if scan.symbol_ratio > SYMBOL_RATIO_THRESHOLD && scan.bigram_count >= 1 {
111        return ContentType::Code;
112    }
113    if scan.bigram_count >= 2 {
114        return ContentType::Code;
115    }
116    if scan.indented_lines >= 1 && scan.bigram_count >= 1 {
117        return ContentType::Code;
118    }
119
120    ContentType::Text
121}
122
/// Surface statistics gathered by `scan` and consumed by the heuristic rules
/// in `detect`.
#[derive(Debug, Clone, Copy, PartialEq)]
struct ScanResult {
    /// Code-symbol count divided by alphanumeric count (0.0 when the text has
    /// no alphanumerics).
    symbol_ratio: f32,
    /// Number of distinct `CODE_BIGRAMS` entries found in the text.
    bigram_count: usize,
    /// Number of lines that begin with a tab or with at least two spaces.
    indented_lines: usize,
}
128
129fn scan(s: &str) -> ScanResult {
130    let bytes = s.as_bytes();
131    let mut symbol_count: usize = 0;
132    let mut alnum_count: usize = 0;
133    let mut indented_lines: usize = 0;
134
135    if is_indent_at(bytes, 0) {
136        indented_lines += 1;
137    }
138    for (i, &b) in bytes.iter().enumerate() {
139        if is_code_symbol(b) {
140            symbol_count += 1;
141        } else if b.is_ascii_alphanumeric() {
142            alnum_count += 1;
143        } else if b == b'\n' && is_indent_at(bytes, i + 1) {
144            indented_lines += 1;
145        }
146    }
147
148    let bigram_count = CODE_BIGRAMS.iter().filter(|p| s.contains(*p)).count();
149    let symbol_ratio = if alnum_count == 0 {
150        0.0
151    } else {
152        symbol_count as f32 / alnum_count as f32
153    };
154
155    ScanResult {
156        symbol_ratio,
157        bigram_count,
158        indented_lines,
159    }
160}
161
/// Reports whether `b` is one of the ASCII punctuation bytes that the symbol
/// ratio treats as a code symbol.
const fn is_code_symbol(b: u8) -> bool {
    match b {
        // Brackets and delimiters.
        b'{' | b'}' | b'(' | b')' | b'[' | b']' | b'<' | b'>' => true,
        // Operators.
        b'=' | b'+' | b'*' | b'/' | b'\\' | b'|' | b'&' => true,
        // Statement / type punctuation.
        b';' | b':' => true,
        _ => false,
    }
}
183
/// Reports whether the bytes starting at index `i` form an indent: a single
/// tab, or two consecutive spaces. An index at or past the end of `bytes`
/// yields `false`.
fn is_indent_at(bytes: &[u8], i: usize) -> bool {
    matches!(
        bytes.get(i..),
        Some([b'\t', ..]) | Some([b' ', b' ', ..])
    )
}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is classified as plain text. Failures are
    /// reported at the calling test's line.
    #[track_caller]
    fn assert_text(input: &str) {
        assert_eq!(detect(input), ContentType::Text);
    }

    /// Asserts that `input` is classified as a URL.
    #[track_caller]
    fn assert_url(input: &str) {
        assert_eq!(detect(input), ContentType::Url);
    }

    /// Asserts that `input` is classified as code.
    #[track_caller]
    fn assert_code(input: &str) {
        assert_eq!(detect(input), ContentType::Code);
    }

    // ---- Text ----------------------------------------------------------

    #[test]
    fn empty_is_text() {
        assert_text("");
        assert_text("   \n\t ");
    }

    #[test]
    fn short_prose_is_text() {
        assert_text("Hello world");
        assert_text("This is a normal sentence with a period.");
    }

    #[test]
    fn korean_prose_is_text() {
        assert_text("안녕하세요. 오늘 회의는 3시입니다.");
    }

    #[test]
    fn long_prose_is_text() {
        let paragraph = "The quick brown fox jumps over the lazy dog. \
                         This is a longer paragraph designed to test that prose, \
                         even with occasional punctuation like commas, periods, and \
                         apostrophes, does not cross the code threshold.";
        assert_text(paragraph);
    }

    // ---- URLs ----------------------------------------------------------

    #[test]
    fn https_url() {
        assert_url("https://example.com");
        assert_url("https://example.com/path?q=1&r=2#frag");
        assert_url("  https://example.com  ");
    }

    #[test]
    fn other_schemes_url() {
        assert_url("http://localhost:8080");
        assert_url("mailto:foo@bar.com");
        assert_url("ssh://user@host.com");
        assert_url("file:///tmp/x");
        assert_url("wss://relay.example.com/v1/stream");
    }

    #[test]
    fn url_with_whitespace_is_not_url() {
        assert_text("check out https://example.com today");
    }

    #[test]
    fn bare_hostname_is_not_url() {
        assert_text("example.com");
        assert_text("foo.bar.baz");
    }

    #[test]
    fn windows_path_is_not_url() {
        assert_text("c:\\users\\me\\file.txt");
    }

    // ---- Code ----------------------------------------------------------

    #[test]
    fn shebang_is_code() {
        assert_code("#!/usr/bin/env bash\necho hello");
        assert_code("#!/bin/sh");
    }

    #[test]
    fn json_object_is_code() {
        assert_code(r#"{"key": "value"}"#);
        assert_code(r#"{"nested": {"deep": [1, 2, 3]}, "ok": true}"#);
    }

    #[test]
    fn json_array_is_code() {
        assert_code("[1, 2, 3]");
    }

    #[test]
    fn json_shaped_but_invalid_is_text() {
        assert_text("{not really json}");
    }

    #[test]
    fn rust_snippet_is_code() {
        assert_code("fn main() {\n    let x = 42;\n    println!(\"{}\", x);\n}");
    }

    #[test]
    fn python_snippet_is_code() {
        assert_code("def greet(name):\n    return f\"hello, {name}\"\n\nprint(greet(\"world\"))");
    }

    #[test]
    fn typescript_snippet_is_code() {
        assert_code("const add = (a: number, b: number) => a + b;\nexport { add };");
    }

    #[test]
    fn javascript_one_liner_is_code() {
        // A one-line `const` declaration, typical of what gets copied out of
        // a tutorial.
        assert_code("const foo = bar.map(x => x * 2);");
    }

    #[test]
    fn import_statement_is_code() {
        assert_code("import { useState } from 'react';");
    }

    #[test]
    fn html_snippet_is_code() {
        assert_code("<div class=\"foo\">\n  <span>hi</span>\n</div>");
    }

    #[test]
    fn c_include_is_code() {
        assert_code("#include <stdio.h>");
    }

    // ---- Limits and prose look-alikes ------------------------------------

    #[test]
    fn huge_input_skips_classification() {
        let oversized = "a".repeat(MAX_CLASSIFY_BYTES + 1);
        assert_text(&oversized);
    }

    #[test]
    fn prose_with_arrow_is_text() {
        // `->` used conversationally must not be enough to flip to code.
        assert_text("Move the cursor -> click submit -> wait");
    }

    #[test]
    fn prose_with_let_is_text() {
        // "let" in the middle of a sentence is not a line opener.
        assert_text("Why don't you let me know when you're free.");
    }
}
360            detect("Why don't you let me know when you're free."),
361            ContentType::Text
362        );
363    }
364}