Skip to main content

client_core/
classify.rs

1//! Content classification for clipboard input.
2//!
3//! Returns `Text`, `Url`, or `Code`. The caller must have already ruled out
4//! image bytes via magic-byte sniffing — this module never returns `Image`.
5//!
6//! Input is raw bytes: non-UTF-8 input short-circuits to `Text`, and input
7//! over `MAX_CLASSIFY_BYTES` short-circuits to `Text` without any UTF-8
8//! validation. This keeps `cinch push <huge-text>` (20 MB stdin) cheap —
9//! no O(n) UTF-8 walk before bailing.
10//!
11//! Decision order (first match wins):
12//!   1. > 64 KB bytes → Text (no UTF-8 scan)
13//!   2. invalid UTF-8 → Text
14//!   3. trim; empty → Text
15//!   4. shebang `#!/...` → Code
16//!   5. whole-string URL parse with scheme allow-list → Url
17//!   6. `{...}` / `[...]` shape + valid JSON → Code
18//!   7. any line starts with a code-opener keyword → Code
19//!   8. symbol-to-alphanumeric ratio > 0.20 with at least one code bigram → Code
20//!   9. ≥ 2 distinct code bigrams → Code
21//!  10. indented line(s) with a code bigram → Code
22//!  11. otherwise → Text
23
24use crate::rest::ContentType;
25
/// Largest input, in bytes, that is inspected at all. Anything longer is
/// reported as `Text` before any UTF-8 validation or scanning happens.
const MAX_CLASSIFY_BYTES: usize = 64 * 1024;
/// Symbol-to-alphanumeric ratio above which a clip counts as "symbol heavy".
/// The comparison is strict (`>`), and the ratio alone never decides: it is
/// only consulted together with at least one code bigram.
const SYMBOL_RATIO_THRESHOLD: f32 = 0.20;
28
/// URL schemes that qualify a whole-clip URL as `Url`. A clip that parses as
/// a URL with any other scheme falls through to the remaining checks.
const ALLOWED_URL_SCHEMES: &[&str] = &[
    // Web and WebSocket.
    "http", "https", "ws", "wss",
    // File transfer and remote shells.
    "ftp", "ftps", "sftp", "ssh",
    // Mail and local files.
    "mailto", "file",
];
32
/// Prefixes that mark a line as code when the left-trimmed line starts with
/// one of them.
///
/// The trailing space or paren stops an opener from matching a longer word
/// that merely shares its prefix: `"use "` does not match "user", and
/// `"let "` does not match "letter". It does NOT protect prose that begins
/// a line with the bare word itself ("let me know", "use this"); such lines
/// still match. Prose is only safe when the word appears mid-line.
const CODE_LINE_OPENERS: &[&str] = &[
    // Function and type definitions.
    "fn ",
    "def ",
    "function ",
    "function(",
    "class ",
    "interface ",
    "trait ",
    "impl ",
    "struct ",
    "enum ",
    "type ",
    // Imports, exports and module structure.
    "import ",
    "from ",
    "export ",
    "module ",
    "package ",
    "use ",
    "namespace ",
    // Bindings and declaration modifiers.
    "const ",
    "let ",
    "var ",
    "pub ",
    "static ",
    "async ",
    // Statement keywords.
    "await ",
    "return ",
    "yield ",
    "throw ",
    // Control flow written with a parenthesised condition.
    "if (",
    "for (",
    "while (",
    "switch (",
    "catch (",
    // Preprocessor directives and shebang lines.
    "#include",
    "#define",
    "#!/",
];
74
/// Short operator-like sequences that are far more common in code than in
/// prose. Each entry is counted at most once per clip (presence, not
/// frequency).
///
/// A few of these do turn up in ordinary writing (`...` as an ellipsis, `--`
/// as a dash, `//` inside a pasted URL), which is why a single hit is never
/// treated as sufficient evidence on its own.
const CODE_BIGRAMS: &[&str] = &[
    // Arrows and path separators.
    "=>", "->", "::",
    // Comparison and logical operators.
    "!=", "==", ">=", "<=", "&&", "||",
    // Markup tag edges.
    "</", "/>",
    // Comment delimiters.
    "//", "/*", "*/",
    // Increment/decrement, shifts, and spread/range.
    "++", "--", ">>", "<<", "...",
];
80
81/// Classify a clip from raw bytes. Never returns `Image`.
82///
83/// Bytes input avoids an O(n) UTF-8 scan on large clipboard payloads:
84/// callers can pass `&Vec<u8>` directly (e.g. `cinch push` stdin), and this
85/// function caps both the byte buffer and the UTF-8 validation at
86/// `MAX_CLASSIFY_BYTES` — anything past that boundary cannot affect the
87/// classification decision anyway.
88pub fn detect(content: &[u8]) -> ContentType {
89    // Oversize bytes short-circuit to Text, preserving the prior
90    // ">64 KB → Text" semantic without touching the buffer.
91    if content.len() > MAX_CLASSIFY_BYTES {
92        return ContentType::Text;
93    }
94    // Genuinely binary / non-UTF-8 input: caller should have caught image
95    // bytes via magic-byte sniffing; everything else degrades to Text.
96    let s = match std::str::from_utf8(content) {
97        Ok(s) => s,
98        Err(_) => return ContentType::Text,
99    };
100    detect_str(s)
101}
102
103fn detect_str(content: &str) -> ContentType {
104    let s = content.trim();
105    if s.is_empty() {
106        return ContentType::Text;
107    }
108
109    if s.starts_with("#!/") {
110        return ContentType::Code;
111    }
112
113    if !s.chars().any(char::is_whitespace) {
114        if let Ok(url) = url::Url::parse(s) {
115            if ALLOWED_URL_SCHEMES.contains(&url.scheme()) {
116                return ContentType::Url;
117            }
118        }
119    }
120
121    let bytes = s.as_bytes();
122    let first = bytes[0];
123    let last = bytes[bytes.len() - 1];
124    if ((first == b'{' && last == b'}') || (first == b'[' && last == b']'))
125        && serde_json::from_str::<serde_json::Value>(s).is_ok()
126    {
127        return ContentType::Code;
128    }
129
130    for line in s.lines() {
131        let trimmed = line.trim_start();
132        if CODE_LINE_OPENERS.iter().any(|kw| trimmed.starts_with(kw)) {
133            return ContentType::Code;
134        }
135    }
136
137    let scan = scan(s);
138    if scan.symbol_ratio > SYMBOL_RATIO_THRESHOLD && scan.bigram_count >= 1 {
139        return ContentType::Code;
140    }
141    if scan.bigram_count >= 2 {
142        return ContentType::Code;
143    }
144    if scan.indented_lines >= 1 && scan.bigram_count >= 1 {
145        return ContentType::Code;
146    }
147
148    ContentType::Text
149}
150
/// Lexical statistics produced by `scan` for the final heuristic stage.
struct ScanResult {
    /// Code-symbol count divided by alphanumeric count; `0.0` when the text
    /// contains no alphanumerics.
    symbol_ratio: f32,
    /// How many distinct `CODE_BIGRAMS` entries occur at least once.
    bigram_count: usize,
    /// How many lines begin with a tab or with two or more spaces.
    indented_lines: usize,
}
156
157fn scan(s: &str) -> ScanResult {
158    let bytes = s.as_bytes();
159    let mut symbol_count: usize = 0;
160    let mut alnum_count: usize = 0;
161    let mut indented_lines: usize = 0;
162
163    if is_indent_at(bytes, 0) {
164        indented_lines += 1;
165    }
166    for (i, &b) in bytes.iter().enumerate() {
167        if is_code_symbol(b) {
168            symbol_count += 1;
169        } else if b.is_ascii_alphanumeric() {
170            alnum_count += 1;
171        } else if b == b'\n' && is_indent_at(bytes, i + 1) {
172            indented_lines += 1;
173        }
174    }
175
176    let bigram_count = CODE_BIGRAMS.iter().filter(|p| s.contains(*p)).count();
177    let symbol_ratio = if alnum_count == 0 {
178        0.0
179    } else {
180        symbol_count as f32 / alnum_count as f32
181    };
182
183    ScanResult {
184        symbol_ratio,
185        bigram_count,
186        indented_lines,
187    }
188}
189
/// Whether `b` is one of the ASCII punctuation bytes counted as a "code
/// symbol" when computing the symbol-to-alphanumeric ratio.
///
/// Characters such as `-`, `.`, `,`, `!`, `?` and quotes are not in the set.
const fn is_code_symbol(b: u8) -> bool {
    let bracket = matches!(b, b'{' | b'}' | b'(' | b')' | b'[' | b']');
    let punctuation = matches!(b, b';' | b':' | b'=');
    let operator = matches!(b, b'<' | b'>' | b'/' | b'\\' | b'|' | b'&' | b'*' | b'+');
    bracket || punctuation || operator
}
211
/// Whether indentation starts at byte offset `i`: either a tab, or a space
/// immediately followed by another space. A lone space does not count, and
/// an offset at or past the end of `bytes` yields `false`.
fn is_indent_at(bytes: &[u8], i: usize) -> bool {
    matches!(
        bytes.get(i..),
        Some([b'\t', ..]) | Some([b' ', b' ', ..])
    )
}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify a string through the public bytes API. Paths that only exist
    /// for raw bytes (non-UTF-8, oversize) call `super::detect` directly.
    fn detect(s: &str) -> ContentType {
        super::detect(s.as_bytes())
    }

    /// Assert that every input classifies as `expected`, naming the input
    /// that failed.
    fn assert_all(inputs: &[&str], expected: &ContentType) {
        for input in inputs {
            assert_eq!(&detect(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn empty_is_text() {
        assert_all(&["", "   \n\t "], &ContentType::Text);
    }

    #[test]
    fn short_prose_is_text() {
        assert_all(
            &["Hello world", "This is a normal sentence with a period."],
            &ContentType::Text,
        );
    }

    #[test]
    fn korean_prose_is_text() {
        assert_all(
            &["안녕하세요. 오늘 회의는 3시입니다."],
            &ContentType::Text,
        );
    }

    #[test]
    fn long_prose_is_text() {
        let paragraph = concat!(
            "The quick brown fox jumps over the lazy dog. ",
            "This is a longer paragraph designed to test that prose, ",
            "even with occasional punctuation like commas, periods, and ",
            "apostrophes, does not cross the code threshold.",
        );
        assert_all(&[paragraph], &ContentType::Text);
    }

    #[test]
    fn https_url() {
        assert_all(
            &[
                "https://example.com",
                "https://example.com/path?q=1&r=2#frag",
                // Surrounding whitespace is trimmed before the URL check.
                "  https://example.com  ",
            ],
            &ContentType::Url,
        );
    }

    #[test]
    fn other_schemes_url() {
        assert_all(
            &[
                "http://localhost:8080",
                "mailto:foo@bar.com",
                "ssh://user@host.com",
                "file:///tmp/x",
                "wss://relay.example.com/v1/stream",
            ],
            &ContentType::Url,
        );
    }

    #[test]
    fn url_with_whitespace_is_not_url() {
        assert_all(
            &["check out https://example.com today"],
            &ContentType::Text,
        );
    }

    #[test]
    fn bare_hostname_is_not_url() {
        assert_all(&["example.com", "foo.bar.baz"], &ContentType::Text);
    }

    #[test]
    fn windows_path_is_not_url() {
        assert_all(&[r"c:\users\me\file.txt"], &ContentType::Text);
    }

    #[test]
    fn shebang_is_code() {
        assert_all(
            &["#!/usr/bin/env bash\necho hello", "#!/bin/sh"],
            &ContentType::Code,
        );
    }

    #[test]
    fn json_object_is_code() {
        assert_all(
            &[
                r#"{"key": "value"}"#,
                r#"{"nested": {"deep": [1, 2, 3]}, "ok": true}"#,
            ],
            &ContentType::Code,
        );
    }

    #[test]
    fn json_array_is_code() {
        assert_all(&["[1, 2, 3]"], &ContentType::Code);
    }

    #[test]
    fn json_shaped_but_invalid_is_text() {
        assert_all(&["{not really json}"], &ContentType::Text);
    }

    #[test]
    fn rust_snippet_is_code() {
        let snippet = [
            "fn main() {",
            "    let x = 42;",
            "    println!(\"{}\", x);",
            "}",
        ]
        .join("\n");
        assert_eq!(detect(&snippet), ContentType::Code);
    }

    #[test]
    fn python_snippet_is_code() {
        let snippet = [
            "def greet(name):",
            "    return f\"hello, {name}\"",
            "",
            "print(greet(\"world\"))",
        ]
        .join("\n");
        assert_eq!(detect(&snippet), ContentType::Code);
    }

    #[test]
    fn typescript_snippet_is_code() {
        let snippet = [
            "const add = (a: number, b: number) => a + b;",
            "export { add };",
        ]
        .join("\n");
        assert_eq!(detect(&snippet), ContentType::Code);
    }

    #[test]
    fn javascript_one_liner_is_code() {
        // A single-line declaration of the kind routinely copied out of a
        // tutorial.
        assert_all(&["const foo = bar.map(x => x * 2);"], &ContentType::Code);
    }

    #[test]
    fn import_statement_is_code() {
        assert_all(&["import { useState } from 'react';"], &ContentType::Code);
    }

    #[test]
    fn html_snippet_is_code() {
        let snippet = [r#"<div class="foo">"#, "  <span>hi</span>", "</div>"].join("\n");
        assert_eq!(detect(&snippet), ContentType::Code);
    }

    #[test]
    fn c_include_is_code() {
        assert_all(&["#include <stdio.h>"], &ContentType::Code);
    }

    #[test]
    fn huge_input_skips_classification() {
        let oversize = "a".repeat(MAX_CLASSIFY_BYTES + 1);
        assert_eq!(detect(&oversize), ContentType::Text);
    }

    #[test]
    fn prose_with_arrow_is_text() {
        // One kind of arrow in conversational text is not enough for Code.
        assert_all(
            &["Move the cursor -> click submit -> wait"],
            &ContentType::Text,
        );
    }

    #[test]
    fn prose_with_let_is_text() {
        // "let" in the middle of a sentence is not a line opener.
        assert_all(
            &["Why don't you let me know when you're free."],
            &ContentType::Text,
        );
    }

    #[test]
    fn non_utf8_bytes_is_text() {
        let invalid: [&[u8]; 2] = [
            // 0xC3 opens a two-byte sequence, but 0x28 is not a continuation
            // byte.
            &[0xC3, 0x28],
            // Bytes that can never appear in well-formed UTF-8.
            &[0xFF, 0xFE, 0xFD],
        ];
        for bytes in invalid.iter() {
            assert_eq!(super::detect(bytes), ContentType::Text);
        }
    }

    #[test]
    fn oversize_bytes_skip_utf8_scan() {
        // The head is unmistakably code, yet the size check wins and the
        // result is Text.
        let mut oversize = b"fn main() { println!(\"x\"); }\n".to_vec();
        oversize.resize(MAX_CLASSIFY_BYTES + 1, b'a');
        assert_eq!(super::detect(&oversize), ContentType::Text);
    }
}