// vision_squeezer/lib.rs
1use std::io::Cursor;
2
3use base64::{Engine, engine::general_purpose::STANDARD as B64};
4use chrono::Utc;
5use image::{DynamicImage, ImageBuffer, Luma, imageops::FilterType};
6use rusqlite::{Connection, params};
7use std::path::PathBuf;
8// ── Config ────────────────────────────────────────────────────────────────────
9
/// Output encoding format for the processed image.
///
/// Both formats are encoded lossily at `ProcessConfig::quality`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum OutputFormat {
    /// JPEG at configured quality (default).
    #[default]
    Jpeg,
    /// WebP at configured quality — typically 30-50% smaller than JPEG at equal quality.
    WebP,
}
19
/// All tuneable knobs for the pipeline.
///
/// Construct with [`ProcessConfig::default`] or fluently via [`ProcessConfig::builder`].
#[derive(Clone, Debug)]
pub struct ProcessConfig {
    /// Output quality 1–100 (default 75). Applies to both JPEG and WebP.
    pub quality: u8,
    /// LLM patch size in pixels. Overridden when `target_model` is set.
    pub tile_size: u32,
    /// Remove solid-color padding borders before resizing (default true).
    pub crop: bool,
    /// Max channel delta to treat a pixel as background (default 15).
    pub bg_tolerance: u8,
    /// Output encoding format (default: JPEG).
    pub output_format: OutputFormat,
    /// When set, resizing is model-aware (accounts for pre-scaling behavior).
    pub target_model: Option<VisionModel>,
    /// Limit the maximum number of tiles the output image can consume.
    /// `Some(0)` is treated as "no limit" downstream.
    pub max_tiles: Option<u32>,
}
38
39impl Default for ProcessConfig {
40    fn default() -> Self {
41        Self {
42            quality: 75,
43            tile_size: 512,
44            crop: true,
45            bg_tolerance: 15,
46            output_format: OutputFormat::Jpeg,
47            target_model: None,
48            max_tiles: None,
49        }
50    }
51}
52
impl ProcessConfig {
    /// Start a fluent builder pre-populated with [`ProcessConfig::default`] values.
    pub fn builder() -> ProcessConfigBuilder {
        ProcessConfigBuilder(Self::default())
    }
}
58
/// Fluent builder for [`ProcessConfig`]; finish with [`ProcessConfigBuilder::build`].
pub struct ProcessConfigBuilder(ProcessConfig);
60
61impl ProcessConfigBuilder {
62    pub fn quality(mut self, q: u8) -> Self {
63        self.0.quality = q.clamp(1, 100);
64        self
65    }
66    pub fn tile_size(mut self, t: u32) -> Self {
67        self.0.tile_size = t.max(1);
68        self
69    }
70    pub fn crop(mut self, c: bool) -> Self {
71        self.0.crop = c;
72        self
73    }
74    pub fn bg_tolerance(mut self, t: u8) -> Self {
75        self.0.bg_tolerance = t;
76        self
77    }
78    pub fn output_format(mut self, f: OutputFormat) -> Self {
79        self.0.output_format = f;
80        self
81    }
82    pub fn target_model(mut self, m: VisionModel) -> Self {
83        self.0.target_model = Some(m);
84        self
85    }
86    pub fn max_tiles(mut self, m: u32) -> Self {
87        self.0.max_tiles = Some(m);
88        self
89    }
90    pub fn build(self) -> ProcessConfig {
91        self.0
92    }
93}
94
95// ── Token Estimation ──────────────────────────────────────────────────────────
96
/// Supported vision model families with their patch pricing.
///
/// Each variant selects a different token/tile formula in [`estimate_tokens`]
/// and a different snapping strategy in [`optimal_send_dimensions`].
#[derive(Clone, Copy, Debug)]
pub enum VisionModel {
    /// Claude 3.5/4.5/4.6/4.7: Area-based calculation (Tokens ≈ width × height / 750).
    Claude,
    /// GPT-4o / GPT-4.5 high detail: fits in 2048x2048, scales short side to 768, then 512x512 tiles.
    Gpt4o,
    /// GPT-5/5.5: 6000px max dim, 10.24M max pixels, 512×512 tiles, 1536 token cap.
    Gpt5,
    /// Gemini 2.0/3.0: flat 258 tokens if ≤ 384x384, else 258 per 768x768 tile.
    Gemini15,
}
109
/// Result of a token estimate for one model.
#[derive(Debug)]
pub struct TokenEstimate {
    /// Model family the estimate was computed for.
    pub model: VisionModel,
    /// Estimated prompt tokens consumed by the image.
    pub tokens: u32,
    /// Number of tiles the image occupies (1 for area-priced models).
    pub tiles: u32,
}
116
117/// Estimate LLM vision tokens for an image of given dimensions.
118pub fn estimate_tokens(width: u32, height: u32, model: VisionModel) -> TokenEstimate {
119    match model {
120        VisionModel::Claude => {
121            // 2026 area-based pricing for Claude
122            let tokens = ((width as u64 * height as u64) / 750) as u32;
123            TokenEstimate {
124                model,
125                tiles: 1,
126                tokens: tokens.max(85),
127            }
128        }
129        VisionModel::Gpt4o => {
130            // GPT-4o / 4.5: fit within 2048x2048, then short side scaled to 768px, then 512x512 tiles.
131            let (mut w, mut h) = fit_within(width, height, 2048);
132            let short_side = w.min(h);
133            if short_side > 768 {
134                let scale = 768.0 / short_side as f64;
135                w = (w as f64 * scale).round() as u32;
136                h = (h as f64 * scale).round() as u32;
137            }
138            let tiles = tile_count(w, 512) * tile_count(h, 512);
139            TokenEstimate {
140                model,
141                tiles,
142                tokens: 85 + tiles * 170,
143            }
144        }
145        VisionModel::Gpt5 => {
146            let (w, h) = fit_within_pixels(width, height, 6000, 10_240_000);
147            let tiles = tile_count(w, 512) * tile_count(h, 512);
148            TokenEstimate {
149                model,
150                tiles,
151                tokens: (85 + tiles * 170).min(1536),
152            }
153        }
154        VisionModel::Gemini15 => {
155            // Gemini 2026: flat 258 if <= 384x384, else 768x768 tiles.
156            if width <= 384 && height <= 384 {
157                TokenEstimate {
158                    model,
159                    tiles: 1,
160                    tokens: 258,
161                }
162            } else {
163                let tiles = tile_count(width, 768) * tile_count(height, 768);
164                TokenEstimate {
165                    model,
166                    tiles,
167                    tokens: tiles * 258,
168                }
169            }
170        }
171    }
172}
173
/// Scale dimensions to fit within `max_side` while preserving aspect ratio.
///
/// Dimensions already inside the box are returned untouched; otherwise both
/// sides are multiplied by `max_side / longest_side` and truncated.
pub fn fit_within(width: u32, height: u32, max_side: u32) -> (u32, u32) {
    let longest = width.max(height);
    if longest <= max_side {
        (width, height)
    } else {
        let ratio = max_side as f64 / longest as f64;
        ((width as f64 * ratio) as u32, (height as f64 * ratio) as u32)
    }
}
185
186/// Scale dimensions to fit within both a max-side limit and a total-pixel limit.
187pub fn fit_within_pixels(width: u32, height: u32, max_side: u32, max_pixels: u64) -> (u32, u32) {
188    let (mut w, mut h) = fit_within(width, height, max_side);
189    let total = w as u64 * h as u64;
190    if total > max_pixels {
191        let scale = (max_pixels as f64 / total as f64).sqrt();
192        w = (w as f64 * scale) as u32;
193        h = (h as f64 * scale) as u32;
194    }
195    (w.max(1), h.max(1))
196}
197
/// Compute the optimal dimensions to *send* to a given model to minimize tiles.
///
/// For models that pre-scale images (GPT-4o, Gemini), we simulate their scaling,
/// snap the scaled result to tile boundaries, then invert back to input space.
/// For Claude (no pre-scaling), we snap the input directly.
pub fn optimal_send_dimensions(width: u32, height: u32, model: VisionModel) -> (u32, u32) {
    match model {
        VisionModel::Claude => {
            // Claude is now area-based, so tiling doesn't dictate a specific rigid boundary.
            // But we still snap to 256 or 512 so dimensions aren't completely arbitrary.
            (
                snap_to_tile_boundary(width, 256),
                snap_to_tile_boundary(height, 256),
            )
        }
        VisionModel::Gpt4o => optimal_for_prescaling_model(width, height, 2048, 512),
        VisionModel::Gpt5 => {
            // Simulate GPT-5's pre-fit (6000-px max side, 10.24M pixel budget),
            // then snap down onto the 512 grid with a one-tile floor.
            let (fw, fh) = fit_within_pixels(width, height, 6000, 10_240_000);
            (
                snap_to_tile_boundary(fw, 512).max(512),
                snap_to_tile_boundary(fh, 512).max(512),
            )
        }
        VisionModel::Gemini15 => {
            // Gemini uses 768x768 tiles if > 384x384; images already in the
            // flat-rate (≤384×384) bucket are left untouched.
            if width <= 384 && height <= 384 {
                (width, height)
            } else {
                optimal_for_prescaling_model(width, height, 4096, 768)
            }
        }
    }
}
231
/// For models that pre-scale (GPT-4o, Gemini), compute input dimensions that
/// map cleanly onto the model's tile grid.
///
/// The input is first fit within `max_side` (simulating the model's pre-scale),
/// the fitted dimensions are snapped *down* to the nearest tile boundary
/// (never below one tile), and — if the input exceeded `max_side` — the
/// snapped target is scaled back up into the original input space so the
/// model's own downscaling lands on the tile grid.
fn optimal_for_prescaling_model(width: u32, height: u32, max_side: u32, tile: u32) -> (u32, u32) {
    let (fw, fh) = fit_within(width, height, max_side);

    // Simply snap the fitted dimensions to the nearest tile boundary
    let target_w = snap_to_tile_boundary(fw, tile).max(tile);
    let target_h = snap_to_tile_boundary(fh, tile).max(tile);

    // If image was larger than max_side, scale back to input space
    if width > max_side || height > max_side {
        let scale = width.max(height) as f64 / max_side as f64;
        let opt_w = (target_w as f64 * scale).round() as u32;
        let opt_h = (target_h as f64 * scale).round() as u32;
        return (opt_w.max(1), opt_h.max(1));
    }

    (target_w, target_h)
}
256
/// Full token savings report for a before/after dimension pair across all models.
///
/// One `(before, after)` estimate pair per supported model family.
pub struct TokenSavingsTable {
    pub claude_before: TokenEstimate,
    pub claude_after: TokenEstimate,
    pub gpt4o_before: TokenEstimate,
    pub gpt4o_after: TokenEstimate,
    pub gpt5_before: TokenEstimate,
    pub gpt5_after: TokenEstimate,
    pub gemini_before: TokenEstimate,
    pub gemini_after: TokenEstimate,
}
268
269pub fn token_savings_table(orig_w: u32, orig_h: u32, opt_w: u32, opt_h: u32) -> TokenSavingsTable {
270    TokenSavingsTable {
271        claude_before: estimate_tokens(orig_w, orig_h, VisionModel::Claude),
272        claude_after: estimate_tokens(opt_w, opt_h, VisionModel::Claude),
273        gpt4o_before: estimate_tokens(orig_w, orig_h, VisionModel::Gpt4o),
274        gpt4o_after: estimate_tokens(opt_w, opt_h, VisionModel::Gpt4o),
275        gpt5_before: estimate_tokens(orig_w, orig_h, VisionModel::Gpt5),
276        gpt5_after: estimate_tokens(opt_w, opt_h, VisionModel::Gpt5),
277        gemini_before: estimate_tokens(orig_w, orig_h, VisionModel::Gemini15),
278        gemini_after: estimate_tokens(opt_w, opt_h, VisionModel::Gemini15),
279    }
280}
281
282impl TokenSavingsTable {
283    pub fn print(&self) {
284        println!(
285            "{:<12} {:>8} {:>8} {:>10}",
286            "Model", "Before", "After", "Saved"
287        );
288        println!("{}", "-".repeat(42));
289        self.print_row("Claude", &self.claude_before, &self.claude_after);
290        self.print_row("GPT-4o", &self.gpt4o_before, &self.gpt4o_after);
291        self.print_row("GPT-5", &self.gpt5_before, &self.gpt5_after);
292        self.print_row("Gemini", &self.gemini_before, &self.gemini_after);
293    }
294
295    fn print_row(&self, name: &str, before: &TokenEstimate, after: &TokenEstimate) {
296        let saved = before.tokens.saturating_sub(after.tokens);
297        let pct = if before.tokens > 0 {
298            saved as f64 / before.tokens as f64 * 100.0
299        } else {
300            0.0
301        };
302        println!(
303            "{:<12} {:>8} {:>8} {:>8} ({:.1}%)",
304            name, before.tokens, after.tokens, saved, pct
305        );
306    }
307}
308
309// ── Types ─────────────────────────────────────────────────────────────────────
310
/// Result of snapping a width/height pair to tile boundaries.
pub struct DimensionResult {
    /// Snapped width in pixels.
    pub width: u32,
    /// Snapped height in pixels.
    pub height: u32,
    /// Tile count of the original dimensions.
    pub tiles_before: u32,
    /// Tile count after snapping.
    pub tiles_after: u32,
}
317
impl DimensionResult {
    /// Tiles saved by snapping. Despite the name this is a *tile* delta, not
    /// a model-specific token count.
    pub fn tokens_saved(&self) -> u32 {
        self.tiles_before.saturating_sub(self.tiles_after)
    }
}
323
/// How the pipeline post-processes the resized image.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum ProcessMode {
    /// General LLM vision — JPEG output at configured quality.
    Standard,
    /// Text extraction — high-contrast grayscale binarization (Otsu threshold).
    Ocr,
    /// Auto-detects if the image is mostly text (monochrome/grayscale).
    /// Resolved to `Standard` or `Ocr` inside [`process`].
    #[default]
    Auto,
}
334
335pub fn detect_ocr_mode(img: &DynamicImage) -> bool {
336    let rgb = img.to_rgb8();
337    let mut colorful_count = 0;
338    let mut total_count = 0;
339    // Sample every 4th pixel for speed
340    for (x, y, p) in rgb.enumerate_pixels() {
341        if x % 4 == 0 && y % 4 == 0 {
342            total_count += 1;
343            let min = p[0].min(p[1]).min(p[2]);
344            let max = p[0].max(p[1]).max(p[2]);
345            if max.saturating_sub(min) > 25 {
346                colorful_count += 1;
347            }
348        }
349    }
350    let colorful_ratio = colorful_count as f64 / total_count.max(1) as f64;
351    colorful_ratio < 0.1 // if less than 10% of pixels are colorful, assume OCR
352}
353
/// Tile and byte savings produced by one pipeline run.
pub struct SavingsReport {
    /// Tile cost of the original dimensions.
    pub tiles_before: u32,
    /// Tile cost after optimization.
    pub tiles_after: u32,
    /// `tiles_before - tiles_after` (saturating).
    pub tiles_saved: u32,
    /// Input size in bytes, when known.
    pub bytes_before: Option<u64>,
    /// Encoded output size in bytes; filled in after encoding.
    pub bytes_after: Option<u64>,
}
361
362impl SavingsReport {
363    pub fn size_reduction_pct(&self) -> Option<f64> {
364        match (self.bytes_before, self.bytes_after) {
365            (Some(b), Some(a)) if b > 0 => Some((1.0 - a as f64 / b as f64) * 100.0),
366            _ => None,
367        }
368    }
369
370    pub fn token_reduction_pct(&self) -> f64 {
371        if self.tiles_before == 0 {
372            return 0.0;
373        }
374        self.tiles_saved as f64 / self.tiles_before as f64 * 100.0
375    }
376}
377
/// Output of [`process`]: the optimized image plus its savings report.
pub struct ProcessResult {
    /// The processed (cropped/resized/optionally binarized) image.
    pub image: DynamicImage,
    /// Final width in pixels.
    pub width: u32,
    /// Final height in pixels.
    pub height: u32,
    /// Tile/byte savings achieved.
    pub report: SavingsReport,
}
384
impl ProcessResult {
    /// Convenience accessor for `report.tiles_saved` (a tile delta, see
    /// [`SavingsReport`]).
    pub fn tokens_saved(&self) -> u32 {
        self.report.tiles_saved
    }
}
390
391// ── Pipeline ──────────────────────────────────────────────────────────────────
392
/// Full pipeline: [crop] → tile-snap resize → [OCR binarize].
/// Pass `input_bytes = 0` if unknown (omits file-size from report).
pub fn process(
    img: DynamicImage,
    mode: ProcessMode,
    input_bytes: u64,
    cfg: &ProcessConfig,
) -> ProcessResult {
    let (orig_w, orig_h) = (img.width(), img.height());
    // Baseline tile cost: model-aware when a target model is set, otherwise a
    // plain grid at the configured tile size.
    let tiles_before = match cfg.target_model {
        Some(model) => estimate_tokens(orig_w, orig_h, model).tiles,
        None => tile_count(orig_w, cfg.tile_size) * tile_count(orig_h, cfg.tile_size),
    };

    // Step 1: optionally strip solid-color padding borders.
    let after_crop = if cfg.crop {
        crop_padding(img, cfg.bg_tolerance)
    } else {
        img
    };
    // Step 2: choose target dimensions (model-aware snap or plain tile snap).
    let (mut opt_w, mut opt_h) = match cfg.target_model {
        Some(model) => optimal_send_dimensions(after_crop.width(), after_crop.height(), model),
        None => {
            let d = calculate_optimal_dimensions_with(
                after_crop.width(),
                after_crop.height(),
                cfg.tile_size,
            );
            (d.width, d.height)
        }
    };

    // Step 3: shrink further if a tile budget is configured.
    if let Some(max_t) = cfg.max_tiles {
        let (nw, nh) = enforce_max_tiles(opt_w, opt_h, max_t, cfg.tile_size, cfg.target_model);
        opt_w = nw;
        opt_h = nh;
    }

    // Recompute the tile cost for the final dimensions.
    let tiles_after = match cfg.target_model {
        Some(model) => {
            let est = estimate_tokens(opt_w, opt_h, model);
            est.tiles
        }
        None => tile_count(opt_w, cfg.tile_size) * tile_count(opt_h, cfg.tile_size),
    };
    let resized = after_crop.resize_exact(opt_w, opt_h, FilterType::Lanczos3);

    // Resolve Auto using the pre-resize image — presumably so classification
    // sees the full-resolution pixels; confirm intent.
    let actual_mode = match mode {
        ProcessMode::Auto => {
            if detect_ocr_mode(&after_crop) {
                ProcessMode::Ocr
            } else {
                ProcessMode::Standard
            }
        }
        m => m,
    };

    let final_image = match actual_mode {
        // `Auto` is unreachable here (resolved above); kept for exhaustiveness.
        ProcessMode::Standard | ProcessMode::Auto => resized,
        ProcessMode::Ocr => binarize(resized),
    };

    ProcessResult {
        width: final_image.width(),
        height: final_image.height(),
        image: final_image,
        report: SavingsReport {
            tiles_before,
            tiles_after,
            tiles_saved: tiles_before.saturating_sub(tiles_after),
            // Zero means "unknown input size" — omitted from the report.
            bytes_before: if input_bytes > 0 {
                Some(input_bytes)
            } else {
                None
            },
            // Filled in by the caller after encoding (see `optimize_image`).
            bytes_after: None,
        },
    }
}
472
/// Iteratively shrink `width`×`height` (5% per step, relative to the original)
/// until the snapped dimensions fit within `max_tiles`, or the scale drops
/// below 10% — a hard floor that guarantees termination even when snapping
/// minimums make the budget unreachable.
///
/// `max_tiles == 0` is treated as "no limit".
fn enforce_max_tiles(
    mut width: u32,
    mut height: u32,
    max_tiles: u32,
    default_tile_size: u32,
    model: Option<VisionModel>,
) -> (u32, u32) {
    if max_tiles == 0 {
        return (width, height);
    }

    let mut scale = 1.0;
    let orig_w = width;
    let orig_h = height;

    loop {
        // Snap the candidate size the same way the pipeline will.
        let (snapped_w, snapped_h) = match model {
            Some(m) => optimal_send_dimensions(width, height, m),
            None => {
                let d = calculate_optimal_dimensions_with(width, height, default_tile_size);
                (d.width, d.height)
            }
        };

        // Tile cost of the snapped candidate.
        let tiles = match model {
            Some(m) => estimate_tokens(snapped_w, snapped_h, m).tiles,
            None => {
                tile_count(snapped_w, default_tile_size) * tile_count(snapped_h, default_tile_size)
            }
        };

        if tiles <= max_tiles || scale < 0.1 {
            return (snapped_w, snapped_h);
        }

        // Shrink from the original each time to avoid compounding rounding error.
        scale *= 0.95;
        width = (orig_w as f64 * scale) as u32;
        height = (orig_h as f64 * scale) as u32;
        width = width.max(1);
        height = height.max(1);
    }
}
515
516// ── Step 1: Tile-Aware Dimension Calculation ───────────────────────────────────
517
/// Snap W×H to tile boundaries using the default tile size (512).
///
/// Thin wrapper over [`calculate_optimal_dimensions_with`].
pub fn calculate_optimal_dimensions(width: u32, height: u32) -> DimensionResult {
    calculate_optimal_dimensions_with(width, height, 512)
}
522
523/// Snap W×H to tile boundaries using a custom tile size.
524pub fn calculate_optimal_dimensions_with(
525    width: u32,
526    height: u32,
527    tile_size: u32,
528) -> DimensionResult {
529    let opt_w = snap_to_tile_boundary(width, tile_size);
530    let opt_h = snap_to_tile_boundary(height, tile_size);
531
532    DimensionResult {
533        width: opt_w,
534        height: opt_h,
535        tiles_before: tile_count(width, tile_size) * tile_count(height, tile_size),
536        tiles_after: tile_count(opt_w, tile_size) * tile_count(opt_h, tile_size),
537    }
538}
539
/// Number of `tile_size`-wide tiles needed to cover `dim` pixels (ceiling division).
fn tile_count(dim: u32, tile_size: u32) -> u32 {
    let full = dim / tile_size;
    if dim % tile_size == 0 { full } else { full + 1 }
}
543
/// Round `dim` down to the nearest multiple of `tile_size`, but never below
/// one full tile; exact multiples (including zero) pass through unchanged.
fn snap_to_tile_boundary(dim: u32, tile_size: u32) -> u32 {
    match dim % tile_size {
        0 => dim,
        rem => (dim - rem).max(tile_size),
    }
}
550
551// ── Step 2: Semantic Crop (padding removal) ────────────────────────────────────
552
553/// Remove solid-color borders using corner sampling + configurable tolerance.
554pub fn crop_padding(img: DynamicImage, bg_tolerance: u8) -> DynamicImage {
555    let rgba = img.to_rgba8();
556    let (w, h) = rgba.dimensions();
557
558    let corners = [
559        *rgba.get_pixel(0, 0),
560        *rgba.get_pixel(w - 1, 0),
561        *rgba.get_pixel(0, h - 1),
562        *rgba.get_pixel(w - 1, h - 1),
563    ];
564    let bg = corners[0]; // first corner as background reference
565
566    let top = first_non_bg_row(&rgba, bg, bg_tolerance, true);
567    let bottom = first_non_bg_row(&rgba, bg, bg_tolerance, false);
568    let left = first_non_bg_col(&rgba, bg, bg_tolerance, true);
569    let right = first_non_bg_col(&rgba, bg, bg_tolerance, false);
570
571    if top >= bottom || left >= right {
572        return DynamicImage::ImageRgba8(rgba);
573    }
574
575    DynamicImage::ImageRgba8(
576        image::imageops::crop_imm(&rgba, left, top, right - left, bottom - top).to_image(),
577    )
578}
579
580fn is_bg(pixel: image::Rgba<u8>, bg: image::Rgba<u8>, tolerance: u8) -> bool {
581    pixel.0[3] < 10
582        || pixel.0[..3]
583            .iter()
584            .zip(bg.0[..3].iter())
585            .all(|(&a, &b)| a.abs_diff(b) <= tolerance)
586}
587
588fn first_non_bg_row(img: &image::RgbaImage, bg: image::Rgba<u8>, tol: u8, from_top: bool) -> u32 {
589    let (w, h) = img.dimensions();
590    let rows: Box<dyn Iterator<Item = u32>> = if from_top {
591        Box::new(0..h)
592    } else {
593        Box::new((0..h).rev())
594    };
595    for y in rows {
596        if (0..w).any(|x| !is_bg(*img.get_pixel(x, y), bg, tol)) {
597            return y;
598        }
599    }
600    0
601}
602
603fn first_non_bg_col(img: &image::RgbaImage, bg: image::Rgba<u8>, tol: u8, from_left: bool) -> u32 {
604    let (w, h) = img.dimensions();
605    let cols: Box<dyn Iterator<Item = u32>> = if from_left {
606        Box::new(0..w)
607    } else {
608        Box::new((0..w).rev())
609    };
610    for x in cols {
611        if (0..h).any(|y| !is_bg(*img.get_pixel(x, y), bg, tol)) {
612            return x;
613        }
614    }
615    0
616}
617
618// ── Step 3: OCR Binarization ───────────────────────────────────────────────────
619
620pub fn binarize(img: DynamicImage) -> DynamicImage {
621    let gray = img.to_luma8();
622    let (w, h) = gray.dimensions();
623    let threshold = otsu_threshold(&gray);
624    let binary: ImageBuffer<Luma<u8>, Vec<u8>> = ImageBuffer::from_fn(w, h, |x, y| {
625        let p = gray.get_pixel(x, y).0[0];
626        Luma([if p < threshold { 0u8 } else { 255u8 }])
627    });
628    DynamicImage::ImageLuma8(binary)
629}
630
631fn otsu_threshold(img: &image::GrayImage) -> u8 {
632    let mut histogram = [0u32; 256];
633    for p in img.pixels() {
634        histogram[p.0[0] as usize] += 1;
635    }
636    let total = img.width() * img.height();
637    let (mut sum, mut sum_bg, mut weight_bg) = (0f64, 0f64, 0f64);
638    for (i, &h) in histogram.iter().enumerate() {
639        sum += i as f64 * h as f64;
640    }
641    let (mut best_thresh, mut best_var) = (0u8, 0f64);
642    for (t, &h) in histogram.iter().enumerate() {
643        weight_bg += h as f64;
644        if weight_bg == 0.0 {
645            continue;
646        }
647        let weight_fg = total as f64 - weight_bg;
648        if weight_fg == 0.0 {
649            break;
650        }
651        sum_bg += t as f64 * h as f64;
652        let mean_bg = sum_bg / weight_bg;
653        let mean_fg = (sum - sum_bg) / weight_fg;
654        let var = weight_bg * weight_fg * (mean_bg - mean_fg).powi(2);
655        if var > best_var {
656            best_var = var;
657            best_thresh = t as u8;
658        }
659    }
660    best_thresh
661}
662
663// ── Base64 I/O ────────────────────────────────────────────────────────────────
664
665pub fn decode_base64_image(input: &str) -> Result<DynamicImage, String> {
666    let data = if let Some(c) = input.find(',') {
667        &input[c + 1..]
668    } else {
669        input
670    };
671    let bytes = B64.decode(data.trim()).map_err(|e| e.to_string())?;
672    image::load_from_memory(&bytes).map_err(|e| e.to_string())
673}
674
/// Encode an image to base64 (no `data:` URI prefix) in the configured format.
pub fn encode_image_base64(img: &DynamicImage, cfg: &ProcessConfig) -> Result<String, String> {
    let bytes = encode_to_bytes(img, cfg)?;
    Ok(B64.encode(bytes))
}
679
/// Encode image to raw bytes using the configured output format.
///
/// Both paths flatten to RGB first, so any alpha channel is dropped.
pub fn encode_to_bytes(img: &DynamicImage, cfg: &ProcessConfig) -> Result<Vec<u8>, String> {
    match cfg.output_format {
        OutputFormat::Jpeg => {
            use image::codecs::jpeg::JpegEncoder;
            let mut buf = Cursor::new(Vec::new());
            let rgb = img.to_rgb8();
            JpegEncoder::new_with_quality(&mut buf, cfg.quality)
                .encode_image(&DynamicImage::ImageRgb8(rgb))
                .map_err(|e| e.to_string())?;
            Ok(buf.into_inner())
        }
        OutputFormat::WebP => {
            let rgb = img.to_rgb8();
            // Lossy WebP; quality is passed on the encoder's 0–100 float scale.
            let enc = webp::Encoder::from_rgb(rgb.as_raw(), rgb.width(), rgb.height());
            let mem = enc.encode(cfg.quality as f32);
            Ok(mem.to_vec())
        }
    }
}
700
701// ── MCP Tool: optimize_image ──────────────────────────────────────────────────
702
/// Result returned by [`optimize_image`].
pub struct OptimizeResult {
    /// Base64-encoded optimized image (no `data:` prefix).
    pub optimized_base64: String,
    /// Tile/byte savings, including the encoded output size.
    pub report: SavingsReport,
    /// Width of the decoded input in pixels.
    pub original_width: u32,
    /// Height of the decoded input in pixels.
    pub original_height: u32,
    /// Width of the optimized output in pixels.
    pub width: u32,
    /// Height of the optimized output in pixels.
    pub height: u32,
    /// Size of the encoded output in bytes.
    pub optimized_bytes: usize,
}
712
713/// MCP entry point: base64 in → base64 JPEG out + savings report.
714pub fn optimize_image(
715    input_base64: &str,
716    mode: ProcessMode,
717    cfg: &ProcessConfig,
718) -> Result<OptimizeResult, String> {
719    let img = decode_base64_image(input_base64)?;
720    let (orig_w, orig_h) = (img.width(), img.height());
721    let input_bytes = {
722        let data = if let Some(c) = input_base64.find(',') {
723            &input_base64[c + 1..]
724        } else {
725            input_base64
726        };
727        B64.decode(data.trim()).map_err(|e| e.to_string())?.len() as u64
728    };
729
730    let mut result = process(img, mode, input_bytes, cfg);
731    let bytes = encode_to_bytes(&result.image, cfg)?;
732    let encoded = B64.encode(&bytes);
733    result.report.bytes_after = Some(bytes.len() as u64);
734
735    Ok(OptimizeResult {
736        optimized_base64: encoded,
737        report: result.report,
738        original_width: orig_w,
739        original_height: orig_h,
740        width: result.width,
741        height: result.height,
742        optimized_bytes: bytes.len(),
743    })
744}
745
746// ── Tests ─────────────────────────────────────────────────────────────────────
747
748// ── Step 4: Sandbox (Think in Code) ──────────────────────────────────────────
749
/// Atomic image operations for the Sandbox mode.
///
/// Serialized via serde as `{ "op": "<lowercase variant>", ...fields }`.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase", tag = "op")]
pub enum ImageOp {
    /// Crop a specific region: { x, y, width, height }
    Crop {
        x: u32,
        y: u32,
        width: u32,
        height: u32,
    },
    /// Convert to grayscale.
    Grayscale,
    /// Binarize using Otsu's threshold (if threshold is None).
    Binarize { threshold: Option<u8> },
    /// Resize to exact dimensions.
    Resize { width: u32, height: u32 },
    /// Adjust contrast (e.g., 2.0 for double contrast).
    Contrast { amount: f32 },
    /// Adjust brightness (e.g., -20 to darken).
    Brightness { amount: f32 },
}
772
773/// Execute a sequence of operations on an image.
774pub fn process_with_operations(mut img: DynamicImage, ops: Vec<ImageOp>) -> DynamicImage {
775    for op in ops {
776        img = match op {
777            ImageOp::Crop {
778                x,
779                y,
780                width,
781                height,
782            } => img.crop_imm(x, y, width, height),
783            ImageOp::Grayscale => DynamicImage::ImageLuma8(img.to_luma8()),
784            ImageOp::Binarize { threshold } => {
785                let gray = img.to_luma8();
786                let thr = threshold.unwrap_or(128);
787                let mut binarized = ImageBuffer::new(gray.width(), gray.height());
788                for (x, y, p) in gray.enumerate_pixels() {
789                    let val = if p[0] > thr { 255 } else { 0 };
790                    binarized.put_pixel(x, y, Luma([val]));
791                }
792                DynamicImage::ImageLuma8(binarized)
793            }
794            ImageOp::Resize { width, height } => {
795                img.resize_exact(width, height, FilterType::Lanczos3)
796            }
797            ImageOp::Contrast { amount } => img.adjust_contrast(amount),
798            ImageOp::Brightness { amount } => img.brighten(amount as i32),
799        };
800    }
801    img
802}
803
#[cfg(test)]
mod tests {
    use super::*;

    // Default pipeline configuration shared by the pipeline tests below.
    fn cfg() -> ProcessConfig {
        ProcessConfig::default()
    }

    // Dimensions already on the 512 grid pass through unchanged.
    #[test]
    fn exact_boundary_unchanged() {
        let r = calculate_optimal_dimensions(1024, 512);
        assert_eq!((r.width, r.height), (1024, 512));
        assert_eq!(r.tokens_saved(), 0);
    }

    // 1025 snaps down to 1024: a 3×3 grid becomes 2×2.
    #[test]
    fn one_pixel_over_saves_full_tile_row() {
        let r = calculate_optimal_dimensions(1025, 1025);
        assert_eq!((r.width, r.height), (1024, 1024));
        assert_eq!(r.tiles_before, 9);
        assert_eq!(r.tiles_after, 4);
        assert_eq!(r.tokens_saved(), 5);
    }

    // Snapping never yields dimensions smaller than one full tile.
    #[test]
    fn small_image_never_below_one_tile() {
        let r = calculate_optimal_dimensions(100, 200);
        assert_eq!((r.width, r.height), (512, 512));
    }

    #[test]
    fn mid_boundary_snaps_down() {
        let r = calculate_optimal_dimensions(768, 512);
        assert_eq!(r.width, 512);
        assert_eq!(r.tiles_after, 1);
    }

    #[test]
    fn custom_tile_size_256() {
        let r = calculate_optimal_dimensions_with(257, 512, 256);
        assert_eq!(r.width, 256); // 257 → snaps down to 256
        assert_eq!(r.tiles_before, 2 * 2); // ceil(257/256)*ceil(512/256) = 2*2
        assert_eq!(r.tiles_after, 1 * 2); // 256/256 * 512/256 = 1*2
    }

    // White 1025×1025 canvas with a centered black square: crop + snap must
    // end up with a smaller tile grid than the original.
    #[test]
    fn full_pipeline_reduces_tiles() {
        use image::{DynamicImage, Rgba, RgbaImage};
        let mut img = RgbaImage::from_pixel(1025, 1025, Rgba([255, 255, 255, 255]));
        for x in 400..600 {
            for y in 400..600 {
                img.put_pixel(x, y, Rgba([0, 0, 0, 255]));
            }
        }
        let result = process(
            DynamicImage::ImageRgba8(img),
            ProcessMode::Standard,
            0,
            &cfg(),
        );
        assert!(result.report.tiles_after < result.report.tiles_before);
    }

    // With cropping disabled, a boundary-sized image keeps its size.
    #[test]
    fn crop_disabled_preserves_size() {
        use image::{DynamicImage, Rgba, RgbaImage};
        let img = RgbaImage::from_pixel(1024, 1024, Rgba([255, 255, 255, 255]));
        let no_crop = ProcessConfig::builder().crop(false).build();
        let result = process(
            DynamicImage::ImageRgba8(img),
            ProcessMode::Standard,
            0,
            &no_crop,
        );
        assert_eq!(result.width, 1024);
    }

    #[test]
    fn crop_removes_white_border() {
        use image::{Rgba, RgbaImage};
        let mut img = RgbaImage::from_pixel(100, 100, Rgba([255, 255, 255, 255]));
        for x in 45..55 {
            for y in 45..55 {
                img.put_pixel(x, y, Rgba([255, 0, 0, 255]));
            }
        }
        let cropped = crop_padding(DynamicImage::ImageRgba8(img), 15);
        assert!(cropped.width() < 100 && cropped.height() < 100);
    }

    // Otsu binarization must yield a strictly two-level image.
    #[test]
    fn binarize_produces_only_black_white() {
        use image::{DynamicImage, GrayImage, Luma};
        let img = GrayImage::from_fn(64, 64, |x, _| Luma([if x < 32 { 50u8 } else { 200u8 }]));
        let result = binarize(DynamicImage::ImageLuma8(img)).to_luma8();
        for p in result.pixels() {
            assert!(p.0[0] == 0 || p.0[0] == 255);
        }
    }

    #[test]
    fn high_bg_tolerance_crops_more() {
        use image::{DynamicImage, Rgba, RgbaImage};
        // Corners: pure white [255,255,255]. Border: off-white [240,240,240]. Center: black.
        // diff = 15. strict(5): 15 > 5 → border NOT bg → no crop.
        // loose(20): 15 ≤ 20 → border IS bg → crops.
        let mut img = RgbaImage::from_pixel(100, 100, Rgba([240, 240, 240, 255]));
        for corner in [(0u32, 0u32), (99, 0), (0, 99), (99, 99)] {
            img.put_pixel(corner.0, corner.1, Rgba([255, 255, 255, 255]));
        }
        for x in 45..55 {
            for y in 45..55 {
                img.put_pixel(x, y, Rgba([0, 0, 0, 255]));
            }
        }
        let strict = crop_padding(DynamicImage::ImageRgba8(img.clone()), 5);
        let loose = crop_padding(DynamicImage::ImageRgba8(img), 20);
        assert!(loose.width() < strict.width());
    }
}
924// ── Persistence & Analytics ───────────────────────────────────────────────────
925
/// One logged optimization run, as stored in (and read back from) SQLite.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct OptimizationReport {
    /// When the run was logged, as an RFC 3339 timestamp in UTC.
    pub timestamp: String,
    /// Target model label, as passed to `Persistence::log_optimization`.
    pub model: String,
    /// Estimated token cost of the image before optimization.
    pub original_tokens: u32,
    /// Estimated token cost after optimization.
    pub optimized_tokens: u32,
    /// Encoded size in bytes before optimization.
    pub original_bytes: u64,
    /// Encoded size in bytes after optimization.
    pub optimized_bytes: u64,
    /// Processing mode label (free-form string supplied by the caller).
    pub mode: String,
}
936
/// Lifetime aggregates plus a recent-history window, assembled by
/// `Persistence::get_stats`.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct SqueezerStats {
    /// Total number of optimization runs logged.
    pub total_optimizations: u64,
    /// Sum of pre-optimization token estimates across all runs.
    pub total_original_tokens: u64,
    /// Sum of post-optimization token estimates across all runs.
    pub total_optimized_tokens: u64,
    /// Sum of pre-optimization byte sizes across all runs.
    pub total_original_bytes: u64,
    /// Sum of post-optimization byte sizes across all runs.
    pub total_optimized_bytes: u64,
    /// Most recent runs, newest first (the query caps this at 50 rows).
    pub history: Vec<OptimizationReport>,
}
946
947impl SqueezerStats {
948    pub fn total_token_savings(&self) -> u64 {
949        self.total_original_tokens
950            .saturating_sub(self.total_optimized_tokens)
951    }
952
953    pub fn total_byte_savings(&self) -> u64 {
954        self.total_original_bytes
955            .saturating_sub(self.total_optimized_bytes)
956    }
957
958    pub fn estimated_usd_saved(&self) -> f64 {
959        // Blended average: $2.50 per 1M tokens (Claude/GPT-4o blend)
960        (self.total_token_savings() as f64 / 1_000_000.0) * 2.50
961    }
962}
963
964pub struct Persistence;
965
966impl Persistence {
967    fn get_db_path() -> PathBuf {
968        let mut path = dirs::home_dir().unwrap_or_else(|| PathBuf::from("."));
969        path.push(".vision-squeezer");
970        let _ = std::fs::create_dir_all(&path);
971        path.push("stats.db");
972        path
973    }
974
975    pub fn init_db() -> Result<(), String> {
976        let conn = Connection::open(Self::get_db_path()).map_err(|e| e.to_string())?;
977        conn.execute(
978            "CREATE TABLE IF NOT EXISTS optimizations (
979                id INTEGER PRIMARY KEY AUTOINCREMENT,
980                timestamp TEXT NOT NULL,
981                model TEXT NOT NULL,
982                original_tokens INTEGER NOT NULL,
983                optimized_tokens INTEGER NOT NULL,
984                original_bytes INTEGER NOT NULL,
985                optimized_bytes INTEGER NOT NULL,
986                mode TEXT NOT NULL
987            )",
988            [],
989        )
990        .map_err(|e| e.to_string())?;
991        Ok(())
992    }
993
994    pub fn log_optimization(
995        model: &str,
996        orig_tokens: u32,
997        opt_tokens: u32,
998        orig_bytes: u64,
999        opt_bytes: u64,
1000        mode: &str,
1001    ) -> Result<(), String> {
1002        let conn = Connection::open(Self::get_db_path()).map_err(|e| e.to_string())?;
1003        conn.execute(
1004            "INSERT INTO optimizations (timestamp, model, original_tokens, optimized_tokens, original_bytes, optimized_bytes, mode)
1005             VALUES (?, ?, ?, ?, ?, ?, ?)",
1006            params![
1007                Utc::now().to_rfc3339(),
1008                model,
1009                orig_tokens,
1010                opt_tokens,
1011                orig_bytes as i64,
1012                opt_bytes as i64,
1013                mode,
1014            ],
1015        ).map_err(|e| e.to_string())?;
1016        Ok(())
1017    }
1018
1019    pub fn get_stats() -> Result<SqueezerStats, String> {
1020        let conn = Connection::open(Self::get_db_path()).map_err(|e| e.to_string())?;
1021
1022        let mut stmt = conn
1023            .prepare(
1024                "SELECT 
1025                COUNT(*), 
1026                SUM(original_tokens), 
1027                SUM(optimized_tokens), 
1028                SUM(original_bytes), 
1029                SUM(optimized_bytes) 
1030             FROM optimizations",
1031            )
1032            .map_err(|e| e.to_string())?;
1033
1034        let (count, orig_t, opt_t, orig_b, opt_b) = stmt
1035            .query_row([], |row| {
1036                Ok((
1037                    row.get::<_, Option<i64>>(0)?.unwrap_or(0) as u64,
1038                    row.get::<_, Option<i64>>(1)?.unwrap_or(0) as u64,
1039                    row.get::<_, Option<i64>>(2)?.unwrap_or(0) as u64,
1040                    row.get::<_, Option<i64>>(3)?.unwrap_or(0) as u64,
1041                    row.get::<_, Option<i64>>(4)?.unwrap_or(0) as u64,
1042                ))
1043            })
1044            .map_err(|e| e.to_string())?;
1045
1046        let mut stmt = conn.prepare(
1047            "SELECT timestamp, model, original_tokens, optimized_tokens, original_bytes, optimized_bytes, mode 
1048             FROM optimizations ORDER BY timestamp DESC LIMIT 50"
1049        ).map_err(|e| e.to_string())?;
1050
1051        let history = stmt
1052            .query_map([], |row| {
1053                Ok(OptimizationReport {
1054                    timestamp: row.get(0)?,
1055                    model: row.get(1)?,
1056                    original_tokens: row.get(2)?,
1057                    optimized_tokens: row.get(3)?,
1058                    original_bytes: row.get::<_, i64>(4)? as u64,
1059                    optimized_bytes: row.get::<_, i64>(5)? as u64,
1060                    mode: row.get(6)?,
1061                })
1062            })
1063            .map_err(|e| e.to_string())?
1064            .collect::<Result<Vec<_>, _>>()
1065            .map_err(|e| e.to_string())?;
1066
1067        Ok(SqueezerStats {
1068            total_optimizations: count,
1069            total_original_tokens: orig_t,
1070            total_optimized_tokens: opt_t,
1071            total_original_bytes: orig_b,
1072            total_optimized_bytes: opt_b,
1073            history,
1074        })
1075    }
1076}