harn-vm 0.9.12

Async bytecode virtual machine for the Harn programming language
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
//! Neutral computer-use tool projection and geometry helpers.
//!
//! harn-vm owns the *semantic* layer of computer use. It projects the single
//! neutral `computer` function tool onto each provider's native computer-use
//! surface, scales screenshots per provider, maps model-space coordinates back
//! to native pixels, and resolves element/mark grounding targets to points.
//!
//! The coordinate-native *execution* nucleus (screenshot capture, pointer /
//! keyboard input, accessibility tree) lives in `harn-hostlib`'s `computer`
//! module and is reached through the `hostlib_computer_*` builtins. Nothing in
//! this module touches the OS — it is pure projection and geometry so it stays
//! deterministic and unit-testable.
//!
//! ## De-overfitting
//!
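//! There is exactly one host-facing `computer` tool. When native projection is
//! enabled (opt-in via `BURIN_COMPUTER_USE_NATIVE`; it is off by default), this
//! module lowers it per provider to:
//! - `native_anthropic` → the Anthropic `computer_20251124` tool,
//! - `native_openai` → the OpenAI Responses `computer` tool,
//! - `function` / `grounded` / unset → left as the plain function-schema tool
//!   (the generic fallback), untouched.
//!
//! With projection disabled, every style keeps the plain function-schema tool.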

use serde_json::{json, Value};

use crate::llm::capabilities::{Capabilities, ComputerUseStyle, ScreenshotScaling};

/// Audit topic under which computer-use actions are recorded. Mirrors the
/// vision OCR audit topic pattern (`crate::stdlib::vision::VISION_OCR_AUDIT_TOPIC`).
///
/// Actual audit emission (one record per executed action, with the resolved
/// native coordinates and the grounding target that produced them) is wired by
/// the agent loop / host executor; this constant is the single canonical topic
/// string those emitters key on.
// Tested pure API awaiting live wiring (audit emission is a follow-up seam).
// Within this module only the tests reference it, hence the `dead_code` allow.
#[allow(dead_code)]
pub(crate) const COMPUTER_USE_AUDIT_TOPIC: &str = "audit.computer_use";

/// The host-facing neutral tool name every provider projection keys on.
///
/// `is_computer_function_tool` compares a tool's advertised name against it,
/// and the Anthropic native descriptor reuses it as its `name`.
pub(crate) const COMPUTER_TOOL_NAME: &str = "computer";

/// Default projected display width (Anthropic XGA). Used until the orchestrator
/// threads the real captured display size into the projection.
///
/// Also the width bound `scale_screenshot` fits `Xga`-scaled screenshots within.
pub(crate) const DEFAULT_DISPLAY_WIDTH: u32 = 1024;
/// Default projected display height (Anthropic XGA).
///
/// Also the height bound `scale_screenshot` fits `Xga`-scaled screenshots within.
pub(crate) const DEFAULT_DISPLAY_HEIGHT: u32 = 768;

/// The `name` a tool advertises, checking the top-level `name` first and then
/// the OpenAI `function.name` nesting.
///
/// Only string values count as a name: a top-level `name` that is present but
/// not a string (for example `null`) does not mask a valid `function.name`.
/// Returns `None` when neither location holds a string.
fn tool_name(tool: &Value) -> Option<&str> {
    let top_level = tool.get("name").and_then(Value::as_str);
    // Resolve each candidate to a string *before* falling back, so a
    // non-string top-level `name` cannot short-circuit the nested lookup.
    top_level.or_else(|| {
        tool.get("function")
            .and_then(|function| function.get("name"))
            .and_then(Value::as_str)
    })
}

/// Reports whether `tool` is the host-declared, plain function-schema
/// `computer` tool rather than a provider-native computer tool that has
/// already been projected.
///
/// A tool whose `type` string begins with `computer` is treated as already
/// native and rejected, which keeps the projection idempotent. Anything else
/// qualifies when its advertised name equals [`COMPUTER_TOOL_NAME`].
pub(crate) fn is_computer_function_tool(tool: &Value) -> bool {
    let already_native = tool
        .get("type")
        .and_then(Value::as_str)
        .is_some_and(|kind| kind.starts_with("computer"));
    !already_native && tool_name(tool) == Some(COMPUTER_TOOL_NAME)
}

/// Picks the OpenAI Responses `environment` value for the host OS.
///
/// OpenAI accepts `mac` / `windows` / `ubuntu` / `browser`. The local platform
/// is mapped onto the desktop subset: macOS becomes `mac`, Windows becomes
/// `windows`, and every other platform falls back to `ubuntu`.
pub(crate) fn environment_for_os() -> &'static str {
    let host = std::env::consts::OS;
    if host == "macos" {
        "mac"
    } else if host == "windows" {
        "windows"
    } else {
        "ubuntu"
    }
}

/// The Anthropic native computer tool descriptor (`computer_20251124`). Rides in
/// `provider_tools`; the Anthropic Messages adapter folds `provider_tools` into
/// the same `tools` array as function tools.
pub(crate) fn anthropic_computer_tool(display_width_px: u32, display_height_px: u32) -> Value {
    json!({
        "type": "computer_20251124",
        "name": COMPUTER_TOOL_NAME,
        "display_width_px": display_width_px,
        "display_height_px": display_height_px,
        "display_number": 1,
        "enable_zoom": true,
    })
}

/// The OpenAI Responses native computer tool descriptor. GPT-5.x uses the
/// `computer` type with a desktop `environment`; the display size mirrors the
/// (unscaled) capture because OpenAI wants `original`-scaled screenshots.
pub(crate) fn openai_computer_tool(
    display_width: u32,
    display_height: u32,
    environment: &str,
) -> Value {
    json!({
        "type": "computer",
        "display_width": display_width,
        "display_height": display_height,
        "environment": environment,
    })
}

/// Reports whether the neutral computer tool should be projected onto the
/// provider's native computer-use surface.
///
/// Projection is OFF by default (the universal function-tool path is used).
/// Opt in by setting `BURIN_COMPUTER_USE_NATIVE` to `1`, `on` or `true`
/// (surrounding whitespace and ASCII case are ignored) once a route's native
/// action lowering is wired. An unset or unreadable variable counts as off.
fn native_computer_projection_enabled() -> bool {
    let Ok(raw) = std::env::var("BURIN_COMPUTER_USE_NATIVE") else {
        return false;
    };
    let flag = raw.trim();
    ["1", "on", "true"]
        .iter()
        .any(|accepted| flag.eq_ignore_ascii_case(accepted))
}

/// Lowers the neutral `computer` function tool onto the route's native
/// computer-use surface, mutating both tool lists in place.
///
/// - `native_anthropic` / `native_openai`: the plain function-schema `computer`
///   entry is dropped from `native_tools` and the provider-native tool is
///   appended to `provider_tools`, so the model sees exactly one computer tool
///   (the native one).
/// - every other style (`function`, `grounded`, or unset): nothing happens —
///   the plain function-schema tool is the generic fallback the model calls
///   directly.
///
/// NATIVE PATH IS NOT YET COORDINATE-SAFE — hence it is default OFF and gated
/// behind `BURIN_COMPUTER_USE_NATIVE`. The native tool advertises a fixed
/// `DEFAULT_DISPLAY_WIDTH` x `DEFAULT_DISPLAY_HEIGHT` (Anthropic XGA) coordinate
/// space, yet three seams are still unwired and must be armed together as one
/// unit before the opt-in is switched on:
///
/// 1. the captured screenshot is NOT scaled to the advertised size (see
///    `scale_screenshot`);
/// 2. coordinates returned by the model are NOT mapped back to the real display
///    (see `map_coord_back`);
/// 3. the provider's native action vocabulary is NOT lowered to the neutral
///    `ComputerAction` the harn `computer` handler executes.
///
/// Until all three land, turning the opt-in on makes clicks land in the wrong
/// place. The default universal function-tool path depends on none of this and
/// is the verified surface.
pub(crate) fn project_computer_tools(
    caps: &Capabilities,
    native_tools: &mut Option<Vec<Value>>,
    provider_tools: &mut Vec<Value>,
) {
    // Read the environment opt-in once, then defer to the flag-driven core.
    let enable_native = native_computer_projection_enabled();
    project_computer_tools_with(caps, native_tools, provider_tools, enable_native);
}

fn project_computer_tools_with(
    caps: &Capabilities,
    native_tools: &mut Option<Vec<Value>>,
    provider_tools: &mut Vec<Value>,
    enable_native: bool,
) {
    if !enable_native {
        return;
    }
    let style = match caps.computer_use_style {
        Some(style @ (ComputerUseStyle::NativeAnthropic | ComputerUseStyle::NativeOpenai)) => style,
        // Function / Grounded / none: leave the function-schema tool as-is.
        _ => return,
    };
    let Some(tools) = native_tools.as_mut() else {
        return;
    };
    if !tools.iter().any(is_computer_function_tool) {
        return;
    }
    tools.retain(|tool| !is_computer_function_tool(tool));

    // See the INTEGRATION SEAM note above: default to XGA until real dims flow.
    let (width, height) = (DEFAULT_DISPLAY_WIDTH, DEFAULT_DISPLAY_HEIGHT);
    let native = match style {
        ComputerUseStyle::NativeAnthropic => anthropic_computer_tool(width, height),
        // NativeOpenai (the only other arm that reaches here).
        _ => openai_computer_tool(width, height, environment_for_os()),
    };
    provider_tools.push(native);
}

/// Shrinks `(width, height)` to fit inside `(max_w, max_h)` while keeping the
/// aspect ratio. Never upscales, and returns zero-sized inputs unchanged.
///
/// Each scaled side is rounded to the nearest pixel and kept at 1 or more.
#[allow(dead_code)] // used by scale_screenshot (a follow-up live-wiring seam).
fn fit_within(width: u32, height: u32, max_w: u32, max_h: u32) -> (u32, u32) {
    let degenerate = width == 0 || height == 0;
    let already_fits = width <= max_w && height <= max_h;
    if degenerate || already_fits {
        return (width, height);
    }
    let (w, h) = (f64::from(width), f64::from(height));
    // The tighter of the two axis ratios decides the uniform shrink factor.
    let ratio = f64::min(f64::from(max_w) / w, f64::from(max_h) / h);
    let shrink = |side: f64| ((side * ratio).round() as u32).max(1);
    (shrink(w), shrink(h))
}

/// Computes the model-facing size of a native screenshot for the given scaling
/// `style`.
///
/// - `xga` (Anthropic): fitted within 1024x768 with the aspect ratio preserved
///   and no upscaling.
/// - `original` / `none` / unknown / unset (OpenAI et al.): returned unchanged.
//
// This and the other geometry + grounding helpers are the tested pure API that
// the orchestrator is expected to wire into the live screenshot/coordinate
// flow as a follow-up. `#[allow(dead_code)]` keeps the crate warning-clean
// until that wiring lands.
#[allow(dead_code)]
pub(crate) fn scale_screenshot(
    width: u32,
    height: u32,
    style: Option<ScreenshotScaling>,
) -> (u32, u32) {
    if matches!(style, Some(ScreenshotScaling::Xga)) {
        return fit_within(width, height, DEFAULT_DISPLAY_WIDTH, DEFAULT_DISPLAY_HEIGHT);
    }
    (width, height)
}

/// Rescale a single axis value from a `from`-sized span onto a `to`-sized
/// span, rounding to the nearest pixel.
///
/// A zero `from` span cannot be scaled, so the value is passed through
/// unchanged. The float-to-integer cast saturates at the `i32` bounds.
// Shared by both coordinate mappers below, which await live wiring.
#[allow(dead_code)]
fn rescale_axis(value: i32, from: u32, to: u32) -> i32 {
    if from == 0 {
        return value;
    }
    (f64::from(value) * f64::from(to) / f64::from(from)).round() as i32
}

/// Map a native coordinate into the model-facing target (scaled) space. Inverse
/// of [`map_coord_back`]. A zero native dimension passes the axis through.
///
/// Each axis is scaled independently by `target / native` and rounded to the
/// nearest pixel.
#[allow(dead_code)]
pub(crate) fn map_coord_to_target(
    native_xy: (i32, i32),
    native_dims: (u32, u32),
    target_dims: (u32, u32),
) -> (i32, i32) {
    let (nx, ny) = native_xy;
    let (nw, nh) = native_dims;
    let (tw, th) = target_dims;
    (rescale_axis(nx, nw, tw), rescale_axis(ny, nh, th))
}

/// Map a model-space coordinate (expressed in the target/scaled dims the model
/// saw) back to absolute native pixels for the execution nucleus. A zero target
/// dimension passes the axis through unchanged.
///
/// Each axis is scaled independently by `native / target` and rounded to the
/// nearest pixel.
///
/// INTEGRATION SEAM — live flow: the orchestrator should [`scale_screenshot`]
/// the captured image before sending it, remember the `(target_dims,
/// native_dims)` pair, and run every model-returned click/point through this
/// function before lowering to the coordinate-native `hostlib_computer_execute`
/// action list.
#[allow(dead_code)]
pub(crate) fn map_coord_back(
    model_xy: (i32, i32),
    target_dims: (u32, u32),
    native_dims: (u32, u32),
) -> (i32, i32) {
    let (mx, my) = model_xy;
    let (tw, th) = target_dims;
    let (nw, nh) = native_dims;
    (rescale_axis(mx, tw, nw), rescale_axis(my, th, nh))
}

/// One row of the accessibility element table used for grounding. Mirrors the
/// hostlib `UiElement` shape (`reference`, `role`, `name`, bbox) so callers can
/// build these directly from a `hostlib_computer_ui_tree` result.
///
/// [`resolve_grounding`] looks rows up by `reference` (or by 1-based position
/// for numeric marks) and targets the center of the bounding box.
#[derive(Debug, Clone, PartialEq, Eq)]
// Within this module only the tests construct these — presumably awaiting the
// live grounding wiring; confirm before removing the allow.
#[allow(dead_code)]
pub(crate) struct GroundingElement {
    /// Stable reference the model addresses.
    pub reference: String,
    /// Accessibility role (e.g. `AXButton`).
    pub role: String,
    /// Accessible name / label.
    pub name: String,
    /// Bounding-box x in native pixels (the edge half the width is added to
    /// when computing the center).
    pub x: i32,
    /// Bounding-box y in native pixels (the edge half the height is added to
    /// when computing the center).
    pub y: i32,
    /// Bounding-box width in native pixels.
    pub width: i32,
    /// Bounding-box height in native pixels.
    pub height: i32,
}

/// A grounding target a model may address instead of raw coordinates.
///
/// Turned into a native `(x, y)` point by [`resolve_grounding`].
#[derive(Debug, Clone, PartialEq, Eq)]
// Within this module only the tests construct these — presumably awaiting the
// live grounding wiring; confirm before removing the allow.
#[allow(dead_code)]
pub(crate) enum GroundingTarget {
    /// Address an element by its stable `reference`.
    Element {
        /// The element's `reference`.
        reference: String,
    },
    /// Address a set-of-marks id (matched against `reference`, or a 1-based
    /// index into the element table when the id is numeric).
    ///
    /// The `reference` match is tried first; the numeric index is only a
    /// fallback when no element carries that exact reference.
    Mark {
        /// The mark id.
        id: String,
    },
    /// A raw native point (pass-through).
    Point {
        /// Absolute x in native pixels.
        x: i32,
        /// Absolute y in native pixels.
        y: i32,
    },
}

/// The native center point of an element's bounding box.
///
/// Each coordinate is the box origin plus half the extent on that axis
/// (integer division, truncating toward zero). The addition saturates at the
/// `i32` bounds so a degenerate box reported by the accessibility layer cannot
/// overflow (which would panic in debug builds and wrap in release builds).
#[allow(dead_code)]
fn bbox_center(element: &GroundingElement) -> (i32, i32) {
    (
        element.x.saturating_add(element.width / 2),
        element.y.saturating_add(element.height / 2),
    )
}

/// Turns a grounding target into a native `(x, y)` point.
///
/// - `Point`: its coordinates, unchanged.
/// - `Element`: the bbox center of the element whose `reference` matches.
/// - `Mark`: the bbox center of the element whose `reference` equals the id;
///   failing that, when the id parses as a positive integer, the element at
///   that 1-based position in the table.
///
/// Yields `None` when an element/mark target does not resolve.
#[allow(dead_code)]
pub(crate) fn resolve_grounding(
    elements: &[GroundingElement],
    target: &GroundingTarget,
) -> Option<(i32, i32)> {
    /// First table row whose `reference` equals `wanted`.
    fn lookup<'a>(rows: &'a [GroundingElement], wanted: &str) -> Option<&'a GroundingElement> {
        rows.iter().find(|row| row.reference == wanted)
    }

    let hit = match target {
        GroundingTarget::Point { x, y } => return Some((*x, *y)),
        GroundingTarget::Element { reference } => lookup(elements, reference),
        GroundingTarget::Mark { id } => lookup(elements, id).or_else(|| {
            // Numeric ids are 1-based, so zero (and anything unparsable) misses.
            let ordinal: usize = id.parse().ok()?;
            elements.get(ordinal.checked_sub(1)?)
        }),
    };
    hit.map(bbox_center)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Capabilities with only `computer_use_style` set; everything else default.
    fn caps_with_style(style: ComputerUseStyle) -> Capabilities {
        Capabilities {
            computer_use_style: Some(style),
            ..Capabilities::default()
        }
    }

    /// A minimal OpenAI-shaped function tool (`function.name` nesting).
    fn function_tool(name: &str) -> Value {
        json!({"type": "function", "function": {"name": name}})
    }

    /// The two-row element table shared by the grounding tests.
    fn sample_elements() -> Vec<GroundingElement> {
        vec![
            GroundingElement {
                reference: "el-a".to_string(),
                role: "AXButton".to_string(),
                name: "OK".to_string(),
                x: 100,
                y: 200,
                width: 40,
                height: 20,
            },
            GroundingElement {
                reference: "el-b".to_string(),
                role: "AXTextField".to_string(),
                name: "Search".to_string(),
                x: 0,
                y: 0,
                width: 10,
                height: 10,
            },
        ]
    }

    #[test]
    fn anthropic_native_tool_golden_shape() {
        assert_eq!(
            anthropic_computer_tool(1024, 768),
            json!({
                "type": "computer_20251124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
                "display_number": 1,
                "enable_zoom": true,
            })
        );
    }

    #[test]
    fn openai_native_tool_golden_shape() {
        assert_eq!(
            openai_computer_tool(1440, 900, "mac"),
            json!({
                "type": "computer",
                "display_width": 1440,
                "display_height": 900,
                "environment": "mac",
            })
        );
    }

    #[test]
    fn projects_native_anthropic_and_suppresses_function_copy() {
        let caps = caps_with_style(ComputerUseStyle::NativeAnthropic);
        let mut native = Some(vec![function_tool("read_file"), function_tool("computer")]);
        let mut provider = Vec::new();
        project_computer_tools_with(&caps, &mut native, &mut provider, true);

        // The plain `computer` function copy is gone; other tools remain.
        let remaining = native.unwrap();
        assert_eq!(remaining.len(), 1);
        assert_eq!(tool_name(&remaining[0]), Some("read_file"));
        // The native tool is injected into provider_tools.
        assert_eq!(provider.len(), 1);
        assert_eq!(provider[0]["type"], "computer_20251124");
        assert_eq!(provider[0]["display_width_px"], 1024);
    }

    #[test]
    fn projects_native_openai_and_suppresses_function_copy() {
        let caps = caps_with_style(ComputerUseStyle::NativeOpenai);
        let mut native = Some(vec![function_tool("computer")]);
        let mut provider = Vec::new();
        project_computer_tools_with(&caps, &mut native, &mut provider, true);

        assert!(native.unwrap().is_empty());
        assert_eq!(provider.len(), 1);
        assert_eq!(provider[0]["type"], "computer");
        assert!(provider[0].get("environment").is_some());
    }

    #[test]
    fn function_style_leaves_tool_untouched() {
        for style in [ComputerUseStyle::Function, ComputerUseStyle::Grounded] {
            let caps = caps_with_style(style);
            let mut native = Some(vec![function_tool("computer")]);
            let mut provider = Vec::new();
            project_computer_tools_with(&caps, &mut native, &mut provider, true);
            assert_eq!(native.as_ref().unwrap().len(), 1, "{style:?}");
            assert!(provider.is_empty(), "{style:?}");
        }
    }

    #[test]
    fn disabled_projection_is_a_noop_even_for_native_styles() {
        // With the opt-in off, a native style must not touch either list.
        let caps = caps_with_style(ComputerUseStyle::NativeAnthropic);
        let mut native = Some(vec![function_tool("computer")]);
        let mut provider = Vec::new();
        project_computer_tools_with(&caps, &mut native, &mut provider, false);
        assert_eq!(native.as_ref().unwrap().len(), 1);
        assert!(provider.is_empty());
    }

    #[test]
    fn projection_is_idempotent() {
        let caps = caps_with_style(ComputerUseStyle::NativeAnthropic);
        let mut native = Some(vec![function_tool("computer")]);
        let mut provider = Vec::new();
        project_computer_tools_with(&caps, &mut native, &mut provider, true);
        // Second pass: the native tool already lives in provider_tools and the
        // function copy is gone, so nothing changes.
        project_computer_tools_with(&caps, &mut native, &mut provider, true);
        assert!(native.unwrap().is_empty());
        assert_eq!(provider.len(), 1);
    }

    #[test]
    fn xga_scaling_fits_and_original_is_identity() {
        // 1920x1080 fits within 1024x768 → 1024x576 (letterboxed by width).
        assert_eq!(
            scale_screenshot(1920, 1080, Some(ScreenshotScaling::Xga)),
            (1024, 576)
        );
        // Already small: no upscaling.
        assert_eq!(
            scale_screenshot(800, 600, Some(ScreenshotScaling::Xga)),
            (800, 600)
        );
        // original / none / unset: identity.
        assert_eq!(
            scale_screenshot(1920, 1080, Some(ScreenshotScaling::Original)),
            (1920, 1080)
        );
        assert_eq!(scale_screenshot(1920, 1080, None), (1920, 1080));
    }

    #[test]
    fn zero_dimensions_pass_through() {
        // A zero-sized capture is returned unchanged rather than scaled.
        assert_eq!(
            scale_screenshot(0, 1080, Some(ScreenshotScaling::Xga)),
            (0, 1080)
        );
        // A zero source dimension leaves that axis untouched, per axis.
        assert_eq!(map_coord_back((5, 7), (0, 0), (100, 100)), (5, 7));
        assert_eq!(map_coord_back((5, 7), (0, 10), (100, 100)), (5, 70));
        assert_eq!(map_coord_to_target((5, 7), (10, 0), (100, 100)), (50, 7));
    }

    #[test]
    fn coordinate_roundtrip_within_one_pixel() {
        let native_dims = (1920, 1080);
        let target_dims =
            scale_screenshot(native_dims.0, native_dims.1, Some(ScreenshotScaling::Xga));
        for native in [(0, 0), (960, 540), (1919, 1079), (100, 999)] {
            let model = map_coord_to_target(native, native_dims, target_dims);
            let back = map_coord_back(model, target_dims, native_dims);
            assert!(
                (back.0 - native.0).abs() <= 1 && (back.1 - native.1).abs() <= 1,
                "native {native:?} -> model {model:?} -> back {back:?}"
            );
        }
    }

    #[test]
    fn original_scaling_coordinate_identity() {
        let dims = (1440, 900);
        let target = scale_screenshot(dims.0, dims.1, Some(ScreenshotScaling::Original));
        assert_eq!(target, dims);
        assert_eq!(map_coord_back((123, 456), target, dims), (123, 456));
    }

    #[test]
    fn grounding_resolves_element_mark_and_point() {
        let elements = sample_elements();
        // Element by reference → bbox center.
        assert_eq!(
            resolve_grounding(
                &elements,
                &GroundingTarget::Element {
                    reference: "el-a".to_string()
                }
            ),
            Some((120, 210))
        );
        // Mark by 1-based index → element el-b center.
        assert_eq!(
            resolve_grounding(
                &elements,
                &GroundingTarget::Mark {
                    id: "2".to_string()
                }
            ),
            Some((5, 5))
        );
        // Point pass-through.
        assert_eq!(
            resolve_grounding(&elements, &GroundingTarget::Point { x: 7, y: 9 }),
            Some((7, 9))
        );
        // Unknown element → None.
        assert_eq!(
            resolve_grounding(
                &elements,
                &GroundingTarget::Element {
                    reference: "nope".to_string()
                }
            ),
            None
        );
    }

    #[test]
    fn mark_matches_reference_and_rejects_bad_indices() {
        let elements = sample_elements();
        // A mark id equal to a reference resolves like an element target.
        assert_eq!(
            resolve_grounding(
                &elements,
                &GroundingTarget::Mark {
                    id: "el-a".to_string()
                }
            ),
            Some((120, 210))
        );
        // Indices are 1-based: zero, past-the-end, negative and non-numeric
        // ids all fail to resolve.
        for id in ["0", "3", "-1", "first"] {
            assert_eq!(
                resolve_grounding(&elements, &GroundingTarget::Mark { id: id.to_string() }),
                None,
                "mark id {id:?}"
            );
        }
    }

    #[test]
    fn audit_topic_is_stable() {
        assert_eq!(COMPUTER_USE_AUDIT_TOPIC, "audit.computer_use");
    }
}