// rsclaw-plugin 0.1.0
//
// Plugin crate for RsClaw — internal workspace crate, not for direct use.
package rsclaw:plugin;

/// Browser-automation functions the host provides to the plugin.
///
/// Every function returns `result<string, string>`: a string payload on
/// success, a string error on failure. Several functions take a `ref-str`
/// that identifies a page element (the upload docs below call it a data-ref).
interface host-browser {
    // NOTE(review): presumably navigates the automated browser to `url`; the
    // success payload is not specified in this file — confirm with the host.
    browser-open: func(url: string) -> result<string, string>;
    // NOTE(review): presumably returns a textual snapshot of the page that
    // carries the element refs accepted as `ref-str` below (the
    // `wait-for-selector` doc points here for "a clickable ref") — confirm.
    browser-snapshot: func() -> result<string, string>;
    // NOTE(review): presumably a synthetic (JS-level) click on the element
    // named by `ref-str`; `browser-click-at` is the native alternative.
    browser-click: func(ref-str: string) -> result<string, string>;
    /// Native (CDP-level) mouse click at viewport coordinates. Use this
    /// when synthetic JS click events are ignored by the page (typical for
    /// React PointerEvent handlers on popups / paywalls).
    browser-click-at: func(x: u32, y: u32) -> result<string, string>;
    // NOTE(review): presumably sets the value of the input named by `ref-str`
    // to `text`; whether existing content is cleared first is not stated here.
    browser-fill: func(ref-str: string, text: string) -> result<string, string>;
    // NOTE(review): presumably sends a key press to the page; the accepted
    // `key` names are not listed in this file — confirm with the host.
    browser-press: func(key: string) -> result<string, string>;
    /// Evaluate `code` in the browser. Passing data requires interpolating it
    /// into `code` by hand; prefer `eval-with-args` for structured arguments.
    /// The `"__switch_latest_tab"` magic string is superseded by
    /// `switch-latest-tab`.
    // NOTE(review): the format of the returned string is not specified here.
    browser-eval: func(code: string) -> result<string, string>;
    // NOTE(review): presumably waits until `text` appears on the page, giving
    // up after `timeout-ms` milliseconds — confirm matching rules with the host.
    browser-wait-text: func(text: string, timeout-ms: u32) -> result<string, string>;
    /// Capture a screenshot. Per the `notify-with-image` doc in `host-runtime`,
    /// the returned string is a `data:image/<format>;base64,<...>` data URI.
    browser-screenshot: func() -> result<string, string>;
    // NOTE(review): presumably triggers a download via the element `ref-str`
    // and saves it as `filename`; where the file lands and what is returned
    // are not specified here — confirm with the host.
    browser-download: func(ref-str: string, filename: string) -> result<string, string>;
    /// Upload a single file to the file input identified by `ref-str`. The
    /// element is located with a main-document selector, so inputs inside
    /// other frames/origins need `browser-upload-via-chooser` instead.
    browser-upload: func(ref-str: string, filepath: string) -> result<string, string>;
    /// Upload MULTIPLE files at once to a single (multiple-capable) file input
    /// identified by data-ref. Equivalent to selecting several files in the OS
    /// dialog: sets input.files to all paths and fires change once. Use for
    /// galleries / multi-image uploaders backed by one `<input multiple>`.
    browser-upload-multi: func(ref-str: string, filepaths: list<string>) -> result<string, string>;
    /// Upload one or more files to an input opened via a native file-chooser
    /// dialog (e.g. a button that triggers a transient or cross-origin-iframe
    /// `<input type=file>`). Intercepts the chooser at the CDP level, clicks the
    /// trigger at (click-x, click-y), then sets all files at once. Works across
    /// iframes/origins where `browser-upload`'s main-document selector cannot.
    browser-upload-via-chooser: func(filepaths: list<string>, click-x: u32, click-y: u32) -> result<string, string>;
    // NOTE(review): presumably returns the URL of the active tab — confirm.
    browser-get-url: func() -> result<string, string>;

    // The functions below do not carry the `browser-` prefix used above. The
    // names are part of the interface, so they must not be renamed without
    // updating every guest.

    /// Wait for an element matching `css-selector` to be present in the DOM.
    /// Polls every 250ms via JS until match or timeout. Returns "ok" on
    /// match, error on timeout. Use `browser-snapshot` afterwards if you
    /// need a clickable ref.
    wait-for-selector: func(css-selector: string, timeout-ms: u32) -> result<string, string>;

    /// Wait until network requests have been quiet for ~500ms, or until
    /// `timeout-ms` total. Returns "ok" or error on timeout.
    wait-for-network-idle: func(timeout-ms: u32) -> result<string, string>;

    /// Run a JavaScript function expression with structured arguments.
    /// `code` MUST evaluate to a function (e.g. `"async (args) => { ... }"`
    /// or `"function(args) { ... }"`). `args-json` is parsed and passed as
    /// the function's first argument. The function's return value is
    /// JSON-stringified and returned. Avoids the brittle string-interpolation
    /// and manual escaping that `browser-eval` requires.
    eval-with-args: func(code: string, args-json: string) -> result<string, string>;

    /// Switch the active browser tab to the most recently opened one.
    /// Useful when an action opens a popup window. Replaces the
    /// `browser-eval("__switch_latest_tab")` magic-string convention.
    switch-latest-tab: func() -> result<string, string>;
}

/// General runtime services the host provides to the plugin: logging,
/// sleeping, sandboxed file access, a per-plugin SQLite database, user
/// notifications and knowledge-base ingestion.
interface host-runtime {
    /// Emit a log message. Has no return value, so failures cannot be
    /// reported to the guest.
    // NOTE(review): the accepted `level` strings are not listed in this file —
    // confirm with the host.
    log: func(level: string, msg: string);
    // NOTE(review): presumably suspends the guest for `ms` milliseconds; has
    // no return value — confirm with the host.
    sleep: func(ms: u32);
    // NOTE(review): presumably returns the file's contents as text. Unlike
    // `write-file` / `ensure-dir`, no sandboxing rule is documented for `path`
    // here — confirm whether the host restricts it.
    read-file: func(path: string) -> result<string, string>;
    /// Extract readable text from a saved plugin artifact. The path is
    /// sandboxed to plugin-controlled artifact directories.
    extract-file-text: func(path: string) -> result<string, string>;
    /// Write `contents` to `path`. The path is sandboxed to the plugin
    /// workspace or var/plugins/<name>/ directory; writes outside these
    /// dirs are rejected. Parent directories are created automatically.
    write-file: func(path: string, contents: string) -> result<string, string>;
    /// Ensure a directory and all its parents exist (mkdir -p).
    /// The path is sandboxed the same way as write-file.
    ensure-dir: func(path: string) -> result<string, string>;
    /// Execute a SQL statement that does not return rows (CREATE, INSERT,
    /// UPDATE, DELETE, etc.). `params` are bound positionally (?1, ?2, ...)
    /// to prevent SQL injection. Returns JSON {"rows_affected": N,
    /// "last_insert_rowid": M}. Each plugin gets its own isolated SQLite DB.
    // All parameters travel as strings (`list<string>`); this signature has no
    // way to pass a typed or NULL value.
    sql-execute: func(sql: string, params: list<string>) -> result<string, string>;
    /// Execute a SQL query that returns rows (SELECT). `params` are bound
    /// positionally. Returns a JSON array of objects, one per row.
    sql-query: func(sql: string, params: list<string>) -> result<string, string>;
    /// Send a progress/notification message to the user during long operations.
    notify: func(message: string) -> result<string, string>;

    /// Send a message + an inline image. The image-data-uri must be a
    /// `data:image/<format>;base64,<...>` string (the browser-screenshot
    /// host fn already returns this shape). The host always populates
    /// the OutboundMessage's `images` field with the data URI; channel
    /// handlers (feishu, wechat, desktop, etc.) decide how to render —
    /// IM channels upload it inline, desktop renders/saves it natively.
    notify-with-image: func(message: string, image-data-uri: string) -> result<string, string>;

    /// Send a message + a file attachment by absolute path. `mime` like
    /// "video/mp4" or "image/png". The file path must resolve under the
    /// plugin workspace (canonicalized & allowlisted by the host). The
    /// host populates OutboundMessage.files; channel handlers decide
    /// how to deliver (IM channels upload, desktop surfaces the path).
    notify-with-file: func(message: string, file-path: string, mime: string) -> result<string, string>;

    /// Ingest a prepared document into the local knowledge base. The
    /// collection is resolved by name and created when missing.
    // NOTE(review): the success payload is not specified in this file.
    kb-ingest-document: func(collection: string, title: string, content: string, mime: string)
        -> result<string, string>;
}

/// Read-only access to the plugin's configuration.
interface host-config {
    /// Return this plugin's resolved config JSON. Simple `{source:"env",id:"VAR"}`
    /// secret references are replaced with the environment value before the
    /// guest sees the object.
    ///
    /// Takes no arguments; the success value is the JSON text, the error value
    /// is a string.
    plugin-config: func() -> result<string, string>;
}

/// Access to information about the invocation the plugin is running in.
interface host-context {
    /// Return the current invocation context as JSON. Empty fields mean the
    /// plugin was invoked outside an agent/channel turn.
    // NOTE(review): the field names of the context object are not listed in
    // this file — confirm against the host implementation.
    current-context: func() -> result<string, string>;
}

/// Outbound HTTP performed by the host on the plugin's behalf.
interface host-http {
    /// Perform an HTTP request. `headers-json` is an object, `body` is raw
    /// UTF-8 text, and the return value is JSON `{status, headers, body}`.
    ///
    /// Both the request and the response body travel as `string`, so this
    /// signature carries text only.
    // NOTE(review): `timeout-ms` is presumably the overall request timeout in
    // milliseconds, and it is not stated whether a non-2xx status is reported
    // as `ok` (with `status` set) or as `err` — confirm with the host.
    request: func(method: string, url: string, headers-json: string, body: string, timeout-ms: u32)
        -> result<string, string>;
}

/// Plugin-scoped key/value store backed by the host. Keys and values are
/// plain strings.
interface host-kv {
    /// Plugin-scoped key/value store backed by the host.
    // NOTE(review): the result type has no `option`, so how a missing key is
    // reported (empty string vs. `err`) is not visible here — confirm.
    kv-get: func(key: string) -> result<string, string>;
    // NOTE(review): presumably stores `value` under `key`, overwriting any
    // previous value; the success payload is not specified here.
    kv-set: func(key: string, value: string) -> result<string, string>;
    // NOTE(review): presumably removes `key`; behaviour for an absent key is
    // not specified here.
    kv-delete: func(key: string) -> result<string, string>;
}

/// Device identity operations. The private key stays with the host; the
/// guest only receives the public key and signatures.
interface host-device {
    /// Stable host-generated public key for this machine/profile.
    // NOTE(review): key algorithm and string encoding are not specified in
    // this file — confirm with the host.
    device-public-key: func() -> result<string, string>;
    /// Sign an arbitrary payload with the host-held private key.
    // NOTE(review): the signature's string encoding is not specified here.
    device-sign: func(payload: string) -> result<string, string>;
}

/// Background facilities: scheduled tasks, SSE subscriptions, outbound
/// pushes and synthetic agent turns.
///
/// Structured arguments are passed as JSON text (`schedule-json`,
/// `headers-json`, `message-json`, `route-json`).
// NOTE(review): the JSON schemas are not defined in this file — confirm
// against the host implementation.
interface host-background {
    /// Register/update a cron-style task for this plugin.
    cron-register: func(name: string, schedule-json: string) -> result<string, string>;
    /// Subscribe to an SSE stream.
    // NOTE(review): `resume-key` presumably identifies stored state used to
    // resume the stream after a reconnect — confirm with the host.
    sse-subscribe: func(name: string, url: string, headers-json: string, resume-key: string)
        -> result<string, string>;
    /// Inspect a plugin SSE subscription scoped to the current invocation context.
    sse-status: func(name: string) -> result<string, string>;
    /// Push an outbound message to a concrete channel/peer.
    push-outbound: func(channel: string, peer-id: string, message-json: string) -> result<string, string>;
    /// Submit a synthetic user turn to the agent queue.
    submit-agent-turn: func(session-key: string, prompt: string, route-json: string)
        -> result<string, string>;
}

/// Artifact path allocation. The host decides where artifact files live;
/// plugins ask for a path instead of building one.
interface host-storage {
    /// Request a writable absolute path for a new artifact file. The host
    /// picks the location (typically under its base-dir), creates the parent
    /// directory, and returns a normalized absolute path. Plugins MUST use
    /// this instead of constructing filesystem paths themselves — the host
    /// owns the on-disk layout.
    ///
    /// `filename` is a HINT — the host uses its extension to pick a category
    /// (i/v/a/d/f) but ignores the stem and writes a canonical
    /// `dl_<kind>_<YYYYMMDDHHmm><ab>.<ext>` instead. Pass any short
    /// representative name like `"video.mp4"` / `"image.png"`.
    allocate-artifact: func(filename: string) -> result<string, string>;

    /// Allocate a batch of related artifact paths sharing the same base
    /// (timestamp + 2-letter random suffix), differing only in the
    /// `_N` index suffix. Use this when a single tool call produces
    /// several outputs of the same kind (e.g. a 4-image batch).
    /// Returns paths in 1-based order.
    ///
    /// Unlike most host functions, the success value is a `list<string>`
    /// rather than a single string.
    // NOTE(review): behaviour for `count` = 0 is not specified here.
    allocate-artifact-group: func(filename: string, count: u32) -> result<list<string>, string>;
}

/// Media processing on local files: audio extraction, speech-to-text and
/// keyframe extraction.
interface host-media {
    /// Extract audio track from a local video or audio file using ffmpeg.
    /// Converts to 16kHz mono WAV (compatible with most STT engines).
    /// Input can be video (mp4/webm/mov/...) or audio (mp3/wav/...).
    // NOTE(review): the success string is presumably the path of the produced
    // WAV file — confirm with the host.
    extract-audio: func(input-path: string) -> result<string, string>;

    /// Transcribe audio to text using the host's configured STT engine.
    /// Provider priority: sherpa-onnx (local) → whisper.cpp → OpenAI API.
    /// `language` is a BCP-47 code (zh-CN, en-US, ja-JP, etc).
    transcribe: func(audio-path: string, language: string) -> result<string, string>;

    /// Extract keyframes from a local video file using ffmpeg.
    /// Spreads `count` frames evenly across the video duration.
    /// Returns a list of PNG image file paths.
    extract-keyframes: func(video-path: string, count: u32) -> result<list<string>, string>;
}

/// Android device automation: raw input, app lifecycle, UI-tree queries and
/// selector-based element actions.
///
/// Coordinates are unsigned integers (`u32`). Element functions take a
/// `selector-type` / `selector-value` pair; the valid selector types are
/// listed on `android-find-elements`.
interface host-android {
    /// Tap screen at absolute coordinate.
    android-tap: func(x: u32, y: u32) -> result<string, string>;

    /// Swipe from (x1,y1) to (x2,y2) over duration-ms milliseconds.
    android-swipe: func(x1: u32, y1: u32, x2: u32, y2: u32, duration-ms: u32) -> result<string, string>;

    /// Type text into the currently focused element (equivalent to send_keys).
    android-type: func(text: string) -> result<string, string>;

    /// Set Android primary clipboard text.
    android-clipboard-set: func(text: string) -> result<string, string>;

    /// Paste Android primary clipboard text into the currently focused input.
    android-paste: func() -> result<string, string>;

    /// Press a hardware/nav key by name: back, home, enter.
    android-press: func(key: string) -> result<string, string>;

    /// Get full UI hierarchy as XML. Set compressed=false for WebView or Flutter content.
    android-get-ui-xml: func(compressed: bool) -> result<string, string>;

    /// Get current foreground "package/ActivityName".
    android-current-activity: func() -> result<string, string>;

    /// Launch app by package name and wait for it to become foreground.
    android-launch-app: func(pkg: string) -> result<string, string>;

    /// Force-stop app by package name.
    android-stop-app: func(pkg: string) -> result<string, string>;

    /// Capture screenshot as data:image/png;base64,... URI.
    android-screenshot: func() -> result<string, string>;

    /// Find elements by selector. Returns JSON array of {text, resource-id, content-desc, bounds, clickable}.
    /// selector-type: "resource-id" | "text" | "text-contains" | "content-desc" | "content-desc-contains" | "class"
    android-find-elements: func(selector-type: string, selector-value: string) -> result<string, string>;

    /// Tap the first element matching selector. Returns "tapped" or error.
    android-tap-element: func(selector-type: string, selector-value: string) -> result<string, string>;

    /// Get text content of the first element matching selector.
    android-get-element-text: func(selector-type: string, selector-value: string) -> result<string, string>;

    /// Set text of the first element matching selector, clearing existing content.
    android-set-element-text: func(selector-type: string, selector-value: string, text: string) -> result<string, string>;

    /// Return whether any element matching selector is currently on screen.
    /// The success value is a `bool`, not a string.
    android-element-exists: func(selector-type: string, selector-value: string) -> result<bool, string>;

    /// Wait up to timeout-ms for an element matching selector to appear. Returns "found" or error.
    android-wait-for-element: func(selector-type: string, selector-value: string, timeout-ms: u32) -> result<string, string>;

    /// Tap the brand-yellow primary button by its RENDERED COLOR. Flutter confirm
    /// dialogs (delist "下架" / delete "删除" / cancel order "取消订单" / ship
    /// "发货") expose no UI tree, so element selectors can't find their
    /// OK/Confirm ("确定" / "确认") button — but it's always the bright-yellow
    /// action.
    /// Screenshots, finds the largest yellow region within the vertical band
    /// [y-min, y-max) (pass 0,0 for the whole screen), and taps its center.
    /// Returns "tapped:x,y" or an error if no yellow button is found.
    android-tap-yellow-button: func(y-min: u32, y-max: u32) -> result<string, string>;
}

/// iOS device automation through WebDriverAgent (WDA).
///
/// Coordinates are floating-point (`f64`) and, per `ios-tap`, measured in
/// points, not pixels.
// NOTE(review): the accepted `selector-type` values are not listed in this
// file — confirm against the host implementation.
interface host-ios {
    /// Connect to WebDriverAgent and optionally launch app by bundle id.
    /// Returns session id string on success.
    ios-connect: func(bundle-id: option<string>) -> result<string, string>;

    /// Find elements by WDA selector. Returns JSON array of
    /// [{text, label, type, enabled, rect:{x,y,w,h}}].
    ios-find-elements: func(selector-type: string, selector-value: string) -> result<string, string>;

    /// Tap the first element matching selector.
    ios-tap-element: func(selector-type: string, selector-value: string) -> result<string, string>;

    /// Tap at absolute screen coordinates (points, not pixels).
    ios-tap: func(x: f64, y: f64) -> result<string, string>;

    /// Type text via the keyboard (UIKit path).
    ios-type: func(text: string) -> result<string, string>;

    /// Swipe from (x1,y1) to (x2,y2) over duration-ms.
    ios-swipe: func(x1: f64, y1: f64, x2: f64, y2: f64, duration-ms: u32) -> result<string, string>;

    /// Get compact screen element labels as JSON array of strings.
    ios-get-labels: func() -> result<string, string>;

    /// Capture screenshot as data:image/png;base64,... URI.
    ios-screenshot: func() -> result<string, string>;

    /// Get screen size in points as JSON {width, height}.
    ios-screen-size: func() -> result<string, string>;

    /// Press a system keyboard button by name: "Send", "Return", "back", "home".
    ios-press-button: func(name: string) -> result<string, string>;

    /// Get current foreground app bundle id.
    ios-current-app: func() -> result<string, string>;

    /// Launch/foreground app by bundle id.
    ios-launch-app: func(bundle-id: string) -> result<string, string>;

    /// Terminate app by bundle id.
    ios-terminate-app: func(bundle-id: string) -> result<string, string>;
}

/// Desktop automation: window management, screenshots, pixel/OCR inspection,
/// mouse and keyboard input, clipboard access and a native file dialog.
///
/// Mouse and region coordinates are absolute screen coordinates (`u32`).
// NOTE(review): for `desktop-activate-app` the `bundle-id` parameter is
// documented as bundle-id (macOS), exe name (Windows) or WM_CLASS (Linux);
// the other `bundle-id` parameters presumably follow the same rule — confirm.
interface host-desktop {
    /// Activate an application by bundle-id (macOS), exe name (Windows), or WM_CLASS (Linux).
    desktop-activate-app: func(bundle-id: string) -> result<string, string>;

    /// List all windows of the target app. Returns JSON array:
    /// [{"idx":1,"title":"...","x":0,"y":0,"w":900,"h":600}]
    desktop-list-windows: func(bundle-id: string) -> result<string, string>;

    /// Close a specific window by its index (from list-windows).
    desktop-close-window: func(bundle-id: string, window-idx: u32) -> result<string, string>;

    /// Get the main window bounds (x, y, w, h) as JSON.
    /// Prefers title=="Weixin", falls back to largest window.
    // NOTE(review): the "Weixin" title preference is specific to one app even
    // though `bundle-id` is a parameter — confirm this is intended for others.
    desktop-get-main-window: func(bundle-id: string) -> result<string, string>;

    /// Screenshot the app's main window. Returns data:image/png;base64,... data URI.
    desktop-screenshot-window: func(bundle-id: string) -> result<string, string>;

    /// Screenshot a screen region. Returns data URI.
    desktop-screenshot-region: func(x: u32, y: u32, w: u32, h: u32) -> result<string, string>;

    /// Scan an absolute screen region for pixels near a target RGB colour
    /// (per-channel `tolerance`). Cross-platform pure-pixel test (no OCR/VLM),
    /// used for red-badge *presence* detection (we can't read the digit, but we
    /// can see the red cluster). Returns JSON
    /// `{"hit":bool,"count":u32,"total":u32,"ratio":float}` where
    /// `hit = count >= min-count`.
    // NOTE(review): `r`, `g`, `b` are typed `u32`; they are presumably
    // expected in 0..=255 — confirm how the host treats larger values.
    desktop-region-has-color: func(x: u32, y: u32, w: u32, h: u32, r: u32, g: u32, b: u32, tolerance: u32, min-count: u32) -> result<string, string>;

    /// OCR the app's main window (macOS Vision). Returns JSON
    /// `[{"text":"...","x":143,"y":699}, ...]` with 0-1000 relative centre
    /// coords -- precise enough to click a named row.
    ///
    /// The returned coordinates are relative (0-1000), not the absolute screen
    /// coordinates the mouse functions take, so callers must convert them.
    // NOTE(review): only macOS Vision is mentioned; behaviour on Windows and
    // Linux is not stated in this file — confirm with the host.
    desktop-ocr-window: func(bundle-id: string) -> result<string, string>;

    /// Move the mouse cursor to absolute screen coordinates (x, y) without clicking.
    desktop-mouse-move: func(x: u32, y: u32) -> result<string, string>;

    /// Mouse left-click at absolute screen coordinates (x, y).
    desktop-mouse-click: func(x: u32, y: u32) -> result<string, string>;

    /// Mouse double-click at absolute screen coordinates (x, y).
    desktop-mouse-double-click: func(x: u32, y: u32) -> result<string, string>;

    /// Drag from (x1, y1) to (x2, y2).
    desktop-mouse-drag: func(x1: u32, y1: u32, x2: u32, y2: u32) -> result<string, string>;

    /// Scroll the mouse wheel. Positive clicks = down, negative = up.
    /// `clicks` is signed (`s32`) so that it can carry the direction.
    desktop-mouse-scroll: func(clicks: s32) -> result<string, string>;

    /// Press a key with optional modifiers.
    /// key: key name (e.g. "Return", "Escape", "v", "k").
    /// modifiers: list of "command", "control", "shift", "option" / "alt".
    desktop-key-press: func(key: string, modifiers: list<string>) -> result<string, string>;

    /// Set the system clipboard text.
    desktop-clipboard-set: func(text: string) -> result<string, string>;

    /// Get the system clipboard text.
    desktop-clipboard-get: func() -> result<string, string>;

    /// Set the system clipboard to a file reference (for paste-as-file).
    desktop-clipboard-set-file: func(file-path: string) -> result<string, string>;

    /// Get image data from the system clipboard (PNG, returns data URI).
    /// Returns empty/error if no image is on the clipboard.
    desktop-clipboard-get-image: func() -> result<string, string>;

    /// Mouse right-click at absolute screen coordinates (x, y).
    desktop-mouse-right-click: func(x: u32, y: u32) -> result<string, string>;

    /// Open a native file dialog. Returns the selected file path or error.
    // NOTE(review): the format of the `filters` entries is not specified in
    // this file — confirm with the host.
    desktop-file-dialog-open: func(title: string, filters: list<string>) -> result<string, string>;
}

/// Access to a host-side vision-language model (VLM).
interface host-vlm {
    /// Parse an image with a vision-language model.
    /// image-data-uri must be data:image/png;base64,... format.
    /// prompt is the task description sent to the VLM.
    /// Returns the VLM's raw text output.
    // NOTE(review): `max-tokens` presumably caps the length of the model's
    // output — confirm with the host.
    vlm-parse: func(image-data-uri: string, prompt: string, max-tokens: u32)
        -> result<string, string>;
}

/// The interface the plugin itself implements; the `jimeng-plugin` world
/// exports it.
interface plugin-api {
    /// Handle one tool call. Takes the tool's name and its arguments as JSON
    /// text, and returns a string on success or a string error.
    // NOTE(review): presumably the host calls this once per tool invocation
    // and dispatches on `tool-name`; the schema of `args-json` and of the
    // returned string is not defined in this file — confirm with the host.
    handle-tool: func(tool-name: string, args-json: string) -> result<string, string>;
}

/// The component world a plugin is built against: it imports all fourteen
/// host interfaces declared in this file and exports `plugin-api`.
// NOTE(review): the world is named `jimeng-plugin` although the package is
// `rsclaw:plugin`; the name is part of the contract, so renaming it would
// require rebuilding every guest.
world jimeng-plugin {
    // Capabilities the host must supply.
    import host-browser;
    import host-runtime;
    import host-config;
    import host-context;
    import host-http;
    import host-kv;
    import host-device;
    import host-background;
    import host-storage;
    import host-media;
    import host-android;
    import host-ios;
    import host-desktop;
    import host-vlm;
    // Entry point the plugin must implement.
    export plugin-api;
}