Skip to main content

vs_daemon/daemon/
store_ops.rs

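//! Store-only primitives: `vs_extract`, `vs_mark`, `vs_annotate`,
//! `vs_log`. These read from the page cache or write to the SQLite
//! store. The one exception is `vs_extract` with the `form`, `jsonld`
//! or `webmcp` schema, which evaluates a script in the live page
//! through the engine.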
4
5use vs_protocol::{Ref, StateToken, Tree};
6use vs_store::{ActionFilter, AnnotationTarget};
7
8use super::audit::AuditCtx;
9use super::responses::{AnnotateResponse, ExtractResponse, LogResponse, MarkResponse};
10use super::{short_id, Daemon};
11use crate::error::{DaemonError, Result};
12use crate::tokens;
13
14impl Daemon {
15    /// Extract structured data from a page using a known schema.
16    pub fn extract(
17        &self,
18        session_id: &str,
19        page_id: &str,
20        schema: &str,
21        before_token: StateToken,
22    ) -> Result<ExtractResponse> {
23        let ctx = AuditCtx::new("vs_extract", session_id)
24            .with_page(page_id)
25            .with_args(
26                schema.to_string(),
27                tokens::args_hash("vs_extract", &[schema.to_string()]),
28            )
29            .with_before(before_token);
30        self.audit_call(ctx, |ctx| {
31            let current = self.current_token(session_id, page_id)?;
32            if current != before_token {
33                return Err(DaemonError::StaleToken {
34                    current,
35                    reason: "mutate",
36                });
37            }
38            ctx.after_token = Some(current);
39
40            let sessions = self.inner.sessions.lock().expect("poisoned");
41            let page = sessions
42                .get(session_id)
43                .ok_or_else(|| DaemonError::UnknownSession(session_id.to_string()))?
44                .pages
45                .get(page_id)
46                .ok_or_else(|| DaemonError::UnknownPage(page_id.to_string()))?;
47            let tree = page.last_tree.as_ref().ok_or_else(|| {
48                DaemonError::BadRequest("no tree cached; call vs_view first".into())
49            })?;
50            let records = match schema {
51                "table" => extract_tables(tree),
52                "list" => extract_lists(tree),
53                "form" | "jsonld" | "webmcp" => {
54                    // Pull the engine handle out of the page WHILE we
55                    // hold the lock, then drop the lock before calling
56                    // the engine — so the engine's main-thread
57                    // dispatcher is free to take whatever locks it
58                    // needs without deadlocking.
59                    let engine_handle = page.engine_handle;
60                    drop(sessions);
61                    extract_via_engine(&self.inner.engine, engine_handle, schema)?
62                }
63                other => {
64                    return Err(DaemonError::BadRequest(format!("unknown schema: {other}")));
65                }
66            };
67            Ok(ExtractResponse {
68                token: current,
69                records,
70            })
71        })
72    }
73
74    /// Persist `r` as a named anchor in the session.
75    pub fn mark(
76        &self,
77        session_id: &str,
78        page_id: &str,
79        r: Ref,
80        name: &str,
81        before_token: StateToken,
82    ) -> Result<MarkResponse> {
83        let args = vec![r.to_string(), name.to_string()];
84        let ctx = AuditCtx::new("vs_mark", session_id)
85            .with_page(page_id)
86            .with_args(format!("{r} {name}"), tokens::args_hash("vs_mark", &args))
87            .with_before(before_token);
88        self.audit_call(ctx, |ctx| {
89            let current = self.current_token(session_id, page_id)?;
90            if current != before_token {
91                return Err(DaemonError::StaleToken {
92                    current,
93                    reason: "mutate",
94                });
95            }
96            ctx.after_token = Some(current);
97
98            let (dom_path, role, excerpt) = {
99                let sessions = self.inner.sessions.lock().expect("poisoned");
100                let page = sessions
101                    .get(session_id)
102                    .ok_or_else(|| DaemonError::UnknownSession(session_id.to_string()))?
103                    .pages
104                    .get(page_id)
105                    .ok_or_else(|| DaemonError::UnknownPage(page_id.to_string()))?;
106                let node = page.find_node(r).ok_or(DaemonError::UnknownRef(r.0))?;
107                (
108                    format!("{}#{}", node.role, r.0),
109                    Some(node.role.to_string()),
110                    Some(node.label.clone()),
111                )
112            };
113
114            let mark_id = format!("m_{}", short_id());
115            let mut store = self.inner.store.lock().expect("poisoned");
116            store.create_mark(
117                &mark_id,
118                session_id,
119                page_id,
120                name,
121                &dom_path,
122                role.as_deref(),
123                excerpt.as_deref(),
124            )?;
125            Ok(MarkResponse {
126                mark_id,
127                token: current,
128            })
129        })
130    }
131
132    /// Attach `(key, value)` to `target`.
133    pub fn annotate(
134        &self,
135        session_id: &str,
136        target: &AnnotationTarget,
137        key: &str,
138        value: Option<&str>,
139    ) -> Result<AnnotateResponse> {
140        let target_str = match target {
141            AnnotationTarget::Ref(r) => format!("ref:{r}"),
142            AnnotationTarget::Mark(name) => format!("mark:{name}"),
143            AnnotationTarget::Page => "page".to_string(),
144        };
145        let args = vec![target_str.clone(), key.to_string()];
146        let ctx = AuditCtx::new("vs_annotate", session_id).with_args(
147            format!("{target_str} {key}"),
148            tokens::args_hash("vs_annotate", &args),
149        );
150        self.audit_call(ctx, |_ctx| {
151            self.require_session(session_id)?;
152            let id = format!("an_{}", short_id());
153            let mut store = self.inner.store.lock().expect("poisoned");
154            let row = store.add_annotation(&id, target, key, value)?;
155            Ok(AnnotateResponse { id: row.id })
156        })
157    }
158
159    /// Slice the audit log for the session.
160    pub fn log(
161        &self,
162        session_id: &str,
163        page_id: Option<String>,
164        group_label: Option<String>,
165        since_started_at: Option<i64>,
166        limit: Option<i64>,
167    ) -> Result<LogResponse> {
168        let ctx = AuditCtx::new("vs_log", session_id)
169            .with_args(String::new(), tokens::args_hash("vs_log", &[]));
170        self.audit_call(ctx, |_ctx| {
171            self.require_session(session_id)?;
172            let filter = ActionFilter {
173                session_id: Some(session_id.to_string()),
174                page_id,
175                group_label,
176                since_started_at,
177                limit,
178            };
179            let store = self.inner.store.lock().expect("poisoned");
180            let rows = store.list_actions(&filter)?;
181            Ok(LogResponse { rows })
182        })
183    }
184}
185
186/// Walk the tree and emit one record per `tbl` row. Each record is a
187/// flat list of cell labels in document order. Rows can be nested
188/// under intermediate `el` placeholders (e.g. THEAD/TBODY collapse to
189/// `el` in the snapshot walker), so the row search recurses.
190fn extract_tables(tree: &Tree) -> Vec<Vec<String>> {
191    fn collect_rows(node: &vs_protocol::Node, out: &mut Vec<Vec<String>>) {
192        if matches!(node.role, vs_protocol::Role::Row) {
193            let cells: Vec<String> = collect_cells(node);
194            if !cells.is_empty() {
195                out.push(cells);
196            }
197            return;
198        }
199        for c in &node.children {
200            collect_rows(c, out);
201        }
202    }
203    fn collect_cells(node: &vs_protocol::Node) -> Vec<String> {
204        let mut acc = Vec::new();
205        for c in &node.children {
206            if matches!(c.role, vs_protocol::Role::Cell | vs_protocol::Role::Hdr) {
207                acc.push(c.label.clone());
208            } else {
209                // Cells nested under intermediate placeholders.
210                acc.extend(collect_cells(c));
211            }
212        }
213        acc
214    }
215    let mut out = Vec::new();
216    for node in tree {
217        if matches!(node.role, vs_protocol::Role::Tbl) {
218            for child in &node.children {
219                collect_rows(child, &mut out);
220            }
221        }
222    }
223    out
224}
225
226/// Walk the tree and emit one record per `lst` item: `[role, label]`.
227fn extract_lists(tree: &Tree) -> Vec<Vec<String>> {
228    fn collect_items(node: &vs_protocol::Node, out: &mut Vec<Vec<String>>) {
229        if matches!(node.role, vs_protocol::Role::Itm | vs_protocol::Role::Li) {
230            out.push(vec![node.role.to_string(), node.label.clone()]);
231            return;
232        }
233        for c in &node.children {
234            collect_items(c, out);
235        }
236    }
237    let mut out = Vec::new();
238    for node in tree {
239        if matches!(node.role, vs_protocol::Role::Lst) {
240            for child in &node.children {
241                collect_items(child, &mut out);
242            }
243        }
244    }
245    out
246}
247
248/// Run a JS extractor for `schema` against the live page and parse
249/// the JSON result into the same `Vec<Vec<String>>` shape the
250/// tree-walking extractors return. Used for `form` / `jsonld` /
251/// `webmcp` — schemas that need DOM access we don't carry on the tree.
252fn extract_via_engine(
253    engine: &vs_engine_webkit::EngineRuntime,
254    handle: vs_engine_webkit::PageHandle,
255    schema: &str,
256) -> Result<Vec<Vec<String>>> {
257    use vs_engine_webkit::inspector::EvalResult;
258    let js = match schema {
259        "form" => {
260            r"(function() {
261            var out = [];
262            for (var i = 0; i < document.forms.length; i++) {
263                var f = document.forms[i];
264                for (var j = 0; j < f.elements.length; j++) {
265                    var el = f.elements[j];
266                    if (!el.name && !el.id) continue;
267                    out.push([
268                        f.id || ('form_' + i),
269                        el.name || el.id,
270                        el.type || el.tagName.toLowerCase(),
271                        el.value || '',
272                    ]);
273                }
274            }
275            return JSON.stringify(out);
276        })()"
277        }
278        "jsonld" => {
279            r#"(function() {
280            var nodes = document.querySelectorAll('script[type="application/ld+json"]');
281            var out = [];
282            for (var i = 0; i < nodes.length; i++) {
283                out.push(['jsonld', nodes[i].textContent || '']);
284            }
285            return JSON.stringify(out);
286        })()"#
287        }
288        "webmcp" => {
289            r#"(function() {
290            var nodes = document.querySelectorAll('script[type="application/x-webmcp"]');
291            var out = [];
292            for (var i = 0; i < nodes.length; i++) {
293                out.push(['webmcp', nodes[i].textContent || '']);
294            }
295            return JSON.stringify(out);
296        })()"#
297        }
298        _ => return Err(DaemonError::BadRequest(format!("unknown schema: {schema}"))),
299    };
300    let result = engine
301        .eval_js(handle, js)
302        .map_err(|e| DaemonError::BadRequest(format!("engine: {e}")))?;
303    let value = match result {
304        EvalResult::Ok { value, .. } => value,
305        EvalResult::Thrown { kind, message } => {
306            return Err(DaemonError::BadRequest(format!(
307                "extract {schema}: {kind}: {message}"
308            )));
309        }
310        EvalResult::Syntax { message } => {
311            return Err(DaemonError::BadRequest(format!(
312                "extract {schema}: syntax: {message}"
313            )));
314        }
315    };
316    // The eval helper double-encodes: `value` is already a JSON string
317    // representing a JSON-encoded array. Decode once.
318    let arr: serde_json::Value = serde_json::from_str(&value)
319        .map_err(|e| DaemonError::BadRequest(format!("extract {schema}: parse: {e}")))?;
320    let rows = arr.as_array().cloned().unwrap_or_default();
321    let mut out = Vec::with_capacity(rows.len());
322    for row in rows {
323        let cells = row
324            .as_array()
325            .map(|a| {
326                a.iter()
327                    .map(|v: &serde_json::Value| {
328                        v.as_str().map_or_else(|| v.to_string(), str::to_string)
329                    })
330                    .collect::<Vec<_>>()
331            })
332            .unwrap_or_default();
333        out.push(cells);
334    }
335    Ok(out)
336}