1use crate::acquisition::http_client::HttpClient;
17use crate::acquisition::http_session::HttpSession;
18use crate::renderer::RenderContext;
19use anyhow::Result;
20use serde::{Deserialize, Serialize};
21use std::sync::OnceLock;
22
/// Raw JSON text of the known-canvas-API registry, embedded at compile time
/// from `known_canvas_apis.json` (path relative to this source file) and
/// parsed lazily by `canvas_api_registry`.
const KNOWN_CANVAS_APIS_JSON: &str = include_str!("known_canvas_apis.json");
25
/// Broad category of canvas-based application recognised by an extraction
/// attempt.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum CanvasAppType {
    /// Cell-grid application (registry `app_type` value `"spreadsheet"`; also
    /// chosen when ARIA grid data or spreadsheet-like state keys are found).
    Spreadsheet,
    /// Layer-based design tool (registry `app_type` value `"design"`).
    DesignTool,
    /// Map application (registry `app_type` value `"map"`, or state containing
    /// latitude/longitude keys).
    Map,
    /// Whiteboard application (registry `app_type` value `"whiteboard"`).
    Whiteboard,
    /// Game rendered on a canvas.
    Game,
    /// Diagramming tool (registry `app_type` value `"diagram"`).
    Diagram,
    /// Fallback when no more specific category applies.
    Unknown,
}
37
/// Tabular data recovered from a spreadsheet-like canvas application.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GridData {
    /// Number of rows in the grid.
    pub rows: u32,
    /// Number of columns in the grid.
    pub cols: u32,
    /// Non-empty cells as `(row, col, text)` triples; the JSON extractors in
    /// this file produce zero-based indices.
    pub cells: Vec<(u32, u32, String)>,
    /// Header texts: first-row values for JSON sources, `columnheader` cell
    /// texts for the accessibility source.
    pub headers: Vec<String>,
}
50
/// A named layer of a design-tool document together with its direct children.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Layer {
    /// Layer name (taken from the node's `name`, falling back to `id`).
    pub name: String,
    /// Visibility flag; the JSON extractor defaults it to `true` when absent.
    pub visible: bool,
    /// Direct child elements of the layer.
    pub children: Vec<CanvasElement>,
}
61
/// A single labelled element discovered inside or around a canvas application.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CanvasElement {
    /// Human-readable label (ARIA label / text content, or a layer child's
    /// `name`/`id`).
    pub label: String,
    /// ARIA role or lower-cased tag name for accessibility results; the node's
    /// `type` (or `"unknown"`) for layer children.
    pub role: String,
    /// Bounding box as `(x, y, width, height)` when known; `None` for elements
    /// taken from layer JSON.
    pub bounds: Option<(f32, f32, f32, f32)>,
    /// Associated action, read from the element's `href` or `data-action`
    /// attribute when present.
    pub action: Option<String>,
}
74
/// Result of one extraction attempt against a canvas application.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CanvasState {
    /// Detected application category.
    pub app_type: CanvasAppType,
    /// Spreadsheet grid, when one could be recovered.
    pub grid: Option<GridData>,
    /// Design-tool layers, when they could be recovered.
    pub layers: Option<Vec<Layer>>,
    /// Free text as `(text, x, y)` entries; only the accessibility tier fills
    /// this in.
    pub text_content: Vec<(String, f32, f32)>,
    /// Labelled elements; only the accessibility tier fills this in.
    pub interactive_elements: Vec<CanvasElement>,
    /// Unprocessed JSON state as returned by the API or found on the page.
    pub raw_state: Option<serde_json::Value>,
    /// Which extraction strategy produced this state.
    pub extraction_tier: ExtractionTier,
}
93
/// Identifies the extraction strategy that produced a `CanvasState`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ExtractionTier {
    /// Set by `extract_via_known_api` (logged as "Tier 1").
    KnownApi,
    /// Set by `extract_via_accessibility` (logged as "Tier 2").
    AccessibilityTree,
    /// Set by `extract_via_app_state` (logged as "Tier 3").
    AppState,
    /// NOTE(review): not produced anywhere in this file; presumably used by
    /// callers to signal that no tier succeeded — confirm.
    None,
}
106
/// One entry of the known-canvas-API registry, deserialized from
/// `known_canvas_apis.json`.
#[derive(Debug, Clone, Deserialize)]
struct KnownCanvasApi {
    /// Data endpoint: used verbatim when it starts with `http`, otherwise
    /// appended to the page URL.
    data_api: String,
    /// Edit endpoint; deserialized but not read by the code in this file.
    #[allow(dead_code)]
    edit_api: Option<String>,
    /// Authentication hint; deserialized but not read by the code in this file.
    #[allow(dead_code)]
    auth: Option<String>,
    /// Response format; only `"json"` responses are parsed into raw state.
    format: String,
    /// Application category name, mapped to a `CanvasAppType` by
    /// `extract_via_known_api`.
    app_type: String,
}
119
/// Registry of known canvas APIs. Keys are substrings that
/// `extract_via_known_api` looks for in the page URL.
type CanvasApiRegistry = std::collections::HashMap<String, KnownCanvasApi>;
121
122fn canvas_api_registry() -> &'static CanvasApiRegistry {
123 static REGISTRY: OnceLock<CanvasApiRegistry> = OnceLock::new();
124 REGISTRY.get_or_init(|| serde_json::from_str(KNOWN_CANVAS_APIS_JSON).unwrap_or_default())
125}
126
127pub async fn extract_via_known_api(
136 url: &str,
137 _session: Option<&HttpSession>,
138 client: &HttpClient,
139) -> Option<CanvasState> {
140 let registry = canvas_api_registry();
141
142 let matching_config = registry
144 .iter()
145 .find(|(domain_prefix, _)| url.contains(domain_prefix.as_str()));
146
147 let (domain_key, config) = matching_config?;
148
149 let api_url = if config.data_api.starts_with("http") {
151 config.data_api.clone()
152 } else {
153 let base = url.split('?').next().unwrap_or(url);
155 format!("{}{}", base.trim_end_matches('/'), config.data_api)
156 };
157
158 let timeout = 10000;
160 let resp = client.get(&api_url, timeout).await.ok()?;
161
162 if resp.status != 200 {
163 return None;
164 }
165
166 let app_type = match config.app_type.as_str() {
167 "spreadsheet" => CanvasAppType::Spreadsheet,
168 "design" => CanvasAppType::DesignTool,
169 "map" => CanvasAppType::Map,
170 "whiteboard" => CanvasAppType::Whiteboard,
171 "diagram" => CanvasAppType::Diagram,
172 _ => CanvasAppType::Unknown,
173 };
174
175 let raw_state: Option<serde_json::Value> = if config.format == "json" {
177 serde_json::from_str(&resp.body).ok()
178 } else {
179 None
180 };
181
182 let grid = if app_type == CanvasAppType::Spreadsheet {
184 extract_grid_from_json(raw_state.as_ref())
185 } else {
186 None
187 };
188
189 let layers = if app_type == CanvasAppType::DesignTool {
191 extract_layers_from_json(raw_state.as_ref())
192 } else {
193 None
194 };
195
196 tracing::info!(
197 "Tier 1: extracted canvas state for {} via known API ({})",
198 domain_key,
199 config.app_type
200 );
201
202 Some(CanvasState {
203 app_type,
204 grid,
205 layers,
206 text_content: Vec::new(),
207 interactive_elements: Vec::new(),
208 raw_state,
209 extraction_tier: ExtractionTier::KnownApi,
210 })
211}
212
213pub async fn extract_via_accessibility(context: &dyn RenderContext) -> Option<CanvasState> {
223 let js = r#"
225 (() => {
226 const result = { elements: [], text: [] };
227
228 // Gather all elements with ARIA roles
229 const all = document.querySelectorAll('[role], [aria-label], [aria-valuetext]');
230 for (const el of all) {
231 const rect = el.getBoundingClientRect();
232 const entry = {
233 role: el.getAttribute('role') || el.tagName.toLowerCase(),
234 label: el.getAttribute('aria-label') || el.textContent?.trim()?.substring(0, 200) || '',
235 x: rect.x, y: rect.y, w: rect.width, h: rect.height,
236 action: el.getAttribute('href') || el.getAttribute('data-action') || null
237 };
238 if (entry.label && rect.width > 0 && rect.height > 0) {
239 result.elements.push(entry);
240 }
241 }
242
243 // Gather text from canvas-adjacent elements
244 const textEls = document.querySelectorAll('canvas ~ *, canvas + *, [aria-live]');
245 for (const el of textEls) {
246 const text = el.textContent?.trim();
247 if (text && text.length > 0 && text.length < 1000) {
248 const rect = el.getBoundingClientRect();
249 result.text.push({ text, x: rect.x, y: rect.y });
250 }
251 }
252
253 // Check for grid/table ARIA patterns
254 const grids = document.querySelectorAll('[role="grid"], [role="table"], [role="spreadsheet"]');
255 if (grids.length > 0) {
256 const grid = grids[0];
257 const rows = grid.querySelectorAll('[role="row"]');
258 const gridData = { rows: rows.length, cols: 0, cells: [], headers: [] };
259 rows.forEach((row, ri) => {
260 const cells = row.querySelectorAll('[role="gridcell"], [role="columnheader"], [role="cell"]');
261 gridData.cols = Math.max(gridData.cols, cells.length);
262 cells.forEach((cell, ci) => {
263 const text = cell.textContent?.trim() || '';
264 if (cell.getAttribute('role') === 'columnheader') {
265 gridData.headers.push(text);
266 }
267 if (text) {
268 gridData.cells.push([ri, ci, text]);
269 }
270 });
271 });
272 result.grid = gridData;
273 }
274
275 return JSON.stringify(result);
276 })()
277 "#;
278
279 let js_result = context.execute_js(js).await.ok()?;
280 let result_str = js_result.as_str()?;
281 let parsed: serde_json::Value = serde_json::from_str(result_str).ok()?;
282
283 let elements: Vec<CanvasElement> = parsed
284 .get("elements")
285 .and_then(|v| v.as_array())
286 .map(|arr| {
287 arr.iter()
288 .filter_map(|el| {
289 let label = el.get("label")?.as_str()?.to_string();
290 let role = el.get("role")?.as_str()?.to_string();
291 let x = el.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
292 let y = el.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
293 let w = el.get("w").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
294 let h = el.get("h").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
295 let action = el.get("action").and_then(|v| v.as_str()).map(String::from);
296 Some(CanvasElement {
297 label,
298 role,
299 bounds: Some((x, y, w, h)),
300 action,
301 })
302 })
303 .collect()
304 })
305 .unwrap_or_default();
306
307 let text_content: Vec<(String, f32, f32)> = parsed
308 .get("text")
309 .and_then(|v| v.as_array())
310 .map(|arr| {
311 arr.iter()
312 .filter_map(|t| {
313 let text = t.get("text")?.as_str()?.to_string();
314 let x = t.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
315 let y = t.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0) as f32;
316 Some((text, x, y))
317 })
318 .collect()
319 })
320 .unwrap_or_default();
321
322 let grid = parsed.get("grid").and_then(|g| {
324 let rows = g.get("rows")?.as_u64()? as u32;
325 let cols = g.get("cols")?.as_u64()? as u32;
326 let cells: Vec<(u32, u32, String)> = g
327 .get("cells")?
328 .as_array()?
329 .iter()
330 .filter_map(|c| {
331 let arr = c.as_array()?;
332 let r = arr.first()?.as_u64()? as u32;
333 let c_idx = arr.get(1)?.as_u64()? as u32;
334 let val = arr.get(2)?.as_str()?.to_string();
335 Some((r, c_idx, val))
336 })
337 .collect();
338 let headers: Vec<String> = g
339 .get("headers")?
340 .as_array()?
341 .iter()
342 .filter_map(|h| h.as_str().map(String::from))
343 .collect();
344 Some(GridData {
345 rows,
346 cols,
347 cells,
348 headers,
349 })
350 });
351
352 let app_type = if grid.is_some() {
353 CanvasAppType::Spreadsheet
354 } else if !elements.is_empty() {
355 CanvasAppType::Unknown
356 } else {
357 return None; };
359
360 tracing::info!(
361 "Tier 2: extracted {} elements + {} text entries from accessibility tree",
362 elements.len(),
363 text_content.len()
364 );
365
366 Some(CanvasState {
367 app_type,
368 grid,
369 layers: None,
370 text_content,
371 interactive_elements: elements,
372 raw_state: None,
373 extraction_tier: ExtractionTier::AccessibilityTree,
374 })
375}
376
377pub async fn extract_via_app_state(context: &dyn RenderContext) -> Option<CanvasState> {
388 let js = r#"
389 (() => {
390 // Try common state objects
391 const candidates = [
392 window.__INITIAL_STATE__,
393 window.__NEXT_DATA__,
394 window.__NUXT__,
395 window.__APP_STATE__,
396 window.__PRELOADED_STATE__,
397 ];
398
399 for (const state of candidates) {
400 if (state && typeof state === 'object') {
401 try {
402 const json = JSON.stringify(state);
403 if (json.length > 10 && json.length < 5000000) {
404 return json;
405 }
406 } catch(e) {}
407 }
408 }
409
410 // Try Redux store
411 try {
412 if (window.__REDUX_DEVTOOLS_EXTENSION__ || window.__store__) {
413 const store = window.__store__ || document.querySelector('[data-reactroot]')?.__store__;
414 if (store && typeof store.getState === 'function') {
415 const state = store.getState();
416 const json = JSON.stringify(state);
417 if (json.length > 10 && json.length < 5000000) {
418 return json;
419 }
420 }
421 }
422 } catch(e) {}
423
424 return null;
425 })()
426 "#;
427
428 let js_result = context.execute_js(js).await.ok()?;
429 let result_str = js_result.as_str()?;
430 let raw_state: serde_json::Value = serde_json::from_str(result_str).ok()?;
431
432 let app_type = classify_app_from_state(&raw_state);
434 let grid = extract_grid_from_json(Some(&raw_state));
435 let layers = extract_layers_from_json(Some(&raw_state));
436
437 tracing::info!("Tier 3: extracted app state ({:?})", app_type);
438
439 Some(CanvasState {
440 app_type,
441 grid,
442 layers,
443 text_content: Vec::new(),
444 interactive_elements: Vec::new(),
445 raw_state: Some(raw_state),
446 extraction_tier: ExtractionTier::AppState,
447 })
448}
449
/// Heuristically decides whether `html` belongs to a canvas-rendered
/// application.
///
/// Returns `true` when the markup contains a `<canvas` tag (matched
/// ASCII-case-insensitively, since HTML tag names are not case-sensitive) or
/// when inline script text requests a 2D, WebGL or WebGL2 rendering context
/// or names a WebGL rendering-context interface.
pub fn is_canvas_app(html: &str) -> bool {
    const CANVAS_TAG: &[u8] = b"<canvas";
    // Script markers stay case-sensitive: JavaScript identifiers and context
    // ids are case-sensitive.
    const SCRIPT_MARKERS: &[&str] = &[
        "getContext('webgl')",
        "getContext(\"webgl\")",
        "getContext('webgl2')",
        "getContext(\"webgl2\")",
        "getContext('2d')",
        "getContext(\"2d\")",
        "WebGLRenderingContext",
        "WebGL2RenderingContext",
    ];

    // Byte-window scan avoids allocating a lower-cased copy of the document.
    let has_canvas_tag = html
        .as_bytes()
        .windows(CANVAS_TAG.len())
        .any(|window| window.eq_ignore_ascii_case(CANVAS_TAG));

    has_canvas_tag || SCRIPT_MARKERS.iter().any(|marker| html.contains(*marker))
}
464
465fn classify_app_from_state(state: &serde_json::Value) -> CanvasAppType {
469 let state_str = state.to_string().to_lowercase();
470
471 if state_str.contains("spreadsheet")
472 || (state_str.contains("\"cells\"")
473 || (state_str.contains("\"rows\"") && state_str.contains("\"columns\"")))
474 {
475 CanvasAppType::Spreadsheet
476 } else if state_str.contains("\"layers\"")
477 || (state_str.contains("\"canvas\"") && state_str.contains("\"frames\""))
478 {
479 CanvasAppType::DesignTool
480 } else if (state_str.contains("\"lat\"") && state_str.contains("\"lng\""))
481 || state_str.contains("\"latitude\"")
482 {
483 CanvasAppType::Map
484 } else if state_str.contains("\"whiteboard\"")
485 || (state_str.contains("\"board\"") && state_str.contains("\"shapes\""))
486 {
487 CanvasAppType::Whiteboard
488 } else {
489 CanvasAppType::Unknown
490 }
491}
492
493fn extract_grid_from_json(state: Option<&serde_json::Value>) -> Option<GridData> {
495 let state = state?;
496
497 if let Some(cells_obj) = state.get("cells").and_then(|v| v.as_object()) {
500 let mut cells = Vec::new();
501 let mut max_row = 0u32;
502 let mut max_col = 0u32;
503 let mut headers = Vec::new();
504
505 for (key, val) in cells_obj {
506 if let Some((row, col)) = parse_cell_ref(key) {
507 let value = val
508 .get("value")
509 .or_else(|| val.get("v"))
510 .and_then(|v| {
511 if v.is_string() {
512 v.as_str().map(String::from)
513 } else {
514 Some(v.to_string())
515 }
516 })
517 .unwrap_or_default();
518 if !value.is_empty() {
519 cells.push((row, col, value.clone()));
520 max_row = max_row.max(row);
521 max_col = max_col.max(col);
522 if row == 0 {
523 headers.push(value);
524 }
525 }
526 }
527 }
528
529 if !cells.is_empty() {
530 return Some(GridData {
531 rows: max_row + 1,
532 cols: max_col + 1,
533 cells,
534 headers,
535 });
536 }
537 }
538
539 if let Some(rows_arr) = state.get("rows").and_then(|v| v.as_array()) {
541 let mut cells = Vec::new();
542 let mut headers = Vec::new();
543 let mut max_col = 0u32;
544
545 for (ri, row) in rows_arr.iter().enumerate() {
546 if let Some(row_cells) = row.get("cells").and_then(|v| v.as_array()) {
547 for (ci, cell) in row_cells.iter().enumerate() {
548 let value = cell
549 .get("value")
550 .or_else(|| cell.get("v"))
551 .and_then(|v| {
552 if v.is_string() {
553 v.as_str().map(String::from)
554 } else {
555 Some(v.to_string())
556 }
557 })
558 .unwrap_or_default();
559 if !value.is_empty() {
560 cells.push((ri as u32, ci as u32, value.clone()));
561 max_col = max_col.max(ci as u32);
562 if ri == 0 {
563 headers.push(value);
564 }
565 }
566 }
567 }
568 }
569
570 if !cells.is_empty() {
571 return Some(GridData {
572 rows: rows_arr.len() as u32,
573 cols: max_col + 1,
574 cells,
575 headers,
576 });
577 }
578 }
579
580 None
581}
582
/// Parses an A1-style spreadsheet reference into zero-based `(row, col)`.
///
/// The reference must be one or more ASCII letters (case-insensitive, read as
/// bijective base-26 so `A` = 0, `Z` = 25, `AA` = 26) followed by one or more
/// ASCII digits giving the 1-based row: `"B3"` becomes `Some((2, 1))`.
///
/// Returns `None` when either part is missing, the parts are interleaved or
/// reversed (`"1A"`, `"A1B2"`), any other character is present, the row is
/// `0`, or the column or row does not fit in a `u32`.
fn parse_cell_ref(cell_ref: &str) -> Option<(u32, u32)> {
    // The first non-letter marks where the row digits must start; an index of
    // zero means there is no column part at all.
    let digits_start = cell_ref
        .find(|ch: char| !ch.is_ascii_alphabetic())
        .filter(|&idx| idx > 0)?;
    let (letters, digits) = cell_ref.split_at(digits_start);

    // Explicit check: `str::parse` alone would also accept a leading `+`.
    if !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    // Checked arithmetic so an absurdly long column name is rejected rather
    // than overflowing.
    let col_number = letters.bytes().try_fold(0u32, |acc, b| {
        let digit = u32::from(b.to_ascii_uppercase() - b'A') + 1;
        acc.checked_mul(26)?.checked_add(digit)
    })?;
    let row_number: u32 = digits.parse().ok()?;

    // `col_number` is at least 1 because `letters` is non-empty.
    Some((row_number.checked_sub(1)?, col_number - 1))
}
613
614fn extract_layers_from_json(state: Option<&serde_json::Value>) -> Option<Vec<Layer>> {
616 let state = state?;
617
618 let layers_arr = state
619 .get("layers")
620 .or_else(|| state.get("document").and_then(|d| d.get("layers")))
621 .or_else(|| state.get("children"))
622 .and_then(|v| v.as_array())?;
623
624 let layers: Vec<Layer> = layers_arr
625 .iter()
626 .filter_map(|l| {
627 let name = l
628 .get("name")
629 .or_else(|| l.get("id"))
630 .and_then(|v| v.as_str())
631 .map(String::from)?;
632 let visible = l.get("visible").and_then(|v| v.as_bool()).unwrap_or(true);
633 let children = l
634 .get("children")
635 .and_then(|v| v.as_array())
636 .map(|arr| {
637 arr.iter()
638 .filter_map(|c| {
639 let label = c
640 .get("name")
641 .or_else(|| c.get("id"))
642 .and_then(|v| v.as_str())
643 .map(String::from)?;
644 Some(CanvasElement {
645 label,
646 role: c
647 .get("type")
648 .and_then(|v| v.as_str())
649 .unwrap_or("unknown")
650 .to_string(),
651 bounds: None,
652 action: None,
653 })
654 })
655 .collect()
656 })
657 .unwrap_or_default();
658 Some(Layer {
659 name,
660 visible,
661 children,
662 })
663 })
664 .collect();
665
666 if layers.is_empty() {
667 None
668 } else {
669 Some(layers)
670 }
671}
672
// Unit tests for the pure helpers in this module; the async extractors need a
// live HTTP client or render context and are not covered here.
#[cfg(test)]
mod tests {
    use super::*;

    // A `<canvas>` tag or a `getContext('2d')` call marks a canvas app; plain
    // markup does not.
    #[test]
    fn test_is_canvas_app() {
        assert!(is_canvas_app(
            "<html><body><canvas id='main'></canvas></body></html>"
        ));
        assert!(is_canvas_app("var ctx = el.getContext('2d');"));
        assert!(!is_canvas_app("<html><body><h1>Hello</h1></body></html>"));
    }

    // A1-style references map to zero-based (row, col); references missing
    // either the letter or the digit part are rejected.
    #[test]
    fn test_parse_cell_ref() {
        assert_eq!(parse_cell_ref("A1"), Some((0, 0)));
        assert_eq!(parse_cell_ref("B3"), Some((2, 1)));
        assert_eq!(parse_cell_ref("Z1"), Some((0, 25)));
        assert_eq!(parse_cell_ref("AA1"), Some((0, 26)));
        assert_eq!(parse_cell_ref(""), None);
        assert_eq!(parse_cell_ref("123"), None);
        assert_eq!(parse_cell_ref("A"), None);
    }

    // `cells` object keyed by cell reference.
    #[test]
    fn test_extract_grid_from_json_cells_pattern() {
        let state = serde_json::json!({
            "cells": {
                "A1": {"value": "Name"},
                "B1": {"value": "Price"},
                "A2": {"value": "Widget"},
                "B2": {"value": "29.99"}
            }
        });
        let grid = extract_grid_from_json(Some(&state)).unwrap();
        assert_eq!(grid.rows, 2);
        assert_eq!(grid.cols, 2);
        assert_eq!(grid.cells.len(), 4);
    }

    // `rows` array whose entries each carry a `cells` array.
    #[test]
    fn test_extract_grid_from_json_rows_pattern() {
        let state = serde_json::json!({
            "rows": [
                {"cells": [{"value": "Name"}, {"value": "Price"}]},
                {"cells": [{"value": "Widget"}, {"value": "29.99"}]}
            ]
        });
        let grid = extract_grid_from_json(Some(&state)).unwrap();
        assert_eq!(grid.rows, 2);
        assert_eq!(grid.cols, 2);
        assert_eq!(grid.cells.len(), 4);
    }

    // Top-level `layers` array with named children.
    #[test]
    fn test_extract_layers_from_json() {
        let state = serde_json::json!({
            "layers": [
                {"name": "Background", "visible": true, "children": [
                    {"name": "Logo", "type": "image"}
                ]},
                {"name": "Content", "visible": true, "children": [
                    {"name": "Title", "type": "text"},
                    {"name": "Button", "type": "button"}
                ]}
            ]
        });
        let layers = extract_layers_from_json(Some(&state)).unwrap();
        assert_eq!(layers.len(), 2);
        assert_eq!(layers[0].name, "Background");
        assert_eq!(layers[0].children.len(), 1);
        assert_eq!(layers[1].children.len(), 2);
    }

    // Key-based classification: spreadsheet keys, lat/lng keys, and a state
    // with no recognised markers.
    #[test]
    fn test_classify_app_from_state() {
        let spreadsheet = serde_json::json!({"cells": {}, "rows": [], "columns": []});
        assert_eq!(
            classify_app_from_state(&spreadsheet),
            CanvasAppType::Spreadsheet
        );

        let map = serde_json::json!({"center": {"lat": 40.7, "lng": -74.0}});
        assert_eq!(classify_app_from_state(&map), CanvasAppType::Map);

        let unknown = serde_json::json!({"foo": "bar"});
        assert_eq!(classify_app_from_state(&unknown), CanvasAppType::Unknown);
    }

    // Absent state yields nothing from either JSON extractor.
    #[test]
    fn test_empty_state() {
        assert!(extract_grid_from_json(None).is_none());
        assert!(extract_layers_from_json(None).is_none());
    }
}