Skip to main content

cortex_runtime/acquisition/
action_discovery.rs

1//! Discovers executable HTTP actions from HTML forms, JavaScript patterns,
2//! and known e-commerce platforms.
3//!
4//! This module complements the pattern engine by going deeper into action
5//! discovery: instead of just classifying buttons by label, it extracts the
6//! full HTTP-level details (URL, method, fields, body templates) needed to
7//! *execute* the action without a browser.
8//!
9//! Three discovery strategies are layered:
10//!
11//! 1. **HTML form parsing** — walks every `<form>` tag, resolves action URLs
12//!    against the base, and extracts all fields including hidden CSRF tokens.
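//! 2. **JavaScript API scanning** — regex-scans JS source for `fetch()`,
//!    `axios`, `$.ajax`, and `XMLHttpRequest.open()` calls, plus string
//!    literals that look like `/api/`, GraphQL, or versioned REST (`/v1/`)
//!    paths.
//! 3. **Platform detection** — recognises known platforms (Shopify,
//!    WooCommerce, Magento, BigCommerce, Squarespace, Wix, PrestaShop,
//!    OpenCart, Next.js Commerce, Drupal, WordPress) from fingerprints in the
//!    page source and loads pre-built action templates from an embedded JSON
//!    file.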
18//!
19//! All public entry points are **synchronous**. Callers should wrap in
20//! `tokio::task::spawn_blocking` when integrating with the async runtime.
21
22use crate::map::types::OpCode;
23use regex::Regex;
24use scraper::{Html, Selector};
25use serde::{Deserialize, Serialize};
26use std::sync::OnceLock;
27
28// ── Compile-time platform configuration ─────────────────────────────────────
29
30/// Raw JSON content of the platform action templates, embedded at compile time.
31const PLATFORM_ACTIONS_JSON: &str = include_str!("platform_actions.json");
32
33// ── Public types ────────────────────────────────────────────────────────────
34
35/// A single field inside an HTML form.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct FormField {
38    /// The `name` attribute of the field.
39    pub name: String,
40    /// The `type` attribute (e.g., `"text"`, `"hidden"`, `"email"`).
41    pub field_type: String,
42    /// Pre-filled `value` attribute, if any.
43    pub value: Option<String>,
44    /// Whether the field has the `required` attribute.
45    pub required: bool,
46}
47
48/// An action that can be executed via HTTP (no browser needed).
49#[derive(Debug, Clone, Serialize, Deserialize)]
50pub struct HttpAction {
51    /// OpCode `(category, action)` for the binary map spec.
52    pub opcode: OpCode,
53    /// Human-readable label (e.g., "Add to Cart", "Login").
54    pub label: String,
55    /// Where this action was discovered.
56    pub source: ActionSource,
57    /// Confidence that this action is correctly identified, in `[0.0, 1.0]`.
58    pub confidence: f32,
59}
60
61/// Where an [`HttpAction`] was discovered.
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub enum ActionSource {
64    /// Discovered from an HTML `<form>` element.
65    Form {
66        /// Resolved form action URL.
67        action_url: String,
68        /// HTTP method (`GET` or `POST`).
69        method: String,
70        /// Content type (e.g., `application/x-www-form-urlencoded`).
71        content_type: String,
72        /// All fields inside the form.
73        fields: Vec<FormField>,
74    },
75    /// Discovered from JavaScript API calls.
76    Api {
77        /// The API endpoint URL.
78        url: String,
79        /// HTTP method (`GET`, `POST`, `PUT`, `DELETE`).
80        method: String,
81        /// Optional body template extracted from JS source.
82        body_template: Option<String>,
83    },
84    /// Discovered from a known e-commerce platform.
85    Platform {
86        /// Platform name (e.g., `"shopify"`, `"woocommerce"`).
87        platform: String,
88        /// Action type from the platform template (e.g., `"add_to_cart"`).
89        action_type: String,
90    },
91}
92
/// Platform recognised from a page's fingerprints.
///
/// [`DetectedPlatform::Unknown`] means no fingerprint matched.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectedPlatform {
    /// A Shopify storefront.
    Shopify,
    /// WooCommerce running on WordPress.
    WooCommerce,
    /// Magento / Adobe Commerce.
    Magento,
    /// A BigCommerce store.
    BigCommerce,
    /// A Squarespace site.
    Squarespace,
    /// A Wix site.
    Wix,
    /// A PrestaShop store.
    PrestaShop,
    /// An OpenCart store.
    OpenCart,
    /// Next.js Commerce / Vercel storefront.
    NextJsCommerce,
    /// WordPress without WooCommerce.
    WordPress,
    /// A Drupal site.
    Drupal,
    /// Nothing recognised.
    Unknown,
}
121
122// ── Internal JSON deserialization types ─────────────────────────────────────
123
124#[derive(Debug, Deserialize)]
125struct PlatformConfig {
126    indicators: PlatformIndicators,
127    actions: Vec<PlatformActionTemplate>,
128}
129
130#[derive(Debug, Deserialize)]
131struct PlatformIndicators {
132    js_patterns: Vec<String>,
133    html_patterns: Vec<String>,
134}
135
136#[derive(Debug, Deserialize)]
137struct PlatformActionTemplate {
138    label: String,
139    opcode: PlatformOpCode,
140    action_type: String,
141    #[allow(dead_code)]
142    url_template: String,
143    #[allow(dead_code)]
144    method: String,
145    confidence: f32,
146}
147
148#[derive(Debug, Deserialize)]
149struct PlatformOpCode {
150    category: u8,
151    action: u8,
152}
153
154type PlatformRegistry = std::collections::HashMap<String, PlatformConfig>;
155
156/// Parse and cache the embedded platform action templates.
157fn platform_registry() -> &'static PlatformRegistry {
158    static REGISTRY: OnceLock<PlatformRegistry> = OnceLock::new();
159    REGISTRY.get_or_init(|| serde_json::from_str(PLATFORM_ACTIONS_JSON).unwrap_or_default())
160}
161
162// ── Public API ──────────────────────────────────────────────────────────────
163
164/// Discover HTTP actions from HTML forms and interactive elements.
165///
166/// Parses every `<form>` tag in the document, resolves action URLs relative to
167/// `base_url`, and extracts all input fields (including hidden ones such as
168/// CSRF tokens and product IDs). Buttons associated with forms are also
169/// recognised.
170///
171/// # Arguments
172///
173/// * `html` - Raw HTML source of the page.
174/// * `base_url` - Base URL for resolving relative action URLs.
175///
176/// # Returns
177///
178/// A vector of [`HttpAction`] items, one per form discovered.
179pub fn discover_actions_from_html(html: &str, base_url: &str) -> Vec<HttpAction> {
180    let document = Html::parse_document(html);
181    let mut actions = Vec::new();
182
183    // Parse all <form> elements.
184    let form_sel = match Selector::parse("form") {
185        Ok(s) => s,
186        Err(_) => return actions,
187    };
188    let field_sel = Selector::parse("input, select, textarea").expect("field selector is valid");
189    let button_sel =
190        Selector::parse("button, input[type=\"submit\"]").expect("button selector is valid");
191
192    for form in document.select(&form_sel) {
193        let action_raw = form.value().attr("action").unwrap_or("");
194        let action_url = resolve_url(base_url, action_raw);
195        let method = form.value().attr("method").unwrap_or("GET").to_uppercase();
196        let enctype = form
197            .value()
198            .attr("enctype")
199            .unwrap_or("application/x-www-form-urlencoded")
200            .to_string();
201
202        let mut fields = Vec::new();
203        for field in form.select(&field_sel) {
204            let name = field.value().attr("name").unwrap_or("").to_string();
205            if name.is_empty() {
206                continue;
207            }
208            let field_type = field
209                .value()
210                .attr("type")
211                .unwrap_or(field.value().name())
212                .to_string();
213            let value = field.value().attr("value").map(String::from);
214            let required = field.value().attr("required").is_some();
215
216            fields.push(FormField {
217                name,
218                field_type,
219                value,
220                required,
221            });
222        }
223
224        // Derive label from submit button text or value.
225        let submit_label = form
226            .select(&button_sel)
227            .next()
228            .map(|el| {
229                el.value()
230                    .attr("value")
231                    .map(String::from)
232                    .unwrap_or_else(|| element_text(&el))
233            })
234            .filter(|s| !s.is_empty());
235
236        let label = submit_label.unwrap_or_else(|| format!("Form \u{2192} {action_url}"));
237        let opcode = classify_form_opcode(&label, &action_url);
238
239        actions.push(HttpAction {
240            opcode,
241            label,
242            source: ActionSource::Form {
243                action_url,
244                method,
245                content_type: enctype,
246                fields,
247            },
248            confidence: 0.90,
249        });
250    }
251
252    actions
253}
254
255/// Discover HTTP actions from JavaScript source code.
256///
257/// Uses regex patterns to find API calls made with `fetch()`, `axios`,
258/// `$.ajax`, and bare `/api/` string literals in the JS source.
259///
260/// # Arguments
261///
262/// * `js_source` - Raw JavaScript source text.
263/// * `base_url` - Base URL for resolving relative API paths.
264///
265/// # Returns
266///
267/// A vector of [`HttpAction`] items, one per discovered API endpoint.
268pub fn discover_actions_from_js(js_source: &str, base_url: &str) -> Vec<HttpAction> {
269    let mut actions = Vec::new();
270    let mut seen_urls: std::collections::HashSet<String> = std::collections::HashSet::new();
271
272    // Pattern 1: fetch("url", {method: "METHOD"})
273    let fetch_re =
274        Regex::new(r#"fetch\(\s*['"]([^'"]+)['"]\s*,\s*\{[^}]*method\s*:\s*['"](\w+)['"]"#)
275            .expect("fetch regex is valid");
276
277    for caps in fetch_re.captures_iter(js_source) {
278        let url_raw = caps.get(1).map_or("", |m| m.as_str());
279        let method = caps.get(2).map_or("GET", |m| m.as_str()).to_uppercase();
280        let url = resolve_url(base_url, url_raw);
281        if seen_urls.insert(format!("{method}:{url}")) {
282            actions.push(HttpAction {
283                opcode: classify_api_opcode(&url, &method),
284                label: format!("{method} {url_raw}"),
285                source: ActionSource::Api {
286                    url,
287                    method,
288                    body_template: None,
289                },
290                confidence: 0.80,
291            });
292        }
293    }
294
295    // Pattern 1b: fetch("url") -- no options, defaults to GET
296    let fetch_simple_re =
297        Regex::new(r#"fetch\(\s*['"]([^'"]+)['"]\s*\)"#).expect("fetch simple regex is valid");
298
299    for caps in fetch_simple_re.captures_iter(js_source) {
300        let url_raw = caps.get(1).map_or("", |m| m.as_str());
301        let method = "GET".to_string();
302        let url = resolve_url(base_url, url_raw);
303        if seen_urls.insert(format!("{method}:{url}")) {
304            actions.push(HttpAction {
305                opcode: classify_api_opcode(&url, &method),
306                label: format!("GET {url_raw}"),
307                source: ActionSource::Api {
308                    url,
309                    method,
310                    body_template: None,
311                },
312                confidence: 0.70,
313            });
314        }
315    }
316
317    // Pattern 2: axios.get|post|put|delete("url")
318    let axios_re = Regex::new(r#"axios\.(get|post|put|delete|patch)\(\s*['"]([^'"]+)['"]"#)
319        .expect("axios regex is valid");
320
321    for caps in axios_re.captures_iter(js_source) {
322        let method = caps.get(1).map_or("GET", |m| m.as_str()).to_uppercase();
323        let url_raw = caps.get(2).map_or("", |m| m.as_str());
324        let url = resolve_url(base_url, url_raw);
325        if seen_urls.insert(format!("{method}:{url}")) {
326            actions.push(HttpAction {
327                opcode: classify_api_opcode(&url, &method),
328                label: format!("{method} {url_raw}"),
329                source: ActionSource::Api {
330                    url,
331                    method,
332                    body_template: None,
333                },
334                confidence: 0.80,
335            });
336        }
337    }
338
339    // Pattern 3: $.ajax({url: "...", type: "..."})
340    let ajax_block_re =
341        Regex::new(r#"\$\.ajax\(\s*\{([^}]*)\}"#).expect("ajax block regex is valid");
342    let inner_url_re =
343        Regex::new(r#"url\s*:\s*['"]([^'"]+)['"]"#).expect("inner url regex is valid");
344    let inner_type_re =
345        Regex::new(r#"type\s*:\s*['"](\w+)['"]"#).expect("inner type regex is valid");
346
347    for caps in ajax_block_re.captures_iter(js_source) {
348        let block = caps.get(1).map_or("", |m| m.as_str());
349        if let Some(url_caps) = inner_url_re.captures(block) {
350            let url_raw = url_caps.get(1).map_or("", |m| m.as_str());
351            let method = inner_type_re
352                .captures(block)
353                .and_then(|c| c.get(1))
354                .map_or("GET", |m| m.as_str())
355                .to_uppercase();
356            let url = resolve_url(base_url, url_raw);
357            if seen_urls.insert(format!("{method}:{url}")) {
358                actions.push(HttpAction {
359                    opcode: classify_api_opcode(&url, &method),
360                    label: format!("{method} {url_raw}"),
361                    source: ActionSource::Api {
362                        url,
363                        method,
364                        body_template: None,
365                    },
366                    confidence: 0.75,
367                });
368            }
369        }
370    }
371
372    // Pattern 4: bare /api/ path string literals
373    let api_path_re = Regex::new(r#"['"](/api/[^'"]+)['"]"#).expect("api path regex is valid");
374
375    for caps in api_path_re.captures_iter(js_source) {
376        let url_raw = caps.get(1).map_or("", |m| m.as_str());
377        let url = resolve_url(base_url, url_raw);
378        let method = "GET".to_string();
379        if seen_urls.insert(format!("{method}:{url}")) {
380            actions.push(HttpAction {
381                opcode: classify_api_opcode(&url, &method),
382                label: format!("API {url_raw}"),
383                source: ActionSource::Api {
384                    url,
385                    method,
386                    body_template: None,
387                },
388                confidence: 0.60,
389            });
390        }
391    }
392
393    // Pattern 5: GraphQL endpoints
394    let graphql_re =
395        Regex::new(r#"['"]([^'"]*(?:/graphql|/gql)[^'"]*)['"]"#).expect("graphql regex is valid");
396
397    for caps in graphql_re.captures_iter(js_source) {
398        let url_raw = caps.get(1).map_or("", |m| m.as_str());
399        if url_raw.is_empty() || url_raw.len() > 200 {
400            continue;
401        }
402        let url = resolve_url(base_url, url_raw);
403        let method = "POST".to_string();
404        if seen_urls.insert(format!("{method}:{url}")) {
405            actions.push(HttpAction {
406                opcode: classify_api_opcode(&url, &method),
407                label: format!("GraphQL {url_raw}"),
408                source: ActionSource::Api {
409                    url,
410                    method,
411                    body_template: Some(r#"{"query":"","variables":{}}"#.to_string()),
412                },
413                confidence: 0.75,
414            });
415        }
416    }
417
418    // Pattern 6: XMLHttpRequest open calls
419    let xhr_re = Regex::new(r#"\.open\(\s*['"](\w+)['"]\s*,\s*['"]([^'"]+)['"]"#)
420        .expect("xhr regex is valid");
421
422    for caps in xhr_re.captures_iter(js_source) {
423        let method = caps.get(1).map_or("GET", |m| m.as_str()).to_uppercase();
424        let url_raw = caps.get(2).map_or("", |m| m.as_str());
425        if url_raw.is_empty() || url_raw.len() > 200 {
426            continue;
427        }
428        let url = resolve_url(base_url, url_raw);
429        if seen_urls.insert(format!("{method}:{url}")) {
430            actions.push(HttpAction {
431                opcode: classify_api_opcode(&url, &method),
432                label: format!("{method} {url_raw}"),
433                source: ActionSource::Api {
434                    url,
435                    method,
436                    body_template: None,
437                },
438                confidence: 0.70,
439            });
440        }
441    }
442
443    // Pattern 7: REST-style versioned API paths (/v1/, /v2/, etc.)
444    let rest_v_re =
445        Regex::new(r#"['"]([^'"]*?/v[0-9]+/[^'"]+)['"]"#).expect("rest version regex is valid");
446
447    for caps in rest_v_re.captures_iter(js_source) {
448        let url_raw = caps.get(1).map_or("", |m| m.as_str());
449        if url_raw.is_empty() || url_raw.len() > 200 || !url_raw.starts_with('/') {
450            continue;
451        }
452        let url = resolve_url(base_url, url_raw);
453        let method = "GET".to_string();
454        if seen_urls.insert(format!("{method}:{url}")) {
455            actions.push(HttpAction {
456                opcode: classify_api_opcode(&url, &method),
457                label: format!("REST {url_raw}"),
458                source: ActionSource::Api {
459                    url,
460                    method,
461                    body_template: None,
462                },
463                confidence: 0.55,
464            });
465        }
466    }
467
468    actions
469}
470
471/// Detect the e-commerce platform and return platform-specific actions.
472///
473/// Checks for Shopify, WooCommerce, Magento, and BigCommerce fingerprints
474/// in both HTML content and JavaScript. When a platform is detected, loads
475/// pre-built action templates from the embedded `platform_actions.json`.
476///
477/// # Arguments
478///
479/// * `domain` - The domain being mapped (used for URL resolution).
480/// * `page_html` - Raw HTML source of the page.
481///
482/// # Returns
483///
484/// A vector of [`HttpAction`] items from the detected platform's template.
485/// Returns an empty vector if no platform is detected.
486pub fn discover_actions_from_platform(_domain: &str, page_html: &str) -> Vec<HttpAction> {
487    let platform = detect_platform(page_html);
488    if platform == DetectedPlatform::Unknown {
489        return Vec::new();
490    }
491
492    let platform_key = match platform {
493        DetectedPlatform::Shopify => "shopify",
494        DetectedPlatform::WooCommerce => "woocommerce",
495        DetectedPlatform::Magento => "magento",
496        DetectedPlatform::BigCommerce => "bigcommerce",
497        DetectedPlatform::Squarespace => "squarespace",
498        DetectedPlatform::Wix => "wix",
499        DetectedPlatform::PrestaShop => "prestashop",
500        DetectedPlatform::OpenCart => "opencart",
501        DetectedPlatform::NextJsCommerce => "nextjs_commerce",
502        DetectedPlatform::WordPress => "wordpress",
503        DetectedPlatform::Drupal => "drupal",
504        DetectedPlatform::Unknown => return Vec::new(),
505    };
506
507    let registry = platform_registry();
508    let config = match registry.get(platform_key) {
509        Some(c) => c,
510        None => return Vec::new(),
511    };
512
513    config
514        .actions
515        .iter()
516        .map(|tmpl| HttpAction {
517            opcode: OpCode::new(tmpl.opcode.category, tmpl.opcode.action),
518            label: tmpl.label.clone(),
519            source: ActionSource::Platform {
520                platform: platform_key.to_string(),
521                action_type: tmpl.action_type.clone(),
522            },
523            confidence: tmpl.confidence,
524        })
525        .collect()
526}
527
528/// Detect the e-commerce platform from HTML content.
529///
530/// Checks for known fingerprints in the page source: JavaScript global
531/// objects, CDN URLs, CSS class names, and plugin paths.
532///
533/// # Arguments
534///
535/// * `html` - Raw HTML source of the page.
536///
537/// # Returns
538///
539/// The [`DetectedPlatform`] variant for the first matching platform,
540/// or [`DetectedPlatform::Unknown`] if none matched.
541pub fn detect_platform(html: &str) -> DetectedPlatform {
542    let registry = platform_registry();
543
544    // Check each platform in a deterministic order (most specific first).
545    let platform_order = [
546        "shopify",
547        "woocommerce",
548        "magento",
549        "bigcommerce",
550        "squarespace",
551        "wix",
552        "prestashop",
553        "opencart",
554        "nextjs_commerce",
555        "drupal",
556        "wordpress",
557    ];
558
559    for &key in &platform_order {
560        if let Some(config) = registry.get(key) {
561            let matched = config
562                .indicators
563                .js_patterns
564                .iter()
565                .any(|pat| html.contains(pat.as_str()))
566                || config
567                    .indicators
568                    .html_patterns
569                    .iter()
570                    .any(|pat| html.contains(pat.as_str()));
571
572            if matched {
573                return match key {
574                    "shopify" => DetectedPlatform::Shopify,
575                    "woocommerce" => DetectedPlatform::WooCommerce,
576                    "magento" => DetectedPlatform::Magento,
577                    "bigcommerce" => DetectedPlatform::BigCommerce,
578                    "squarespace" => DetectedPlatform::Squarespace,
579                    "wix" => DetectedPlatform::Wix,
580                    "prestashop" => DetectedPlatform::PrestaShop,
581                    "opencart" => DetectedPlatform::OpenCart,
582                    "nextjs_commerce" => DetectedPlatform::NextJsCommerce,
583                    "wordpress" => DetectedPlatform::WordPress,
584                    "drupal" => DetectedPlatform::Drupal,
585                    _ => DetectedPlatform::Unknown,
586                };
587            }
588        }
589    }
590
591    DetectedPlatform::Unknown
592}
593
594// ── Private helpers ─────────────────────────────────────────────────────────
595
596/// Collect all visible text content from an element, trimmed and
597/// whitespace-collapsed.
598fn element_text(el: &scraper::ElementRef<'_>) -> String {
599    el.text()
600        .collect::<Vec<_>>()
601        .join(" ")
602        .split_whitespace()
603        .collect::<Vec<_>>()
604        .join(" ")
605}
606
607/// Resolve a potentially relative URL against a base URL.
608fn resolve_url(base_url: &str, relative: &str) -> String {
609    if relative.is_empty() {
610        return base_url.to_string();
611    }
612    if relative.starts_with("http://") || relative.starts_with("https://") {
613        return relative.to_string();
614    }
615    if let Ok(base) = url::Url::parse(base_url) {
616        if let Ok(resolved) = base.join(relative) {
617            return resolved.to_string();
618        }
619    }
620    relative.to_string()
621}
622
623/// Classify a form into an OpCode based on its submit label and action URL.
624fn classify_form_opcode(label: &str, action_url: &str) -> OpCode {
625    let label_lower = label.to_lowercase();
626    let url_lower = action_url.to_lowercase();
627
628    if label_lower.contains("add to cart")
629        || label_lower.contains("add to bag")
630        || url_lower.contains("/cart/add")
631    {
632        return OpCode::new(0x02, 0x00); // commerce: add_to_cart
633    }
634    if label_lower.contains("search") || url_lower.contains("/search") {
635        return OpCode::new(0x00, 0x01); // nav: search
636    }
637    if label_lower.contains("log in")
638        || label_lower.contains("login")
639        || label_lower.contains("sign in")
640        || url_lower.contains("/login")
641        || url_lower.contains("/signin")
642    {
643        return OpCode::new(0x04, 0x01); // form: login
644    }
645    if label_lower.contains("subscribe") || label_lower.contains("newsletter") {
646        return OpCode::new(0x04, 0x03); // form: subscribe
647    }
648    if label_lower.contains("register")
649        || label_lower.contains("sign up")
650        || label_lower.contains("signup")
651    {
652        return OpCode::new(0x04, 0x02); // form: register
653    }
654
655    // Default: generic form submit
656    OpCode::new(0x04, 0x00)
657}
658
659/// Classify an API endpoint into an OpCode based on URL path and method.
660fn classify_api_opcode(url: &str, method: &str) -> OpCode {
661    let url_lower = url.to_lowercase();
662
663    if url_lower.contains("/cart") || url_lower.contains("/basket") {
664        return match method {
665            "POST" => OpCode::new(0x02, 0x00),   // commerce: add_to_cart
666            "DELETE" => OpCode::new(0x02, 0x01), // commerce: remove_from_cart
667            "PUT" | "PATCH" => OpCode::new(0x02, 0x02), // commerce: update_cart
668            _ => OpCode::new(0x02, 0x06),        // commerce: view_cart
669        };
670    }
671    if url_lower.contains("/search") || url_lower.contains("/query") {
672        return OpCode::new(0x00, 0x01); // nav: search
673    }
674    if url_lower.contains("/auth") || url_lower.contains("/login") || url_lower.contains("/session")
675    {
676        return OpCode::new(0x04, 0x01); // form: login
677    }
678
679    // Default: generic API call
680    OpCode::new(0x06, 0x00) // data: api_call
681}
682
683// ── Tests ───────────────────────────────────────────────────────────────────
684
#[cfg(test)]
mod tests {
    use super::*;

    /// A GET search form exposes both its visible and its hidden field.
    #[test]
    fn test_discover_form_actions() {
        let html = r#"
        <html><body>
            <form action="/search" method="GET">
                <input type="text" name="q" required />
                <input type="hidden" name="ref" value="nav_search" />
                <button type="submit">Search</button>
            </form>
        </body></html>
        "#;

        let actions = discover_actions_from_html(html, "https://example.com");
        assert_eq!(actions.len(), 1);

        let ActionSource::Form { fields, .. } = &actions[0].source else {
            panic!("expected ActionSource::Form");
        };
        assert_eq!(fields.len(), 2);

        let q_field = fields.iter().find(|f| f.name == "q").unwrap();
        assert_eq!(q_field.field_type, "text");
        assert!(q_field.required);

        let ref_field = fields.iter().find(|f| f.name == "ref").unwrap();
        assert_eq!(ref_field.field_type, "hidden");
        assert_eq!(ref_field.value.as_deref(), Some("nav_search"));
    }

    /// A POST form keeps its method and gets an absolute action URL.
    #[test]
    fn test_discover_form_post_action() {
        let html = r#"
        <html><body>
            <form action="/login" method="POST">
                <input type="text" name="username" />
                <input type="password" name="password" />
                <button type="submit">Log In</button>
            </form>
        </body></html>
        "#;

        let actions = discover_actions_from_html(html, "https://example.com");
        assert_eq!(actions.len(), 1);

        let ActionSource::Form {
            action_url, method, ..
        } = &actions[0].source
        else {
            panic!("expected ActionSource::Form");
        };
        assert_eq!(action_url, "https://example.com/login");
        assert_eq!(method, "POST");
    }

    /// `fetch` with an explicit method is reported with that method.
    #[test]
    fn test_discover_js_fetch() {
        let js = r#"
            async function addItem(id) {
                const resp = await fetch('/api/cart', {method: 'POST', body: JSON.stringify({id})});
                return resp.json();
            }
        "#;

        let actions = discover_actions_from_js(js, "https://shop.example.com");
        assert!(!actions.is_empty());

        let is_cart_call = |a: &&HttpAction| {
            matches!(
                &a.source,
                ActionSource::Api { url, .. } if url.contains("/api/cart")
            )
        };
        let cart_action = actions
            .iter()
            .find(is_cart_call)
            .expect("should find /api/cart action");

        let ActionSource::Api { method, .. } = &cart_action.source else {
            panic!("expected ActionSource::Api");
        };
        assert_eq!(method, "POST");
    }

    /// `axios.post` and `axios.get` are both picked up with the right verb.
    #[test]
    fn test_discover_js_axios() {
        let js = r#"
            axios.post('/api/items', { name: 'widget' });
            axios.get('/api/items/123');
        "#;

        let actions = discover_actions_from_js(js, "https://example.com");
        assert!(actions.len() >= 2);

        let is_post_items = |a: &&HttpAction| {
            matches!(
                &a.source,
                ActionSource::Api { url, method, .. }
                    if url.contains("/api/items") && !url.contains("/123") && method == "POST"
            )
        };
        let post_action = actions
            .iter()
            .find(is_post_items)
            .expect("should find POST /api/items");
        assert!(
            matches!(&post_action.source, ActionSource::Api { method, .. } if method == "POST")
        );

        let is_get_item = |a: &&HttpAction| {
            matches!(
                &a.source,
                ActionSource::Api { url, method, .. }
                    if url.contains("/api/items/123") && method == "GET"
            )
        };
        let get_action = actions
            .iter()
            .find(is_get_item)
            .expect("should find GET /api/items/123");
        assert!(matches!(&get_action.source, ActionSource::Api { method, .. } if method == "GET"));
    }

    /// The Shopify CDN URL is enough to identify a Shopify storefront.
    #[test]
    fn test_detect_shopify() {
        let html = r#"
        <html>
        <head>
            <script src="https://cdn.shopify.com/s/files/1/shop.js"></script>
        </head>
        <body>
            <div id="product">Widget</div>
        </body>
        </html>
        "#;

        let detected = detect_platform(html);
        assert_eq!(detected, DetectedPlatform::Shopify);
    }

    /// WooCommerce body classes and plugin paths identify WooCommerce.
    #[test]
    fn test_detect_woocommerce() {
        let html = r#"
        <html>
        <body class="woocommerce woocommerce-page">
            <div class="product">
                <a href="/wp-content/plugins/woocommerce/assets/style.css"></a>
            </div>
        </body>
        </html>
        "#;

        let detected = detect_platform(html);
        assert_eq!(detected, DetectedPlatform::WooCommerce);
    }

    /// A page without any fingerprint is reported as unknown.
    #[test]
    fn test_detect_no_platform() {
        let html = r#"
        <html>
        <body>
            <h1>My Personal Blog</h1>
            <p>Nothing to see here.</p>
        </body>
        </html>
        "#;

        let detected = detect_platform(html);
        assert_eq!(detected, DetectedPlatform::Unknown);
    }

    /// A Shopify page yields the platform's add-to-cart template.
    #[test]
    fn test_discover_platform_actions_shopify() {
        let html = r#"
        <html>
        <head>
            <script>
                Shopify.shop = "mystore.myshopify.com";
            </script>
        </head>
        <body>
            <div class="product">Widget</div>
        </body>
        </html>
        "#;

        let actions = discover_actions_from_platform("mystore.myshopify.com", html);
        assert!(!actions.is_empty());

        let is_add_to_cart = |a: &&HttpAction| {
            matches!(
                &a.source,
                ActionSource::Platform { action_type, .. } if action_type == "add_to_cart"
            )
        };
        let add_to_cart = actions
            .iter()
            .find(is_add_to_cart)
            .expect("should find add_to_cart action");

        assert_eq!(add_to_cart.opcode, OpCode::new(0x02, 0x00));
        assert!(add_to_cart.confidence > 0.0);
    }

    /// An empty document contains no forms.
    #[test]
    fn test_empty_html() {
        let actions = discover_actions_from_html("", "https://example.com");
        assert!(actions.is_empty());
    }

    /// Hidden CSRF fields are captured together with their values.
    #[test]
    fn test_form_csrf_token() {
        let html = r#"
        <html><body>
            <form action="/transfer" method="POST">
                <input type="hidden" name="_csrf" value="abc123def456" />
                <input type="hidden" name="_token" value="xyz789" />
                <input type="number" name="amount" required />
                <button type="submit">Submit</button>
            </form>
        </body></html>
        "#;

        let actions = discover_actions_from_html(html, "https://bank.example.com");
        assert_eq!(actions.len(), 1);

        let ActionSource::Form { fields, .. } = &actions[0].source else {
            panic!("expected ActionSource::Form");
        };

        let csrf = fields
            .iter()
            .find(|f| f.name == "_csrf")
            .expect("should find _csrf field");
        assert_eq!(csrf.field_type, "hidden");
        assert_eq!(csrf.value.as_deref(), Some("abc123def456"));

        let token = fields
            .iter()
            .find(|f| f.name == "_token")
            .expect("should find _token field");
        assert_eq!(token.field_type, "hidden");
        assert_eq!(token.value.as_deref(), Some("xyz789"));
    }
}