Skip to main content

cortex_runtime/live/
perceive.rs

1//! PERCEIVE handler — render a single URL and return its encoding.
2
3use crate::cartography::feature_encoder;
4use crate::cartography::page_classifier;
5use crate::extraction::loader::{ExtractionLoader, ExtractionResult};
6use crate::renderer::{NavigationResult, RenderContext};
7use anyhow::Result;
8use serde::{Deserialize, Serialize};
9
10/// Result of perceiving a single page.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct PerceiveResult {
13    /// The original URL that was requested.
14    pub url: String,
15    /// The final URL after redirects.
16    pub final_url: String,
17    /// Classified page type.
18    pub page_type: u8,
19    /// Classification confidence.
20    pub confidence: f32,
21    /// 128-dimension feature vector (sparse: only non-zero entries).
22    pub features: Vec<(usize, f32)>,
23    /// Optional raw text content of the page.
24    pub content: Option<String>,
25    /// Load time in milliseconds.
26    pub load_time_ms: u64,
27}
28
29/// Perceive a single URL: render, extract, encode.
30pub async fn perceive(
31    context: &mut dyn RenderContext,
32    url: &str,
33    include_content: bool,
34) -> Result<PerceiveResult> {
35    // Navigate to the page
36    let nav_result = context.navigate(url, 30_000).await?;
37
38    // Run extraction scripts
39    let extraction = run_extraction(context).await.unwrap_or_default();
40
41    // Classify the page
42    let (page_type, confidence) =
43        page_classifier::classify_page(&extraction, &nav_result.final_url);
44
45    // Encode features
46    let features = feature_encoder::encode_features(
47        &extraction,
48        &nav_result,
49        &nav_result.final_url,
50        page_type,
51        confidence,
52    );
53
54    // Convert to sparse representation
55    let sparse_features: Vec<(usize, f32)> = features
56        .iter()
57        .enumerate()
58        .filter(|(_, &v)| v != 0.0)
59        .map(|(i, &v)| (i, v))
60        .collect();
61
62    // Optionally extract text content
63    let content = if include_content {
64        extract_text_content(context).await.ok()
65    } else {
66        None
67    };
68
69    Ok(PerceiveResult {
70        url: url.to_string(),
71        final_url: nav_result.final_url,
72        page_type: page_type as u8,
73        confidence,
74        features: sparse_features,
75        content,
76        load_time_ms: nav_result.load_time_ms,
77    })
78}
79
80/// Run extraction scripts on the current page context.
81async fn run_extraction(context: &dyn RenderContext) -> Result<ExtractionResult> {
82    let loader = ExtractionLoader::new()?;
83    loader.inject_and_run(context).await
84}
85
86/// Extract visible text content from the page.
87async fn extract_text_content(context: &dyn RenderContext) -> Result<String> {
88    let result = context
89        .execute_js("document.body ? document.body.innerText : ''")
90        .await?;
91    Ok(result.as_str().unwrap_or("").to_string())
92}