semantic_dom_ssg/
lib.rs

1//! # semantic-dom-ssg
2//!
3//! Machine-readable web semantics for AI agents.
4//!
5//! This crate provides O(1) element lookup, deterministic navigation, and
6//! token-efficient serialization formats optimized for LLM consumption.
7//!
8//! ## Features
9//!
10//! - **O(1) Lookup**: Hash-indexed nodes via `AHashMap`
11//! - **State Graph**: Explicit FSM for UI states and transitions
12//! - **Agent Summary**: ~100 tokens vs ~800 for JSON (87% reduction)
13//! - **Security**: Input validation, URL sanitization, size limits
14//!
15//! ## Quick Start
16//!
17//! ```rust,no_run
18//! use semantic_dom_ssg::{SemanticDOM, Config};
19//!
20//! let html = r#"<html><body><main><button>Submit</button></main></body></html>"#;
21//! let sdom = SemanticDOM::parse(html, Config::default()).unwrap();
22//!
//! // Walk every indexed node (a single lookup by ID in the index is O(1))
24//! for (id, node) in &sdom.index {
25//!     println!("{}: {}", id, node.label);
26//! }
27//!
28//! // Token-efficient summary (~100 tokens)
29//! let summary = sdom.to_agent_summary();
30//! println!("{}", summary);
31//! ```
32//!
33//! ## Security
34//!
35//! This crate implements security hardening:
36//! - Input size limits (10MB default)
37//! - URL protocol validation (https, http, file only)
38//! - No script execution (HTML parsing only)
39
40#![warn(missing_docs)]
41#![warn(clippy::all)]
42#![allow(clippy::module_name_repetitions)]
43
44mod types;
45mod parser;
46mod certification;
47mod summary;
48mod security;
49
50pub use types::*;
51pub use parser::SemanticDOM;
52pub use certification::{AgentCertification, CertificationLevel, ValidationCheck};
53pub use summary::{to_agent_summary, to_one_liner, to_nav_summary, to_audio_summary, compare_token_usage, TokenComparison};
54pub use security::{validate_url, SecurityConfig};
55
56use thiserror::Error;
57
58/// Errors that can occur during SemanticDOM operations
59#[derive(Error, Debug)]
60pub enum Error {
61    /// Input exceeds maximum size limit
62    #[error("Input exceeds maximum size of {max_size} bytes (got {actual_size})")]
63    InputTooLarge {
64        /// Maximum allowed size
65        max_size: usize,
66        /// Actual input size
67        actual_size: usize,
68    },
69
70    /// Invalid URL protocol
71    #[error("URL has disallowed protocol: {protocol}")]
72    InvalidUrlProtocol {
73        /// The disallowed protocol
74        protocol: String,
75    },
76
77    /// HTML parsing error
78    #[error("Failed to parse HTML: {0}")]
79    ParseError(String),
80
81    /// IO error
82    #[error("IO error: {0}")]
83    IoError(#[from] std::io::Error),
84}
85
86/// Result type for SemanticDOM operations
87pub type Result<T> = std::result::Result<T, Error>;
88
/// Options that control how SemanticDOM parses a document.
///
/// Use [`Config::default`] for the standard settings, optionally combined
/// with struct-update syntax to override individual fields.
#[derive(Clone, Debug)]
pub struct Config {
    /// Upper bound on the input size, in bytes (default: 10MB)
    pub max_input_size: usize,
    /// Prefix applied to generated semantic IDs
    pub id_prefix: String,
    /// Deepest tree level that will be parsed
    pub max_depth: usize,
    /// Tag names of elements that are left out of parsing
    pub exclude_tags: Vec<String>,
    /// When `true`, a state graph is generated
    pub include_state_graph: bool,
    /// When `true`, certification checks are run
    pub validate: bool,
}
105
106impl Default for Config {
107    fn default() -> Self {
108        Self {
109            max_input_size: 10 * 1024 * 1024, // 10MB
110            id_prefix: "sdom".to_string(),
111            max_depth: 50,
112            exclude_tags: vec![
113                "script".to_string(),
114                "style".to_string(),
115                "noscript".to_string(),
116                "template".to_string(),
117            ],
118            include_state_graph: true,
119            validate: true,
120        }
121    }
122}
123
/// Identifier of the standard this crate refers to
pub const STANDARD: &str = "ISO/IEC-SDOM-SSG-DRAFT-2024";
126
127/// Crate version
128pub const VERSION: &str = env!("CARGO_PKG_VERSION");
129
#[cfg(test)]
mod tests {
    use super::*;

    /// A document with a `<nav>` and a `<main>` containing links and a button
    /// must yield at least one landmark and one interactable.
    #[test]
    fn test_parse_simple_html() {
        let html = r##"
            <html>
            <body>
                <nav>
                    <a href="#home">Home</a>
                    <a href="#about">About</a>
                </nav>
                <main>
                    <h1>Welcome</h1>
                    <button>Click me</button>
                </main>
            </body>
            </html>
        "##;

        // `expect` reports the underlying error on failure, which a bare
        // `assert!(result.is_ok())` would hide.
        let sdom = SemanticDOM::parse(html, Config::default())
            .expect("well-formed HTML should parse");

        assert!(!sdom.landmarks.is_empty());
        assert!(!sdom.interactables.is_empty());
    }

    /// A node present in the index must be retrievable directly by its key.
    #[test]
    fn test_o1_lookup() {
        let html = r#"<html><body><button id="test-btn">Test</button></body></html>"#;
        let sdom = SemanticDOM::parse(html, Config::default()).unwrap();

        // Discover the button's key with a scan; this step is only test setup.
        let (id, _) = sdom
            .index
            .iter()
            .find(|(_, n)| n.role == SemanticRole::Button)
            .expect("the button should be present in the index");

        // The behavior under test: a keyed lookup in the index, rather than a
        // traversal of all values.
        let node = sdom.index.get(id);
        assert!(node.is_some());
        assert!(node.map_or(false, |n| n.role == SemanticRole::Button));
    }

    /// The agent summary must contain both the landmark and action sections.
    #[test]
    fn test_agent_summary() {
        let html = r##"
            <html>
            <body>
                <nav><a href="#home">Home</a></nav>
                <main><button>Submit</button></main>
            </body>
            </html>
        "##;

        let sdom = SemanticDOM::parse(html, Config::default()).unwrap();
        let summary = sdom.to_agent_summary();

        assert!(summary.contains("LANDMARKS:"));
        assert!(summary.contains("ACTIONS:"));
    }

    /// Input longer than `max_input_size` must be rejected with
    /// `Error::InputTooLarge`.
    #[test]
    fn test_input_size_limit() {
        let config = Config {
            max_input_size: 100,
            ..Default::default()
        };

        // 200 bytes of input against a 100-byte limit.
        let html = "x".repeat(200);
        let result = SemanticDOM::parse(&html, config);

        assert!(matches!(result, Err(Error::InputTooLarge { .. })));
    }

    /// `https`, `http`, `file` and relative URLs are accepted; `javascript:`
    /// and `data:` URLs are rejected.
    #[test]
    fn test_url_validation() {
        assert!(validate_url("https://example.com").is_ok());
        assert!(validate_url("http://example.com").is_ok());
        assert!(validate_url("file:///path/to/file").is_ok());
        assert!(validate_url("/relative/path").is_ok());

        assert!(validate_url("javascript:alert(1)").is_err());
        assert!(validate_url("data:text/html,<script>").is_err());
    }
}