schemaorg_rs/types.rs
1//! Core data types for Schema.org structured data extraction.
2//!
3//! This module defines the shared data model used across all extraction
4//! formats (JSON-LD, Microdata, `RDFa` Lite). The central type is
5//! [`SchemaNode`], which represents a single structured data entity
6//! (e.g. a `Product`, an `Offer`). Nodes contain typed [`SchemaValue`]s
7//! organized in insertion-ordered property maps.
8//!
9//! # Data Model
10//!
//! ```text
//! StructuredDataGraph
//!   +---- Vec<SchemaNode>
//!           +---- types: ["Product"]
//!           +---- properties: { "name"   -> [Text("Widget")],
//!           |                   "offers" -> [Node(Offer { ... })] }
//!           +---- source_format: JsonLd
//!           +---- source_location: Some({ line: 3, column: 1, byte_offset: 42 })
//! ```
20//!
21//! # Examples
22//!
23//! ```
24//! use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
25//! use indexmap::IndexMap;
26//!
27//! let node = SchemaNode {
28//! types: vec!["Product".into()],
29//! properties: IndexMap::from([(
30//! "name".into(),
31//! vec![SchemaValue::Text("Widget".into())],
32//! )]),
33//! source_format: SourceFormat::JsonLd,
34//! source_location: None,
35//! };
36//!
37//! assert_eq!(node.types, vec!["Product"]);
38//! ```
39
40use std::fmt;
41
42use indexmap::IndexMap;
43use serde::{Deserialize, Serialize};
44
45/// Source format of the extracted structured data.
46///
47/// Indicates which HTML markup format a [`SchemaNode`] was extracted from.
48///
49/// # Examples
50///
51/// ```
52/// use schemaorg_rs::types::SourceFormat;
53///
54/// let format = SourceFormat::JsonLd;
55/// assert_eq!(format.to_string(), "JSON-LD");
56/// ```
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
58#[non_exhaustive]
59pub enum SourceFormat {
60 /// JSON-LD (`<script type="application/ld+json">`)
61 JsonLd,
62 /// HTML Microdata (`itemscope`, `itemprop`)
63 Microdata,
64 /// `RDFa` Lite 1.1 (`vocab`, `typeof`, `property`)
65 RdfaLite,
66}
67
68/// Location in the original HTML document.
69///
70/// Used to map extracted data back to the source markup for diagnostics
71/// and error reporting.
72///
73/// # Examples
74///
75/// ```
76/// use schemaorg_rs::types::SourceLocation;
77///
78/// let loc = SourceLocation { line: 5, column: 3, byte_offset: 120 };
79/// assert_eq!(loc.line, 5);
80/// ```
81#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
82pub struct SourceLocation {
83 /// 1-indexed line number.
84 pub line: usize,
85 /// 1-indexed column number.
86 pub column: usize,
87 /// 0-indexed byte offset from the start of the HTML document.
88 pub byte_offset: usize,
89}
90
91/// A value within a structured data node.
92///
93/// Represents the different value types that a Schema.org property can hold.
94/// Properties are multi-valued (stored as `Vec<SchemaValue>` in [`SchemaNode`]).
95///
96/// # `PartialEq` note
97///
98/// The `Number(f64)` variant uses `f64` partial equality via the derived impl.
99/// This means `NaN != NaN`, which is acceptable for test assertions but not for
100/// production equality checks.
101///
102/// # Examples
103///
104/// ```
105/// use schemaorg_rs::types::SchemaValue;
106///
107/// let text = SchemaValue::Text("Widget".into());
108/// let url = SchemaValue::Url("https://example.com".into());
109/// let flag = SchemaValue::Boolean(true);
110/// let price = SchemaValue::Number(29.99);
111/// ```
112#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
113#[non_exhaustive]
114pub enum SchemaValue {
115 /// Plain text content.
116 Text(String),
117 /// A URL value (starts with `http://`, `https://`, or `mailto:`).
118 Url(String),
119 /// A nested structured data node.
120 Node(Box<SchemaNode>),
121 /// A boolean value.
122 Boolean(bool),
123 /// A numeric value (IEEE 754 `f64`).
124 Number(f64),
125 /// A raw datetime string. Actual datetime validation happens in M2.
126 DateTime(String),
127}
128
129/// A single structured data node (e.g. a `Product`, an `Offer`).
130///
131/// Each node represents one Schema.org entity extracted from the HTML
132/// document, retaining its [`SourceFormat`] so callers can distinguish
133/// which markup produced it.
134///
135/// # Examples
136///
137/// ```
138/// use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
139/// use indexmap::IndexMap;
140///
141/// let node = SchemaNode {
142/// types: vec!["Product".into()],
143/// properties: IndexMap::from([(
144/// "name".into(),
145/// vec![SchemaValue::Text("Widget".into())],
146/// )]),
147/// source_format: SourceFormat::JsonLd,
148/// source_location: None,
149/// };
150///
151/// assert_eq!(node.id(), None);
152/// ```
153#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
154pub struct SchemaNode {
155 /// Schema.org type(s), e.g. `["Product", "IndividualProduct"]`.
156 pub types: Vec<String>,
157 /// Properties: key -> list of values (insertion-ordered).
158 pub properties: IndexMap<String, Vec<SchemaValue>>,
159 /// Source format that this node was extracted from.
160 pub source_format: SourceFormat,
161 /// Location in the original HTML document.
162 pub source_location: Option<SourceLocation>,
163}
164
165impl SchemaNode {
166 /// Returns the `@id` of this node, if present.
167 ///
168 /// # Examples
169 ///
170 /// ```
171 /// use schemaorg_rs::types::{SchemaNode, SchemaValue, SourceFormat};
172 /// use indexmap::IndexMap;
173 ///
174 /// let node = SchemaNode {
175 /// types: vec!["Product".into()],
176 /// properties: IndexMap::from([(
177 /// "@id".into(),
178 /// vec![SchemaValue::Text("#product1".into())],
179 /// )]),
180 /// source_format: SourceFormat::JsonLd,
181 /// source_location: None,
182 /// };
183 ///
184 /// assert_eq!(node.id(), Some("#product1"));
185 /// ```
186 #[must_use]
187 #[inline]
188 pub fn id(&self) -> Option<&str> {
189 self.properties
190 .get("@id")
191 .and_then(|vals| vals.first())
192 .and_then(|v| match v {
193 SchemaValue::Text(s) => Some(s.as_str()),
194 _ => None,
195 })
196 }
197}
198
199impl fmt::Display for SourceFormat {
200 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
201 match self {
202 Self::JsonLd => write!(f, "JSON-LD"),
203 Self::Microdata => write!(f, "Microdata"),
204 Self::RdfaLite => write!(f, "RDFa Lite"),
205 }
206 }
207}
208
209impl fmt::Display for SchemaValue {
210 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
211 match self {
212 Self::Text(s) | Self::Url(s) | Self::DateTime(s) => write!(f, "{s}"),
213 Self::Boolean(b) => write!(f, "{b}"),
214 Self::Number(n) => write!(f, "{n}"),
215 Self::Node(n) => write!(f, "[{} node]", n.types.join(", ")),
216 }
217 }
218}