Skip to main content

wme_models/
metadata.rs

//! Metadata types for the Wikimedia Enterprise API.
//!
//! This module provides types for reference data, snapshots, and event metadata
//! used across all Enterprise APIs. These types describe the context around
//! articles rather than article content itself.
//!
//! # Namespaces
//!
//! Wikimedia projects use namespaces to organize different types of pages:
//! - **0 (Main)**: Article content
//! - **6 (File)**: Media files
//! - **10 (Template)**: Reusable wikitext
//! - **14 (Category)**: Topic groupings
//!
//! See [`Namespace`] for namespace information.
//!
//! # Snapshots
//!
//! Snapshots provide complete dumps of Wikimedia projects. Use [`SnapshotInfo`]
//! to discover available snapshots and their download chunks.
//!
//! # Events
//!
//! The Realtime API uses event-based updates. See [`EventMetadata`] for
//! partition/offset tracking to enable resume functionality.
26
27use crate::Article;
28use chrono::{DateTime, Utc};
29use serde::{Deserialize, Serialize};
30
31/// Event types for article changes.
32///
33/// The Realtime API streams events of these types as they happen.
34/// Event types indicate what kind of change occurred to an article.
35#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
36pub enum EventType {
37    /// Article was created or updated (content changed)
38    #[serde(rename = "update")]
39    Update,
40    /// Article was deleted
41    #[serde(rename = "delete")]
42    Delete,
43    /// Article visibility changed (content/editor/comment hidden)
44    #[serde(rename = "visibility-change")]
45    VisibilityChange,
46}
47
48/// Event metadata for articles.
49///
50/// Tracks events through the Wikimedia Enterprise system. The partition and
51/// offset fields enable resuming streams after disconnections.
52///
53/// # Resuming Streams
54///
55/// ```ignore
56/// // Store the last processed offset for each partition
57/// let offsets: HashMap<u32, u64> = load_checkpoints();
58///
59/// // When reconnecting, pass these offsets
60/// client.connect_with_offsets(offsets).await?;
61/// ```
62#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
63pub struct EventMetadata {
64    /// Event UUID (for tracking through the system)
65    pub identifier: String,
66    /// Event type (update, delete, visibility-change)
67    #[serde(rename = "type")]
68    pub event_type: EventType,
69    /// Event creation timestamp (when entered Enterprise system)
70    pub date_created: DateTime<Utc>,
71    /// Event publication timestamp (Realtime API only)
72    pub date_published: Option<DateTime<Utc>>,
73    /// Partition number (Realtime API only, for resume)
74    pub partition: Option<u32>,
75    /// Offset within partition (Realtime API only, for resume)
76    pub offset: Option<u64>,
77}
78
79/// Namespace information.
80///
81/// Namespaces organize different types of pages in Wikimedia projects.
82/// The identifier is the namespace number (0, 6, 10, 14, etc.).
83///
84/// # Common Namespaces
85///
86/// - `0` - Main (article content)
87/// - `6` - File (media uploads)
88/// - `10` - Template (reusable wikitext)
89/// - `14` - Category (topic groupings)
90#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
91pub struct Namespace {
92    /// Namespace ID (0 = Main, 6 = File, 10 = Template, 14 = Category)
93    pub identifier: u32,
94    /// Namespace name
95    pub name: Option<String>,
96    /// Namespace description
97    pub description: Option<String>,
98}
99
100/// Language information.
101///
102/// Describes a language supported by Wikimedia projects.
103#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
104pub struct Language {
105    /// Language code (e.g., "en", "de", "fr")
106    pub identifier: Option<String>,
107    /// Language name in English
108    pub name: Option<String>,
109    /// Alternate language name (native script)
110    pub alternate_name: Option<String>,
111    /// Text direction ("ltr" for left-to-right, "rtl" for right-to-left)
112    pub direction: Option<String>,
113}
114
115/// Project information.
116///
117/// Simple project reference with basic metadata.
118/// For full project metadata including language, see [`ProjectInfo`].
119#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
120pub struct Project {
121    /// Project identifier (e.g., "enwiki")
122    pub identifier: String,
123    /// Project URL
124    pub url: String,
125    /// Project type (e.g., "Wikipedia")
126    #[serde(rename = "type")]
127    pub project_type: Option<String>,
128}
129
130/// Project info for discovery API.
131///
132/// Full project metadata returned by the metadata endpoints.
133#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
134pub struct ProjectInfo {
135    /// Project identifier (e.g., "enwiki")
136    pub identifier: String,
137    /// Project code (e.g., "wikipedia")
138    pub code: Option<String>,
139    /// Project name
140    pub name: Option<String>,
141    /// Project URL
142    pub url: Option<String>,
143    /// Project language
144    pub in_language: Option<Language>,
145}
146
147/// Project type (code) information.
148///
149/// Describes project types like "wiki" (Wikipedia) or "wiktionary".
150#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
151pub struct ProjectType {
152    /// Type identifier (e.g., "wiki", "wiktionary")
153    pub identifier: String,
154    /// Type name
155    pub name: String,
156    /// Type description
157    pub description: Option<String>,
158}
159
160/// File size information with unit.
161///
162/// Used in snapshot and batch metadata to describe download sizes.
163#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
164pub struct Size {
165    /// Unit of measurement (e.g., "bytes", "MB", "GB")
166    pub unit_text: String,
167    /// Size value (can be fractional for small sizes)
168    pub value: f64,
169}
170
171/// Snapshot information.
172///
173/// Metadata about available project snapshots including download chunks
174/// for parallel downloading.
175///
176/// # Example
177///
178/// ```ignore
179/// // Get snapshot info and download chunks in parallel
180/// for chunk in snapshot.chunks {
181///     let data = download_chunk(chunk.url).await?;
182///     process_chunk(data).await?;
183/// }
184/// ```
185#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
186pub struct SnapshotInfo {
187    /// Snapshot identifier (e.g., "enwiki_namespace_0")
188    pub identifier: String,
189    /// Snapshot version
190    pub version: String,
191    /// Last modification timestamp
192    pub date_modified: DateTime<Utc>,
193    /// Language of the snapshot content
194    pub in_language: Language,
195    /// Project this snapshot belongs to
196    pub is_part_of: ProjectInfo,
197    /// Namespace of the snapshot
198    pub namespace: Namespace,
199    /// Snapshot size information
200    pub size: Size,
201    /// Downloadable chunk identifiers for parallel download
202    pub chunks: Option<Vec<String>>,
203}
204
205/// Chunk information for parallel downloads.
206///
207/// Large snapshots are split into chunks for parallel downloading.
208/// Each chunk can be downloaded independently and contains a subset
209/// of the articles.
210#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
211pub struct ChunkInfo {
212    /// Chunk identifier (e.g., "enwiki_namespace_0_chunk_0")
213    pub identifier: String,
214    /// Chunk version
215    pub version: String,
216    /// Last modification timestamp
217    pub date_modified: DateTime<Utc>,
218    /// Language of the chunk content
219    pub in_language: Language,
220    /// Project this chunk belongs to
221    pub is_part_of: ProjectInfo,
222    /// Namespace of the chunk
223    pub namespace: Namespace,
224    /// Chunk size information
225    pub size: Size,
226    /// Download URL for the chunk
227    pub url: Option<String>,
228}
229
230/// Simplified language information (used in Realtime API).
231///
232/// Realtime API uses a minimal language representation.
233#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
234pub struct SimplifiedLanguage {
235    /// Language code (e.g., "en", "de")
236    pub identifier: String,
237    /// Language name
238    pub name: String,
239}
240
241/// Realtime project information (used in streaming API).
242///
243/// Realtime API uses a different project structure with version
244/// and size information for the stream.
245#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
246pub struct RealtimeProject {
247    /// Project identifier (e.g., "enwiki")
248    pub identifier: String,
249    /// Project name
250    pub name: String,
251    /// Project URL
252    pub url: String,
253    /// Project version
254    pub version: String,
255    /// Last modification timestamp
256    pub date_modified: DateTime<Utc>,
257    /// Project size
258    pub size: Size,
259    /// Language information (simplified)
260    pub in_language: SimplifiedLanguage,
261}
262
263/// Simplified namespace (used in Realtime API).
264///
265/// Realtime API uses minimal namespace representation.
266#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
267pub struct SimplifiedNamespace {
268    /// Namespace ID
269    pub identifier: u32,
270    /// Namespace name
271    pub name: Option<String>,
272}
273
274/// Realtime batch info (hourly batches).
275///
276/// Metadata for hourly batch files available through the Realtime API.
277#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
278pub struct RealtimeBatchInfo {
279    /// Batch identifier
280    pub identifier: String,
281    /// Batch name
282    pub name: String,
283    /// Batch version
284    pub version: String,
285    /// Language information
286    pub in_language: Language,
287    /// Project this batch belongs to
288    pub is_part_of: ProjectInfo,
289    /// Namespace information
290    pub namespace: Namespace,
291    /// Batch size information
292    pub size: Size,
293}
294
295/// Batch information for Realtime API.
296///
297/// Basic batch metadata with event count.
298#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
299pub struct BatchInfo {
300    /// Batch identifier
301    pub identifier: String,
302    /// Batch creation timestamp
303    pub date_created: DateTime<Utc>,
304    /// Number of events in batch
305    pub event_count: u64,
306}
307
308/// Article update event for Realtime API.
309///
310/// Combines article data with event metadata for Realtime streams.
311#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
312pub struct ArticleUpdate {
313    /// Event type
314    pub event_type: EventType,
315    /// Article data
316    pub article: Article,
317    /// Event metadata
318    pub event_metadata: EventMetadata,
319}
320
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;

    /// Left-to-right English `Language` fixture shared by several tests.
    fn english() -> Language {
        Language {
            identifier: Some("en".to_string()),
            name: Some("English".to_string()),
            alternate_name: None,
            direction: Some("ltr".to_string()),
        }
    }

    /// English Wikipedia `ProjectInfo` fixture shared by several tests.
    fn english_wikipedia() -> ProjectInfo {
        ProjectInfo {
            identifier: "enwiki".to_string(),
            code: Some("wikipedia".to_string()),
            name: Some("English Wikipedia".to_string()),
            url: Some("https://en.wikipedia.org".to_string()),
            in_language: Some(english()),
        }
    }

    /// Main-namespace fixture (identifier 0, empty name).
    fn main_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some("".to_string()),
            description: Some("Main namespace".to_string()),
        }
    }

    #[test]
    fn test_event_type_variants() {
        // Every variant must be constructible and match its own pattern.
        assert!(matches!(EventType::Update, EventType::Update));
        assert!(matches!(EventType::Delete, EventType::Delete));
        assert!(matches!(
            EventType::VisibilityChange,
            EventType::VisibilityChange
        ));
    }

    #[test]
    fn test_event_metadata() {
        let metadata = EventMetadata {
            identifier: "evt-12345".to_string(),
            event_type: EventType::Update,
            date_created: Utc::now(),
            date_published: Some(Utc::now()),
            partition: Some(4),
            offset: Some(3593806),
        };

        assert_eq!(metadata.identifier, "evt-12345");
        assert_eq!(metadata.partition, Some(4));
        assert_eq!(metadata.offset, Some(3593806));
    }

    #[test]
    fn test_namespace_creation() {
        let namespace = main_namespace();

        assert_eq!(namespace.identifier, 0);
        assert_eq!(namespace.name.as_deref(), Some(""));
    }

    #[test]
    fn test_language_creation() {
        let language = english();

        assert_eq!(language.identifier.as_deref(), Some("en"));
        assert_eq!(language.direction.as_deref(), Some("ltr"));
    }

    #[test]
    fn test_rtl_language() {
        let language = Language {
            identifier: Some("ar".to_string()),
            name: Some("Arabic".to_string()),
            alternate_name: Some("العربية".to_string()),
            direction: Some("rtl".to_string()),
        };

        assert_eq!(language.identifier.as_deref(), Some("ar"));
        assert_eq!(language.direction.as_deref(), Some("rtl"));
    }

    #[test]
    fn test_project_info() {
        let info = english_wikipedia();

        assert_eq!(info.identifier, "enwiki");
        assert_eq!(info.code.as_deref(), Some("wikipedia"));
    }

    #[test]
    fn test_size() {
        let Size { unit_text, value } = Size {
            unit_text: "MB".to_string(),
            value: 1500.0,
        };

        assert_eq!(value, 1500.0);
        assert_eq!(unit_text, "MB");
    }

    #[test]
    fn test_snapshot_info() {
        let snapshot = SnapshotInfo {
            identifier: "enwiki_namespace_0".to_string(),
            version: "2024-01-15".to_string(),
            date_modified: Utc::now(),
            in_language: english(),
            is_part_of: english_wikipedia(),
            namespace: main_namespace(),
            size: Size {
                unit_text: "GB".to_string(),
                value: 25.0,
            },
            chunks: Some(Vec::new()),
        };

        assert_eq!(snapshot.identifier, "enwiki_namespace_0");
        // A missing chunk list counts as zero chunks, same as an empty one.
        assert_eq!(snapshot.chunks.as_ref().map_or(0, Vec::len), 0);
    }
}