wme_models/article.rs
1//! Article types for the Wikimedia Enterprise API.
2//!
3//! This module provides the core [`Article`] type used across all Enterprise APIs
4//! (On-demand, Snapshot, and Realtime). The same schema is returned by all endpoints,
5//! allowing you to process articles from any source without separate parsers.
6//!
7//! # Article Structure
8//!
9//! Articles contain comprehensive metadata about a Wikipedia page:
10//! - **Identification**: `identifier`, `name`, `url`
11//! - **Content**: `abstract_text`, `article_body` (HTML/wikitext)
12//! - **Context**: `in_language`, `is_part_of`, `namespace`
13//! - **Version**: `version` with editor info and credibility signals
14//! - **Categorization**: `categories`, `templates`, `redirects`
15//! - **Event metadata**: `event` (Realtime API only)
16//!
17//! # Handling Duplicates
18//!
19//! Snapshot and Realtime Batch files may contain duplicate articles (< 1%).
20//! When processing, keep the article with the highest `version.identifier`:
21//!
22//! ```rust
23//! use wme_models::Article;
24//!
25//! fn keep_latest(existing: &Article, incoming: &Article) -> bool {
26//! incoming.version.identifier > existing.version.identifier
27//! }
28//! ```
29//!
30//! # Visibility Events
31//!
32//! The Realtime API may return `visibility-change` events where content is hidden.
33//! Check the `visibility` field to determine if text, editor, or comment is visible.
34
35use chrono::{DateTime, Utc};
36use serde::{Deserialize, Serialize};
37
38use crate::content::ArticleBody;
39use crate::metadata::{EventMetadata, Language};
40use crate::reference::Reference;
41use crate::structured::{Infobox, Section, Table};
42use crate::version::{OptionalPreviousVersion, Protection, Version};
43use crate::{
44 Category, Image, License, Namespace, Redirect, Template, WikidataEntity, WikidataEntityUsage,
45};
46
47/// Visibility flags for articles (present on visibility-change events).
48///
49/// When the editing community flags a revision as containing potentially damaging
50/// information, they change its visibility. The three booleans indicate whether
51/// the article body, editor name, or edit comment may contain harmful data.
52///
53/// # Examples
54///
55/// ```
56/// use wme_models::Visibility;
57///
58/// let visibility = Visibility {
59/// text: true,
60/// editor: false,
61/// comment: false,
62/// };
63///
64/// // When text=false, the article content is hidden
65/// // When editor=false, the editor name is hidden
66/// // When comment=false, the edit summary is hidden
67/// ```
68#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
69pub struct Visibility {
70 /// Is article text visible?
71 pub text: bool,
72 /// Is editor name visible?
73 pub editor: bool,
74 /// Is edit comment visible?
75 pub comment: bool,
76}
77
78/// Project reference (simplified, used in article responses).
79///
80/// This is a lightweight reference to the project an article belongs to.
81/// For full project metadata, see [`crate::metadata::ProjectInfo`].
82#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
83pub struct ProjectRef {
84 /// Project identifier (e.g., "enwiki")
85 pub identifier: String,
86 /// Project URL
87 pub url: Option<String>,
88}
89
90/// Complete article from Enterprise API.
91///
92/// This is the primary data structure returned by all Enterprise APIs.
93/// The same schema is used across On-demand, Snapshot, and Realtime endpoints.
94///
95/// # Example
96///
97/// ```
98/// use wme_models::Article;
99/// use serde_json;
100///
101/// let json = r#"{
102/// "name": "Squirrel",
103/// "identifier": 28492,
104/// "url": "https://en.wikipedia.org/wiki/Squirrel",
105/// "date_created": "2001-01-15T00:00:00Z",
106/// "date_modified": "2024-01-15T12:00:00Z",
107/// "in_language": {"identifier": "en", "name": "English"},
108/// "is_part_of": {"identifier": "enwiki"},
109/// "namespace": {"identifier": 0, "name": ""},
110/// "license": [{"name": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/"}],
111/// "version": {
112/// "identifier": 1182847293,
113/// "editor": {"identifier": 12345, "name": "SomeUser"}
114/// }
115/// }"#;
116///
117/// let article: Article = serde_json::from_str(json).unwrap();
118/// assert_eq!(article.name, "Squirrel");
119/// assert_eq!(article.identifier, 28492);
120/// ```
121#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
122pub struct Article {
123 /// Article ID (MediaWiki page ID)
124 pub identifier: u64,
125 /// Article name/title
126 pub name: String,
127 /// Article URL
128 pub url: String,
129 /// Article abstract/summary
130 #[serde(rename = "abstract")]
131 pub abstract_text: Option<String>,
132 /// Short description (e.g., "Family of rodents")
133 pub description: Option<String>,
134 /// Last modification timestamp
135 pub date_modified: DateTime<Utc>,
136 /// Before-last modification timestamp
137 pub date_previously_modified: Option<DateTime<Utc>>,
138 /// Language information
139 pub in_language: Language,
140 /// Project this article belongs to (simplified reference)
141 pub is_part_of: ProjectRef,
142 /// Namespace information
143 pub namespace: Option<Namespace>,
144 /// Main Wikidata entity (primary topic)
145 pub main_entity: Option<WikidataEntity>,
146 /// Additional Wikidata entities used
147 pub additional_entities: Option<Vec<WikidataEntityUsage>>,
148 /// Categories this article belongs to
149 pub categories: Option<Vec<Category>>,
150 /// Templates used in this article
151 pub templates: Option<Vec<Template>>,
152 /// Redirects to this article (alternative names)
153 pub redirects: Option<Vec<Redirect>>,
154 /// Current version information with credibility signals
155 pub version: Version,
156 /// Previous version information
157 #[serde(default)]
158 pub previous_version: OptionalPreviousVersion,
159 /// Number of editors watching this page
160 pub watchers_count: Option<u64>,
161 /// Protection settings (edit/move restrictions)
162 pub protection: Option<Vec<Protection>>,
163 /// Visibility flags (for visibility-change events)
164 pub visibility: Option<Visibility>,
165 /// Main image for the article
166 pub image: Option<Image>,
167 /// License(s) for this article (usually CC-BY-SA)
168 pub license: Vec<License>,
169 /// Article body content (HTML and wikitext)
170 pub article_body: Option<ArticleBody>,
171 /// Event metadata (present in Realtime API responses)
172 pub event: Option<EventMetadata>,
173 /// Has parts - structured content sections (when using fields filter)
174 pub has_parts: Option<Vec<Section>>,
175}
176
177/// Structured Contents (BETA) - Article with parsed content.
178///
179/// This type extends [`Article`] with fully parsed content including infoboxes,
180/// sections, and tables. Available through the Structured Contents BETA endpoint.
181///
182/// # Accessing Content
183///
184/// Use the convenience methods to access parsed content:
185///
186/// ```rust,ignore
187/// use wme_models::StructuredArticle;
188///
189/// // Get infobox by name
190/// if let Some(infobox) = article.infobox("Automatic taxobox") {
191/// // Process infobox fields
192/// }
193///
194/// // Get section by name
195/// if let Some(section) = article.section("Taxonomy") {
196/// // Process section content
197/// }
198/// ```
199#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
200pub struct StructuredArticle {
201 /// Base article fields (flattened)
202 #[serde(flatten)]
203 pub base: Article,
204 /// Creation timestamp (first revision) - only in structured contents
205 pub date_created: DateTime<Utc>,
206 /// Parsed infoboxes
207 pub infoboxes: Vec<Infobox>,
208 /// Parsed sections
209 pub sections: Vec<Section>,
210 /// Tables (single object in structured contents)
211 pub tables: Table,
212 /// References/citations (single object in structured contents)
213 pub references: Reference,
214}
215
216impl StructuredArticle {
217 /// Get infobox by name.
218 ///
219 /// # Examples
220 ///
221 /// ```rust,ignore
222 /// use wme_models::StructuredArticle;
223 ///
224 /// if let Some(infobox) = article.infobox("Automatic taxobox") {
225 /// println!("Found infobox: {:?}", infobox.name);
226 /// }
227 /// ```
228 pub fn infobox(&self, name: &str) -> Option<&Infobox> {
229 self.infoboxes
230 .iter()
231 .find(|i| i.name.as_ref().map(|n| n == name).unwrap_or(false))
232 }
233
234 /// Get section by name.
235 ///
236 /// # Examples
237 ///
238 /// ```rust,ignore
239 /// use wme_models::StructuredArticle;
240 ///
241 /// if let Some(section) = article.section("References") {
242 /// println!("Section has {} parts", section.has_parts.as_ref().map(|p| p.len()).unwrap_or(0));
243 /// }
244 /// ```
245 pub fn section(&self, name: &str) -> Option<&Section> {
246 self.sections
247 .iter()
248 .find(|s| s.name.as_ref().map(|n| n == name).unwrap_or(false))
249 }
250
251 /// Get table by identifier.
252 ///
253 /// # Examples
254 ///
255 /// ```rust,ignore
256 /// use wme_models::StructuredArticle;
257 ///
258 /// if let Some(table) = article.table("demographics_table1") {
259 /// println!("Table has {} rows", table.rows.len());
260 /// }
261 /// ```
262 pub fn table(&self, identifier: &str) -> Option<&Table> {
263 if self.tables.identifier == identifier {
264 Some(&self.tables)
265 } else {
266 None
267 }
268 }
269}
270
271impl std::ops::Deref for StructuredArticle {
272 type Target = Article;
273
274 fn deref(&self) -> &Self::Target {
275 &self.base
276 }
277}
278
#[cfg(test)]
mod tests {
    use super::*;
    use crate::version::{ArticleSize, Editor};
    use chrono::Utc;

    /// Fixture: the English, left-to-right language record.
    fn create_test_language() -> Language {
        Language {
            identifier: Some(String::from("en")),
            name: Some(String::from("English")),
            alternate_name: None,
            direction: Some(String::from("ltr")),
        }
    }

    /// Fixture: the main namespace (identifier 0, empty name).
    fn create_test_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some(String::new()),
            description: Some(String::from("Main namespace")),
        }
    }

    /// Fixture: a project reference for English Wikipedia.
    fn create_test_project_ref() -> ProjectRef {
        ProjectRef {
            identifier: String::from("enwiki"),
            url: Some(String::from("https://en.wikipedia.org")),
        }
    }

    /// Fixture: a populated version authored by a regular, non-bot user.
    fn create_test_version() -> Version {
        let editor = Editor {
            identifier: Some(12345),
            name: Some(String::from("TestUser")),
            is_bot: Some(false),
            is_anonymous: Some(false),
            date_started: Some(Utc::now()),
            edit_count: Some(1000),
            groups: Some(vec![String::from("user")]),
            is_admin: Some(false),
            is_patroller: Some(false),
            has_advanced_rights: Some(false),
        };
        let size = ArticleSize {
            value: 15000,
            unit_text: String::from("B"),
        };

        Version {
            identifier: 1182847293,
            editor: Some(editor),
            comment: Some(String::from("Test edit")),
            tags: Some(vec![String::from("mobile edit")]),
            has_tag_needs_citation: Some(false),
            is_minor_edit: Some(false),
            is_flagged_stable: Some(true),
            is_breaking_news: Some(false),
            noindex: Some(false),
            number_of_characters: Some(5000),
            size: Some(size),
            maintenance_tags: None,
            scores: None,
        }
    }

    /// Fixture: the "Squirrel" article with only the core fields populated.
    fn create_test_article() -> Article {
        Article {
            identifier: 28492,
            name: String::from("Squirrel"),
            url: String::from("https://en.wikipedia.org/wiki/Squirrel"),
            abstract_text: Some(String::from(
                "Squirrels are members of the family Sciuridae...",
            )),
            description: Some(String::from("Family of rodents")),
            date_modified: Utc::now(),
            date_previously_modified: None,
            in_language: create_test_language(),
            is_part_of: create_test_project_ref(),
            namespace: Some(create_test_namespace()),
            main_entity: None,
            additional_entities: None,
            categories: None,
            templates: None,
            redirects: None,
            version: create_test_version(),
            previous_version: OptionalPreviousVersion(None),
            watchers_count: Some(42),
            protection: None,
            visibility: None,
            image: None,
            license: Vec::new(),
            article_body: None,
            event: None,
            has_parts: None,
        }
    }

    #[test]
    fn test_article_creation() {
        let squirrel = create_test_article();

        assert_eq!(squirrel.identifier, 28492);
        assert_eq!(squirrel.name, "Squirrel");
        assert!(squirrel.abstract_text.is_some());
    }

    #[test]
    fn test_project_ref_creation() {
        let enwiki = create_test_project_ref();

        assert_eq!(enwiki.identifier, "enwiki");
        assert!(enwiki.url.is_some());
    }

    #[test]
    fn test_visibility_creation() {
        let Visibility {
            text,
            editor,
            comment,
        } = Visibility {
            text: true,
            editor: false,
            comment: false,
        };

        assert!(text);
        assert!(!editor);
        assert!(!comment);
    }

    #[test]
    fn test_version_comparison_for_dedup() {
        let older = create_test_article();

        // Same page, but with a sparse version whose identifier is one higher.
        let newer_version = Version {
            identifier: 1182847294,
            editor: create_test_version().editor,
            comment: None,
            tags: None,
            has_tag_needs_citation: None,
            is_minor_edit: None,
            is_flagged_stable: None,
            is_breaking_news: None,
            noindex: None,
            number_of_characters: None,
            size: None,
            maintenance_tags: None,
            scores: None,
        };
        let newer = Article {
            abstract_text: None,
            description: None,
            version: newer_version,
            watchers_count: None,
            ..create_test_article()
        };

        // Deduplication keeps the article with the higher version.identifier.
        assert!(newer.version.identifier > older.version.identifier);
    }
}
449}