Skip to main content

wme_models/
article.rs

1//! Article types for the Wikimedia Enterprise API.
2//!
3//! This module provides the core [`Article`] type used across all Enterprise APIs
4//! (On-demand, Snapshot, and Realtime). The same schema is returned by all endpoints,
5//! allowing you to process articles from any source without separate parsers.
6//!
7//! # Article Structure
8//!
9//! Articles contain comprehensive metadata about a Wikipedia page:
10//! - **Identification**: `identifier`, `name`, `url`
11//! - **Content**: `abstract_text`, `article_body` (HTML/wikitext)
12//! - **Context**: `in_language`, `is_part_of`, `namespace`
13//! - **Version**: `version` with editor info and credibility signals
14//! - **Categorization**: `categories`, `templates`, `redirects`
15//! - **Event metadata**: `event` (Realtime API only)
16//!
17//! # Handling Duplicates
18//!
19//! Snapshot and Realtime Batch files may contain duplicate articles (< 1%).
20//! When processing, keep the article with the highest `version.identifier`:
21//!
22//! ```rust
23//! use wme_models::Article;
24//!
25//! fn keep_latest(existing: &Article, incoming: &Article) -> bool {
26//!     incoming.version.identifier > existing.version.identifier
27//! }
28//! ```
29//!
30//! # Visibility Events
31//!
32//! The Realtime API may return `visibility-change` events where content is hidden.
33//! Check the `visibility` field to determine if text, editor, or comment is visible.
34
35use chrono::{DateTime, Utc};
36use serde::{Deserialize, Serialize};
37
38use crate::content::ArticleBody;
39use crate::metadata::{EventMetadata, Language};
40use crate::reference::Reference;
41use crate::structured::{Infobox, Section, Table};
42use crate::version::{OptionalPreviousVersion, Protection, Version};
43use crate::{
44    Category, Image, License, Namespace, Redirect, Template, WikidataEntity, WikidataEntityUsage,
45};
46
47/// Visibility flags for articles (present on visibility-change events).
48///
49/// When the editing community flags a revision as containing potentially damaging
50/// information, they change its visibility. The three booleans indicate whether
51/// the article body, editor name, or edit comment may contain harmful data.
52///
53/// # Examples
54///
55/// ```
56/// use wme_models::Visibility;
57///
58/// let visibility = Visibility {
59///     text: true,
60///     editor: false,
61///     comment: false,
62/// };
63///
64/// // When text=false, the article content is hidden
65/// // When editor=false, the editor name is hidden
66/// // When comment=false, the edit summary is hidden
67/// ```
68#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
69pub struct Visibility {
70    /// Is article text visible?
71    pub text: bool,
72    /// Is editor name visible?
73    pub editor: bool,
74    /// Is edit comment visible?
75    pub comment: bool,
76}
77
78/// Project reference (simplified, used in article responses).
79///
80/// This is a lightweight reference to the project an article belongs to.
81/// For full project metadata, see [`crate::metadata::ProjectInfo`].
82#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
83pub struct ProjectRef {
84    /// Project identifier (e.g., "enwiki")
85    pub identifier: String,
86    /// Project URL
87    pub url: Option<String>,
88}
89
90/// Complete article from Enterprise API.
91///
92/// This is the primary data structure returned by all Enterprise APIs.
93/// The same schema is used across On-demand, Snapshot, and Realtime endpoints.
94///
95/// # Example
96///
97/// ```
98/// use wme_models::Article;
99/// use serde_json;
100///
101/// let json = r#"{
102///     "name": "Squirrel",
103///     "identifier": 28492,
104///     "url": "https://en.wikipedia.org/wiki/Squirrel",
105///     "date_created": "2001-01-15T00:00:00Z",
106///     "date_modified": "2024-01-15T12:00:00Z",
107///     "in_language": {"identifier": "en", "name": "English"},
108///     "is_part_of": {"identifier": "enwiki"},
109///     "namespace": {"identifier": 0, "name": ""},
110///     "license": [{"name": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/"}],
111///     "version": {
112///         "identifier": 1182847293,
113///         "editor": {"identifier": 12345, "name": "SomeUser"}
114///     }
115/// }"#;
116///
117/// let article: Article = serde_json::from_str(json).unwrap();
118/// assert_eq!(article.name, "Squirrel");
119/// assert_eq!(article.identifier, 28492);
120/// ```
121#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
122pub struct Article {
123    /// Article ID (MediaWiki page ID)
124    pub identifier: u64,
125    /// Article name/title
126    pub name: String,
127    /// Article URL
128    pub url: String,
129    /// Article abstract/summary
130    #[serde(rename = "abstract")]
131    pub abstract_text: Option<String>,
132    /// Short description (e.g., "Family of rodents")
133    pub description: Option<String>,
134    /// Last modification timestamp
135    pub date_modified: DateTime<Utc>,
136    /// Before-last modification timestamp
137    pub date_previously_modified: Option<DateTime<Utc>>,
138    /// Language information
139    pub in_language: Language,
140    /// Project this article belongs to (simplified reference)
141    pub is_part_of: ProjectRef,
142    /// Namespace information
143    pub namespace: Option<Namespace>,
144    /// Main Wikidata entity (primary topic)
145    pub main_entity: Option<WikidataEntity>,
146    /// Additional Wikidata entities used
147    pub additional_entities: Option<Vec<WikidataEntityUsage>>,
148    /// Categories this article belongs to
149    pub categories: Option<Vec<Category>>,
150    /// Templates used in this article
151    pub templates: Option<Vec<Template>>,
152    /// Redirects to this article (alternative names)
153    pub redirects: Option<Vec<Redirect>>,
154    /// Current version information with credibility signals
155    pub version: Version,
156    /// Previous version information
157    #[serde(default)]
158    pub previous_version: OptionalPreviousVersion,
159    /// Number of editors watching this page
160    pub watchers_count: Option<u64>,
161    /// Protection settings (edit/move restrictions)
162    pub protection: Option<Vec<Protection>>,
163    /// Visibility flags (for visibility-change events)
164    pub visibility: Option<Visibility>,
165    /// Main image for the article
166    pub image: Option<Image>,
167    /// License(s) for this article (usually CC-BY-SA)
168    pub license: Vec<License>,
169    /// Article body content (HTML and wikitext)
170    pub article_body: Option<ArticleBody>,
171    /// Event metadata (present in Realtime API responses)
172    pub event: Option<EventMetadata>,
173    /// Has parts - structured content sections (when using fields filter)
174    pub has_parts: Option<Vec<Section>>,
175}
176
177/// Structured Contents (BETA) - Article with parsed content.
178///
179/// This type extends [`Article`] with fully parsed content including infoboxes,
180/// sections, and tables. Available through the Structured Contents BETA endpoint.
181///
182/// # Accessing Content
183///
184/// Use the convenience methods to access parsed content:
185///
186/// ```rust,ignore
187/// use wme_models::StructuredArticle;
188///
189/// // Get infobox by name
190/// if let Some(infobox) = article.infobox("Automatic taxobox") {
191///     // Process infobox fields
192/// }
193///
194/// // Get section by name
195/// if let Some(section) = article.section("Taxonomy") {
196///     // Process section content
197/// }
198/// ```
199#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
200pub struct StructuredArticle {
201    /// Base article fields (flattened)
202    #[serde(flatten)]
203    pub base: Article,
204    /// Creation timestamp (first revision) - only in structured contents
205    pub date_created: DateTime<Utc>,
206    /// Parsed infoboxes
207    pub infoboxes: Vec<Infobox>,
208    /// Parsed sections
209    pub sections: Vec<Section>,
210    /// Tables (single object in structured contents)
211    pub tables: Table,
212    /// References/citations (single object in structured contents)
213    pub references: Reference,
214}
215
216impl StructuredArticle {
217    /// Get infobox by name.
218    ///
219    /// # Examples
220    ///
221    /// ```rust,ignore
222    /// use wme_models::StructuredArticle;
223    ///
224    /// if let Some(infobox) = article.infobox("Automatic taxobox") {
225    ///     println!("Found infobox: {:?}", infobox.name);
226    /// }
227    /// ```
228    pub fn infobox(&self, name: &str) -> Option<&Infobox> {
229        self.infoboxes
230            .iter()
231            .find(|i| i.name.as_ref().map(|n| n == name).unwrap_or(false))
232    }
233
234    /// Get section by name.
235    ///
236    /// # Examples
237    ///
238    /// ```rust,ignore
239    /// use wme_models::StructuredArticle;
240    ///
241    /// if let Some(section) = article.section("References") {
242    ///     println!("Section has {} parts", section.has_parts.as_ref().map(|p| p.len()).unwrap_or(0));
243    /// }
244    /// ```
245    pub fn section(&self, name: &str) -> Option<&Section> {
246        self.sections
247            .iter()
248            .find(|s| s.name.as_ref().map(|n| n == name).unwrap_or(false))
249    }
250
251    /// Get table by identifier.
252    ///
253    /// # Examples
254    ///
255    /// ```rust,ignore
256    /// use wme_models::StructuredArticle;
257    ///
258    /// if let Some(table) = article.table("demographics_table1") {
259    ///     println!("Table has {} rows", table.rows.len());
260    /// }
261    /// ```
262    pub fn table(&self, identifier: &str) -> Option<&Table> {
263        if self.tables.identifier == identifier {
264            Some(&self.tables)
265        } else {
266            None
267        }
268    }
269}
270
271impl std::ops::Deref for StructuredArticle {
272    type Target = Article;
273
274    fn deref(&self) -> &Self::Target {
275        &self.base
276    }
277}
278
#[cfg(test)]
mod tests {
    use super::*;
    use crate::version::{ArticleSize, Editor};
    use chrono::Utc;

    /// English, left-to-right language fixture.
    fn sample_language() -> Language {
        Language {
            identifier: Some(String::from("en")),
            name: Some(String::from("English")),
            alternate_name: None,
            direction: Some(String::from("ltr")),
        }
    }

    /// Main-namespace fixture (identifier 0, empty name).
    fn sample_namespace() -> Namespace {
        Namespace {
            identifier: 0,
            name: Some(String::new()),
            description: Some(String::from("Main namespace")),
        }
    }

    /// `enwiki` project reference fixture.
    fn sample_project_ref() -> ProjectRef {
        ProjectRef {
            identifier: String::from("enwiki"),
            url: Some(String::from("https://en.wikipedia.org")),
        }
    }

    /// Fully populated version fixture with a regular (non-bot) editor.
    fn sample_version() -> Version {
        let editor = Editor {
            identifier: Some(12345),
            name: Some(String::from("TestUser")),
            is_bot: Some(false),
            is_anonymous: Some(false),
            date_started: Some(Utc::now()),
            edit_count: Some(1000),
            groups: Some(vec![String::from("user")]),
            is_admin: Some(false),
            is_patroller: Some(false),
            has_advanced_rights: Some(false),
        };
        let size = ArticleSize {
            value: 15000,
            unit_text: String::from("B"),
        };

        Version {
            identifier: 1182847293,
            editor: Some(editor),
            comment: Some(String::from("Test edit")),
            tags: Some(vec![String::from("mobile edit")]),
            has_tag_needs_citation: Some(false),
            is_minor_edit: Some(false),
            is_flagged_stable: Some(true),
            is_breaking_news: Some(false),
            noindex: Some(false),
            number_of_characters: Some(5000),
            size: Some(size),
            maintenance_tags: None,
            scores: None,
        }
    }

    /// "Squirrel" article fixture built from the fixtures above.
    fn sample_article() -> Article {
        Article {
            identifier: 28492,
            name: String::from("Squirrel"),
            url: String::from("https://en.wikipedia.org/wiki/Squirrel"),
            abstract_text: Some(String::from(
                "Squirrels are members of the family Sciuridae...",
            )),
            description: Some(String::from("Family of rodents")),
            date_modified: Utc::now(),
            date_previously_modified: None,
            in_language: sample_language(),
            is_part_of: sample_project_ref(),
            namespace: Some(sample_namespace()),
            main_entity: None,
            additional_entities: None,
            categories: None,
            templates: None,
            redirects: None,
            version: sample_version(),
            previous_version: OptionalPreviousVersion(None),
            watchers_count: Some(42),
            protection: None,
            visibility: None,
            image: None,
            license: Vec::new(),
            article_body: None,
            event: None,
            has_parts: None,
        }
    }

    #[test]
    fn test_article_creation() {
        let squirrel = sample_article();
        assert_eq!(squirrel.identifier, 28492);
        assert_eq!(squirrel.name, "Squirrel");
        assert!(squirrel.abstract_text.is_some());
    }

    #[test]
    fn test_project_ref_creation() {
        let enwiki = ProjectRef {
            identifier: String::from("enwiki"),
            url: Some(String::from("https://en.wikipedia.org")),
        };
        assert_eq!(enwiki.identifier, "enwiki");
        assert!(enwiki.url.is_some());
    }

    #[test]
    fn test_visibility_creation() {
        let flags = Visibility {
            text: true,
            editor: false,
            comment: false,
        };
        assert!(flags.text);
        assert!(!flags.editor);
        assert!(!flags.comment);
    }

    #[test]
    fn test_version_comparison_for_dedup() {
        let older = sample_article();

        // Same page with a sparser payload and a higher revision id; every field
        // not listed here is taken over from the fixtures.
        let newer = Article {
            abstract_text: None,
            description: None,
            watchers_count: None,
            version: Version {
                identifier: 1182847294, // Higher version
                comment: None,
                tags: None,
                has_tag_needs_citation: None,
                is_minor_edit: None,
                is_flagged_stable: None,
                is_breaking_news: None,
                noindex: None,
                number_of_characters: None,
                size: None,
                ..sample_version()
            },
            ..sample_article()
        };

        // Deduplication keeps the article with the higher version.identifier
        assert!(newer.version.identifier > older.version.identifier);
    }
}