tools_interface/
pageviews.rs

1/// # Pageviews
2/// This implements a simple interface to the Wikimedia Pageviews API.
3/// More information can be found [here](https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews).
4/// Currently, only single-page views are supported.
5/// Aggregate and top views are not yet implemented.
6///
7/// ## Features
8/// Views for multiple pages, on multiple projects, can be retrieved concurrently for a single time span.
9///
10/// ## Example
11/// ```ignore
12/// let pv = Pageviews::new(
13///     PageviewsGranularity::Monthly, // Get monthly views
14///     PageviewsAccess::All, // Get all-access views
15///     PageviewsAgent::All, // Get views from all agents
16/// );
17///
/// // Prepare a `(String,String)` vector of project-page pairs.
19/// let project_pages = [
20///     ("de.wikipedia", "Barack Obama"),
21///     ("de.wikipedia", "Trude Herr"),
22/// ].into_iter().map(|(a, b)| (a.into(), b.into())).collect();
23///
24/// // Get the pageviews for these pages for every month of 2016.
25/// let results = pv.get_multiple_articles(
26///     &project_pages,
27///     &Pageviews::month_start(2016, 1).unwrap(),
28///     &Pageviews::month_end(2016, 12).unwrap(),
29///     5,
30/// ).await.unwrap();
31///
32/// // Count all views of all pages.
33/// let overall_views: u64 = results.iter().map(|r| r.total_views()).sum();
34/// ```
35// NOTE: This does not use the `Tool` trait, it is too different.
36use chrono::{Duration, NaiveDate};
37
38/// Default delay in seconds when the API returns a 429 (throttling) response without a Retry-After header.
39const DEFAULT_RETRY_DELAY_SECS: u64 = 5;
40use futures::prelude::*;
41use serde::Deserialize;
42use serde_json::Value;
43
/// Access-method filter for pageview queries; values mirror the
/// `access` path segment of the Pageviews REST API URL.
#[derive(Clone, Debug, PartialEq, Deserialize)]
pub enum PageviewsAccess {
    /// Views from all access methods combined.
    #[serde(rename = "all-access")]
    All,
    /// Views via the desktop site only.
    #[serde(rename = "desktop")]
    Desktop,
    /// Views via the mobile apps only.
    #[serde(rename = "mobile-app")]
    MobileApp,
    /// Views via the mobile web site only.
    #[serde(rename = "mobile-web")]
    MobileWeb,
}
55
56impl PageviewsAccess {
57    pub fn as_str(&self) -> &str {
58        match self {
59            Self::All => "all-access",
60            Self::Desktop => "desktop",
61            Self::MobileApp => "mobile-app",
62            Self::MobileWeb => "mobile-web",
63        }
64    }
65}
66
/// Agent (requester-type) filter for pageview queries; values mirror
/// the `agent` path segment of the Pageviews REST API URL.
#[derive(Clone, Debug, PartialEq, Deserialize)]
pub enum PageviewsAgent {
    /// Views from all agent types combined.
    #[serde(rename = "all-agents")]
    All,
    /// Views the API classifies as user traffic.
    #[serde(rename = "user")]
    User,
    /// Views the API classifies as spider (crawler) traffic.
    #[serde(rename = "spider")]
    Spider,
    /// Views the API classifies as automated traffic.
    #[serde(rename = "automated")]
    Automated,
}
78
79impl PageviewsAgent {
80    pub fn as_str(&self) -> &str {
81        match self {
82            Self::All => "all-agents",
83            Self::User => "user",
84            Self::Spider => "spider",
85            Self::Automated => "automated",
86        }
87    }
88}
89
/// Time resolution of the returned view counts; values mirror the
/// `granularity` path segment of the Pageviews REST API URL.
#[derive(Clone, Debug, PartialEq, Deserialize)]
pub enum PageviewsGranularity {
    /// One entry per hour.
    #[serde(rename = "hourly")]
    Hourly,
    /// One entry per day.
    #[serde(rename = "daily")]
    Daily,
    /// One entry per month.
    #[serde(rename = "monthly")]
    Monthly,
}
99
100impl PageviewsGranularity {
101    pub fn as_str(&self) -> &str {
102        match self {
103            Self::Hourly => "hourly",
104            Self::Daily => "daily",
105            Self::Monthly => "monthly",
106        }
107    }
108}
109
/// A timestamp parsed from the API's ten-digit `YYYYMMDDHH` string format.
#[derive(Clone, Debug, PartialEq, Deserialize)]
pub struct PageviewsTimestamp {
    year: u16,  // digits 0-3 of the wire string (`YYYY`)
    month: u8,  // digits 4-5 (`MM`)
    day: u8,    // digits 6-7 (`DD`)
    hour: u8,   // digits 8-9 (`HH`)
}
117
118impl From<&str> for PageviewsTimestamp {
119    fn from(item: &str) -> Self {
120        Self {
121            year: item[0..4].parse().unwrap(),
122            month: item[4..6].parse().unwrap(),
123            day: item[6..8].parse().unwrap(),
124            hour: item[8..10].parse().unwrap(),
125        }
126    }
127}
128
129impl From<PageviewsTimestamp> for String {
130    fn from(val: PageviewsTimestamp) -> Self {
131        format!(
132            "{:04}{:02}{:02}{:02}",
133            val.year, val.month, val.day, val.hour
134        )
135    }
136}
137
/// One (timestamp, view count) data point from the API's `items` array.
#[derive(Clone, Debug, PartialEq)]
pub struct PageviewsParams {
    /// Start of the time slot, at the requested granularity.
    pub timestamp: PageviewsTimestamp,
    /// Number of views recorded in this time slot.
    pub views: u64,
}
143
144impl PageviewsParams {
145    fn from_json(item: &Value) -> Option<Self> {
146        let ts = item.get("timestamp")?.as_str()?;
147        Some(Self {
148            timestamp: ts.into(),
149            views: item.get("views")?.as_u64()?,
150        })
151    }
152}
153
/// The pageview time series for one article on one project.
#[derive(Clone, Debug, PartialEq)]
pub struct PageviewsResult {
    /// The project queried, e.g. "de.wikipedia".
    pub project: String,
    /// The article title, with underscores instead of spaces.
    pub article: String,
    /// Granularity the query was made with.
    pub granularity: PageviewsGranularity,
    /// Access filter the query was made with.
    pub access: PageviewsAccess,
    /// Agent filter the query was made with.
    pub agent: PageviewsAgent,
    /// One entry per time slot returned by the API.
    pub entries: Vec<PageviewsParams>,
}
163
164impl PageviewsResult {
165    pub fn total_views(&self) -> u64 {
166        self.entries.iter().map(|r| r.views).sum::<u64>()
167    }
168
169    pub fn len(&self) -> usize {
170        self.entries.len()
171    }
172
173    pub fn is_empty(&self) -> bool {
174        self.len() == 0
175    }
176}
177
/// Client for the Wikimedia Pageviews API. Holds the query filters
/// (granularity, access, agent) shared by every request made through it.
#[derive(Debug, PartialEq)]
pub struct Pageviews {
    granularity: PageviewsGranularity,
    access: PageviewsAccess,
    agent: PageviewsAgent,
}
184
185impl Pageviews {
186    // Returns a `NaiveDate` representing the first day of the month.
187    pub fn month_start(year: i32, month: u32) -> Option<NaiveDate> {
188        NaiveDate::from_ymd_opt(year, month, 1)
189    }
190
191    // Returns a `NaiveDate` representing the last day of the month.
192    pub fn month_end(year: i32, month: u32) -> Option<NaiveDate> {
193        let mut last_day_of_month = NaiveDate::from_ymd_opt(year, month + 1, 1)
194            .or(NaiveDate::from_ymd_opt(year + 1, 1, 1))?;
195        last_day_of_month -= Duration::days(1);
196        Some(last_day_of_month)
197    }
198
199    /// Create a new `Pageviews` object.
200    pub fn new(
201        granularity: PageviewsGranularity,
202        access: PageviewsAccess,
203        agent: PageviewsAgent,
204    ) -> Self {
205        Self {
206            granularity,
207            access,
208            agent,
209        }
210    }
211
212    #[cfg(feature = "tokio")]
213    /// Get pageviews for a single page.
214    /// The result page title will have underscores ("_") instead of spaces.
215    /// This function will automatically retry if the Wikimedia API returns a 429 (throttling) status code.
216    pub async fn get_per_article<S1: Into<String>, S2: Into<String>>(
217        &self,
218        page: S1,
219        project: S2,
220        start: &NaiveDate,
221        end: &NaiveDate,
222    ) -> Result<PageviewsResult, crate::ToolsError> {
223        let project: String = project.into();
224        let page: String = page.into().replace(" ", "_");
225        let url = format!(
226            "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/{access}/{agent}/{page}/{granularity}/{start}/{end}",
227            access = self.access.as_str(),
228            agent = self.agent.as_str(),
229            granularity = self.granularity.as_str(),
230            start = start.format("%Y%m%d"),
231            end = end.format("%Y%m%d"),
232        );
233        let client = crate::ToolsInterface::tokio_client()?;
234        let json: Value;
235        loop {
236            let response = client.get(&url).send().await?;
237            let status = response.status();
238            if status == 429 {
239                // Throttling
240                let delay = response
241                    .headers()
242                    .get("Retry-After")
243                    .and_then(|s| s.to_str().ok())
244                    .and_then(|s| s.parse().ok())
245                    .unwrap_or(DEFAULT_RETRY_DELAY_SECS);
246                tokio::time::sleep(tokio::time::Duration::from_secs(delay)).await;
247                continue;
248            }
249            json = response.json().await?;
250            break;
251        }
252        if json.get("status").is_some() {
253            let message = match json.get("detail") {
254                Some(detail) => match detail.as_str() {
255                    Some(detail_str) => detail_str.to_string(),
256                    None => detail.to_string(), // Not a string, fallback
257                },
258                None => json["status"].to_string(), // We know this exists, fallback
259            };
260            return Err(crate::ToolsError::Tool(message));
261        }
262        let items = json
263            .get("items")
264            .ok_or_else(|| crate::ToolsError::Json("No 'items' in Pageviews JSON".to_string()))?
265            .as_array()
266            .ok_or_else(|| {
267                crate::ToolsError::Json("'items' is not an array in Pageviews JSON".to_string())
268            })?;
269        let ret = PageviewsResult {
270            project,
271            article: page,
272            granularity: self.granularity.to_owned(),
273            access: self.access.to_owned(),
274            agent: self.agent.to_owned(),
275            entries: items
276                .iter()
277                .filter_map(PageviewsParams::from_json)
278                .collect(),
279        };
280        Ok(ret)
281    }
282
283    #[cfg(feature = "tokio")]
284    /// Get pageviews for multiple pages.
285    /// The page titles in the results will have underscores ("_") instead of spaces.
286    /// Use a low `max_concurrent` value to avoid hitting the Wikimedia API rate limits.
287    /// Failed requests will be silently ignored.
288    pub async fn get_multiple_articles(
289        &self,
290        project_pages: &Vec<(String, String)>,
291        start: &NaiveDate,
292        end: &NaiveDate,
293        max_concurrent: usize,
294    ) -> Result<Vec<PageviewsResult>, crate::ToolsError> {
295        let mut futures = Vec::new();
296        for (project, page) in project_pages {
297            let fut = self.get_per_article(page, project, start, end);
298            futures.push(fut);
299        }
300        let stream = futures::stream::iter(futures).buffer_unordered(max_concurrent);
301        let results = stream.collect::<Vec<_>>().await;
302        Ok(results.into_iter().filter_map(|r| r.ok()).collect())
303    }
304
305    // TODO aggregate (all-projects)
306    // TODO top
307    // TODO top-per-country
308}
309
310#[cfg(test)]
311mod tests {
312    use super::*;
313
314    #[test]
315    #[rustfmt::skip]
316    fn test_last_of_month() {
317        assert_eq!(Pageviews::month_end(2021, 1).unwrap().format("%Y-%m-%d").to_string(), "2021-01-31");
318        assert_eq!(Pageviews::month_end(2021, 2).unwrap().format("%Y-%m-%d").to_string(), "2021-02-28");
319        assert_eq!(Pageviews::month_end(2024, 2).unwrap().format("%Y-%m-%d").to_string(), "2024-02-29"); // Leap year
320        assert_eq!(Pageviews::month_end(2021, 3).unwrap().format("%Y-%m-%d").to_string(), "2021-03-31");
321        assert_eq!(Pageviews::month_end(2021, 4).unwrap().format("%Y-%m-%d").to_string(), "2021-04-30");
322        assert_eq!(Pageviews::month_end(2021, 5).unwrap().format("%Y-%m-%d").to_string(), "2021-05-31");
323        assert_eq!(Pageviews::month_end(2021, 6).unwrap().format("%Y-%m-%d").to_string(), "2021-06-30");
324        assert_eq!(Pageviews::month_end(2021, 7).unwrap().format("%Y-%m-%d").to_string(), "2021-07-31");
325        assert_eq!(Pageviews::month_end(2021, 8).unwrap().format("%Y-%m-%d").to_string(), "2021-08-31");
326        assert_eq!(Pageviews::month_end(2021, 9).unwrap().format("%Y-%m-%d").to_string(), "2021-09-30");
327        assert_eq!(Pageviews::month_end(2021, 10).unwrap().format("%Y-%m-%d").to_string(), "2021-10-31");
328        assert_eq!(Pageviews::month_end(2021, 11).unwrap().format("%Y-%m-%d").to_string(), "2021-11-30");
329        assert_eq!(Pageviews::month_end(2021, 12).unwrap().format("%Y-%m-%d").to_string(), "2021-12-31");
330    }
331
332    #[cfg(feature = "tokio")]
333    #[tokio::test]
334    async fn test_pageviews_get_per_article_monthly_async() {
335        let pv = Pageviews::new(
336            PageviewsGranularity::Monthly,
337            PageviewsAccess::All,
338            PageviewsAgent::All,
339        );
340        let result = pv
341            .get_per_article(
342                "Barack_Obama",
343                "de.wikipedia",
344                &Pageviews::month_start(2016, 1).unwrap(),
345                &Pageviews::month_end(2016, 12).unwrap(),
346            )
347            .await
348            .unwrap();
349        assert_eq!(result.len(), 12);
350        assert_eq!(result.total_views(), 1_550_502);
351    }
352
353    #[cfg(feature = "tokio")]
354    #[tokio::test]
355    async fn test_pageviews_get_per_article_daily_async() {
356        let pv = Pageviews::new(
357            PageviewsGranularity::Daily,
358            PageviewsAccess::All,
359            PageviewsAgent::All,
360        );
361        let result = pv
362            .get_per_article(
363                "Barack_Obama",
364                "de.wikipedia",
365                &Pageviews::month_start(2016, 1).unwrap(),
366                &Pageviews::month_end(2016, 1).unwrap(),
367            )
368            .await
369            .unwrap();
370        assert_eq!(result.len(), 31);
371        assert_eq!(result.total_views(), 112_458);
372    }
373
374    #[cfg(feature = "tokio")]
375    #[tokio::test]
376    async fn test_pageviews_get_per_article_bad_date_async() {
377        let pv = Pageviews::new(
378            PageviewsGranularity::Daily,
379            PageviewsAccess::All,
380            PageviewsAgent::All,
381        );
382        let result = pv
383            .get_per_article(
384                "Barack_Obama",
385                "de.wikipedia",
386                &Pageviews::month_start(1016, 1).unwrap(),
387                &Pageviews::month_end(1016, 1).unwrap(),
388            )
389            .await;
390        assert!(result.is_err());
391    }
392
393    #[cfg(feature = "tokio")]
394    #[tokio::test]
395    async fn test_pageviews_multiple_articles_async() {
396        let pv = Pageviews::new(
397            PageviewsGranularity::Monthly,
398            PageviewsAccess::All,
399            PageviewsAgent::All,
400        );
401        let project_pages = [
402            ("de.wikipedia", "Barack Obama"),
403            ("de.wikipedia", "Trude Herr"),
404        ]
405        .into_iter()
406        .map(|(a, b)| (a.into(), b.into()))
407        .collect();
408        let results = pv
409            .get_multiple_articles(
410                &project_pages,
411                &Pageviews::month_start(2016, 1).unwrap(),
412                &Pageviews::month_end(2016, 12).unwrap(),
413                5,
414            )
415            .await
416            .unwrap();
417        assert_eq!(results.len(), 2);
418        let overall_views: u64 = results.iter().map(|r| r.total_views()).sum();
419        assert_eq!(overall_views, 1_670_723);
420    }
421
422    #[test]
423    fn test_pageviews_timestamp() {
424        let time_string = "2345123159";
425        let ts: PageviewsTimestamp = time_string.into();
426        let ts: String = ts.into();
427        assert_eq!(ts, time_string);
428    }
429}