// wme-stream 0.1.1
//
// Streaming utilities for the Wikimedia Enterprise API
// Documentation
//! Visitor trait for article processing.
//!
//! The visitor pattern allows processing articles incrementally without
//! fully materializing them in memory. This is useful for:
//!
//! - **Graph building** - Extract nodes and edges without storing articles
//! - **Statistics** - Count occurrences without keeping articles
//! - **Filtering** - Selectively process specific fields
//! - **Large datasets** - Process snapshots larger than available RAM
//!
//! # How It Works
//!
//! The visitor is called for each component of an article:
//! 1. `visit_article_start()` - New article begins
//! 2. `visit_category()`, `visit_link()`, etc. - Components discovered
//! 3. `visit_article_end()` - Article complete
//!
//! # Implementing a Visitor
//!
//! ```rust
//! use wme_stream::ArticleVisitor;
//! use serde_json::Value;
//!
//! struct MyVisitor {
//!     article_count: u64,
//!     category_count: u64,
//! }
//!
//! impl ArticleVisitor for MyVisitor {
//!     fn visit_article_start(&mut self, _id: u64, _name: &str) {
//!         self.article_count += 1;
//!     }
//!
//!     fn visit_category(&mut self, _name: &str, _url: &str) {
//!         self.category_count += 1;
//!     }
//!
//!     fn visit_link(&mut self, _text: &str, _url: &str) {}
//!     fn visit_infobox(&mut self, _name: &str, _value: &str) {}
//!     fn visit_reference(&mut self, _id: &str, _ref_type: &str, _metadata: &Value) {}
//!     fn visit_article_end(&mut self) {}
//! }
//! ```
//!
//! # Included Visitors
//!
//! - `NoOpVisitor` - Does nothing (useful for testing)
//! - `StatsVisitor` - Counts articles, categories, links, etc.

use serde_json::Value;

/// Visitor trait for extracting data without full article materialization.
///
/// This trait allows incremental processing of articles, enabling
/// graph building, statistics collection, and other use cases without
/// keeping full articles in memory.
///
/// For each article the callbacks arrive in this order:
/// `visit_article_start`, then any number of component callbacks
/// (`visit_category`, `visit_link`, `visit_infobox`, `visit_reference`),
/// then `visit_article_end`.
///
/// # Usage
///
/// Every method has a default body that does nothing, so an implementation
/// only spells out the callbacks it needs:
/// - Override methods for components you care about
/// - Omit the others; they stay no-ops
/// - Store state in your struct fields
///
/// # Example
///
/// ```rust
/// use wme_stream::ArticleVisitor;
///
/// struct CategoryCollector {
///     categories: Vec<String>,
/// }
///
/// impl ArticleVisitor for CategoryCollector {
///     fn visit_category(&mut self, name: &str, _url: &str) {
///         self.categories.push(name.to_string());
///     }
/// }
///
/// let mut collector = CategoryCollector { categories: Vec::new() };
/// collector.visit_article_start(1, "Example");
/// collector.visit_category("Category:Living people", "https://example.org/c");
/// collector.visit_link("ignored", "https://example.org/l");
/// collector.visit_article_end();
///
/// assert_eq!(collector.categories, ["Category:Living people"]);
/// ```
// The default bodies intentionally ignore their arguments. The parameters keep
// their plain names (no `_` prefix) so the rendered signatures stay readable.
#[allow(unused_variables)]
pub trait ArticleVisitor {
    /// Called when starting to process an article.
    ///
    /// The default implementation does nothing.
    ///
    /// # Arguments
    ///
    /// * `id` - Article identifier
    /// * `name` - Article title
    fn visit_article_start(&mut self, id: u64, name: &str) {}

    /// Called when encountering a category.
    ///
    /// The default implementation does nothing.
    ///
    /// # Arguments
    ///
    /// * `name` - Category name (e.g., "Category:Living people")
    /// * `url` - Category URL
    fn visit_category(&mut self, name: &str, url: &str) {}

    /// Called when encountering a link.
    ///
    /// The default implementation does nothing.
    ///
    /// # Arguments
    ///
    /// * `text` - Link text
    /// * `url` - Link URL
    fn visit_link(&mut self, text: &str, url: &str) {}

    /// Called when encountering an infobox field.
    ///
    /// The default implementation does nothing.
    ///
    /// # Arguments
    ///
    /// * `name` - Field name
    /// * `value` - Field value
    fn visit_infobox(&mut self, name: &str, value: &str) {}

    /// Called when encountering a reference.
    ///
    /// The default implementation does nothing.
    ///
    /// # Arguments
    ///
    /// * `id` - Reference identifier
    /// * `ref_type` - Reference type ("web", "book", "text")
    /// * `metadata` - Reference metadata (author, title, URL, etc.)
    fn visit_reference(&mut self, id: &str, ref_type: &str, metadata: &Value) {}

    /// Called when finished processing an article.
    ///
    /// Use this to finalize per-article processing.
    /// The default implementation does nothing.
    fn visit_article_end(&mut self) {}
}

/// A visitor whose callbacks all do nothing.
///
/// Useful for:
/// - Testing
/// - Benchmarking
/// - As a stand-in wherever a visitor is required but its callbacks are
///   irrelevant
///
/// The type is a zero-sized unit struct, so it is freely copyable and can be
/// created with either `NoOpVisitor` or `NoOpVisitor::default()`.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, NoOpVisitor};
///
/// let mut visitor = NoOpVisitor;
/// // All methods are no-ops
/// visitor.visit_article_start(1, "Example");
/// visitor.visit_article_end();
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct NoOpVisitor;

// Every callback discards its arguments and returns immediately.
impl ArticleVisitor for NoOpVisitor {
    fn visit_article_start(&mut self, _: u64, _: &str) {}

    fn visit_category(&mut self, _: &str, _: &str) {}

    fn visit_link(&mut self, _: &str, _: &str) {}

    fn visit_infobox(&mut self, _: &str, _: &str) {}

    fn visit_reference(&mut self, _: &str, _: &str, _: &Value) {}

    fn visit_article_end(&mut self) {}
}

/// A visitor that collects counts only (low memory footprint).
///
/// Tracks aggregate statistics without storing individual articles.
/// Useful for getting overview statistics of a snapshot.
///
/// Each counter records how many times the matching callback was invoked,
/// so the whole state is five `u64` values regardless of input size.
///
/// # Example
///
/// ```rust
/// use wme_stream::{ArticleVisitor, StatsVisitor};
///
/// let mut visitor = StatsVisitor::new();
///
/// // Simulate visiting articles
/// for i in 0..100u64 {
///     visitor.visit_article_start(i, "Article");
///     visitor.visit_category("Category:Test", "https://en.wikipedia.org/wiki/Category:Test");
///     visitor.visit_link("Link", "https://en.wikipedia.org/wiki/Link");
///     visitor.visit_article_end();
/// }
///
/// assert_eq!(visitor.article_count, 100);
/// assert_eq!(visitor.category_count, 100);
/// assert_eq!(visitor.link_count, 100);
/// ```
// `Default` is implemented by hand elsewhere in this file, so it is not derived here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StatsVisitor {
    /// Total articles processed (one per `visit_article_start` call)
    pub article_count: u64,
    /// Total categories found (one per `visit_category` call)
    pub category_count: u64,
    /// Total links found (one per `visit_link` call)
    pub link_count: u64,
    /// Total infobox fields found (one per `visit_infobox` call, which is
    /// invoked per field rather than per infobox)
    pub infobox_count: u64,
    /// Total references found (one per `visit_reference` call)
    pub reference_count: u64,
}

impl StatsVisitor {
    /// Create a new stats visitor with all counts at zero.
    ///
    /// This is a `const fn`, so it can also initialise constants and statics.
    ///
    /// # Example
    ///
    /// ```rust
    /// use wme_stream::StatsVisitor;
    ///
    /// let visitor = StatsVisitor::new();
    /// assert_eq!(visitor.article_count, 0);
    /// assert_eq!(visitor.category_count, 0);
    ///
    /// const EMPTY: StatsVisitor = StatsVisitor::new();
    /// assert_eq!(EMPTY.reference_count, 0);
    /// ```
    #[must_use]
    pub const fn new() -> Self {
        Self {
            article_count: 0,
            category_count: 0,
            link_count: 0,
            infobox_count: 0,
            reference_count: 0,
        }
    }
}

// Each callback bumps exactly one counter; the arguments are never inspected.
impl ArticleVisitor for StatsVisitor {
    /// Counts one article per start-of-article event.
    fn visit_article_start(&mut self, _: u64, _: &str) {
        self.article_count += 1;
    }

    /// Counts one category per call.
    fn visit_category(&mut self, _: &str, _: &str) {
        self.category_count += 1;
    }

    /// Counts one link per call.
    fn visit_link(&mut self, _: &str, _: &str) {
        self.link_count += 1;
    }

    /// Counts one infobox field per call.
    fn visit_infobox(&mut self, _: &str, _: &str) {
        self.infobox_count += 1;
    }

    /// Counts one reference per call.
    fn visit_reference(&mut self, _: &str, _: &str, _: &Value) {
        self.reference_count += 1;
    }

    /// Nothing to finalize: the counters are updated as events arrive.
    fn visit_article_end(&mut self) {}
}

impl Default for StatsVisitor {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every `NoOpVisitor` callback must be callable without panicking.
    #[test]
    fn test_noop_visitor() {
        let mut noop = NoOpVisitor;

        noop.visit_article_start(1, "Test");
        noop.visit_category("Cat", "https://example.com");
        noop.visit_link("Link", "https://example.com");
        noop.visit_infobox("Name", "Value");
        noop.visit_reference("ref1", "web", &Value::Null);
        noop.visit_article_end();
    }

    /// A freshly constructed `StatsVisitor` has every counter at zero.
    #[test]
    fn test_stats_visitor_new() {
        let StatsVisitor {
            article_count,
            category_count,
            link_count,
            infobox_count,
            reference_count,
        } = StatsVisitor::new();

        assert_eq!(article_count, 0);
        assert_eq!(category_count, 0);
        assert_eq!(link_count, 0);
        assert_eq!(infobox_count, 0);
        assert_eq!(reference_count, 0);
    }

    /// Feeds ten synthetic articles through the visitor and checks the totals.
    #[test]
    fn test_stats_visitor_counts() {
        const ARTICLES: u64 = 10;
        let mut stats = StatsVisitor::new();

        for article_id in 0..ARTICLES {
            let title = format!("Article {}", article_id);
            stats.visit_article_start(article_id, &title);

            // Two categories per article.
            for suffix in ["A", "B"] {
                let name = format!("Category:{}", suffix);
                let url = format!("https://example.com/{}", suffix);
                stats.visit_category(&name, &url);
            }

            // Three links per article.
            for n in 1..=3 {
                let text = format!("Link{}", n);
                let url = format!("https://example.com/{}", n);
                stats.visit_link(&text, &url);
            }

            // One infobox callback per article.
            stats.visit_infobox("Name", "Value");

            // Two references per article, with consecutive ids.
            let first_ref = format!("ref{}", article_id * 2);
            let second_ref = format!("ref{}", article_id * 2 + 1);
            stats.visit_reference(&first_ref, "web", &Value::Null);
            stats.visit_reference(&second_ref, "book", &Value::Null);

            stats.visit_article_end();
        }

        assert_eq!(stats.article_count, ARTICLES);
        assert_eq!(stats.category_count, ARTICLES * 2);
        assert_eq!(stats.link_count, ARTICLES * 3);
        assert_eq!(stats.infobox_count, ARTICLES);
        assert_eq!(stats.reference_count, ARTICLES * 2);
    }

    /// `Default` yields the same zeroed state as `new`.
    #[test]
    fn test_stats_visitor_default() {
        let stats = StatsVisitor::default();
        assert_eq!(stats.article_count, 0);
        assert_eq!(stats.category_count, 0);
    }

    /// Hand-written visitor that records the last article seen and every
    /// category name, used to check that user implementations receive the
    /// callback arguments.
    struct RecordingVisitor {
        article_id: Option<u64>,
        article_name: Option<String>,
        category_names: Vec<String>,
    }

    impl ArticleVisitor for RecordingVisitor {
        fn visit_article_start(&mut self, id: u64, name: &str) {
            self.article_id = Some(id);
            self.article_name = Some(name.to_owned());
        }

        fn visit_category(&mut self, name: &str, _: &str) {
            self.category_names.push(name.to_owned());
        }

        fn visit_link(&mut self, _: &str, _: &str) {}
        fn visit_infobox(&mut self, _: &str, _: &str) {}
        fn visit_reference(&mut self, _: &str, _: &str, _: &Value) {}
        fn visit_article_end(&mut self) {}
    }

    #[test]
    fn test_custom_visitor() {
        let mut recorder = RecordingVisitor {
            article_id: None,
            article_name: None,
            category_names: Vec::new(),
        };

        recorder.visit_article_start(42, "Test Article");
        recorder.visit_category("Category:Test", "https://example.com");
        recorder.visit_article_end();

        assert_eq!(recorder.article_id, Some(42));
        assert_eq!(recorder.article_name.as_deref(), Some("Test Article"));
        assert_eq!(recorder.category_names, vec!["Category:Test"]);
    }
}