progscrape_application/persist/memindex.rs

use std::collections::HashMap;

use itertools::Itertools;

use progscrape_scrapers::{ScrapeCollection, StoryUrlNorm};

use super::{shard::Shard, *};
/// Builds an in-memory index of stories, useful for pre-aggregating scrapes into collections keyed by normalized URL.
#[derive(Default, Serialize, Deserialize)]
pub struct MemIndex {
    /// A map from year/month shard, to normalized story URL, to a collection of scrapes keyed by source/ID.
    stories: HashMap<Shard, HashMap<StoryUrlNorm, ScrapeCollection>>,
}
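
// A minimal usage sketch (assuming the sample scrape data from
// `progscrape_scrapers` is available): build an index, then iterate the
// merged collections.
//
//     let scrapes = progscrape_scrapers::load_sample_scrapes(&ScrapeConfig::default());
//     let mut index = MemIndex::default();
//     index.insert_scrapes(scrapes)?;
//     for collection in index.get_all_stories() {
//         // each `collection` groups all scrapes sharing a normalized URL
//     }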

impl MemIndex {
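    /// Consumes the index, returning every scrape collection ordered by
    /// shard, oldest first (reverse the iterator for newest-first).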
    pub fn get_all_stories(self) -> impl DoubleEndedIterator<Item = ScrapeCollection> {
        let mut out = vec![];
        for (_shard, stories) in self.stories.into_iter().sorted_by_cached_key(|f| f.0) {
            for (_, story) in stories {
                out.push(story);
            }
        }
        out.into_iter()
    }

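    /// Returns a mutable reference to the map for the given shard, creating
    /// it if it does not yet exist.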
    fn map_mut(&mut self, shard: Shard) -> &mut HashMap<StoryUrlNorm, ScrapeCollection> {
        self.stories.entry(shard).or_default()
    }

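    /// Returns the map for the given shard, if one exists.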
    fn map(&self, shard: &Shard) -> Option<&HashMap<StoryUrlNorm, ScrapeCollection>> {
        self.stories.get(shard)
    }

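    /// Inserts a batch of scrapes, merging each into an existing story when
    /// the same normalized URL is found within a ±2-month shard window.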
    pub fn insert_scrapes<I: IntoIterator<Item = TypedScrape>>(
        &mut self,
        scrapes: I,
    ) -> Result<(), PersistError> {
        'outer: for scrape in scrapes {
            let date = Shard::from_date_time(scrape.date);
            let normalized_url = scrape.url.normalization();
            // Try to pin this scrape to an existing story: look for the same
            // normalized URL in any shard within two months of this one
            for n in -2..=2 {
                let map0 = self.map_mut(date.plus_months(n));
                if let Some((key, mut scrapes)) = map0.remove_entry(normalized_url) {
                    // Merge, then re-insert the collection under the shard of
                    // its earliest scrape, which may have changed
                    scrapes.merge(scrape);
                    self.map_mut(Shard::from_date_time(scrapes.earliest))
                        .insert(key, scrapes);
                    continue 'outer;
                }
            }

            // No existing story matched; start a new collection in this shard
            if let Some(_old) = self.map_mut(date).insert(
                normalized_url.clone(),
                ScrapeCollection::new_from_one(scrape),
            ) {
                // TODO: We need to merge duplicate scrapes
                println!("Unexpected duplicate story");
            }
        }
        Ok(())
    }
}

#[cfg(test)]
mod test {

    use progscrape_scrapers::ScrapeConfig;

    use super::*;

    #[test]
    fn test_year_month() {
        let date = Shard::from_year_month(2000, 12);
        assert_eq!(Shard::from_year_month(2001, 1), date.plus_months(1));
        assert_eq!(Shard::from_year_month(2001, 12), date.plus_months(12));
        assert_eq!(Shard::from_year_month(1999, 12), date.sub_months(12));
        assert_eq!(Shard::from_year_month(2000, 1), date.sub_months(11));

        assert_eq!(
            date,
            Shard::from_string(&date.to_string()).expect("Failed to parse")
        );
    }

    #[test]
    fn test_index_lots() {
        let stories = progscrape_scrapers::load_sample_scrapes(&ScrapeConfig::default());
        let mut index = MemIndex::default();

        let _eval = StoryEvaluator::new_for_test();
        index
            .insert_scrapes(stories)
            .expect("Failed to insert scrapes");
    }
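
    /// A minimal round-trip sketch (assumes the sample data contains at
    /// least one scrape): after insertion, `get_all_stories` should yield
    /// at least one merged collection.
    #[test]
    fn test_insert_then_iterate() {
        let stories = progscrape_scrapers::load_sample_scrapes(&ScrapeConfig::default());
        let mut index = MemIndex::default();
        index
            .insert_scrapes(stories)
            .expect("Failed to insert scrapes");
        assert!(
            index.get_all_stories().next().is_some(),
            "expected at least one story collection"
        );
    }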
}