// halldyll_core/storage/normalized.rs

//! Normalized - Storage of normalized documents
2
3use std::collections::HashMap;
4use std::sync::RwLock;
5use url::Url;
6
7use crate::types::Document;
8
9/// Store for normalized documents
10pub struct NormalizedStore {
11    /// Documents by URL
12    documents: RwLock<HashMap<String, Document>>,
13    /// Size limit
14    max_documents: usize,
15}
16
17impl Default for NormalizedStore {
18    fn default() -> Self {
19        Self::new(10000)
20    }
21}
22
23impl NormalizedStore {
24    /// New store
25    pub fn new(max_documents: usize) -> Self {
26        Self {
27            documents: RwLock::new(HashMap::new()),
28            max_documents,
29        }
30    }
31
32    /// Store a document
33    pub fn store(&self, document: Document) {
34        let url_key = document.source_url.to_string();
35
36        // Check the limit
37        {
38            let documents = self.documents.read().unwrap();
39            if documents.len() >= self.max_documents {
40                drop(documents);
41                let mut documents = self.documents.write().unwrap();
42                
43                // Remove 10% of the oldest
44                let to_remove: Vec<_> = documents
45                    .iter()
46                    .take(self.max_documents / 10)
47                    .map(|(k, _)| k.clone())
48                    .collect();
49                for k in to_remove {
50                    documents.remove(&k);
51                }
52            }
53        }
54
55        self.documents.write().unwrap().insert(url_key, document);
56    }
57
58    /// Get a document by URL
59    pub fn get(&self, url: &Url) -> Option<Document> {
60        let url_key = url.to_string();
61        self.documents.read().unwrap().get(&url_key).cloned()
62    }
63
64    /// Check if a document exists
65    pub fn has(&self, url: &Url) -> bool {
66        let url_key = url.to_string();
67        self.documents.read().unwrap().contains_key(&url_key)
68    }
69
70    /// Number of documents
71    pub fn len(&self) -> usize {
72        self.documents.read().unwrap().len()
73    }
74
75    /// Is store empty?
76    pub fn is_empty(&self) -> bool {
77        self.documents.read().unwrap().is_empty()
78    }
79
80    /// Clear the store
81    pub fn clear(&self) {
82        self.documents.write().unwrap().clear();
83    }
84
85    /// Iterate over all documents
86    pub fn iter<F>(&self, mut f: F)
87    where
88        F: FnMut(&Document),
89    {
90        let documents = self.documents.read().unwrap();
91        for doc in documents.values() {
92            f(doc);
93        }
94    }
95
96    /// Export all documents
97    pub fn export_all(&self) -> Vec<Document> {
98        self.documents.read().unwrap().values().cloned().collect()
99    }
100}