Skip to main content

ngdp_bpsv/
interned_document.rs

1//! BPSV document with string interning for memory efficiency
2
3use crate::error::Result;
4use crate::interner::{InternedValue, StringInterner};
5use crate::schema::BpsvSchema;
6use crate::value::BpsvValue;
7use std::sync::Arc;
8
9/// A BPSV document that uses string interning to reduce memory usage
10///
11/// This is particularly effective for config files where the same values
12/// appear repeatedly across rows (e.g., region codes, server names, etc.)
13#[derive(Debug, Clone)]
14pub struct InternedBpsvDocument {
15    /// The schema for this document
16    schema: Arc<BpsvSchema>,
17    /// Rows of interned values
18    rows: Vec<InternedRow>,
19    /// Sequence number from the document
20    sequence_number: Option<u32>,
21    /// String interner used by this document
22    interner: StringInterner,
23}
24
25/// A row of interned values
26#[derive(Debug, Clone)]
27pub struct InternedRow {
28    values: Vec<InternedValue>,
29}
30
31impl InternedBpsvDocument {
32    /// Create a new interned document from a regular document
33    pub fn from_document(doc: crate::document::BpsvDocument<'_>) -> Self {
34        let interner = StringInterner::with_capacity(100);
35        let mut interned_rows = Vec::with_capacity(doc.rows().len());
36
37        // Save the schema and sequence number before consuming the document
38        let schema = Arc::new(doc.schema().clone());
39        let sequence_number = doc.sequence_number();
40
41        // Convert each row to use interned strings
42        for row in doc.into_owned_rows() {
43            let mut interned_values = Vec::with_capacity(row.len());
44
45            // Parse typed values if needed
46            let typed_values = if let Some(typed) = row.typed_values {
47                typed
48            } else {
49                // Parse raw values to typed
50                let mut typed = Vec::new();
51                for (value, field) in row.raw_values.iter().zip(schema.fields()) {
52                    if let Ok(typed_value) = BpsvValue::parse(value, &field.field_type) {
53                        typed.push(typed_value);
54                    } else {
55                        typed.push(BpsvValue::Empty);
56                    }
57                }
58                typed
59            };
60
61            // Intern the typed values
62            for value in typed_values {
63                interned_values.push(InternedValue::from_bpsv_value(value, &interner));
64            }
65
66            interned_rows.push(InternedRow {
67                values: interned_values,
68            });
69        }
70
71        Self {
72            schema,
73            rows: interned_rows,
74            sequence_number,
75            interner,
76        }
77    }
78
79    /// Parse and create an interned document directly from BPSV data
80    pub fn parse(data: &str) -> Result<Self> {
81        let doc = crate::document::BpsvDocument::parse(data)?;
82        Ok(Self::from_document(doc))
83    }
84
85    /// Get the schema
86    pub fn schema(&self) -> &BpsvSchema {
87        &self.schema
88    }
89
90    /// Get all rows
91    pub fn rows(&self) -> &[InternedRow] {
92        &self.rows
93    }
94
95    /// Get the sequence number
96    pub fn sequence_number(&self) -> Option<u32> {
97        self.sequence_number
98    }
99
100    /// Get memory statistics for this document
101    pub fn memory_stats(&self) -> crate::interner::MemoryStats {
102        self.interner.memory_usage()
103    }
104
105    /// Get the interner hit rate
106    pub fn interner_hit_rate(&self) -> f64 {
107        self.interner.hit_rate()
108    }
109
110    /// Find rows where a field matches a value
111    pub fn find_rows(&self, field_name: &str, value: &str) -> Vec<&InternedRow> {
112        let field_index = match self.schema.get_field(field_name) {
113            Some(field) => field.index,
114            None => return vec![],
115        };
116
117        self.rows
118            .iter()
119            .filter(|row| {
120                row.values
121                    .get(field_index)
122                    .and_then(|v| v.as_str())
123                    .map(|s| s == value)
124                    .unwrap_or(false)
125            })
126            .collect()
127    }
128
129    /// Get a specific row by index
130    pub fn get_row(&self, index: usize) -> Option<&InternedRow> {
131        self.rows.get(index)
132    }
133
134    /// Get the number of rows
135    pub fn row_count(&self) -> usize {
136        self.rows.len()
137    }
138
139    /// Check if the document is empty
140    pub fn is_empty(&self) -> bool {
141        self.rows.is_empty()
142    }
143}
144
145impl InternedRow {
146    /// Get a value by field index
147    pub fn get(&self, index: usize) -> Option<&InternedValue> {
148        self.values.get(index)
149    }
150
151    /// Get a value by field name
152    pub fn get_by_name(&self, field_name: &str, schema: &BpsvSchema) -> Option<&InternedValue> {
153        schema
154            .get_field(field_name)
155            .and_then(|field| self.get(field.index))
156    }
157
158    /// Get all values
159    pub fn values(&self) -> &[InternedValue] {
160        &self.values
161    }
162
163    /// Get the number of values
164    pub fn len(&self) -> usize {
165        self.values.len()
166    }
167
168    /// Check if the row is empty
169    pub fn is_empty(&self) -> bool {
170        self.values.is_empty()
171    }
172}