ifc_lite_core/
decoder.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5//! Entity Decoder - On-demand entity parsing
6//!
//! Lazily decode IFC entities from byte offsets without parsing the entire file up front.
8
9use crate::error::{Error, Result};
10use crate::parser::parse_entity;
11use crate::schema_gen::{AttributeValue, DecodedEntity};
12use rustc_hash::FxHashMap;
13use std::sync::Arc;
14
15/// Pre-built entity index type
16pub type EntityIndex = FxHashMap<u32, (usize, usize)>;
17
18/// Build entity index from content - O(n) scan using SIMD-accelerated search
19/// Returns index mapping entity IDs to byte offsets
20#[inline]
21pub fn build_entity_index(content: &str) -> EntityIndex {
22    let bytes = content.as_bytes();
23    let len = bytes.len();
24
25    // Pre-allocate with estimated capacity (roughly 1 entity per 50 bytes)
26    let estimated_entities = len / 50;
27    let mut index = FxHashMap::with_capacity_and_hasher(estimated_entities, Default::default());
28
29    let mut pos = 0;
30
31    while pos < len {
32        // Find next '#' using SIMD-accelerated search
33        let remaining = &bytes[pos..];
34        let hash_offset = match memchr::memchr(b'#', remaining) {
35            Some(offset) => offset,
36            None => break,
37        };
38
39        let start = pos + hash_offset;
40        pos = start + 1;
41
42        // Parse entity ID (inline for speed)
43        let id_start = pos;
44        while pos < len && bytes[pos].is_ascii_digit() {
45            pos += 1;
46        }
47        let id_end = pos;
48
49        // Skip whitespace before '=' (handles both `#45=` and `#45 = ` formats)
50        while pos < len && bytes[pos].is_ascii_whitespace() {
51            pos += 1;
52        }
53
54        if id_end > id_start && pos < len && bytes[pos] == b'=' {
55            // Fast integer parsing without allocation
56            let id = parse_u32_inline(bytes, id_start, id_end);
57
58            // Find end of entity (;) using SIMD
59            let entity_content = &bytes[pos..];
60            if let Some(semicolon_offset) = memchr::memchr(b';', entity_content) {
61                pos += semicolon_offset + 1; // Include semicolon
62                index.insert(id, (start, pos));
63            } else {
64                break; // No semicolon found, malformed
65            }
66        }
67    }
68
69    index
70}
71
/// Convert the ASCII digits in `bytes[start..end]` to a `u32` without
/// allocating.
///
/// Nothing is validated here: the caller is expected to pass a range that
/// holds only `b'0'..=b'9'`. Arithmetic wraps on overflow instead of
/// panicking, and an empty range yields `0`.
#[inline]
fn parse_u32_inline(bytes: &[u8], start: usize, end: usize) -> u32 {
    (start..end).fold(0u32, |acc, idx| {
        let digit = u32::from(bytes[idx].wrapping_sub(b'0'));
        acc.wrapping_mul(10).wrapping_add(digit)
    })
}
82
83/// Entity decoder for lazy parsing - uses Arc for efficient cache sharing
84pub struct EntityDecoder<'a> {
85    content: &'a str,
86    /// Cache of decoded entities (entity_id -> Arc<DecodedEntity>)
87    /// Using Arc avoids expensive clones on cache hits
88    cache: FxHashMap<u32, Arc<DecodedEntity>>,
89    /// Index of entity offsets (entity_id -> (start, end))
90    /// Can be pre-built or built lazily
91    entity_index: Option<EntityIndex>,
92}
93
94impl<'a> EntityDecoder<'a> {
95    /// Create new decoder
96    pub fn new(content: &'a str) -> Self {
97        Self {
98            content,
99            cache: FxHashMap::default(),
100            entity_index: None,
101        }
102    }
103
104    /// Create decoder with pre-built index (faster for repeated lookups)
105    pub fn with_index(content: &'a str, index: EntityIndex) -> Self {
106        Self {
107            content,
108            cache: FxHashMap::default(),
109            entity_index: Some(index),
110        }
111    }
112
113    /// Build entity index for O(1) lookups
114    /// This scans the file once and maps entity IDs to byte offsets
115    fn build_index(&mut self) {
116        if self.entity_index.is_some() {
117            return; // Already built
118        }
119        self.entity_index = Some(build_entity_index(self.content));
120    }
121
122    /// Decode entity at byte offset
123    /// Returns cached entity if already decoded
124    #[inline]
125    pub fn decode_at(&mut self, start: usize, end: usize) -> Result<DecodedEntity> {
126        let line = &self.content[start..end];
127        let (id, ifc_type, tokens) = parse_entity(line).map_err(|e| {
128            // Add debug info about what failed to parse
129            Error::parse(0, format!("Failed to parse entity: {:?}, input: {:?}", e, &line[..line.len().min(100)]))
130        })?;
131
132        // Check cache first - return clone of inner DecodedEntity
133        if let Some(entity_arc) = self.cache.get(&id) {
134            return Ok(entity_arc.as_ref().clone());
135        }
136
137        // Convert tokens to AttributeValues
138        let attributes = tokens
139            .iter()
140            .map(|token| AttributeValue::from_token(token))
141            .collect();
142
143        let entity = DecodedEntity::new(id, ifc_type, attributes);
144        self.cache.insert(id, Arc::new(entity.clone()));
145        Ok(entity)
146    }
147
148    /// Decode entity by ID - O(1) lookup using entity index
149    #[inline]
150    pub fn decode_by_id(&mut self, entity_id: u32) -> Result<DecodedEntity> {
151        // Check cache first - return clone of inner DecodedEntity
152        if let Some(entity_arc) = self.cache.get(&entity_id) {
153            return Ok(entity_arc.as_ref().clone());
154        }
155
156        // Build index if not already built
157        self.build_index();
158
159        // O(1) lookup in index
160        let (start, end) = self.entity_index
161            .as_ref()
162            .and_then(|idx| idx.get(&entity_id).copied())
163            .ok_or_else(|| Error::parse(0, format!("Entity #{} not found", entity_id)))?;
164
165        self.decode_at(start, end)
166    }
167
168    /// Resolve entity reference (follow #ID)
169    /// Returns None for null/derived values
170    #[inline]
171    pub fn resolve_ref(&mut self, attr: &AttributeValue) -> Result<Option<DecodedEntity>> {
172        match attr.as_entity_ref() {
173            Some(id) => Ok(Some(self.decode_by_id(id)?)),
174            None => Ok(None),
175        }
176    }
177
178    /// Resolve list of entity references
179    pub fn resolve_ref_list(
180        &mut self,
181        attr: &AttributeValue,
182    ) -> Result<Vec<DecodedEntity>> {
183        let list = attr
184            .as_list()
185            .ok_or_else(|| Error::parse(0, "Expected list".to_string()))?;
186
187        let mut entities = Vec::with_capacity(list.len());
188        for item in list {
189            if let Some(id) = item.as_entity_ref() {
190                entities.push(self.decode_by_id(id)?);
191            }
192        }
193        Ok(entities)
194    }
195
196    /// Get cached entity (without decoding)
197    pub fn get_cached(&self, entity_id: u32) -> Option<DecodedEntity> {
198        self.cache.get(&entity_id).map(|arc| arc.as_ref().clone())
199    }
200
201    /// Clear cache to free memory
202    pub fn clear_cache(&mut self) {
203        self.cache.clear();
204    }
205
206    /// Get cache size
207    pub fn cache_size(&self) -> usize {
208        self.cache.len()
209    }
210
211    /// Get raw bytes for an entity (for direct/fast parsing)
212    /// Returns the full entity line including type and attributes
213    #[inline]
214    pub fn get_raw_bytes(&mut self, entity_id: u32) -> Option<&'a [u8]> {
215        self.build_index();
216        let (start, end) = self.entity_index.as_ref()?.get(&entity_id).copied()?;
217        Some(self.content[start..end].as_bytes())
218    }
219
220    /// Get raw content string for an entity
221    #[inline]
222    pub fn get_raw_content(&mut self, entity_id: u32) -> Option<&'a str> {
223        self.build_index();
224        let (start, end) = self.entity_index.as_ref()?.get(&entity_id).copied()?;
225        Some(&self.content[start..end])
226    }
227
228    /// Fast extraction of entity reference IDs from a list attribute in raw bytes
229    /// Useful for getting face list from ClosedShell, bounds from Face, etc.
230    /// Returns list of entity IDs
231    #[inline]
232    pub fn get_entity_ref_list_fast(&mut self, entity_id: u32) -> Option<Vec<u32>> {
233        let bytes = self.get_raw_bytes(entity_id)?;
234
235        // Pattern: IFCTYPE((#id1,#id2,...)); or IFCTYPE((#id1,#id2,...),other);
236        let mut i = 0;
237        let len = bytes.len();
238
239        // Skip to first '(' after '='
240        while i < len && bytes[i] != b'(' {
241            i += 1;
242        }
243        if i >= len {
244            return None;
245        }
246        i += 1; // Skip first '('
247
248        // Skip to second '(' for the list
249        while i < len && bytes[i] != b'(' {
250            i += 1;
251        }
252        if i >= len {
253            return None;
254        }
255        i += 1; // Skip second '('
256
257        // Parse entity IDs
258        let mut ids = Vec::with_capacity(32);
259
260        while i < len {
261            // Skip whitespace and commas
262            while i < len && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r') {
263                i += 1;
264            }
265
266            if i >= len || bytes[i] == b')' {
267                break;
268            }
269
270            // Expect '#' followed by number
271            if bytes[i] == b'#' {
272                i += 1;
273                let start = i;
274                while i < len && bytes[i].is_ascii_digit() {
275                    i += 1;
276                }
277                if i > start {
278                    // Fast integer parsing directly from ASCII digits
279                    let mut id = 0u32;
280                    for &b in &bytes[start..i] {
281                        id = id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
282                    }
283                    ids.push(id);
284                }
285            } else {
286                i += 1; // Skip unknown character
287            }
288        }
289
290        if ids.is_empty() {
291            None
292        } else {
293            Some(ids)
294        }
295    }
296
297    /// Fast extraction of PolyLoop point IDs directly from raw bytes
298    /// Bypasses full entity decoding for BREP optimization
299    /// Returns list of entity IDs for CartesianPoints
300    #[inline]
301    pub fn get_polyloop_point_ids_fast(&mut self, entity_id: u32) -> Option<Vec<u32>> {
302        let bytes = self.get_raw_bytes(entity_id)?;
303
304        // IFCPOLYLOOP((#id1,#id2,#id3,...));
305        let mut i = 0;
306        let len = bytes.len();
307
308        // Skip to first '(' after '='
309        while i < len && bytes[i] != b'(' {
310            i += 1;
311        }
312        if i >= len {
313            return None;
314        }
315        i += 1; // Skip first '('
316
317        // Skip to second '(' for the point list
318        while i < len && bytes[i] != b'(' {
319            i += 1;
320        }
321        if i >= len {
322            return None;
323        }
324        i += 1; // Skip second '('
325
326        // Parse point IDs
327        let mut point_ids = Vec::with_capacity(8); // Most faces have 3-8 vertices
328
329        while i < len {
330            // Skip whitespace and commas
331            while i < len && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r') {
332                i += 1;
333            }
334
335            if i >= len || bytes[i] == b')' {
336                break;
337            }
338
339            // Expect '#' followed by number
340            if bytes[i] == b'#' {
341                i += 1;
342                let start = i;
343                while i < len && bytes[i].is_ascii_digit() {
344                    i += 1;
345                }
346                if i > start {
347                    // Fast integer parsing directly from ASCII digits
348                    let mut id = 0u32;
349                    for &b in &bytes[start..i] {
350                        id = id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
351                    }
352                    point_ids.push(id);
353                }
354            } else {
355                i += 1; // Skip unknown character
356            }
357        }
358
359        if point_ids.is_empty() {
360            None
361        } else {
362            Some(point_ids)
363        }
364    }
365
366    /// Fast extraction of CartesianPoint coordinates directly from raw bytes
367    /// Bypasses full entity decoding for ~3x speedup on BREP-heavy files
368    /// Returns (x, y, z) as f64 tuple
369    #[inline]
370    pub fn get_cartesian_point_fast(&mut self, entity_id: u32) -> Option<(f64, f64, f64)> {
371        let bytes = self.get_raw_bytes(entity_id)?;
372
373        // Find opening paren for coordinates: IFCCARTESIANPOINT((x,y,z));
374        let mut i = 0;
375        let len = bytes.len();
376
377        // Skip to first '(' after '='
378        while i < len && bytes[i] != b'(' {
379            i += 1;
380        }
381        if i >= len { return None; }
382        i += 1; // Skip first '('
383
384        // Skip to second '(' for the coordinate list
385        while i < len && bytes[i] != b'(' {
386            i += 1;
387        }
388        if i >= len { return None; }
389        i += 1; // Skip second '('
390
391        // Parse x coordinate
392        let x = parse_next_float(&bytes[i..], &mut i)?;
393
394        // Parse y coordinate
395        let y = parse_next_float(&bytes[i..], &mut i)?;
396
397        // Parse z coordinate (optional for 2D points, default to 0)
398        let z = parse_next_float(&bytes[i..], &mut i).unwrap_or(0.0);
399
400        Some((x, y, z))
401    }
402
403    /// Fast extraction of FaceBound info directly from raw bytes
404    /// Returns (loop_id, orientation, is_outer_bound)
405    /// Bypasses full entity decoding for BREP optimization
406    #[inline]
407    pub fn get_face_bound_fast(&mut self, entity_id: u32) -> Option<(u32, bool, bool)> {
408        let bytes = self.get_raw_bytes(entity_id)?;
409        let len = bytes.len();
410
411        // Find '=' to locate start of type name, and '(' for end
412        let mut eq_pos = 0;
413        while eq_pos < len && bytes[eq_pos] != b'=' {
414            eq_pos += 1;
415        }
416        if eq_pos >= len {
417            return None;
418        }
419
420        // Check if this is an outer bound - just scan the type name part
421        // IFCFACEOUTERBOUND vs IFCFACEBOUND
422        // The type name is between '=' and '('
423        let mut is_outer = false;
424        let mut i = eq_pos + 1;
425        while i < len && bytes[i] != b'(' {
426            if bytes[i] == b'O' {
427                is_outer = true;
428            }
429            i += 1;
430        }
431        if i >= len {
432            return None;
433        }
434
435        i += 1; // Skip first '('
436
437        // Skip whitespace
438        while i < len && (bytes[i] == b' ' || bytes[i] == b'\n' || bytes[i] == b'\r') {
439            i += 1;
440        }
441
442        // Expect '#' for loop entity ref
443        if i >= len || bytes[i] != b'#' {
444            return None;
445        }
446        i += 1;
447
448        // Parse loop ID
449        let start = i;
450        while i < len && bytes[i].is_ascii_digit() {
451            i += 1;
452        }
453        if i <= start {
454            return None;
455        }
456        let mut loop_id = 0u32;
457        for &b in &bytes[start..i] {
458            loop_id = loop_id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
459        }
460
461        // Find orientation after comma - default to true (.T.)
462        // Skip to comma
463        while i < len && bytes[i] != b',' {
464            i += 1;
465        }
466        i += 1; // Skip comma
467
468        // Skip whitespace
469        while i < len && (bytes[i] == b' ' || bytes[i] == b'\n' || bytes[i] == b'\r') {
470            i += 1;
471        }
472
473        // Check for .F. (false) or .T. (true)
474        let orientation = if i + 2 < len && bytes[i] == b'.' && bytes[i + 2] == b'.' {
475            bytes[i + 1] != b'F'
476        } else {
477            true // Default to true
478        };
479
480        Some((loop_id, orientation, is_outer))
481    }
482}
483
484/// Parse next float from bytes, advancing position past it
485#[inline]
486fn parse_next_float(bytes: &[u8], offset: &mut usize) -> Option<f64> {
487    let len = bytes.len();
488    let mut i = 0;
489
490    // Skip whitespace and commas
491    while i < len && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r') {
492        i += 1;
493    }
494
495    if i >= len || bytes[i] == b')' {
496        return None;
497    }
498
499    // Parse float using fast_float
500    match fast_float::parse_partial::<f64, _>(&bytes[i..]) {
501        Ok((value, consumed)) if consumed > 0 => {
502            *offset += i + consumed;
503            Some(value)
504        }
505        _ => None,
506    }
507}
508
#[cfg(test)]
mod tests {
    use super::*;
    use crate::IfcType;

    /// Decoding by explicit byte range yields the expected type and attributes.
    #[test]
    fn test_decode_entity() {
        let source = r#"
#1=IFCPROJECT('2vqT3bvqj9RBFjLlXpN8n9',$,$,$,$,$,$,$,$);
#2=IFCWALL('3a4T3bvqj9RBFjLlXpN8n0',$,$,$,'Wall-001',$,#3,#4);
#3=IFCLOCALPLACEMENT($,#4);
#4=IFCAXIS2PLACEMENT3D(#5,$,$);
#5=IFCCARTESIANPOINT((0.,0.,0.));
"#;

        let mut decoder = EntityDecoder::new(source);

        // Byte range of entity #2, terminating ';' included
        let begin = source.find("#2=").unwrap();
        let finish = begin + source[begin..].find(';').unwrap() + 1;

        let wall = decoder.decode_at(begin, finish).unwrap();
        assert_eq!(wall.id, 2);
        assert_eq!(wall.ifc_type, IfcType::IfcWall);
        assert_eq!(wall.attributes.len(), 8);
        assert_eq!(wall.get_string(4), Some("Wall-001"));
        assert_eq!(wall.get_ref(6), Some(3));
        assert_eq!(wall.get_ref(7), Some(4));
    }

    /// Lookup by ID builds the index lazily and caches the result.
    #[test]
    fn test_decode_by_id() {
        let source = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#5=IFCWALL('guid2',$,$,$,'Wall-001',$,$,$);
#10=IFCDOOR('guid3',$,$,$,'Door-001',$,$,$);
"#;

        let mut decoder = EntityDecoder::new(source);

        let wall = decoder.decode_by_id(5).unwrap();
        assert_eq!(wall.id, 5);
        assert_eq!(wall.ifc_type, IfcType::IfcWall);
        assert_eq!(wall.get_string(4), Some("Wall-001"));

        // The decoded entity must now be in the cache
        assert_eq!(decoder.cache_size(), 1);
        let from_cache = decoder.get_cached(5).unwrap();
        assert_eq!(from_cache.id, 5);
    }

    /// A single `#ID` attribute resolves to the referenced entity.
    #[test]
    fn test_resolve_ref() {
        let source = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,#1,$);
"#;

        let mut decoder = EntityDecoder::new(source);

        let wall = decoder.decode_by_id(2).unwrap();
        let ref_attr = wall.get(6).unwrap();

        let target = decoder.resolve_ref(ref_attr).unwrap().unwrap();
        assert_eq!(target.id, 1);
        assert_eq!(target.ifc_type, IfcType::IfcProject);
    }

    /// A list of `#ID` attributes resolves to the entities in list order.
    #[test]
    fn test_resolve_ref_list() {
        let source = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid1',$,$,$,$,$,$,$);
#3=IFCDOOR('guid2',$,$,$,$,$,$,$);
#4=IFCRELCONTAINEDINSPATIALSTRUCTURE('guid3',$,$,$,(#2,#3),$,#1);
"#;

        let mut decoder = EntityDecoder::new(source);

        let relation = decoder.decode_by_id(4).unwrap();
        let list_attr = relation.get(4).unwrap();

        let members = decoder.resolve_ref_list(list_attr).unwrap();
        assert_eq!(members.len(), 2);

        let (first, second) = (&members[0], &members[1]);
        assert_eq!(first.id, 2);
        assert_eq!(first.ifc_type, IfcType::IfcWall);
        assert_eq!(second.id, 3);
        assert_eq!(second.ifc_type, IfcType::IfcDoor);
    }

    /// The cache grows once per distinct entity and can be emptied.
    #[test]
    fn test_cache() {
        let source = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,$,$);
"#;

        let mut decoder = EntityDecoder::new(source);
        assert_eq!(decoder.cache_size(), 0);

        decoder.decode_by_id(1).unwrap();
        assert_eq!(decoder.cache_size(), 1);

        decoder.decode_by_id(2).unwrap();
        assert_eq!(decoder.cache_size(), 2);

        // Asking for an already decoded entity must not add an entry
        decoder.decode_by_id(1).unwrap();
        assert_eq!(decoder.cache_size(), 2);

        decoder.clear_cache();
        assert_eq!(decoder.cache_size(), 0);
    }
}