ifc_lite_core/
decoder.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5//! Entity Decoder - On-demand entity parsing
6//!
7//! Lazily decode IFC entities from byte offsets without loading entire file into memory.
8
9use crate::error::{Error, Result};
10use crate::parser::parse_entity;
11use crate::schema_gen::{AttributeValue, DecodedEntity};
12use rustc_hash::FxHashMap;
13use std::sync::Arc;
14
/// Pre-built entity index type.
///
/// Maps an entity ID (the number after `#`) to a `(start, end)` byte span
/// into the file content. As produced by [`build_entity_index`], `start` is
/// the offset of the leading `#` and `end` is one past the terminating `;`.
pub type EntityIndex = FxHashMap<u32, (usize, usize)>;
17
18/// Build entity index from content - O(n) scan using SIMD-accelerated search
19/// Returns index mapping entity IDs to byte offsets
20#[inline]
21pub fn build_entity_index(content: &str) -> EntityIndex {
22    let bytes = content.as_bytes();
23    let len = bytes.len();
24
25    // Pre-allocate with estimated capacity (roughly 1 entity per 50 bytes)
26    let estimated_entities = len / 50;
27    let mut index = FxHashMap::with_capacity_and_hasher(estimated_entities, Default::default());
28
29    let mut pos = 0;
30
31    while pos < len {
32        // Find next '#' using SIMD-accelerated search
33        let remaining = &bytes[pos..];
34        let hash_offset = match memchr::memchr(b'#', remaining) {
35            Some(offset) => offset,
36            None => break,
37        };
38
39        let start = pos + hash_offset;
40        pos = start + 1;
41
42        // Parse entity ID (inline for speed)
43        let id_start = pos;
44        while pos < len && bytes[pos].is_ascii_digit() {
45            pos += 1;
46        }
47        let id_end = pos;
48
49        // Skip whitespace before '=' (handles both `#45=` and `#45 = ` formats)
50        while pos < len && bytes[pos].is_ascii_whitespace() {
51            pos += 1;
52        }
53
54        if id_end > id_start && pos < len && bytes[pos] == b'=' {
55            // Fast integer parsing without allocation
56            let id = parse_u32_inline(bytes, id_start, id_end);
57
58            // Find end of entity (;) using SIMD
59            let entity_content = &bytes[pos..];
60            if let Some(semicolon_offset) = memchr::memchr(b';', entity_content) {
61                pos += semicolon_offset + 1; // Include semicolon
62                index.insert(id, (start, pos));
63            } else {
64                break; // No semicolon found, malformed
65            }
66        }
67    }
68
69    index
70}
71
/// Parse `bytes[start..end]` as a decimal `u32` without allocating.
///
/// No validation is performed: each byte is treated as `byte - b'0'` with
/// wrapping arithmetic, and the accumulated value wraps on overflow. An empty
/// range yields `0`. Panics if the range is out of bounds for `bytes`.
#[inline]
fn parse_u32_inline(bytes: &[u8], start: usize, end: usize) -> u32 {
    bytes[start..end].iter().fold(0u32, |acc, &byte| {
        let digit = u32::from(byte.wrapping_sub(b'0'));
        acc.wrapping_mul(10).wrapping_add(digit)
    })
}
82
/// Entity decoder for lazy parsing.
///
/// Borrows the full file content and decodes individual entities on demand,
/// caching each decoded entity by ID.
pub struct EntityDecoder<'a> {
    /// Full file content that entity byte spans index into.
    content: &'a str,
    /// Cache of decoded entities (entity_id -> `Arc<DecodedEntity>`).
    /// NOTE(review): the accessors in this file return a clone of the inner
    /// `DecodedEntity` on a cache hit rather than handing out the `Arc`.
    cache: FxHashMap<u32, Arc<DecodedEntity>>,
    /// Index of entity offsets (entity_id -> (start, end)).
    /// `None` until it is either supplied at construction or built lazily.
    /// Held in an `Arc` so one index can be shared between decoders without
    /// cloning the HashMap.
    entity_index: Option<Arc<EntityIndex>>,
}
94
95impl<'a> EntityDecoder<'a> {
96    /// Create new decoder
97    pub fn new(content: &'a str) -> Self {
98        Self {
99            content,
100            cache: FxHashMap::default(),
101            entity_index: None,
102        }
103    }
104
105    /// Create decoder with pre-built index (faster for repeated lookups)
106    pub fn with_index(content: &'a str, index: EntityIndex) -> Self {
107        Self {
108            content,
109            cache: FxHashMap::default(),
110            entity_index: Some(Arc::new(index)),
111        }
112    }
113
114    /// Create decoder with shared Arc index (for parallel processing)
115    pub fn with_arc_index(content: &'a str, index: Arc<EntityIndex>) -> Self {
116        Self {
117            content,
118            cache: FxHashMap::default(),
119            entity_index: Some(index),
120        }
121    }
122
123    /// Build entity index for O(1) lookups
124    /// This scans the file once and maps entity IDs to byte offsets
125    fn build_index(&mut self) {
126        if self.entity_index.is_some() {
127            return; // Already built
128        }
129        self.entity_index = Some(Arc::new(build_entity_index(self.content)));
130    }
131
132    /// Decode entity at byte offset
133    /// Returns cached entity if already decoded
134    #[inline]
135    pub fn decode_at(&mut self, start: usize, end: usize) -> Result<DecodedEntity> {
136        let line = &self.content[start..end];
137        let (id, ifc_type, tokens) = parse_entity(line).map_err(|e| {
138            // Add debug info about what failed to parse
139            Error::parse(
140                0,
141                format!(
142                    "Failed to parse entity: {:?}, input: {:?}",
143                    e,
144                    &line[..line.len().min(100)]
145                ),
146            )
147        })?;
148
149        // Check cache first - return clone of inner DecodedEntity
150        if let Some(entity_arc) = self.cache.get(&id) {
151            return Ok(entity_arc.as_ref().clone());
152        }
153
154        // Convert tokens to AttributeValues
155        let attributes = tokens
156            .iter()
157            .map(|token| AttributeValue::from_token(token))
158            .collect();
159
160        let entity = DecodedEntity::new(id, ifc_type, attributes);
161        self.cache.insert(id, Arc::new(entity.clone()));
162        Ok(entity)
163    }
164
165    /// Decode entity by ID - O(1) lookup using entity index
166    #[inline]
167    pub fn decode_by_id(&mut self, entity_id: u32) -> Result<DecodedEntity> {
168        // Check cache first - return clone of inner DecodedEntity
169        if let Some(entity_arc) = self.cache.get(&entity_id) {
170            return Ok(entity_arc.as_ref().clone());
171        }
172
173        // Build index if not already built
174        self.build_index();
175
176        // O(1) lookup in index
177        let (start, end) = self
178            .entity_index
179            .as_ref()
180            .and_then(|idx| idx.get(&entity_id).copied())
181            .ok_or_else(|| Error::parse(0, format!("Entity #{} not found", entity_id)))?;
182
183        self.decode_at(start, end)
184    }
185
186    /// Resolve entity reference (follow #ID)
187    /// Returns None for null/derived values
188    #[inline]
189    pub fn resolve_ref(&mut self, attr: &AttributeValue) -> Result<Option<DecodedEntity>> {
190        match attr.as_entity_ref() {
191            Some(id) => Ok(Some(self.decode_by_id(id)?)),
192            None => Ok(None),
193        }
194    }
195
196    /// Resolve list of entity references
197    pub fn resolve_ref_list(&mut self, attr: &AttributeValue) -> Result<Vec<DecodedEntity>> {
198        let list = attr
199            .as_list()
200            .ok_or_else(|| Error::parse(0, "Expected list".to_string()))?;
201
202        let mut entities = Vec::with_capacity(list.len());
203        for item in list {
204            if let Some(id) = item.as_entity_ref() {
205                entities.push(self.decode_by_id(id)?);
206            }
207        }
208        Ok(entities)
209    }
210
211    /// Get cached entity (without decoding)
212    pub fn get_cached(&self, entity_id: u32) -> Option<DecodedEntity> {
213        self.cache.get(&entity_id).map(|arc| arc.as_ref().clone())
214    }
215
    /// Clear cache to free memory.
    ///
    /// Removes every cached entity; the entity index is left untouched.
    /// NOTE(review): the map's own table allocation is presumably retained
    /// (std `HashMap::clear` semantics) - confirm if peak memory matters.
    pub fn clear_cache(&mut self) {
        self.cache.clear();
    }
220
    /// Get cache size.
    ///
    /// Returns the number of entities currently held in the cache.
    pub fn cache_size(&self) -> usize {
        self.cache.len()
    }
225
226    /// Get raw bytes for an entity (for direct/fast parsing)
227    /// Returns the full entity line including type and attributes
228    #[inline]
229    pub fn get_raw_bytes(&mut self, entity_id: u32) -> Option<&'a [u8]> {
230        self.build_index();
231        let (start, end) = self.entity_index.as_ref()?.get(&entity_id).copied()?;
232        Some(&self.content.as_bytes()[start..end])
233    }
234
235    /// Get raw content string for an entity
236    #[inline]
237    pub fn get_raw_content(&mut self, entity_id: u32) -> Option<&'a str> {
238        self.build_index();
239        let (start, end) = self.entity_index.as_ref()?.get(&entity_id).copied()?;
240        Some(&self.content[start..end])
241    }
242
243    /// Fast extraction of entity reference IDs from a list attribute in raw bytes
244    /// Useful for getting face list from ClosedShell, bounds from Face, etc.
245    /// Returns list of entity IDs
246    #[inline]
247    pub fn get_entity_ref_list_fast(&mut self, entity_id: u32) -> Option<Vec<u32>> {
248        let bytes = self.get_raw_bytes(entity_id)?;
249
250        // Pattern: IFCTYPE((#id1,#id2,...)); or IFCTYPE((#id1,#id2,...),other);
251        let mut i = 0;
252        let len = bytes.len();
253
254        // Skip to first '(' after '='
255        while i < len && bytes[i] != b'(' {
256            i += 1;
257        }
258        if i >= len {
259            return None;
260        }
261        i += 1; // Skip first '('
262
263        // Skip to second '(' for the list
264        while i < len && bytes[i] != b'(' {
265            i += 1;
266        }
267        if i >= len {
268            return None;
269        }
270        i += 1; // Skip second '('
271
272        // Parse entity IDs
273        let mut ids = Vec::with_capacity(32);
274
275        while i < len {
276            // Skip whitespace and commas
277            while i < len
278                && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r')
279            {
280                i += 1;
281            }
282
283            if i >= len || bytes[i] == b')' {
284                break;
285            }
286
287            // Expect '#' followed by number
288            if bytes[i] == b'#' {
289                i += 1;
290                let start = i;
291                while i < len && bytes[i].is_ascii_digit() {
292                    i += 1;
293                }
294                if i > start {
295                    // Fast integer parsing directly from ASCII digits
296                    let mut id = 0u32;
297                    for &b in &bytes[start..i] {
298                        id = id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
299                    }
300                    ids.push(id);
301                }
302            } else {
303                i += 1; // Skip unknown character
304            }
305        }
306
307        if ids.is_empty() {
308            None
309        } else {
310            Some(ids)
311        }
312    }
313
314    /// Fast extraction of PolyLoop point IDs directly from raw bytes
315    /// Bypasses full entity decoding for BREP optimization
316    /// Returns list of entity IDs for CartesianPoints
317    #[inline]
318    pub fn get_polyloop_point_ids_fast(&mut self, entity_id: u32) -> Option<Vec<u32>> {
319        let bytes = self.get_raw_bytes(entity_id)?;
320
321        // IFCPOLYLOOP((#id1,#id2,#id3,...));
322        let mut i = 0;
323        let len = bytes.len();
324
325        // Skip to first '(' after '='
326        while i < len && bytes[i] != b'(' {
327            i += 1;
328        }
329        if i >= len {
330            return None;
331        }
332        i += 1; // Skip first '('
333
334        // Skip to second '(' for the point list
335        while i < len && bytes[i] != b'(' {
336            i += 1;
337        }
338        if i >= len {
339            return None;
340        }
341        i += 1; // Skip second '('
342
343        // Parse point IDs
344        let mut point_ids = Vec::with_capacity(8); // Most faces have 3-8 vertices
345
346        while i < len {
347            // Skip whitespace and commas
348            while i < len
349                && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r')
350            {
351                i += 1;
352            }
353
354            if i >= len || bytes[i] == b')' {
355                break;
356            }
357
358            // Expect '#' followed by number
359            if bytes[i] == b'#' {
360                i += 1;
361                let start = i;
362                while i < len && bytes[i].is_ascii_digit() {
363                    i += 1;
364                }
365                if i > start {
366                    // Fast integer parsing directly from ASCII digits
367                    let mut id = 0u32;
368                    for &b in &bytes[start..i] {
369                        id = id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
370                    }
371                    point_ids.push(id);
372                }
373            } else {
374                i += 1; // Skip unknown character
375            }
376        }
377
378        if point_ids.is_empty() {
379            None
380        } else {
381            Some(point_ids)
382        }
383    }
384
385    /// Fast extraction of CartesianPoint coordinates directly from raw bytes
386    /// Bypasses full entity decoding for ~3x speedup on BREP-heavy files
387    /// Returns (x, y, z) as f64 tuple
388    #[inline]
389    pub fn get_cartesian_point_fast(&mut self, entity_id: u32) -> Option<(f64, f64, f64)> {
390        let bytes = self.get_raw_bytes(entity_id)?;
391
392        // Find opening paren for coordinates: IFCCARTESIANPOINT((x,y,z));
393        let mut i = 0;
394        let len = bytes.len();
395
396        // Skip to first '(' after '='
397        while i < len && bytes[i] != b'(' {
398            i += 1;
399        }
400        if i >= len {
401            return None;
402        }
403        i += 1; // Skip first '('
404
405        // Skip to second '(' for the coordinate list
406        while i < len && bytes[i] != b'(' {
407            i += 1;
408        }
409        if i >= len {
410            return None;
411        }
412        i += 1; // Skip second '('
413
414        // Parse x coordinate
415        let x = parse_next_float(&bytes[i..], &mut i)?;
416
417        // Parse y coordinate
418        let y = parse_next_float(&bytes[i..], &mut i)?;
419
420        // Parse z coordinate (optional for 2D points, default to 0)
421        let z = parse_next_float(&bytes[i..], &mut i).unwrap_or(0.0);
422
423        Some((x, y, z))
424    }
425
426    /// Fast extraction of FaceBound info directly from raw bytes
427    /// Returns (loop_id, orientation, is_outer_bound)
428    /// Bypasses full entity decoding for BREP optimization
429    #[inline]
430    pub fn get_face_bound_fast(&mut self, entity_id: u32) -> Option<(u32, bool, bool)> {
431        let bytes = self.get_raw_bytes(entity_id)?;
432        let len = bytes.len();
433
434        // Find '=' to locate start of type name, and '(' for end
435        let mut eq_pos = 0;
436        while eq_pos < len && bytes[eq_pos] != b'=' {
437            eq_pos += 1;
438        }
439        if eq_pos >= len {
440            return None;
441        }
442
443        // Check if this is an outer bound - just scan the type name part
444        // IFCFACEOUTERBOUND vs IFCFACEBOUND
445        // The type name is between '=' and '('
446        let mut is_outer = false;
447        let mut i = eq_pos + 1;
448        while i < len && bytes[i] != b'(' {
449            if bytes[i] == b'O' {
450                is_outer = true;
451            }
452            i += 1;
453        }
454        if i >= len {
455            return None;
456        }
457
458        i += 1; // Skip first '('
459
460        // Skip whitespace
461        while i < len && (bytes[i] == b' ' || bytes[i] == b'\n' || bytes[i] == b'\r') {
462            i += 1;
463        }
464
465        // Expect '#' for loop entity ref
466        if i >= len || bytes[i] != b'#' {
467            return None;
468        }
469        i += 1;
470
471        // Parse loop ID
472        let start = i;
473        while i < len && bytes[i].is_ascii_digit() {
474            i += 1;
475        }
476        if i <= start {
477            return None;
478        }
479        let mut loop_id = 0u32;
480        for &b in &bytes[start..i] {
481            loop_id = loop_id.wrapping_mul(10).wrapping_add((b - b'0') as u32);
482        }
483
484        // Find orientation after comma - default to true (.T.)
485        // Skip to comma
486        while i < len && bytes[i] != b',' {
487            i += 1;
488        }
489        i += 1; // Skip comma
490
491        // Skip whitespace
492        while i < len && (bytes[i] == b' ' || bytes[i] == b'\n' || bytes[i] == b'\r') {
493            i += 1;
494        }
495
496        // Check for .F. (false) or .T. (true)
497        let orientation = if i + 2 < len && bytes[i] == b'.' && bytes[i + 2] == b'.' {
498            bytes[i + 1] != b'F'
499        } else {
500            true // Default to true
501        };
502
503        Some((loop_id, orientation, is_outer))
504    }
505}
506
507/// Parse next float from bytes, advancing position past it
508#[inline]
509fn parse_next_float(bytes: &[u8], offset: &mut usize) -> Option<f64> {
510    let len = bytes.len();
511    let mut i = 0;
512
513    // Skip whitespace and commas
514    while i < len
515        && (bytes[i] == b' ' || bytes[i] == b',' || bytes[i] == b'\n' || bytes[i] == b'\r')
516    {
517        i += 1;
518    }
519
520    if i >= len || bytes[i] == b')' {
521        return None;
522    }
523
524    // Parse float using fast_float
525    match fast_float::parse_partial::<f64, _>(&bytes[i..]) {
526        Ok((value, consumed)) if consumed > 0 => {
527            *offset += i + consumed;
528            Some(value)
529        }
530        _ => None,
531    }
532}
533
534#[cfg(test)]
535mod tests {
536    use super::*;
537    use crate::IfcType;
538
    /// Decodes one entity from an explicitly computed byte span and checks
    /// its ID, type, attribute count, and string/reference attributes.
    #[test]
    fn test_decode_entity() {
        let content = r#"
#1=IFCPROJECT('2vqT3bvqj9RBFjLlXpN8n9',$,$,$,$,$,$,$,$);
#2=IFCWALL('3a4T3bvqj9RBFjLlXpN8n0',$,$,$,'Wall-001',$,#3,#4);
#3=IFCLOCALPLACEMENT($,#4);
#4=IFCAXIS2PLACEMENT3D(#5,$,$);
#5=IFCCARTESIANPOINT((0.,0.,0.));
"#;

        let mut decoder = EntityDecoder::new(content);

        // Find entity #2: span runs from '#' through the terminating ';'
        let start = content.find("#2=").unwrap();
        let end = content[start..].find(';').unwrap() + start + 1;

        let entity = decoder.decode_at(start, end).unwrap();
        assert_eq!(entity.id, 2);
        assert_eq!(entity.ifc_type, IfcType::IfcWall);
        assert_eq!(entity.attributes.len(), 8);
        assert_eq!(entity.get_string(4), Some("Wall-001"));
        assert_eq!(entity.get_ref(6), Some(3));
        assert_eq!(entity.get_ref(7), Some(4));
    }
563
    /// Decodes by ID on a decoder created without an index (so the index is
    /// built on demand) and checks the entity ends up in the cache.
    #[test]
    fn test_decode_by_id() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#5=IFCWALL('guid2',$,$,$,'Wall-001',$,$,$);
#10=IFCDOOR('guid3',$,$,$,'Door-001',$,$,$);
"#;

        let mut decoder = EntityDecoder::new(content);

        let entity = decoder.decode_by_id(5).unwrap();
        assert_eq!(entity.id, 5);
        assert_eq!(entity.ifc_type, IfcType::IfcWall);
        assert_eq!(entity.get_string(4), Some("Wall-001"));

        // Should be cached now
        assert_eq!(decoder.cache_size(), 1);
        let cached = decoder.get_cached(5).unwrap();
        assert_eq!(cached.id, 5);
    }
584
    /// Follows a single `#ID` reference attribute to the entity it points at.
    #[test]
    fn test_resolve_ref() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,#1,$);
"#;

        let mut decoder = EntityDecoder::new(content);

        let wall = decoder.decode_by_id(2).unwrap();
        // Attribute 6 of the wall is the reference `#1`
        let placement_attr = wall.get(6).unwrap();

        let referenced = decoder.resolve_ref(placement_attr).unwrap().unwrap();
        assert_eq!(referenced.id, 1);
        assert_eq!(referenced.ifc_type, IfcType::IfcProject);
    }
601
    /// Resolves a list attribute of references and checks that the decoded
    /// entities come back in list order.
    #[test]
    fn test_resolve_ref_list() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid1',$,$,$,$,$,$,$);
#3=IFCDOOR('guid2',$,$,$,$,$,$,$);
#4=IFCRELCONTAINEDINSPATIALSTRUCTURE('guid3',$,$,$,(#2,#3),$,#1);
"#;

        let mut decoder = EntityDecoder::new(content);

        let rel = decoder.decode_by_id(4).unwrap();
        // Attribute 4 is the list `(#2,#3)`
        let elements_attr = rel.get(4).unwrap();

        let elements = decoder.resolve_ref_list(elements_attr).unwrap();
        assert_eq!(elements.len(), 2);
        assert_eq!(elements[0].id, 2);
        assert_eq!(elements[0].ifc_type, IfcType::IfcWall);
        assert_eq!(elements[1].id, 3);
        assert_eq!(elements[1].ifc_type, IfcType::IfcDoor);
    }
623
    /// Checks cache growth per newly decoded ID, no growth on a repeated
    /// decode, and that `clear_cache` empties it.
    #[test]
    fn test_cache() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,$,$);
"#;

        let mut decoder = EntityDecoder::new(content);

        assert_eq!(decoder.cache_size(), 0);

        decoder.decode_by_id(1).unwrap();
        assert_eq!(decoder.cache_size(), 1);

        decoder.decode_by_id(2).unwrap();
        assert_eq!(decoder.cache_size(), 2);

        // Decode same entity - should use cache
        decoder.decode_by_id(1).unwrap();
        assert_eq!(decoder.cache_size(), 2);

        decoder.clear_cache();
        assert_eq!(decoder.cache_size(), 0);
    }
648}