bimifc-parser 0.3.0

High-performance IFC parser with IFC4 (STEP) and IFC5 (IFCX) support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Fast entity scanner using SIMD-accelerated byte searching
//!
//! Scans IFC files to discover entities without full parsing.

use memchr::memchr;
use rustc_hash::FxHashMap;

/// Entity index mapping an entity ID to its `(start, end)` byte offsets.
///
/// As filled in by [`EntityScanner::build_index`], `start` is the offset of
/// the entity's leading `#` and `end` is the offset just past its terminating
/// `;`, both measured in bytes from the beginning of the scanned content.
pub type EntityIndex = FxHashMap<u32, (usize, usize)>;

/// Fast entity scanner for IFC files
///
/// Uses memchr for SIMD-accelerated scanning to quickly find entity
/// boundaries without full parsing.
pub struct EntityScanner<'a> {
    /// Full text being scanned; type names handed out by the scanner borrow
    /// directly from it.
    content: &'a str,
    /// Byte offset into `content` at which the next scan resumes.
    pos: usize,
}

impl<'a> EntityScanner<'a> {
    /// Create a new scanner for the given content
    pub fn new(content: &'a str) -> Self {
        // Skip header section (find DATA; line)
        let pos = content.find("DATA;").map(|p| p + 5).unwrap_or(0);

        Self { content, pos }
    }

    /// Scan to find the next entity
    ///
    /// Returns (id, type_name, start_byte, end_byte)
    pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
        let bytes = self.content.as_bytes();

        // Find next # (entity start)
        while self.pos < bytes.len() {
            // Use memchr for fast # search
            let hash_pos = memchr(b'#', &bytes[self.pos..])?;
            self.pos += hash_pos;

            // Check if this is an entity definition (not a reference in attributes)
            // Entity definitions are at the start of a line (after whitespace/newline)
            let is_entity_start = self.pos == 0
                || bytes[self.pos - 1] == b'\n'
                || bytes[self.pos - 1] == b'\r'
                || bytes[self.pos - 1] == b';';

            if !is_entity_start {
                self.pos += 1;
                continue;
            }

            let start = self.pos;

            // Parse entity ID
            self.pos += 1; // Skip #
            let id_start = self.pos;

            while self.pos < bytes.len() && bytes[self.pos].is_ascii_digit() {
                self.pos += 1;
            }

            if self.pos == id_start {
                // No digits found
                continue;
            }

            let id: u32 = self.content[id_start..self.pos].parse().ok()?;

            // Skip whitespace and =
            while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
                self.pos += 1;
            }

            if self.pos >= bytes.len() || bytes[self.pos] != b'=' {
                continue;
            }
            self.pos += 1; // Skip =

            // Skip whitespace
            while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
                self.pos += 1;
            }

            // Parse type name
            let type_start = self.pos;
            while self.pos < bytes.len()
                && (bytes[self.pos].is_ascii_alphanumeric() || bytes[self.pos] == b'_')
            {
                self.pos += 1;
            }

            if self.pos == type_start {
                continue;
            }

            let type_name = &self.content[type_start..self.pos];

            // Find end of entity (semicolon, but handle strings)
            let end = self.find_entity_end()?;

            return Some((id, type_name, start, end));
        }

        None
    }

    /// Find the end of an entity (semicolon), handling quoted strings
    fn find_entity_end(&mut self) -> Option<usize> {
        let bytes = self.content.as_bytes();
        let mut in_string = false;

        while self.pos < bytes.len() {
            match bytes[self.pos] {
                b'\'' => {
                    // Check for escaped quote ''
                    if in_string && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'\'' {
                        self.pos += 2;
                        continue;
                    }
                    in_string = !in_string;
                }
                b';' if !in_string => {
                    self.pos += 1;
                    return Some(self.pos);
                }
                _ => {}
            }
            self.pos += 1;
        }

        None
    }

    /// Build an index of all entities (ID -> byte offsets)
    pub fn build_index(content: &'a str) -> EntityIndex {
        let mut scanner = Self::new(content);
        let mut index = FxHashMap::default();

        while let Some((id, _, start, end)) = scanner.next_entity() {
            index.insert(id, (start, end));
        }

        index
    }

    /// Count entities by type
    pub fn count_by_type(content: &'a str) -> FxHashMap<String, usize> {
        let mut scanner = Self::new(content);
        let mut counts: FxHashMap<String, usize> = FxHashMap::default();

        while let Some((_, type_name, _, _)) = scanner.next_entity() {
            *counts.entry(type_name.to_uppercase()).or_insert(0) += 1;
        }

        counts
    }

    /// Find all entities of a specific type
    pub fn find_by_type(content: &'a str, target_type: &str) -> Vec<(u32, usize, usize)> {
        let mut scanner = Self::new(content);
        let mut results = Vec::new();
        let target_upper = target_type.to_uppercase();

        while let Some((id, type_name, start, end)) = scanner.next_entity() {
            if type_name.eq_ignore_ascii_case(&target_upper) {
                results.push((id, start, end));
            }
        }

        results
    }

    /// Get total entity count
    pub fn entity_count(content: &'a str) -> usize {
        let mut scanner = Self::new(content);
        let mut count = 0;

        while scanner.next_entity().is_some() {
            count += 1;
        }

        count
    }
}

/// Parse the header section to extract metadata.
///
/// The header is taken to run from the first `HEADER;` (or the start of
/// `content` if that marker is missing) up to the first `ENDSEC;` that follows
/// it (or the end of `content` if there is none). Within that span the first
/// schema name of `FILE_SCHEMA` and the first six arguments of `FILE_NAME` are
/// extracted.
///
/// This function never fails: anything that cannot be found or parsed is left
/// at its default value in the returned [`HeaderInfo`].
pub fn parse_header(content: &str) -> HeaderInfo {
    let mut info = HeaderInfo::default();

    // Find HEADER section. `ENDSEC;` is searched for *after* the header start
    // so that a stray earlier `ENDSEC;` cannot produce an inverted range.
    let header_start = content.find("HEADER;").unwrap_or(0);
    let header_end = content[header_start..]
        .find("ENDSEC;")
        .map_or(content.len(), |rel| header_start + rel);
    let header = &content[header_start..header_end];

    if let Some(schema) = extract_file_schema(header) {
        info.schema_version = schema;
    }

    // Best effort: fields parsed before the first failure are kept.
    let _ = fill_file_name_fields(header, &mut info);

    info
}

/// Return the first quoted schema name inside `FILE_SCHEMA((...))`, if any.
fn extract_file_schema(header: &str) -> Option<String> {
    let after_keyword = &header[header.find("FILE_SCHEMA")?..];
    let list = &after_keyword[after_keyword.find("((")? + 2..];
    let list = &list[..list.find("))")?];

    // Extract first schema (usually the only one)
    let quoted = &list[list.find('\'')? + 1..];
    Some(quoted[..quoted.find('\'')?].to_string())
}

/// Fill `info` from the arguments of `FILE_NAME(...)`, left to right.
///
/// Expected layout:
/// `FILE_NAME(name, timestamp, author, organization, preprocessor,
/// originating_system, authorization)`.
///
/// Parsing stops at the first argument that is missing or malformed; fields
/// assigned before that point are kept. Returns `Some(())` only when all six
/// extracted arguments were parsed.
fn fill_file_name_fields(header: &str, info: &mut HeaderInfo) -> Option<()> {
    let after_keyword = &header[header.find("FILE_NAME")?..];
    let args = &after_keyword[after_keyword.find('(')? + 1..];

    let (file_name, rest) = parse_header_string(args)?;
    info.file_name = Some(file_name);

    let (timestamp, rest) = parse_header_string(after_comma(rest)?)?;
    info.timestamp = Some(timestamp);

    // Author and organization are lists; only the first entry is kept.
    let (authors, rest) = parse_header_list(after_comma(rest)?)?;
    info.author = authors.into_iter().next();

    let (organizations, rest) = parse_header_list(after_comma(rest)?)?;
    info.organization = organizations.into_iter().next();

    let (preprocessor, rest) = parse_header_string(after_comma(rest)?)?;
    info.preprocessor_version = Some(preprocessor);

    let (originating_system, _) = parse_header_string(after_comma(rest)?)?;
    info.originating_system = Some(originating_system);

    Some(())
}

/// Return the text following the first `,` in `s`, if there is one.
fn after_comma(s: &str) -> Option<&str> {
    s.find(',').map(|comma| &s[comma + 1..])
}

/// Parse a string from header ('value').
///
/// Leading whitespace is ignored. On success returns the decoded value and
/// the text that follows it:
///
/// * `'...'` yields the text between the quotes, with every doubled quote
///   (`''`) collapsed to a single `'`;
/// * `$` (the unset marker) yields an empty string.
///
/// Returns `None` if the input starts with anything else, or if a quoted
/// string is never closed.
fn parse_header_string(s: &str) -> Option<(String, &str)> {
    let s = s.trim_start();
    let body = match s.strip_prefix('\'') {
        Some(body) => body,
        // Check for empty value
        None => return s.strip_prefix('$').map(|rest| (String::new(), rest)),
    };

    let bytes = body.as_bytes();
    let mut end = 0;
    while end < bytes.len() {
        if bytes[end] == b'\'' {
            // A doubled quote is an escaped quote inside the string.
            if bytes.get(end + 1) == Some(&b'\'') {
                end += 2;
                continue;
            }
            let value = body[..end].replace("''", "'");
            return Some((value, &body[end + 1..]));
        }
        end += 1;
    }

    // Ran off the end without a closing quote.
    None
}

/// Parse a parenthesised list of header strings, e.g. `('value1', 'value2')`.
///
/// Leading whitespace is ignored. If the input does not open with `(`, an
/// empty list is returned together with the (trimmed) input, untouched.
/// Otherwise items are read until the closing `)`; empty items are dropped and
/// anything that is not a header string is skipped up to the next `,` or `)`.
/// The second tuple element is the text after the closing `)`, or whatever is
/// left if the list is never closed.
fn parse_header_list(s: &str) -> Option<(Vec<String>, &str)> {
    let trimmed = s.trim_start();
    let mut remaining = match trimmed.strip_prefix('(') {
        Some(inner) => inner,
        None => return Some((Vec::new(), trimmed)),
    };

    let mut values = Vec::new();

    loop {
        remaining = remaining.trim_start();

        // Closing paren: the list is complete.
        if let Some(after_list) = remaining.strip_prefix(')') {
            return Some((values, after_list));
        }

        match parse_header_string(remaining) {
            Some((value, tail)) => {
                if !value.is_empty() {
                    values.push(value);
                }
                let tail = tail.trim_start();
                remaining = tail.strip_prefix(',').unwrap_or(tail);
            }
            // Not a string: jump to the next separator or the closing paren.
            None => match remaining.find([',', ')']) {
                Some(idx) => {
                    let tail = &remaining[idx..];
                    remaining = tail.strip_prefix(',').unwrap_or(tail);
                }
                // Unclosed list: hand back what was collected so far.
                None => return Some((values, remaining)),
            },
        }
    }
}

/// Header information extracted from IFC file
///
/// Produced by [`parse_header`]. Every field keeps its `Default` value (empty
/// string / `None`) when the corresponding header entry is absent or could
/// not be parsed.
#[derive(Clone, Debug, Default)]
pub struct HeaderInfo {
    /// First quoted schema name found in `FILE_SCHEMA((...))`; empty if none
    /// was found.
    pub schema_version: String,
    /// First argument of `FILE_NAME(...)`. An unset (`$`) argument is stored
    /// as an empty string.
    pub file_name: Option<String>,
    /// Second argument of `FILE_NAME(...)`, stored verbatim as text.
    pub timestamp: Option<String>,
    /// First non-empty entry of the author list (third `FILE_NAME` argument).
    pub author: Option<String>,
    /// First non-empty entry of the organization list (fourth `FILE_NAME`
    /// argument).
    pub organization: Option<String>,
    /// Fifth argument of `FILE_NAME(...)`.
    pub preprocessor_version: Option<String>,
    /// Sixth argument of `FILE_NAME(...)`.
    pub originating_system: Option<String>,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Small STEP fixture: a complete header followed by four entities.
    const TEST_IFC: &str = r#"ISO-10303-21;
HEADER;
FILE_DESCRIPTION(('ViewDefinition [CoordinationView]'),'2;1');
FILE_NAME('test.ifc','2024-01-01T00:00:00',('Author'),('Org'),'Preprocessor','App','');
FILE_SCHEMA(('IFC2X3'));
ENDSEC;
DATA;
#1=IFCPROJECT('guid',$,'Project',$,$,$,$,$,#2);
#2=IFCUNITASSIGNMENT((#3));
#3=IFCSIUNIT(*,.LENGTHUNIT.,.MILLI.,.METRE.);
#4=IFCWALL('guid',$,'Wall 1',$,$,#5,#6,$);
ENDSEC;
END-ISO-10303-21;
"#;

    /// The scanner yields every entity of the DATA section, in file order.
    #[test]
    fn test_scanner_finds_entities() {
        let mut scanner = EntityScanner::new(TEST_IFC);
        let found: Vec<(u32, String)> = std::iter::from_fn(|| scanner.next_entity())
            .map(|(id, type_name, _, _)| (id, type_name.to_string()))
            .collect();

        assert_eq!(found.len(), 4);
        assert_eq!(found.first(), Some(&(1, "IFCPROJECT".to_string())));
        assert_eq!(found.last(), Some(&(4, "IFCWALL".to_string())));
    }

    /// The index holds one entry per entity, keyed by entity ID.
    #[test]
    fn test_build_index() {
        let index = EntityScanner::build_index(TEST_IFC);

        assert_eq!(index.len(), 4);
        for id in &[1u32, 4] {
            assert!(index.contains_key(id));
        }
    }

    /// Per-type counts are keyed by the upper-case type name.
    #[test]
    fn test_count_by_type() {
        let counts = EntityScanner::count_by_type(TEST_IFC);

        assert_eq!(counts.get("IFCPROJECT").copied(), Some(1));
        assert_eq!(counts.get("IFCWALL").copied(), Some(1));
    }

    /// Schema and file name are read from the header section.
    #[test]
    fn test_parse_header() {
        let info = parse_header(TEST_IFC);

        assert_eq!(info.schema_version, "IFC2X3");
        assert_eq!(info.file_name.as_deref(), Some("test.ifc"));
    }
}