Skip to main content

bimifc_parser/
scanner.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5//! Fast entity scanner using SIMD-accelerated byte searching
6//!
7//! Scans IFC files to discover entities without full parsing.
8
9use memchr::memchr;
10use rustc_hash::FxHashMap;
11
/// Index from an entity's numeric id (the digits after `#`) to the `(start, end)` byte offsets
/// of its definition within the scanned content.
///
/// As filled in by `EntityScanner::build_index`, `start` is the offset of the leading `#` and
/// `end` is the offset just past the terminating `;`.
pub type EntityIndex = FxHashMap<u32, (usize, usize)>;
14
/// Fast entity scanner for IFC files
///
/// Walks the text one entity definition at a time. It uses `memchr` to jump between `#`
/// characters and only locates entity boundaries; attribute values are not parsed.
pub struct EntityScanner<'a> {
    /// The complete text being scanned; returned type names are slices of it.
    content: &'a str,
    /// Byte offset into `content` at which the next search resumes.
    pos: usize,
}
23
24impl<'a> EntityScanner<'a> {
25    /// Create a new scanner for the given content
26    pub fn new(content: &'a str) -> Self {
27        // Skip header section (find DATA; line)
28        let pos = content.find("DATA;").map(|p| p + 5).unwrap_or(0);
29
30        Self { content, pos }
31    }
32
33    /// Scan to find the next entity
34    ///
35    /// Returns (id, type_name, start_byte, end_byte)
36    pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
37        let bytes = self.content.as_bytes();
38
39        // Find next # (entity start)
40        while self.pos < bytes.len() {
41            // Use memchr for fast # search
42            let hash_pos = memchr(b'#', &bytes[self.pos..])?;
43            self.pos += hash_pos;
44
45            // Check if this is an entity definition (not a reference in attributes)
46            // Entity definitions are at the start of a line (after whitespace/newline)
47            let is_entity_start = self.pos == 0
48                || bytes[self.pos - 1] == b'\n'
49                || bytes[self.pos - 1] == b'\r'
50                || bytes[self.pos - 1] == b';';
51
52            if !is_entity_start {
53                self.pos += 1;
54                continue;
55            }
56
57            let start = self.pos;
58
59            // Parse entity ID
60            self.pos += 1; // Skip #
61            let id_start = self.pos;
62
63            while self.pos < bytes.len() && bytes[self.pos].is_ascii_digit() {
64                self.pos += 1;
65            }
66
67            if self.pos == id_start {
68                // No digits found
69                continue;
70            }
71
72            let id: u32 = self.content[id_start..self.pos].parse().ok()?;
73
74            // Skip whitespace and =
75            while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
76                self.pos += 1;
77            }
78
79            if self.pos >= bytes.len() || bytes[self.pos] != b'=' {
80                continue;
81            }
82            self.pos += 1; // Skip =
83
84            // Skip whitespace
85            while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
86                self.pos += 1;
87            }
88
89            // Parse type name
90            let type_start = self.pos;
91            while self.pos < bytes.len()
92                && (bytes[self.pos].is_ascii_alphanumeric() || bytes[self.pos] == b'_')
93            {
94                self.pos += 1;
95            }
96
97            if self.pos == type_start {
98                continue;
99            }
100
101            let type_name = &self.content[type_start..self.pos];
102
103            // Find end of entity (semicolon, but handle strings)
104            let end = self.find_entity_end()?;
105
106            return Some((id, type_name, start, end));
107        }
108
109        None
110    }
111
112    /// Find the end of an entity (semicolon), handling quoted strings
113    fn find_entity_end(&mut self) -> Option<usize> {
114        let bytes = self.content.as_bytes();
115        let mut in_string = false;
116
117        while self.pos < bytes.len() {
118            match bytes[self.pos] {
119                b'\'' => {
120                    // Check for escaped quote ''
121                    if in_string && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'\'' {
122                        self.pos += 2;
123                        continue;
124                    }
125                    in_string = !in_string;
126                }
127                b';' if !in_string => {
128                    self.pos += 1;
129                    return Some(self.pos);
130                }
131                _ => {}
132            }
133            self.pos += 1;
134        }
135
136        None
137    }
138
139    /// Build an index of all entities (ID -> byte offsets)
140    pub fn build_index(content: &'a str) -> EntityIndex {
141        let mut scanner = Self::new(content);
142        let mut index = FxHashMap::default();
143
144        while let Some((id, _, start, end)) = scanner.next_entity() {
145            index.insert(id, (start, end));
146        }
147
148        index
149    }
150
151    /// Count entities by type
152    pub fn count_by_type(content: &'a str) -> FxHashMap<String, usize> {
153        let mut scanner = Self::new(content);
154        let mut counts: FxHashMap<String, usize> = FxHashMap::default();
155
156        while let Some((_, type_name, _, _)) = scanner.next_entity() {
157            *counts.entry(type_name.to_uppercase()).or_insert(0) += 1;
158        }
159
160        counts
161    }
162
163    /// Find all entities of a specific type
164    pub fn find_by_type(content: &'a str, target_type: &str) -> Vec<(u32, usize, usize)> {
165        let mut scanner = Self::new(content);
166        let mut results = Vec::new();
167        let target_upper = target_type.to_uppercase();
168
169        while let Some((id, type_name, start, end)) = scanner.next_entity() {
170            if type_name.eq_ignore_ascii_case(&target_upper) {
171                results.push((id, start, end));
172            }
173        }
174
175        results
176    }
177
178    /// Get total entity count
179    pub fn entity_count(content: &'a str) -> usize {
180        let mut scanner = Self::new(content);
181        let mut count = 0;
182
183        while scanner.next_entity().is_some() {
184            count += 1;
185        }
186
187        count
188    }
189}
190
191/// Parse the header section to extract metadata
192pub fn parse_header(content: &str) -> HeaderInfo {
193    let mut info = HeaderInfo::default();
194
195    // Find HEADER section
196    let header_start = content.find("HEADER;").unwrap_or(0);
197    let header_end = content.find("ENDSEC;").unwrap_or(content.len());
198    let header = &content[header_start..header_end];
199
200    // Extract FILE_SCHEMA
201    if let Some(schema_start) = header.find("FILE_SCHEMA") {
202        if let Some(paren_start) = header[schema_start..].find("((") {
203            let start = schema_start + paren_start + 2;
204            if let Some(paren_end) = header[start..].find("))") {
205                let schema_list = &header[start..start + paren_end];
206                // Extract first schema (usually the only one)
207                if let Some(quote_start) = schema_list.find('\'') {
208                    if let Some(quote_end) = schema_list[quote_start + 1..].find('\'') {
209                        info.schema_version =
210                            schema_list[quote_start + 1..quote_start + 1 + quote_end].to_string();
211                    }
212                }
213            }
214        }
215    }
216
217    // Extract FILE_NAME
218    if let Some(name_start) = header.find("FILE_NAME") {
219        // FILE_NAME(name, timestamp, author, organization, preprocessor, originating_system, authorization)
220        if let Some(paren_start) = header[name_start..].find('(') {
221            let start = name_start + paren_start + 1;
222            // Parse first argument (file name)
223            if let Some((file_name, rest)) = parse_header_string(&header[start..]) {
224                info.file_name = Some(file_name);
225
226                // Parse timestamp
227                if let Some(comma) = rest.find(',') {
228                    if let Some((timestamp, rest2)) = parse_header_string(&rest[comma + 1..]) {
229                        info.timestamp = Some(timestamp);
230
231                        // Parse author (list)
232                        if let Some(comma2) = rest2.find(',') {
233                            if let Some((author, rest3)) = parse_header_list(&rest2[comma2 + 1..]) {
234                                info.author = author.first().cloned();
235
236                                // Parse organization (list)
237                                if let Some(comma3) = rest3.find(',') {
238                                    if let Some((org, rest4)) =
239                                        parse_header_list(&rest3[comma3 + 1..])
240                                    {
241                                        info.organization = org.first().cloned();
242
243                                        // Parse preprocessor_version
244                                        if let Some(comma4) = rest4.find(',') {
245                                            if let Some((preproc, rest5)) =
246                                                parse_header_string(&rest4[comma4 + 1..])
247                                            {
248                                                info.preprocessor_version = Some(preproc);
249
250                                                // Parse originating_system
251                                                if let Some(comma5) = rest5.find(',') {
252                                                    if let Some((orig_sys, _)) =
253                                                        parse_header_string(&rest5[comma5 + 1..])
254                                                    {
255                                                        info.originating_system = Some(orig_sys);
256                                                    }
257                                                }
258                                            }
259                                        }
260                                    }
261                                }
262                            }
263                        }
264                    }
265                }
266            }
267        }
268    }
269
270    info
271}
272
/// Parse one header value: either a single-quoted string or the `$` placeholder.
///
/// Leading whitespace is skipped. Inside a string a doubled quote (`''`) stands for one
/// literal quote and is unescaped in the returned value. `$` is treated as an empty value
/// and yields an empty string.
///
/// Returns the decoded value together with the input that follows it (after the closing
/// quote or the `$`). Returns `None` when the input starts with neither a quote nor `$`, or
/// when a quoted string is never closed.
fn parse_header_string(s: &str) -> Option<(String, &str)> {
    let s = s.trim_start();
    if let Some(rest) = s.strip_prefix('$') {
        return Some((String::new(), rest));
    }

    let body = s.strip_prefix('\'')?;
    let bytes = body.as_bytes();

    // Locate the closing quote, stepping over escaped `''` pairs. Running off the end means
    // the string was never terminated; report that as `None` instead of slicing past the end
    // of the input, which would panic.
    let mut end = 0;
    loop {
        match *bytes.get(end)? {
            b'\'' if bytes.get(end + 1) == Some(&b'\'') => end += 2,
            b'\'' => break,
            _ => end += 1,
        }
    }

    Some((body[..end].replace("''", "'"), &body[end + 1..]))
}
300
301/// Parse a list from header (('value1', 'value2'))
302fn parse_header_list(s: &str) -> Option<(Vec<String>, &str)> {
303    let s = s.trim_start();
304    if !s.starts_with('(') {
305        return Some((Vec::new(), s));
306    }
307
308    let mut items = Vec::new();
309    let mut current = &s[1..]; // Skip opening paren
310
311    loop {
312        current = current.trim_start();
313        if let Some(stripped) = current.strip_prefix(')') {
314            return Some((items, stripped));
315        }
316
317        if let Some((item, rest)) = parse_header_string(current) {
318            if !item.is_empty() {
319                items.push(item);
320            }
321            current = rest.trim_start();
322            if current.starts_with(',') {
323                current = &current[1..];
324            }
325        } else {
326            // Skip unknown content
327            if let Some(pos) = current.find([',', ')']) {
328                current = &current[pos..];
329                if current.starts_with(',') {
330                    current = &current[1..];
331                }
332            } else {
333                break;
334            }
335        }
336    }
337
338    Some((items, current))
339}
340
/// Header information extracted from IFC file
///
/// Filled in by `parse_header`. A field keeps its `Default` value when the corresponding
/// header entry is missing or could not be parsed.
#[derive(Clone, Debug, Default)]
pub struct HeaderInfo {
    /// First quoted schema name found in `FILE_SCHEMA`; empty when none was found.
    pub schema_version: String,
    /// First `FILE_NAME` argument: the file name.
    pub file_name: Option<String>,
    /// Second `FILE_NAME` argument: the timestamp, stored as the raw string.
    pub timestamp: Option<String>,
    /// Third `FILE_NAME` argument: first non-empty entry of the author list.
    pub author: Option<String>,
    /// Fourth `FILE_NAME` argument: first non-empty entry of the organization list.
    pub organization: Option<String>,
    /// Fifth `FILE_NAME` argument: the preprocessor version.
    pub preprocessor_version: Option<String>,
    /// Sixth `FILE_NAME` argument: the originating system.
    pub originating_system: Option<String>,
}
352
#[cfg(test)]
mod tests {
    use super::*;

    // Small but complete IFC file: a header followed by four entities.
    const TEST_IFC: &str = r#"ISO-10303-21;
HEADER;
FILE_DESCRIPTION(('ViewDefinition [CoordinationView]'),'2;1');
FILE_NAME('test.ifc','2024-01-01T00:00:00',('Author'),('Org'),'Preprocessor','App','');
FILE_SCHEMA(('IFC2X3'));
ENDSEC;
DATA;
#1=IFCPROJECT('guid',$,'Project',$,$,$,$,$,#2);
#2=IFCUNITASSIGNMENT((#3));
#3=IFCSIUNIT(*,.LENGTHUNIT.,.MILLI.,.METRE.);
#4=IFCWALL('guid',$,'Wall 1',$,$,#5,#6,$);
ENDSEC;
END-ISO-10303-21;
"#;

    #[test]
    fn test_scanner_finds_entities() {
        let mut scanner = EntityScanner::new(TEST_IFC);
        let mut entities = Vec::new();

        while let Some((id, type_name, _, _)) = scanner.next_entity() {
            entities.push((id, type_name.to_string()));
        }

        assert_eq!(entities.len(), 4);
        assert_eq!(entities[0], (1, "IFCPROJECT".to_string()));
        assert_eq!(entities[3], (4, "IFCWALL".to_string()));
    }

    #[test]
    fn test_scanner_reports_byte_ranges() {
        let mut scanner = EntityScanner::new(TEST_IFC);
        let (id, type_name, start, end) = scanner.next_entity().expect("first entity");

        assert_eq!((id, type_name), (1, "IFCPROJECT"));
        // The range covers the whole statement, from '#' through the terminating ';'.
        assert_eq!(
            &TEST_IFC[start..end],
            "#1=IFCPROJECT('guid',$,'Project',$,$,$,$,$,#2);"
        );
    }

    #[test]
    fn test_build_index() {
        let index = EntityScanner::build_index(TEST_IFC);
        assert_eq!(index.len(), 4);
        assert!(index.contains_key(&1));
        assert!(index.contains_key(&4));

        let (start, end) = index[&4];
        assert_eq!(
            &TEST_IFC[start..end],
            "#4=IFCWALL('guid',$,'Wall 1',$,$,#5,#6,$);"
        );
    }

    #[test]
    fn test_count_by_type() {
        let counts = EntityScanner::count_by_type(TEST_IFC);
        assert_eq!(counts.get("IFCPROJECT"), Some(&1));
        assert_eq!(counts.get("IFCWALL"), Some(&1));
        assert_eq!(counts.get("IFCSIUNIT"), Some(&1));
        assert_eq!(counts.get("IFCDOOR"), None);
    }

    #[test]
    fn test_find_by_type_ignores_case() {
        let walls = EntityScanner::find_by_type(TEST_IFC, "ifcwall");
        assert_eq!(walls.len(), 1);

        let (id, start, end) = walls[0];
        assert_eq!(id, 4);
        assert_eq!(
            &TEST_IFC[start..end],
            "#4=IFCWALL('guid',$,'Wall 1',$,$,#5,#6,$);"
        );

        assert!(EntityScanner::find_by_type(TEST_IFC, "IFCDOOR").is_empty());
    }

    #[test]
    fn test_entity_count() {
        assert_eq!(EntityScanner::entity_count(TEST_IFC), 4);
        assert_eq!(EntityScanner::entity_count(""), 0);
    }

    #[test]
    fn test_parse_header() {
        let info = parse_header(TEST_IFC);
        assert_eq!(info.schema_version, "IFC2X3");
        assert_eq!(info.file_name, Some("test.ifc".to_string()));
        assert_eq!(info.timestamp.as_deref(), Some("2024-01-01T00:00:00"));
        assert_eq!(info.author.as_deref(), Some("Author"));
        assert_eq!(info.organization.as_deref(), Some("Org"));
        assert_eq!(info.preprocessor_version.as_deref(), Some("Preprocessor"));
        assert_eq!(info.originating_system.as_deref(), Some("App"));
    }
}
407}