ifc_lite_core/
parser.rs

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! STEP/IFC Parser using nom
//!
//! Zero-copy tokenization and fast entity scanning.
8
9use nom::{
10    branch::alt,
11    bytes::complete::{take_while, take_while1},
12    character::complete::{char, digit1, one_of},
13    combinator::{map, map_res, opt, recognize},
14    multi::separated_list0,
15    sequence::{delimited, pair, preceded, tuple},
16    IResult,
17};
18
19use crate::error::{Error, Result};
20use crate::schema::IfcType;
21
/// One value from a STEP/IFC argument list.
///
/// Variants that carry text borrow it from the parsed input (lifetime `'a`)
/// instead of copying it.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    /// Reference to another entity by numeric id, written `#123`.
    EntityRef(u32),
    /// Text of a quoted string literal such as `'text'`, without the
    /// surrounding quotes.
    String(&'a str),
    /// Whole number such as `42`.
    Integer(i64),
    /// Real number such as `3.14`.
    Float(f64),
    /// Name of a dot-delimited enumeration value such as `.TRUE.`,
    /// `.FALSE.` or `.UNKNOWN.`, without the dots.
    Enum(&'a str),
    /// Parenthesised, comma-separated sequence such as `(1, 2, 3)`;
    /// lists may nest.
    List(Vec<Token<'a>>),
    /// Named wrapper around an argument list, such as
    /// `IFCPARAMETERVALUE(0.)` or `IFCBOOLEAN(.T.)`: the type name followed
    /// by its arguments.
    TypedValue(&'a str, Vec<Token<'a>>),
    /// The `$` marker (null value).
    Null,
    /// The `*` marker (derived value).
    Derived,
}
44
45/// Parse entity reference: #123
46fn entity_ref(input: &str) -> IResult<&str, Token> {
47    map(
48        preceded(
49            char('#'),
50            map_res(digit1, |s: &str| s.parse::<u32>())
51        ),
52        Token::EntityRef
53    )(input)
54}
55
56/// Parse string literal: 'text' or "text"
57/// IFC uses '' to escape a single quote within a string
58/// Uses memchr for SIMD-accelerated quote searching
59fn string_literal(input: &str) -> IResult<&str, Token> {
60    // Helper to parse string content with escaped quotes - SIMD optimized
61    #[inline]
62    fn parse_string_content(input: &str, quote_byte: u8) -> IResult<&str, &str> {
63        let bytes = input.as_bytes();
64        let mut pos = 0;
65
66        // Use memchr for SIMD-accelerated searching
67        while let Some(found) = memchr::memchr(quote_byte, &bytes[pos..]) {
68            let idx = pos + found;
69            // Check if it's an escaped quote (doubled)
70            if idx + 1 < bytes.len() && bytes[idx + 1] == quote_byte {
71                pos = idx + 2; // Skip escaped quote pair
72                continue;
73            }
74            // End of string found
75            return Ok((&input[idx..], &input[..idx]));
76        }
77
78        // No closing quote found
79        Err(nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Char)))
80    }
81
82    alt((
83        map(
84            delimited(
85                char('\''),
86                |i| parse_string_content(i, b'\''),
87                char('\'')
88            ),
89            Token::String
90        ),
91        map(
92            delimited(
93                char('"'),
94                |i| parse_string_content(i, b'"'),
95                char('"')
96            ),
97            Token::String
98        ),
99    ))(input)
100}
101
102/// Parse integer: 42, -42
103/// Uses lexical-core for 10x faster parsing
104#[inline]
105fn integer(input: &str) -> IResult<&str, Token> {
106    map_res(
107        recognize(
108            tuple((
109                opt(char('-')),
110                digit1,
111            ))
112        ),
113        |s: &str| lexical_core::parse::<i64>(s.as_bytes())
114            .map(Token::Integer)
115            .map_err(|_| "parse error")
116    )(input)
117}
118
119/// Parse float: 3.14, -3.14, 1.5E-10, 0., 1.
120/// IFC allows floats like "0." without decimal digits
121/// Uses lexical-core for 10x faster parsing
122#[inline]
123fn float(input: &str) -> IResult<&str, Token> {
124    map_res(
125        recognize(
126            tuple((
127                opt(char('-')),
128                digit1,
129                char('.'),
130                opt(digit1),  // Made optional to support "0." format
131                opt(tuple((
132                    one_of("eE"),
133                    opt(one_of("+-")),
134                    digit1,
135                ))),
136            ))
137        ),
138        |s: &str| lexical_core::parse::<f64>(s.as_bytes())
139            .map(Token::Float)
140            .map_err(|_| "parse error")
141    )(input)
142}
143
144/// Parse enum: .TRUE., .FALSE., .UNKNOWN., .ELEMENT.
145fn enum_value(input: &str) -> IResult<&str, Token> {
146    map(
147        delimited(
148            char('.'),
149            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
150            char('.')
151        ),
152        Token::Enum
153    )(input)
154}
155
156/// Parse null: $
157fn null(input: &str) -> IResult<&str, Token> {
158    map(char('$'), |_| Token::Null)(input)
159}
160
161/// Parse derived: *
162fn derived(input: &str) -> IResult<&str, Token> {
163    map(char('*'), |_| Token::Derived)(input)
164}
165
166/// Parse typed value: IFCPARAMETERVALUE(0.), IFCBOOLEAN(.T.)
167fn typed_value(input: &str) -> IResult<&str, Token> {
168    map(
169        pair(
170            // Type name (all caps with optional numbers/underscores)
171            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
172            // Arguments
173            delimited(
174                char('('),
175                separated_list0(
176                    delimited(ws, char(','), ws),
177                    token
178                ),
179                char(')')
180            )
181        ),
182        |(type_name, args)| Token::TypedValue(type_name, args)
183    )(input)
184}
185
186/// Skip whitespace
187fn ws(input: &str) -> IResult<&str, ()> {
188    map(
189        take_while(|c: char| c.is_whitespace()),
190        |_| ()
191    )(input)
192}
193
194/// Parse a token with optional surrounding whitespace
195/// Optimized ordering: test cheapest patterns first (single-char markers)
196fn token(input: &str) -> IResult<&str, Token> {
197    delimited(
198        ws,
199        alt((
200            // Single-char markers first (O(1) check)
201            null,         // $
202            derived,      // *
203            entity_ref,   // # + digits
204            // Then by complexity
205            enum_value,   // .XXX.
206            string_literal, // 'xxx'
207            list,         // (...)
208            // Numbers: float before integer since float includes '.'
209            float,
210            integer,
211            typed_value,  // IFCPARAMETERVALUE(0.) - most expensive, last
212        )),
213        ws
214    )(input)
215}
216
217/// Parse list: (1, 2, 3) or nested lists
218fn list(input: &str) -> IResult<&str, Token> {
219    map(
220        delimited(
221            char('('),
222            separated_list0(
223                delimited(ws, char(','), ws),
224                token
225            ),
226            char(')')
227        ),
228        Token::List
229    )(input)
230}
231
232/// Parse a complete entity line
233/// Example: #123=IFCWALL('guid','owner',$,$,'name',$,$,$);
234pub fn parse_entity(input: &str) -> Result<(u32, IfcType, Vec<Token>)> {
235    let result: IResult<&str, (u32, &str, Vec<Token>)> = tuple((
236        // Entity ID: #123
237        delimited(
238            ws,
239            preceded(
240                char('#'),
241                map_res(digit1, |s: &str| s.parse::<u32>())
242            ),
243            ws
244        ),
245        // Equals sign
246        preceded(
247            char('='),
248            // Entity type: IFCWALL
249            delimited(
250                ws,
251                take_while1(|c: char| c.is_alphanumeric() || c == '_'),
252                ws
253            )
254        ),
255        // Arguments: ('guid', 'owner', ...)
256        delimited(
257            char('('),
258            separated_list0(
259                delimited(ws, char(','), ws),
260                token
261            ),
262            tuple((char(')'), ws, char(';')))
263        ),
264    ))(input);
265
266    match result {
267        Ok((_, (id, type_str, args))) => {
268            let ifc_type = IfcType::from_str(type_str);
269            Ok((id, ifc_type, args))
270        }
271        Err(e) => Err(Error::parse(0, format!("Failed to parse entity: {}", e))),
272    }
273}
274
/// Lightweight scanner that locates entity statements without parsing
/// their arguments.
///
/// It walks the input once from front to back, using `memchr` for the byte
/// searches, and remembers how far it has got between calls.
pub struct EntityScanner<'a> {
    /// The text being scanned.
    content: &'a str,
    /// Byte view of `content`, used for the byte-level searches.
    bytes: &'a [u8],
    /// Byte offset at which the next scan starts.
    position: usize,
}
283
284impl<'a> EntityScanner<'a> {
285    /// Create a new scanner
286    pub fn new(content: &'a str) -> Self {
287        Self {
288            content,
289            bytes: content.as_bytes(),
290            position: 0,
291        }
292    }
293
294    /// Scan for the next entity
295    /// Returns (entity_id, type_name, line_start, line_end)
296    #[inline]
297    pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
298        let remaining = &self.bytes[self.position..];
299
300        // Find next '#' that starts an entity using SIMD-accelerated search
301        let start_offset = memchr::memchr(b'#', remaining)?;
302        let line_start = self.position + start_offset;
303
304        // Find the end of the line (semicolon) using SIMD
305        let line_content = &self.bytes[line_start..];
306        let end_offset = memchr::memchr(b';', line_content)?;
307        let line_end = line_start + end_offset + 1;
308
309        // Parse entity ID (inline for speed)
310        let id_start = line_start + 1;
311        let mut id_end = id_start;
312        while id_end < line_end && self.bytes[id_end].is_ascii_digit() {
313            id_end += 1;
314        }
315
316        // Fast integer parsing without allocation
317        let id = self.parse_u32_fast(id_start, id_end)?;
318
319        // Find '=' after ID using SIMD
320        let eq_search = &self.bytes[id_end..line_end];
321        let eq_offset = memchr::memchr(b'=', eq_search)?;
322        let mut type_start = id_end + eq_offset + 1;
323
324        // Skip whitespace (inline)
325        while type_start < line_end && self.bytes[type_start].is_ascii_whitespace() {
326            type_start += 1;
327        }
328
329        // Find end of type name (at '(' or whitespace)
330        let mut type_end = type_start;
331        while type_end < line_end {
332            let b = self.bytes[type_end];
333            if b == b'(' || b.is_ascii_whitespace() {
334                break;
335            }
336            type_end += 1;
337        }
338
339        // Safe because IFC files are ASCII
340        let type_name = unsafe { std::str::from_utf8_unchecked(&self.bytes[type_start..type_end]) };
341
342        // Move position past this entity
343        self.position = line_end;
344
345        Some((id, type_name, line_start, line_end))
346    }
347
348    /// Fast u32 parsing without string allocation
349    #[inline]
350    fn parse_u32_fast(&self, start: usize, end: usize) -> Option<u32> {
351        let mut result: u32 = 0;
352        for i in start..end {
353            let digit = self.bytes[i].wrapping_sub(b'0');
354            if digit > 9 {
355                return None;
356            }
357            result = result.wrapping_mul(10).wrapping_add(digit as u32);
358        }
359        Some(result)
360    }
361
362    /// Find all entities of a specific type
363    pub fn find_by_type(&mut self, target_type: &str) -> Vec<(u32, usize, usize)> {
364        let mut results = Vec::new();
365
366        while let Some((id, type_name, start, end)) = self.next_entity() {
367            if type_name.eq_ignore_ascii_case(target_type) {
368                results.push((id, start, end));
369            }
370        }
371
372        results
373    }
374
375    /// Count entities by type
376    pub fn count_by_type(&mut self) -> rustc_hash::FxHashMap<String, usize> {
377        let mut counts = rustc_hash::FxHashMap::default();
378
379        while let Some((_, type_name, _, _)) = self.next_entity() {
380            *counts.entry(type_name.to_string()).or_insert(0) += 1;
381        }
382
383        counts
384    }
385
386    /// Reset scanner to beginning
387    pub fn reset(&mut self) {
388        self.position = 0;
389    }
390}
391
/// Unit tests for the token parsers, `parse_entity` and `EntityScanner`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_entity_ref() {
        assert_eq!(entity_ref("#123"), Ok(("", Token::EntityRef(123))));
        assert_eq!(entity_ref("#0"), Ok(("", Token::EntityRef(0))));
        assert!(entity_ref("123").is_err());
    }

    #[test]
    fn test_string_literal() {
        assert_eq!(string_literal("'hello'"), Ok(("", Token::String("hello"))));
        assert_eq!(
            string_literal("'with spaces'"),
            Ok(("", Token::String("with spaces")))
        );
        assert_eq!(string_literal("\"hi\""), Ok(("", Token::String("hi"))));
    }

    #[test]
    fn test_string_literal_escaped_quote() {
        // A doubled quote is an escape; the raw text is returned unchanged.
        assert_eq!(string_literal("'it''s'"), Ok(("", Token::String("it''s"))));
        // A literal with no closing quote is rejected.
        assert!(string_literal("'open").is_err());
    }

    #[test]
    fn test_integer() {
        assert_eq!(integer("42"), Ok(("", Token::Integer(42))));
        assert_eq!(integer("-42"), Ok(("", Token::Integer(-42))));
        assert_eq!(integer("0"), Ok(("", Token::Integer(0))));
    }

    #[test]
    fn test_float() {
        assert_eq!(float("3.14"), Ok(("", Token::Float(3.14))));
        assert_eq!(float("-3.14"), Ok(("", Token::Float(-3.14))));
        assert_eq!(float("1.5E-10"), Ok(("", Token::Float(1.5e-10))));
    }

    #[test]
    fn test_float_without_fraction_digits() {
        assert_eq!(float("0."), Ok(("", Token::Float(0.0))));
        assert_eq!(float("1."), Ok(("", Token::Float(1.0))));
        // Without a decimal point the input is not a float.
        assert!(float("42").is_err());
    }

    #[test]
    fn test_enum() {
        assert_eq!(enum_value(".TRUE."), Ok(("", Token::Enum("TRUE"))));
        assert_eq!(enum_value(".FALSE."), Ok(("", Token::Enum("FALSE"))));
        assert_eq!(enum_value(".ELEMENT."), Ok(("", Token::Enum("ELEMENT"))));
    }

    #[test]
    fn test_null_and_derived() {
        assert_eq!(token("$"), Ok(("", Token::Null)));
        assert_eq!(token("*"), Ok(("", Token::Derived)));
        // Surrounding whitespace is consumed by `token`.
        assert_eq!(token("  $  "), Ok(("", Token::Null)));
    }

    #[test]
    fn test_typed_value() {
        assert_eq!(
            typed_value("IFCBOOLEAN(.T.)"),
            Ok(("", Token::TypedValue("IFCBOOLEAN", vec![Token::Enum("T")])))
        );
        assert_eq!(
            typed_value("IFCPARAMETERVALUE(0.)"),
            Ok((
                "",
                Token::TypedValue("IFCPARAMETERVALUE", vec![Token::Float(0.0)])
            ))
        );
    }

    #[test]
    fn test_list() {
        assert_eq!(
            list("(1,2,3)"),
            Ok((
                "",
                Token::List(vec![
                    Token::Integer(1),
                    Token::Integer(2),
                    Token::Integer(3),
                ])
            ))
        );
    }

    #[test]
    fn test_nested_list() {
        assert_eq!(
            list("(1,(2,3),4)"),
            Ok((
                "",
                Token::List(vec![
                    Token::Integer(1),
                    Token::List(vec![Token::Integer(2), Token::Integer(3)]),
                    Token::Integer(4),
                ])
            ))
        );
    }

    #[test]
    fn test_parse_entity() {
        let input = "#123=IFCWALL('guid','owner',$,$,'name',$,$,$);";
        let result = parse_entity(input);
        assert!(result.is_ok());
        let (id, ifc_type, args) = result.unwrap();
        assert_eq!(id, 123);
        assert_eq!(ifc_type, IfcType::IfcWall);
        assert_eq!(args.len(), 8);
    }

    #[test]
    fn test_parse_entity_rejects_malformed_input() {
        assert!(parse_entity("not an entity").is_err());
        // Missing terminating semicolon.
        assert!(parse_entity("#1=IFCWALL($)").is_err());
    }

    #[test]
    fn test_parse_entity_with_nested_list() {
        let input = "#9=IFCDIRECTION((0.,0.,1.));";
        let result = parse_entity(input);

        assert!(result.is_ok(), "Failed to parse: {:?}", result);
        let (id, _ifc_type, args) = result.unwrap();
        assert_eq!(id, 9);
        // The single argument is a list of three floats.
        assert_eq!(
            args,
            vec![Token::List(vec![
                Token::Float(0.0),
                Token::Float(0.0),
                Token::Float(1.0),
            ])]
        );
    }

    #[test]
    fn test_entity_scanner() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,$,$);
#3=IFCDOOR('guid3',$,$,$,$,$,$,$);
#4=IFCWALL('guid4',$,$,$,$,$,$,$);
"#;

        let mut scanner = EntityScanner::new(content);

        // next_entity yields the first definition and its source span.
        let (id, type_name, start, end) = scanner.next_entity().unwrap();
        assert_eq!(id, 1);
        assert_eq!(type_name, "IFCPROJECT");
        assert_eq!(
            &content[start..end],
            "#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);"
        );

        // find_by_type
        scanner.reset();
        let walls = scanner.find_by_type("IFCWALL");
        assert_eq!(walls.len(), 2);
        assert_eq!(walls[0].0, 2);
        assert_eq!(walls[1].0, 4);

        // count_by_type
        scanner.reset();
        let counts = scanner.count_by_type();
        assert_eq!(counts.get("IFCPROJECT"), Some(&1));
        assert_eq!(counts.get("IFCWALL"), Some(&2));
        assert_eq!(counts.get("IFCDOOR"), Some(&1));
    }
}