// ifc_lite_core/parser.rs

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! STEP/IFC Parser using nom
//!
//! Zero-copy tokenization and fast entity scanning.

use nom::{
    branch::alt,
    bytes::complete::{take_while, take_while1},
    character::complete::{char, digit1, one_of},
    combinator::{map, map_res, opt, recognize},
    multi::separated_list0,
    sequence::{delimited, pair, preceded, tuple},
    IResult,
};

use crate::error::{Error, Result};
use crate::generated::IfcType;

/// A single value from a STEP/IFC argument list.
///
/// Text-carrying variants borrow directly from the parsed input (`'a`), so
/// tokenizing does not copy string data.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    /// Reference to another entity, written `#123`.
    EntityRef(u32),
    /// String literal such as `'text'`: the raw slice between the quotes.
    /// Doubled quotes (`''`) are left as they appear in the input.
    String(&'a str),
    /// Integer such as `42`.
    Integer(i64),
    /// Floating-point number such as `3.14`.
    Float(f64),
    /// Enumeration such as `.TRUE.`, `.FALSE.`, `.UNKNOWN.`: the name between the dots.
    Enum(&'a str),
    /// Parenthesised list such as `(1, 2, 3)`; may nest.
    List(Vec<Token<'a>>),
    /// Typed value such as `IFCPARAMETERVALUE(0.)` or `IFCBOOLEAN(.T.)`:
    /// the type name and its arguments.
    TypedValue(&'a str, Vec<Token<'a>>),
    /// Null value, written `$`.
    Null,
    /// Derived value, written `*`.
    Derived,
}

45/// Parse entity reference: #123
46fn entity_ref(input: &str) -> IResult<&str, Token<'_>> {
47    map(
48        preceded(char('#'), map_res(digit1, |s: &str| s.parse::<u32>())),
49        Token::EntityRef,
50    )(input)
51}
52
53/// Parse string literal: 'text' or "text"
54/// IFC uses '' to escape a single quote within a string
55/// Uses memchr for SIMD-accelerated quote searching
56fn string_literal(input: &str) -> IResult<&str, Token<'_>> {
57    // Helper to parse string content with escaped quotes - SIMD optimized
58    #[inline]
59    fn parse_string_content(input: &str, quote_byte: u8) -> IResult<&str, &str> {
60        let bytes = input.as_bytes();
61        let mut pos = 0;
62
63        // Use memchr for SIMD-accelerated searching
64        while let Some(found) = memchr::memchr(quote_byte, &bytes[pos..]) {
65            let idx = pos + found;
66            // Check if it's an escaped quote (doubled)
67            if idx + 1 < bytes.len() && bytes[idx + 1] == quote_byte {
68                pos = idx + 2; // Skip escaped quote pair
69                continue;
70            }
71            // End of string found
72            return Ok((&input[idx..], &input[..idx]));
73        }
74
75        // No closing quote found
76        Err(nom::Err::Error(nom::error::Error::new(
77            input,
78            nom::error::ErrorKind::Char,
79        )))
80    }
81
82    alt((
83        map(
84            delimited(char('\''), |i| parse_string_content(i, b'\''), char('\'')),
85            Token::String,
86        ),
87        map(
88            delimited(char('"'), |i| parse_string_content(i, b'"'), char('"')),
89            Token::String,
90        ),
91    ))(input)
92}
93
94/// Parse integer: 42, -42
95/// Uses lexical-core for 10x faster parsing
96#[inline]
97fn integer(input: &str) -> IResult<&str, Token<'_>> {
98    map_res(recognize(tuple((opt(char('-')), digit1))), |s: &str| {
99        lexical_core::parse::<i64>(s.as_bytes())
100            .map(Token::Integer)
101            .map_err(|_| "parse error")
102    })(input)
103}
104
105/// Parse float: 3.14, -3.14, 1.5E-10, 0., 1.
106/// IFC allows floats like "0." without decimal digits
107/// Uses lexical-core for 10x faster parsing
108#[inline]
109fn float(input: &str) -> IResult<&str, Token<'_>> {
110    map_res(
111        recognize(tuple((
112            opt(char('-')),
113            digit1,
114            char('.'),
115            opt(digit1), // Made optional to support "0." format
116            opt(tuple((one_of("eE"), opt(one_of("+-")), digit1))),
117        ))),
118        |s: &str| {
119            lexical_core::parse::<f64>(s.as_bytes())
120                .map(Token::Float)
121                .map_err(|_| "parse error")
122        },
123    )(input)
124}
125
126/// Parse enum: .TRUE., .FALSE., .UNKNOWN., .ELEMENT.
127fn enum_value(input: &str) -> IResult<&str, Token<'_>> {
128    map(
129        delimited(
130            char('.'),
131            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
132            char('.'),
133        ),
134        Token::Enum,
135    )(input)
136}
137
138/// Parse null: $
139fn null(input: &str) -> IResult<&str, Token<'_>> {
140    map(char('$'), |_| Token::Null)(input)
141}
142
143/// Parse derived: *
144fn derived(input: &str) -> IResult<&str, Token<'_>> {
145    map(char('*'), |_| Token::Derived)(input)
146}
147
148/// Parse typed value: IFCPARAMETERVALUE(0.), IFCBOOLEAN(.T.)
149fn typed_value(input: &str) -> IResult<&str, Token<'_>> {
150    map(
151        pair(
152            // Type name (all caps with optional numbers/underscores)
153            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
154            // Arguments
155            delimited(
156                char('('),
157                separated_list0(delimited(ws, char(','), ws), token),
158                char(')'),
159            ),
160        ),
161        |(type_name, args)| Token::TypedValue(type_name, args),
162    )(input)
163}
164
165/// Skip whitespace
166fn ws(input: &str) -> IResult<&str, ()> {
167    map(take_while(|c: char| c.is_whitespace()), |_| ())(input)
168}
169
170/// Parse a token with optional surrounding whitespace
171/// Optimized ordering: test cheapest patterns first (single-char markers)
172fn token(input: &str) -> IResult<&str, Token<'_>> {
173    delimited(
174        ws,
175        alt((
176            // Single-char markers first (O(1) check)
177            null,       // $
178            derived,    // *
179            entity_ref, // # + digits
180            // Then by complexity
181            enum_value,     // .XXX.
182            string_literal, // 'xxx'
183            list,           // (...)
184            // Numbers: float before integer since float includes '.'
185            float,
186            integer,
187            typed_value, // IFCPARAMETERVALUE(0.) - most expensive, last
188        )),
189        ws,
190    )(input)
191}
192
193/// Parse list: (1, 2, 3) or nested lists
194fn list(input: &str) -> IResult<&str, Token<'_>> {
195    map(
196        delimited(
197            char('('),
198            separated_list0(delimited(ws, char(','), ws), token),
199            char(')'),
200        ),
201        Token::List,
202    )(input)
203}
204
205/// Parse a complete entity line
206/// Example: #123=IFCWALL('guid','owner',$,$,'name',$,$,$);
207pub fn parse_entity(input: &str) -> Result<(u32, IfcType, Vec<Token<'_>>)> {
208    let result: IResult<&str, (u32, &str, Vec<Token>)> = tuple((
209        // Entity ID: #123
210        delimited(
211            ws,
212            preceded(char('#'), map_res(digit1, |s: &str| s.parse::<u32>())),
213            ws,
214        ),
215        // Equals sign
216        preceded(
217            char('='),
218            // Entity type: IFCWALL
219            delimited(
220                ws,
221                take_while1(|c: char| c.is_alphanumeric() || c == '_'),
222                ws,
223            ),
224        ),
225        // Arguments: ('guid', 'owner', ...)
226        delimited(
227            char('('),
228            separated_list0(delimited(ws, char(','), ws), token),
229            tuple((char(')'), ws, char(';'))),
230        ),
231    ))(input);
232
233    match result {
234        Ok((_, (id, type_str, args))) => {
235            let ifc_type = IfcType::from_str(type_str);
236            Ok((id, ifc_type, args))
237        }
238        Err(e) => Err(Error::parse(0, format!("Failed to parse entity: {}", e))),
239    }
240}
241
242/// Fast entity scanner - scans file without full parsing
243/// O(n) performance for finding entities by type
244/// Uses memchr for SIMD-accelerated byte searching
245pub struct EntityScanner<'a> {
246    #[allow(dead_code)]
247    content: &'a str,
248    bytes: &'a [u8],
249    position: usize,
250}
251
252impl<'a> EntityScanner<'a> {
253    /// Create a new scanner
254    pub fn new(content: &'a str) -> Self {
255        Self {
256            content,
257            bytes: content.as_bytes(),
258            position: 0,
259        }
260    }
261
262    /// Scan for the next entity
263    /// Returns (entity_id, type_name, line_start, line_end)
264    #[inline]
265    pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
266        let remaining = &self.bytes[self.position..];
267
268        // Find next '#' that starts an entity using SIMD-accelerated search
269        let start_offset = memchr::memchr(b'#', remaining)?;
270        let line_start = self.position + start_offset;
271
272        // Find the end of the entity (semicolon) while respecting quoted strings
273        // IFC strings use single quotes and can contain semicolons
274        let line_content = &self.bytes[line_start..];
275        let end_offset = self.find_entity_end(line_content)?;
276        let line_end = line_start + end_offset + 1;
277
278        // Parse entity ID (inline for speed)
279        let id_start = line_start + 1;
280        let mut id_end = id_start;
281        while id_end < line_end && self.bytes[id_end].is_ascii_digit() {
282            id_end += 1;
283        }
284
285        // Fast integer parsing without allocation
286        let id = self.parse_u32_fast(id_start, id_end)?;
287
288        // Find '=' after ID using SIMD
289        let eq_search = &self.bytes[id_end..line_end];
290        let eq_offset = memchr::memchr(b'=', eq_search)?;
291        let mut type_start = id_end + eq_offset + 1;
292
293        // Skip whitespace (inline)
294        while type_start < line_end && self.bytes[type_start].is_ascii_whitespace() {
295            type_start += 1;
296        }
297
298        // Find end of type name (at '(' or whitespace)
299        let mut type_end = type_start;
300        while type_end < line_end {
301            let b = self.bytes[type_end];
302            if b == b'(' || b.is_ascii_whitespace() {
303                break;
304            }
305            type_end += 1;
306        }
307
308        // Use safe UTF-8 conversion - malformed input should not cause UB
309        let type_name = std::str::from_utf8(&self.bytes[type_start..type_end]).unwrap_or("UNKNOWN");
310
311        // Move position past this entity
312        self.position = line_end;
313
314        Some((id, type_name, line_start, line_end))
315    }
316
317    /// Fast u32 parsing without string allocation
318    #[inline]
319    fn parse_u32_fast(&self, start: usize, end: usize) -> Option<u32> {
320        let mut result: u32 = 0;
321        for i in start..end {
322            let digit = self.bytes[i].wrapping_sub(b'0');
323            if digit > 9 {
324                return None;
325            }
326            result = result.wrapping_mul(10).wrapping_add(digit as u32);
327        }
328        Some(result)
329    }
330
331    /// Find the terminating semicolon of an entity, skipping over quoted strings.
332    /// IFC strings are enclosed in single quotes ('...') and can contain semicolons.
333    /// Returns the offset of the semicolon from the start of the slice.
334    #[inline]
335    fn find_entity_end(&self, content: &[u8]) -> Option<usize> {
336        let mut pos = 0;
337        let len = content.len();
338        let mut in_string = false;
339
340        while pos < len {
341            let b = content[pos];
342
343            if in_string {
344                if b == b'\'' {
345                    // Check for escaped quote ('') - if next char is also quote, skip both
346                    if pos + 1 < len && content[pos + 1] == b'\'' {
347                        pos += 2; // Skip escaped quote
348                        continue;
349                    }
350                    in_string = false;
351                }
352                pos += 1;
353            } else {
354                match b {
355                    b'\'' => {
356                        in_string = true;
357                        pos += 1;
358                    }
359                    b';' => {
360                        return Some(pos);
361                    }
362                    b'\n' => {
363                        // Entity definitions can span multiple lines in some IFC files
364                        pos += 1;
365                    }
366                    _ => {
367                        pos += 1;
368                    }
369                }
370            }
371        }
372        None
373    }
374
375    /// Find all entities of a specific type
376    pub fn find_by_type(&mut self, target_type: &str) -> Vec<(u32, usize, usize)> {
377        let mut results = Vec::new();
378
379        while let Some((id, type_name, start, end)) = self.next_entity() {
380            if type_name.eq_ignore_ascii_case(target_type) {
381                results.push((id, start, end));
382            }
383        }
384
385        results
386    }
387
388    /// Count entities by type
389    pub fn count_by_type(&mut self) -> rustc_hash::FxHashMap<String, usize> {
390        let mut counts = rustc_hash::FxHashMap::default();
391
392        while let Some((_, type_name, _, _)) = self.next_entity() {
393            *counts.entry(type_name.to_string()).or_insert(0) += 1;
394        }
395
396        counts
397    }
398
399    /// Reset scanner to beginning
400    pub fn reset(&mut self) {
401        self.position = 0;
402    }
403
404    /// Fast check if attribute at given index is non-null (not '$')
405    /// This is used to filter building elements that don't have representation
406    /// without full entity decode. Index 0 is first attribute after '('.
407    ///
408    /// Returns true if attribute exists and is not '$', false otherwise.
409    #[inline]
410    pub fn has_non_null_attribute(&self, start: usize, end: usize, attr_index: usize) -> bool {
411        let content = &self.bytes[start..end];
412
413        // Find the opening parenthesis
414        let paren_pos = match memchr::memchr(b'(', content) {
415            Some(p) => p + 1,
416            None => return false,
417        };
418
419        let mut pos = paren_pos;
420        let mut current_attr = 0;
421        let mut depth = 0; // Track nested parentheses
422        let mut in_string = false;
423
424        // Helper to check if we're at target attribute and return result
425        let check_target = |pos: usize, current_attr: usize, depth: usize| -> Option<bool> {
426            if current_attr == attr_index && depth == 0 {
427                // Skip whitespace
428                let mut p = pos;
429                while p < content.len() && content[p].is_ascii_whitespace() {
430                    p += 1;
431                }
432                // Check if it's '$' (null)
433                if p < content.len() {
434                    return Some(content[p] != b'$');
435                }
436                return Some(false);
437            }
438            None
439        };
440
441        // Check if target is first attribute (index 0)
442        if let Some(result) = check_target(pos, current_attr, depth) {
443            return result;
444        }
445
446        while pos < content.len() {
447            let b = content[pos];
448
449            if in_string {
450                if b == b'\'' {
451                    // Check for escaped quote ('')
452                    if pos + 1 < content.len() && content[pos + 1] == b'\'' {
453                        pos += 2;
454                        continue;
455                    }
456                    in_string = false;
457                }
458                pos += 1;
459                continue;
460            }
461
462            match b {
463                b'\'' => {
464                    in_string = true;
465                    pos += 1;
466                }
467                b'(' => {
468                    depth += 1;
469                    pos += 1;
470                }
471                b')' => {
472                    if depth == 0 {
473                        // End of entity - attribute not found
474                        return false;
475                    }
476                    depth -= 1;
477                    pos += 1;
478                }
479                b',' if depth == 0 => {
480                    current_attr += 1;
481                    pos += 1;
482                    // Skip whitespace after comma
483                    while pos < content.len() && content[pos].is_ascii_whitespace() {
484                        pos += 1;
485                    }
486                    // Check if we're now at target attribute
487                    if let Some(result) = check_target(pos, current_attr, depth) {
488                        return result;
489                    }
490                }
491                _ => {
492                    pos += 1;
493                }
494            }
495        }
496
497        false
498    }
499}
500
// Unit tests for the token parsers, `parse_entity`, and `EntityScanner`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_entity_ref() {
        assert_eq!(entity_ref("#123"), Ok(("", Token::EntityRef(123))));
        assert_eq!(entity_ref("#0"), Ok(("", Token::EntityRef(0))));
    }

    #[test]
    fn test_string_literal() {
        assert_eq!(string_literal("'hello'"), Ok(("", Token::String("hello"))));
        assert_eq!(
            string_literal("'with spaces'"),
            Ok(("", Token::String("with spaces")))
        );
    }

    #[test]
    fn test_string_literal_escaped_quote() {
        // The doubled quote does not end the literal and is kept verbatim.
        assert_eq!(string_literal("'it''s'"), Ok(("", Token::String("it''s"))));
    }

    #[test]
    fn test_integer() {
        assert_eq!(integer("42"), Ok(("", Token::Integer(42))));
        assert_eq!(integer("-42"), Ok(("", Token::Integer(-42))));
        assert_eq!(integer("0"), Ok(("", Token::Integer(0))));
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_float() {
        assert_eq!(float("3.14"), Ok(("", Token::Float(3.14))));
        assert_eq!(float("-3.14"), Ok(("", Token::Float(-3.14))));
        assert_eq!(float("1.5E-10"), Ok(("", Token::Float(1.5e-10))));
    }

    #[test]
    fn test_enum() {
        assert_eq!(enum_value(".TRUE."), Ok(("", Token::Enum("TRUE"))));
        assert_eq!(enum_value(".FALSE."), Ok(("", Token::Enum("FALSE"))));
        assert_eq!(enum_value(".ELEMENT."), Ok(("", Token::Enum("ELEMENT"))));
    }

    #[test]
    fn test_list() {
        let result = list("(1,2,3)");
        assert!(result.is_ok());
        let (_, token) = result.unwrap();
        match token {
            Token::List(items) => {
                assert_eq!(items.len(), 3);
                assert_eq!(items[0], Token::Integer(1));
                assert_eq!(items[1], Token::Integer(2));
                assert_eq!(items[2], Token::Integer(3));
            }
            _ => panic!("Expected List token"),
        }
    }

    #[test]
    fn test_nested_list() {
        let result = list("(1,(2,3),4)");
        assert!(result.is_ok());
        let (_, token) = result.unwrap();
        match token {
            Token::List(items) => {
                assert_eq!(items.len(), 3);
                assert_eq!(items[0], Token::Integer(1));
                match &items[1] {
                    Token::List(inner) => {
                        assert_eq!(inner.len(), 2);
                        assert_eq!(inner[0], Token::Integer(2));
                        assert_eq!(inner[1], Token::Integer(3));
                    }
                    _ => panic!("Expected nested List"),
                }
                assert_eq!(items[2], Token::Integer(4));
            }
            _ => panic!("Expected List token"),
        }
    }

    #[test]
    fn test_parse_entity() {
        let input = "#123=IFCWALL('guid','owner',$,$,'name',$,$,$);";
        let result = parse_entity(input);
        assert!(result.is_ok());
        let (id, ifc_type, args) = result.unwrap();
        assert_eq!(id, 123);
        assert_eq!(ifc_type, IfcType::IfcWall);
        assert_eq!(args.len(), 8);
    }

    #[test]
    fn test_parse_entity_with_nested_list() {
        // A bare list of "0."-style floats must parse on its own...
        assert!(list("(0.,0.,1.)").is_ok());

        // ...and as the single argument of an entity.
        let input = "#9=IFCDIRECTION((0.,0.,1.));";
        let result = parse_entity(input);
        assert!(result.is_ok(), "Failed to parse: {:?}", result);

        let (id, _ifc_type, args) = result.unwrap();
        assert_eq!(id, 9);
        assert_eq!(args.len(), 1);
        // First arg should be a list containing 3 floats
        if let Token::List(inner) = &args[0] {
            assert_eq!(inner.len(), 3);
        } else {
            panic!("Expected Token::List, got {:?}", args[0]);
        }
    }

    #[test]
    fn test_entity_scanner() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,$,$);
#3=IFCDOOR('guid3',$,$,$,$,$,$,$);
#4=IFCWALL('guid4',$,$,$,$,$,$,$);
"#;

        let mut scanner = EntityScanner::new(content);

        // Test next_entity
        let (id, type_name, _, _) = scanner.next_entity().unwrap();
        assert_eq!(id, 1);
        assert_eq!(type_name, "IFCPROJECT");

        // Test find_by_type
        scanner.reset();
        let walls = scanner.find_by_type("IFCWALL");
        assert_eq!(walls.len(), 2);
        assert_eq!(walls[0].0, 2);
        assert_eq!(walls[1].0, 4);

        // Test count_by_type
        scanner.reset();
        let counts = scanner.count_by_type();
        assert_eq!(counts.get("IFCPROJECT"), Some(&1));
        assert_eq!(counts.get("IFCWALL"), Some(&2));
        assert_eq!(counts.get("IFCDOOR"), Some(&1));
    }

    #[test]
    fn test_entity_scanner_semicolon_in_string() {
        let content = "#1=IFCWALL('a;b',$);#2=IFCDOOR('x');";
        let mut scanner = EntityScanner::new(content);

        // The ';' inside the quoted string must not end the statement.
        let (id, type_name, start, end) = scanner.next_entity().unwrap();
        assert_eq!(id, 1);
        assert_eq!(type_name, "IFCWALL");
        assert_eq!(&content[start..end], "#1=IFCWALL('a;b',$);");

        let (id, type_name, start, end) = scanner.next_entity().unwrap();
        assert_eq!(id, 2);
        assert_eq!(type_name, "IFCDOOR");
        assert_eq!(&content[start..end], "#2=IFCDOOR('x');");

        assert!(scanner.next_entity().is_none());
    }

    #[test]
    fn test_has_non_null_attribute() {
        let content = "#1=IFCWALL('g',$,#5);";
        let scanner = EntityScanner::new(content);
        let end = content.len();

        assert!(scanner.has_non_null_attribute(0, end, 0)); // 'g'
        assert!(!scanner.has_non_null_attribute(0, end, 1)); // $
        assert!(scanner.has_non_null_attribute(0, end, 2)); // #5
        assert!(!scanner.has_non_null_attribute(0, end, 5)); // past the last attribute
    }
}