Skip to main content

ifc_lite_core/
parser.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5//! STEP/IFC Parser using nom
6//!
7//! Zero-copy tokenization and fast entity scanning.
8
9use nom::{
10    branch::alt,
11    bytes::complete::{take_while, take_while1},
12    character::complete::{char, digit1, one_of},
13    combinator::{map, map_res, opt, recognize},
14    multi::separated_list0,
15    sequence::{delimited, pair, preceded, tuple},
16    IResult,
17};
18
19use crate::error::{Error, Result};
20use crate::generated::IfcType;
21
/// STEP/IFC Token
///
/// One parsed attribute value from a STEP entity line. Variants that carry
/// `&'a str` borrow directly from the input text, so no string data is copied.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    /// Entity reference: #123 (the numeric part after `#`)
    EntityRef(u32),
    /// String literal: 'text'
    ///
    /// Holds the raw slice between the quotes; a doubled quote (`''`) used as
    /// an escape is left as-is in the slice rather than being decoded.
    String(&'a str),
    /// Integer: 42
    Integer(i64),
    /// Float: 3.14
    Float(f64),
    /// Enum: .TRUE., .FALSE., .UNKNOWN. (the name without the surrounding dots)
    Enum(&'a str),
    /// List: (1, 2, 3); elements may themselves be lists
    List(Vec<Token<'a>>),
    /// Typed value: IFCPARAMETERVALUE(0.), IFCBOOLEAN(.T.)
    ///
    /// First field is the type name, second the parsed arguments.
    TypedValue(&'a str, Vec<Token<'a>>),
    /// Null value: $
    Null,
    /// Asterisk (derived value): *
    Derived,
}
44
45/// Parse entity reference: #123
46fn entity_ref(input: &str) -> IResult<&str, Token<'_>> {
47    map(
48        preceded(char('#'), map_res(digit1, |s: &str| s.parse::<u32>())),
49        Token::EntityRef,
50    )(input)
51}
52
53/// Parse string literal: 'text' or "text"
54/// IFC uses '' to escape a single quote within a string
55/// Uses memchr for SIMD-accelerated quote searching
56fn string_literal(input: &str) -> IResult<&str, Token<'_>> {
57    // Helper to parse string content with escaped quotes - SIMD optimized
58    #[inline]
59    fn parse_string_content(input: &str, quote_byte: u8) -> IResult<&str, &str> {
60        let bytes = input.as_bytes();
61        let mut pos = 0;
62
63        // Use memchr for SIMD-accelerated searching
64        while let Some(found) = memchr::memchr(quote_byte, &bytes[pos..]) {
65            let idx = pos + found;
66            // Check if it's an escaped quote (doubled)
67            if idx + 1 < bytes.len() && bytes[idx + 1] == quote_byte {
68                pos = idx + 2; // Skip escaped quote pair
69                continue;
70            }
71            // End of string found
72            return Ok((&input[idx..], &input[..idx]));
73        }
74
75        // No closing quote found
76        Err(nom::Err::Error(nom::error::Error::new(
77            input,
78            nom::error::ErrorKind::Char,
79        )))
80    }
81
82    alt((
83        map(
84            delimited(char('\''), |i| parse_string_content(i, b'\''), char('\'')),
85            Token::String,
86        ),
87        map(
88            delimited(char('"'), |i| parse_string_content(i, b'"'), char('"')),
89            Token::String,
90        ),
91    ))(input)
92}
93
94/// Parse integer: 42, -42
95/// Uses lexical-core for 10x faster parsing
96#[inline]
97fn integer(input: &str) -> IResult<&str, Token<'_>> {
98    map_res(recognize(tuple((opt(char('-')), digit1))), |s: &str| {
99        lexical_core::parse::<i64>(s.as_bytes())
100            .map(Token::Integer)
101            .map_err(|_| "parse error")
102    })(input)
103}
104
105/// Parse float: 3.14, -3.14, 1.5E-10, 0., 1.
106/// IFC allows floats like "0." without decimal digits
107/// Uses lexical-core for 10x faster parsing
108#[inline]
109fn float(input: &str) -> IResult<&str, Token<'_>> {
110    map_res(
111        recognize(tuple((
112            opt(char('-')),
113            digit1,
114            char('.'),
115            opt(digit1), // Made optional to support "0." format
116            opt(tuple((one_of("eE"), opt(one_of("+-")), digit1))),
117        ))),
118        |s: &str| {
119            lexical_core::parse::<f64>(s.as_bytes())
120                .map(Token::Float)
121                .map_err(|_| "parse error")
122        },
123    )(input)
124}
125
126/// Parse enum: .TRUE., .FALSE., .UNKNOWN., .ELEMENT.
127fn enum_value(input: &str) -> IResult<&str, Token<'_>> {
128    map(
129        delimited(
130            char('.'),
131            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
132            char('.'),
133        ),
134        Token::Enum,
135    )(input)
136}
137
138/// Parse null: $
139fn null(input: &str) -> IResult<&str, Token<'_>> {
140    map(char('$'), |_| Token::Null)(input)
141}
142
143/// Parse derived: *
144fn derived(input: &str) -> IResult<&str, Token<'_>> {
145    map(char('*'), |_| Token::Derived)(input)
146}
147
148/// Parse typed value: IFCPARAMETERVALUE(0.), IFCBOOLEAN(.T.)
149fn typed_value(input: &str) -> IResult<&str, Token<'_>> {
150    map(
151        pair(
152            // Type name (all caps with optional numbers/underscores)
153            take_while1(|c: char| c.is_alphanumeric() || c == '_'),
154            // Arguments
155            delimited(
156                char('('),
157                separated_list0(delimited(ws, char(','), ws), token),
158                char(')'),
159            ),
160        ),
161        |(type_name, args)| Token::TypedValue(type_name, args),
162    )(input)
163}
164
165/// Skip whitespace
166fn ws(input: &str) -> IResult<&str, ()> {
167    map(take_while(|c: char| c.is_whitespace()), |_| ())(input)
168}
169
170/// Parse a token with optional surrounding whitespace
171/// Optimized ordering: test cheapest patterns first (single-char markers)
172fn token(input: &str) -> IResult<&str, Token<'_>> {
173    delimited(
174        ws,
175        alt((
176            // Single-char markers first (O(1) check)
177            null,       // $
178            derived,    // *
179            entity_ref, // # + digits
180            // Then by complexity
181            enum_value,     // .XXX.
182            string_literal, // 'xxx'
183            list,           // (...)
184            // Numbers: float before integer since float includes '.'
185            float,
186            integer,
187            typed_value, // IFCPARAMETERVALUE(0.) - most expensive, last
188        )),
189        ws,
190    )(input)
191}
192
193/// Parse list: (1, 2, 3) or nested lists
194fn list(input: &str) -> IResult<&str, Token<'_>> {
195    map(
196        delimited(
197            char('('),
198            separated_list0(delimited(ws, char(','), ws), token),
199            char(')'),
200        ),
201        Token::List,
202    )(input)
203}
204
205/// Parse a complete entity line
206/// Example: #123=IFCWALL('guid','owner',$,$,'name',$,$,$);
207pub fn parse_entity(input: &str) -> Result<(u32, IfcType, Vec<Token<'_>>)> {
208    let result: IResult<&str, (u32, &str, Vec<Token>)> = tuple((
209        // Entity ID: #123
210        delimited(
211            ws,
212            preceded(char('#'), map_res(digit1, |s: &str| s.parse::<u32>())),
213            ws,
214        ),
215        // Equals sign
216        preceded(
217            char('='),
218            // Entity type: IFCWALL
219            delimited(
220                ws,
221                take_while1(|c: char| c.is_alphanumeric() || c == '_'),
222                ws,
223            ),
224        ),
225        // Arguments: ('guid', 'owner', ...)
226        delimited(
227            char('('),
228            separated_list0(delimited(ws, char(','), ws), token),
229            tuple((char(')'), ws, char(';'))),
230        ),
231    ))(input);
232
233    match result {
234        Ok((_, (id, type_str, args))) => {
235            let ifc_type = IfcType::from_str(type_str);
236            Ok((id, ifc_type, args))
237        }
238        Err(e) => Err(Error::parse(0, format!("Failed to parse entity: {}", e))),
239    }
240}
241
/// Fast entity scanner - scans file without full parsing
///
/// Walks the input once from front to back, locating entity definitions
/// (`#id=TYPE(...);`) and reporting their ID, type name and byte span without
/// tokenizing the attribute list.
/// Uses memchr for SIMD-accelerated byte searching
pub struct EntityScanner<'a> {
    /// The original input text. Kept alongside `bytes`; not read directly,
    /// hence the `dead_code` allowance.
    #[allow(dead_code)]
    content: &'a str,
    /// Byte view of the input that all scanning operates on.
    bytes: &'a [u8],
    /// Byte offset at which the next scan resumes.
    position: usize,
}
251
252impl<'a> EntityScanner<'a> {
253    /// Create a new scanner
254    pub fn new(content: &'a str) -> Self {
255        Self {
256            content,
257            bytes: content.as_bytes(),
258            position: 0,
259        }
260    }
261
262    /// Scan for the next entity
263    /// Returns (entity_id, type_name, line_start, line_end)
264    #[inline]
265    pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
266        let remaining = &self.bytes[self.position..];
267
268        // Find next '#' that starts an entity using SIMD-accelerated search
269        let start_offset = memchr::memchr(b'#', remaining)?;
270        let line_start = self.position + start_offset;
271
272        // Find the end of the entity (semicolon) while respecting quoted strings
273        // IFC strings use single quotes and can contain semicolons
274        let line_content = &self.bytes[line_start..];
275        let end_offset = self.find_entity_end(line_content)?;
276        let line_end = line_start + end_offset + 1;
277
278        // Parse entity ID (inline for speed)
279        let id_start = line_start + 1;
280        let mut id_end = id_start;
281        while id_end < line_end && self.bytes[id_end].is_ascii_digit() {
282            id_end += 1;
283        }
284
285        // Fast integer parsing without allocation
286        let id = self.parse_u32_fast(id_start, id_end)?;
287
288        // Find '=' after ID using SIMD
289        let eq_search = &self.bytes[id_end..line_end];
290        let eq_offset = memchr::memchr(b'=', eq_search)?;
291        let mut type_start = id_end + eq_offset + 1;
292
293        // Skip whitespace (inline)
294        while type_start < line_end && self.bytes[type_start].is_ascii_whitespace() {
295            type_start += 1;
296        }
297
298        // Find end of type name (at '(' or whitespace)
299        let mut type_end = type_start;
300        while type_end < line_end {
301            let b = self.bytes[type_end];
302            if b == b'(' || b.is_ascii_whitespace() {
303                break;
304            }
305            type_end += 1;
306        }
307
308        // Use safe UTF-8 conversion - malformed input should not cause UB
309        let type_name = std::str::from_utf8(&self.bytes[type_start..type_end])
310            .unwrap_or("UNKNOWN");
311
312        // Move position past this entity
313        self.position = line_end;
314
315        Some((id, type_name, line_start, line_end))
316    }
317
318    /// Fast u32 parsing without string allocation
319    #[inline]
320    fn parse_u32_fast(&self, start: usize, end: usize) -> Option<u32> {
321        let mut result: u32 = 0;
322        for i in start..end {
323            let digit = self.bytes[i].wrapping_sub(b'0');
324            if digit > 9 {
325                return None;
326            }
327            result = result.wrapping_mul(10).wrapping_add(digit as u32);
328        }
329        Some(result)
330    }
331
332    /// Find the terminating semicolon of an entity, skipping over quoted strings.
333    /// IFC strings are enclosed in single quotes ('...') and can contain semicolons.
334    /// Returns the offset of the semicolon from the start of the slice.
335    #[inline]
336    fn find_entity_end(&self, content: &[u8]) -> Option<usize> {
337        let mut pos = 0;
338        let len = content.len();
339        let mut in_string = false;
340
341        while pos < len {
342            let b = content[pos];
343            
344            if in_string {
345                if b == b'\'' {
346                    // Check for escaped quote ('') - if next char is also quote, skip both
347                    if pos + 1 < len && content[pos + 1] == b'\'' {
348                        pos += 2; // Skip escaped quote
349                        continue;
350                    }
351                    in_string = false;
352                }
353                pos += 1;
354            } else {
355                match b {
356                    b'\'' => {
357                        in_string = true;
358                        pos += 1;
359                    }
360                    b';' => {
361                        return Some(pos);
362                    }
363                    b'\n' => {
364                        // Entity definitions can span multiple lines in some IFC files
365                        pos += 1;
366                    }
367                    _ => {
368                        pos += 1;
369                    }
370                }
371            }
372        }
373        None
374    }
375
376    /// Find all entities of a specific type
377    pub fn find_by_type(&mut self, target_type: &str) -> Vec<(u32, usize, usize)> {
378        let mut results = Vec::new();
379
380        while let Some((id, type_name, start, end)) = self.next_entity() {
381            if type_name.eq_ignore_ascii_case(target_type) {
382                results.push((id, start, end));
383            }
384        }
385
386        results
387    }
388
389    /// Count entities by type
390    pub fn count_by_type(&mut self) -> rustc_hash::FxHashMap<String, usize> {
391        let mut counts = rustc_hash::FxHashMap::default();
392
393        while let Some((_, type_name, _, _)) = self.next_entity() {
394            *counts.entry(type_name.to_string()).or_insert(0) += 1;
395        }
396
397        counts
398    }
399
400    /// Reset scanner to beginning
401    pub fn reset(&mut self) {
402        self.position = 0;
403    }
404
405    /// Fast check if attribute at given index is non-null (not '$')
406    /// This is used to filter building elements that don't have representation
407    /// without full entity decode. Index 0 is first attribute after '('.
408    ///
409    /// Returns true if attribute exists and is not '$', false otherwise.
410    #[inline]
411    pub fn has_non_null_attribute(&self, start: usize, end: usize, attr_index: usize) -> bool {
412        let content = &self.bytes[start..end];
413
414        // Find the opening parenthesis
415        let paren_pos = match memchr::memchr(b'(', content) {
416            Some(p) => p + 1,
417            None => return false,
418        };
419
420        let mut pos = paren_pos;
421        let mut current_attr = 0;
422        let mut depth = 0; // Track nested parentheses
423        let mut in_string = false;
424
425        // Helper to check if we're at target attribute and return result
426        let check_target = |pos: usize, current_attr: usize, depth: usize| -> Option<bool> {
427            if current_attr == attr_index && depth == 0 {
428                // Skip whitespace
429                let mut p = pos;
430                while p < content.len() && content[p].is_ascii_whitespace() {
431                    p += 1;
432                }
433                // Check if it's '$' (null)
434                if p < content.len() {
435                    return Some(content[p] != b'$');
436                }
437                return Some(false);
438            }
439            None
440        };
441
442        // Check if target is first attribute (index 0)
443        if let Some(result) = check_target(pos, current_attr, depth) {
444            return result;
445        }
446
447        while pos < content.len() {
448            let b = content[pos];
449
450            if in_string {
451                if b == b'\'' {
452                    // Check for escaped quote ('')
453                    if pos + 1 < content.len() && content[pos + 1] == b'\'' {
454                        pos += 2;
455                        continue;
456                    }
457                    in_string = false;
458                }
459                pos += 1;
460                continue;
461            }
462
463            match b {
464                b'\'' => {
465                    in_string = true;
466                    pos += 1;
467                }
468                b'(' => {
469                    depth += 1;
470                    pos += 1;
471                }
472                b')' => {
473                    if depth == 0 {
474                        // End of entity - attribute not found
475                        return false;
476                    }
477                    depth -= 1;
478                    pos += 1;
479                }
480                b',' if depth == 0 => {
481                    current_attr += 1;
482                    pos += 1;
483                    // Skip whitespace after comma
484                    while pos < content.len() && content[pos].is_ascii_whitespace() {
485                        pos += 1;
486                    }
487                    // Check if we're now at target attribute
488                    if let Some(result) = check_target(pos, current_attr, depth) {
489                        return result;
490                    }
491                }
492                _ => {
493                    pos += 1;
494                }
495            }
496        }
497
498        false
499    }
500}
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Unwraps a `Token::List`, panicking with `msg` for any other variant.
    fn expect_list<'a>(token: Token<'a>, msg: &str) -> Vec<Token<'a>> {
        match token {
            Token::List(items) => items,
            _ => panic!("{}", msg),
        }
    }

    #[test]
    fn test_entity_ref() {
        for &(text, id) in &[("#123", 123u32), ("#0", 0)] {
            assert_eq!(entity_ref(text), Ok(("", Token::EntityRef(id))));
        }
    }

    #[test]
    fn test_string_literal() {
        for &(text, body) in &[("'hello'", "hello"), ("'with spaces'", "with spaces")] {
            assert_eq!(string_literal(text), Ok(("", Token::String(body))));
        }
    }

    #[test]
    fn test_integer() {
        for &(text, value) in &[("42", 42i64), ("-42", -42), ("0", 0)] {
            assert_eq!(integer(text), Ok(("", Token::Integer(value))));
        }
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_float() {
        for &(text, value) in &[("3.14", 3.14f64), ("-3.14", -3.14), ("1.5E-10", 1.5e-10)] {
            assert_eq!(float(text), Ok(("", Token::Float(value))));
        }
    }

    #[test]
    fn test_enum() {
        for &(text, name) in &[
            (".TRUE.", "TRUE"),
            (".FALSE.", "FALSE"),
            (".ELEMENT.", "ELEMENT"),
        ] {
            assert_eq!(enum_value(text), Ok(("", Token::Enum(name))));
        }
    }

    #[test]
    fn test_list() {
        let parsed = list("(1,2,3)");
        assert!(parsed.is_ok());
        let (_, token) = parsed.unwrap();

        let items = expect_list(token, "Expected List token");
        assert_eq!(items.len(), 3);
        for (index, expected) in (1..=3).enumerate() {
            assert_eq!(items[index], Token::Integer(expected));
        }
    }

    #[test]
    fn test_nested_list() {
        let parsed = list("(1,(2,3),4)");
        assert!(parsed.is_ok());
        let (_, token) = parsed.unwrap();

        let items = expect_list(token, "Expected List token");
        assert_eq!(items.len(), 3);
        assert_eq!(items[0], Token::Integer(1));

        let inner = match &items[1] {
            Token::List(inner) => inner,
            _ => panic!("Expected nested List"),
        };
        assert_eq!(inner.len(), 2);
        assert_eq!(inner[0], Token::Integer(2));
        assert_eq!(inner[1], Token::Integer(3));

        assert_eq!(items[2], Token::Integer(4));
    }

    #[test]
    fn test_parse_entity() {
        let parsed = parse_entity("#123=IFCWALL('guid','owner',$,$,'name',$,$,$);");
        assert!(parsed.is_ok());

        let (id, ifc_type, args) = parsed.unwrap();
        assert_eq!(id, 123);
        assert_eq!(ifc_type, IfcType::IfcWall);
        assert_eq!(args.len(), 8);
    }

    #[test]
    fn test_parse_entity_with_nested_list() {
        // Step 1: a bare list of floats parses on its own
        let simple = "(0.,0.,1.)";
        println!("Testing simple list: {}", simple);
        let simple_result = list(simple);
        println!("Simple list result: {:?}", simple_result);

        // Step 2: the same list as the single argument of an entity
        let input = "#9=IFCDIRECTION((0.,0.,1.));";
        println!("\nTesting full entity: {}", input);
        let result = parse_entity(input);

        // On failure, print diagnostics for the argument part alone
        if let Err(ref e) = result {
            println!("Parse error: {:?}", e);

            println!("\nTrying to parse just arguments: ((0.,0.,1.))");
            let args_input = "((0.,0.,1.))";
            let args_result = list(args_input);
            println!("Args list result: {:?}", args_result);
        }

        assert!(result.is_ok(), "Failed to parse: {:?}", result);
        let (id, _ifc_type, args) = result.unwrap();
        assert_eq!(id, 9);
        assert_eq!(args.len(), 1);

        // The only argument must be a list holding the three floats
        match &args[0] {
            Token::List(inner) => assert_eq!(inner.len(), 3),
            other => panic!("Expected Token::List, got {:?}", other),
        }
    }

    #[test]
    fn test_entity_scanner() {
        let content = r#"
#1=IFCPROJECT('guid',$,$,$,$,$,$,$,$);
#2=IFCWALL('guid2',$,$,$,$,$,$,$);
#3=IFCDOOR('guid3',$,$,$,$,$,$,$);
#4=IFCWALL('guid4',$,$,$,$,$,$,$);
"#;

        let mut scanner = EntityScanner::new(content);

        // next_entity yields the first definition
        let (id, type_name, _, _) = scanner.next_entity().unwrap();
        assert_eq!(id, 1);
        assert_eq!(type_name, "IFCPROJECT");

        // find_by_type returns every wall, in file order
        scanner.reset();
        let walls = scanner.find_by_type("IFCWALL");
        assert_eq!(walls.len(), 2);
        assert_eq!(walls[0].0, 2);
        assert_eq!(walls[1].0, 4);

        // count_by_type tallies each type name
        scanner.reset();
        let counts = scanner.count_by_type();
        for &(type_name, expected) in &[("IFCPROJECT", 1usize), ("IFCWALL", 2), ("IFCDOOR", 1)] {
            assert_eq!(counts.get(type_name), Some(&expected));
        }
    }
}