Skip to main content

bimifc_parser/
tokenizer.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
5//! STEP file tokenizer using nom combinators
6//!
7//! Parses STEP/IFC entity definitions into tokens.
8
9use bimifc_model::{AttributeValue, DecodedEntity, EntityId, IfcType};
10use nom::{
11    branch::alt,
12    bytes::complete::{take_while, take_while1},
13    character::complete::{char, multispace0},
14    combinator::{opt, recognize},
15    multi::separated_list0,
16    sequence::{delimited, pair},
17    IResult, Parser,
18};
19
/// Raw token from STEP file (before conversion to AttributeValue)
///
/// Text-carrying variants borrow directly from the parsed input (lifetime
/// `'a`), so building tokens does not copy string data. Use
/// `to_attribute_value` to obtain an owned representation.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'a> {
    /// Entity reference (#123); holds the numeric id without the leading `#`
    EntityRef(u32),
    /// String value ('text'); holds the raw text between the outer quotes,
    /// so an escaped quote is still present as `''`
    String(&'a str),
    /// Integer value (numeric literal without fraction or exponent)
    Integer(i64),
    /// Float value (numeric literal containing `.`, `e` or `E`)
    Float(f64),
    /// Enumeration (.VALUE.); holds the name without the surrounding dots
    Enum(&'a str),
    /// List of tokens (parenthesised, comma separated)
    List(Vec<Token<'a>>),
    /// Typed value like IFCLABEL('text'): the type name followed by its
    /// parenthesised arguments
    TypedValue(&'a str, Vec<Token<'a>>),
    /// Null value ($)
    Null,
    /// Derived value (*)
    Derived,
}
42
43impl<'a> Token<'a> {
44    /// Convert token to owned AttributeValue
45    pub fn to_attribute_value(&self) -> AttributeValue {
46        match self {
47            Token::EntityRef(id) => AttributeValue::EntityRef(EntityId(*id)),
48            Token::String(s) => AttributeValue::String((*s).to_string()),
49            Token::Integer(i) => AttributeValue::Integer(*i),
50            Token::Float(f) => AttributeValue::Float(*f),
51            Token::Enum(s) => AttributeValue::Enum((*s).to_string()),
52            Token::List(items) => {
53                AttributeValue::List(items.iter().map(|t| t.to_attribute_value()).collect())
54            }
55            Token::TypedValue(name, args) => AttributeValue::TypedValue(
56                (*name).to_string(),
57                args.iter().map(|t| t.to_attribute_value()).collect(),
58            ),
59            Token::Null => AttributeValue::Null,
60            Token::Derived => AttributeValue::Derived,
61        }
62    }
63}
64
65// ============================================================================
66// Parsing Primitives
67// ============================================================================
68
69/// Parse whitespace (including comments)
70fn ws(input: &str) -> IResult<&str, ()> {
71    let (input, _) = multispace0(input)?;
72    Ok((input, ()))
73}
74
75/// Parse an entity reference (#123)
76fn entity_ref(input: &str) -> IResult<&str, Token<'_>> {
77    let (input, _) = char('#')(input)?;
78    let (input, digits) = take_while1(|c: char| c.is_ascii_digit())(input)?;
79    let id = digits.parse::<u32>().unwrap_or(0);
80    Ok((input, Token::EntityRef(id)))
81}
82
83/// Parse a STEP string ('text' with '' for escaped quotes)
84fn step_string(input: &str) -> IResult<&str, Token<'_>> {
85    let (input, _) = char('\'')(input)?;
86
87    // Find the end of the string, handling escaped quotes ('')
88    let mut end = 0;
89    let bytes = input.as_bytes();
90    while end < bytes.len() {
91        if bytes[end] == b'\'' {
92            // Check for escaped quote
93            if end + 1 < bytes.len() && bytes[end + 1] == b'\'' {
94                end += 2;
95                continue;
96            }
97            break;
98        }
99        end += 1;
100    }
101
102    let content = &input[..end];
103    let remaining = &input[end + 1..]; // Skip closing quote
104
105    Ok((remaining, Token::String(content)))
106}
107
108/// Parse a number (integer or float)
109fn number(input: &str) -> IResult<&str, Token<'_>> {
110    let (input, num_str) = recognize((
111        opt(char('-')),
112        take_while1(|c: char| c.is_ascii_digit()),
113        opt(pair(char('.'), take_while(|c: char| c.is_ascii_digit()))),
114        opt((
115            alt((char('e'), char('E'))),
116            opt(alt((char('+'), char('-')))),
117            take_while1(|c: char| c.is_ascii_digit()),
118        )),
119    ))
120    .parse(input)?;
121
122    // Use lexical-core for fast parsing
123    if num_str.contains('.') || num_str.contains('e') || num_str.contains('E') {
124        let f: f64 = lexical_core::parse(num_str.as_bytes()).unwrap_or(0.0);
125        Ok((input, Token::Float(f)))
126    } else {
127        let i: i64 = lexical_core::parse(num_str.as_bytes()).unwrap_or(0);
128        Ok((input, Token::Integer(i)))
129    }
130}
131
132/// Parse an enumeration (.VALUE.)
133fn enumeration(input: &str) -> IResult<&str, Token<'_>> {
134    let (input, _) = char('.')(input)?;
135    let (input, name) = take_while1(|c: char| c.is_alphanumeric() || c == '_')(input)?;
136    let (input, _) = char('.')(input)?;
137    Ok((input, Token::Enum(name)))
138}
139
140/// Parse null ($)
141fn null_value(input: &str) -> IResult<&str, Token<'_>> {
142    let (input, _) = char('$')(input)?;
143    Ok((input, Token::Null))
144}
145
146/// Parse derived (*)
147fn derived_value(input: &str) -> IResult<&str, Token<'_>> {
148    let (input, _) = char('*')(input)?;
149    Ok((input, Token::Derived))
150}
151
152/// Parse a list of tokens
153fn list(input: &str) -> IResult<&str, Token<'_>> {
154    let (input, items) = delimited(
155        pair(char('('), ws),
156        separated_list0((ws, char(','), ws), token),
157        pair(ws, char(')')),
158    )
159    .parse(input)?;
160    Ok((input, Token::List(items)))
161}
162
163/// Parse a typed value like IFCLABEL('text')
164fn typed_value(input: &str) -> IResult<&str, Token<'_>> {
165    let (input, type_name) = take_while1(|c: char| c.is_alphanumeric() || c == '_')(input)?;
166    let (input, _) = ws(input)?;
167    let (input, args) = delimited(
168        pair(char('('), ws),
169        separated_list0((ws, char(','), ws), token),
170        pair(ws, char(')')),
171    )
172    .parse(input)?;
173    Ok((input, Token::TypedValue(type_name, args)))
174}
175
176/// Parse any token
177fn token(input: &str) -> IResult<&str, Token<'_>> {
178    alt((
179        entity_ref,
180        step_string,
181        null_value,
182        derived_value,
183        enumeration,
184        number,
185        list,
186        typed_value,
187    ))
188    .parse(input)
189}
190
191/// Parse entity attribute list
192fn attribute_list(input: &str) -> IResult<&str, Vec<Token<'_>>> {
193    delimited(
194        pair(char('('), ws),
195        separated_list0((ws, char(','), ws), token),
196        pair(ws, char(')')),
197    )
198    .parse(input)
199}
200
201// ============================================================================
202// Entity Parsing
203// ============================================================================
204
205/// Parse a complete entity definition
206///
207/// Format: `#123=IFCWALL(attr1,attr2,...);`
208pub fn parse_entity(input: &str) -> Result<DecodedEntity, String> {
209    // Skip leading whitespace
210    let input = input.trim_start();
211
212    // Parse entity ID
213    let (input, _) = char::<&str, nom::error::Error<&str>>('#')
214        .parse(input)
215        .map_err(|_| "Expected # at start of entity")?;
216
217    let (input, id_str) =
218        take_while1::<_, &str, nom::error::Error<&str>>(|c: char| c.is_ascii_digit())
219            .parse(input)
220            .map_err(|_| "Expected entity ID")?;
221
222    let id: u32 = id_str.parse().map_err(|_| "Invalid entity ID")?;
223
224    // Skip =
225    let (input, _) = (ws, char('='), ws)
226        .parse(input)
227        .map_err(|_: nom::Err<nom::error::Error<&str>>| "Expected = after entity ID")?;
228
229    // Parse type name
230    let (input, type_name) =
231        take_while1::<_, &str, nom::error::Error<&str>>(|c: char| c.is_alphanumeric() || c == '_')
232            .parse(input)
233            .map_err(|_| "Expected type name")?;
234
235    // Parse attributes
236    let (input, _) = ws(input).unwrap_or((input, ()));
237
238    let (_, tokens) =
239        attribute_list(input).map_err(|e| format!("Failed to parse attributes: {:?}", e))?;
240
241    // Convert tokens to attribute values
242    let attributes: Vec<AttributeValue> = tokens.iter().map(|t| t.to_attribute_value()).collect();
243
244    Ok(DecodedEntity {
245        id: EntityId(id),
246        ifc_type: IfcType::parse(type_name),
247        attributes,
248    })
249}
250
251/// Parse entity from raw bytes at given position
252pub fn parse_entity_at(content: &str, start: usize, end: usize) -> Result<DecodedEntity, String> {
253    let slice = &content[start..end];
254    parse_entity(slice)
255}
256
257// ============================================================================
258// Fast Path Parsers (for coordinate extraction)
259// ============================================================================
260
261/// Fast parse coordinate list from IfcCartesianPointList3D
262/// Returns flattened [x0,y0,z0, x1,y1,z1, ...]
263#[allow(dead_code)]
264pub fn parse_coordinate_list_3d_fast(content: &str) -> Option<Vec<f64>> {
265    // Find the coordinates list - typically attribute 0 after CoordList
266    let start = content.find("((")?;
267    let end = content.rfind("))")?;
268    let list_content = &content[start + 1..end + 1];
269
270    let mut coords = Vec::new();
271    let mut current = list_content;
272
273    while let Some(paren_start) = current.find('(') {
274        let paren_end = current[paren_start..].find(')')? + paren_start;
275        let point_str = &current[paren_start + 1..paren_end];
276
277        // Parse x, y, z
278        for num_str in point_str.split(',') {
279            let num_str = num_str.trim();
280            if !num_str.is_empty() {
281                let val: f64 = lexical_core::parse(num_str.as_bytes()).ok()?;
282                coords.push(val);
283            }
284        }
285
286        current = &current[paren_end + 1..];
287    }
288
289    if coords.is_empty() {
290        None
291    } else {
292        Some(coords)
293    }
294}
295
296/// Fast parse index list from IfcTriangulatedFaceSet
297/// Converts from 1-based IFC indices to 0-based
298#[allow(dead_code)]
299pub fn parse_index_list_fast(content: &str) -> Option<Vec<u32>> {
300    let start = content.find("((")?;
301    let end = content.rfind("))")?;
302    let list_content = &content[start + 1..end + 1];
303
304    let mut indices = Vec::new();
305    let mut current = list_content;
306
307    while let Some(paren_start) = current.find('(') {
308        let paren_end = current[paren_start..].find(')')? + paren_start;
309        let index_str = &current[paren_start + 1..paren_end];
310
311        for num_str in index_str.split(',') {
312            let num_str = num_str.trim();
313            if !num_str.is_empty() {
314                let val: u32 = lexical_core::parse(num_str.as_bytes()).ok()?;
315                // Convert from 1-based to 0-based
316                indices.push(val.saturating_sub(1));
317            }
318        }
319
320        current = &current[paren_end + 1..];
321    }
322
323    if indices.is_empty() {
324        None
325    } else {
326        Some(indices)
327    }
328}
329
#[cfg(test)]
mod tests {
    use super::*;

    /// Extract the payload of a float token, failing the test otherwise.
    fn expect_float(token: Token<'_>) -> f64 {
        match token {
            Token::Float(value) => value,
            _ => panic!("Expected float"),
        }
    }

    #[test]
    fn test_parse_entity_ref() {
        assert_eq!(entity_ref("#123"), Ok(("", Token::EntityRef(123))));
    }

    #[test]
    fn test_parse_string() {
        assert_eq!(
            step_string("'hello world'"),
            Ok(("", Token::String("hello world")))
        );
    }

    #[test]
    fn test_parse_string_with_escaped_quote() {
        // The doubled quote is kept verbatim in the token.
        assert_eq!(
            step_string("'it''s a test'"),
            Ok(("", Token::String("it''s a test")))
        );
    }

    #[test]
    fn test_parse_number_integer() {
        assert_eq!(number("42"), Ok(("", Token::Integer(42))));
    }

    #[test]
    fn test_parse_number_float() {
        let (remaining, token) = number("3.14159").unwrap();
        assert!(remaining.is_empty());
        let value = expect_float(token);
        assert!((value - std::f64::consts::PI).abs() < 1e-5);
    }

    #[test]
    fn test_parse_number_scientific() {
        let (remaining, token) = number("1.5E-3").unwrap();
        assert!(remaining.is_empty());
        let value = expect_float(token);
        assert!((value - 0.0015).abs() < 1e-10);
    }

    #[test]
    fn test_parse_enum() {
        assert_eq!(enumeration(".TRUE."), Ok(("", Token::Enum("TRUE"))));
    }

    #[test]
    fn test_parse_list() {
        let (remaining, token) = list("(1, 2, 3)").unwrap();
        assert!(remaining.is_empty());
        match token {
            Token::List(items) => assert_eq!(items.len(), 3),
            _ => panic!("Expected list"),
        }
    }

    #[test]
    fn test_parse_entity() {
        let wall = parse_entity("#1=IFCWALL('abc',$,#2);").unwrap();
        assert_eq!(wall.id, EntityId(1));
        assert_eq!(wall.ifc_type, IfcType::IfcWall);
        assert_eq!(wall.attributes.len(), 3);
    }
}
409}