use memchr::memchr;
use rustc_hash::FxHashMap;
pub type EntityIndex = FxHashMap<u32, (usize, usize)>;
pub struct EntityScanner<'a> {
content: &'a str,
pos: usize,
}
impl<'a> EntityScanner<'a> {
pub fn new(content: &'a str) -> Self {
let pos = content.find("DATA;").map(|p| p + 5).unwrap_or(0);
Self { content, pos }
}
pub fn next_entity(&mut self) -> Option<(u32, &'a str, usize, usize)> {
let bytes = self.content.as_bytes();
while self.pos < bytes.len() {
let hash_pos = memchr(b'#', &bytes[self.pos..])?;
self.pos += hash_pos;
let is_entity_start = self.pos == 0
|| bytes[self.pos - 1] == b'\n'
|| bytes[self.pos - 1] == b'\r'
|| bytes[self.pos - 1] == b';';
if !is_entity_start {
self.pos += 1;
continue;
}
let start = self.pos;
self.pos += 1; let id_start = self.pos;
while self.pos < bytes.len() && bytes[self.pos].is_ascii_digit() {
self.pos += 1;
}
if self.pos == id_start {
continue;
}
let id: u32 = self.content[id_start..self.pos].parse().ok()?;
while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
self.pos += 1;
}
if self.pos >= bytes.len() || bytes[self.pos] != b'=' {
continue;
}
self.pos += 1;
while self.pos < bytes.len() && (bytes[self.pos] == b' ' || bytes[self.pos] == b'\t') {
self.pos += 1;
}
let type_start = self.pos;
while self.pos < bytes.len()
&& (bytes[self.pos].is_ascii_alphanumeric() || bytes[self.pos] == b'_')
{
self.pos += 1;
}
if self.pos == type_start {
continue;
}
let type_name = &self.content[type_start..self.pos];
let end = self.find_entity_end()?;
return Some((id, type_name, start, end));
}
None
}
fn find_entity_end(&mut self) -> Option<usize> {
let bytes = self.content.as_bytes();
let mut in_string = false;
while self.pos < bytes.len() {
match bytes[self.pos] {
b'\'' => {
if in_string && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'\'' {
self.pos += 2;
continue;
}
in_string = !in_string;
}
b';' if !in_string => {
self.pos += 1;
return Some(self.pos);
}
_ => {}
}
self.pos += 1;
}
None
}
pub fn build_index(content: &'a str) -> EntityIndex {
let mut scanner = Self::new(content);
let mut index = FxHashMap::default();
while let Some((id, _, start, end)) = scanner.next_entity() {
index.insert(id, (start, end));
}
index
}
pub fn count_by_type(content: &'a str) -> FxHashMap<String, usize> {
let mut scanner = Self::new(content);
let mut counts: FxHashMap<String, usize> = FxHashMap::default();
while let Some((_, type_name, _, _)) = scanner.next_entity() {
*counts.entry(type_name.to_uppercase()).or_insert(0) += 1;
}
counts
}
pub fn find_by_type(content: &'a str, target_type: &str) -> Vec<(u32, usize, usize)> {
let mut scanner = Self::new(content);
let mut results = Vec::new();
let target_upper = target_type.to_uppercase();
while let Some((id, type_name, start, end)) = scanner.next_entity() {
if type_name.eq_ignore_ascii_case(&target_upper) {
results.push((id, start, end));
}
}
results
}
pub fn entity_count(content: &'a str) -> usize {
let mut scanner = Self::new(content);
let mut count = 0;
while scanner.next_entity().is_some() {
count += 1;
}
count
}
}
pub fn parse_header(content: &str) -> HeaderInfo {
let mut info = HeaderInfo::default();
let header_start = content.find("HEADER;").unwrap_or(0);
let header_end = content.find("ENDSEC;").unwrap_or(content.len());
let header = &content[header_start..header_end];
if let Some(schema_start) = header.find("FILE_SCHEMA") {
if let Some(paren_start) = header[schema_start..].find("((") {
let start = schema_start + paren_start + 2;
if let Some(paren_end) = header[start..].find("))") {
let schema_list = &header[start..start + paren_end];
if let Some(quote_start) = schema_list.find('\'') {
if let Some(quote_end) = schema_list[quote_start + 1..].find('\'') {
info.schema_version =
schema_list[quote_start + 1..quote_start + 1 + quote_end].to_string();
}
}
}
}
}
if let Some(name_start) = header.find("FILE_NAME") {
if let Some(paren_start) = header[name_start..].find('(') {
let start = name_start + paren_start + 1;
if let Some((file_name, rest)) = parse_header_string(&header[start..]) {
info.file_name = Some(file_name);
if let Some(comma) = rest.find(',') {
if let Some((timestamp, rest2)) = parse_header_string(&rest[comma + 1..]) {
info.timestamp = Some(timestamp);
if let Some(comma2) = rest2.find(',') {
if let Some((author, rest3)) = parse_header_list(&rest2[comma2 + 1..]) {
info.author = author.first().cloned();
if let Some(comma3) = rest3.find(',') {
if let Some((org, rest4)) =
parse_header_list(&rest3[comma3 + 1..])
{
info.organization = org.first().cloned();
if let Some(comma4) = rest4.find(',') {
if let Some((preproc, rest5)) =
parse_header_string(&rest4[comma4 + 1..])
{
info.preprocessor_version = Some(preproc);
if let Some(comma5) = rest5.find(',') {
if let Some((orig_sys, _)) =
parse_header_string(&rest5[comma5 + 1..])
{
info.originating_system = Some(orig_sys);
}
}
}
}
}
}
}
}
}
}
}
}
}
info
}
fn parse_header_string(s: &str) -> Option<(String, &str)> {
let s = s.trim_start();
if !s.starts_with('\'') {
if let Some(stripped) = s.strip_prefix('$') {
return Some((String::new(), stripped));
}
return None;
}
let mut end = 1;
let bytes = s.as_bytes();
while end < bytes.len() {
if bytes[end] == b'\'' {
if end + 1 < bytes.len() && bytes[end + 1] == b'\'' {
end += 2;
continue;
}
break;
}
end += 1;
}
let value = s[1..end].replace("''", "'");
Some((value, &s[end + 1..]))
}
fn parse_header_list(s: &str) -> Option<(Vec<String>, &str)> {
let s = s.trim_start();
if !s.starts_with('(') {
return Some((Vec::new(), s));
}
let mut items = Vec::new();
let mut current = &s[1..];
loop {
current = current.trim_start();
if let Some(stripped) = current.strip_prefix(')') {
return Some((items, stripped));
}
if let Some((item, rest)) = parse_header_string(current) {
if !item.is_empty() {
items.push(item);
}
current = rest.trim_start();
if current.starts_with(',') {
current = ¤t[1..];
}
} else {
if let Some(pos) = current.find([',', ')']) {
current = ¤t[pos..];
if current.starts_with(',') {
current = ¤t[1..];
}
} else {
break;
}
}
}
Some((items, current))
}
#[derive(Clone, Debug, Default)]
pub struct HeaderInfo {
pub schema_version: String,
pub file_name: Option<String>,
pub timestamp: Option<String>,
pub author: Option<String>,
pub organization: Option<String>,
pub preprocessor_version: Option<String>,
pub originating_system: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;
const TEST_IFC: &str = r#"ISO-10303-21;
HEADER;
FILE_DESCRIPTION(('ViewDefinition [CoordinationView]'),'2;1');
FILE_NAME('test.ifc','2024-01-01T00:00:00',('Author'),('Org'),'Preprocessor','App','');
FILE_SCHEMA(('IFC2X3'));
ENDSEC;
DATA;
#1=IFCPROJECT('guid',$,'Project',$,$,$,$,$,#2);
#2=IFCUNITASSIGNMENT((#3));
#3=IFCSIUNIT(*,.LENGTHUNIT.,.MILLI.,.METRE.);
#4=IFCWALL('guid',$,'Wall 1',$,$,#5,#6,$);
ENDSEC;
END-ISO-10303-21;
"#;
#[test]
fn test_scanner_finds_entities() {
let mut scanner = EntityScanner::new(TEST_IFC);
let mut entities = Vec::new();
while let Some((id, type_name, _, _)) = scanner.next_entity() {
entities.push((id, type_name.to_string()));
}
assert_eq!(entities.len(), 4);
assert_eq!(entities[0], (1, "IFCPROJECT".to_string()));
assert_eq!(entities[3], (4, "IFCWALL".to_string()));
}
#[test]
fn test_build_index() {
let index = EntityScanner::build_index(TEST_IFC);
assert_eq!(index.len(), 4);
assert!(index.contains_key(&1));
assert!(index.contains_key(&4));
}
#[test]
fn test_count_by_type() {
let counts = EntityScanner::count_by_type(TEST_IFC);
assert_eq!(counts.get("IFCPROJECT"), Some(&1));
assert_eq!(counts.get("IFCWALL"), Some(&1));
}
#[test]
fn test_parse_header() {
let info = parse_header(TEST_IFC);
assert_eq!(info.schema_version, "IFC2X3");
assert_eq!(info.file_name, Some("test.ifc".to_string()));
}
}