Skip to main content

geonative_shapefile/
dbf.rs

1//! dBASE III+/IV table reader for `.dbf` attribute sidecars.
2//!
3//! ## Layout (header)
4//!
5//! | Bytes | Field |
6//! | --- | --- |
7//! | 0 | Version flags |
8//! | 1..4 | Last-update YY MM DD |
9//! | 4..8 | Number of records (u32 LE) |
10//! | 8..10 | Header length in bytes (u16 LE) |
11//! | 10..12 | Record length in bytes (u16 LE) |
12//! | 29 | Language Driver ID (codepage hint) |
13//!
14//! Then `(header_length - 32 - 1) / 32` field descriptors of 32 bytes each,
15//! terminated by `0x0D`. Each record starts with a 1-byte deletion flag
16//! (`0x20` active, `0x2A` deleted) and is followed by fixed-width field
17//! values.
18//!
19//! ## v0.1 field type coverage
20//!
21//! - `C` Character — UTF-8/ASCII trimmed, → `Value::String`
22//! - `N` Numeric (ASCII digits) — parsed to `Value::Int64` if no decimals,
23//!   else `Value::Float64`
24//! - `F` Float (ASCII) — `Value::Float64`
25//! - `D` Date (YYYYMMDD) — `Value::DateTime` (days since 1899-12-30)
26//! - `L` Logical (T/F/Y/N/?) — `Value::Bool`, blank or `?` → `Value::Null`
//! - Anything else (`M` memo, `B`, `G`, `OLE`, …) — raw field bytes surfaced
//!   as a trimmed `Value::String`; blank → `Value::Null`
28//!
29//! ## Encoding
30//!
31//! v0.1 assumes ASCII / UTF-8 input. Real `.cpg` / LDID handling is
32//! deferred to v0.2 — until then non-UTF-8 strings come through with
33//! invalid bytes replaced.
34
35use geonative_core::{Crs, FieldDef, GeomField, Schema, Value, ValueType};
36
37use crate::error::{Result, ShpError};
38
/// Parsed `.dbf` header: the counters read from the fixed 32-byte prefix
/// plus the field descriptors that follow it.
#[derive(Debug, Clone)]
pub struct DbfHeader {
    /// Number of records, read from header bytes 4..8 (u32 LE).
    pub n_records: u32,
    /// Header length in bytes, read from header bytes 8..10 (u16 LE).
    /// The first record starts at this offset in the file.
    pub header_len: u16,
    /// Record length in bytes, read from header bytes 10..12 (u16 LE).
    /// Stored as found in the file; it is not cross-checked against the
    /// sum of the field widths.
    pub record_len: u16,
    /// Field descriptors in the order they appear in the header.
    pub fields: Vec<DbfField>,
}
46
/// One 32-byte field descriptor from the `.dbf` header, plus the position
/// of the field's value inside each record.
#[derive(Debug, Clone)]
pub struct DbfField {
    /// Field name: the bytes of the 11-byte name slot up to the first NUL,
    /// decoded as lossy UTF-8.
    pub name: String,
    /// ASCII type character (`b'C'`, `b'N'`, `b'F'`, `b'D'`, `b'L'`, …),
    /// taken from descriptor byte 11.
    pub kind: u8,
    /// Width of the field value in bytes (descriptor byte 16).
    pub length: u8,
    /// Decimal count (descriptor byte 17); for `N` fields, zero selects
    /// integer decoding.
    pub decimals: u8,
    /// Byte offset of this field within a record (after the 1-byte
    /// deletion flag).
    pub offset_in_record: usize,
}
57
58pub fn parse_header(bytes: &[u8]) -> Result<DbfHeader> {
59    if bytes.len() < 32 {
60        return Err(ShpError::malformed("dbf shorter than 32-byte header"));
61    }
62    let n_records = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
63    let header_len = u16::from_le_bytes(bytes[8..10].try_into().unwrap());
64    let record_len = u16::from_le_bytes(bytes[10..12].try_into().unwrap());
65    if (header_len as usize) > bytes.len() {
66        return Err(ShpError::malformed(format!(
67            "dbf header_len {header_len} > file size {}",
68            bytes.len()
69        )));
70    }
71
72    let mut fields = Vec::new();
73    let mut pos = 32usize;
74    let mut field_offset = 1usize; // after the 1-byte deletion flag
75    while pos < header_len as usize && bytes.get(pos) != Some(&0x0D) {
76        if pos + 32 > bytes.len() {
77            return Err(ShpError::malformed("dbf field descriptor truncated"));
78        }
79        let name_bytes = &bytes[pos..pos + 11];
80        let name_end = name_bytes
81            .iter()
82            .position(|&b| b == 0)
83            .unwrap_or(name_bytes.len());
84        let name = String::from_utf8_lossy(&name_bytes[..name_end]).into_owned();
85        let kind = bytes[pos + 11];
86        let length = bytes[pos + 16];
87        let decimals = bytes[pos + 17];
88        fields.push(DbfField {
89            name,
90            kind,
91            length,
92            decimals,
93            offset_in_record: field_offset,
94        });
95        field_offset += length as usize;
96        pos += 32;
97    }
98
99    Ok(DbfHeader {
100        n_records,
101        header_len,
102        record_len,
103        fields,
104    })
105}
106
107/// Map a DBF field to a core `FieldDef`.
108pub fn field_to_def(f: &DbfField) -> FieldDef {
109    let ty = match f.kind {
110        b'C' => ValueType::String,
111        b'N' if f.decimals == 0 => ValueType::Int64,
112        b'N' | b'F' => ValueType::Float64,
113        b'D' => ValueType::DateTime,
114        b'L' => ValueType::Bool,
115        _ => ValueType::String, // memo / binary / unsupported → represent as text
116    };
117    FieldDef::new(f.name.clone(), ty, true).with_width(f.length as u32)
118}
119
120/// Build a `Schema` from the DBF header + a geometry-column descriptor for
121/// the companion `.shp`.
122pub fn build_schema(header: &DbfHeader, geom: GeomField, crs: Crs) -> Schema {
123    let fields = header.fields.iter().map(field_to_def).collect();
124    Schema::new(fields, Some(geom), crs)
125}
126
127/// Decode the value of a single field within a record. `record_bytes`
128/// includes the 1-byte deletion flag at index 0.
129pub fn decode_field(record_bytes: &[u8], field: &DbfField) -> Value {
130    let start = field.offset_in_record;
131    let end = start + field.length as usize;
132    if end > record_bytes.len() {
133        return Value::Null;
134    }
135    let raw = &record_bytes[start..end];
136    let trimmed = trim_ascii(raw);
137
138    match field.kind {
139        b'C' => {
140            if trimmed.is_empty() {
141                Value::Null
142            } else {
143                Value::String(String::from_utf8_lossy(trimmed).into_owned())
144            }
145        }
146        b'N' => {
147            let s = std::str::from_utf8(trimmed).unwrap_or("").trim();
148            if s.is_empty() {
149                return Value::Null;
150            }
151            if field.decimals == 0 {
152                s.parse::<i64>()
153                    .ok()
154                    .map(Value::Int64)
155                    .unwrap_or(Value::Null)
156            } else {
157                s.parse::<f64>()
158                    .ok()
159                    .map(Value::Float64)
160                    .unwrap_or(Value::Null)
161            }
162        }
163        b'F' => {
164            let s = std::str::from_utf8(trimmed).unwrap_or("").trim();
165            if s.is_empty() {
166                return Value::Null;
167            }
168            s.parse::<f64>()
169                .ok()
170                .map(Value::Float64)
171                .unwrap_or(Value::Null)
172        }
173        b'D' => {
174            // 8-byte YYYYMMDD
175            if trimmed.len() != 8 {
176                return Value::Null;
177            }
178            let s = std::str::from_utf8(trimmed).unwrap_or("");
179            if s.chars().all(|c| c == ' ') || s.is_empty() {
180                return Value::Null;
181            }
182            let y: i32 = s[0..4].parse().unwrap_or(0);
183            let m: u32 = s[4..6].parse().unwrap_or(0);
184            let d: u32 = s[6..8].parse().unwrap_or(0);
185            if y == 0 && m == 0 && d == 0 {
186                return Value::Null;
187            }
188            Value::DateTime(ymd_to_gdb_days(y, m, d))
189        }
190        b'L' => match trimmed.first() {
191            Some(b'T') | Some(b't') | Some(b'Y') | Some(b'y') => Value::Bool(true),
192            Some(b'F') | Some(b'f') | Some(b'N') | Some(b'n') => Value::Bool(false),
193            _ => Value::Null,
194        },
195        _ => {
196            // Memo / binary / unsupported: surface as raw text or null.
197            if trimmed.is_empty() {
198                Value::Null
199            } else {
200                Value::String(String::from_utf8_lossy(trimmed).into_owned())
201            }
202        }
203    }
204}
205
/// Strip leading and trailing ASCII space bytes (`0x20`) from `b`, the
/// padding dBASE uses for fixed-width values. Only the space byte is
/// removed; tabs, NULs and other bytes are kept. The result borrows from
/// the input.
fn trim_ascii(b: &[u8]) -> &[u8] {
    let mut rest = b;
    // Peel padding off the front first, then off the back.
    while let [b' ', tail @ ..] = rest {
        rest = tail;
    }
    while let [head @ .., b' '] = rest {
        rest = head;
    }
    rest
}
218
/// Convert a calendar date to "days since 1899-12-30 00:00:00" (the same
/// convention used by `geonative_core::Value::DateTime`).
///
/// The date is interpreted in the proleptic Gregorian calendar: the
/// Gregorian century correction is applied to every year. The computation is
/// the classic Julian Day Number formula carried out entirely in `i64` with
/// flooring division, so it is exact and stays consistent for negative
/// (astronomical) years as well. `month` and `day` are not range-checked;
/// callers are expected to pass a valid date.
fn ymd_to_gdb_days(year: i32, month: u32, day: u32) -> f64 {
    // Julian Day Number of the epoch, 1899-12-30.
    const EPOCH_JDN: i64 = 2_415_019;

    // Treat January and February as months 13 and 14 of the previous year so
    // the leap day falls at the end of the computational year.
    let (y, m) = if month <= 2 {
        (i64::from(year) - 1, i64::from(month) + 12)
    } else {
        (i64::from(year), i64::from(month))
    };

    // Gregorian correction: 2 - floor(y / 100) + floor(y / 400).
    let century = y.div_euclid(100);
    let gregorian_fix = 2 - century + century.div_euclid(4);

    // floor(365.25 * (y + 4716)) and floor(30.6001 * (m + 1)) as exact
    // integer ratios.
    let jdn = (1461 * (y + 4716)).div_euclid(4)
        + (306_001 * (m + 1)).div_euclid(10_000)
        + i64::from(day)
        + gregorian_fix
        - 1524;

    (jdn - EPOCH_JDN) as f64
}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an in-memory dBASE III file.
    ///
    /// `fields` are `(name, type char, length, decimals)` tuples; each entry
    /// of `records` must be a full record including the leading deletion
    /// flag byte. The header length and record length are derived from the
    /// field list, and the file is closed with the `0x1A` end-of-file marker.
    fn make_dbf(fields: &[(&str, u8, u8, u8)], records: &[&[u8]]) -> Vec<u8> {
        let n_records = records.len() as u32;
        let header_len = (32 + fields.len() * 32 + 1) as u16;
        let record_len: u16 = 1 + fields.iter().map(|f| f.2 as u16).sum::<u16>();
        let mut buf = vec![0u8; 32];
        buf[0] = 0x03;
        buf[4..8].copy_from_slice(&n_records.to_le_bytes());
        buf[8..10].copy_from_slice(&header_len.to_le_bytes());
        buf[10..12].copy_from_slice(&record_len.to_le_bytes());
        for (name, kind, length, decimals) in fields {
            let mut desc = [0u8; 32];
            let name_bytes = name.as_bytes();
            desc[..name_bytes.len()].copy_from_slice(name_bytes);
            desc[11] = *kind;
            desc[16] = *length;
            desc[17] = *decimals;
            buf.extend_from_slice(&desc);
        }
        buf.push(0x0D);
        for r in records {
            assert_eq!(r.len(), record_len as usize, "test record length mismatch");
            buf.extend_from_slice(r);
        }
        buf.push(0x1A);
        buf
    }

    /// Borrow record `index` (deletion flag included) from a file image.
    fn record_at<'a>(dbf: &'a [u8], h: &DbfHeader, index: usize) -> &'a [u8] {
        let len = h.record_len as usize;
        let start = h.header_len as usize + index * len;
        &dbf[start..start + len]
    }

    #[test]
    fn parse_simple_header() {
        // ID width 10 + NAME width 8 + 1 deletion flag = 19-byte records.
        let dbf = make_dbf(
            &[("ID", b'N', 10, 0), ("NAME", b'C', 8, 0)],
            &[b" 0000000001Alice   "],
        );
        let h = parse_header(&dbf).unwrap();
        assert_eq!(h.n_records, 1);
        assert_eq!(h.header_len, 32 + 2 * 32 + 1);
        assert_eq!(h.record_len, 19);
        assert_eq!(h.fields.len(), 2);
        assert_eq!(h.fields[0].name, "ID");
        assert_eq!(h.fields[0].kind, b'N');
        assert_eq!(h.fields[0].length, 10);
        assert_eq!(h.fields[0].offset_in_record, 1);
        assert_eq!(h.fields[1].name, "NAME");
        assert_eq!(h.fields[1].kind, b'C');
        assert_eq!(h.fields[1].offset_in_record, 11);
    }

    #[test]
    fn rejects_input_shorter_than_fixed_header() {
        assert!(parse_header(&[0u8; 31]).is_err());
    }

    #[test]
    fn rejects_header_len_beyond_file() {
        let mut dbf = make_dbf(&[("ID", b'N', 5, 0)], &[b"    42"]);
        // Declared header is 65 bytes; keep only the first 40.
        dbf.truncate(40);
        assert!(parse_header(&dbf).is_err());
    }

    #[test]
    fn decode_integer_string_and_bool() {
        // 1 (deletion flag ' ') + 5 (ID " 0042") + 5 (NAME "Alice") + 1 (OK "T") = 12.
        let dbf = make_dbf(
            &[("ID", b'N', 5, 0), ("NAME", b'C', 5, 0), ("OK", b'L', 1, 0)],
            &[b"  0042AliceT"],
        );
        let h = parse_header(&dbf).unwrap();
        let rec = record_at(&dbf, &h, 0);
        assert_eq!(decode_field(rec, &h.fields[0]), Value::Int64(42));
        assert_eq!(
            decode_field(rec, &h.fields[1]),
            Value::String("Alice".into())
        );
        assert_eq!(decode_field(rec, &h.fields[2]), Value::Bool(true));
    }

    #[test]
    fn decode_logical_variants() {
        let dbf = make_dbf(&[("OK", b'L', 1, 0)], &[b" F", b" y", b" ?", b"  "]);
        let h = parse_header(&dbf).unwrap();
        let f = &h.fields[0];
        assert_eq!(decode_field(record_at(&dbf, &h, 0), f), Value::Bool(false));
        assert_eq!(decode_field(record_at(&dbf, &h, 1), f), Value::Bool(true));
        assert_eq!(decode_field(record_at(&dbf, &h, 2), f), Value::Null);
        assert_eq!(decode_field(record_at(&dbf, &h, 3), f), Value::Null);
    }

    #[test]
    fn decode_float_with_decimals() {
        // 1 + 7 = 8 bytes per record.
        let dbf = make_dbf(&[("VAL", b'N', 7, 2)], &[b"   12.34"]);
        let h = parse_header(&dbf).unwrap();
        match decode_field(record_at(&dbf, &h, 0), &h.fields[0]) {
            Value::Float64(f) => assert!((f - 12.34).abs() < 1e-9),
            other => panic!("expected float, got {:?}", other),
        }
    }

    #[test]
    fn decode_date_field() {
        let dbf = make_dbf(&[("D", b'D', 8, 0)], &[b" 20240601"]);
        let h = parse_header(&dbf).unwrap();
        // 2024-06-01 is exactly 45444 days after 1899-12-30.
        assert_eq!(
            decode_field(record_at(&dbf, &h, 0), &h.fields[0]),
            Value::DateTime(45444.0)
        );
    }

    #[test]
    fn blank_and_zero_dates_are_null() {
        let dbf = make_dbf(&[("D", b'D', 8, 0)], &[b"         ", b" 00000000"]);
        let h = parse_header(&dbf).unwrap();
        let f = &h.fields[0];
        assert_eq!(decode_field(record_at(&dbf, &h, 0), f), Value::Null);
        assert_eq!(decode_field(record_at(&dbf, &h, 1), f), Value::Null);
    }

    #[test]
    fn blank_numeric_is_null() {
        // 1 + 5 = 6 bytes per record (all spaces = active flag + blank field).
        let dbf = make_dbf(&[("N", b'N', 5, 0)], &[b"      "]);
        let h = parse_header(&dbf).unwrap();
        assert_eq!(
            decode_field(record_at(&dbf, &h, 0), &h.fields[0]),
            Value::Null
        );
    }

    #[test]
    fn blank_character_is_null() {
        let dbf = make_dbf(&[("S", b'C', 3, 0)], &[b"    "]);
        let h = parse_header(&dbf).unwrap();
        assert_eq!(
            decode_field(record_at(&dbf, &h, 0), &h.fields[0]),
            Value::Null
        );
    }

    #[test]
    fn field_past_record_end_is_null() {
        let dbf = make_dbf(&[("ID", b'N', 5, 0)], &[b"    42"]);
        let h = parse_header(&dbf).unwrap();
        let rec = record_at(&dbf, &h, 0);
        // Hand the decoder a record that is one byte too short.
        assert_eq!(
            decode_field(&rec[..rec.len() - 1], &h.fields[0]),
            Value::Null
        );
    }
}
340}