Skip to main content

xsd_schema/parser/
location.rs

1//! Location tracking for XSD parsing
2//!
3//! This module provides accurate source location tracking using byte offsets from quick-xml.
4//! It supports three retention modes to balance memory usage vs. error reporting fidelity:
5//!
6//! - `Retain`: Keep full source text (default, ~200KB per 200KB schema)
7//! - `DropText`: Keep only line starts (~10KB per 200KB schema)
8//! - `DropAll`: No location info (minimal memory)
9//!
10//! Per XSD_PARSER_DESIGN.md:
11//! - Use quick-xml's `buffer_position()` for byte offsets
12//! - Build line_starts table once per document
13//! - Handle CR, LF, and CRLF line endings correctly
14//! - Column calculation counts UTF-8 characters, not bytes
15
16use crate::ids::DocumentId;
17use std::fmt;
18
/// Half-open byte range `[start, end)` within a document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourceSpan {
    /// Byte offset where the span begins.
    pub start: usize,
    /// Byte offset just past the last byte of the span.
    pub end: usize,
}

impl SourceSpan {
    /// Builds a span from two byte offsets; the offsets are stored as given,
    /// without any validation of their ordering.
    pub fn new(start: usize, end: usize) -> Self {
        SourceSpan { start, end }
    }

    /// Number of bytes covered by the span.
    ///
    /// An inverted span (`end < start`) is reported as zero bytes long.
    pub fn len(&self) -> usize {
        if self.is_empty() {
            0
        } else {
            self.end - self.start
        }
    }

    /// Returns `true` when the span covers no bytes, i.e. `end <= start`.
    pub fn is_empty(&self) -> bool {
        self.end <= self.start
    }
}
39
40/// Reference to a location within a schema document
41#[derive(Debug, Clone)]
42pub struct SourceRef {
43    pub doc_id: DocumentId,
44    pub span: SourceSpan,
45    /// When set, overrides `doc_id` for schema-document-level defaults
46    /// lookup (elementFormDefault, attributeFormDefault, blockDefault,
47    /// finalDefault, defaultAttributes). Used for `xs:override` children
48    /// that are conceptually placed in the overridden document D2 per
49    /// §4.2.5 / F.2 transformation semantics.
50    pub schema_defaults_doc: Option<DocumentId>,
51}
52
53impl SourceRef {
54    pub fn new(doc_id: DocumentId, span: SourceSpan) -> Self {
55        Self {
56            doc_id,
57            span,
58            schema_defaults_doc: None,
59        }
60    }
61
62    /// The document ID to use for schema-level defaults lookup.
63    ///
64    /// Returns `schema_defaults_doc` if set (override components),
65    /// otherwise falls back to `doc_id`.
66    pub fn defaults_doc(&self) -> DocumentId {
67        self.schema_defaults_doc.unwrap_or(self.doc_id)
68    }
69}
70
/// Human-readable position used in error reports; `line` and `column` are 1-based.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SourceLocation {
    /// Base URI of the document the position belongs to.
    pub base_uri: String,
    /// Line number, starting at 1.
    pub line: usize,
    /// Column number, starting at 1.
    pub column: usize,
}

impl fmt::Display for SourceLocation {
    /// Renders the location as `base_uri:line:column`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            base_uri,
            line,
            column,
        } = self;
        write!(f, "{}:{}:{}", base_uri, line, column)
    }
}
84
/// Policy for how much source information survives after parsing.
///
/// Trades memory for error-reporting fidelity:
/// - `Retain`: the full source text stays available (~200KB per 200KB schema)
/// - `DropText`: only the line-start table is kept (~10KB per 200KB schema,
///   about 90% savings)
/// - `DropAll`: nothing is kept, so no location info (minimal memory)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SourceRetention {
    /// Keep the source text, enabling rich errors and XmlFragment access.
    /// This is the `Default` variant.
    #[default]
    Retain,
    /// Discard the source text once parsing is done; line_starts remain.
    DropText,
    /// Discard the whole SourceMap once parsing is done; no location info.
    DropAll,
}
101
102/// Per-document source mapping for line/column resolution
103///
104/// Owns the source text buffer and line start index.
105/// Use `build_line_starts()` to construct the line index once.
106#[derive(Debug, Clone)]
107pub struct SourceMap {
108    pub base_uri: String,
109    pub text: String,
110    pub line_starts: Vec<usize>,
111}
112
113impl SourceMap {
114    /// Create a new source map from source text
115    pub fn new(base_uri: String, text: String) -> Self {
116        let line_starts = build_line_starts(text.as_bytes());
117        Self {
118            base_uri,
119            text,
120            line_starts,
121        }
122    }
123
124    /// Convert byte offset to line/column location
125    ///
126    /// Returns 1-based line and column numbers.
127    /// Column is counted in UTF-8 characters, not bytes.
128    pub fn locate(&self, offset: usize) -> SourceLocation {
129        let (line, line_start) = self.find_line(offset);
130        let column = self.count_utf8_chars(line_start, offset) + 1;
131
132        SourceLocation {
133            base_uri: self.base_uri.clone(),
134            line,
135            column,
136        }
137    }
138
139    /// Find line number (1-based) and line start offset for byte offset
140    fn find_line(&self, offset: usize) -> (usize, usize) {
141        // Binary search for the line containing this offset
142        match self.line_starts.binary_search(&offset) {
143            Ok(idx) => (idx + 1, offset), // Exact match at line start
144            Err(idx) => {
145                if idx == 0 {
146                    (1, 0)
147                } else {
148                    (idx, self.line_starts[idx - 1])
149                }
150            }
151        }
152    }
153
154    /// Count UTF-8 characters between start and end byte offsets
155    fn count_utf8_chars(&self, start: usize, end: usize) -> usize {
156        let end = end.min(self.text.len());
157        let start = start.min(end);
158        self.text[start..end].chars().count()
159    }
160
161    /// Get source text for a span (returns None if span invalid)
162    pub fn get_text(&self, span: &SourceSpan) -> Option<&str> {
163        if span.end <= self.text.len() && span.start <= span.end {
164            Some(&self.text[span.start..span.end])
165        } else {
166            None
167        }
168    }
169
170    /// Convert to compact source map (drops text, keeps line_starts)
171    pub fn into_compact(self) -> CompactSourceMap {
172        CompactSourceMap {
173            base_uri: self.base_uri,
174            line_starts: self.line_starts,
175            text_len: self.text.len(),
176        }
177    }
178}
179
180/// Compact source map when text is dropped (DropText mode)
181///
182/// Retains line mapping but not source text.
183/// Provides line/column location but cannot extract source text spans.
184#[derive(Debug, Clone)]
185pub struct CompactSourceMap {
186    pub base_uri: String,
187    pub line_starts: Vec<usize>,
188    pub text_len: usize, // for bounds checking
189}
190
191impl CompactSourceMap {
192    /// Convert byte offset to line/column location
193    ///
194    /// Column calculation is approximate (byte offset from line start)
195    /// since we don't have the original text to count UTF-8 characters.
196    pub fn locate(&self, offset: usize) -> SourceLocation {
197        let (line, line_start) = self.find_line(offset);
198        let column = offset.saturating_sub(line_start) + 1;
199
200        SourceLocation {
201            base_uri: self.base_uri.clone(),
202            line,
203            column,
204        }
205    }
206
207    fn find_line(&self, offset: usize) -> (usize, usize) {
208        match self.line_starts.binary_search(&offset) {
209            Ok(idx) => (idx + 1, offset),
210            Err(idx) => {
211                if idx == 0 {
212                    (1, 0)
213                } else {
214                    (idx, self.line_starts[idx - 1])
215                }
216            }
217        }
218    }
219}
220
221/// Centralized source map storage with configurable retention
222///
223/// Owned by SchemaSet to manage source buffers for all documents.
224/// See XSD.md (Source Buffer Storage section) for memory management.
225#[derive(Debug, Default)]
226pub enum SourceMapStorage {
227    /// Full source text retained (default)
228    Full(Vec<SourceMap>),
229    /// Text dropped; only line mapping kept
230    Compact(Vec<CompactSourceMap>),
231    /// No source info retained
232    #[default]
233    None,
234}
235
236impl SourceMapStorage {
237    /// Create new storage in Full mode
238    pub fn new() -> Self {
239        SourceMapStorage::Full(Vec::new())
240    }
241
242    /// Add a source map
243    pub fn add(&mut self, map: SourceMap) -> DocumentId {
244        match self {
245            SourceMapStorage::Full(maps) => {
246                let id = maps.len() as DocumentId;
247                maps.push(map);
248                id
249            }
250            SourceMapStorage::Compact(maps) => {
251                let id = maps.len() as DocumentId;
252                maps.push(map.into_compact());
253                id
254            }
255            SourceMapStorage::None => 0, // Discarded
256        }
257    }
258
259    /// Resolve SourceRef to SourceLocation
260    pub fn locate(&self, source_ref: &SourceRef) -> Option<SourceLocation> {
261        match self {
262            SourceMapStorage::Full(maps) => {
263                let map = maps.get(source_ref.doc_id as usize)?;
264                Some(map.locate(source_ref.span.start))
265            }
266            SourceMapStorage::Compact(maps) => {
267                let map = maps.get(source_ref.doc_id as usize)?;
268                Some(map.locate(source_ref.span.start))
269            }
270            SourceMapStorage::None => None,
271        }
272    }
273
274    /// Get source text slice for XmlFragment (requires Full mode)
275    pub fn get_text(&self, doc_id: DocumentId, span: &SourceSpan) -> Option<&str> {
276        match self {
277            SourceMapStorage::Full(maps) => {
278                let map = maps.get(doc_id as usize)?;
279                map.get_text(span)
280            }
281            _ => None,
282        }
283    }
284
285    /// Compact storage by dropping source text (Full -> Compact)
286    ///
287    /// Saves ~90% memory but loses ability to extract source text spans.
288    pub fn compact(&mut self) {
289        if let SourceMapStorage::Full(maps) = self {
290            let compact_maps = maps.drain(..).map(|map| map.into_compact()).collect();
291            *self = SourceMapStorage::Compact(compact_maps);
292        }
293    }
294
295    /// Drop all source info (any -> None)
296    ///
297    /// Minimal memory but no location info in errors.
298    pub fn drop_all(&mut self) {
299        *self = SourceMapStorage::None;
300    }
301
302    /// Check if storage is empty
303    pub fn is_empty(&self) -> bool {
304        match self {
305            SourceMapStorage::Full(maps) => maps.is_empty(),
306            SourceMapStorage::Compact(maps) => maps.is_empty(),
307            SourceMapStorage::None => true,
308        }
309    }
310
311    /// Get number of documents stored
312    pub fn len(&self) -> usize {
313        match self {
314            SourceMapStorage::Full(maps) => maps.len(),
315            SourceMapStorage::Compact(maps) => maps.len(),
316            SourceMapStorage::None => 0,
317        }
318    }
319}
320
/// Compute the byte offset at which every line of `bytes` begins.
///
/// All three line-ending conventions are recognised:
/// - LF (Unix): \n
/// - CRLF (Windows): \r\n (treated as one terminator, not two)
/// - CR (Mac Classic): \r
///
/// The result is ascending and always begins with 0, so empty input yields
/// `[0]`. A terminator at the very end of the input still contributes a
/// start offset equal to the input length.
pub fn build_line_starts(bytes: &[u8]) -> Vec<usize> {
    let mut starts = vec![0];

    for (pos, &byte) in bytes.iter().enumerate() {
        let ends_line = match byte {
            b'\n' => true,
            // A CR directly followed by LF is half of a CRLF pair; the LF
            // that follows records the line start, so skip the CR here.
            b'\r' => bytes.get(pos + 1) != Some(&b'\n'),
            _ => false,
        };
        if ends_line {
            starts.push(pos + 1);
        }
    }

    starts
}
361
/// Unit tests for line-start indexing, offset-to-location mapping, span text
/// extraction, and the storage retention modes.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_build_line_starts_lf() {
        let bytes = b"line1\nline2\nline3";
        let starts = build_line_starts(bytes);
        assert_eq!(starts, vec![0, 6, 12]);
    }

    #[test]
    fn test_build_line_starts_crlf() {
        let bytes = b"line1\r\nline2\r\nline3";
        let starts = build_line_starts(bytes);
        assert_eq!(starts, vec![0, 7, 14]);
    }

    #[test]
    fn test_build_line_starts_cr() {
        let bytes = b"line1\rline2\rline3";
        let starts = build_line_starts(bytes);
        assert_eq!(starts, vec![0, 6, 12]);
    }

    #[test]
    fn test_build_line_starts_mixed() {
        let bytes = b"line1\nline2\r\nline3\rline4";
        let starts = build_line_starts(bytes);
        assert_eq!(starts, vec![0, 6, 13, 19]);
    }

    #[test]
    fn test_build_line_starts_empty_and_trailing_terminator() {
        // Empty input still has a first line starting at offset 0.
        assert_eq!(build_line_starts(b""), vec![0]);
        // A trailing terminator opens a final (empty) line at the input length.
        assert_eq!(build_line_starts(b"line1\n"), vec![0, 6]);
        // A lone CRLF counts as a single terminator.
        assert_eq!(build_line_starts(b"\r\n"), vec![0, 2]);
    }

    #[test]
    fn test_source_span_len_and_is_empty() {
        let span = SourceSpan::new(3, 8);
        assert_eq!(span.len(), 5);
        assert!(!span.is_empty());

        let empty = SourceSpan::new(4, 4);
        assert_eq!(empty.len(), 0);
        assert!(empty.is_empty());

        // Inverted spans are treated as empty rather than underflowing.
        let inverted = SourceSpan::new(9, 2);
        assert_eq!(inverted.len(), 0);
        assert!(inverted.is_empty());
    }

    #[test]
    fn test_source_map_locate() {
        let source = "line1\nline2\nline3".to_string();
        let map = SourceMap::new("test.xsd".to_string(), source);

        // First line, first column
        let loc = map.locate(0);
        assert_eq!(loc.line, 1);
        assert_eq!(loc.column, 1);

        // Second line, first column
        let loc = map.locate(6);
        assert_eq!(loc.line, 2);
        assert_eq!(loc.column, 1);

        // Second line, third column
        let loc = map.locate(8);
        assert_eq!(loc.line, 2);
        assert_eq!(loc.column, 3);
    }

    #[test]
    fn test_source_map_locate_past_end() {
        let map = SourceMap::new("test.xsd".to_string(), "line1\nline2".to_string());

        // Offsets beyond the text resolve to just past the last character.
        let loc = map.locate(1000);
        assert_eq!(loc.line, 2);
        assert_eq!(loc.column, 6);
    }

    #[test]
    fn test_source_map_utf8_columns() {
        let source = "Hello 世界\nNext line".to_string();
        let map = SourceMap::new("test.xsd".to_string(), source);

        // "世" starts at byte offset 6 and is the 7th character (1-based column 7)
        let loc = map.locate(6);
        assert_eq!(loc.line, 1);
        assert_eq!(loc.column, 7); // UTF-8 character count, not bytes
    }

    #[test]
    fn test_source_map_get_text() {
        let source = "line1\nline2\nline3".to_string();
        let map = SourceMap::new("test.xsd".to_string(), source);

        let span = SourceSpan::new(0, 5);
        assert_eq!(map.get_text(&span), Some("line1"));

        let span = SourceSpan::new(6, 11);
        assert_eq!(map.get_text(&span), Some("line2"));
    }

    #[test]
    fn test_source_map_get_text_invalid_span() {
        let source = "line1\nline2\nline3".to_string();
        let map = SourceMap::new("test.xsd".to_string(), source);

        // End beyond the text
        assert_eq!(map.get_text(&SourceSpan::new(0, 100)), None);
        // Inverted span
        assert_eq!(map.get_text(&SourceSpan::new(5, 2)), None);
    }

    #[test]
    fn test_compact_source_map_locate() {
        let map = SourceMap::new("test.xsd".to_string(), "line1\nline2".to_string());
        let compact = map.into_compact();

        assert_eq!(compact.text_len, 11);
        assert_eq!(compact.line_starts, vec![0, 6]);

        let loc = compact.locate(8);
        assert_eq!(loc.base_uri, "test.xsd");
        assert_eq!(loc.line, 2);
        assert_eq!(loc.column, 3);
    }

    #[test]
    fn test_source_map_storage() {
        let mut storage = SourceMapStorage::new();

        let map1 = SourceMap::new("test1.xsd".to_string(), "line1\nline2".to_string());
        let doc_id = storage.add(map1);

        let source_ref = SourceRef::new(doc_id, SourceSpan::new(0, 5));
        let loc = storage.locate(&source_ref).unwrap();
        assert_eq!(loc.line, 1);
        assert_eq!(loc.column, 1);

        // Test compact
        storage.compact();
        let loc = storage.locate(&source_ref).unwrap();
        assert_eq!(loc.line, 1);
        // Byte-based columns are exact at the start of a line.
        assert_eq!(loc.column, 1);

        // Cannot get text in compact mode
        assert!(storage.get_text(doc_id, &SourceSpan::new(0, 5)).is_none());
    }

    #[test]
    fn test_source_map_storage_drop_all() {
        let mut storage = SourceMapStorage::new();
        assert!(storage.is_empty());

        let map = SourceMap::new("test1.xsd".to_string(), "line1\nline2".to_string());
        let doc_id = storage.add(map);
        assert_eq!(storage.len(), 1);
        assert!(!storage.is_empty());

        let source_ref = SourceRef::new(doc_id, SourceSpan::new(0, 5));
        storage.drop_all();

        // No location or text is available once everything is dropped
        assert!(storage.locate(&source_ref).is_none());
        assert!(storage.get_text(doc_id, &SourceSpan::new(0, 5)).is_none());
        assert!(storage.is_empty());
        assert_eq!(storage.len(), 0);
    }
}
459}