nom_span/
lib.rs

1#![deny(clippy::all)]
2#![deny(clippy::pedantic)]
//! This library exposes `Spanned`, a struct that wraps your input and allows you to keep track of the line number, the column number and the byte offset.
4//!
5//! ## How to use it?
6//!
//! Here is a basic example of how to create the input and how to retrieve all the information you need.
8//!
9//! ```ignore
10//! use nom_span::Spanned;
11//!
12//! type Span<'a> = Spanned<&'a str>;
13//!
14//! fn main() {
15//!     let span = Span::new(
16//!       r#"{"hello": "world 🙌"}"#,
17//!       // Supporting UTF-8
18//!       true
19//!     );
20//!
21//!     assert_eq!(span.line(), 1);
22//!     assert_eq!(span.col(), 1);
23//!     assert_eq!(span.byte_offset(), 0);
24//! }
25//! ```
26//!
27//! You can notice that supporting UTF-8 is optional. The reason is that UTF-8 strings need to be handled in a different way than pure ASCII strings, and thus, there can be a performance gap with UTF-8 support (see the benchmark below)
28//!
29//! ### UTF-8 and ASCII comparison
30//!
//! A UTF-8 char can be made of 1 to 4 bytes, so counting it the ASCII way would count each byte of the UTF-8 char, and would result in an unexpected column number:
32//!
33//! ```ignore
34//! use nom_span::Spanned;
35//!
36//! type Span<'a> = Spanned<&'a str>;
37//!
38//! fn utf8_vs_ascii() {
39//!     let utf8 = Span::new("🙌", true);
40//!     let ascii = Span::new("🙌", false);
41//!
42//!     let utf8_after: IResult<Span<'_>, Vec<char>> = many1(anychar)(utf8);
43//!     let ascii_after: IResult<Span<'_>, Vec<char>> = many1(anychar)(ascii);
44//!
45//!     let (utf8_after, _) = utf8_after.unwrap();
46//!     let (ascii_after, _) = ascii_after.unwrap();
47//!
48//!     assert_eq!(utf8_after.col(), 2);
49//!     assert_eq!(ascii_after.col(), 5);
50//! }
51//!
52//! ```
53//!
54//! ## What about [nom_locate](https://github.com/fflorent/nom_locate)?
55//!
//! I was initially using [nom_locate](https://github.com/fflorent/nom_locate), but I faced some huge performance issues while building a [json parser](https://github.com/julesguesnon/spanned-json-parser), so I decided to implement my own input. I basically cloned [nom_locate](https://github.com/fflorent/nom_locate) and modified the counting function that was causing the performance issues. So thanks a lot for this awesome crate and please go add a star to it!
57//!
58//! ### What's the difference with [nom_locate](https://github.com/fflorent/nom_locate)?
59//!
//! [nom_locate](https://github.com/fflorent/nom_locate) recounts all the chars of your entire input (even the part you already consumed) every time you call `get_column`. If you call `get_column` for every char, the runtime would be: `O(N^2)`
//! This crate instead counts lines and columns every time you consume your input. If you call `col` for every char, the runtime would be: `O(2N)`
62//!
//! So if you're planning to get the column only a few times, for example only when an error occurs, it may be better to use [nom_locate](https://github.com/fflorent/nom_locate), but if you need it quite often, this crate should be better.
64
65use bytecount::num_chars;
66use memchr::Memchr;
67use nom::{
68    AsBytes, Compare, Err, ExtendInto, FindSubstring, FindToken, InputIter, InputLength, InputTake,
69    InputTakeAtPosition, Offset, ParseTo, Slice,
70};
71use std::{
72    ops::{RangeFrom, RangeTo},
73    str::FromStr,
74};
75
76extern crate bytecount;
77extern crate memchr;
78extern crate nom;
79
/// An input wrapper that carries a line number, a column number and a byte offset alongside
/// the wrapped data.
///
/// You can wrap your input in this struct with [`Spanned::new`]. A freshly created span sits at
/// line 1, column 1 and byte offset 0.
#[derive(Clone, Debug, Copy, PartialEq, Eq)]
pub struct Spanned<T> {
    /// The wrapped input.
    data: T,
    /// Current line number; a new span starts at 1.
    line: usize,
    /// Current column number; a new span starts at 1.
    col: usize,
    /// Current byte offset; a new span starts at 0.
    offset: usize,
    /// `true` when the span was created with UTF-8 support enabled.
    handle_utf8: bool,
}

impl<T> Spanned<T> {
    /// Wrap `data` in a span positioned at line 1, column 1 and byte offset 0.
    ///
    /// `handle_utf8` records whether UTF-8 support is enabled for this span.
    #[must_use]
    pub fn new(data: T, handle_utf8: bool) -> Self {
        Self {
            data,
            line: 1,
            col: 1,
            offset: 0,
            handle_utf8,
        }
    }

    /// Wrap `data` with UTF-8 support enabled; same as `Spanned::new(data, true)`.
    #[must_use]
    pub fn new_for_utf8(data: T) -> Self {
        Self::new(data, true)
    }

    /// Misspelled alias of [`Spanned::new_for_utf8`], kept so existing callers keep compiling.
    #[must_use]
    pub fn new_for_ut8(data: T) -> Self {
        Self::new_for_utf8(data)
    }

    /// Wrap `data` with UTF-8 support disabled; same as `Spanned::new(data, false)`.
    #[must_use]
    pub fn new_for_ascii(data: T) -> Self {
        Self::new(data, false)
    }

    /// Get the current line number
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Get the current column number
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }

    /// Get the current byte offset
    #[must_use]
    pub fn byte_offset(&self) -> usize {
        self.offset
    }

    /// Get the current data in the span
    #[must_use]
    pub fn data(&self) -> &T {
        &self.data
    }
}
141
142impl<T> core::ops::Deref for Spanned<T> {
143    type Target = T;
144    fn deref(&self) -> &Self::Target {
145        &self.data
146    }
147}
148
149impl<T, U> core::convert::AsRef<U> for Spanned<&T>
150where
151    T: ?Sized + core::convert::AsRef<U>,
152    U: ?Sized,
153{
154    fn as_ref(&self) -> &U {
155        self.data.as_ref()
156    }
157}
158
159impl<T> AsBytes for Spanned<T>
160where
161    T: AsBytes,
162{
163    fn as_bytes(&self) -> &[u8] {
164        self.data.as_bytes()
165    }
166}
167
168impl<T, Comp> Compare<Comp> for Spanned<T>
169where
170    T: Compare<Comp>,
171{
172    fn compare(&self, t: Comp) -> nom::CompareResult {
173        self.data.compare(t)
174    }
175
176    fn compare_no_case(&self, t: Comp) -> nom::CompareResult {
177        self.data.compare_no_case(t)
178    }
179}
180
181impl<T> ExtendInto for Spanned<T>
182where
183    T: ExtendInto,
184{
185    type Item = T::Item;
186
187    type Extender = T::Extender;
188
189    fn new_builder(&self) -> Self::Extender {
190        self.data.new_builder()
191    }
192
193    fn extend_into(&self, acc: &mut Self::Extender) {
194        self.data.extend_into(acc);
195    }
196}
197
198impl<T> FindSubstring<T> for Spanned<T>
199where
200    T: FindSubstring<T>,
201{
202    fn find_substring(&self, substr: T) -> Option<usize> {
203        self.data.find_substring(substr)
204    }
205}
206
207impl<T, Token> FindToken<Token> for Spanned<T>
208where
209    T: FindToken<Token>,
210{
211    fn find_token(&self, token: Token) -> bool {
212        self.data.find_token(token)
213    }
214}
215
216impl<T> InputIter for Spanned<T>
217where
218    T: InputIter,
219{
220    type Item = T::Item;
221
222    type Iter = T::Iter;
223
224    type IterElem = T::IterElem;
225
226    fn iter_indices(&self) -> Self::Iter {
227        self.data.iter_indices()
228    }
229
230    fn iter_elements(&self) -> Self::IterElem {
231        self.data.iter_elements()
232    }
233
234    fn position<P>(&self, predicate: P) -> Option<usize>
235    where
236        P: Fn(Self::Item) -> bool,
237    {
238        self.data.position(predicate)
239    }
240
241    fn slice_index(&self, count: usize) -> Result<usize, nom::Needed> {
242        self.data.slice_index(count)
243    }
244}
245
246impl<T> InputLength for Spanned<T>
247where
248    T: InputLength,
249{
250    fn input_len(&self) -> usize {
251        self.data.input_len()
252    }
253}
254
255impl<T> InputTake for Spanned<T>
256where
257    Self: Slice<RangeFrom<usize>> + Slice<RangeTo<usize>>,
258{
259    fn take(&self, count: usize) -> Self {
260        self.slice(..count)
261    }
262
263    fn take_split(&self, count: usize) -> (Self, Self) {
264        (self.slice(count..), self.slice(..count))
265    }
266}
267
268impl<T> InputTakeAtPosition for Spanned<T>
269where
270    T: InputTakeAtPosition + InputLength + InputIter,
271    Self: Slice<RangeFrom<usize>> + Slice<RangeTo<usize>> + Clone,
272{
273    type Item = <T as InputIter>::Item;
274
275    fn split_at_position<P, E: nom::error::ParseError<Self>>(
276        &self,
277        predicate: P,
278    ) -> nom::IResult<Self, Self, E>
279    where
280        P: Fn(Self::Item) -> bool,
281    {
282        match self.data.position(predicate) {
283            Some(n) => Ok(self.take_split(n)),
284            None => Err(Err::Incomplete(nom::Needed::new(1))),
285        }
286    }
287
288    fn split_at_position1<P, E: nom::error::ParseError<Self>>(
289        &self,
290        predicate: P,
291        _e: nom::error::ErrorKind,
292    ) -> nom::IResult<Self, Self, E>
293    where
294        P: Fn(Self::Item) -> bool,
295    {
296        match self.data.position(predicate) {
297            Some(n) => Ok(self.take_split(n)),
298            None => Err(Err::Incomplete(nom::Needed::new(1))),
299        }
300    }
301
302    fn split_at_position_complete<P, E: nom::error::ParseError<Self>>(
303        &self,
304        predicate: P,
305    ) -> nom::IResult<Self, Self, E>
306    where
307        P: Fn(Self::Item) -> bool,
308    {
309        match self.split_at_position(predicate) {
310            Err(Err::Incomplete(_)) => Ok(self.take_split(self.input_len())),
311            res => res,
312        }
313    }
314
315    fn split_at_position1_complete<P, E: nom::error::ParseError<Self>>(
316        &self,
317        predicate: P,
318        e: nom::error::ErrorKind,
319    ) -> nom::IResult<Self, Self, E>
320    where
321        P: Fn(Self::Item) -> bool,
322    {
323        match self.data.position(predicate) {
324            Some(0) => Err(Err::Error(E::from_error_kind(self.clone(), e))),
325            Some(n) => Ok(self.take_split(n)),
326            None => {
327                if self.data.input_len() == 0 {
328                    Err(Err::Error(E::from_error_kind(self.clone(), e)))
329                } else {
330                    Ok(self.take_split(self.input_len()))
331                }
332            }
333        }
334    }
335}
336
337impl<T> Offset for Spanned<T>
338where
339    T: Offset,
340{
341    fn offset(&self, second: &Self) -> usize {
342        self.data.offset(&second.data)
343    }
344}
345
346impl<T, R: FromStr> ParseTo<R> for Spanned<T>
347where
348    T: ParseTo<R>,
349{
350    fn parse_to(&self) -> Option<R> {
351        self.data.parse_to()
352    }
353}
354
355impl<T, R> Slice<R> for Spanned<T>
356where
357    T: Slice<R> + Offset + AsBytes + Slice<RangeTo<usize>>,
358{
359    fn slice(&self, range: R) -> Self {
360        let next_data = self.data.slice(range);
361
362        let offset = self.data.offset(&next_data);
363
364        let old_data = self.data.slice(..offset);
365
366        if offset == 0 {
367            return Self {
368                data: next_data,
369                line: self.line,
370                col: self.col,
371                offset: self.offset,
372                handle_utf8: self.handle_utf8,
373            };
374        }
375
376        let new_line_iter = Memchr::new(b'\n', old_data.as_bytes());
377
378        let mut lines_to_add = 0;
379        let mut last_index = None;
380        for i in new_line_iter {
381            lines_to_add += 1;
382            last_index = Some(i);
383        }
384        let last_index = last_index.map_or(0, |v| v + 1);
385
386        let col = if self.handle_utf8 {
387            num_chars(old_data.as_bytes().slice(last_index..))
388        } else {
389            old_data.as_bytes().len() - last_index
390        };
391
392        Self {
393            data: next_data,
394            line: self.line + lines_to_add,
395            col: if lines_to_add == 0 {
396                self.col + col
397            } else {
398                // When going to a new line, char starts at 1
399                col + 1
400            },
401            offset: self.offset + offset,
402            handle_utf8: self.handle_utf8,
403        }
404    }
405}