textframe/
lib.rs

1/*
2TextFrame
3  by Maarten van Gompel <proycon@anaproy.nl>
4  Digital Infrastructure, KNAW Humanities Cluster
5  licensed under the GNU General Public Licence v3
6*/
7
8use filetime::FileTime;
9use hmac_sha256::Hash;
10use minicbor::{Decode, Encode};
11use smallvec::{smallvec, SmallVec};
12
13use std::collections::btree_map::Entry;
14use std::collections::BTreeMap;
15use std::fmt;
16use std::fs::File;
17use std::io::{BufRead, BufReader, BufWriter, Read, Seek, SeekFrom};
18use std::ops::Bound::Included;
19use std::path::{Path, PathBuf};
20use std::string::FromUtf8Error;
21use std::time::SystemTime;
22
23/// Handle to a frame (index in a vector)
24type FrameHandle = u32;
25
/// Errors that can be returned by this library.
#[derive(Debug)]
pub enum Error {
    /// An offset or range lies outside the text (or the range is reversed).
    /// The fields carry the offending offsets as reported by the failing operation.
    OutOfBoundsError { begin: isize, end: isize },
    /// The given byte offset is not at a UTF-8 character boundary
    InvalidUtf8Byte(usize),
    /// The text is empty, so no position can be resolved
    EmptyText,
    /// An underlying I/O operation failed
    IOError(std::io::Error),
    /// Bytes read from disk were not valid UTF-8
    Utf8Error(FromUtf8Error),
    /// A frame handle does not refer to a loaded frame
    InvalidHandle,
    /// Processing the index file failed
    IndexError,
    /// The requested text is not loaded into memory
    NotLoaded,
    /// A line-based query was made but no line index is available
    NoLineIndex,
}

impl fmt::Display for Error {
    /// Writes the human-readable message for this error
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            // fixed messages
            Error::EmptyText => f.write_str("text is empty"),
            Error::NotLoaded => f.write_str("text not loaded"),
            Error::InvalidHandle => f.write_str("Invalid handle"),
            Error::IndexError => f.write_str("Index I/O error"),
            Error::NoLineIndex => f.write_str("No line index enabled"),
            // wrapped errors simply show the message of the underlying error
            Error::IOError(cause) => write!(f, "{}", cause),
            Error::Utf8Error(cause) => write!(f, "{}", cause),
            // messages that include the offending offsets
            Error::InvalidUtf8Byte(offset) => write!(
                f,
                "Byte does not correspond with utf-8 character boundary ({})",
                offset
            ),
            Error::OutOfBoundsError { begin, end } => {
                write!(f, "Out of Bounds ({},{})", begin, end)
            }
        }
    }
}

impl std::error::Error for Error {}
61
/// A single entry of the position index, relating a character offset to a byte offset.
///
/// `T` is the integer type in which both offsets are stored.
#[derive(Debug, Clone, Decode, Encode)]
pub struct PositionData<T>
where
    T: Eq + Ord + Copy,
{
    /// Character offset, counted in unicode points
    #[n(0)]
    charpos: T,

    /// UTF-8 byte offset
    #[n(1)]
    bytepos: T,

    /// Size in bytes of the character at this data point; when the index is built by
    /// `PositionIndex::new`, all characters up to the next entry in the index have this same size
    #[n(2)]
    size: u8,
}
79
/// Read access to the fields of a position entry, independent of the integer
/// width in which the entry stores its offsets.
pub trait Position {
    /// The character offset (in unicode points) of this entry
    fn charpos(&self) -> usize;
    /// The UTF-8 byte offset of this entry
    fn bytepos(&self) -> usize;
    /// The size in bytes recorded for this entry
    fn size(&self) -> u8;
}
85
86impl Position for PositionData<u32> {
87    fn charpos(&self) -> usize {
88        self.charpos as usize
89    }
90    fn bytepos(&self) -> usize {
91        self.bytepos as usize
92    }
93    fn size(&self) -> u8 {
94        self.size
95    }
96}
97
98impl Position for PositionData<u64> {
99    fn charpos(&self) -> usize {
100        self.charpos as usize
101    }
102    fn bytepos(&self) -> usize {
103        self.bytepos as usize
104    }
105    fn size(&self) -> u8 {
106        self.size
107    }
108}
109
/// Represents a text file and associates the file on disk with
/// immutable excerpts of it (frames) stored in memory.
pub struct TextFile {
    /// The path to the text file
    path: PathBuf,

    /// Holds loaded excerpts of the text (aka 'frames').
    frames: Vec<TextFrame>,

    /// Maps the begin byte offset of loaded frames to the handles of the frames
    /// that start at that offset (indirection into `frames`)
    frametable: BTreeMap<usize, SmallVec<[FrameHandle; 1]>>,

    /// Maps character positions to bytes
    positionindex: PositionIndex,

    /// File system metadata of the text file, obtained when this instance was created;
    /// the modification time is read from it
    metadata: std::fs::Metadata,
}
128
/// A frame is a fragment of loaded text
struct TextFrame {
    /// Byte offset in the file where this fragment begins
    beginbyte: usize,
    /// Byte offset in the file where this fragment ends (non-inclusive)
    endbyte: usize,
    /// The text of the fragment
    text: String,
}
135
/// Index over a text file: its sizes, the mapping between character and byte
/// positions, its checksum and (optionally) the byte offsets of its lines.
/// It can be encoded to and decoded from an index file.
#[derive(Debug, Clone, Decode, Encode)]
struct PositionIndex {
    /// Length of the text file in characters
    #[n(0)]
    charsize: usize,

    /// Size of the text file in bytes
    #[n(1)]
    bytesize: usize,

    /// Maps character positions to bytes
    #[n(2)]
    positions: Positions,

    /// SHA256 checksum of the contents
    #[n(3)]
    checksum: [u8; 32],

    /// Maps lines to bytes (if enabled), stays empty when no line index was computed
    #[n(4)]
    lines: Lines,
}
158
159impl Default for PositionIndex {
160    fn default() -> Self {
161        Self {
162            charsize: 0,
163            bytesize: 0,
164            lines: Lines::default(),
165            positions: Positions::Large(Vec::default()),
166            checksum: Default::default(),
167        }
168    }
169}
170
#[derive(Debug, Clone, Decode, Encode)]
/// Abstraction over differently sized position vectors.
/// The variant determines the integer width in which offsets are stored.
pub enum Positions {
    /// Offsets stored as 16-bit integers
    #[n(0)]
    Small(#[n(0)] Vec<PositionData<u16>>),

    /// Offsets stored as 32-bit integers
    #[n(1)]
    Large(#[n(0)] Vec<PositionData<u32>>),

    /// Offsets stored as 64-bit integers
    #[n(2)]
    Huge(#[n(0)] Vec<PositionData<u64>>),
}
183
184impl Positions {
185    pub fn new(filesize: usize) -> Self {
186        if filesize < 65536 {
187            Self::Small(Vec::new())
188        } else if filesize < 4294967296 {
189            Self::Large(Vec::new())
190        } else {
191            Self::Huge(Vec::new())
192        }
193    }
194
195    pub fn len(&self) -> usize {
196        match self {
197            Self::Small(positions) => positions.len(),
198            Self::Large(positions) => positions.len(),
199            Self::Huge(positions) => positions.len(),
200        }
201    }
202
203    pub fn bytepos(&self, index: usize) -> Option<usize> {
204        match self {
205            Self::Small(positions) => positions.get(index).map(|x| x.bytepos as usize),
206            Self::Large(positions) => positions.get(index).map(|x| x.bytepos as usize),
207            Self::Huge(positions) => positions.get(index).map(|x| x.bytepos as usize),
208        }
209    }
210    pub fn charpos(&self, index: usize) -> Option<usize> {
211        match self {
212            Self::Small(positions) => positions.get(index).map(|x| x.charpos as usize),
213            Self::Large(positions) => positions.get(index).map(|x| x.charpos as usize),
214            Self::Huge(positions) => positions.get(index).map(|x| x.charpos as usize),
215        }
216    }
217    pub fn size(&self, index: usize) -> Option<u8> {
218        match self {
219            Self::Small(positions) => positions.get(index).map(|x| x.size),
220            Self::Large(positions) => positions.get(index).map(|x| x.size),
221            Self::Huge(positions) => positions.get(index).map(|x| x.size),
222        }
223    }
224
225    pub fn binary_search(&self, charpos: usize) -> Result<usize, usize> {
226        match self {
227            Self::Small(positions) => positions
228                .binary_search_by_key(&charpos, |posdata: &PositionData<u16>| {
229                    posdata.charpos as usize
230                }),
231            Self::Large(positions) => positions
232                .binary_search_by_key(&charpos, |posdata: &PositionData<u32>| {
233                    posdata.charpos as usize
234                }),
235            Self::Huge(positions) => positions
236                .binary_search_by_key(&charpos, |posdata: &PositionData<u64>| {
237                    posdata.charpos as usize
238                }),
239        }
240    }
241
242    pub fn binary_search_by_bytepos(&self, bytepos: usize) -> Result<usize, usize> {
243        match self {
244            Self::Small(positions) => positions
245                .binary_search_by_key(&bytepos, |posdata: &PositionData<u16>| {
246                    posdata.bytepos as usize
247                }),
248            Self::Large(positions) => positions
249                .binary_search_by_key(&bytepos, |posdata: &PositionData<u32>| {
250                    posdata.bytepos as usize
251                }),
252            Self::Huge(positions) => positions
253                .binary_search_by_key(&bytepos, |posdata: &PositionData<u64>| {
254                    posdata.bytepos as usize
255                }),
256        }
257    }
258
259    pub fn push(&mut self, charpos: usize, bytepos: usize, charsize: u8) {
260        match self {
261            Self::Small(positions) => positions.push(PositionData {
262                charpos: charpos as u16,
263                bytepos: bytepos as u16,
264                size: charsize,
265            }),
266            Self::Large(positions) => positions.push(PositionData {
267                charpos: charpos as u32,
268                bytepos: bytepos as u32,
269                size: charsize,
270            }),
271            Self::Huge(positions) => positions.push(PositionData {
272                charpos: charpos as u64,
273                bytepos: bytepos as u64,
274                size: charsize,
275            }),
276        }
277    }
278}
279
#[derive(Debug, Clone, Decode, Encode)]
/// Abstraction over differently sized vectors holding the byte offsets at which lines begin.
/// Lines start at 0. NOTE(review): the index built by `PositionIndex::new` appends one
/// extra entry that marks the end of the text, so the vector then holds one item more
/// than the number of lines read.
pub enum Lines {
    /// Offsets stored as 16-bit integers
    #[n(0)]
    Small(#[n(0)] Vec<u16>),

    /// Offsets stored as 32-bit integers
    #[n(1)]
    Large(#[n(0)] Vec<u32>),

    /// Offsets stored as 64-bit integers
    #[n(2)]
    Huge(#[n(0)] Vec<u64>),
}
293
294impl Lines {
295    pub fn new(filesize: usize) -> Self {
296        if filesize < 65536 {
297            Self::Small(Vec::new())
298        } else if filesize < 4294967296 {
299            Self::Large(Vec::new())
300        } else {
301            Self::Huge(Vec::new())
302        }
303    }
304
305    /// Returns the total number of lines
306    pub fn len(&self) -> usize {
307        match self {
308            Self::Small(positions) => positions.len(),
309            Self::Large(positions) => positions.len(),
310            Self::Huge(positions) => positions.len(),
311        }
312    }
313
314    /// Returns the byte position where a line begins
315    pub fn get(&self, index: usize) -> Option<usize> {
316        match self {
317            Self::Small(positions) => positions.get(index).map(|x| *x as usize),
318            Self::Large(positions) => positions.get(index).map(|x| *x as usize),
319            Self::Huge(positions) => positions.get(index).map(|x| *x as usize),
320        }
321    }
322
323    pub fn push(&mut self, line: usize) {
324        match self {
325            Self::Small(positions) => positions.push(line as u16),
326            Self::Large(positions) => positions.push(line as u32),
327            Self::Huge(positions) => positions.push(line as u64),
328        }
329    }
330}
331
332impl Default for Lines {
333    fn default() -> Self {
334        Self::Large(Vec::new())
335    }
336}
337
/// Text file mode, determines what gets indexed.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum TextFileMode {
    /// Do not compute a line index (cheapest), set this if you're not interested in line-based queries
    NoLineIndex,

    /// Compute a line index (takes memory and cpu time), allows queries based on line ranges
    WithLineIndex,
}

impl Default for TextFileMode {
    /// A line index is computed by default
    fn default() -> Self {
        TextFileMode::WithLineIndex
    }
}
353
354impl TextFile {
355    /// Associates with an existing text file on disk, you can optionally provide a path to an indexfile to use for caching the position index. Is such a cache is not available, the text file is scanned once and the index created.
356
357    /// * `path` - The text file
358    /// * `indexpath` - The associated index file, acts as a cache if provided to prevent recomputation every time
359    /// * `mode` - Additional options
360    pub fn new(
361        path: impl Into<PathBuf>,
362        indexpath: Option<&Path>,
363        mode: TextFileMode,
364    ) -> Result<Self, Error> {
365        let path: PathBuf = path.into();
366        let metadata = std::fs::metadata(path.as_path()).map_err(|e| Error::IOError(e))?;
367        let mut build_index = true;
368        let mut positionindex = PositionIndex::default();
369        if let Some(indexpath) = indexpath.as_ref() {
370            if indexpath.exists() {
371                let indexmetadata = std::fs::metadata(indexpath).map_err(|e| Error::IOError(e))?;
372                if FileTime::from_last_modification_time(&indexmetadata)
373                    >= FileTime::from_last_modification_time(&metadata)
374                {
375                    positionindex = PositionIndex::from_file(indexpath)?;
376                    build_index = false;
377                }
378            }
379        }
380        if build_index {
381            positionindex = PositionIndex::new(path.as_path(), metadata.len(), mode)?;
382        }
383        if let Some(indexpath) = indexpath.as_ref() {
384            positionindex.to_file(indexpath)?;
385        }
386        Ok(Self {
387            path,
388            frames: Vec::new(),
389            frametable: BTreeMap::new(),
390            positionindex,
391            metadata,
392        })
393    }
394
395    /// Returns the filename on disk
396    pub fn path(&self) -> &Path {
397        self.path.as_path()
398    }
399
400    /// Returns a text fragment. The fragment must already be in memory or an Error::NotLoaded will be returned.
401    /// Use `get_or_load()` instead if the fragment might not be loaded yet.
402    ///
403    /// * `begin` - The begin offset in unicode character points (0-indexed). If negative, it is interpreted relative to the end of the text.
404    /// * `end` - The end offset in unicode character points (0-indexed, non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
405    pub fn get(&self, begin: isize, end: isize) -> Result<&str, Error> {
406        let (beginchar, endchar) = self.absolute_pos(begin, end)?;
407        let beginbyte = self.chars_to_bytes(beginchar)?;
408        let endbyte = self.chars_to_bytes(endchar)?;
409        self.get_byterange_unchecked(beginbyte, endbyte)
410    }
411
412    /// Returns the text for a byte range, checks if the byte range is at valid UTF-8 character boundaries and returns an InvalidUtf8Bytes error if not
413    pub fn get_byterange(&self, beginbyte: usize, endbyte: usize) -> Result<&str, Error> {
414        self.frame(beginbyte, endbyte)
415            .ok_or(Error::NotLoaded)
416            .map(|frame| {
417                //verify beginbyte and endbyte are at a char boundary, return error if not
418                self.bytes_to_chars(beginbyte - frame.beginbyte)?;
419                self.bytes_to_chars(endbyte - frame.beginbyte)?;
420                Ok(
421                    &frame.text.as_str()
422                        [(beginbyte - frame.beginbyte)..(endbyte - frame.beginbyte)],
423                )
424            })?
425    }
426
427    /// Returns the text for a byte range, but may panic if the byte range is not at valid UTF-8 character offsets
428    /// This is more performant than get_byterange() but can only be used if you're sure the bytes are valid
429    pub fn get_byterange_unchecked(&self, beginbyte: usize, endbyte: usize) -> Result<&str, Error> {
430        self.frame(beginbyte, endbyte)
431            .ok_or(Error::NotLoaded)
432            .map(|frame| {
433                &frame.text.as_str()[(beginbyte - frame.beginbyte)..(endbyte - frame.beginbyte)]
434            })
435    }
436
437    /// Returns a text fragment by lines. The fragment must already be in memory or an Error::NotLoaded will be returned.
438    /// Use `get_lines_or_load()` instead if the fragment might not be loaded yet.
439    ///
440    /// * `begin` - The begin line (0-indexed!!). If negative, it is interpreted relative to the end of the text.
441    /// * `end` - The end line (0-indexed!! non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
442    ///
443    /// This will return Error::NoLineIndex if no line index was computed.
444    /// Trailing newline characters will always be returned.
445    pub fn get_lines(&self, begin: isize, end: isize) -> Result<&str, Error> {
446        let (beginbyte, endbyte) = self.line_range_to_byte_range(begin, end)?;
447        self.get_byterange_unchecked(beginbyte, endbyte)
448    }
449
450    /// Returns a text fragment, the fragment will be loaded from disk into memory if needed.
451    /// Use `get()` instead if you are already sure the fragment is loaded
452    ///
453    /// * `begin` - The begin offset in unicode character points (0-indexed). If negative, it is interpreted relative to the end of the text.
454    /// * `end` - The end offset in unicode character points (0-indexed, non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
455    pub fn get_or_load(&mut self, begin: isize, end: isize) -> Result<&str, Error> {
456        let (beginchar, endchar) = self.absolute_pos(begin, end)?;
457        let beginbyte = self.chars_to_bytes(beginchar)?;
458        let endbyte = self.chars_to_bytes(endchar)?;
459        match self.framehandle(beginbyte, endbyte) {
460            Some(framehandle) => {
461                let frame = self.resolve(framehandle)?;
462                Ok(
463                    &frame.text.as_str()
464                        [(beginbyte - frame.beginbyte)..(endbyte - frame.beginbyte)],
465                )
466            }
467            None => {
468                self.load_abs(beginchar, endchar)?;
469                self.get(begin, end)
470            }
471        }
472    }
473
474    /// Returns a text fragment, the fragment will be loaded from disk into memory if needed.
475    /// Use `get_lines()` instead if you are already sure the fragment is loaded
476    ///
477    /// * `begin` - The begin line (0-indexed!!). If negative, it is interpreted relative to the end of the text.
478    /// * `end` - The end line (0-indexed!! non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
479    ///
480    /// This will return Error::NoLineIndex if no line index was computed.
481    /// Trailing newline characters will always be returned.
482    pub fn get_or_load_lines(&mut self, begin: isize, end: isize) -> Result<&str, Error> {
483        let beginbyte = self.line_to_bytes(begin)?;
484        let endbyte = if end == 0 {
485            self.positionindex.bytesize
486        } else {
487            self.line_to_bytes(end)?
488        };
489        if let Some(framehandle) = self.framehandle(beginbyte, endbyte) {
490            let frame = self.resolve(framehandle)?;
491            return Ok(
492                &frame.text.as_str()[(beginbyte - frame.beginbyte)..(endbyte - frame.beginbyte)]
493            );
494        }
495        self.load_frame(beginbyte, endbyte)?;
496        if let Some(frame) = self.frame(beginbyte, endbyte) {
497            Ok(&frame.text.as_str()[(beginbyte - frame.beginbyte)..(endbyte - frame.beginbyte)])
498        } else {
499            Err(Error::NotLoaded)
500        }
501    }
502
503    /// Loads a particular text range into memory
504    ///
505    /// * `begin` - The begin offset in unicode character points (0-indexed). If negative, it is interpreted relative to the end of the text.
506    /// * `end` - The end offset in unicode character points (0-indexed, non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
507    pub fn load(&mut self, begin: isize, end: isize) -> Result<(), Error> {
508        let (beginchar, endchar) = self.absolute_pos(begin, end)?;
509        self.load_abs(beginchar, endchar)
510    }
511
512    /// Get a frame from a given handle
513    fn resolve(&self, handle: FrameHandle) -> Result<&TextFrame, Error> {
514        if let Some(frame) = self.frames.get(handle as usize) {
515            Ok(frame)
516        } else {
517            Err(Error::InvalidHandle)
518        }
519    }
520
521    /// Returns an existing frame handle that holds the given byte offset (if any is loaded)
522    fn framehandle(&self, beginbyte: usize, endbyte: usize) -> Option<FrameHandle> {
523        let mut iter = self.frametable.range((Included(&0), Included(&beginbyte)));
524        // read the (double-ended) iterator backwards
525        // and see if we find a frame that holds the bytes we want
526        while let Some((_, framehandles)) = iter.next_back() {
527            for handle in framehandles {
528                if let Some(frame) = self.frames.get(*handle as usize) {
529                    if frame.endbyte >= endbyte {
530                        return Some(*handle);
531                    }
532                }
533            }
534        }
535        None
536    }
537
538    /// Returns an existing frame that holds the given byte offset (if any is loaded)
539    fn frame(&self, beginbyte: usize, endbyte: usize) -> Option<&TextFrame> {
540        let mut iter = self.frametable.range((Included(&0), Included(&beginbyte)));
541        // read the (double-ended) iterator backwards
542        // and see if we find a frame that holds the bytes we want
543        while let Some((_, framehandles)) = iter.next_back() {
544            for handle in framehandles {
545                if let Some(frame) = self.frames.get(*handle as usize) {
546                    if frame.endbyte >= endbyte {
547                        return Some(frame);
548                    }
549                }
550            }
551        }
552        None
553    }
554
555    /// Loads a particular text range into memory, takes absolute offsets
556    fn load_abs(&mut self, beginchar: usize, endchar: usize) -> Result<(), Error> {
557        let beginbyte = self.chars_to_bytes(beginchar)?;
558        let endbyte = self.chars_to_bytes(endchar)?;
559        match self.load_frame(beginbyte, endbyte) {
560            Ok(_handle) => Ok(()),
561            Err(e) => Err(e),
562        }
563    }
564
565    /// Loads a text frame from disk into memory
566    fn load_frame(&mut self, beginbyte: usize, endbyte: usize) -> Result<FrameHandle, Error> {
567        if beginbyte > endbyte {
568            return Err(Error::OutOfBoundsError {
569                begin: beginbyte as isize,
570                end: endbyte as isize,
571            });
572        }
573        let mut buffer: Vec<u8> = vec![0; endbyte - beginbyte];
574        let mut file = File::open(self.path.as_path()).map_err(|e| Error::IOError(e))?;
575        file.seek(SeekFrom::Start(beginbyte as u64))
576            .map_err(|e| Error::IOError(e))?;
577        file.read_exact(&mut buffer)
578            .map_err(|e| Error::IOError(e))?;
579        let frame = TextFrame {
580            beginbyte,
581            endbyte,
582            text: String::from_utf8(buffer).map_err(|e| Error::Utf8Error(e))?,
583        };
584        self.frames.push(frame);
585        let handle = (self.frames.len() - 1) as FrameHandle;
586        match self.frametable.entry(beginbyte) {
587            Entry::Occupied(mut entry) => entry.get_mut().push(handle),
588            Entry::Vacant(entry) => {
589                entry.insert(smallvec!(handle));
590            }
591        }
592        Ok(handle)
593    }
594
595    /// Convert a character position to byte position
596    pub fn chars_to_bytes(&self, charpos: usize) -> Result<usize, Error> {
597        match self.positionindex.positions.binary_search(charpos) {
598            Ok(index) => {
599                //exact match
600                Ok(self
601                    .positionindex
602                    .positions
603                    .bytepos(index)
604                    .expect("position should exist"))
605            }
606            Err(0) => {
607                //insertion before first item should never happen **except if a file is empty**, because the first PositionData item is always the first char
608                Err(Error::EmptyText)
609            }
610            Err(index) => {
611                //miss, compute from the item just before, index (>0) will be the item just after the failure
612                let charpos2 = self
613                    .positionindex
614                    .positions
615                    .charpos(index - 1)
616                    .expect("position should exist");
617                let charoffset = charpos - charpos2;
618                let bytepos = self
619                    .positionindex
620                    .positions
621                    .bytepos(index - 1)
622                    .expect("position should exist")
623                    + (self
624                        .positionindex
625                        .positions
626                        .size(index - 1)
627                        .expect("position should exist") as usize
628                        * charoffset);
629                if bytepos > self.positionindex.bytesize {
630                    Err(Error::OutOfBoundsError {
631                        begin: bytepos as isize,
632                        end: 0,
633                    })
634                } else {
635                    Ok(bytepos)
636                }
637            }
638        }
639    }
640
641    /// Convert a UTF-8 byte position to a character position. Returns `Error::InvalidUtf8Byte` if the byte is not at a character boundary
642    pub fn bytes_to_chars(&self, bytepos: usize) -> Result<usize, Error> {
643        if bytepos > self.positionindex.bytesize {
644            return Err(Error::OutOfBoundsError {
645                begin: bytepos as isize,
646                end: 0,
647            });
648        }
649
650        match self
651            .positionindex
652            .positions
653            .binary_search_by_bytepos(bytepos)
654        {
655            Ok(index) => Ok(self.positionindex.positions.charpos(index).unwrap()),
656            Err(0) => {
657                //insertion before first item should never happen **except if a file is empty**, because the first PositionData item is always the first byte
658                Err(Error::EmptyText)
659            }
660            Err(index) => {
661                let prev_byte = self.positionindex.positions.bytepos(index - 1).unwrap();
662                let prev_char = self.positionindex.positions.charpos(index - 1).unwrap();
663                let size = self.positionindex.positions.size(index - 1).unwrap() as usize;
664                if (bytepos - prev_byte) % size == 0 {
665                    Ok(prev_char + (bytepos - prev_byte) / size)
666                } else {
667                    Err(Error::InvalidUtf8Byte(bytepos))
668                }
669            }
670        }
671    }
672
673    /// Convert a line number (0-indexed!! first line is 0!) to bytes position.
674    /// Relative lines numbers (negative) are supported here as well.
675    /// This will return an `Error::IndexError` if no line index was computed/loaded.
676    pub fn line_to_bytes(&self, line: isize) -> Result<usize, Error> {
677        let num_lines = self.positionindex.lines.len();
678
679        if num_lines == 0 {
680            return Err(Error::NoLineIndex);
681        }
682
683        // Handle negative indexing
684        let line = if line < 0 {
685            let abs = line.unsigned_abs();
686            if abs > num_lines {
687                return Err(Error::OutOfBoundsError {
688                    begin: line,
689                    end: 0,
690                });
691            }
692            num_lines - abs
693        } else {
694            line as usize
695        };
696
697        // One past the last line = end of file
698        if line == num_lines {
699            return Ok(self.positionindex.bytesize);
700        }
701
702        self.positionindex
703            .lines
704            .get(line)
705            .ok_or(Error::OutOfBoundsError {
706                begin: line as isize,
707                end: 0,
708            })
709    }
710
711    pub fn line_range_to_byte_range(
712        &self,
713        begin: isize,
714        end: isize,
715    ) -> Result<(usize, usize), Error> {
716        let beginbyte = self.line_to_bytes(begin)?;
717        let endbyte = if end == 0 {
718            self.positionindex.bytesize
719        } else {
720            self.line_to_bytes(end)?
721        };
722
723        Ok((beginbyte, endbyte))
724    }
725
726    /// Converts relative character offset to an absolute one. If the offset is already absolute, it will be returned as is.
727    ///
728    /// * `begin` - The begin offset in unicode character points (0-indexed). If negative, it is interpreted relative to the end of the text.
729    /// * `end` - The end offset in unicode character points (0-indexed, non-inclusive). If 0 or negative, it is interpreted relative to the end of the text.
730    pub fn absolute_pos(&self, mut begin: isize, mut end: isize) -> Result<(usize, usize), Error> {
731        if begin < 0 {
732            begin += self.positionindex.charsize as isize;
733        }
734
735        if end <= 0 {
736            end += self.positionindex.charsize as isize;
737        }
738
739        if begin < 0 || end < 0 || begin > end {
740            return Err(Error::OutOfBoundsError { begin, end });
741        }
742
743        Ok((begin as usize, end as usize))
744    }
745
746    /// Converts relative line offset into absolute character-based one. If the offset is already absolute, it will
747    /// be returned as is.
748    ///
749    /// * `begin` - The begin offset in line numbers. If negative, it is interpreted relative to
750    ///   the end of the text
751    /// * `end` - The end offset in line numbers. If zero or negative, it is interpreted relative to
752    ///   the end of the text
753    pub fn absolute_line_pos(
754        &self,
755        mut begin: isize,
756        mut end: isize,
757    ) -> Result<(usize, usize), Error> {
758        if begin < 0 {
759            begin += self.positionindex.lines.len() as isize;
760        }
761
762        if end <= 0 {
763            end += self.positionindex.lines.len() as isize;
764        }
765
766        if begin < 0 || end < 0 || begin > end {
767            return Err(Error::OutOfBoundsError { begin, end });
768        }
769
770        let beginbyte = self.line_to_bytes(begin)?;
771        let endbyte = self.line_to_bytes(end)?;
772
773        Ok((
774            self.bytes_to_chars(beginbyte)?,
775            self.bytes_to_chars(endbyte)?,
776        ))
777    }
778
779    /// Returns the length of the total text file in characters, i.e. the number of character in the text
780    pub fn len(&self) -> usize {
781        self.positionindex.charsize
782    }
783
784    /// Returns the length of the total text file in bytes
785    pub fn len_utf8(&self) -> usize {
786        self.positionindex.bytesize
787    }
788
789    /// Returns the unix timestamp when the file was last modified
790    pub fn mtime(&self) -> u64 {
791        if let Ok(modified) = self.metadata.modified() {
792            modified
793                .duration_since(SystemTime::UNIX_EPOCH)
794                .expect("invalid file timestamp (before unix epoch)")
795                .as_secs()
796        } else {
797            0
798        }
799    }
800
801    /// Returns the SHA-256 checksum
802    pub fn checksum(&self) -> &[u8; 32] {
803        &self.positionindex.checksum
804    }
805
806    /// Returns the SHA-256 checksum as a digest string
807    pub fn checksum_digest(&self) -> String {
808        format!("{:x}", HexDigest(self.checksum()))
809    }
810}
811
812impl PositionIndex {
813    /// Build a new positionindex for a given text file
814    fn new(textfile: &Path, filesize: u64, options: TextFileMode) -> Result<Self, Error> {
815        let mut charpos = 0;
816        let mut bytepos = 0;
817        let mut prevcharsize = 0;
818        let textfile = File::open(textfile).map_err(|e| Error::IOError(e))?;
819
820        // read with a line by line reader to prevent excessive read() syscalls and handle UTF-8 properly
821        let mut reader = BufReader::new(textfile);
822        let mut positions = Positions::new(filesize as usize);
823        let mut lines = Lines::new(filesize as usize);
824        let mut line = String::new();
825        let mut checksum = Hash::new();
826        loop {
827            let read_bytes = reader.read_line(&mut line).map_err(|e| Error::IOError(e))?;
828            if read_bytes == 0 {
829                //EOF
830                break;
831            } else {
832                checksum.update(&line);
833                if options == TextFileMode::WithLineIndex {
834                    lines.push(bytepos);
835                }
836                for char in line.chars() {
837                    let charsize = char.len_utf8() as u8;
838                    if charsize != prevcharsize {
839                        positions.push(charpos, bytepos, charsize);
840                    }
841                    charpos += 1;
842                    bytepos += charsize as usize;
843                    prevcharsize = charsize;
844                }
845                //clear buffer for next read
846                line.clear();
847            }
848        }
849        let checksum = checksum.finalize();
850        if options == TextFileMode::WithLineIndex {
851            //the last 'line' marks the end position
852            lines.push(bytepos);
853        }
854        Ok(PositionIndex {
855            charsize: charpos,
856            bytesize: bytepos,
857            positions,
858            checksum,
859            lines,
860        })
861    }
862
863    /// Save a positionindex to file
864    fn to_file(&mut self, path: &Path) -> Result<(), Error> {
865        let file = File::create(path).map_err(|e| Error::IOError(e))?;
866        let writer = BufWriter::new(file);
867        let writer = minicbor::encode::write::Writer::new(writer);
868        minicbor::encode(self, writer).map_err(|_| Error::IndexError)?;
869        Ok(())
870    }
871
872    /// Load a positionindex from file (quicker than recomputing)
873    fn from_file(path: &Path) -> Result<Self, Error> {
874        let file = File::open(path).map_err(|e| Error::IOError(e))?;
875        let mut reader = BufReader::new(file);
876        let mut buffer: Vec<u8> = Vec::new(); //will hold the entire CBOR file!!!
877        reader
878            .read_to_end(&mut buffer)
879            .map_err(|e| Error::IOError(e))?;
880        Ok(minicbor::decode(&buffer).map_err(|_| Error::IndexError)?)
881    }
882}
883
/// Borrowed view on a 32-byte digest that can be formatted as hexadecimal text.
struct HexDigest<'a>(&'a [u8; 32]);

// Lowercase hex formatting (`{:x}`): every byte is written as exactly two hex digits,
// in order and without separators.
impl fmt::LowerHex for HexDigest<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // stops at the first formatting error, like a `?` inside a loop would
        self.0
            .iter()
            .try_for_each(|byte| write!(f, "{:02x}", byte))
    }
}
895
// Unit tests. Every test writes one of the fixtures below to a temporary file and opens it
// as a `TextFile`. The fixture texts must stay byte-exact: the length and checksum
// assertions depend on them.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    // all single byte-characters, for baseline testing
    // (starts with an empty line and ends with a newline)
    const EXAMPLE_ASCII_TEXT: &str = "
Article 1

All human beings are born free and equal in dignity and rights. They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.

Article 2

Everyone is entitled to all the rights and freedoms set forth in this Declaration, without distinction of any kind, such as race, colour, sex, language, religion, political or other opinion, national or social origin, property, birth or other status. Furthermore, no distinction shall be made on the basis of the political, jurisdictional or international status of the country or territory to which a person belongs, whether it be independent, trust, non-self-governing or under any other limitation of sovereignty.

Article 3

Everyone has the right to life, liberty and security of person.

Article 4

No one shall be held in slavery or servitude; slavery and the slave trade shall be prohibited in all their forms.
";

    // multi-byte characters (mixed with single-byte)
    // the newlines, commas and the semicolon are single-byte, the other characters take 3 bytes
    const EXAMPLE_UNICODE_TEXT: &str = "
第一条

人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄弟关系的精神相对待。
第二条

人人有资格享有本宣言所载的一切权利和自由,不分种族、肤色、性别、语言、宗教、政治或其他见解、国籍或社会出身、财产、出生或其他身分等任何区别。

并且不得因一人所属的国家或领土的政治的、行政的或者国际的地位之不同而有所区别,无论该领土是独立领土、托管领土、非自治领土或者处于其他任何主权受限制的情况之下。
第三条

人人有权享有生命、自由和人身安全。
第四条

任何人不得使为奴隶或奴役;一切形式的奴隶制度和奴隶买卖,均应予以禁止。
";
    // tiny fixture without any newline; every character takes 2 bytes in UTF-8
    const EXAMPLE_3_TEXT: &str = "ПРИВЕТ";

    /// Writes `EXAMPLE_ASCII_TEXT` to a fresh temporary file and returns its handle.
    fn setup_ascii() -> NamedTempFile {
        let mut file = tempfile::NamedTempFile::new().expect("temp file");
        write!(file, "{}", EXAMPLE_ASCII_TEXT).expect("write must work");
        file
    }

    /// Writes `EXAMPLE_UNICODE_TEXT` to a fresh temporary file and returns its handle.
    fn setup_unicode() -> NamedTempFile {
        let mut file = tempfile::NamedTempFile::new().expect("temp file");
        write!(file, "{}", EXAMPLE_UNICODE_TEXT).expect("write must work");
        file
    }

    /// Writes `EXAMPLE_3_TEXT` to a fresh temporary file and returns its handle.
    fn setup_3() -> NamedTempFile {
        let mut file = tempfile::NamedTempFile::new().expect("temp file");
        write!(file, "{}", EXAMPLE_3_TEXT).expect("write must work");
        file
    }

    /// Returns the handle of a fresh temporary file to which nothing is written.
    fn setup_empty() -> NamedTempFile {
        let file = tempfile::NamedTempFile::new().expect("temp file");
        file
    }

    /// ASCII fixture: character count and byte count are both 914.
    #[test]
    pub fn test001_init_ascii() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert_eq!(textfile.len(), 914);
        assert_eq!(textfile.len_utf8(), 914);
    }

    /// Unicode fixture: 271 characters take up 771 bytes.
    #[test]
    pub fn test001_init_unicode() {
        let file = setup_unicode();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert_eq!(textfile.len(), 271);
        assert_eq!(textfile.len_utf8(), 771);
    }

    /// `get_or_load(0, 0)` is expected to return the complete ASCII text.
    #[test]
    pub fn test002_load_ascii() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(0, 0).expect("text should exist");
        assert_eq!(text, EXAMPLE_ASCII_TEXT);
    }

    /// Same as above, but with an explicit `load(0, 0)` followed by `get(0, 0)`.
    #[test]
    pub fn test002_load_ascii_explicit() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        let text = textfile.get(0, 0).expect("text should exist");
        assert_eq!(text, EXAMPLE_ASCII_TEXT);
    }

    /// `get_or_load(0, 0)` is expected to return the complete unicode text.
    #[test]
    pub fn test002_load_unicode() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(0, 0).expect("text should exist");
        assert_eq!(text, EXAMPLE_UNICODE_TEXT);
    }

    /// `get_or_load(0, 0)` on the tiny all-multi-byte fixture returns the complete text.
    #[test]
    pub fn test002_load_unicode_tiny() {
        let file = setup_3();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(0, 0).expect("text should exist");
        assert_eq!(text, EXAMPLE_3_TEXT);
    }

    /// After loading everything, `get` can return an excerpt (characters 1 up to 10).
    #[test]
    pub fn test003_subpart_of_loaded_frame() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        let text = textfile.get(1, 10).expect("text should exist");
        assert_eq!(text, "Article 1");
    }

    /// `get_or_load` returns the same excerpt without a prior explicit `load`.
    #[test]
    pub fn test004_excerpt_in_frame() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(1, 10).expect("text should exist");
        assert_eq!(text, "Article 1");
    }

    /// A negative begin with end 0 is expected to yield the last characters of the text
    /// (here the final 7).
    #[test]
    pub fn test004_end_excerpt_in_frame() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(-7, 0).expect("text should exist");
        assert_eq!(text, "forms.\n");
    }

    /// Offsets are in characters, not bytes: characters 1 up to 4 are three 3-byte characters.
    #[test]
    pub fn test004_excerpt_in_frame_unicode() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(1, 4).expect("text should exist");
        assert_eq!(text, "第一条");
    }

    /// Negative character offsets on multi-byte text: the final 3 characters.
    #[test]
    pub fn test004_end_excerpt_in_frame_unicode() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load(-3, 0).expect("text should exist");
        assert_eq!(text, "止。\n");
    }

    /// An end offset beyond the text length (999 > 914) must yield an error.
    #[test]
    pub fn test005_out_of_bounds() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        assert!(textfile.get(1, 999).is_err());
    }

    /// The checksum digest of the ASCII fixture must equal a fixed reference value.
    #[test]
    pub fn test006_checksum() {
        let file = setup_ascii();
        // The reference digest below is a hard-coded hex SHA-256 value for the fixture; it can
        // be recomputed externally, e.g. by running `sha256sum` on the temporary file.
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert_eq!(
            textfile.checksum_digest(),
            "c6b079e561f19702d63111a3201d4850e9649b8a3ef1929d6530a780f3815215"
        );
    }

    /// All characters of the tiny fixture have the same UTF-8 width, so the position index
    /// is expected to hold exactly one entry.
    #[test]
    pub fn test007_positionindex_size() {
        let file = setup_3();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        assert_eq!(textfile.positionindex.positions.len(), 1);
    }

    /// Line-based retrieval: lines 1 up to 2 yield the second line including its newline.
    #[test]
    pub fn test008_line_ascii() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load_lines(1, 2).expect("text should exist"); //actual first line is empty in example, this is line 2
        assert_eq!(text, "Article 1\n");
    }

    /// The first line of the fixture is empty, so only its newline is returned.
    #[test]
    pub fn test008_empty_line_ascii() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile.get_or_load_lines(0, 1).expect("text should exist"); //actual first line is empty
        assert_eq!(text, "\n");
    }

    /// Lines -1 up to 0 address the last line, which is expected to be the empty string.
    #[test]
    pub fn test008_empty_last_line_ascii() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile
            .get_or_load_lines(-1, 0)
            .expect("text should exist"); //the fixture ends with a newline, so the actual last line is empty (and has no newline of its own)
        assert_eq!(text, "");
    }

    /// Lines -2 up to -1 address the last non-empty line, including its newline.
    #[test]
    pub fn test008_empty_last_line() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        let text = textfile
            .get_or_load_lines(-2, -1)
            .expect("text should exist");
        assert_eq!(text, "No one shall be held in slavery or servitude; slavery and the slave trade shall be prohibited in all their forms.\n");
    }

    /// `get_lines(0, 0)` after a full load is expected to return the complete text.
    #[test]
    pub fn test008_all_lines() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        let text = textfile.get_lines(0, 0).expect("text shoulde exist");
        assert_eq!(text, EXAMPLE_UNICODE_TEXT);
    }

    /// A line number beyond the number of lines must yield an error.
    #[test]
    pub fn test009_line_out_of_bounds() {
        let file = setup_ascii();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.load(0, 0).is_ok());
        assert!(textfile.get_lines(1, 999).is_err());
    }

    /// For ASCII text byte offsets and character offsets coincide, including at the end (914).
    #[test]
    pub fn test010_bytes_to_chars_ascii() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // ASCII: 1 byte = 1 char
        assert_eq!(textfile.bytes_to_chars(0).unwrap(), 0);
        assert_eq!(textfile.bytes_to_chars(10).unwrap(), 10);
        assert_eq!(textfile.bytes_to_chars(914).unwrap(), 914);
    }

    /// Byte offsets on character boundaries map to the right character offsets in
    /// multi-byte text.
    #[test]
    pub fn test010_bytes_to_chars_unicode() {
        let file = setup_unicode();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // First char is newline (1 byte)
        assert_eq!(textfile.bytes_to_chars(0).unwrap(), 0);
        assert_eq!(textfile.bytes_to_chars(1).unwrap(), 1);
        // Chinese chars are 3 bytes each
        // byte 1 = char 1 (第), byte 4 = char 2 (一), byte 7 = char 3 (条)
        assert_eq!(textfile.bytes_to_chars(4).unwrap(), 2);
        assert_eq!(textfile.bytes_to_chars(7).unwrap(), 3);
        // End of file
        assert_eq!(textfile.bytes_to_chars(771).unwrap(), 271);
    }

    /// Converting a character offset to bytes and back must return the original offset.
    #[test]
    pub fn test010_bytes_to_chars_roundtrip() {
        let file = setup_unicode();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // chars_to_bytes and bytes_to_chars should be inverses
        for charpos in [0, 1, 10, 50, 100, 200, 271] {
            let bytepos = textfile.chars_to_bytes(charpos).unwrap();
            let back = textfile.bytes_to_chars(bytepos).unwrap();
            assert_eq!(back, charpos, "roundtrip failed for charpos {}", charpos);
        }
    }

    /// A byte offset beyond the size of the text must yield an error.
    #[test]
    pub fn test010_bytes_to_chars_out_of_bounds() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.bytes_to_chars(9999).is_err());
    }

    /// Bytes 1 up to 4 cover exactly one 3-byte character.
    #[test]
    pub fn test010_get_byterange() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        textfile.load(0, 0).unwrap();
        let text = textfile.get_byterange(1, 4).expect("text should exist");
        assert_eq!(text, "第");
    }

    /// A byte range that ends inside a multi-byte character must be rejected with
    /// `Error::InvalidUtf8Byte`.
    #[test]
    pub fn test010_get_invalid_byterange() {
        let file = setup_unicode();
        let mut textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        textfile.load(0, 0).unwrap();
        assert!(matches!(
            textfile.get_byterange(1, 3), //this would slice inside 第 and is invalid
            Err(Error::InvalidUtf8Byte(..))
        ));
    }

    /// Line ranges are resolved to absolute character offsets.
    #[test]
    pub fn test011_absolute_line_pos() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // Line 0 starts at char 0
        let (begin, end) = textfile.absolute_line_pos(0, 1).unwrap();
        assert_eq!(begin, 0);
        // first line only contains a '\n'
        assert_eq!(end, 1);
        // Line 1 starts at char 1 (after the initial newline)
        let (begin, end) = textfile.absolute_line_pos(1, 2).unwrap();
        assert_eq!(begin, 1);
        assert_eq!(end, 11);
    }

    /// Negative line numbers are resolved relative to the end of the text.
    #[test]
    pub fn test011_absolute_line_pos_negative() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // -2, 0 should give the last two lines (the very last line is empty)
        // the expected begin lies 114 characters before the end of the text
        let (begin, end) = textfile.absolute_line_pos(-2, 0).unwrap();
        assert_eq!(begin, textfile.len() - 114);
        assert_eq!(end, textfile.len());
    }

    /// Lines 0 up to 0 resolve to the character range of the entire text.
    #[test]
    pub fn test011_absolute_line_pos_full() {
        let file = setup_unicode();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        // 0, 0 should span the entire file
        let (begin, end) = textfile.absolute_line_pos(0, 0).unwrap();
        assert_eq!(begin, 0);
        assert_eq!(end, textfile.len());
    }

    /// Without a line index (`TextFileMode::NoLineIndex`) line lookups must fail with
    /// `Error::NoLineIndex`.
    #[test]
    pub fn test011_absolute_line_pos_no_line_index() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, TextFileMode::NoLineIndex).expect("file must load");
        assert!(matches!(
            textfile.absolute_line_pos(0, 1),
            Err(Error::NoLineIndex)
        ));
    }

    /// Line numbers far outside the text, positive or negative, must yield an error.
    #[test]
    pub fn test011_absolute_line_pos_out_of_bounds() {
        let file = setup_ascii();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(textfile.absolute_line_pos(0, 9999).is_err());
        assert!(textfile.absolute_line_pos(-9999, 0).is_err());
    }

    /// An empty file can be opened, but offset conversions must fail with `Error::EmptyText`.
    #[test]
    pub fn test012_empty_file() {
        let file = setup_empty();
        let textfile =
            TextFile::new(file.path(), None, Default::default()).expect("file must load");
        assert!(matches!(textfile.bytes_to_chars(0), Err(Error::EmptyText)));
        assert!(matches!(textfile.chars_to_bytes(0), Err(Error::EmptyText)));
    }
}
textframe/lib.rs

textframe/
lib.rs