finalfrontier_utils/
data.rs

1use std::fs::File;
2
3use failure::{Error, ResultExt};
4use memmap::{Mmap, MmapOptions};
5
6/// Get thread-specific data.
7///
8/// This function will return a memory map of the corpus data. The initial
9/// starting position for the given thread is also returned. This starting
10/// Position will always be the beginning of a sentence.
11pub fn thread_data_text(f: &File, thread: usize, n_threads: usize) -> Result<(Mmap, usize), Error> {
12    assert!(
13        thread < n_threads,
14        "Thread {} out of index [0, {})",
15        thread,
16        n_threads
17    );
18
19    let size = f.metadata().context("Cannot get file metadata")?.len();
20    let chunk_size = size as usize / n_threads;
21
22    let mmap = unsafe { MmapOptions::new().map(&f)? };
23
24    if thread == 0 {
25        return Ok((mmap, 0));
26    }
27
28    let mut start = thread * chunk_size;
29    while start < mmap.len() {
30        let next = mmap[start];
31        start += 1;
32        if next == b'\n' {
33            break;
34        }
35    }
36
37    Ok((mmap, start))
38}
39
40/// Get thread-specific data for a CONLLX-Corpus.
41///
42/// This function will return a memory map of the corpus data. The initial
43/// starting position for the given thread is also returned. This starting
44/// Position will always be the beginning of a sentence.
45pub fn thread_data_conllx(
46    f: &File,
47    thread: usize,
48    n_threads: usize,
49) -> Result<(Mmap, usize), Error> {
50    assert!(
51        thread < n_threads,
52        "Thread {} out of index [0, {})",
53        thread,
54        n_threads
55    );
56
57    let size = f.metadata().context("Cannot get file metadata")?.len();
58    let chunk_size = size as usize / n_threads;
59
60    let mmap = unsafe { MmapOptions::new().map(&f)? };
61
62    if thread == 0 {
63        return Ok((mmap, 0));
64    }
65
66    let mut start = thread * chunk_size;
67    while start < mmap.len() - 1 {
68        let next = mmap[start];
69        start += 1;
70        if next == b'\n' && mmap[start] == b'\n' {
71            start += 1;
72            break;
73        }
74    }
75
76    Ok((mmap, start))
77}
78
#[cfg(test)]
mod tests {
    use std::fs::File;

    use super::{thread_data_conllx, thread_data_text};

    /// Expected contents of `testdata/chunking.txt`: one sentence per line.
    static CHUNKING_TEST_DATA: &str =
        "a b c\nd e f\ng h i\nj k l\nm n o\np q r\ns t u\nv w x\ny z\n";

    /// Expected contents of `testdata/dep_chunking.txt`: sentences are
    /// separated by blank lines.
    static CHUNKING_TEST_DATA_DEPS: &str =
        "a b c\nd e f\n\ng h i\nj k l\n\nm n o\np q r\n\ns t u\nv w x\ny z\n";

    #[test]
    fn thread_data_test() {
        // The data is 52 bytes, so with 3 threads each chunk is 17 bytes.
        // Thread 1 scans from byte 17, which is a newline, and therefore
        // starts at 18. Thread 2 scans from byte 34, finds the newline at 35
        // and starts at 36.
        let f = File::open("testdata/chunking.txt").unwrap();

        for &(thread, expected_start) in &[(0, 0), (1, 18), (2, 36)] {
            let (mmap, start) = thread_data_text(&f, thread, 3).unwrap();
            assert_eq!(
                &*mmap,
                CHUNKING_TEST_DATA.as_bytes(),
                "Memory mapping is incorrect"
            );
            assert_eq!(start, expected_start, "Incorrect start index");
        }
    }

    #[test]
    fn deps_thread_data_test() {
        // The data is 55 bytes, so with 3 threads each chunk is 18 bytes.
        // Thread 1 scans from byte 18; the next blank line ends at byte 25,
        // so its first sentence starts at 26. Thread 2 scans from byte 36;
        // the next blank line ends at byte 38, so it starts at 39.
        let f = File::open("testdata/dep_chunking.txt").unwrap();

        for &(thread, expected_start) in &[(0, 0), (1, 26), (2, 39)] {
            let (mmap, start) = thread_data_conllx(&f, thread, 3).unwrap();
            assert_eq!(
                &*mmap,
                CHUNKING_TEST_DATA_DEPS.as_bytes(),
                "Memory mapping is incorrect"
            );
            assert_eq!(start, expected_start, "Incorrect start index");
        }
    }

    // `expected` ties the panic to the thread index assertion. A bare
    // `should_panic` would also be satisfied by a missing fixture file.
    #[should_panic(expected = "Thread 3 out of index [0, 3)")]
    #[test]
    fn thread_data_out_of_bounds_test() {
        let f = File::open("testdata/chunking.txt").unwrap();
        let _ = thread_data_conllx(&f, 3, 3).unwrap();
    }

    #[should_panic(expected = "Thread 3 out of index [0, 3)")]
    #[test]
    fn thread_data_text_out_of_bounds_test() {
        let f = File::open("testdata/chunking.txt").unwrap();
        let _ = thread_data_text(&f, 3, 3).unwrap();
    }
}