xi_core_lib/
word_boundaries.rs

1// Copyright 2017 The xi-editor Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Segmentation of word boundaries. Note: this current implementation
16//! is intended to work for code. Future work is to make it Unicode aware.
17
18use xi_rope::{Cursor, Rope, RopeInfo};
19
/// A cursor over a `Rope` that moves between, and selects, words.
///
/// Boundaries are decided by classifying the codepoints on either side of
/// a position with `get_word_property` and `classify_boundary`.
pub struct WordCursor<'a> {
    /// Underlying rope cursor; its position is read and updated by every
    /// method on `WordCursor`.
    inner: Cursor<'a, RopeInfo>,
}
23
24impl<'a> WordCursor<'a> {
25    pub fn new(text: &'a Rope, pos: usize) -> WordCursor<'a> {
26        let inner = Cursor::new(text, pos);
27        WordCursor { inner }
28    }
29
30    /// Get previous boundary, and set the cursor at the boundary found.
31    pub fn prev_boundary(&mut self) -> Option<usize> {
32        if let Some(ch) = self.inner.prev_codepoint() {
33            let mut prop = get_word_property(ch);
34            let mut candidate = self.inner.pos();
35            while let Some(prev) = self.inner.prev_codepoint() {
36                let prop_prev = get_word_property(prev);
37                if classify_boundary(prop_prev, prop).is_start() {
38                    break;
39                }
40                prop = prop_prev;
41                candidate = self.inner.pos();
42            }
43            self.inner.set(candidate);
44            return Some(candidate);
45        }
46        None
47    }
48
49    /// Get next boundary, and set the cursor at the boundary found.
50    pub fn next_boundary(&mut self) -> Option<usize> {
51        if let Some(ch) = self.inner.next_codepoint() {
52            let mut prop = get_word_property(ch);
53            let mut candidate = self.inner.pos();
54            while let Some(next) = self.inner.next_codepoint() {
55                let prop_next = get_word_property(next);
56                if classify_boundary(prop, prop_next).is_end() {
57                    break;
58                }
59                prop = prop_next;
60                candidate = self.inner.pos();
61            }
62            self.inner.set(candidate);
63            return Some(candidate);
64        }
65        None
66    }
67
68    /// Return the selection for the word containing the current cursor. The
69    /// cursor is moved to the end of that selection.
70    pub fn select_word(&mut self) -> (usize, usize) {
71        let initial = self.inner.pos();
72        let init_prop_after = self.inner.next_codepoint().map(get_word_property);
73        self.inner.set(initial);
74        let init_prop_before = self.inner.prev_codepoint().map(get_word_property);
75        let mut start = initial;
76        let init_boundary = if let (Some(pb), Some(pa)) = (init_prop_before, init_prop_after) {
77            classify_boundary_initial(pb, pa)
78        } else {
79            WordBoundary::Both
80        };
81        let mut prop_after = init_prop_after;
82        let mut prop_before = init_prop_before;
83        if prop_after.is_none() {
84            start = self.inner.pos();
85            prop_after = prop_before;
86            prop_before = self.inner.prev_codepoint().map(get_word_property);
87        }
88        while let (Some(pb), Some(pa)) = (prop_before, prop_after) {
89            if start == initial {
90                if init_boundary.is_start() {
91                    break;
92                }
93            } else if !init_boundary.is_boundary() {
94                if classify_boundary(pb, pa).is_boundary() {
95                    break;
96                }
97            } else if classify_boundary(pb, pa).is_start() {
98                break;
99            }
100            start = self.inner.pos();
101            prop_after = prop_before;
102            prop_before = self.inner.prev_codepoint().map(get_word_property);
103        }
104        self.inner.set(initial);
105        let mut end = initial;
106        prop_after = init_prop_after;
107        prop_before = init_prop_before;
108        if prop_before.is_none() {
109            prop_before = self.inner.next_codepoint().map(get_word_property);
110            end = self.inner.pos();
111            prop_after = self.inner.next_codepoint().map(get_word_property);
112        }
113        while let (Some(pb), Some(pa)) = (prop_before, prop_after) {
114            if end == initial {
115                if init_boundary.is_end() {
116                    break;
117                }
118            } else if !init_boundary.is_boundary() {
119                if classify_boundary(pb, pa).is_boundary() {
120                    break;
121                }
122            } else if classify_boundary(pb, pa).is_end() {
123                break;
124            }
125            end = self.inner.pos();
126            prop_before = prop_after;
127            prop_after = self.inner.next_codepoint().map(get_word_property);
128        }
129        self.inner.set(end);
130        (start, end)
131    }
132}
133
/// Classification of the position between two adjacent codepoints.
#[derive(PartialEq, Eq)]
enum WordBoundary {
    /// Not a boundary.
    Interior,
    /// A boundary indicating the start of a word (e.g. a space followed by
    /// a non-space codepoint).
    Start,
    /// A boundary indicating the end of a word (e.g. a non-space codepoint
    /// followed by a space).
    End,
    /// A boundary that counts as both a start and an end, such as a
    /// position adjacent to a line feed.
    Both,
}
141
142impl WordBoundary {
143    fn is_start(&self) -> bool {
144        *self == WordBoundary::Start || *self == WordBoundary::Both
145    }
146
147    fn is_end(&self) -> bool {
148        *self == WordBoundary::End || *self == WordBoundary::Both
149    }
150
151    fn is_boundary(&self) -> bool {
152        *self != WordBoundary::Interior
153    }
154}
155
156fn classify_boundary(prev: WordProperty, next: WordProperty) -> WordBoundary {
157    use self::WordBoundary::*;
158    use self::WordProperty::*;
159    match (prev, next) {
160        (Lf, _) => Both,
161        (_, Lf) => Both,
162        (Space, Other) => Start,
163        (Space, Punctuation) => Start,
164        (Punctuation, Other) => Start,
165        (Other, Space) => End,
166        (Punctuation, Space) => End,
167        (Other, Punctuation) => End,
168        _ => Interior,
169    }
170}
171
172fn classify_boundary_initial(prev: WordProperty, next: WordProperty) -> WordBoundary {
173    use self::WordBoundary::*;
174    use self::WordProperty::*;
175    match (prev, next) {
176        (Lf, Other) => Start,
177        (Other, Lf) => End,
178        (Lf, Space) => Interior,
179        (Lf, Punctuation) => Interior,
180        (Space, Lf) => Interior,
181        (Punctuation, Lf) => Interior,
182        (Space, Punctuation) => Interior,
183        (Punctuation, Space) => Interior,
184        _ => classify_boundary(prev, next),
185    }
186}
187
/// Coarse character class used to decide where words begin and end.
#[derive(Copy, Clone)]
enum WordProperty {
    /// The line feed character `'\n'`.
    Lf,
    /// Any other codepoint at or below `' '` (U+0020), i.e. the space
    /// character and the remaining ASCII control characters.
    Space,
    /// ASCII punctuation: ``!"#$%&'()*+,-./:;<=>?@[\]^`{|}~``.
    Punctuation,
    /// Everything else: includes letters and all of non-ascii unicode, as
    /// well as ASCII digits and `_`.
    Other,
}
195
196fn get_word_property(codepoint: char) -> WordProperty {
197    if codepoint <= ' ' {
198        // TODO: deal with \r
199        if codepoint == '\n' {
200            return WordProperty::Lf;
201        }
202        return WordProperty::Space;
203    } else if codepoint <= '\u{3f}' {
204        // Hardcoded: !"#$%&'()*+,-./:;<=>?
205        if (0xfc00fffe00000000u64 >> (codepoint as u32)) & 1 != 0 {
206            return WordProperty::Punctuation;
207        }
208    } else if codepoint <= '\u{7f}' {
209        // Hardcoded: @[\]^`{|}~
210        if (0x7800000178000001u64 >> ((codepoint as u32) & 0x3f)) & 1 != 0 {
211            return WordProperty::Punctuation;
212        }
213    }
214    WordProperty::Other
215}