justify/
lib.rs

1//! This crate justifies plaintext for display in a terminal emulator in a  (mostly)
2//! Unicode friendly way.
3//!
4//! **Examples of use can be found in the file `tests/tests.rs`.**
5//!
//! If the crate is compiled with the `unicode-width` feature (e.g. via `cargo build
//! --features unicode-width`) and `wcwidth` is set to `true` in `Settings`, Unicode
//! is handled gracefully: a CJK character such as 한 takes two spaces, while
//! combining characters take 0. Otherwise widths are measured in UTF-8 bytes, so
//! every non-ASCII character is counted as wider than it is displayed, which can
//! lead to poor output in some cases. If you will only ever justify ASCII text, you
//! don't need the feature; NFC-normalized text of Latin languages is only slightly
//! off without it.
12//!
//! The width information is provided by the `unicode-width` crate.
14//!
15//! Without `unicode-width` (example text from
16//! [here](https://en.wikipedia.org/wiki/Korea#Etymology)):
17//!
18//! ```text
19//! "Korea"  is the modern spelling of "Corea", a name attested in English as  early
20//! as  1614.[citation  needed] Korea was transliterated as Cauli in The Travels  of
21//! Marco  Polo,[10] based on the kingdom of Goryeo (Hangul: 고려; Hanja:  高麗;
22//! MR:  Koryŏ), which ruled most of the Korean peninsula during Marco Polo's time.
23//! Korea's  introduction to the West resulted from trade and contact with merchants
24//! from  Arabic  lands,[11]  with  some  records dating back  as  far  as  the  9th
25//! century.[12]  Goryeo's  name  was  a continuation  of  Goguryeo  (Koguryŏ)  the
26//! northernmost  of  the  Three Kingdoms of Korea, which was  officially  known  as
27//! Goryeo  beginning in the 5th century.[13] The original name was a combination of
28//! the  adjective  go ("high, lofty") with the name of a local Yemaek tribe,  whose
29//! original  name  is  thought to have been either *Guru  (溝樓,  "walled  city,"
30//! inferred   from  some  toponyms  in  Chinese  historical  documents)  or  *Gauri
31//! (가우리, "center").
32//! ```
33//!
34//! With `unicode-width` and `wcwidth: true` in `Settings` struct:
35//!
36//! ```text
37//! "Korea"  is the modern spelling of "Corea", a name attested in English as  early
38//! as  1614.[citation  needed] Korea was transliterated as Cauli in The Travels  of
39//! Marco  Polo,[10] based on the kingdom of Goryeo (Hangul: 고려; Hanja: 高麗;  MR:
40//! Koryŏ),  which  ruled  most of the Korean peninsula during  Marco  Polo's  time.
41//! Korea's  introduction to the West resulted from trade and contact with merchants
42//! from  Arabic  lands,[11]  with  some  records dating back  as  far  as  the  9th
43//! century.[12]  Goryeo's  name  was  a  continuation  of  Goguryeo  (Koguryŏ)  the
44//! northernmost  of  the  Three Kingdoms of Korea, which was  officially  known  as
45//! Goryeo  beginning in the 5th century.[13] The original name was a combination of
46//! the  adjective  go ("high, lofty") with the name of a local Yemaek tribe,  whose
47//! original  name  is  thought  to have been either  *Guru  (溝樓,  "walled  city,"
48//! inferred  from some toponyms in Chinese historical documents) or *Gauri (가우리,
49//! "center").
50//! ```
51//!
52//! Notice  that  the  justification is better with `unicode-width`, but  there  are
53//! still  lines where the justification is one off. That's because it's not  always
54//! possible  to  justify perfectly: as Korean characters take two terminal  spaces,
55//! and  Latin  letters  take one, it's possible for there to be an  odd  number  of
56//! characters  on  a line to be justified. Also, depending on your browser, it  may
57//! not look right, try pasting it into a terminal emulator.
58
59#[cfg(feature="unicode-width")] extern crate unicode_width;
60#[cfg(feature="unicode-width")] use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
61
/// Where to insert spaces (use with `Settings`).
#[derive(Clone, Copy)]
pub enum InsertAt<'a> {
    /// Spaces are added starting at the left.
    Left,
    /// Spaces are added starting at the right.
    Right,
    /// Default; e.g. if there are 5 places where spaces could be added, the
    /// first space goes in place 1, the second space in place 5, the third
    /// space in place 2, the fourth space in place 4, etc.
    Balanced,
    /// The function receives the current 0-indexed iteration in position 1, the
    /// total number of spaces to be added in position 2, the number of possible
    /// entry points in position 3, and the line being justified in position 4.
    /// It returns the 0-indexed entry point that receives the space; the value
    /// is used as an index, so it must be smaller than the number of entry
    /// points. This could be used, for example, to implement insertion of
    /// spaces at random points. If using this, you may not need every argument,
    /// but they are provided anyway for maximum extensibility.
    Custom(&'a dyn Fn(usize, usize, usize, &Vec<&str>) -> usize),
}

impl<'a> Default for InsertAt<'a> {
    /// Returns `InsertAt::Balanced`, the variant documented as the default.
    fn default() -> Self {
        InsertAt::Balanced
    }
}
80
81/// Settings used by `justify` and `justify_paragraph`
82pub struct Settings<'a> {
83    /// Whether the last line should also be justified. Can result in weird output if the last line
84    /// contains very few words.
85    pub justify_last_line: bool,
86    /// Hyphenate if a word is longer than `self.width`
87    pub hyphenate_overflow: bool,
88    /// Width (in codepoints)
89    pub width: usize,
90    /// In a given line, the pattern spaces should be inserted at.
91    pub insert_at: InsertAt<'a>,
92    #[cfg(feature="unicode-width")]
93    /// On unicode text, attempt to use wcwidth
94    pub wcwidth: bool,
95    /// This feature is sometimes useful with CJK text in conjunction with hyphenate_overflow. When
96    /// on, spaces are not considered when justifying text.
97    pub ignore_spaces: bool,
98    /// The string that should be used to separate lines. Perhaps useful on Windows where you might
99    /// want "\r\n" instead.
100    pub newline: &'a str,
101    /// The hyphen that should be used if `hyphenate_overflow` is true
102    pub hyphen: &'a str,
103    /// The separator between paragraphs when `justify` is called
104    pub separator: &'a str
105}
106
107impl<'a> Default for Settings<'a> {
108    fn default() -> Self {
109        Settings {
110            justify_last_line: false,
111            width: 80,
112            hyphenate_overflow: false,
113            insert_at: InsertAt::Balanced,
114            #[cfg(feature="unicode-width")]
115            wcwidth: false,
116            ignore_spaces: false,
117            newline: "\n",
118            hyphen: "-",
119            separator: "\n\n"
120        }
121    }
122}
123
124/// Generate where we should break and put it into v, like
125/// vec![0, 12, 26, 40, 52, 65]
126fn get_break_indexes(words: &Vec<&str>, settings: &Settings) -> Vec<usize> {
127    let mut n = 0;
128    let mut v = Vec::with_capacity(words.len()/4);
129    v.push(0);
130
131    for (i, word) in words.iter().enumerate() {
132        let mut c;
133        #[cfg(feature="unicode-width")] {
134        if settings.wcwidth {
135            c = n + word.width();
136        } else {
137            c = n + word.len();
138        }
139        }
140        #[cfg(not(feature="unicode-width"))] {
141            c = n + word.len();
142        }
143        if word.len() == 0 { continue }
144        // If the last character in the word is whitespace, we have to ignore it in the
145        // comparison, otherwise lines which are exactly the right width will be broken
146        // as if they were one character too long.
147        let cc = word.chars().nth(word.len()-1);
148        if c - if cc.map_or(false, char::is_whitespace) { 1 } else { 0 } > settings.width {
149            v.push(i);
150            n = word.len();
151        } else {
152            n = c;
153        }
154    }
155
156    v
157}
158
/// Groups `words` into lines according to `breaks`.
///
/// `breaks` holds the index of the first word of every line (as produced by
/// `get_break_indexes`). Every line except the last loses the whitespace
/// character that ends its final word; the last line is copied unchanged.
/// Empty ranges (two equal consecutive break indexes) produce no line. An
/// empty `breaks` yields no lines at all.
///
/// Panics if a break index is out of range for `words` or if the indexes are
/// not in ascending order.
fn lines_from_indexes<'a>(words: &Vec<&'a str>, breaks: &Vec<usize>) -> Vec<Vec<&'a str>> {
    let mut lines: Vec<Vec<&'a str>> = Vec::with_capacity(breaks.len());

    for pair in breaks.windows(2) {
        let mut line = words[pair[0]..pair[1]].to_vec();
        if line.is_empty() { continue }

        // Chop the trailing whitespace off of the last word in the line. The
        // cut is made on a character boundary so that multi-byte whitespace
        // (e.g. U+3000) does not cause a slicing panic.
        let last_idx = line.len() - 1;
        let last = line[last_idx];
        if let Some(ch) = last.chars().next_back() {
            if ch.is_whitespace() {
                line[last_idx] = &last[..last.len() - ch.len_utf8()];
            }
        }
        lines.push(line);
    }

    // Handle last line
    if let Some(&start) = breaks.last() {
        lines.push(words[start..].to_vec());
    }

    lines
}
177
178/// Determines how many spaces need to be added to the line to get it to width.
179fn spaces_to_add(lines: &Vec<Vec<&str>>, settings: &Settings) -> Vec<usize> {
180    let mut spaces: Vec<usize> = Vec::with_capacity(lines.len());
181
182    for line in lines.iter() {
183        let mut size = line.iter().fold(0, |acc, &x| acc + x.len());
184        #[cfg(feature="unicode-width")]
185        match settings.wcwidth {
186            true => {size = line.iter().fold(0, |acc, &x| acc + x.width())},
187            false => {}
188        }
189
190        if settings.width < size {
191            spaces.push(0);
192        } else {
193            spaces.push(settings.width - size);
194        }
195    }
196
197    spaces
198}
199
200/// Adds the spaces. Should be used with `spaces_to_add`
201fn add_spaces(add: usize, line: &Vec<&str>, insert_at: &InsertAt) -> String {
202    if line.len() == 0 { return String::new() }
203    let v_i = line.len()-1;
204    let mut add_v = vec![0; v_i];
205
206    if v_i == 0 {
207        return line[0].to_owned()
208    }
209
210    match *insert_at {
211        InsertAt::Left => {
212            for j in (1..v_i+1).into_iter().cycle().take(add) {
213                add_v[j-1] += 1;
214            }
215        },
216        InsertAt::Right => {
217            for j in (1..v_i+1).rev().into_iter().cycle().take(add) {
218                add_v[j-1] += 1;
219            }
220        },
221        InsertAt::Balanced => {
222            for j in (1..v_i+1).into_iter().cycle().take(add) {
223                if j % 2 == 0 { //EVEN
224                    add_v[v_i - (j/2)] += 1;
225                } else { //ODD
226                    add_v[(j/2)] += 1;
227                }
228            }
229        },
230        InsertAt::Custom(f) => {
231            for j in 0..add {
232                add_v[f(j, add, v_i, line)] += 1;
233            }
234        }
235    }
236
237    let space_s: Vec<String> = add_v.iter()
238        .map(|i|" ".repeat(*i))
239        .collect();
240
241    // Length of spaces
242    let space_l: usize = add_v.iter().sum();
243    // Length of text in line
244    let line_l: usize = line.iter().map(|e|e.len()).sum();
245
246    line.iter()
247        .enumerate()
248        .fold(
249            String::with_capacity(space_l + line_l),
250            |acc, (i, x)| {
251                if i < line.len()-1 {
252                    acc + x + &space_s[i]
253                } else {
254                    acc + x
255                }
256            }
257        )
258}
259
/// Splits `text` after every whitespace character, keeping the whitespace
/// attached to the word that precedes it.
///
/// This function is needed because there is no better way(?) to split a string
/// such that the whitespace stays with the words. That is to say:
/// `"e e".split(char::is_whitespace)` yields `"e"` and `"e"`, while we want
/// `vec!["e ", "e"]`.
///
/// Chunks that consist of whitespace only (i.e. the second and later
/// characters of a whitespace run) are dropped. The remainder after the last
/// whitespace character is always pushed as the final element, even when it is
/// empty, so the result is never empty.
fn split_into_words(text: &str) -> Vec<&str> {
    let mut words = Vec::new();
    let mut start = 0;

    for (idx, ws) in text.match_indices(char::is_whitespace) {
        // Advance by the byte length of the matched character: whitespace such
        // as U+00A0 or U+3000 is longer than one byte, and cutting one byte
        // after its start would slice in the middle of the character.
        let end = idx + ws.len();
        let chunk = &text[start..end];
        if !chunk.chars().all(char::is_whitespace) {
            words.push(chunk);
        }
        start = end;
    }

    words.push(&text[start..]);

    words
}
286
/// Breaks up every word that is too long for a line (`unicode-width` build).
///
/// A word is broken when its byte length exceeds `settings.width`. It is cut
/// into pieces whose terminal width (per `UnicodeWidthChar::width`, unknown
/// widths counting as 0) stays within `settings.width` minus the byte length
/// of `settings.hyphen`; every piece but the last gets the hyphen appended.
///
/// * With `settings.ignore_spaces`, each `settings.newline`-separated line is
///   treated as one word and the pieces are joined with `settings.newline`.
/// * Otherwise words are whitespace-separated, runs of whitespace collapse to
///   one space, pieces are joined with a space, and the `settings.newline`
///   boundaries of the input are kept so that paragraphs survive.
///
/// If the hyphen leaves no room for any text (`width <= hyphen.len()`), words
/// are passed through unbroken.
#[cfg(feature="unicode-width")]
fn hyphenate_overflow(text: &str, settings: &Settings) -> String {
    // Appends `word` to `out`, hyphenated into `joiner`-separated pieces when
    // it does not fit.
    fn push_word(out: &mut String, word: &str, joiner: &str, settings: &Settings) {
        let room = settings.width.saturating_sub(settings.hyphen.len());
        if word.len() <= settings.width || room == 0 {
            out.push_str(word);
            return;
        }

        let mut used = 0;
        for (idx, ch) in word.chars().enumerate() {
            let w = ch.width().unwrap_or(0);
            // Never break before the very first character: that would emit a
            // piece consisting of nothing but the hyphen.
            if idx > 0 && used + w > room {
                out.push_str(settings.hyphen);
                out.push_str(joiner);
                used = 0;
            }
            out.push(ch);
            used += w;
        }
    }

    let mut ret = String::with_capacity(text.len());

    if settings.ignore_spaces {
        for (i, line) in text.split(settings.newline).enumerate() {
            if i != 0 {
                ret.push_str(settings.newline);
            }
            push_word(&mut ret, line, settings.newline, settings);
        }
        return ret;
    }

    // Splitting on an empty pattern would cut between every character, so an
    // empty newline setting means "the whole text is one paragraph".
    let paragraphs: Vec<&str> = if settings.newline.is_empty() {
        vec![text]
    } else {
        text.split(settings.newline).collect()
    };

    for (i, paragraph) in paragraphs.iter().enumerate() {
        if i != 0 {
            ret.push_str(settings.newline);
        }
        for (j, word) in paragraph.split_whitespace().enumerate() {
            if j != 0 {
                ret.push(' ');
            }
            push_word(&mut ret, word, " ", settings);
        }
    }

    ret
}
354
355#[cfg(not(feature="unicode-width"))]
356fn hyphenate_overflow(text: &str, settings: &Settings) -> String {
357    let mut ret = String::with_capacity(text.len());
358    let sws: Vec<_>;
359    let joiner: &str;
360    if settings.ignore_spaces {
361        sws = text.split(settings.newline).collect();
362        joiner = settings.newline;
363    } else {
364        sws = text.split_whitespace().collect();
365        joiner = " ";
366    }
367    let tl = sws.len();
368
369    for (i, s) in sws.iter().enumerate() {
370        if s.len() > settings.width {
371            let h = s.chars().collect::<Vec<_>>();
372
373            let mut f: Vec<String> = Vec::new();
374            let mut p = h.chunks(settings.width-(settings.hyphen.len())).peekable();
375
376            loop {
377                let s: String = p.next().unwrap().iter().collect();
378                if p.peek().is_some() {
379                    f.push(s + settings.hyphen);
380                } else {
381                    f.push(s);
382                    break
383                }
384            }
385
386            ret += &f.join(joiner);
387        } else {
388            ret += s;
389        }
390        if i != tl-1 {
391            ret += joiner;
392        }
393    }
394
395    ret
396}
397
398/// Justify a single paragraph. Panics if "paragraph" contains newlines.
399pub fn justify_paragraph(text: &str, settings: &Settings) -> String {
400    if text.contains("\n") {
401        panic!("Expected `text` to contain no newlines but it did")
402    }
403
404    let mut ret = String::with_capacity(text.len() + (text.len() / 3));
405
406    let words = split_into_words(text);
407    //eprintln!("W:{:?}",words);
408    let breaks = get_break_indexes(&words, &settings);
409    //eprintln!("B:{:?}",breaks);
410    let lines = lines_from_indexes(&words, &breaks);
411    //eprintln!("L:{:?}",lines);
412    let spaces = spaces_to_add(&lines, &settings);
413    //eprintln!("S:{:?}",spaces);
414
415    for (i, space) in spaces.iter().enumerate() {
416        if !settings.justify_last_line && i == spaces.len() - 1 {
417            ret += &lines[spaces.len()-1].join("");
418            break
419        }
420        if !settings.ignore_spaces {
421            let add = &add_spaces(*space, &lines[i], &settings.insert_at);
422            ret += add;
423        } else {
424            ret += &lines[i].join(" ");
425        }
426        ret += settings.newline;
427    }
428
429    ret
430}
431
432/// Justify `text` according to the parameters in `settings`.
433pub fn justify(text: &str, settings: &Settings) -> String {
434    let mut h = String::new();
435    if settings.hyphenate_overflow {
436        h = hyphenate_overflow(text, &settings);
437    }
438
439    if settings.ignore_spaces {
440        return h;
441    }
442
443    if settings.hyphenate_overflow { h.as_str() } else { text }
444        .split(settings.newline)
445        .filter(
446            |e|e.len()!=0
447            )
448        .map(
449            |p| justify_paragraph(p, settings)
450            )
451        .collect::<Vec<_>>()
452        .join(settings.separator)
453}