justify/lib.rs
1//! This crate justifies plaintext for display in a terminal emulator in a (mostly)
2//! Unicode friendly way.
3//!
4//! **Examples of use can be found in the file `tests/tests.rs`.**
5//!
6//! If the crate is compiled with the `unicode-width` feature (e.g. via `cargo build
7//! --features unicode-width`), Unicode is handled gracefully. With this feature, a
8//! CJK character such as 한 takes two spaces, while combining characters take 0.
9//! Without this feature, every Unicode character takes one space, which can lead to
10//! poor output in some cases. If you will only ever justify ASCII text, or
11//! NFC-normalized Unicode text of Latin languages, you don't need the feature.
12//!
//! The width information is provided by the `unicode-width` crate.
14//!
15//! Without `unicode-width` (example text from
16//! [here](https://en.wikipedia.org/wiki/Korea#Etymology)):
17//!
18//! ```text
19//! "Korea" is the modern spelling of "Corea", a name attested in English as early
20//! as 1614.[citation needed] Korea was transliterated as Cauli in The Travels of
21//! Marco Polo,[10] based on the kingdom of Goryeo (Hangul: 고려; Hanja: 高麗;
22//! MR: Koryŏ), which ruled most of the Korean peninsula during Marco Polo's time.
23//! Korea's introduction to the West resulted from trade and contact with merchants
24//! from Arabic lands,[11] with some records dating back as far as the 9th
25//! century.[12] Goryeo's name was a continuation of Goguryeo (Koguryŏ) the
26//! northernmost of the Three Kingdoms of Korea, which was officially known as
27//! Goryeo beginning in the 5th century.[13] The original name was a combination of
28//! the adjective go ("high, lofty") with the name of a local Yemaek tribe, whose
29//! original name is thought to have been either *Guru (溝樓, "walled city,"
30//! inferred from some toponyms in Chinese historical documents) or *Gauri
31//! (가우리, "center").
32//! ```
33//!
34//! With `unicode-width` and `wcwidth: true` in `Settings` struct:
35//!
36//! ```text
37//! "Korea" is the modern spelling of "Corea", a name attested in English as early
38//! as 1614.[citation needed] Korea was transliterated as Cauli in The Travels of
39//! Marco Polo,[10] based on the kingdom of Goryeo (Hangul: 고려; Hanja: 高麗; MR:
40//! Koryŏ), which ruled most of the Korean peninsula during Marco Polo's time.
41//! Korea's introduction to the West resulted from trade and contact with merchants
42//! from Arabic lands,[11] with some records dating back as far as the 9th
43//! century.[12] Goryeo's name was a continuation of Goguryeo (Koguryŏ) the
44//! northernmost of the Three Kingdoms of Korea, which was officially known as
45//! Goryeo beginning in the 5th century.[13] The original name was a combination of
46//! the adjective go ("high, lofty") with the name of a local Yemaek tribe, whose
47//! original name is thought to have been either *Guru (溝樓, "walled city,"
48//! inferred from some toponyms in Chinese historical documents) or *Gauri (가우리,
49//! "center").
50//! ```
51//!
52//! Notice that the justification is better with `unicode-width`, but there are
53//! still lines where the justification is one off. That's because it's not always
54//! possible to justify perfectly: as Korean characters take two terminal spaces,
55//! and Latin letters take one, it's possible for there to be an odd number of
//! characters on a line to be justified. Also, depending on your browser, it may
//! not look right; try pasting it into a terminal emulator.
58
59#[cfg(feature="unicode-width")] extern crate unicode_width;
60#[cfg(feature="unicode-width")] use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
61
/// Where to insert the padding spaces of a justified line (use with `Settings`).
///
/// A line of `n` words has `n - 1` gaps; the variants decide in which order
/// those gaps receive one extra space each until the padding is used up.
pub enum InsertAt<'a> {
    /// Spaces are added starting at the left: first gap, second gap, ...,
    /// wrapping around to the first gap again if padding remains.
    Left,
    /// Spaces are added starting at the right: last gap, second-to-last gap,
    /// ..., wrapping around to the last gap again if padding remains.
    Right,
    /// Default; e.g. if there are 5 places spaces could be added, the first
    /// space goes in place 1, the second space in place 5, the third space in
    /// place 2, fourth space in place 4, etc.
    Balanced,
    /// The function receives the current 0-indexed iteration in position 1, the
    /// total number of spaces to be added in position 2, the number of possible
    /// entry points in position 3, and the line being justified in position 4.
    /// This could be used, for example, to implement insertion of spaces at
    /// random points. If using this, you may not need every argument, but they
    /// are provided anyway for maximum extensibility.
    ///
    /// The returned value is used as a 0-based gap index, so it must be smaller
    /// than the number of entry points (position 3); a larger value makes the
    /// indexing in `add_spaces` panic.
    Custom(&'a dyn Fn(usize, usize, usize, &Vec<&str>)->usize)
}
80
/// Settings used by `justify` and `justify_paragraph`
pub struct Settings<'a> {
    /// Whether the last line should also be justified. Can result in weird output if the last line
    /// contains very few words. When off, `justify_paragraph` emits the last line unpadded and
    /// without a trailing `newline`.
    pub justify_last_line: bool,
    /// Hyphenate if a word is longer than `self.width`: `justify` first cuts such words into
    /// pieces and appends `hyphen` to every piece but the last.
    pub hyphenate_overflow: bool,
    /// Target line width. Measured with `str::len` (UTF-8 bytes) by default, or in terminal
    /// columns when the `unicode-width` feature is enabled and `wcwidth` is true.
    pub width: usize,
    /// In a given line, the pattern spaces should be inserted at.
    pub insert_at: InsertAt<'a>,
    #[cfg(feature="unicode-width")]
    /// On unicode text, attempt to use wcwidth (terminal column widths) instead of byte
    /// lengths when measuring words. Only present with the `unicode-width` feature.
    pub wcwidth: bool,
    /// This feature is sometimes useful with CJK text in conjunction with hyphenate_overflow. When
    /// on, spaces are not considered when justifying text: `hyphenate_overflow` splits the text on
    /// `newline` instead of on whitespace, and `justify_paragraph` joins the words of a line with
    /// `" "` instead of padding the gaps.
    pub ignore_spaces: bool,
    /// The string that should be used to separate lines. Perhaps useful on Windows where you might
    /// want "\r\n" instead. `justify` also splits its input into paragraphs on this string.
    pub newline: &'a str,
    /// The hyphen that should be used if `hyphenate_overflow` is true
    pub hyphen: &'a str,
    /// The separator between paragraphs when `justify` is called
    pub separator: &'a str
}
106
107impl<'a> Default for Settings<'a> {
108 fn default() -> Self {
109 Settings {
110 justify_last_line: false,
111 width: 80,
112 hyphenate_overflow: false,
113 insert_at: InsertAt::Balanced,
114 #[cfg(feature="unicode-width")]
115 wcwidth: false,
116 ignore_spaces: false,
117 newline: "\n",
118 hyphen: "-",
119 separator: "\n\n"
120 }
121 }
122}
123
124/// Generate where we should break and put it into v, like
125/// vec![0, 12, 26, 40, 52, 65]
126fn get_break_indexes(words: &Vec<&str>, settings: &Settings) -> Vec<usize> {
127 let mut n = 0;
128 let mut v = Vec::with_capacity(words.len()/4);
129 v.push(0);
130
131 for (i, word) in words.iter().enumerate() {
132 let mut c;
133 #[cfg(feature="unicode-width")] {
134 if settings.wcwidth {
135 c = n + word.width();
136 } else {
137 c = n + word.len();
138 }
139 }
140 #[cfg(not(feature="unicode-width"))] {
141 c = n + word.len();
142 }
143 if word.len() == 0 { continue }
144 // If the last character in the word is whitespace, we have to ignore it in the
145 // comparison, otherwise lines which are exactly the right width will be broken
146 // as if they were one character too long.
147 let cc = word.chars().nth(word.len()-1);
148 if c - if cc.map_or(false, char::is_whitespace) { 1 } else { 0 } > settings.width {
149 v.push(i);
150 n = word.len();
151 } else {
152 n = c;
153 }
154 }
155
156 v
157}
158
/// Groups `words` into lines according to `breaks` (see `get_break_indexes`).
///
/// Each pair of consecutive break indexes delimits one line; empty ranges
/// (repeated indexes) are skipped. The trailing whitespace character of the
/// last word of every such line is removed so the line ends on visible text.
/// Everything from the final break index onwards becomes the last line, which
/// is left untouched. An empty `breaks` is treated like `[0]`.
fn lines_from_indexes<'a>(words: &[&'a str], breaks: &[usize]) -> Vec<Vec<&'a str>> {
    let mut lines: Vec<Vec<&'a str>> = Vec::with_capacity(breaks.len().max(1));

    for pair in breaks.windows(2) {
        let mut line = words[pair[0]..pair[1]].to_vec();

        match line.last_mut() {
            None => continue,
            Some(last) => {
                let word: &'a str = *last;
                // Strip exactly one trailing whitespace character, whatever its
                // UTF-8 length; a word without one is kept intact.
                if let Some(ch) = word.chars().next_back() {
                    if ch.is_whitespace() {
                        *last = &word[..word.len() - ch.len_utf8()];
                    }
                }
            }
        }

        lines.push(line);
    }

    // Handle last line
    let tail_start = breaks.last().map_or(0, |&b| b);
    lines.push(words[tail_start..].to_vec());

    lines
}
177
178/// Determines how many spaces need to be added to the line to get it to width.
179fn spaces_to_add(lines: &Vec<Vec<&str>>, settings: &Settings) -> Vec<usize> {
180 let mut spaces: Vec<usize> = Vec::with_capacity(lines.len());
181
182 for line in lines.iter() {
183 let mut size = line.iter().fold(0, |acc, &x| acc + x.len());
184 #[cfg(feature="unicode-width")]
185 match settings.wcwidth {
186 true => {size = line.iter().fold(0, |acc, &x| acc + x.width())},
187 false => {}
188 }
189
190 if settings.width < size {
191 spaces.push(0);
192 } else {
193 spaces.push(settings.width - size);
194 }
195 }
196
197 spaces
198}
199
200/// Adds the spaces. Should be used with `spaces_to_add`
201fn add_spaces(add: usize, line: &Vec<&str>, insert_at: &InsertAt) -> String {
202 if line.len() == 0 { return String::new() }
203 let v_i = line.len()-1;
204 let mut add_v = vec![0; v_i];
205
206 if v_i == 0 {
207 return line[0].to_owned()
208 }
209
210 match *insert_at {
211 InsertAt::Left => {
212 for j in (1..v_i+1).into_iter().cycle().take(add) {
213 add_v[j-1] += 1;
214 }
215 },
216 InsertAt::Right => {
217 for j in (1..v_i+1).rev().into_iter().cycle().take(add) {
218 add_v[j-1] += 1;
219 }
220 },
221 InsertAt::Balanced => {
222 for j in (1..v_i+1).into_iter().cycle().take(add) {
223 if j % 2 == 0 { //EVEN
224 add_v[v_i - (j/2)] += 1;
225 } else { //ODD
226 add_v[(j/2)] += 1;
227 }
228 }
229 },
230 InsertAt::Custom(f) => {
231 for j in 0..add {
232 add_v[f(j, add, v_i, line)] += 1;
233 }
234 }
235 }
236
237 let space_s: Vec<String> = add_v.iter()
238 .map(|i|" ".repeat(*i))
239 .collect();
240
241 // Length of spaces
242 let space_l: usize = add_v.iter().sum();
243 // Length of text in line
244 let line_l: usize = line.iter().map(|e|e.len()).sum();
245
246 line.iter()
247 .enumerate()
248 .fold(
249 String::with_capacity(space_l + line_l),
250 |acc, (i, x)| {
251 if i < line.len()-1 {
252 acc + x + &space_s[i]
253 } else {
254 acc + x
255 }
256 }
257 )
258}
259
/// Splits `text` into words that keep their trailing whitespace character.
///
/// This function is needed because `"e e".split(char::is_whitespace)` returns
/// `["e", "e"]` while we want `["e ", "e"]`. Every returned word but the last
/// consists of non-whitespace text followed by exactly one whitespace
/// character; pieces made of whitespace only (runs of several whitespace
/// characters) are dropped. The last element is whatever follows the final
/// whitespace character and may be empty.
fn split_into_words(text: &str) -> Vec<&str> {
    let mut words = Vec::new();
    let mut start = 0;

    for (idx, ws) in text.match_indices(char::is_whitespace) {
        // Advance by the real UTF-8 length of the whitespace character so the
        // slice boundaries stay valid for NBSP, ideographic space, etc.
        let end = idx + ws.len();
        let piece = &text[start..end];
        if !piece.chars().all(char::is_whitespace) {
            words.push(piece);
        }
        start = end;
    }

    words.push(&text[start..]);

    words
}
286
/// Cuts over-long words into hyphenated pieces (`unicode-width` build).
///
/// The text is split on whitespace and re-joined with `" "`, or, when
/// `settings.ignore_spaces` is set, split on and re-joined with
/// `settings.newline`. A piece whose byte length exceeds `settings.width` is
/// cut so that the column width of each part does not exceed
/// `settings.width` minus the byte length of `settings.hyphen`; every part but
/// the last gets `settings.hyphen` appended and the parts are joined with the
/// same joiner. A part always holds at least one character, even if that
/// character alone is wider than the limit.
#[cfg(feature="unicode-width")]
fn hyphenate_overflow(text: &str, settings: &Settings) -> String {
    let (pieces, joiner): (Vec<&str>, &str) = if settings.ignore_spaces {
        (text.split(settings.newline).collect(), settings.newline)
    } else {
        (text.split_whitespace().collect(), " ")
    };

    // Columns available for text once the hyphen is accounted for; saturate so
    // a width smaller than the hyphen cannot underflow.
    let limit = settings.width.saturating_sub(settings.hyphen.len());

    let mut ret = String::with_capacity(text.len());

    for (i, piece) in pieces.iter().enumerate() {
        if i != 0 {
            ret.push_str(joiner);
        }

        if piece.len() <= settings.width {
            ret.push_str(piece);
            continue;
        }

        // Columns used by the part currently being written.
        let mut used = 0;
        let mut part_is_empty = true;

        for ch in piece.chars() {
            let w = ch.width().unwrap_or(0);

            // Start a new part when this character would overflow the limit,
            // but never leave an empty part behind (that would emit a lone
            // hyphen).
            if used + w > limit && !part_is_empty {
                ret.push_str(settings.hyphen);
                ret.push_str(joiner);
                used = 0;
                part_is_empty = true;
            }

            ret.push(ch);
            used += w;
            part_is_empty = false;
        }
    }

    ret
}
354
355#[cfg(not(feature="unicode-width"))]
356fn hyphenate_overflow(text: &str, settings: &Settings) -> String {
357 let mut ret = String::with_capacity(text.len());
358 let sws: Vec<_>;
359 let joiner: &str;
360 if settings.ignore_spaces {
361 sws = text.split(settings.newline).collect();
362 joiner = settings.newline;
363 } else {
364 sws = text.split_whitespace().collect();
365 joiner = " ";
366 }
367 let tl = sws.len();
368
369 for (i, s) in sws.iter().enumerate() {
370 if s.len() > settings.width {
371 let h = s.chars().collect::<Vec<_>>();
372
373 let mut f: Vec<String> = Vec::new();
374 let mut p = h.chunks(settings.width-(settings.hyphen.len())).peekable();
375
376 loop {
377 let s: String = p.next().unwrap().iter().collect();
378 if p.peek().is_some() {
379 f.push(s + settings.hyphen);
380 } else {
381 f.push(s);
382 break
383 }
384 }
385
386 ret += &f.join(joiner);
387 } else {
388 ret += s;
389 }
390 if i != tl-1 {
391 ret += joiner;
392 }
393 }
394
395 ret
396}
397
398/// Justify a single paragraph. Panics if "paragraph" contains newlines.
399pub fn justify_paragraph(text: &str, settings: &Settings) -> String {
400 if text.contains("\n") {
401 panic!("Expected `text` to contain no newlines but it did")
402 }
403
404 let mut ret = String::with_capacity(text.len() + (text.len() / 3));
405
406 let words = split_into_words(text);
407 //eprintln!("W:{:?}",words);
408 let breaks = get_break_indexes(&words, &settings);
409 //eprintln!("B:{:?}",breaks);
410 let lines = lines_from_indexes(&words, &breaks);
411 //eprintln!("L:{:?}",lines);
412 let spaces = spaces_to_add(&lines, &settings);
413 //eprintln!("S:{:?}",spaces);
414
415 for (i, space) in spaces.iter().enumerate() {
416 if !settings.justify_last_line && i == spaces.len() - 1 {
417 ret += &lines[spaces.len()-1].join("");
418 break
419 }
420 if !settings.ignore_spaces {
421 let add = &add_spaces(*space, &lines[i], &settings.insert_at);
422 ret += add;
423 } else {
424 ret += &lines[i].join(" ");
425 }
426 ret += settings.newline;
427 }
428
429 ret
430}
431
432/// Justify `text` according to the parameters in `settings`.
433pub fn justify(text: &str, settings: &Settings) -> String {
434 let mut h = String::new();
435 if settings.hyphenate_overflow {
436 h = hyphenate_overflow(text, &settings);
437 }
438
439 if settings.ignore_spaces {
440 return h;
441 }
442
443 if settings.hyphenate_overflow { h.as_str() } else { text }
444 .split(settings.newline)
445 .filter(
446 |e|e.len()!=0
447 )
448 .map(
449 |p| justify_paragraph(p, settings)
450 )
451 .collect::<Vec<_>>()
452 .join(settings.separator)
453}