1use crate::{
2 Snip, Localize, Local,
3 PipeParser,
4 Error,
5};
6
/// Running totals of how much input a source has consumed so far.
///
/// Both counters are zero for a freshly created (default) value.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Processed {
    /// Number of characters consumed.
    pub chars: usize,
    /// Number of bytes consumed.
    pub bytes: usize,
}
12
13pub trait Source {
14 fn next_char(&mut self) -> SourceResult;
15 fn processed(&self) -> Processed;
16}
17
18pub trait IntoSource {
19 type Source: Source;
20
21 fn into_source(self) -> Self::Source;
22}
23
24pub trait Sourcefy {
25 fn sourcefy(self) -> SourceEvent;
26}
27
28pub type SourceResult = Result<Option<Local<SourceEvent>>,Error>;
29
/// Kinds of break that can appear in the event stream in place of characters.
///
/// When `MergeSeparator` fuses adjacent breakers, the variant declared later
/// in this list wins over one declared earlier.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Breaker {
    /// No break.
    None,
    /// Break at whitespace.
    Space,
    /// Break between words.
    Word,
    /// Break between lines.
    Line,
    /// Break between sentences.
    Sentence,
    /// Break between paragraphs.
    Paragraph,
    /// Break between sections.
    Section,
}
41
42#[derive(Debug,Clone,Copy,Eq,PartialEq)]
43pub enum SourceEvent {
44 Char(char),
45 Breaker(Breaker),
46}
47impl Sourcefy for char {
48 fn sourcefy(self) -> SourceEvent {
49 SourceEvent::Char(self)
50 }
51}
52impl Sourcefy for Breaker {
53 fn sourcefy(self) -> SourceEvent {
54 SourceEvent::Breaker(self)
55 }
56}
57
58impl Breaker {
59 pub fn into_source_as(self, s: &str) -> OptSource {
60 let blen = s.len();
61 let clen = s.chars().count();
62 OptSource::new(self.sourcefy().localize(Snip{ offset: 0, length: clen }, Snip{ offset: 0, length: blen }))
63 }
64}
65
66impl IntoSource for char {
67 type Source = OptSource;
68 fn into_source(self) -> Self::Source {
69 let blen = self.len_utf8();
70 OptSource::new(self.sourcefy().localize(Snip{ offset: 0, length: 1 }, Snip{ offset: 0, length: blen }))
71 }
72}
73
74pub struct EmptySource;
75impl Source for EmptySource {
76 fn next_char(&mut self) -> SourceResult {
77 Ok(None)
78 }
79 fn processed(&self) -> Processed {
80 Processed::default()
81 }
82}
83
84impl<S> Source for Option<S>
85where S: Source
86{
87 fn next_char(&mut self) -> SourceResult {
88 match self {
89 Some(source) => source.next_char(),
90 None => Ok(None),
91 }
92 }
93 fn processed(&self) -> Processed {
94 match self {
95 Some(source) => source.processed(),
96 None => Processed::default(),
97 }
98 }
99}
100
101pub struct ParserSource<'p,'s,P,S> {
102 parser: &'p mut P,
103 source: &'s mut S,
104}
105impl<'p,'s,P,S> ParserSource<'p,'s,P,S> {
106 pub fn new<'a,'b>(parser: &'a mut P, source: &'b mut S) -> ParserSource<'a,'b,P,S> {
107 ParserSource { parser, source }
108 }
109}
110impl<'p,'s,P,S> Source for ParserSource<'p,'s,P,S>
111where P: PipeParser,
112 S: Source
113{
114 fn next_char(&mut self) -> SourceResult {
115 self.parser.next_char(self.source)
116 }
117 fn processed(&self) -> Processed {
118 self.source.processed()
119 }
120}
121
122pub struct OptSource {
123 source: Option<Local<SourceEvent>>,
124 done: Processed,
125}
126impl OptSource {
127 pub fn new(local_se: Local<SourceEvent>) -> OptSource {
128 OptSource {
129 source: Some(local_se),
130 done: Processed::default(),
131 }
132 }
133}
134impl Source for OptSource {
135 fn next_char(&mut self) -> SourceResult {
136 let r = self.source.take();
137 if let Some(local_se) = &r {
138 self.done.chars += local_se.chars().length;
139 self.done.bytes += local_se.bytes().length;
140 }
141 Ok(r)
142 }
143 fn processed(&self) -> Processed {
144 self.done
145 }
146}
147
148impl<'s> IntoSource for &'s str {
149 type Source = StrSource<'s>;
150 fn into_source(self) -> Self::Source {
151 StrSource::new(self)
152 }
153}
154
155impl<'s> IntoSource for &'s String {
156 type Source = StrSource<'s>;
157 fn into_source(self) -> Self::Source {
158 StrSource::new(self as &str)
159 }
160}
161
162pub struct StrSource<'s> {
163 source: std::iter::Enumerate<std::str::CharIndices<'s>>,
164 done: Processed,
165}
166impl<'s> StrSource<'s> {
167 pub fn new(s: &str) -> StrSource {
168 StrSource {
169 source: s.char_indices().enumerate(),
170 done: Processed::default(),
171 }
172 }
173}
174impl<'s> Source for StrSource<'s> {
175 fn next_char(&mut self) -> SourceResult {
176 Ok(self.source.next().map(|(char_index,(byte_index,c))| {
177 let chars = Snip { offset: char_index, length: 1 };
178 let bytes = Snip { offset: byte_index, length: c.len_utf8() };
179 let r = c.sourcefy().localize(chars,bytes);
180 self.done.chars += r.chars().length;
181 self.done.bytes += r.bytes().length;
182 r
183 }))
184 }
185 fn processed(&self) -> Processed {
186 self.done
187 }
188}
189
190impl<T: Source> SourceExt for T {}
191
192pub trait SourceExt: Source + Sized {
193 fn pipe<P>(self, parser: P) -> Pipe<Self,P>
194 where P: PipeParser
195 {
196 Pipe {
197 source: self,
198 parser,
199 }
200 }
201 fn filter_char<F>(self, filter: F) -> Filtered<Self,F>
202 where F: FnMut(char) -> Option<char>
203 {
204 Filtered {
205 source: self,
206 filter,
207 }
208 }
209 fn map_char<M>(self, mapper: M) -> MapChar<Self,M> {
210 MapChar {
211 source: self,
212 mapper,
213 }
214 }
215 fn into_separator(self) -> IntoSeparator<Self> {
216 IntoSeparator {
217 source: self,
218 }
219 }
220 fn merge_separators(self) -> MergeSeparator<Self> {
221 MergeSeparator {
222 source: self,
223 buffer: None,
224 current: None,
225 }
226 }
227 fn chain<S: Source>(self, chained: S) -> Chain<Self,S> {
235 Chain {
236 inner: InnerChain::First(self),
237 second: Some(chained),
238 }
239 }
240 fn try_map<M>(self, mapper: M) -> Map<Self,M> {
241 Map {
242 source: self,
243 mapper,
244 }
245 }
246}
247
/// A stateful character-to-character transformation, used by [`MapChar`].
pub trait CharMapper {
    /// Returns the character that should replace `c`.
    fn map(&mut self, c: char) -> char;
}
251
252pub trait Mapper {
253 fn map(&mut self, se: &SourceEvent) -> Option<SourceEvent>;
254}
255
256pub struct MapChar<S,M>
257{
258 source: S,
259 mapper: M,
260}
261impl<S,M> Source for MapChar<S,M>
262where S: Source,
263 M: CharMapper
264{
265 fn next_char(&mut self) -> SourceResult {
266 self.source.next_char().map(|ole| ole.map(|local_se| local_se.map(|se| match se {
267 SourceEvent::Char(c) => SourceEvent::Char(self.mapper.map(c)),
268 b @ SourceEvent::Breaker(_) => b,
269 })))
270 }
271 fn processed(&self) -> Processed {
272 self.source.processed()
273 }
274}
275
276
277pub struct Map<S,M>
278{
279 source: S,
280 mapper: M,
281}
282impl<S,M> Source for Map<S,M>
283where S: Source,
284 M: Mapper
285{
286 fn next_char(&mut self) -> SourceResult {
287 Ok(match self.source.next_char()? {
288 Some(local_se) => {
289 let (local,se) = local_se.into_inner();
290 Some(match self.mapper.map(&se) {
291 Some(se) => local.local(se),
292 None => local.local(se),
293 })
294 },
295 None => None,
296 })
297 }
298 fn processed(&self) -> Processed {
299 self.source.processed()
300 }
301}
302
303pub struct Pipe<S,P>
304{
305 source: S,
306 parser: P,
307}
308impl<S,P> Source for Pipe<S,P>
309where S: Source,
310 P: PipeParser
311{
312 fn next_char(&mut self) -> SourceResult {
313 self.parser.next_char(&mut self.source)
314 }
315 fn processed(&self) -> Processed {
316 self.source.processed()
317 }
318}
319
320pub struct Filtered<S,F> {
321 source: S,
322 filter: F,
323}
324impl<S,F> Source for Filtered<S,F>
325where S: Source,
326 F: FnMut(char) -> Option<char>
327{
328 fn next_char(&mut self) -> SourceResult {
329 loop {
330 match self.source.next_char()? {
331 Some(local_se) => {
332 let (local,se) = local_se.into_inner();
333 match se {
334 SourceEvent::Char(c) => match (&mut self.filter)(c) {
335 Some(c) => break Ok(Some(local.with_inner(SourceEvent::Char(c)))),
336 None => continue,
337 },
338 SourceEvent::Breaker(b) => break Ok(Some(local.with_inner(SourceEvent::Breaker(b)))),
339 }
340 },
341 None => break Ok(None),
342 }
343 }
344 }
345 fn processed(&self) -> Processed {
346 self.source.processed()
347 }
348}
349
350struct Shift<S> {
351 source: S,
352 char_offset: usize,
353 byte_offset: usize,
354}
355impl<S> Shift<S> {
356 fn new(source: S, shift: Processed) -> Shift<S> {
357 Shift {
358 source,
359 char_offset: shift.chars,
360 byte_offset: shift.bytes,
361 }
362 }
363}
364impl<S> Source for Shift<S>
365where S: Source
366{
367 fn next_char(&mut self) -> SourceResult {
368 Ok(match self.source.next_char()? {
369 Some(ev) => Some(ev.with_shift(self.char_offset,self.byte_offset)),
370 None => None,
371 })
372 }
373 fn processed(&self) -> Processed {
374 let mut p = self.source.processed();
375 p.chars += self.char_offset;
376 p.bytes += self.byte_offset;
377 p
378 }
379}
380
381enum InnerChain<S1,S2> {
382 First(S1),
383 Second(Shift<S2>),
384 Done(Processed)
385}
386
387pub struct Chain<S1,S2> {
388 inner: InnerChain<S1,S2>,
389 second: Option<S2>,
390}
391impl<S1,S2> Source for Chain<S1,S2>
392where S1: Source,
393 S2: Source
394{
395 fn next_char(&mut self) -> SourceResult {
396 loop {
397 match &mut self.inner {
398 InnerChain::First(first) => match first.next_char()? {
399 Some(ev) => break Ok(Some(ev)),
400 None => match self.second.take() {
401 Some(second) => self.inner = InnerChain::Second(Shift::new(second,first.processed())),
402 None => self.inner = InnerChain::Done(first.processed()),
403 }
404 },
405 InnerChain::Second(second) => match second.next_char()? {
406 Some(ev) => break Ok(Some(ev)),
407 None => self.inner = InnerChain::Done(second.processed()),
408 },
409 InnerChain::Done(_) => break Ok(None),
410 }
411 }
412 }
413 fn processed(&self) -> Processed {
414 match &self.inner {
415 InnerChain::First(first) => first.processed(),
416 InnerChain::Second(second) => second.processed(),
417 InnerChain::Done(p) => *p,
418 }
419 }
420}
421
422use unicode_properties::{
423 UnicodeGeneralCategory,
424 GeneralCategory,
425};
426
427pub struct IntoSeparator<S> {
447 source: S,
448}
449impl<S> Source for IntoSeparator<S>
450where S: Source
451{
452 fn next_char(&mut self) -> SourceResult {
453 self.source.next_char().map(|opt_lse| {
454 opt_lse.map(|local_se| {
455 local_se.map(|se| {
456 match se {
457 SourceEvent::Char(c) => {
458 match c {
459 '\n' => SourceEvent::Breaker(Breaker::Line),
460 _ => match c.general_category() {
461 GeneralCategory::Control |
462 GeneralCategory::SpaceSeparator => SourceEvent::Breaker(Breaker::Space),
463 GeneralCategory::LineSeparator => SourceEvent::Breaker(Breaker::Line),
464 GeneralCategory::ParagraphSeparator => SourceEvent::Breaker(Breaker::Paragraph),
465 _ => SourceEvent::Char(c),
466 },
467 }
468 },
469 b @ SourceEvent::Breaker(..) => b,
470 }
471 })
472 })
473 })
474 }
475 fn processed(&self) -> Processed {
476 self.source.processed()
477 }
478}
479
480
481pub struct MergeSeparator<S> {
482 source: S,
483 buffer: Option<Local<SourceEvent>>,
484 current: Option<(Local<()>,Breaker)>,
485}
486impl<S> Source for MergeSeparator<S>
487where S: Source
488{
489 fn next_char(&mut self) -> SourceResult {
490 fn merge_breakers(cur_loc: Local<()>, cur_b: Breaker, nxt_loc: Local<()>, nxt_b: Breaker) -> Result<(Local<()>,Breaker),Error> {
491 let loc = Local::from_segment(cur_loc,nxt_loc)?;
492 Ok((loc,match (cur_b,nxt_b) {
493 (Breaker::None,_) => nxt_b,
494 (_,Breaker::None) => cur_b,
495 (Breaker::Space,_) => nxt_b,
496 (_,Breaker::Space) => cur_b,
497 (Breaker::Word,_) => nxt_b,
498 (_,Breaker::Word) => cur_b,
499 (Breaker::Line,_) => nxt_b,
500 (_,Breaker::Line) => cur_b,
501 (Breaker::Sentence,_) => nxt_b,
502 (_,Breaker::Sentence) => cur_b,
503 (Breaker::Paragraph,_) => nxt_b,
504 (_,Breaker::Paragraph) => cur_b,
505 (Breaker::Section,Breaker::Section) => nxt_b,
506 }))
507 }
508
509
510 loop {
511 match self.buffer.take() {
512 Some(lse) => break Ok(Some(lse)),
513 None => {
514 match self.source.next_char()? {
515 Some(local_se) => {
516 let (local,se) = local_se.into_inner();
517 match se {
518 c @ SourceEvent::Char(..) => match self.current.take() {
519 Some((local_br,br)) => {
520 self.buffer = Some(local.with_inner(c));
521 break Ok(Some(local_br.with_inner(SourceEvent::Breaker(br))));
522 },
523 None => break Ok(Some(local.with_inner(c))),
524 },
525 SourceEvent::Breaker(br) => match self.current.take() {
526 Some((c_local,c_br)) => {
527 self.current = Some(merge_breakers(c_local,c_br,local,br)?);
528 },
529 None => {
530 self.current = Some((local,br));
531 },
532 },
533 }
534 },
535 None => match self.current.take() {
536 Some((local,br)) => break Ok(Some(local.with_inner(SourceEvent::Breaker(br)))),
537 None => break Ok(None),
538 },
539 }
540 },
541 }
542 }
543 }
544 fn processed(&self) -> Processed {
545 self.source.processed()
546 }
547}
548
549
550#[cfg(test)]
551mod tests {
552 use crate::*;
553 use super::*;
554
555 use unicode_properties::{
556 UnicodeGeneralCategory,
557 GeneralCategory,
558 };
559
    /// Pipes a string source through the parser built by
    /// `crate::entities::Builder` and a character filter, then compares each
    /// produced event with a hand-written list of expected events.
    #[test]
    fn basic() {
        let mut src = " ⪢ 	 💯  ‍ ‌  ❤"
            .into_source()
            .pipe(crate::entities::Builder::new().create().into_piped())
            .filter_char(|c| {
                // Drop Format characters other than U+200D, unassigned code
                // points and U+F8E6; keep everything else.
                match c.general_category() {
                    GeneralCategory::Format if c != '\u{200d}' => None,
                    GeneralCategory::Unassigned => None,
                    _ if c == '\u{f8e6}' => None,
                    _ => Some(c),
                }
            });

        // NOTE(review): several expected events span more than one source
        // character; presumably the input text holds multi-character
        // references that the entities parser collapses — confirm.
        let mut res_iter = [
            SourceEvent::Char(' ').localize(Snip { offset: 0, length: 1 },Snip { offset: 0, length: 1 }),
            SourceEvent::Char('⪢').localize(Snip { offset: 1, length: 16 },Snip { offset: 1, length: 16 }),
            SourceEvent::Char(' ').localize(Snip { offset: 17, length: 1 },Snip { offset: 17, length: 1 }),
            SourceEvent::Char('\t').localize(Snip { offset: 18, length: 6 },Snip { offset: 18, length: 6 }),
            SourceEvent::Char(' ').localize(Snip { offset: 24, length: 1 },Snip { offset: 24, length: 1 }),
            SourceEvent::Char('💯').localize(Snip { offset: 25, length: 9 },Snip { offset: 25, length: 9 }),
            SourceEvent::Char(' ').localize(Snip { offset: 34, length: 1 },Snip { offset: 34, length: 1 }),
            SourceEvent::Char(' ').localize(Snip { offset: 43, length: 1 },Snip { offset: 43, length: 1 }),
            SourceEvent::Char('\u{200d}').localize(Snip { offset: 44, length: 8 },Snip { offset: 44, length: 8 }),
            SourceEvent::Char(' ').localize(Snip { offset: 52, length: 1 },Snip { offset: 52, length: 1 }),
            SourceEvent::Char(' ').localize(Snip { offset: 61, length: 1 },Snip { offset: 61, length: 1 }),
            SourceEvent::Char(' ').localize(Snip { offset: 70, length: 1 },Snip { offset: 70, length: 1 }),
            SourceEvent::Char('❤').localize(Snip { offset: 71, length: 8 },Snip { offset: 71, length: 8 }),
        ].into_iter();

        // The loop stops when the source yields `None`; entries left over in
        // `res_iter` are not checked, so a source producing too few events
        // would not fail this test.
        while let Some(local_event) = src.next_char().unwrap() {
            match res_iter.next() {
                Some(ev) => {
                    println!("Source: {:?}",local_event);
                    println!("Result: {:?}",ev);
                    assert_eq!(local_event,ev);
                },
                None => {
                    panic!("parser has more events then test result");
                },
            }
        }
    }
605
    /// Same pipeline as a plain character run, extended with
    /// `into_separator`, the parser built by `crate::paragraph::Builder`, and
    /// `merge_separators`; checks that runs of separators come out as single
    /// breaker events.
    #[test]
    fn basic_breaker() {
        let mut src = " ⪢ 	 💯  ‍ ‌  ❤ "
            .into_source()
            .pipe(crate::entities::Builder::new().create().into_piped())
            .filter_char(|c| {
                // Drop Format characters other than U+200D, unassigned code
                // points and U+F8E6; keep everything else.
                match c.general_category() {
                    GeneralCategory::Format if c != '\u{200d}' => None,
                    GeneralCategory::Unassigned => None,
                    _ if c == '\u{f8e6}' => None,
                    _ => Some(c),
                }
            })
            .into_separator()
            .pipe(crate::paragraph::Builder::new().create())
            .merge_separators();

        // Each expected breaker covers the whole run of separators it replaces.
        let mut res_iter = [
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 0, length: 1 },Snip { offset: 0, length: 1 }),
            SourceEvent::Char('⪢').localize(Snip { offset: 1, length: 16 },Snip { offset: 1, length: 16 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 17, length: 8 },Snip { offset: 17, length: 8 }),
            SourceEvent::Char('💯').localize(Snip { offset: 25, length: 9 },Snip { offset: 25, length: 9 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 34, length: 10 },Snip { offset: 34, length: 10 }),
            SourceEvent::Char('\u{200d}').localize(Snip { offset: 44, length: 8 },Snip { offset: 44, length: 8 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 52, length: 19 },Snip { offset: 52, length: 19 }),
            SourceEvent::Char('❤').localize(Snip { offset: 71, length: 8 },Snip { offset: 71, length: 8 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 79, length: 1 },Snip { offset: 79, length: 1 }),
        ].into_iter();

        // The loop stops when the source yields `None`; entries left over in
        // `res_iter` are not checked.
        while let Some(local_event) = src.next_char().unwrap() {
            match res_iter.next() {
                Some(ev) => {
                    println!("Source: {:?}",local_event);
                    println!("Result: {:?}",ev);
                    assert_eq!(local_event,ev);
                },
                None => {
                    panic!("parser has more events then test result");
                },
            }
        }
    }
650
    /// Breaker-merging pipeline run over input that also contains `\n`
    /// characters; the expected list includes `Line` and `Paragraph` breakers
    /// in addition to `Space`.
    #[test]
    fn basic_breaker_2() {
        let mut src = " ⪢ 	\n 💯  ‍ \n ‌ \n  ❤ "
            .into_source()
            .pipe(crate::entities::Builder::new().create().into_piped())
            .filter_char(|c| {
                // Drop Format characters other than U+200D, unassigned code
                // points and U+F8E6; keep everything else.
                match c.general_category() {
                    GeneralCategory::Format if c != '\u{200d}' => None,
                    GeneralCategory::Unassigned => None,
                    _ if c == '\u{f8e6}' => None,
                    _ => Some(c),
                }
            })
            .into_separator()
            .pipe(crate::paragraph::Builder::new().create())
            .merge_separators();

        // NOTE(review): the `Paragraph` breaker is presumably produced by the
        // paragraph parser from the repeated newlines — confirm against
        // `crate::paragraph`.
        let mut res_iter = [
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 0, length: 1 },Snip { offset: 0, length: 1 }),
            SourceEvent::Char('⪢').localize(Snip { offset: 1, length: 16 },Snip { offset: 1, length: 16 }),
            SourceEvent::Breaker(Breaker::Line).localize(Snip { offset: 17, length: 9 },Snip { offset: 17, length: 9 }),
            SourceEvent::Char('💯').localize(Snip { offset: 26, length: 9 },Snip { offset: 26, length: 9 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 35, length: 10 },Snip { offset: 35, length: 10 }),
            SourceEvent::Char('\u{200d}').localize(Snip { offset: 45, length: 8 },Snip { offset: 45, length: 8 }),
            SourceEvent::Breaker(Breaker::Paragraph).localize(Snip { offset: 53, length: 23 },Snip { offset: 53, length: 23 }),
            SourceEvent::Char('❤').localize(Snip { offset: 76, length: 8 },Snip { offset: 76, length: 8 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 84, length: 1 },Snip { offset: 84, length: 1 }),
        ].into_iter();

        // The loop stops when the source yields `None`; entries left over in
        // `res_iter` are not checked.
        while let Some(local_event) = src.next_char().unwrap() {
            match res_iter.next() {
                Some(ev) => {
                    println!("Source: {:?}",local_event);
                    println!("Result: {:?}",ev);
                    assert_eq!(local_event,ev);
                },
                None => {
                    panic!("parser has more events then test result");
                },
            }
        }
    }
695
696
    /// Builds the input from three chained sources (a string, a single
    /// `Breaker::Word` event, and another string) and checks that offsets
    /// continue across the joins and that the injected `Word` breaker shows up
    /// in the merged output.
    #[test]
    fn chain_1() {
        let src = " ⪢ 	\n 💯 ";
        let mut src = src.into_source()
            .chain(Breaker::Word.into_source_as(" "))
            .chain("‍ \n ‌ \n  ❤ ".into_source())
            .pipe(crate::entities::Builder::new().create().into_piped())
            .filter_char(|c| {
                // Drop Format characters other than U+200D, unassigned code
                // points and U+F8E6; keep everything else.
                match c.general_category() {
                    GeneralCategory::Format if c != '\u{200d}' => None,
                    GeneralCategory::Unassigned => None,
                    _ if c == '\u{f8e6}' => None,
                    _ => Some(c),
                }
            })
            .into_separator()
            .pipe(crate::paragraph::Builder::new().create())
            .merge_separators();

        // The breaker at offset 35 is expected to be `Word`: the injected
        // breaker outranks the neighbouring `Space` breakers when merged.
        let mut res_iter = [
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 0, length: 1 },Snip { offset: 0, length: 1 }),
            SourceEvent::Char('⪢').localize(Snip { offset: 1, length: 16 },Snip { offset: 1, length: 16 }),
            SourceEvent::Breaker(Breaker::Line).localize(Snip { offset: 17, length: 9 },Snip { offset: 17, length: 9 }),
            SourceEvent::Char('💯').localize(Snip { offset: 26, length: 9 },Snip { offset: 26, length: 9 }),
            SourceEvent::Breaker(Breaker::Word).localize(Snip { offset: 35, length: 10 },Snip { offset: 35, length: 10 }),
            SourceEvent::Char('\u{200d}').localize(Snip { offset: 45, length: 8 },Snip { offset: 45, length: 8 }),
            SourceEvent::Breaker(Breaker::Paragraph).localize(Snip { offset: 53, length: 23 },Snip { offset: 53, length: 23 }),
            SourceEvent::Char('❤').localize(Snip { offset: 76, length: 8 },Snip { offset: 76, length: 8 }),
            SourceEvent::Breaker(Breaker::Space).localize(Snip { offset: 84, length: 1 },Snip { offset: 84, length: 1 }),
        ].into_iter();

        // The loop stops when the source yields `None`; entries left over in
        // `res_iter` are not checked.
        while let Some(local_event) = src.next_char().unwrap() {
            match res_iter.next() {
                Some(ev) => {
                    println!("Source: {:?}",local_event);
                    println!("Result: {:?}",ev);
                    assert_eq!(local_event,ev);
                },
                None => {
                    panic!("parser has more events then test result");
                },
            }
        }
    }
743
744}