xml_oxide/sax/
parser.rs

1use nom::Offset;
2
3use crate::{
4    sax as xml_sax,
5    sax::internal::{
6        content_relaxed, insidecdata, insidecomment, misc, misc_before_doctype,
7        misc_before_xmldecl, Attribute2, AttributeRange, ContentRelaxed, InsideCdata,
8        InsideComment, Misc, MiscBeforeDoctype, MiscBeforeXmlDecl, QName, SAXAttribute2,
9    },
10};
11
12enum InternalSuccess<'a> {
13    StartDocument,
14    EndDocument,
15
16    ContentRelaxed(ContentRelaxed<'a>),
17    InsideCdata(InsideCdata<'a>),
18    InsideComment(InsideComment<'a>),
19    Misc(Misc<'a>),
20    MiscBeforeDoctype(MiscBeforeDoctype<'a>),
21    MiscBeforeXmlDecl(MiscBeforeXmlDecl<'a>),
22}
23
24use std::{
25    borrow::BorrowMut,
26    cell::RefCell,
27    io::{BufRead, BufReader, Read, Write},
28    ops::Range,
29    vec,
30};
31
32use super::{circular, Attribute};
33
/// Position of the parser inside the document-level state machine.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ParserState {
    /// Nothing has been produced yet.
    Initial,
    /// Before a possible XML declaration. Once the declaration is parsed the
    /// parser moves to `DocStartBeforeDocType`; the same happens when anything
    /// else (including whitespace) is parsed instead.
    DocStartBeforeXmlDecl,
    // DocStartBeforeXmlDeclInsideComment, // not possible - this means that doc doesn't have xmldecl, move to DocStartBeforeDocType
    /// Before a possible doctype; once the doctype is parsed move to `DocStart`.
    DocStartBeforeDocType,
    /// Inside a comment that precedes a possible doctype. This does not mean
    /// the document has no doctype; afterwards move back to `DocStartBeforeDocType`.
    DocStartBeforeDocTypeInsideComment,

    /// Prolog after the doctype position.
    DocStart,
    /// Inside a comment while in `DocStart`.
    DocStartInsideComment,

    /// Element content.
    Content,
    /// Inside a CDATA section.
    InsideCdata,
    /// Inside a comment within content. Comments at the start or end of the
    /// document have their own dedicated states.
    InsideComment,

    /// After the root element has been closed (misc only).
    DocEnd,
    /// Inside a comment while in `DocEnd`.
    DocEndInsideComment,
}
52
/// One namespace declaration, stored as ranges into the namespace string buffer.
struct Namespace {
    /// Element nesting level at which the declaration was made.
    level: usize,
    /// Location of the prefix text (empty for the default namespace).
    prefix: Range<usize>,
    /// Location of the namespace value text.
    value: Range<usize>,
}
58pub struct Parser<R: Read> {
59    state: ParserState,
60    bufreader: BufReader<R>,
61    buffer3: circular::Buffer,
62
63    strbuffer: String,
64    offset: usize,
65
66    // document_complete: bool, //if element_level reaches 0 again , we control this via state
67    element_level: usize,
68    element_strbuffer: String,
69    element_list: Vec<Range<usize>>,
70
71    is_namespace_aware: bool,
72    namespace_strbuffer: String,
73    namespace_list: Vec<Namespace>,
74
75    attribute_list: Vec<AttributeRange>,
76}
77
78pub(crate) fn convert_attribute_range<'a>(
79    strbuffer: &'a str,
80    namespace_strbuffer: &'a str,
81    range: AttributeRange,
82) -> Attribute<'a> {
83    Attribute {
84        value: &strbuffer[range.value],
85        name: &strbuffer[range.name],
86        local_name: &strbuffer[range.local_name],
87        prefix: &strbuffer[range.prefix],
88        namespace: &namespace_strbuffer[range.namespace],
89    }
90}
91
92fn convert_start_element_name_and_add_attributes<'a>(
93    strbuffer: &'a mut String,
94    namespace_strbuffer: &'a mut String,
95
96    event1: crate::sax::internal::StartElement,
97    buffer3: &circular::Buffer,
98    attribute_list: &'a mut Vec<AttributeRange>,
99) -> SaxResult<Range<usize>> {
100    attribute_list.clear();
101
102    let start = strbuffer.len();
103    let size = event1.name.len();
104    let element_name_range = start..start + size;
105    strbuffer.push_str(event1.name);
106
107    // let mut attributes2: Vec<SAXAttribute2> = vec![];
108
109    let start = strbuffer.len();
110    let size = event1.attributes_chunk.len();
111    let attributes_chunk = unsafe { std::str::from_utf8_unchecked(event1.attributes_chunk) };
112    strbuffer.push_str(attributes_chunk);
113
114    let mut inp = strbuffer[start..start + size].as_bytes();
115    let mut offset1: usize = start;
116    //parse key,value and how many attributes.
117    loop {
118        if inp.len() == 0 {
119            break;
120        }
121
122        let res = Attribute2(inp);
123
124        match res {
125            Ok((remainder, mut attr_range)) => {
126                attr_range.name =
127                    (attr_range.name.start + offset1)..(attr_range.name.end + offset1);
128                attr_range.value =
129                    (attr_range.value.start + offset1)..(attr_range.value.end + offset1);
130
131                offset1 += inp.offset(remainder);
132                inp = remainder;
133
134                attribute_list.push(attr_range)
135            }
136            Err(_e) => {
137                return Err(error::Error::Parsing(format!(
138                    "Error while parsing attributes.",
139                )))
140            }
141        }
142    }
143
144    Ok(element_name_range)
145}
146
/// Ranges describing the qualified-name parts of an element.
struct ElementRange {
    /// Location of the prefix text.
    prefix_range: Range<usize>,
    /// Location of the local-name text.
    local_name_range: Range<usize>,
    /// Location of the resolved namespace value (in the namespace buffer).
    namespace_range: Range<usize>,
}
152
153fn parse_start_element(
154    start_element_name_range: Range<usize>,
155    is_namespace_aware: bool,
156    element_level: usize,
157
158    strbuffer: &mut String,
159    attribute_list: &mut Vec<AttributeRange>,
160    namespace_strbuffer: &mut String,
161    namespace_list: &mut Vec<Namespace>,
162) -> SaxResult<ElementRange> {
163    let start_element_name = &strbuffer[start_element_name_range];
164
165    // let mut element_local_name = "";
166    // let mut element_namespace = "";
167    // let mut element_prefix = "";
168
169    let mut prefix_range = 0..0;
170    let mut local_name_range = 0..0;
171    let mut namespace_range = 0..0;
172
173    // add namespaces
174    if is_namespace_aware {
175        //first process namespace definitions & parse prefix:local_name
176        for attr in attribute_list.iter_mut() {
177            let inp = strbuffer[attr.name.clone()].as_bytes();
178
179            match QName(inp) {
180                Ok(qres) => {
181                    let qname = qres.1;
182
183                    if qname.prefix == "" && qname.local_name == "xmlns" {
184                        //set default namespace
185                        let ns = push_ns_values_get_ns(
186                            namespace_strbuffer,
187                            "",
188                            &strbuffer[attr.value.clone()],
189                            element_level,
190                        );
191                        namespace_list.push(ns);
192                    }
193
194                    if qname.prefix == "xmlns" {
195                        //set prefixed namespace
196                        let prefix = qname.local_name;
197                        let ns = push_ns_values_get_ns(
198                            namespace_strbuffer,
199                            prefix,
200                            &strbuffer[attr.value.clone()],
201                            element_level,
202                        );
203                        namespace_list.push(ns);
204                    }
205                    attr.local_name = Range {
206                        start: qname.local_name_range.start + attr.name.start.clone(),
207                        end: qname.local_name_range.end + attr.name.start.clone(),
208                    };
209                    // println!("TEST: {:?}", &strbuffer[attr.local_name.clone()]);
210                    attr.prefix = Range {
211                        start: qname.prefix_range.start + attr.name.start.clone(),
212                        end: qname.prefix_range.end + attr.name.start.clone(),
213                    };
214                    // let range_local_name = push_str_get_range(
215                    //     &mut strbuffer,
216                    //     qname.local_name,
217                    // );
218                    // attr.local_name = &strbuffer[range_local_name];
219                }
220                Err(_e) => {
221                    return Err(error::Error::Parsing(format!(
222                        "Attribute does not conform to QName spec: {}",
223                        &strbuffer[attr.name.clone()]
224                    )))
225                }
226            }
227        }
228
229        //resolve namespaces for element and attributes.
230
231        for attr in attribute_list.iter_mut() {
232            //Default namespace doesn't apply to attributes
233            if &strbuffer[attr.prefix.clone()] == "" || &strbuffer[attr.prefix.clone()] == "xmlns" {
234                continue;
235            }
236            match namespace_list.iter().rfind(|ns| {
237                &namespace_strbuffer[ns.prefix.clone()] == &strbuffer[attr.prefix.clone()]
238            }) {
239                Some(ns) => attr.namespace = ns.value.clone(),
240                None => {
241                    return Err(error::Error::Parsing(format!(
242                        "Namespace not found for prefix: {} , attribute: {} , element: {}",
243                        &strbuffer[attr.prefix.clone()],
244                        &strbuffer[attr.name.clone()],
245                        start_element_name
246                    )))
247                }
248            }
249        }
250
251        match QName(start_element_name.as_bytes()) {
252            Ok(qres) => {
253                let qname = qres.1;
254                // element_local_name = qname.local_name;
255                // element_prefix = qname.prefix;
256                local_name_range = qname.local_name_range;
257                prefix_range = qname.prefix_range;
258
259                match namespace_list.iter().rfind(|ns| {
260                    &namespace_strbuffer[ns.prefix.clone()] == &strbuffer[prefix_range.clone()]
261                }) {
262                    Some(ns) => namespace_range = ns.value.clone(),
263
264                    None => {
265                        if &strbuffer[prefix_range.clone()] == "" {
266                            //it is fine
267                        } else {
268                            return Err(error::Error::Parsing(format!(
269                                "Namespace prefix not found for element: {}",
270                                start_element_name
271                            )));
272                        }
273                    }
274                }
275            }
276            Err(_e) => {
277                return Err(error::Error::Parsing(format!(
278                    "Element name does not conform to QName spec: {}",
279                    start_element_name
280                )))
281            }
282        }
283    }
284
285    Ok(ElementRange {
286        prefix_range: prefix_range,
287        local_name_range: local_name_range,
288        namespace_range: namespace_range,
289    })
290}
291
/// Appends `addition` to `strbuffer` and returns the byte range it now occupies.
fn push_str_get_range(strbuffer: &mut String, addition: &str) -> Range<usize> {
    let begin = strbuffer.len();
    strbuffer.push_str(addition);
    begin..strbuffer.len()
}
302
303fn push_ns_values_get_ns(
304    namespace_strbuffer: &mut String,
305    prefix: &str,
306    value: &str,
307    element_level: usize,
308) -> Namespace {
309    let range_prefix = push_str_get_range(namespace_strbuffer, prefix);
310    let range_value = push_str_get_range(namespace_strbuffer, value);
311    Namespace {
312        level: element_level,
313        prefix: range_prefix,
314        value: range_value,
315    }
316}
317
318pub type SaxResult<T> = Result<T, error::Error>;
319
320mod error {
321    use thiserror::Error;
322    #[derive(Debug, Error)]
323    pub enum Error {
324        #[error(transparent)]
325        Io(#[from] std::io::Error),
326
327        // Generic
328        #[error("SAX Parsing Err: {0}")]
329        Parsing(String),
330
331        #[error("SAX Parsing Err: Unexpected EOF")]
332        UnexpectedEof,
333    }
334}
335
336// https://doc.rust-lang.org/nomicon/borrow-splitting.html
/// Moves everything currently available in `bufreader` to the end of `buffer2`.
///
/// The reader's internal buffer is (re)filled if it is empty, its whole
/// contents are appended to `buffer2`, and those bytes are marked as consumed.
/// At end of input nothing is appended.
///
/// # Errors
///
/// Propagates the I/O error reported while filling the reader's buffer.
fn read_data_splitted<R: Read>(
    bufreader: &mut BufReader<R>,
    buffer2: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
    let available = bufreader.fill_buf()?;
    let taken = available.len();
    buffer2.extend_from_slice(available);

    bufreader.consume(taken);
    Ok(())
}
/// Same as `read_data_splitted`, but the destination vector lives in a `RefCell`.
///
/// The reader's internal buffer is (re)filled if it is empty, its whole
/// contents are appended to the vector inside `buffer2`, and those bytes are
/// marked as consumed. Panics if `buffer2` is already borrowed.
///
/// # Errors
///
/// Propagates the I/O error reported while filling the reader's buffer.
fn read_data_splitted_refcell<R: Read>(
    bufreader: &mut BufReader<R>,
    buffer2: &RefCell<Vec<u8>>,
) -> Result<(), std::io::Error> {
    let available = bufreader.fill_buf()?;
    let taken = available.len();
    buffer2.borrow_mut().extend_from_slice(available);

    bufreader.consume(taken);
    Ok(())
}
375
376//todo move all states to read_event_splitted
377//todo? simplify the enum here to remove duplicates,then we move complexity to read_event method
378fn event_converter<'a, 'b>(
379    mut state: ParserState,
380    internal_event: InternalSuccess<'b>,
381    buffer3: &'b circular::Buffer,
382
383    element_list: &mut Vec<Range<usize>>,
384    mut strbuffer: &'a mut String,
385    mut namespace_strbuffer: &'a mut String,
386    namespace_list: &mut Vec<Namespace>,
387
388    is_namespace_aware: bool,
389    mut element_level: usize,
390    mut element_strbuffer: &mut String,
391
392    attribute_list: &'a mut Vec<AttributeRange>,
393) -> SaxResult<(xml_sax::Event<'a>, ParserState, usize)> {
394    let event = match internal_event {
395        InternalSuccess::StartDocument => xml_sax::Event::StartDocument,
396        InternalSuccess::EndDocument => xml_sax::Event::EndDocument,
397        InternalSuccess::ContentRelaxed(cr) => match cr {
398            ContentRelaxed::CharData(event1) => {
399                let start = strbuffer.len();
400                let size = event1.len();
401                strbuffer.push_str(unsafe { std::str::from_utf8_unchecked(event1) });
402                xml_sax::Event::Characters(&strbuffer[start..(start + size)])
403            }
404            ContentRelaxed::StartElement(event1) => {
405                //todo decode
406
407                if is_namespace_aware {
408                    // clear up namespaces
409                    match namespace_list
410                        .iter()
411                        .rposition(|ns| ns.level <= element_level)
412                    {
413                        Some(pos) => {
414                            if let Some(starting_pos) =
415                                namespace_list.get(pos + 1).map(|ns| ns.prefix.start)
416                            {
417                                namespace_list.truncate(pos + 1);
418                                namespace_strbuffer.truncate(starting_pos);
419                            }
420                        }
421                        None => {
422                            // nothing to remove
423                        }
424                    }
425                }
426
427                let start_element_name_range = convert_start_element_name_and_add_attributes(
428                    strbuffer,
429                    namespace_strbuffer,
430                    event1,
431                    buffer3,
432                    attribute_list,
433                )?;
434
435                element_level += 1;
436
437                //add element to list for expected tags check
438
439                let element_list_range = push_str_get_range(
440                    &mut element_strbuffer,
441                    &strbuffer[start_element_name_range.clone()],
442                );
443                element_list.push(element_list_range.clone());
444
445                // let mut element_local_name = "";
446                // let mut element_namespace = "";
447                // let mut element_prefix = "";
448                let element_ranges = parse_start_element(
449                    start_element_name_range.clone(),
450                    is_namespace_aware,
451                    element_level,
452                    strbuffer,
453                    attribute_list,
454                    namespace_strbuffer,
455                    namespace_list,
456                )?;
457
458                let start_element = xml_sax::StartElement {
459                    name: &strbuffer[start_element_name_range],
460                    // attributes: attributes,
461                    is_empty: false,
462
463                    local_name: &strbuffer[element_ranges.local_name_range],
464                    namespace: &namespace_strbuffer[element_ranges.namespace_range],
465                    prefix: &strbuffer[element_ranges.prefix_range],
466
467                    range_list: attribute_list,
468                    strbuffer: strbuffer,
469                    namespace_strbuffer: namespace_strbuffer,
470                };
471
472                xml_sax::Event::StartElement(start_element)
473            }
474            ContentRelaxed::EmptyElemTag(event1) => {
475                if is_namespace_aware {
476                    // clear up namespaces
477                    match namespace_list
478                        .iter()
479                        .rposition(|ns| ns.level <= element_level)
480                    {
481                        Some(pos) => {
482                            if let Some(starting_pos) =
483                                namespace_list.get(pos + 1).map(|ns| ns.prefix.start)
484                            {
485                                namespace_list.truncate(pos + 1);
486                                namespace_strbuffer.truncate(starting_pos);
487                            }
488                        }
489                        None => {
490                            // nothing to remove
491                        }
492                    }
493                }
494
495                let start_element_name_range = convert_start_element_name_and_add_attributes(
496                    strbuffer,
497                    namespace_strbuffer,
498                    event1,
499                    buffer3,
500                    attribute_list,
501                )?;
502
503                element_level += 1; // this is important before namespace handling
504
505                // element_list_range is not important for empty element tag
506
507                let element_ranges = parse_start_element(
508                    start_element_name_range.clone(),
509                    is_namespace_aware,
510                    element_level,
511                    strbuffer,
512                    attribute_list,
513                    namespace_strbuffer,
514                    namespace_list,
515                )?;
516
517                let start_element = xml_sax::StartElement {
518                    name: &strbuffer[start_element_name_range],
519                    // attributes: attributes,
520                    is_empty: false,
521
522                    local_name: &strbuffer[element_ranges.local_name_range],
523                    namespace: &namespace_strbuffer[element_ranges.namespace_range],
524                    prefix: &strbuffer[element_ranges.prefix_range],
525
526                    range_list: attribute_list,
527                    strbuffer: strbuffer,
528                    namespace_strbuffer: namespace_strbuffer,
529                };
530
531                // element_level -= 1;
532                // if element_level == 0 {
533                //     //could be a root only document.
534                //     state = ParserState::DocEnd;
535                // }
536
537                element_level -= 1;
538                if element_level == 0 {
539                    state = ParserState::DocEnd;
540                }
541
542                xml_sax::Event::StartElement(start_element)
543            }
544            ContentRelaxed::EndElement(event1) => {
545                //todo: check if it is the expected tag
546
547                match element_list.pop() {
548                    Some(r) => {
549                        if &element_strbuffer[r.clone()] == event1.name {
550                            element_strbuffer.truncate(r.start);
551                        } else {
552                            return Err(error::Error::Parsing(format!(
553                                "Expected closing tag: {} ,found: {}",
554                                &element_strbuffer[r.clone()],
555                                event1.name
556                            )));
557
558                            // TODO Expected closing tag: ... &element_strbuffer[r.clone()] found event1.name
559                        }
560                    }
561                    None => {
562                        return Err(error::Error::Parsing(format!(
563                            "No starting tag for: {}",
564                            event1.name
565                        )))
566                    }
567                }
568
569                if is_namespace_aware {
570                    // clear up namespaces
571                    match namespace_list
572                        .iter()
573                        .rposition(|ns| ns.level <= element_level)
574                    {
575                        Some(pos) => {
576                            if let Some(starting_pos) =
577                                namespace_list.get(pos + 1).map(|ns| ns.prefix.start)
578                            {
579                                namespace_list.truncate(pos + 1);
580                                namespace_strbuffer.truncate(starting_pos);
581                            }
582                        }
583                        None => {
584                            // nothing to remove
585                        }
586                    }
587                }
588
589                // let range = push_str_get_range(
590                //     &mut element_strbuffer,
591                //     start_element.name,
592                // );
593                // element_list.push(range);
594
595                let start = strbuffer.len();
596                let size = event1.name.len();
597                strbuffer.push_str(event1.name);
598                let mut end_element = xml_sax::EndElement {
599                    name: &strbuffer[start..(start + size)],
600                    local_name: "",
601                    prefix: "",
602                    namespace: "",
603                };
604
605                element_level -= 1;
606                if element_level == 0 {
607                    state = ParserState::DocEnd;
608                }
609
610                if is_namespace_aware {
611                    match QName(end_element.name.as_bytes()) {
612                        Ok(qres) => {
613                            let qname = qres.1;
614                            end_element.local_name = qname.local_name;
615                            end_element.prefix = qname.prefix;
616
617                            match namespace_list.iter().rfind(|ns| {
618                                &namespace_strbuffer[ns.prefix.clone()] == end_element.prefix
619                            }) {
620                                Some(ns) => {
621                                    end_element.namespace = &namespace_strbuffer[ns.value.clone()]
622                                }
623                                None => {
624                                    if end_element.prefix == "" {
625                                        //it is fine
626                                    } else {
627                                        return Err(error::Error::Parsing(format!(
628                                            "Namespace prefix not found for element: {}",
629                                            end_element.name
630                                        )));
631                                    }
632                                }
633                            }
634                        }
635                        Err(_e) => {
636                            return Err(error::Error::Parsing(format!(
637                                "Element name does not conform to QName spec: {}",
638                                end_element.name
639                            )))
640                        }
641                    }
642                }
643                xml_sax::Event::EndElement(end_element)
644            }
645            ContentRelaxed::Reference(event1) => {
646                // let start = strbuffer.len();
647                // let size = event1.initial.len();
648                // let range_initial = Range {
649                //     start: start,
650                //     end: start + size,
651                // };
652                // strbuffer.push_str(event1.initial);
653
654                let range: Range<usize> = push_str_get_range(&mut strbuffer, event1.initial);
655
656                //we handle the case when it is a character, not a string reference
657                let raw = event1.initial;
658                let resolved_char: Option<char>;
659                if raw.starts_with("&#x") {
660                    let hex_val = &raw[3..raw.len() - 1];
661
662                    resolved_char = match u32::from_str_radix(&hex_val, 16) {
663                        Ok(a) => match char::from_u32(a) {
664                            Some(c) => Some(c),
665                            None => None,
666                        },
667                        Err(_) => None,
668                    }
669                } else if raw.starts_with("&#") {
670                    let hex_val = &raw[2..raw.len() - 1];
671
672                    resolved_char = match u32::from_str_radix(&hex_val, 10) {
673                        Ok(a) => match char::from_u32(a) {
674                            Some(c) => Some(c),
675                            None => None,
676                        },
677                        Err(_) => None,
678                    }
679                } else {
680                    resolved_char = match event1.initial {
681                        // we don't need .as_ref() or &* as it is not String -> https://github.com/rust-lang/rust/issues/28606
682                        "&amp;" => Some('&'),
683                        "&lt;" => Some('<'),
684                        "&gt;" => Some('>'),
685                        "&quot;" => Some('"'),
686                        "&apos;" => Some('\''),
687                        _ => None,
688                    }
689                }
690
691                let range_resolved: Option<Range<usize>> = match resolved_char {
692                    Some(ch) => {
693                        let mut tmp = [0u8; 4];
694                        let addition = ch.encode_utf8(&mut tmp);
695                        Some(push_str_get_range(&mut strbuffer, addition))
696                    }
697                    None => None,
698                    // &* -> https://github.com/rust-lang/rust/issues/28606
699                    // "&amp;" => Some(push_str_get_range(&mut strbuffer, "&")),
700                    // "&lt;" => Some(push_str_get_range(&mut strbuffer, "<")),
701                    // "&gt;" => Some(push_str_get_range(&mut strbuffer, ">")),
702                    // "&quot;" => Some(push_str_get_range(&mut strbuffer, "\"")),
703                    // "&apos;" => Some(push_str_get_range(&mut strbuffer, "'")),
704                    // _ => None,
705                };
706
707                //we are ignoring DTD entity refs
708
709                let reference_event = xml_sax::Reference {
710                    raw: &strbuffer[range],
711                    resolved: match range_resolved {
712                        Some(range) => Some(&strbuffer[range]),
713                        None => None,
714                    },
715                };
716
717                xml_sax::Event::Reference(reference_event)
718            }
719            ContentRelaxed::CdataStart => xml_sax::Event::StartCdataSection,
720            ContentRelaxed::CommentStart => xml_sax::Event::StartComment,
721        },
722        InternalSuccess::InsideCdata(ic) => match ic {
723            InsideCdata::Characters(characters) => {
724                let start = strbuffer.len();
725                let size = characters.len();
726                strbuffer.push_str(unsafe { std::str::from_utf8_unchecked(characters) });
727                xml_sax::Event::Cdata(&strbuffer[start..(start + size)])
728            }
729            InsideCdata::CdataEnd => xml_sax::Event::EndCdataSection,
730        },
731        InternalSuccess::InsideComment(ic) => match ic {
732            InsideComment::Characters(characters) => {
733                let start = strbuffer.len();
734                let size = characters.len();
735                strbuffer.push_str(unsafe { std::str::from_utf8_unchecked(characters) });
736
737                xml_sax::Event::Comment(&strbuffer[start..(start + size)])
738            }
739            InsideComment::CommentEnd => xml_sax::Event::EndComment,
740        },
741        InternalSuccess::Misc(misc) => match misc {
742            Misc::PI(a) => {
743                let str = unsafe { std::str::from_utf8_unchecked(a) };
744                let range = push_str_get_range(&mut strbuffer, &str);
745                xml_sax::Event::ProcessingInstruction(&strbuffer[range])
746            }
747            Misc::Whitespace(a) => {
748                let str = unsafe { std::str::from_utf8_unchecked(a) };
749                let range = push_str_get_range(&mut strbuffer, &str);
750                xml_sax::Event::Whitespace(&strbuffer[range])
751            }
752            Misc::CommentStart => xml_sax::Event::StartComment,
753        },
754        InternalSuccess::MiscBeforeDoctype(misc) => match misc {
755            MiscBeforeDoctype::PI(a) => {
756                let str = unsafe { std::str::from_utf8_unchecked(a) };
757                let range = push_str_get_range(&mut strbuffer, &str);
758                xml_sax::Event::ProcessingInstruction(&strbuffer[range])
759            }
760            MiscBeforeDoctype::Whitespace(a) => {
761                let str = unsafe { std::str::from_utf8_unchecked(a) };
762                let range = push_str_get_range(&mut strbuffer, &str);
763                xml_sax::Event::Whitespace(&strbuffer[range])
764            }
765            MiscBeforeDoctype::CommentStart => xml_sax::Event::StartComment,
766            MiscBeforeDoctype::DocType(a) => {
767                let str = unsafe { std::str::from_utf8_unchecked(a) };
768                let range = push_str_get_range(&mut strbuffer, &str);
769                xml_sax::Event::DocumentTypeDeclaration(&strbuffer[range])
770            }
771        },
772        InternalSuccess::MiscBeforeXmlDecl(misc) => match misc {
773            MiscBeforeXmlDecl::XmlDecl(a) => {
774                let str = unsafe { std::str::from_utf8_unchecked(a) };
775                let range = push_str_get_range(&mut strbuffer, &str);
776                xml_sax::Event::XmlDeclaration(&strbuffer[range])
777            }
778            MiscBeforeXmlDecl::PI(a) => {
779                let str = unsafe { std::str::from_utf8_unchecked(a) };
780                let range = push_str_get_range(&mut strbuffer, &str);
781                xml_sax::Event::ProcessingInstruction(&strbuffer[range])
782            }
783            MiscBeforeXmlDecl::Whitespace(a) => {
784                let str = unsafe { std::str::from_utf8_unchecked(a) };
785                let range = push_str_get_range(&mut strbuffer, &str);
786                xml_sax::Event::Whitespace(&strbuffer[range])
787            }
788            MiscBeforeXmlDecl::CommentStart => xml_sax::Event::StartComment,
789            MiscBeforeXmlDecl::DocType(a) => {
790                let str = unsafe { std::str::from_utf8_unchecked(a) };
791                let range = push_str_get_range(&mut strbuffer, &str);
792                xml_sax::Event::DocumentTypeDeclaration(&strbuffer[range])
793            }
794        },
795    };
796    Ok((event, state, element_level))
797}
798
799fn read_event_splitted<'a, 'b, R: Read>(
800    mut state: ParserState,
801
802    bufreader: &BufReader<R>,
803
804    buffer3: &'b circular::Buffer,
805
806    mut offset: usize,
807    // document_complete: bool, //if element_level reaches 0 again , we control this via state
808) -> SaxResult<(InternalSuccess<'b>, ParserState, usize)> {
809    let event2: InternalSuccess;
810    match state {
811        ParserState::Initial => {
812            state = ParserState::DocStartBeforeXmlDecl;
813            return Ok((InternalSuccess::StartDocument, state, offset));
814        }
815        ParserState::DocStartBeforeXmlDecl => {
816            let res = misc_before_xmldecl(&buffer3.data());
817            match res {
818                Ok(parseresult) => {
819                    offset = buffer3.data().offset(parseresult.0);
820                    state = ParserState::DocStartBeforeDocType;
821
822                    match parseresult.1 {
823                        MiscBeforeXmlDecl::XmlDecl(_a) => {}
824                        MiscBeforeXmlDecl::PI(_a) => {}
825                        MiscBeforeXmlDecl::Whitespace(_a) => {}
826                        MiscBeforeXmlDecl::CommentStart => {
827                            state = ParserState::DocStartBeforeDocTypeInsideComment;
828                        }
829                        MiscBeforeXmlDecl::DocType(_a) => {
830                            state = ParserState::DocStart;
831                        }
832                    }
833                    event2 = InternalSuccess::MiscBeforeXmlDecl(parseresult.1);
834                }
835                Err(nom::Err::Incomplete(_e)) => {
836                    return Err(error::Error::UnexpectedEof);
837                }
838                Err(_err) => {
839                    //try content!
840                    state = ParserState::Content;
841                    return read_event_splitted(state, bufreader, buffer3, offset);
842                }
843            }
844        }
845        ParserState::DocStartBeforeDocType => {
846            let res = misc_before_doctype(&buffer3.data());
847            match res {
848                Ok(parseresult) => {
849                    offset = buffer3.data().offset(parseresult.0);
850
851                    match parseresult.1 {
852                        MiscBeforeDoctype::PI(_a) => {}
853                        MiscBeforeDoctype::Whitespace(_a) => {}
854                        MiscBeforeDoctype::CommentStart => {
855                            state = ParserState::DocStartBeforeDocTypeInsideComment;
856                        }
857                        MiscBeforeDoctype::DocType(_a) => {
858                            state = ParserState::DocStart;
859                        }
860                    }
861                    event2 = InternalSuccess::MiscBeforeDoctype(parseresult.1);
862                }
863                Err(nom::Err::Incomplete(_e)) => {
864                    return Err(error::Error::UnexpectedEof);
865                }
866                Err(_err) => {
867                    //try content!
868                    state = ParserState::Content;
869                    return read_event_splitted(state, bufreader, buffer3, offset);
870                }
871            }
872        }
873        ParserState::DocStartBeforeDocTypeInsideComment => {
874            //expect comment or comment-end
875            let res = insidecomment(&buffer3.data());
876            match res {
877                Ok(parseresult) => {
878                    offset = buffer3.data().offset(parseresult.0);
879
880                    match parseresult.1 {
881                        InsideComment::Characters(_characters) => {}
882                        InsideComment::CommentEnd => {
883                            state = ParserState::DocStartBeforeDocType;
884                        }
885                    }
886                    event2 = InternalSuccess::InsideComment(parseresult.1);
887                }
888                Err(nom::Err::Incomplete(_e)) => {
889                    return Err(error::Error::UnexpectedEof);
890                }
891                Err(_err) => {
892                    return Err(error::Error::Parsing(
893                        "Expected Comment content or Comment end".to_owned(),
894                    ))
895                }
896            }
897        }
898        ParserState::DocStart => {
899            let res = misc(&buffer3.data());
900            match res {
901                Ok(parseresult) => {
902                    offset = buffer3.data().offset(parseresult.0);
903                    // state = ParserState::DocStartBeforeDocType;
904
905                    match parseresult.1 {
906                        Misc::PI(_a) => {}
907                        Misc::Whitespace(_a) => {}
908                        Misc::CommentStart => {
909                            state = ParserState::DocStartInsideComment;
910                        }
911                    }
912                    event2 = InternalSuccess::Misc(parseresult.1);
913                }
914
915                Err(nom::Err::Incomplete(_e)) => {
916                    return Err(error::Error::UnexpectedEof);
917                }
918                Err(_err) => {
919                    //try content!
920                    state = ParserState::Content;
921                    return read_event_splitted(state, bufreader, buffer3, offset);
922                }
923            }
924        }
925        ParserState::DocStartInsideComment => {
926            //expect comment or comment-end
927            let res = insidecomment(&buffer3.data());
928            match res {
929                Ok(parseresult) => {
930                    offset = buffer3.data().offset(parseresult.0);
931
932                    match parseresult.1 {
933                        InsideComment::Characters(_characters) => {}
934                        InsideComment::CommentEnd => {
935                            state = ParserState::DocStart;
936                        }
937                    }
938                    event2 = InternalSuccess::InsideComment(parseresult.1);
939                }
940                Err(nom::Err::Incomplete(_e)) => {
941                    return Err(error::Error::UnexpectedEof);
942                }
943                Err(_err) => {
944                    return Err(error::Error::Parsing(format!(
945                        "Expecting comment content or comment closing tag "
946                    )))
947                }
948            }
949        }
950        ParserState::Content => {
951            let res = content_relaxed(&buffer3.data());
952            match res {
953                Ok(parseresult) => {
954                    offset = buffer3.data().offset(parseresult.0);
955
956                    match &parseresult.1 {
957                        ContentRelaxed::CharData(_event1) => {}
958                        ContentRelaxed::StartElement(_event1) => {}
959                        ContentRelaxed::EmptyElemTag(_event1) => {}
960                        ContentRelaxed::EndElement(_event1) => {}
961                        ContentRelaxed::Reference(_event1) => {}
962                        ContentRelaxed::CdataStart => {
963                            state = ParserState::InsideCdata;
964                        }
965                        ContentRelaxed::CommentStart => {
966                            state = ParserState::InsideComment;
967                        }
968                    }
969                    event2 = InternalSuccess::ContentRelaxed(parseresult.1);
970                }
971                // let ending = String::from_utf8_lossy(&buffer2);
972                Err(nom::Err::Incomplete(_e)) => {
973                    return Err(error::Error::UnexpectedEof);
974                }
975                Err(_e) => {
976                    let ending = String::from_utf8_lossy(&buffer3.data());
977                    let ending_truncated = match ending.char_indices().nth(50) {
978                        None => &ending,
979                        Some((idx, _)) => &ending[..idx],
980                    };
981
982                    return Err(error::Error::Parsing(format!(
983                        "Expected one of (CharData | element | Reference | CDSect | PI | Comment), found: {}",
984                        ending_truncated
985                    )));
986                }
987            }
988        }
989
990        ParserState::InsideCdata => {
991            //expect cdata or cdata-end
992            let res = insidecdata(&buffer3.data());
993            match res {
994                Ok(parseresult) => {
995                    offset = buffer3.data().offset(parseresult.0);
996
997                    match parseresult.1 {
998                        InsideCdata::Characters(_characters) => {}
999                        InsideCdata::CdataEnd => {
1000                            state = ParserState::Content;
1001                        }
1002                    }
1003                    event2 = InternalSuccess::InsideCdata(parseresult.1);
1004                }
1005                Err(nom::Err::Incomplete(_e)) => {
1006                    return Err(error::Error::UnexpectedEof);
1007                }
1008                Err(_err) => {
1009                    return Err(error::Error::Parsing(format!(
1010                        "Expecting CDATA content or CDATA closing tag "
1011                    )))
1012                }
1013            }
1014        }
1015        ParserState::InsideComment => {
1016            //expect comment or comment-end
1017            let res = insidecomment(&buffer3.data());
1018            match res {
1019                Ok(parseresult) => {
1020                    offset = buffer3.data().offset(parseresult.0);
1021
1022                    match parseresult.1 {
1023                        InsideComment::Characters(_characters) => {}
1024                        InsideComment::CommentEnd => {
1025                            state = ParserState::Content;
1026                        }
1027                    }
1028                    event2 = InternalSuccess::InsideComment(parseresult.1);
1029                }
1030                Err(nom::Err::Incomplete(_e)) => {
1031                    return Err(error::Error::UnexpectedEof);
1032                }
1033                Err(_err) => {
1034                    return Err(error::Error::Parsing(format!(
1035                        "Expecting comment content or comment closing tag "
1036                    )))
1037                }
1038            }
1039        }
1040        ParserState::DocEnd => {
1041            // EOF
1042            if buffer3.data().len() == 0 {
1043                // event2 = xml_sax::Event::EndDocument;
1044                return Ok((InternalSuccess::EndDocument, state, offset));
1045            }
1046
1047            let res = misc(&buffer3.data());
1048            match res {
1049                Ok(parseresult) => {
1050                    offset = buffer3.data().offset(parseresult.0);
1051
1052                    match parseresult.1 {
1053                        Misc::PI(_a) => {}
1054                        Misc::Whitespace(_a) => {}
1055                        Misc::CommentStart => {
1056                            state = ParserState::DocEndInsideComment;
1057                        }
1058                    }
1059                    event2 = InternalSuccess::Misc(parseresult.1);
1060                }
1061                Err(nom::Err::Incomplete(_e)) => {
1062                    return Err(error::Error::UnexpectedEof);
1063                }
1064                Err(_err) => {
1065                    return Err(error::Error::Parsing(format!(
1066                        "Unexpected entity/content at the end of the document."
1067                    )))
1068                }
1069            }
1070        }
1071        ParserState::DocEndInsideComment => {
1072            //expect comment or comment-end
1073            let res = insidecomment(&buffer3.data());
1074            match res {
1075                Ok(parseresult) => {
1076                    offset = buffer3.data().offset(parseresult.0);
1077
1078                    match parseresult.1 {
1079                        InsideComment::Characters(_characters) => {}
1080                        InsideComment::CommentEnd => {
1081                            state = ParserState::DocEnd;
1082                        }
1083                    }
1084                    event2 = InternalSuccess::InsideComment(parseresult.1);
1085                }
1086                Err(nom::Err::Incomplete(_e)) => {
1087                    return Err(error::Error::UnexpectedEof);
1088                }
1089                Err(_err) => {
1090                    return Err(error::Error::Parsing(format!(
1091                        "Expecting comment content or comment closing tag "
1092                    )))
1093                }
1094            }
1095        }
1096    }
1097
1098    Ok((event2, state, offset))
1099}
1100
1101impl<R: Read> Parser<R> {
1102    pub fn from_reader(reader: R) -> Parser<R> {
1103        Parser {
1104            state: ParserState::Initial,
1105            bufreader: BufReader::with_capacity(8 * 1024, reader),
1106            offset: 0,
1107
1108            buffer3: circular::Buffer::with_capacity(16 * 1024),
1109            strbuffer: String::new(),
1110
1111            element_level: 0, // should be same as self.element_list.len()
1112            element_list: Vec::with_capacity(10),
1113            element_strbuffer: String::new(),
1114
1115            is_namespace_aware: true,
1116            namespace_list: Vec::with_capacity(10),
1117            namespace_strbuffer: String::new(),
1118
1119            attribute_list: Vec::with_capacity(5),
1120        }
1121    }
1122
1123    fn read_data(&mut self) -> Result<usize, std::io::Error> {
1124        let newread: usize;
1125        match self.bufreader.fill_buf() {
1126            Ok(ok) => {
1127                newread = ok.len();
1128            }
1129            Err(err) => return Err(err),
1130        }
1131
1132        let amt: usize;
1133        {
1134            let data2 = self.bufreader.buffer();
1135            let data_len = data2.len();
1136            //is it bigger than available space?
1137
1138            self.buffer3.shift();
1139            if data_len > self.buffer3.available_space() {
1140                let new_size = std::cmp::max(
1141                    self.buffer3.position() + data_len,
1142                    self.buffer3.capacity() * 2,
1143                );
1144
1145                self.buffer3.grow(new_size);
1146            }
1147
1148            // self.buffer2.borrow_mut().extend_from_slice(data2);
1149            // println!("buffer: {:?} , datalen: {:?}",self.buffer3.available_space(),data2.len());
1150            self.buffer3.write_all(data2).unwrap();
1151            // self.buffer3.spa
1152            amt = data2.len();
1153        }
1154        self.bufreader.consume(amt);
1155
1156        Ok(newread)
1157    }
1158
1159    // rust is not yet smart about loops, nll, structs, conditional lifetimes
1160
1161    pub fn read_event<'a>(&'a mut self) -> SaxResult<xml_sax::Event<'a>> {
1162        self.buffer3.consume(self.offset);
1163        // self.buffer2.borrow_mut().drain(0..self.offset);
1164        self.offset = 0;
1165        // {
1166        //     let vec1;
1167        //     {
1168        //         vec1 = self.buffer2.borrow_mut().split_off(self.offset)
1169        //     }
1170        //     // let mut buf = self.buffer2.borrow_mut();
1171        //     *self.buffer2.borrow_mut() = vec1;
1172        // }
1173
1174        self.strbuffer.clear();
1175        // read_data_splitted(&mut self.bufreader, &mut self.buffer2)?;
1176        // let event1;
1177
1178        let mut bytes_read: usize = 1; //magic number
1179
1180        // if self.bufreader.capacity() > self.buffer2.borrow().len() {
1181        if self.buffer3.available_space() > self.bufreader.capacity() {
1182            bytes_read = self.read_data()?;
1183        }
1184
1185        let mut read_more_data = false;
1186        loop {
1187            if read_more_data {
1188                // read_data_splitted(&mut self.bufreader, &mut self.buffer2.borrow_mut())?;
1189                bytes_read = self.read_data()?;
1190                read_more_data = false;
1191            } else {
1192                let res =
1193                    read_event_splitted(self.state, &self.bufreader, &self.buffer3, self.offset);
1194                match res {
1195                    Ok(o) => {
1196                        self.state = o.1;
1197                        self.offset = o.2;
1198
1199                        // event1 = o.0;
1200
1201                        let event = event_converter(
1202                            self.state,
1203                            o.0,
1204                            &self.buffer3,
1205                            &mut self.element_list,
1206                            &mut self.strbuffer,
1207                            &mut self.namespace_strbuffer,
1208                            &mut self.namespace_list,
1209                            self.is_namespace_aware,
1210                            self.element_level,
1211                            &mut self.element_strbuffer,
1212                            &mut self.attribute_list,
1213                        );
1214                        match event {
1215                            Ok(tpl) => {
1216                                self.state = tpl.1;
1217                                self.element_level = tpl.2;
1218
1219                                return Ok(tpl.0);
1220                            }
1221                            Err(err) => return Err(err),
1222                        };
1223                    }
1224                    Err(error::Error::UnexpectedEof) => {
1225                        //try reading again
1226                        // read_data_splitted_refcell(&mut self.bufreader, &self.buffer2)?;
1227                        if bytes_read == 0 {
1228                            return Err(error::Error::UnexpectedEof);
1229                        } else {
1230                            read_more_data = true;
1231                        }
1232                    }
1233                    Err(err) => {
1234                        //todo check eof increase internal buffer.
1235                        return Err(err);
1236                    }
1237                }
1238            }
1239        }
1240    }
1241}
1242
/// Smoke test: feeds a small nested document through the parser and prints
/// every result until `EndDocument` or the first error is reached.
#[test]
fn test_parser1() {
    let xml = r#"<root><A a='x'>
    <B b="val" a:b12='val2' ><C/></B> </A> </root>"#;

    let mut parser = Parser::from_reader(xml.as_bytes());
    loop {
        let outcome = parser.read_event();
        println!("{:?}", outcome);

        // Every other event is simply skipped over.
        if matches!(outcome, Ok(xml_sax::Event::EndDocument) | Err(_)) {
            break;
        }
    }
}