Skip to main content

xrust/parser/
mod.rs

1/*!
2A parser combinator, inspired by nom.
3
4This parser combinator passes a context into the function, which includes the string being parsed. This supports resolving context-based constructs such as general entities and XML Namespaces.
5*/
6
7use crate::externals::URLResolver;
8use crate::item::Node;
9use crate::xdmerror::{Error, ErrorKind};
10use crate::xmldecl::DTD;
11use qualname::{NamespaceMap, NamespacePrefix, NamespaceUri};
12use std::collections::HashSet;
13use std::fmt;
14use std::rc::Rc;
15
16pub mod avt;
17pub mod combinators;
18pub(crate) mod common;
19pub mod xml;
20pub mod xpath;
21
22pub mod datetime;
23
24#[allow(type_alias_bounds)]
25pub type ParseInput<'a, N: Node> = (&'a str, ParserState<N>);
26
27#[allow(type_alias_bounds)]
28pub type ParseResult<'a, N: Node, Output> = Result<(ParseInput<'a, N>, Output), ParseError>;
29
/// Errors produced by the parser combinators.
///
/// [`ParseError::Combinator`] only signals that a parser did not match and is
/// not necessarily serious. Every other variant should be returned to the caller.
//
// Variants sketched but currently disabled:
//   InvalidChar { row: usize, col: usize }
//   MissingClosingElement { row: usize, col: usize, element: String }
//   IncorrectClosingElement { row: usize, col: usize, open: String, close: String }
//   Unknown { row: usize, col: usize }
#[derive(Clone, Debug, PartialEq)]
pub enum ParseError {
    /// A combinator did not match; not a serious error.
    Combinator(String),
    /// Reported with the row and column of the problem.
    MissingGenEntity { row: usize, col: usize },
    /// Reported with the row and column of the problem.
    MissingParamEntity { row: usize, col: usize },
    /// Reported with the row and column of the problem.
    EntityDepth { row: usize, col: usize },
    /// Reported with the row and column of the problem.
    Validation { row: usize, col: usize },
    MissingNameSpace,
    IncorrectArguments,
    /// An unexpected character has been encountered.
    NotWellFormed(String),
    /// An attribute has been declared more than once.
    DuplicateAttribute(String),
    Unbalanced,
    Notimplemented,
    ExtDTDLoadError,
    NSResolveError(String),
    IDError(String),
}
55
56/// Parser state configuration that cannot be cloned.
57/// Also state that needs to be persistent during parsing.
58pub struct StaticState<L>
59where
60    L: FnMut(&NamespacePrefix) -> Result<NamespaceUri, ParseError>,
61{
62    // Tracking ID-type attributes
63    ids_read: HashSet<String>,
64    ids_pending: HashSet<String>,
65
66    /*
67       A method for resolving a prefix to a namespace URI.
68    */
69    pub namespace: Option<L>,
70
71    /* entity downloader function */
72    pub ext_dtd_resolver: Option<URLResolver>,
73}
74
75impl<L> StaticState<L>
76where
77    L: FnMut(&NamespacePrefix) -> Result<NamespaceUri, ParseError>,
78{
79    pub fn new() -> Self {
80        Self {
81            namespace: None,
82            ext_dtd_resolver: None,
83            ids_read: Default::default(),
84            ids_pending: Default::default(),
85        }
86    }
87    pub fn resolve(&self, locdir: Option<String>, uri: String) -> Result<String, Error> {
88        self.ext_dtd_resolver.map_or(
89            Err(Error::new(
90                ErrorKind::Unknown,
91                "No external DTD resolver provided.".to_string(),
92            )),
93            |e| e(locdir, uri),
94        )
95    }
96}
97
98pub struct StaticStateBuilder<L: FnMut(&NamespacePrefix) -> Result<NamespaceUri, ParseError>>(
99    StaticState<L>,
100);
101
102impl<L> Default for StaticStateBuilder<L>
103where
104    L: FnMut(&NamespacePrefix) -> Result<NamespaceUri, ParseError>,
105{
106    fn default() -> Self {
107        Self::new()
108    }
109}
110
111impl<L> StaticStateBuilder<L>
112where
113    L: FnMut(&NamespacePrefix) -> Result<NamespaceUri, ParseError>,
114{
115    pub fn new() -> Self {
116        StaticStateBuilder(StaticState::new())
117    }
118    pub fn namespace(mut self, n: L) -> Self {
119        self.0.namespace = Some(n);
120        self
121    }
122    pub fn dtd_resolver(mut self, r: URLResolver) -> Self {
123        self.0.ext_dtd_resolver = Some(r);
124        self
125    }
126    pub fn build(self) -> StaticState<L> {
127        self.0
128    }
129}
130
131/// Parser state that can be cloned
132#[derive(Clone)]
133pub struct ParserState<N: Node> {
134    // Document node to use to create nodes
135    doc: Option<N>,
136    // Element to use to determine in-scope namespaces
137    cur: Option<N>,
138
139    // Reference-counted with copy-on-write: cloning ParserState (which the
140    // combinators do on every `alt` alternative) is then a refcount bump rather
141    // than a deep copy of these HashMap-heavy structures. Mutations go through
142    // Rc::make_mut, which copies only when the value is actually shared.
143    dtd: Rc<DTD>,
144    // Do we add DTD specified attributes or not
145    attr_defaults: bool,
146
147    // The in-scope namespace declarations.
148    // This will be reset when the parsing context changes
149    pub(crate) in_scope_namespaces: Rc<NamespaceMap>,
150
151    /*
152      ID tracking:
153      ids_read covers all IDs for duplicate checking. Where an IDREF is found and the ID is not
154      yet encountered, we pull into StaticState::ids_pending and will review those when we have finished
155      parsing the document.
156    */
157    id_tracking: bool,
158
159    standalone: bool,
160    xmlversion: String,
161    /*
162    The below will track Entity Expansion, ensuring that there are no recursive entities and
163    some protections from zip bombs
164     */
165    maxentitydepth: usize,
166    currententitydepth: usize,
167    /* eventual error location reporting */
168    currentcol: usize,
169    currentrow: usize,
170    /* For tracking down stack overflows */
171    //stack: Vec<String>,
172    //limit: Option<usize>,
173    ext_entities_to_parse: Vec<String>,
174    docloc: Option<String>,
175    /*
176    ParamEntities are not allowed in internal subsets, but they are allowed in external DTDs,
177    so we need to track when we are currently in the main document or outside it.
178     */
179    currentlyexternal: bool,
180}
181
182impl<N: Node> Default for ParserState<N> {
183    fn default() -> Self {
184        Self::new()
185    }
186}
187
188impl<N: Node> ParserState<N> {
189    pub fn new() -> Self {
190        ParserState {
191            doc: None,
192            cur: None,
193            dtd: Rc::new(DTD::new()),
194            standalone: false,
195            xmlversion: "1.0".to_string(), // Always assume 1.0
196            in_scope_namespaces: Rc::new(NamespaceMap::new()),
197            id_tracking: true,
198            maxentitydepth: 8,
199            attr_defaults: true,
200            currententitydepth: 1,
201            currentcol: 1,
202            currentrow: 1,
203            //stack: vec![],
204            //limit: None,
205            ext_entities_to_parse: vec![],
206            docloc: None,
207            currentlyexternal: false,
208        }
209    }
210
211    /// Get the result document
212    pub fn doc(&self) -> Option<N> {
213        self.doc.clone()
214    }
215    /// Get the current node
216    pub fn current(&self) -> Option<N> {
217        self.cur.clone()
218    }
219}
220
221impl<N: Node> PartialEq for ParserState<N> {
222    fn eq(&self, _: &ParserState<N>) -> bool {
223        true
224    }
225}
226
227impl<N: Node> fmt::Debug for ParserState<N> {
228    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
229        f.debug_struct("ParserState").finish()
230    }
231}
232
233pub struct ParserStateBuilder<N: Node>(ParserState<N>);
234
235impl<N: Node> Default for ParserStateBuilder<N> {
236    fn default() -> Self {
237        Self::new()
238    }
239}
240
241impl<N: Node> ParserStateBuilder<N> {
242    pub fn new() -> Self {
243        ParserStateBuilder(ParserState::new())
244    }
245    pub fn doc(mut self, d: N) -> Self {
246        self.0.doc = Some(d);
247        self
248    }
249    pub fn current(mut self, d: N) -> Self {
250        self.0.cur = Some(d);
251        self
252    }
253    pub fn dtd(mut self, d: DTD) -> Self {
254        self.0.dtd = Rc::new(d);
255        self
256    }
257    pub fn attribute_defaults(mut self, a: bool) -> Self {
258        self.0.attr_defaults = a;
259        self
260    }
261    pub fn in_scope_namespaces(mut self, nsm: NamespaceMap) -> Self {
262        self.0.in_scope_namespaces = Rc::new(nsm);
263        self
264    }
265    pub fn id_tracking(mut self, a: bool) -> Self {
266        self.0.id_tracking = a;
267        self
268    }
269    pub fn standalone(mut self, a: bool) -> Self {
270        self.0.standalone = a;
271        self
272    }
273    pub fn currently_external(mut self, a: bool) -> Self {
274        self.0.currentlyexternal = a;
275        self
276    }
277    pub fn xml_version(mut self, x: String) -> Self {
278        self.0.xmlversion = x;
279        self
280    }
281    pub fn maximum_entity_depth(mut self, d: usize) -> Self {
282        self.0.maxentitydepth = d;
283        self
284    }
285    pub fn document_location(mut self, l: String) -> Self {
286        self.0.docloc = Some(l);
287        self
288    }
289    pub fn build(self) -> ParserState<N> {
290        self.0
291    }
292}