rosie/
lib.rs

1#![crate_name = "rosie"]
2
3#![doc(html_logo_url = "https://rosie-lang.org/images/rosie-circle-blog.png")]
4//Q-06.04, Can Jamie host a version of this logo that doesn't have a border?  i.e. just the circle occupying the whole frame, with an alpha-channel so the corners are transparent
5
6#![doc = include_str!("../README.md")]
7
8use core::mem::swap;
9use core::fmt;
10use core::fmt::Display;
11use std::str;
12use std::convert::{TryFrom, TryInto};
13use std::path::{Path, PathBuf};
14use std::fs;
15use std::sync::Mutex;
16use std::cell::UnsafeCell;
17
18use linked_hash_map::LinkedHashMap;
19
20use once_cell::sync::Lazy; // TODO: As soon as std::sync::SyncLazy is pushed to stable, we will migrate there and eliminate this dependency
21
22extern crate rosie_sys;
23use rosie_sys::{
24    RosieString,
25    rosie_new_string,
26    rosie_home_default,
27    rosie_home_init,
28    rosie_free_rplx,
29};
30
31//Private Internal code for managing most calls into librosie
32mod sys_wrapper;
33use sys_wrapper::{*};
34
35//Public re-exports
36mod sys_shadow; //Shadow implementations of things from rosie_sys::
37pub use sys_shadow::RosieError; //pub use rosie_sys::RosieError;
38/// An Encoder module used to format the results, when using [Pattern::raw_match]
39pub use rosie_sys::MatchEncoder;
40/// A structure containing the match results from a [Pattern::raw_match] call.
41/// 
42/// **NOTE**: A RawMatchResult points to memory inside the engine that is associated with the pattern, therefore you may
43/// not perform any additional matching with that pattern until the RawMatchResult has been released.  This is enforced with
44/// borrowing semantics [Pattern::raw_match].
45pub use rosie_sys::RawMatchResult;
46/// A format for debugging output, to be used with [Pattern::trace]
47pub use rosie_sys::TraceFormat;
48
49/// Functionality to access [RosieEngine]s directly
50/// 
51/// The majority of use cases don't require direct access to a RosieEngine.  However, this module can be used to:
52/// - Create multiple simultaneous engines
53/// - Change the environment (Standard Pattern Library or config)
54/// - Explicitly load rpl packages 
55/// - Constrain memory usage
56/// 
57/// ## Example Usage
58/// ```
59/// use rosie::*;
60/// let mut engine = engine::RosieEngine::new(None).unwrap();
61/// engine.import_pkg("date", None, None);
62/// 
63/// let date_pat = engine.compile("date.us_long", None).unwrap();
64/// assert!(date_pat.match_str::<bool>("Saturday, Nov 5, 1955").unwrap());
65/// ```
66/// 
67pub mod engine {
68    pub use crate::sys_wrapper::RosieEngine;
69}
70
71/// Alternative Rosie entry points that support sharing between threads
72/// 
73/// [RosieEngine]s and compiled [Pattern]s can't be accessed by other threads, so this module
74/// implements versions that can, but the tradeoff is runtime performance costs due to locking
75/// and copying the results to private buffers
76/// 
77/// NOTE: I feel like this is a short-term solution.  Longer term, I envision a RosieEngine
78/// being separated into two objects; a compiler and a matching engine.  Then the matching engine
79/// could be fully reentrant and lock-free, and new compiled patterns could be added atomically
80/// to a matching engine, obviating the need for everything in the `thread_portable` module.
81/// 
82/// I'd like to discuss this direction with Jamie, and possibly even an implementation of the
83/// matching engine in safe Rust.
84pub mod thread_portable;
85
86use thread_portable::PortablePattern;
87
88//The number of compiled patterns in the pattern cache
89const PATTERN_CACHE_SIZE: usize = 8;
90
91//Global to track the state of librosie
92static LIBROSIE_INITIALIZED: Lazy<Mutex<bool>> = Lazy::new(|| Mutex::new(false));
93
94//Global per-thread singleton engine and pattern cache
95struct ThreadLocals {
96    engine : RosieEngine,
97    pattern_cache : LinkedHashMap<String, Pattern>
98}
99thread_local!{
100    //TODO: Waiting for the stabilization of `#[thread_local]` attribute so we can get rid of this UnsafeCell
101    // Don't want to pay the price of a RefCell for no reason
102    // https://github.com/rust-lang/rust/issues/29594
103    static THREAD_LOCALS : UnsafeCell<ThreadLocals> = UnsafeCell::new(ThreadLocals::new())
104}
105
106impl ThreadLocals {
107    fn new() -> Self {
108        Self {
109            engine : {
110                let mut messages = RosieMessage::empty();
111                if let Ok(engine) = RosieEngine::new(Some(&mut messages)) {
112                    engine
113                } else {
114                    panic!("ERROR Creating RosieEngine: {}", messages.as_str())
115                }
116            },
117            pattern_cache : LinkedHashMap::with_capacity(PATTERN_CACHE_SIZE)
118        }
119    }
120}
121
122/// A buffer to obtain text from Rosie.
123/// 
124/// The contents of the buffer depend on the situation under which it is returned.
125/// Sometimes the returned text is formatted as JSON and other times it is a human-readable message.
126/// 
127/// # Example: Getting a message from the expression compiler
128/// ```
129/// # use rosie::*;
130/// let mut engine = engine::RosieEngine::new(None).unwrap();
131/// let mut message = RosieMessage::empty();
132/// engine.compile("invalid pattern", Some(&mut message));
133/// println!("{}", message);
134/// ```
135#[derive(Debug)]
136pub struct RosieMessage(RosieString<'static>);
137
138//For some strings, we are responsible for freeing any string buffers, even if librosie allocated them
139impl Drop for RosieMessage {
140    fn drop(&mut self) {
141        self.0.manual_drop();
142    }
143}
144
145impl RosieMessage {
146    /// Creates an empty RosieMessage.  Used to allocate a location into which another function may write output.
147    pub fn empty() -> Self {
148        Self(RosieString::empty())
149    }
150    /// Creates a new RosieMessage by copying the contents of the argument &[str](std::str) into the newly created RosieMessage.
151    pub fn from_str(s: &str) -> Self {
152        let rosie_string = unsafe { rosie_new_string(s.as_ptr(), s.len()) };
153        Self(rosie_string)
154    }
155    pub fn from_bytes(b: &[u8]) -> Self {
156        let rosie_string = unsafe { rosie_new_string(b.as_ptr(), b.len()) };
157        Self(rosie_string)
158    }
159    /// Returns `true` if the RosieMessage contains text.  Returns `false` if it is empty.
160    pub fn is_valid(&self) -> bool {
161        self.0.is_valid()
162    }
163    /// Borrows the RosieMessage contents as a slice of bytes.  If the RosieMessage is empty the resulting slice will have a length of zero.
164    pub fn as_bytes(&self) -> &[u8] {
165        self.0.as_bytes()
166    }
167    /// Borrows the RosieMessage contents as a &[str](std::str).  If the RosieMessage is empty the result will have a length of zero.
168    pub fn as_str(&self) -> &str {
169        self.0.as_str()
170    }
171    /// Same as [as_str](RosieMessage::as_str) but won't panic
172    pub fn try_as_str(&self) -> Option<&str> {
173        self.0.try_as_str()
174    }
175    /// Returns the length, in bytes, of the contents of the RosieMessage.
176    pub fn len(&self) -> usize {
177        self.0.len()
178    }
179}
180
181impl Display for RosieMessage {
182    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183        self.0.fmt(f)
184    }
185}
186
187/// The interface to top-level rosie functionality
188pub struct Rosie;
189
190impl Rosie {
191    /// Matches the specified `expression` in the specified `input` bytes.
192    /// 
193    /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
194    /// 
195    /// Compiled patterns are managed automatically using a least-recently-used cache and are recompiled as needed.
196    /// 
197    /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
198    /// If you need the fastest possible performance calling this method to return a [bool] will use the
199    /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
200    pub fn match_bytes<'input, T>(expression : &str, input : &'input [u8]) -> T 
201    where T : MatchOutput<'input> {
202        THREAD_LOCALS.with(|locals_cell| {
203
204            //TODO: Get rid of UnsafeCell.  See note near declaration of THREAD_LOCALS.
205            let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
206
207            //See if we have the expression in our pattern cache
208            let mut pat = if let Some(existing_pat) = locals.pattern_cache.remove(expression) {
209                existing_pat
210            } else {
211                //If we don't have the expression, make sure there is space for it in the cache
212                if locals.pattern_cache.len() > PATTERN_CACHE_SIZE-1 {
213                    //Toss out the least-recently-added item
214                    let _ = locals.pattern_cache.pop_front();
215                }
216
217                //And compile the expression
218                locals.engine.import_expression_deps(expression, None).unwrap();
219                locals.engine.compile(expression, None).unwrap()
220            };
221
222            //Call the return-type-specific match call
223            let result = T::match_bytes(&mut pat, input).unwrap();
224
225            //Put the pattern back on the top of the LRU stack
226            locals.pattern_cache.insert(expression.to_string(), pat);
227
228            result
229        })
230    }
231    /// Matches the specified `expression` in the specified `input` str.
232    /// 
233    /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
234    /// 
235    /// Compiled patterns are managed automatically using a least-recently-used cache and are recompiled as needed.
236    /// 
237    /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
238    /// If you need the fastest possible performance calling this method to return a [bool] will use the
239    /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
240    pub fn match_str<'input, T>(expression : &str, input : &'input str) -> T 
241    where T : MatchOutput<'input> {
242        Self::match_bytes(expression, input.as_bytes())
243    }
244    /// Compiles the specified expression, returning a `Pattern` that can then be used to match that expression.
245    /// 
246    /// The expression may be either the name of a previously loaded `rpl` pattern, or it may be a raw `rpl` expression.
247    /// 
248    /// **NOTE**: This method is high-level.  If you want more control, performance, or feedback, see [RosieEngine::compile].
249    /// 
250    /// - This method automatically evaluates the expression for dependencies and automatically loads any dependencies it
251    /// finds, while RosieEngine::compile skips the dependency analysis
252    /// - This method's returned `Pattern` will be hosted by the thread's default engine.  RosieEngine::compile allows you
253    /// to host the Pattern on another engine
254    /// - This method doesn't provide any compile warnings or errors.  To debug a compilation failure, call
255    /// RosieEngine::compile
256    /// 
257    /// # Examples
258    /// ```
259    /// # use rosie::*;
260    /// let date_pat = Rosie::compile("date.us_long").unwrap();
261    /// ```
262    /// 
263    /// ```
264    /// # use rosie::*;
265    /// let two_digit_year_pat = Rosie::compile("{[012][0-9]}").unwrap();
266    /// ```
267    /// 
268    pub fn compile(expression : &str) -> Result<Pattern, RosieError> {
269        THREAD_LOCALS.with(|locals_cell| {
270            
271            //TODO: Get rid of UnsafeCell.  See note near declaration of THREAD_LOCALS.
272            let locals : &ThreadLocals = unsafe{ &*locals_cell.get() };
273
274            locals.engine.import_expression_deps(expression, None)?;
275            locals.engine.compile(expression, None)
276        })
277    }
278    /// Sets a custom location for the rosie_home path used for support scripts and the default Standard Pattern Library. 
279    /// 
280    /// **WARNING** This method must be called before any other rosie calls, or it will not be sucessful
281    pub fn set_rosie_home_path<P: AsRef<Path>>(path: P) {
282        librosie_init(Some(path))
283    }
284    /// Returns the thread's default RosieEngine, replacing it with a newly initialized engine
285    /// 
286    /// NOTE: This will clear the compiled pattern cache used by [Rosie::match_str]
287    /// 
288    pub fn take_thread_default_engine() -> RosieEngine {
289        THREAD_LOCALS.with(|locals_cell| {
290
291            //TODO: Get rid of UnsafeCell.  See note near declaration of THREAD_LOCALS.
292            let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
293
294            let mut new_locals = ThreadLocals::new();
295            swap(&mut new_locals, locals);
296            new_locals.engine
297        })
298    }
299    /// Replaces the thread's default RosieEngine with the engine supplied
300    /// 
301    /// NOTE: This will clear the compiled pattern cache used by [Rosie::match_str]
302    /// 
303    pub fn replace_thread_default_engine(engine : RosieEngine) {
304        THREAD_LOCALS.with(|locals_cell| {
305
306            //TODO: Get rid of UnsafeCell.  See note near declaration of THREAD_LOCALS.
307            let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
308
309            locals.engine = engine;
310            locals.pattern_cache = LinkedHashMap::with_capacity(PATTERN_CACHE_SIZE);
311        })
312    }
313}
314
315/// Implemented for types that can be returned by a match operation
316pub trait MatchOutput<'a> : Sized {
317    fn match_bytes(pat : &Pattern, input : &'a [u8]) -> Result<Self, RosieError>;
318    fn match_bytes_portable(pat : &PortablePattern, input : &'a [u8]) -> Result<Self, RosieError>;
319}
320
321impl MatchOutput<'_> for bool {
322    fn match_bytes(pat : &Pattern, input : &[u8]) -> Result<Self, RosieError> {
323        //NOTE: we're calling directly into the engine because we want to bypass the requirement for a &mut self in Pattern::raw_match.
324        // That &mut is just there to ensure we have an exclusive borrow, so subsequent calls don't match the same compiled pattern and
325        // collide with the pattern's buffer in the engine.
326        let raw_match_result = pat.engine.0.match_pattern_raw(pat.id, 1, input, &MatchEncoder::Status).unwrap();
327        Ok(raw_match_result.did_match())
328    }
329    fn match_bytes_portable(pat : &PortablePattern, input : &[u8]) -> Result<Self, RosieError> {
330        let guard = pat.engine.0.lock().unwrap();
331        let raw_match_result = guard.match_pattern_raw(pat.id, 1, input, &MatchEncoder::Status).unwrap();
332        Ok(raw_match_result.did_match())
333    }
334}
335
//IMPLEMENTATION NOTE: I chose to delete the `String` implementation because the common case for MatchResult, i.e. the case where the
// pattern is not a constant-capture pattern, means that the matched string is just a slice referencing the input.  But the `String`
// implementation forces a copy in every situation.  So we don't want to accidentally direct people to the slow-path by making it convenient.
339//
340//In a perfect world, I would like there to be an implementation for `&'a str`, but the problem with that is constant-capture patterns
341// point to data that isn't in the input.  I went pretty far into a change that got rid of the MaybeOwnedBytes inside of MatchResult,
342// in order to implement `MatchResult::into_str`, but that meant constant-capture patterns (and therefore all MatchResults) needed to
343// borrow the engine buffer associated with the pattern (like a RawMatchResult does).  This is unworkable because of the pattern cache.
344//
345// impl <'a>MatchOutput<'a> for String {
346//     fn match_str(pat : &Pattern, input : &'a str) -> Result<Self, RosieError> {
347//         let match_result = pat.engine.match_pattern(pat.id, 1, input)?;
348//         Ok(match_result.matched_str().to_string())
349//     }
350// }
351
352impl <'a>MatchOutput<'a> for MatchResult<'a> {
353    fn match_bytes(pat : &Pattern, input : &'a [u8]) -> Result<Self, RosieError> {
354        pat.engine.0.match_pattern(pat.id, 1, input)
355    }
356    fn match_bytes_portable(pat : &PortablePattern, input : &'a [u8]) -> Result<Self, RosieError> {
357        pat.engine.0.lock().unwrap().match_pattern(pat.id, 1, input)
358    }
359}
360
361//Private function to make sure librosie is initialized and initialize it if it isn't
362//Internal NOTE: This function is responsible for internal librosie initialization, so it is also called by RosieEngine::new()
363fn librosie_init<P: AsRef<Path>>(path: Option<P>) {
364
365    //Get the global status var, or block until we can get it
366    let mut init_status = LIBROSIE_INITIALIZED.lock().unwrap();
367
368    //If librosie isn't initialized yet, then initialize it
369    if !(*init_status) {
370
371        let mut did_init = false;
372
373        //Decide among the different paths we might use
374        let dir_path = if let Some(dir_path) = path {
375            Some(PathBuf::from(dir_path.as_ref())) // If we were passed a path, use it.
376        } else {
377            if let Some(default_path_str) = rosie_home_default() {
378                Some(PathBuf::from(str::from_utf8(default_path_str).unwrap())) //We will pass the path compiled into our binary
379            } else {
380                None //We will let librosie try to find it
381            }
382        };
383
384        //Make sure the path is a valid directory before calling rosie_home_init(),
385        // because rosie_home_init() will buy whatever we're selling, even if it's garbage
386        if let Some(dir_path) = dir_path {
387            if let Ok(dir_metadata) = fs::metadata(&dir_path) {
388                if dir_metadata.is_dir() {
389                    let mut message_buf = RosieString::empty();
390                    unsafe{ rosie_home_init(&RosieString::from_str(&dir_path.to_str().unwrap()), &mut message_buf) };
391                    did_init = true;
392                }
393            }
394        }
395        
396        *init_status = did_init;
397    }
398}
399
400/// The compiled form of an RPL expression, used for matching
401/// 
402/// A Pattern can be created by either [Rosie::compile] or [RosieEngine::compile].
403/// 
404/// **Performance NOTE**: Compiling a pattern is hundreds of times more expensive than a typical matching operation.
405/// Therefore it is usually worthwhile to retain compiled patterns rather than allowing them to be dropped and
406/// recompiling them when needed.
407/// 
408//TODO: Add API to load compiled patterns, without needing to go via RPL, when it is supported by librosie.
409//
410//INTERNAL NOTE: Pattern doesn't implement Clone because a RawMatchResult holds a pointer to a buffer inside the
411// engine, for which there is one-per-pattern.  If a pattern could be cloned, we could end up invalidating the
412// memory out from under a RawMatchResult.
413pub struct Pattern {
414    engine : RosieEngine,
415    id : i32
416}
417
418impl Drop for Pattern {
419    fn drop(&mut self) {
420        unsafe { rosie_free_rplx(self.engine.0.0, self.id) };
421    }
422}
423
424impl Pattern {
425        
426    /// Matches the `Pattern` in the specified `input` bytes string.
427    /// 
428    /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
429    /// 
430    /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
431    /// If you need the fastest possible performance calling this method to return a [bool] will use the
432    /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
433    pub fn match_bytes<'input, T>(&self, input : &'input [u8]) -> Result<T, RosieError> 
434    where T : MatchOutput<'input> {
435        //Call the return-type-specific match call
436        T::match_bytes(self, input)
437    }
438    
439    /// Matches the `Pattern` in the specified `input` str.
440    /// 
441    /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
442    /// 
443    /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
444    /// If you need the fastest possible performance calling this method to return a [bool] will use the
445    /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
446    pub fn match_str<'input, T>(&self, input : &'input str) -> Result<T, RosieError> 
447    where T : MatchOutput<'input> {
448        self.match_bytes(input.as_bytes())
449    }
450
451    /// Matches the `Pattern` in the specified `input` string, beginning from the `start` index, using the specified `encoder`.
452    /// 
453    /// Returns a [RawMatchResult] or an error code if a problem was encountered.  This is a lower-level API than [match_str](Pattern::match_str),
454    /// and the primary reason to use it is to get the output from a particular [MatchEncoder].  For example, the [JSON](MatchEncoder::JSON) or [JSONPretty](MatchEncoder::JSONPretty) encoders.
455    /// 
456    /// **NOTE**: The returned [RawMatchResult] takes a mutable borrow of the `Pattern` because it references internal data
457    /// associated with the `Pattern`.  Therefore the `Pattern` cannot be accessed while the RawMatchResult is in use; copying
458    /// the data from the RawMatchResult will allow the `Pattern` to be released.
459    /// 
460    /// **NOTE**: The values for `start` are 1-based.  Meaning passing 1 will begin the match from the beginning of the input, and
461    /// passing 0 (zero) is an error.
462    /// 
463    /// # Example using the JSON encoder with serde_json
464    /// ```
465    /// extern crate serde_json;
466    /// use serde::{*};
467    /// use rosie::*;
468    /// 
469    /// #[derive(Debug, Deserialize)]
470    /// struct JSONMatchResult {
471    ///     #[serde(rename = "type")]
472    ///     pat_name : String, // The pattern that was matched
473    ///     #[serde(rename = "s")]
474    ///     start : usize, // The offset of the start of the match in the input buffer
475    ///     #[serde(rename = "e")]
476    ///     end : usize, // The offset of the end of the match in the input buffer
477    ///     data : String, // The matched text, copied from the input buffer
478    ///     #[serde(default = "Vec::new")]
479    ///     subs : Vec<JSONMatchResult> // The sub-matches within the pattern
480    /// }
481    /// 
482    /// let mut date_pat = Rosie::compile("date.any").unwrap();
483    /// let raw_result = date_pat.raw_match(1, b"Sat Nov 5, 1955", &MatchEncoder::JSON).unwrap();
484    /// let parsed_result : JSONMatchResult = serde_json::from_slice(raw_result.as_bytes()).unwrap();
485    /// ```
486    /// 
487    pub fn raw_match<'pat>(&'pat mut self, start : usize, input : &[u8], encoder : &MatchEncoder) -> Result<RawMatchResult<'pat>, RosieError> {
488        self.engine.0.match_pattern_raw(self.id, start, input, encoder)
489    }
490
491    /// Traces a pattern match, providing information useful for debugging the pattern expression.
492    /// 
493    /// Returns a bool indicating whether the specified pattern matched the input.  The caller must allocate an empty [RosieMessage]
494    /// in order to receive the resulting trace information.
495    /// 
496    /// The caller must also pass a [TraceFormat], to specify the format for the resulting information.
497    /// [Condensed](TraceFormat::Condensed) is the most human-readable format, but a other formats may contain more complete
498    /// information or be easier to automatically parse.
499    /// 
500    /// # Example
501    /// ```
502    /// # use rosie::*;
503    /// let date_pat = Rosie::compile("date.any").unwrap();
504    /// 
505    /// let mut trace = RosieMessage::empty();
506    /// let did_match = date_pat.trace(1, "Sat. Nov. 5, 1955", TraceFormat::Condensed, &mut trace).unwrap();
507    /// println!("{}", trace.as_str());
508    /// ```
509    ///
510    pub fn trace(&self, start : usize, input : &str, format : TraceFormat, trace : &mut RosieMessage) -> Result<bool, RosieError> {
511        self.engine.0.trace_pattern(self.id, start, input, format, trace)
512    }
513}
514
//Bytes that are either held in an owned Vec<u8> or borrowed as an &[u8].  Modeled on maybe_owned::MaybeOwned.
#[derive(Debug)]
enum MaybeOwnedBytes<'a> {
    Owned(Vec<u8>),
    Borrowed(&'a [u8]),
}

impl MaybeOwnedBytes<'_> {
    //Views the bytes as a &str, or returns None if they aren't valid UTF-8
    pub fn try_as_str(&self) -> Option<&str> {
        match str::from_utf8(self.as_bytes()) {
            Ok(text) => Some(text),
            Err(_) => None,
        }
    }
    //Views the bytes as a slice, regardless of which variant holds them
    pub fn as_bytes(&self) -> &[u8] {
        match *self {
            Self::Owned(ref owned_bytes) => owned_bytes.as_slice(),
            Self::Borrowed(borrowed_bytes) => borrowed_bytes,
        }
    }
}
533
534/// Represents the results of a match operation, performed by [Pattern::match_str] or [Rosie::match_str]
535/// 
536//**TODO** I feel like a more caller-friendly interface is possible; i.e. the ability to specify sub-patterns with a "path"
537#[derive(Debug)]
538pub struct MatchResult<'a> {
539    pat_name : String,
540    start : usize,
541    end : usize,
542    data : MaybeOwnedBytes<'a>,
543    subs : Vec<MatchResult<'a>>
544}
545
546impl <'a>MatchResult<'a> {
547
548    //This method is a port from the python code here: https://gitlab.com/rosie-community/clients/python/-/blob/master/rosie/decode.py
549    fn from_bytes_buffer<'input>(input : &'input [u8], match_buffer : &mut &[u8], existing_start_pos : Option<usize>) -> MatchResult<'input> {
550
551        //If we received a start position, it is because we are in the middle of a recursive call stack
552        let start_position = match existing_start_pos {
553            Some(start_position) => start_position,
554            None => {
555                //Otherwise, Read the first 4 bytes, interpret them as a signed little-endian 32 bit integer,
556                //  and then negate them to get the start position
557                let (start_pos_chars, remainder) = match_buffer.split_at(4);
558                *match_buffer = remainder;
559                let signed_start_pos = i32::from_le_bytes(start_pos_chars.try_into().unwrap());
560                assert!(signed_start_pos < 0);
561                usize::try_from(signed_start_pos * -1).unwrap()
562            }
563        };
564        
565        //Read the next 2 bytes, interpret them as a signed little-endian 16 but integer,
566        let (type_len_chars, remainder) = match_buffer.split_at(2);
567        *match_buffer = remainder;
568        let mut type_len = i16::from_le_bytes(type_len_chars.try_into().unwrap()); //The length of the pattern name
569
570        //constant-capture means data is a user-provided string, (i.e. a string from the encoder)
571        //Otherwise regular-capture means data is a subset of the input string
572        let constant_capture = if type_len < 0 {
573            type_len = type_len * -1;
574            true
575        } else {
576            false
577        };
578        
579        //Read type_len characters, intperpreting it as the pattern name
580        let (type_name_chars, remainder) = match_buffer.split_at(usize::try_from(type_len).unwrap());
581        *match_buffer = remainder;
582        let pattern_name = String::from_utf8(type_name_chars.to_vec()).unwrap();
583
584        //Get the data out of the match_buffer, or the input string, depending on whether the pattern is "constant-capture" or not
585        let mut data = if constant_capture {
586            let (data_len_chars, remainder) = match_buffer.split_at(2);
587            *match_buffer = remainder;
588            let data_len = i16::from_le_bytes(data_len_chars.try_into().unwrap()); //The length of the data name
589            assert!(data_len >= 0);
590
591            let (data_chars, remainder) = match_buffer.split_at(usize::try_from(data_len).unwrap());
592            *match_buffer = remainder;
593            MaybeOwnedBytes::Owned(data_chars.to_vec())
594        } else {
595            let (_, match_data) = input.split_at(start_position-1);
596            MaybeOwnedBytes::Borrowed(match_data)
597        };
598
599        //The empty array for our sub-patterns.
600        let mut subs = Vec::new();
601        
602        //Read the next 4 bytes, and interpret them as a little-endian signed int.  It it's negative, then
603        //that means we negate it to get the start of the next sub-match, and call ourselves recursively to.
604        //continue parsing the sub-match.  If the number is positive, then we have come to the end of this
605        //sub-pattern array, so the number is the end position of this pattern.
606        let end_position;
607        loop {
608            let (next_pos_chars, remainder) = match_buffer.split_at(4);
609            *match_buffer = remainder;
610            let signed_next_pos = i32::from_le_bytes(next_pos_chars.try_into().unwrap());
611            
612            if signed_next_pos < 0 {
613                let next_position = usize::try_from(signed_next_pos * -1).unwrap();
614                let sub_match = MatchResult::from_bytes_buffer(input, match_buffer, Some(next_position));
615                subs.push(sub_match);
616            } else {
617                end_position = usize::try_from(signed_next_pos).unwrap();
618                break;
619            }
620        }
621
622        //If we have a borrowed data pointer, cut its length at the appropriate place
623        if let MaybeOwnedBytes::Borrowed(match_data) = data {
624            let (new_data_ref, _) = match_data.split_at(end_position - start_position);
625            data = MaybeOwnedBytes::Borrowed(new_data_ref);
626        }
627        
628        MatchResult{
629            pat_name : pattern_name,
630            start : start_position,
631            end : end_position,
632            data : data,
633            subs : subs
634        }
635    }
636    fn from_byte_match_result<'input>(input : &'input [u8], src_result : RawMatchResult) -> MatchResult<'input> {
637        let mut data_buf_ref = src_result.as_bytes();
638        MatchResult::from_bytes_buffer(input, &mut data_buf_ref, None)
639    }
640    fn new_no_match() -> MatchResult<'static> {
641        MatchResult{
642            pat_name : "".to_string(),
643            start : 0,
644            end : 0,
645            data : MaybeOwnedBytes::Borrowed(&[]),
646            subs : vec![]
647        }
648    }
649    /// Returns `true` if the pattern was matched in the input, otherwise returns `false`.
650    pub fn did_match(&self) -> bool {
651        if self.start == 0 && self.end == 0 {
652            false
653        } else {
654            true
655        }
656    }
657    /// Returns the name of the pattern that matched
658    pub fn pat_name_str(&self) -> &str {
659        self.pat_name.as_str()
660    }
661    /// Returns the subset of the input that was matched by the pattern as an &str
662    /// 
663    /// NOTE: This may panic if the matched data includes part but not all of a unicode character.
664    /// Use [try_matched_str](Self::try_matched_str) for a non-panicking alternative
665    pub fn matched_str(&self) -> &str {
666        self.try_matched_str().unwrap()
667    }
668    /// Returns the subset of the input that was matched by the pattern as an &str
669    pub fn try_matched_str(&self) -> Option<&str> {
670        self.data.try_as_str()
671    }
672    /// Returns the subset of the input that was matched by the pattern as an &[u8]
673    pub fn matched_bytes(&self) -> &[u8] {
674        self.data.as_bytes()
675    }
676    /// Returns the character offset of the beginning of the match, within the input
677    /// 
678    /// NOTE: Offsets are 1-based
679    pub fn start(&self) -> usize {
680        self.start
681    }
682    /// Returns the character offset, within the input, of the end of the match
683    /// 
684    /// NOTE: Offsets are 1-based
685    pub fn end(&self) -> usize {
686        self.end
687    }
688    /// Returns the number of matched sub-patterns that comprise the matched pattern
689    pub fn sub_pat_count(&self) -> usize {
690        self.subs.len()
691    }
692    /// Returns an [Iterator] over all of the sub-patterns within this matched pattern
693    pub fn sub_pat_iter(&self) -> impl Iterator<Item=&MatchResult> {
694        self.subs.iter()
695    }
696}
697
698#[cfg(test)]
699mod tests {
700    use crate::{*};
701    use std::thread;
702    use rand::prelude::*;
703    use rand_pcg::Pcg64;
704
#[test]
/// Exercises RosieString and RosieMessage on their own, with no RosieEngine involved
fn rosie_string() {

    //Wrap a static string in a RosieString, and check the length and contents read back the same
    let static_text = "hello";
    let wrapped_static = RosieString::from_str(static_text);
    assert_eq!(wrapped_static.len(), static_text.len());
    assert_eq!(wrapped_static.as_str(), static_text);

    //Repeat the checks with a RosieString that points into a heap-allocated String
    let heap_text = String::from("hi there");
    let wrapped_heap = RosieString::from_str(heap_text.as_str());
    assert_eq!(wrapped_heap.len(), heap_text.len());
    assert_eq!(wrapped_heap.as_str(), heap_text);

    //The rust String must not be deallocated while the RosieString pointing at it is still in use
    drop(heap_text);
    //TODO: Implement a TryBuild harness in order to ensure the line below will not compile
    //assert!(wrapped_heap.is_valid());

    //Build a RosieMessage from another heap-allocated String
    let message_text = String::from("howdy");
    let message = RosieMessage::from_str(message_text.as_str());
    assert_eq!(message.len(), message_text.len());
    assert_eq!(message.as_str(), message_text);

    //Unlike the RosieString above, the RosieMessage must remain usable after its source String is dropped
    drop(message_text);
    assert!(message.is_valid());
}
736
#[test]
/// Exercises the thread's default singleton engine: one-line matching, compiling, and
/// taking / replacing the default engine while compiled patterns are still alive
fn default_engine() {

    //The same expression and input are used throughout this test
    const EXPRESSION : &str = "{ [H][^]* }";
    const GREETING : &str = "Hello, Rosie!";

    //The one-liner form, asking for a bool
    assert!(Rosie::match_str::<bool>(EXPRESSION, GREETING));

    //Explicitly compile with the default engine, then ask for a full MatchResult
    let original_pat = Rosie::compile(EXPRESSION).unwrap();
    let full_result : MatchResult = original_pat.match_str(GREETING).unwrap();
    assert_eq!(full_result.matched_str(), GREETING);

    //Take the default engine away and drop it straight away; the already-compiled pattern must keep working
    drop(Rosie::take_thread_default_engine());
    assert!(original_pat.match_str::<bool>(GREETING).unwrap());

    //Install a freshly-created engine as the default, then check both the old and a newly compiled pattern
    let replacement_engine = RosieEngine::new(None).unwrap();
    Rosie::replace_thread_default_engine(replacement_engine);
    let recompiled_pat = Rosie::compile(EXPRESSION).unwrap();
    assert!(original_pat.match_str::<bool>(GREETING).unwrap());
    assert!(recompiled_pat.match_str::<bool>(GREETING).unwrap());
}
761
#[test]
/// Tests the interfaces to explicitly manage RosieEngines
///
/// Walks one engine through configuration, library-path handling, compiling (both valid and
/// invalid expressions), raw and high-level matching, tracing, and package loading / importing.
/// The steps share the same engine, so their order matters.
fn explicit_engine() {

    //Create the engine and check that it was successful
    let mut engine = RosieEngine::new(None).unwrap();

    //Make sure we can get the engine config
    let _ = engine.config_as_json().unwrap();

    //Check that we can get the library path
    let mut lib_paths = engine.lib_paths().unwrap();
    //println!("lib_paths[0] = {}", lib_paths[0].display());

    //Now append a new path to it
    let new_rpl_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("test_rpl");
    lib_paths.push(new_rpl_dir.clone());
    engine.set_lib_paths(&lib_paths).unwrap();

    //Make sure we can read it back and see the path we added, in addition to the original
    let lib_paths = engine.lib_paths().unwrap();
    assert_eq!(lib_paths.len(), 2);
    assert!(lib_paths.contains(&new_rpl_dir));

    //Validate we can find packages in the new directory
    engine.import_expression_deps("rust_test_1.c_vegetables", None).unwrap();

    //Check the alloc limit, set it to unlimited, check the usage
    let _ = engine.mem_alloc_limit().unwrap();
    assert!(engine.set_mem_alloc_limit(0).is_ok());
    let _ = engine.mem_usage().unwrap();

    //Compile a valid rpl pattern, and confirm there is no error
    let pat = engine.compile("{[012][0-9]}", None).unwrap();

    //Make sure we can successfully free the pattern
    drop(pat);

    //Try to compile an invalid pattern (syntax error), and check the error and error message
    let mut message = RosieMessage::empty();
    let compile_result = engine.compile("year = bogus", Some(&mut message));
    assert!(compile_result.is_err());
    assert!(message.len() > 0);
    //println!("Compile Error: {}", message.as_str());

    //Try and import the dependencies for an invalid pattern, and check the error message
    let mut message = RosieMessage::empty();
    let import_result = engine.import_expression_deps("invalid.any", Some(&mut message));
    assert!(import_result.is_err());
    assert!(message.len() > 0);
    //println!("Import Error: {}", message.as_str());

    //Load the dependencies for a valid pattern
    engine.import_expression_deps("time.any", None).unwrap();

    //Recompile a pattern expression and match it against a matching input using raw_match
    let mut pat = engine.compile("{[012][0-9]}", None).unwrap();
    let raw_match_result = pat.raw_match(1, b"21", &MatchEncoder::Status).unwrap();
    //Validate that we can't access the pattern while our raw_match_result is in use.
    //TODO: Implement a TryBuild harness in order to ensure the two lines below will not compile together, although each will compile separately.
    // assert!(!pat.match_str::<bool>("35").unwrap());
    assert!(raw_match_result.did_match());
    assert!(raw_match_result.time_elapsed_matching() <= raw_match_result.time_elapsed_total()); //A little lame as tests go, but validates they are called at least.

    //Now try the match with the high-level match_str call
    let match_result : MatchResult = pat.match_str("21").unwrap();
    assert_eq!(match_result.pat_name_str(), "*");
    assert_eq!(match_result.matched_str(), "21");
    assert_eq!(match_result.start(), 1);
    assert_eq!(match_result.end(), 3);
    assert_eq!(match_result.sub_pat_count(), 0);

    //Try it against non-matching input, and make sure we get no match
    let match_result : MatchResult = pat.match_str("99").unwrap();
    assert!(!match_result.did_match());

    //Test the trace method, and make sure we get a reasonable result
    let mut trace = RosieMessage::empty();
    assert!(pat.trace(1, "21", TraceFormat::Condensed, &mut trace).is_ok());
    assert!(!trace.as_str().is_empty());
    //println!("{}", trace.as_str());

    //Test loading a package from a string
    let pkg_name = engine.load_pkg_from_str("package two_digit_year\n\nyear = {[012][0-9]}", None).unwrap();
    assert_eq!(pkg_name.as_str(), "two_digit_year");

    //Test loading a package from a file
    let rpl_file = new_rpl_dir.join("rust_test_2.rpl");
    let pkg_name = engine.load_pkg_from_file(rpl_file.to_str().unwrap(), None).unwrap();
    assert_eq!(pkg_name.as_str(), "rust_test_2");

    //Test importing a package
    let pkg_name = engine.import_pkg("net", None, None).unwrap();
    assert_eq!(pkg_name.as_str(), "net");

    //Test matching a pattern with some recursive sub-patterns
    engine.import_pkg("date", None, None).unwrap();
    let mut date_pat = engine.compile("date.us_long", None).unwrap();
    let match_result : MatchResult = date_pat.match_str("Saturday, Nov 5, 1955").unwrap();
    assert_eq!(match_result.pat_name_str(), "date.us_long");
    assert_eq!(match_result.matched_str(), "Saturday, Nov 5, 1955");
    assert_eq!(match_result.start(), 1);
    assert_eq!(match_result.end(), 22);
    assert_eq!(match_result.sub_pat_count(), 4);
    let sub_match_pat_names : Vec<&str> = match_result.sub_pat_iter().map(|result| result.pat_name_str()).collect();
    assert!(sub_match_pat_names.contains(&"date.day_name"));
    assert!(sub_match_pat_names.contains(&"date.month_name"));
    assert!(sub_match_pat_names.contains(&"date.day"));
    assert!(sub_match_pat_names.contains(&"date.year"));
    let sub_result = match_result.sub_pat_iter().find(|sub_result| sub_result.pat_name_str() == "date.month_name").unwrap();
    assert_eq!(sub_result.matched_str(), "Nov");
    assert_eq!(sub_result.start(), 11);
    assert_eq!(sub_result.end(), 14);

    //Verify that the RawMatchResults from two different compiled patterns don't interfere with each other
    //Also test the JSONPretty encoder while we're at it
    engine.import_expression_deps("time.any", None).unwrap();
    let mut time_pat = engine.compile("time.any", None).unwrap();
    let date_raw_match_result = date_pat.raw_match(1, b"Saturday, Nov 5, 1955", &MatchEncoder::JSONPretty).unwrap();
    let time_raw_match_result = time_pat.raw_match(1, b"2:21 am", &MatchEncoder::JSONPretty).unwrap();
    assert!(date_raw_match_result.as_str() != time_raw_match_result.as_str());
    //NOTE: I know these checks might break with perfectly legal changes to JSON formatting, but at least they
    // will flag it, so a human can take a look and ensure something more fundamental didn't break.
    assert_eq!(date_raw_match_result.as_str().len(), 660);
    assert_eq!(time_raw_match_result.as_str().len(), 453);
}
888
#[test]
/// Runs many threads at once, each repeatedly compiling a pattern and matching with it
fn thread_stress() {

    const NUM_THREADS : usize = 50;
    const NUM_ITERATIONS : usize = 50; //Each iteration includes one compile
    const NUM_MATCHES : usize = 500; //Number of matches to perform each iteration

    //EXPRESSIONS[i] matches the whole of INPUTS[i], and does not match either of the other inputs
    const EXPRESSIONS : [&str; 3] = ["{ [H][^]* }", "date.any", "time.any"];
    const INPUTS : [&str; 3] = ["Hello, Rosie!", "Saturday, Nov 5, 1955", "2:21 am"];

    //Spawn every worker up front; collecting the handles keeps all of them running concurrently
    let workers : Vec<_> = (0..NUM_THREADS).map(|thread_idx| {
        thread::spawn(move || {

            //non-cryptographic random used for repeatability; each thread is seeded with its own index
            let mut rng = Pcg64::seed_from_u64(thread_idx.try_into().unwrap());

            for _ in 0..NUM_ITERATIONS {

                let pat_idx : u8 = rng.gen_range(0..3);
                let pat = Rosie::compile(EXPRESSIONS[usize::from(pat_idx)]).unwrap();

                for _ in 0..NUM_MATCHES {

                    let str_idx : u8 = rng.gen_range(0..3);
                    let input = INPUTS[usize::from(str_idx)];

                    let result : MatchResult = pat.match_str(input).unwrap();

                    //Only the expression and input that share an index are expected to match
                    if pat_idx == str_idx {
                        assert_eq!(result.matched_str(), input);
                    } else {
                        assert!(!result.did_match());
                    }
                }
            }
        })
    }).collect();

    //Make sure every thread has a chance to finish
    for worker in workers {
        worker.join().unwrap();
    }
}
946
947}