rosie/lib.rs
1#![crate_name = "rosie"]
2
3#![doc(html_logo_url = "https://rosie-lang.org/images/rosie-circle-blog.png")]
4//Q-06.04, Can Jamie host a version of this logo that doesn't have a border? i.e. just the circle occupying the whole frame, with an alpha-channel so the corners are transparent
5
6#![doc = include_str!("../README.md")]
7
8use core::mem::swap;
9use core::fmt;
10use core::fmt::Display;
11use std::str;
12use std::convert::{TryFrom, TryInto};
13use std::path::{Path, PathBuf};
14use std::fs;
15use std::sync::Mutex;
16use std::cell::UnsafeCell;
17
18use linked_hash_map::LinkedHashMap;
19
20use once_cell::sync::Lazy; // TODO: As soon as std::sync::SyncLazy is pushed to stable, we will migrate there and eliminate this dependency
21
22extern crate rosie_sys;
23use rosie_sys::{
24 RosieString,
25 rosie_new_string,
26 rosie_home_default,
27 rosie_home_init,
28 rosie_free_rplx,
29};
30
31//Private Internal code for managing most calls into librosie
32mod sys_wrapper;
33use sys_wrapper::{*};
34
35//Public re-exports
36mod sys_shadow; //Shadow implementations of things from rosie_sys::
37pub use sys_shadow::RosieError; //pub use rosie_sys::RosieError;
38/// An Encoder module used to format the results, when using [Pattern::raw_match]
39pub use rosie_sys::MatchEncoder;
40/// A structure containing the match results from a [Pattern::raw_match] call.
41///
42/// **NOTE**: A RawMatchResult points to memory inside the engine that is associated with the pattern, therefore you may
43/// not perform any additional matching with that pattern until the RawMatchResult has been released. This is enforced with
44/// borrowing semantics [Pattern::raw_match].
45pub use rosie_sys::RawMatchResult;
46/// A format for debugging output, to be used with [Pattern::trace]
47pub use rosie_sys::TraceFormat;
48
49/// Functionality to access [RosieEngine]s directly
50///
51/// The majority of use cases don't require direct access to a RosieEngine. However, this module can be used to:
52/// - Create multiple simultaneous engines
53/// - Change the environment (Standard Pattern Library or config)
54/// - Explicitly load rpl packages
55/// - Constrain memory usage
56///
57/// ## Example Usage
58/// ```
59/// use rosie::*;
60/// let mut engine = engine::RosieEngine::new(None).unwrap();
61/// engine.import_pkg("date", None, None);
62///
63/// let date_pat = engine.compile("date.us_long", None).unwrap();
64/// assert!(date_pat.match_str::<bool>("Saturday, Nov 5, 1955").unwrap());
65/// ```
66///
67pub mod engine {
68 pub use crate::sys_wrapper::RosieEngine;
69}
70
71/// Alternative Rosie entry points that support sharing between threads
72///
73/// [RosieEngine]s and compiled [Pattern]s can't be accessed by other threads, so this module
74/// implements versions that can, but the tradeoff is runtime performance costs due to locking
75/// and copying the results to private buffers
76///
77/// NOTE: I feel like this is a short-term solution. Longer term, I envision a RosieEngine
78/// being separated into two objects; a compiler and a matching engine. Then the matching engine
79/// could be fully reentrant and lock-free, and new compiled patterns could be added atomically
80/// to a matching engine, obviating the need for everything in the `thread_portable` module.
81///
82/// I'd like to discuss this direction with Jamie, and possibly even an implementation of the
83/// matching engine in safe Rust.
84pub mod thread_portable;
85
86use thread_portable::PortablePattern;
87
88//The number of compiled patterns in the pattern cache
89const PATTERN_CACHE_SIZE: usize = 8;
90
91//Global to track the state of librosie
92static LIBROSIE_INITIALIZED: Lazy<Mutex<bool>> = Lazy::new(|| Mutex::new(false));
93
94//Global per-thread singleton engine and pattern cache
95struct ThreadLocals {
96 engine : RosieEngine,
97 pattern_cache : LinkedHashMap<String, Pattern>
98}
99thread_local!{
100 //TODO: Waiting for the stabilization of `#[thread_local]` attribute so we can get rid of this UnsafeCell
101 // Don't want to pay the price of a RefCell for no reason
102 // https://github.com/rust-lang/rust/issues/29594
103 static THREAD_LOCALS : UnsafeCell<ThreadLocals> = UnsafeCell::new(ThreadLocals::new())
104}
105
106impl ThreadLocals {
107 fn new() -> Self {
108 Self {
109 engine : {
110 let mut messages = RosieMessage::empty();
111 if let Ok(engine) = RosieEngine::new(Some(&mut messages)) {
112 engine
113 } else {
114 panic!("ERROR Creating RosieEngine: {}", messages.as_str())
115 }
116 },
117 pattern_cache : LinkedHashMap::with_capacity(PATTERN_CACHE_SIZE)
118 }
119 }
120}
121
122/// A buffer to obtain text from Rosie.
123///
124/// The contents of the buffer depend on the situation under which it is returned.
125/// Sometimes the returned text is formatted as JSON and other times it is a human-readable message.
126///
127/// # Example: Getting a message from the expression compiler
128/// ```
129/// # use rosie::*;
130/// let mut engine = engine::RosieEngine::new(None).unwrap();
131/// let mut message = RosieMessage::empty();
132/// engine.compile("invalid pattern", Some(&mut message));
133/// println!("{}", message);
134/// ```
135#[derive(Debug)]
136pub struct RosieMessage(RosieString<'static>);
137
138//For some strings, we are responsible for freeing any string buffers, even if librosie allocated them
139impl Drop for RosieMessage {
140 fn drop(&mut self) {
141 self.0.manual_drop();
142 }
143}
144
145impl RosieMessage {
146 /// Creates an empty RosieMessage. Used to allocate a location into which another function may write output.
147 pub fn empty() -> Self {
148 Self(RosieString::empty())
149 }
150 /// Creates a new RosieMessage by copying the contents of the argument &[str](std::str) into the newly created RosieMessage.
151 pub fn from_str(s: &str) -> Self {
152 let rosie_string = unsafe { rosie_new_string(s.as_ptr(), s.len()) };
153 Self(rosie_string)
154 }
155 pub fn from_bytes(b: &[u8]) -> Self {
156 let rosie_string = unsafe { rosie_new_string(b.as_ptr(), b.len()) };
157 Self(rosie_string)
158 }
159 /// Returns `true` if the RosieMessage contains text. Returns `false` if it is empty.
160 pub fn is_valid(&self) -> bool {
161 self.0.is_valid()
162 }
163 /// Borrows the RosieMessage contents as a slice of bytes. If the RosieMessage is empty the resulting slice will have a length of zero.
164 pub fn as_bytes(&self) -> &[u8] {
165 self.0.as_bytes()
166 }
167 /// Borrows the RosieMessage contents as a &[str](std::str). If the RosieMessage is empty the result will have a length of zero.
168 pub fn as_str(&self) -> &str {
169 self.0.as_str()
170 }
171 /// Same as [as_str](RosieMessage::as_str) but won't panic
172 pub fn try_as_str(&self) -> Option<&str> {
173 self.0.try_as_str()
174 }
175 /// Returns the length, in bytes, of the contents of the RosieMessage.
176 pub fn len(&self) -> usize {
177 self.0.len()
178 }
179}
180
181impl Display for RosieMessage {
182 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183 self.0.fmt(f)
184 }
185}
186
187/// The interface to top-level rosie functionality
188pub struct Rosie;
189
190impl Rosie {
191 /// Matches the specified `expression` in the specified `input` bytes.
192 ///
193 /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
194 ///
195 /// Compiled patterns are managed automatically using a least-recently-used cache and are recompiled as needed.
196 ///
197 /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
198 /// If you need the fastest possible performance calling this method to return a [bool] will use the
199 /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
200 pub fn match_bytes<'input, T>(expression : &str, input : &'input [u8]) -> T
201 where T : MatchOutput<'input> {
202 THREAD_LOCALS.with(|locals_cell| {
203
204 //TODO: Get rid of UnsafeCell. See note near declaration of THREAD_LOCALS.
205 let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
206
207 //See if we have the expression in our pattern cache
208 let mut pat = if let Some(existing_pat) = locals.pattern_cache.remove(expression) {
209 existing_pat
210 } else {
211 //If we don't have the expression, make sure there is space for it in the cache
212 if locals.pattern_cache.len() > PATTERN_CACHE_SIZE-1 {
213 //Toss out the least-recently-added item
214 let _ = locals.pattern_cache.pop_front();
215 }
216
217 //And compile the expression
218 locals.engine.import_expression_deps(expression, None).unwrap();
219 locals.engine.compile(expression, None).unwrap()
220 };
221
222 //Call the return-type-specific match call
223 let result = T::match_bytes(&mut pat, input).unwrap();
224
225 //Put the pattern back on the top of the LRU stack
226 locals.pattern_cache.insert(expression.to_string(), pat);
227
228 result
229 })
230 }
231 /// Matches the specified `expression` in the specified `input` str.
232 ///
233 /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
234 ///
235 /// Compiled patterns are managed automatically using a least-recently-used cache and are recompiled as needed.
236 ///
237 /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
238 /// If you need the fastest possible performance calling this method to return a [bool] will use the
239 /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
240 pub fn match_str<'input, T>(expression : &str, input : &'input str) -> T
241 where T : MatchOutput<'input> {
242 Self::match_bytes(expression, input.as_bytes())
243 }
244 /// Compiles the specified expression, returning a `Pattern` that can then be used to match that expression.
245 ///
246 /// The expression may be either the name of a previously loaded `rpl` pattern, or it may be a raw `rpl` expression.
247 ///
248 /// **NOTE**: This method is high-level. If you want more control, performance, or feedback, see [RosieEngine::compile].
249 ///
250 /// - This method automatically evaluates the expression for dependencies and automatically loads any dependencies it
251 /// finds, while RosieEngine::compile skips the dependency analysis
252 /// - This method's returned `Pattern` will be hosted by the thread's default engine. RosieEngine::compile allows you
253 /// to host the Pattern on another engine
254 /// - This method doesn't provide any compile warnings or errors. To debug a compilation failure, call
255 /// RosieEngine::compile
256 ///
257 /// # Examples
258 /// ```
259 /// # use rosie::*;
260 /// let date_pat = Rosie::compile("date.us_long").unwrap();
261 /// ```
262 ///
263 /// ```
264 /// # use rosie::*;
265 /// let two_digit_year_pat = Rosie::compile("{[012][0-9]}").unwrap();
266 /// ```
267 ///
268 pub fn compile(expression : &str) -> Result<Pattern, RosieError> {
269 THREAD_LOCALS.with(|locals_cell| {
270
271 //TODO: Get rid of UnsafeCell. See note near declaration of THREAD_LOCALS.
272 let locals : &ThreadLocals = unsafe{ &*locals_cell.get() };
273
274 locals.engine.import_expression_deps(expression, None)?;
275 locals.engine.compile(expression, None)
276 })
277 }
278 /// Sets a custom location for the rosie_home path used for support scripts and the default Standard Pattern Library.
279 ///
280 /// **WARNING** This method must be called before any other rosie calls, or it will not be sucessful
281 pub fn set_rosie_home_path<P: AsRef<Path>>(path: P) {
282 librosie_init(Some(path))
283 }
284 /// Returns the thread's default RosieEngine, replacing it with a newly initialized engine
285 ///
286 /// NOTE: This will clear the compiled pattern cache used by [Rosie::match_str]
287 ///
288 pub fn take_thread_default_engine() -> RosieEngine {
289 THREAD_LOCALS.with(|locals_cell| {
290
291 //TODO: Get rid of UnsafeCell. See note near declaration of THREAD_LOCALS.
292 let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
293
294 let mut new_locals = ThreadLocals::new();
295 swap(&mut new_locals, locals);
296 new_locals.engine
297 })
298 }
299 /// Replaces the thread's default RosieEngine with the engine supplied
300 ///
301 /// NOTE: This will clear the compiled pattern cache used by [Rosie::match_str]
302 ///
303 pub fn replace_thread_default_engine(engine : RosieEngine) {
304 THREAD_LOCALS.with(|locals_cell| {
305
306 //TODO: Get rid of UnsafeCell. See note near declaration of THREAD_LOCALS.
307 let locals : &mut ThreadLocals = unsafe{ &mut *locals_cell.get() };
308
309 locals.engine = engine;
310 locals.pattern_cache = LinkedHashMap::with_capacity(PATTERN_CACHE_SIZE);
311 })
312 }
313}
314
315/// Implemented for types that can be returned by a match operation
316pub trait MatchOutput<'a> : Sized {
317 fn match_bytes(pat : &Pattern, input : &'a [u8]) -> Result<Self, RosieError>;
318 fn match_bytes_portable(pat : &PortablePattern, input : &'a [u8]) -> Result<Self, RosieError>;
319}
320
321impl MatchOutput<'_> for bool {
322 fn match_bytes(pat : &Pattern, input : &[u8]) -> Result<Self, RosieError> {
323 //NOTE: we're calling directly into the engine because we want to bypass the requirement for a &mut self in Pattern::raw_match.
324 // That &mut is just there to ensure we have an exclusive borrow, so subsequent calls don't match the same compiled pattern and
325 // collide with the pattern's buffer in the engine.
326 let raw_match_result = pat.engine.0.match_pattern_raw(pat.id, 1, input, &MatchEncoder::Status).unwrap();
327 Ok(raw_match_result.did_match())
328 }
329 fn match_bytes_portable(pat : &PortablePattern, input : &[u8]) -> Result<Self, RosieError> {
330 let guard = pat.engine.0.lock().unwrap();
331 let raw_match_result = guard.match_pattern_raw(pat.id, 1, input, &MatchEncoder::Status).unwrap();
332 Ok(raw_match_result.did_match())
333 }
334}
335
336//IMPLEMENTATION NOTE: I chose to delete the `String` implementation because the common case for MatchResult, i.e. the case where the
337// pattern is not a constant-capture pattern means that the matched string is just a slice referencing the input. But the `String`
// implementation forces a copy in every situation. So we don't want to accidentally direct people to the slow-path by making it convenient.
339//
340//In a perfect world, I would like there to be an implementation for `&'a str`, but the problem with that is constant-capture patterns
341// point to data that isn't in the input. I went pretty far into a change that got rid of the MaybeOwnedBytes inside of MatchResult,
342// in order to implement `MatchResult::into_str`, but that meant constant-capture patterns (and therefore all MatchResults) needed to
343// borrow the engine buffer associated with the pattern (like a RawMatchResult does). This is unworkable because of the pattern cache.
344//
345// impl <'a>MatchOutput<'a> for String {
346// fn match_str(pat : &Pattern, input : &'a str) -> Result<Self, RosieError> {
347// let match_result = pat.engine.match_pattern(pat.id, 1, input)?;
348// Ok(match_result.matched_str().to_string())
349// }
350// }
351
352impl <'a>MatchOutput<'a> for MatchResult<'a> {
353 fn match_bytes(pat : &Pattern, input : &'a [u8]) -> Result<Self, RosieError> {
354 pat.engine.0.match_pattern(pat.id, 1, input)
355 }
356 fn match_bytes_portable(pat : &PortablePattern, input : &'a [u8]) -> Result<Self, RosieError> {
357 pat.engine.0.lock().unwrap().match_pattern(pat.id, 1, input)
358 }
359}
360
361//Private function to make sure librosie is initialized and initialize it if it isn't
362//Internal NOTE: This function is responsible for internal librosie initialization, so it is also called by RosieEngine::new()
363fn librosie_init<P: AsRef<Path>>(path: Option<P>) {
364
365 //Get the global status var, or block until we can get it
366 let mut init_status = LIBROSIE_INITIALIZED.lock().unwrap();
367
368 //If librosie isn't initialized yet, then initialize it
369 if !(*init_status) {
370
371 let mut did_init = false;
372
373 //Decide among the different paths we might use
374 let dir_path = if let Some(dir_path) = path {
375 Some(PathBuf::from(dir_path.as_ref())) // If we were passed a path, use it.
376 } else {
377 if let Some(default_path_str) = rosie_home_default() {
378 Some(PathBuf::from(str::from_utf8(default_path_str).unwrap())) //We will pass the path compiled into our binary
379 } else {
380 None //We will let librosie try to find it
381 }
382 };
383
384 //Make sure the path is a valid directory before calling rosie_home_init(),
385 // because rosie_home_init() will buy whatever we're selling, even if it's garbage
386 if let Some(dir_path) = dir_path {
387 if let Ok(dir_metadata) = fs::metadata(&dir_path) {
388 if dir_metadata.is_dir() {
389 let mut message_buf = RosieString::empty();
390 unsafe{ rosie_home_init(&RosieString::from_str(&dir_path.to_str().unwrap()), &mut message_buf) };
391 did_init = true;
392 }
393 }
394 }
395
396 *init_status = did_init;
397 }
398}
399
400/// The compiled form of an RPL expression, used for matching
401///
402/// A Pattern can be created by either [Rosie::compile] or [RosieEngine::compile].
403///
404/// **Performance NOTE**: Compiling a pattern is hundreds of times more expensive than a typical matching operation.
405/// Therefore it is usually worthwhile to retain compiled patterns rather than allowing them to be dropped and
406/// recompiling them when needed.
407///
408//TODO: Add API to load compiled patterns, without needing to go via RPL, when it is supported by librosie.
409//
410//INTERNAL NOTE: Pattern doesn't implement Clone because a RawMatchResult holds a pointer to a buffer inside the
411// engine, for which there is one-per-pattern. If a pattern could be cloned, we could end up invalidating the
412// memory out from under a RawMatchResult.
413pub struct Pattern {
414 engine : RosieEngine,
415 id : i32
416}
417
418impl Drop for Pattern {
419 fn drop(&mut self) {
420 unsafe { rosie_free_rplx(self.engine.0.0, self.id) };
421 }
422}
423
424impl Pattern {
425
426 /// Matches the `Pattern` in the specified `input` bytes string.
427 ///
428 /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
429 ///
430 /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
431 /// If you need the fastest possible performance calling this method to return a [bool] will use the
432 /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
433 pub fn match_bytes<'input, T>(&self, input : &'input [u8]) -> Result<T, RosieError>
434 where T : MatchOutput<'input> {
435 //Call the return-type-specific match call
436 T::match_bytes(self, input)
437 }
438
439 /// Matches the `Pattern` in the specified `input` str.
440 ///
441 /// Returns the requested type if a match was found, otherwise returns an appropriate error code.
442 ///
443 /// NOTE: This method may return several different return types, including [bool], and [MatchResult].
444 /// If you need the fastest possible performance calling this method to return a [bool] will use the
445 /// [Status](MatchEncoder::Status) encoder and bypass a lot of overhead formatting the results.
446 pub fn match_str<'input, T>(&self, input : &'input str) -> Result<T, RosieError>
447 where T : MatchOutput<'input> {
448 self.match_bytes(input.as_bytes())
449 }
450
451 /// Matches the `Pattern` in the specified `input` string, beginning from the `start` index, using the specified `encoder`.
452 ///
453 /// Returns a [RawMatchResult] or an error code if a problem was encountered. This is a lower-level API than [match_str](Pattern::match_str),
454 /// and the primary reason to use it is to get the output from a particular [MatchEncoder]. For example, the [JSON](MatchEncoder::JSON) or [JSONPretty](MatchEncoder::JSONPretty) encoders.
455 ///
456 /// **NOTE**: The returned [RawMatchResult] takes a mutable borrow of the `Pattern` because it references internal data
457 /// associated with the `Pattern`. Therefore the `Pattern` cannot be accessed while the RawMatchResult is in use; copying
458 /// the data from the RawMatchResult will allow the `Pattern` to be released.
459 ///
460 /// **NOTE**: The values for `start` are 1-based. Meaning passing 1 will begin the match from the beginning of the input, and
461 /// passing 0 (zero) is an error.
462 ///
463 /// # Example using the JSON encoder with serde_json
464 /// ```
465 /// extern crate serde_json;
466 /// use serde::{*};
467 /// use rosie::*;
468 ///
469 /// #[derive(Debug, Deserialize)]
470 /// struct JSONMatchResult {
471 /// #[serde(rename = "type")]
472 /// pat_name : String, // The pattern that was matched
473 /// #[serde(rename = "s")]
474 /// start : usize, // The offset of the start of the match in the input buffer
475 /// #[serde(rename = "e")]
476 /// end : usize, // The offset of the end of the match in the input buffer
477 /// data : String, // The matched text, copied from the input buffer
478 /// #[serde(default = "Vec::new")]
479 /// subs : Vec<JSONMatchResult> // The sub-matches within the pattern
480 /// }
481 ///
482 /// let mut date_pat = Rosie::compile("date.any").unwrap();
483 /// let raw_result = date_pat.raw_match(1, b"Sat Nov 5, 1955", &MatchEncoder::JSON).unwrap();
484 /// let parsed_result : JSONMatchResult = serde_json::from_slice(raw_result.as_bytes()).unwrap();
485 /// ```
486 ///
487 pub fn raw_match<'pat>(&'pat mut self, start : usize, input : &[u8], encoder : &MatchEncoder) -> Result<RawMatchResult<'pat>, RosieError> {
488 self.engine.0.match_pattern_raw(self.id, start, input, encoder)
489 }
490
491 /// Traces a pattern match, providing information useful for debugging the pattern expression.
492 ///
493 /// Returns a bool indicating whether the specified pattern matched the input. The caller must allocate an empty [RosieMessage]
494 /// in order to receive the resulting trace information.
495 ///
496 /// The caller must also pass a [TraceFormat], to specify the format for the resulting information.
497 /// [Condensed](TraceFormat::Condensed) is the most human-readable format, but a other formats may contain more complete
498 /// information or be easier to automatically parse.
499 ///
500 /// # Example
501 /// ```
502 /// # use rosie::*;
503 /// let date_pat = Rosie::compile("date.any").unwrap();
504 ///
505 /// let mut trace = RosieMessage::empty();
506 /// let did_match = date_pat.trace(1, "Sat. Nov. 5, 1955", TraceFormat::Condensed, &mut trace).unwrap();
507 /// println!("{}", trace.as_str());
508 /// ```
509 ///
510 pub fn trace(&self, start : usize, input : &str, format : TraceFormat, trace : &mut RosieMessage) -> Result<bool, RosieError> {
511 self.engine.0.trace_pattern(self.id, start, input, format, trace)
512 }
513}
514
//Byte storage that either owns its data (a Vec<u8>) or borrows it (an &[u8]); modeled on maybe_owned::MaybeOwned.
#[derive(Debug)]
enum MaybeOwnedBytes<'a> {
    Owned(Vec<u8>),
    Borrowed(&'a [u8]),
}

impl MaybeOwnedBytes<'_> {
    //Views the bytes as a str, or returns None if they are not valid UTF-8
    pub fn try_as_str(&self) -> Option<&str> {
        match str::from_utf8(self.as_bytes()) {
            Ok(text) => Some(text),
            Err(_) => None,
        }
    }
    //Views the bytes as a slice, whichever variant holds them
    pub fn as_bytes(&self) -> &[u8] {
        match *self {
            Self::Owned(ref buffer) => buffer.as_slice(),
            Self::Borrowed(slice) => slice,
        }
    }
}
533
534/// Represents the results of a match operation, performed by [Pattern::match_str] or [Rosie::match_str]
535///
536//**TODO** I feel like a more caller-friendly interface is possible; i.e. the ability to specify sub-patterns with a "path"
537#[derive(Debug)]
538pub struct MatchResult<'a> {
539 pat_name : String,
540 start : usize,
541 end : usize,
542 data : MaybeOwnedBytes<'a>,
543 subs : Vec<MatchResult<'a>>
544}
545
546impl <'a>MatchResult<'a> {
547
548 //This method is a port from the python code here: https://gitlab.com/rosie-community/clients/python/-/blob/master/rosie/decode.py
549 fn from_bytes_buffer<'input>(input : &'input [u8], match_buffer : &mut &[u8], existing_start_pos : Option<usize>) -> MatchResult<'input> {
550
551 //If we received a start position, it is because we are in the middle of a recursive call stack
552 let start_position = match existing_start_pos {
553 Some(start_position) => start_position,
554 None => {
555 //Otherwise, Read the first 4 bytes, interpret them as a signed little-endian 32 bit integer,
556 // and then negate them to get the start position
557 let (start_pos_chars, remainder) = match_buffer.split_at(4);
558 *match_buffer = remainder;
559 let signed_start_pos = i32::from_le_bytes(start_pos_chars.try_into().unwrap());
560 assert!(signed_start_pos < 0);
561 usize::try_from(signed_start_pos * -1).unwrap()
562 }
563 };
564
565 //Read the next 2 bytes, interpret them as a signed little-endian 16 but integer,
566 let (type_len_chars, remainder) = match_buffer.split_at(2);
567 *match_buffer = remainder;
568 let mut type_len = i16::from_le_bytes(type_len_chars.try_into().unwrap()); //The length of the pattern name
569
570 //constant-capture means data is a user-provided string, (i.e. a string from the encoder)
571 //Otherwise regular-capture means data is a subset of the input string
572 let constant_capture = if type_len < 0 {
573 type_len = type_len * -1;
574 true
575 } else {
576 false
577 };
578
579 //Read type_len characters, intperpreting it as the pattern name
580 let (type_name_chars, remainder) = match_buffer.split_at(usize::try_from(type_len).unwrap());
581 *match_buffer = remainder;
582 let pattern_name = String::from_utf8(type_name_chars.to_vec()).unwrap();
583
584 //Get the data out of the match_buffer, or the input string, depending on whether the pattern is "constant-capture" or not
585 let mut data = if constant_capture {
586 let (data_len_chars, remainder) = match_buffer.split_at(2);
587 *match_buffer = remainder;
588 let data_len = i16::from_le_bytes(data_len_chars.try_into().unwrap()); //The length of the data name
589 assert!(data_len >= 0);
590
591 let (data_chars, remainder) = match_buffer.split_at(usize::try_from(data_len).unwrap());
592 *match_buffer = remainder;
593 MaybeOwnedBytes::Owned(data_chars.to_vec())
594 } else {
595 let (_, match_data) = input.split_at(start_position-1);
596 MaybeOwnedBytes::Borrowed(match_data)
597 };
598
599 //The empty array for our sub-patterns.
600 let mut subs = Vec::new();
601
602 //Read the next 4 bytes, and interpret them as a little-endian signed int. It it's negative, then
603 //that means we negate it to get the start of the next sub-match, and call ourselves recursively to.
604 //continue parsing the sub-match. If the number is positive, then we have come to the end of this
605 //sub-pattern array, so the number is the end position of this pattern.
606 let end_position;
607 loop {
608 let (next_pos_chars, remainder) = match_buffer.split_at(4);
609 *match_buffer = remainder;
610 let signed_next_pos = i32::from_le_bytes(next_pos_chars.try_into().unwrap());
611
612 if signed_next_pos < 0 {
613 let next_position = usize::try_from(signed_next_pos * -1).unwrap();
614 let sub_match = MatchResult::from_bytes_buffer(input, match_buffer, Some(next_position));
615 subs.push(sub_match);
616 } else {
617 end_position = usize::try_from(signed_next_pos).unwrap();
618 break;
619 }
620 }
621
622 //If we have a borrowed data pointer, cut its length at the appropriate place
623 if let MaybeOwnedBytes::Borrowed(match_data) = data {
624 let (new_data_ref, _) = match_data.split_at(end_position - start_position);
625 data = MaybeOwnedBytes::Borrowed(new_data_ref);
626 }
627
628 MatchResult{
629 pat_name : pattern_name,
630 start : start_position,
631 end : end_position,
632 data : data,
633 subs : subs
634 }
635 }
636 fn from_byte_match_result<'input>(input : &'input [u8], src_result : RawMatchResult) -> MatchResult<'input> {
637 let mut data_buf_ref = src_result.as_bytes();
638 MatchResult::from_bytes_buffer(input, &mut data_buf_ref, None)
639 }
640 fn new_no_match() -> MatchResult<'static> {
641 MatchResult{
642 pat_name : "".to_string(),
643 start : 0,
644 end : 0,
645 data : MaybeOwnedBytes::Borrowed(&[]),
646 subs : vec![]
647 }
648 }
649 /// Returns `true` if the pattern was matched in the input, otherwise returns `false`.
650 pub fn did_match(&self) -> bool {
651 if self.start == 0 && self.end == 0 {
652 false
653 } else {
654 true
655 }
656 }
657 /// Returns the name of the pattern that matched
658 pub fn pat_name_str(&self) -> &str {
659 self.pat_name.as_str()
660 }
661 /// Returns the subset of the input that was matched by the pattern as an &str
662 ///
663 /// NOTE: This may panic if the matched data includes part but not all of a unicode character.
664 /// Use [try_matched_str](Self::try_matched_str) for a non-panicking alternative
665 pub fn matched_str(&self) -> &str {
666 self.try_matched_str().unwrap()
667 }
668 /// Returns the subset of the input that was matched by the pattern as an &str
669 pub fn try_matched_str(&self) -> Option<&str> {
670 self.data.try_as_str()
671 }
672 /// Returns the subset of the input that was matched by the pattern as an &[u8]
673 pub fn matched_bytes(&self) -> &[u8] {
674 self.data.as_bytes()
675 }
676 /// Returns the character offset of the beginning of the match, within the input
677 ///
678 /// NOTE: Offsets are 1-based
679 pub fn start(&self) -> usize {
680 self.start
681 }
682 /// Returns the character offset, within the input, of the end of the match
683 ///
684 /// NOTE: Offsets are 1-based
685 pub fn end(&self) -> usize {
686 self.end
687 }
688 /// Returns the number of matched sub-patterns that comprise the matched pattern
689 pub fn sub_pat_count(&self) -> usize {
690 self.subs.len()
691 }
692 /// Returns an [Iterator] over all of the sub-patterns within this matched pattern
693 pub fn sub_pat_iter(&self) -> impl Iterator<Item=&MatchResult> {
694 self.subs.iter()
695 }
696}
697
698#[cfg(test)]
699mod tests {
700 use crate::{*};
701 use std::thread;
702 use rand::prelude::*;
703 use rand_pcg::Pcg64;
704
#[test]
/// Exercises RosieString and RosieMessage on their own, with no RosieEngine involved
fn rosie_string() {

    //A RosieString that points at a static string
    let static_text = "hello";
    let from_static = RosieString::from_str(static_text);
    assert_eq!(from_static.len(), static_text.len());
    assert_eq!(from_static.as_str(), static_text);

    //A RosieString that points at a heap-allocated string
    let heap_text = String::from("hi there");
    let from_heap = RosieString::from_str(heap_text.as_str());
    assert_eq!(from_heap.len(), heap_text.len());
    assert_eq!(from_heap.as_str(), heap_text);

    //The rust String must not be usable through the RosieString once the String has been deallocated
    drop(heap_text);
    //TODO: Implement a TryBuild harness in order to ensure the line below will not compile
    //assert!(from_heap.is_valid());

    //A RosieMessage built from a heap-allocated string
    let message_source = String::from("howdy");
    let message = RosieMessage::from_str(message_source.as_str());
    assert_eq!(message.len(), message_source.len());
    assert_eq!(message.as_str(), message_source);

    //The RosieMessage must remain valid after the String it was created from is deallocated
    drop(message_source);
    assert!(message.is_valid());
}
736
#[test]
/// Exercises the thread's default singleton engine
fn default_engine() {
    let expression = "{ [H][^]* }";
    let greeting = "Hello, Rosie!";

    //The one liner, returning a bool
    assert!(Rosie::match_str::<bool>(expression, greeting));

    //Explicit compilation, hosted by the default engine
    let first_pat = Rosie::compile(expression).unwrap();
    let full_result : MatchResult = first_pat.match_str(greeting).unwrap();
    assert_eq!(full_result.matched_str(), greeting);

    //Taking the default engine and dropping it must leave existing patterns usable
    drop(Rosie::take_thread_default_engine());
    assert!(first_pat.match_str::<bool>(greeting).unwrap());

    //Install an explicitly created engine as the default; old and new patterns must both keep working
    let replacement_engine = RosieEngine::new(None).unwrap();
    Rosie::replace_thread_default_engine(replacement_engine);
    let second_pat = Rosie::compile(expression).unwrap();
    assert!(first_pat.match_str::<bool>(greeting).unwrap());
    assert!(second_pat.match_str::<bool>(greeting).unwrap());
}
761
#[test]
/// Tests the interfaces to explicitly manage RosieEngines
///
/// Walks one explicitly-created engine through: reading its config, reading and extending the
/// library paths, the memory limit accessors, compiling valid and invalid expressions, importing
/// dependencies and packages, raw and high-level matching, tracing, and loading packages from a
/// string and from a file.
///
/// Relies on the `test_rpl` directory under `CARGO_MANIFEST_DIR`, and on the `time`, `net` and
/// `date` packages being importable by the engine.
fn explicit_engine() {

    //Create the engine and check that it was successful
    let mut engine = RosieEngine::new(None).unwrap();

    //Make sure we can get the engine config
    let _ = engine.config_as_json().unwrap();

    //Check that we can get the library path, and then append a new path to it
    let mut lib_paths = engine.lib_paths().unwrap();
    let original_path_count = lib_paths.len();
    //println!("lib_paths[0] = {}", lib_paths[0].display());

    //Now append a new path to it
    //The directory is computed once here and reused by the checks further down
    let new_rpl_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("test_rpl");
    lib_paths.push(new_rpl_dir.clone());
    engine.set_lib_paths(&lib_paths).unwrap();

    //Make sure we can read it back and see the path we added, in addition to the original
    //The count is checked relative to what the engine started with, rather than assuming one default path
    let lib_paths = engine.lib_paths().unwrap();
    assert_eq!(lib_paths.len(), original_path_count + 1);
    assert!(lib_paths.contains(&new_rpl_dir));

    //Validate we can find packages in the new directory
    engine.import_expression_deps("rust_test_1.c_vegetables", None).unwrap();

    //Check the alloc limit, set it to unlimited, check the usage
    let _ = engine.mem_alloc_limit().unwrap();
    assert!(engine.set_mem_alloc_limit(0).is_ok());
    let _ = engine.mem_usage().unwrap();

    //Compile a valid rpl pattern, and confirm there is no error
    let pat = engine.compile("{[012][0-9]}", None).unwrap();

    //Make sure we can successfully free the pattern
    drop(pat);

    //Try to compile an invalid pattern (syntax error), and check the error and error message
    let mut message = RosieMessage::empty();
    let compile_result = engine.compile("year = bogus", Some(&mut message));
    assert!(compile_result.is_err());
    assert!(message.len() > 0);
    //println!("Compile Error: {}", message.as_str());

    //Try and import the dependencies for an invalid pattern, and check the error message
    let mut message = RosieMessage::empty();
    let import_result = engine.import_expression_deps("invalid.any", Some(&mut message));
    assert!(import_result.is_err());
    assert!(message.len() > 0);
    //println!("Import Error: {}", message.as_str());

    //Load the dependencies for a valid pattern
    engine.import_expression_deps("time.any", None).unwrap();

    //Recompile a pattern expression and match it against a matching input using match_pattern_raw
    let mut pat = engine.compile("{[012][0-9]}", None).unwrap();
    let raw_match_result = pat.raw_match(1, b"21", &MatchEncoder::Status).unwrap();
    //Validate that we can't access the pattern while our raw_match_result is in use.
    //TODO: Implement a TryBuild harness in order to ensure the two lines below will not compile together, although each will compile separately.
    // assert!(!pat.match_str::<bool>("35").unwrap());
    assert!(raw_match_result.did_match());
    assert!(raw_match_result.time_elapsed_matching() <= raw_match_result.time_elapsed_total()); //A little lame as tests go, but validates they are called at least.

    //Now try the match with the high-level match_str call
    let match_result : MatchResult = pat.match_str("21").unwrap();
    assert_eq!(match_result.pat_name_str(), "*");
    assert_eq!(match_result.matched_str(), "21");
    assert_eq!(match_result.start(), 1);
    assert_eq!(match_result.end(), 3);
    assert_eq!(match_result.sub_pat_count(), 0);

    //Try it against non-matching input, and make sure we get no match
    let match_result : MatchResult = pat.match_str("99").unwrap();
    assert!(!match_result.did_match());

    //Test the trace method, and make sure we get a reasonable result
    let mut trace = RosieMessage::empty();
    assert!(pat.trace(1, "21", TraceFormat::Condensed, &mut trace).is_ok());
    assert!(!trace.as_str().is_empty());
    //println!("{}", trace.as_str());

    //Test loading a package from a string
    let pkg_name = engine.load_pkg_from_str("package two_digit_year\n\nyear = {[012][0-9]}", None).unwrap();
    assert_eq!(pkg_name.as_str(), "two_digit_year");

    //Test loading a package from a file
    let rpl_file = new_rpl_dir.join("rust_test_2.rpl");
    let pkg_name = engine.load_pkg_from_file(rpl_file.to_str().unwrap(), None).unwrap();
    assert_eq!(pkg_name.as_str(), "rust_test_2");

    //Test importing a package
    let pkg_name = engine.import_pkg("net", None, None).unwrap();
    assert_eq!(pkg_name.as_str(), "net");

    //Test matching a pattern with some recursive sub-patterns
    engine.import_pkg("date", None, None).unwrap();
    let mut date_pat = engine.compile("date.us_long", None).unwrap();
    let match_result : MatchResult = date_pat.match_str("Saturday, Nov 5, 1955").unwrap();
    assert_eq!(match_result.pat_name_str(), "date.us_long");
    assert_eq!(match_result.matched_str(), "Saturday, Nov 5, 1955");
    assert_eq!(match_result.start(), 1);
    assert_eq!(match_result.end(), 22);
    assert_eq!(match_result.sub_pat_count(), 4);
    let sub_match_pat_names : Vec<&str> = match_result.sub_pat_iter().map(|result| result.pat_name_str()).collect();
    assert!(sub_match_pat_names.contains(&"date.day_name"));
    assert!(sub_match_pat_names.contains(&"date.month_name"));
    assert!(sub_match_pat_names.contains(&"date.day"));
    assert!(sub_match_pat_names.contains(&"date.year"));
    let sub_result = match_result.sub_pat_iter().find(|sub_result| sub_result.pat_name_str() == "date.month_name").unwrap();
    assert_eq!(sub_result.matched_str(), "Nov");
    assert_eq!(sub_result.start(), 11);
    assert_eq!(sub_result.end(), 14);

    //Verify that the RawMatchResults from two different compiled patterns don't interfere with each other
    //Also test the JSONPretty encoder while we're at it
    engine.import_expression_deps("time.any", None).unwrap();
    let mut time_pat = engine.compile("time.any", None).unwrap();
    let date_raw_match_result = date_pat.raw_match(1, b"Saturday, Nov 5, 1955", &MatchEncoder::JSONPretty).unwrap();
    let time_raw_match_result = time_pat.raw_match(1, b"2:21 am", &MatchEncoder::JSONPretty).unwrap();
    assert!(date_raw_match_result.as_str() != time_raw_match_result.as_str());
    //NOTE: I know these checks might break with perfectly legal changes to JSON formatting, but at least they
    // will flag it, so a human can take a look and ensure something more fundamental didn't break.
    assert_eq!(date_raw_match_result.as_str().len(), 660);
    assert_eq!(time_raw_match_result.as_str().len(), 453);
}
888
#[test]
/// Runs many threads at once, each of them compiling patterns and matching against them
fn thread_stress() {

    const NUM_THREADS : usize = 50;
    const NUM_ITERATIONS : usize = 50; //One compile happens per iteration
    const NUM_MATCHES : usize = 500; //How many matches are performed within each iteration

    //Expressions and inputs are paired by index: this test expects EXPRESSIONS[i] to match the
    // whole of INPUTS[i], and to not match any of the other inputs
    const EXPRESSIONS : [&str; 3] = ["{ [H][^]* }", "date.any", "time.any"];
    const INPUTS : [&str; 3] = ["Hello, Rosie!", "Saturday, Nov 5, 1955", "2:21 am"];

    //Collecting eagerly means every thread is spawned before any of them is joined
    let workers : Vec<_> = (0..NUM_THREADS).map(|thread_idx| {
        thread::spawn(move || {

            //Seeded from the thread index; non-cryptographic random used for repeatability
            let mut rng = Pcg64::seed_from_u64(thread_idx.try_into().unwrap());

            for _ in 0..NUM_ITERATIONS {

                let expr_idx : u8 = rng.gen_range(0..3);
                let pat = Rosie::compile(EXPRESSIONS[usize::from(expr_idx)]).unwrap();

                for _ in 0..NUM_MATCHES {

                    let input_idx : u8 = rng.gen_range(0..3);
                    let input = INPUTS[usize::from(input_idx)];

                    let result : MatchResult = pat.match_str(input).unwrap();

                    //Only the input paired with the compiled expression is expected to match
                    if expr_idx == input_idx {
                        assert_eq!(result.matched_str(), input);
                    } else {
                        assert!(!result.did_match());
                    }
                }
            }
        })
    }).collect();

    //Make sure every thread has a chance to finish
    for worker in workers {
        worker.join().unwrap();
    }
}
946
947}