#include "prism/regexp.h"
#define PM_REGEXP_PARSE_DEPTH_MAX 4096
typedef struct {
pm_parser_t *parser;
const uint8_t *start;
const uint8_t *cursor;
const uint8_t *end;
bool extended_mode;
bool encoding_changed;
const pm_encoding_t *encoding;
pm_regexp_name_callback_t name_callback;
void *name_data;
pm_regexp_error_callback_t error_callback;
void *error_data;
} pm_regexp_parser_t;
static inline void
pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
parser->error_callback(start, end, message, parser->error_data);
}
static void
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
pm_string_t string;
pm_string_shared_init(&string, start, end);
parser->name_callback(&string, parser->name_data);
pm_string_free(&string);
}
static inline bool
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
return parser->cursor >= parser->end;
}
static inline bool
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
parser->cursor++;
return true;
}
return false;
}
static inline bool
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
parser->cursor++;
return true;
}
return false;
}
static bool
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
if (pm_regexp_char_is_eof(parser)) {
return false;
}
const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
if (end == NULL) {
return false;
}
parser->cursor = end + 1;
return true;
}
static bool
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
const uint8_t *savepoint = parser->cursor;
enum {
PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
while (1) {
if (parser->cursor >= parser->end) {
parser->cursor = savepoint;
return true;
}
switch (state) {
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
switch (*parser->cursor) {
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
parser->cursor++;
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
break;
case ',':
parser->cursor++;
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
break;
default:
parser->cursor = savepoint;
return true;
}
break;
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
switch (*parser->cursor) {
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
parser->cursor++;
break;
case ',':
parser->cursor++;
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
break;
case '}':
parser->cursor++;
return true;
default:
parser->cursor = savepoint;
return true;
}
break;
case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
switch (*parser->cursor) {
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
parser->cursor++;
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
break;
default:
parser->cursor = savepoint;
return true;
}
break;
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
switch (*parser->cursor) {
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
parser->cursor++;
break;
case '}':
parser->cursor++;
return true;
default:
parser->cursor = savepoint;
return true;
}
break;
}
}
return true;
}
static bool
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
while (!pm_regexp_char_is_eof(parser)) {
switch (*parser->cursor) {
case '*':
case '+':
case '?':
parser->cursor++;
break;
case '{':
parser->cursor++;
if (!pm_regexp_parse_range_quantifier(parser)) return false;
break;
default:
return true;
}
}
return true;
}
static bool
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
if (!pm_regexp_char_expect(parser, ':')) {
return false;
}
pm_regexp_char_accept(parser, '^');
return (
pm_regexp_char_find(parser, ':') &&
pm_regexp_char_expect(parser, ']') &&
pm_regexp_char_expect(parser, ']')
);
}
static bool
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
static bool
pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
pm_regexp_char_accept(parser, '^');
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
switch (*parser->cursor++) {
case '[':
pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
break;
case '\\':
if (!pm_regexp_char_is_eof(parser)) {
parser->cursor++;
}
break;
default:
break;
}
}
return pm_regexp_char_expect(parser, ']');
}
static bool
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
return false;
}
if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
parser->cursor++;
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
return true;
}
const uint8_t *reset = parser->cursor;
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
parser->cursor++;
if (pm_regexp_parse_posix_class(parser)) return true;
parser->cursor = reset;
}
return pm_regexp_parse_character_set(parser, depth);
}
static bool
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
typedef enum {
PM_REGEXP_OPTION_STATE_INVALID,
PM_REGEXP_OPTION_STATE_TOGGLEABLE,
PM_REGEXP_OPTION_STATE_ADDABLE,
PM_REGEXP_OPTION_STATE_ADDED,
PM_REGEXP_OPTION_STATE_REMOVED
} pm_regexp_option_state_t;
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
typedef struct {
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
} pm_regexp_options_t;
static void
pm_regexp_options_init(pm_regexp_options_t *options) {
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
}
static bool
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
switch (options->values[key]) {
case PM_REGEXP_OPTION_STATE_INVALID:
case PM_REGEXP_OPTION_STATE_REMOVED:
return false;
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
case PM_REGEXP_OPTION_STATE_ADDABLE:
options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
return true;
case PM_REGEXP_OPTION_STATE_ADDED:
return true;
}
}
return false;
}
static bool
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
switch (options->values[key]) {
case PM_REGEXP_OPTION_STATE_INVALID:
case PM_REGEXP_OPTION_STATE_ADDABLE:
return false;
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
case PM_REGEXP_OPTION_STATE_ADDED:
case PM_REGEXP_OPTION_STATE_REMOVED:
options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
return true;
}
}
return false;
}
static uint8_t
pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
return options->values[key];
}
return false;
}
static bool
pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
const uint8_t *group_start = parser->cursor;
pm_regexp_options_t options;
pm_regexp_options_init(&options);
if (pm_regexp_char_accept(parser, '?')) {
if (pm_regexp_char_is_eof(parser)) {
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
return false;
}
switch (*parser->cursor) {
case '#': { parser->cursor++;
if (pm_regexp_char_is_eof(parser)) {
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
return false;
}
if (parser->encoding_changed && parser->encoding->multibyte) {
bool escaped = false;
while (parser->cursor < parser->end) {
if (!escaped && *parser->cursor == ')') {
parser->cursor++;
return true;
}
size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
if (width == 0) return false;
escaped = (width == 1) && (*parser->cursor == '\\');
parser->cursor += width;
}
return false;
} else {
bool found = pm_regexp_char_find(parser, ')');
while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
found = pm_regexp_char_find(parser, ')');
}
return found;
}
}
case ':': case '=': case '!': case '>': case '~': parser->cursor++;
break;
case '<':
parser->cursor++;
if (pm_regexp_char_is_eof(parser)) {
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
return false;
}
switch (*parser->cursor) {
case '=': case '!': parser->cursor++;
break;
default: { const uint8_t *start = parser->cursor;
if (!pm_regexp_char_find(parser, '>')) {
return false;
}
if (parser->cursor - start == 1) {
pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
}
if (parser->name_callback != NULL) {
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
}
break;
}
}
break;
case '\'': { const uint8_t *start = ++parser->cursor;
if (!pm_regexp_char_find(parser, '\'')) {
return false;
}
if (parser->name_callback != NULL) {
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
}
break;
}
case '(': if (!pm_regexp_char_find(parser, ')')) {
return false;
}
break;
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
if (!pm_regexp_options_add(&options, *parser->cursor)) {
return false;
}
parser->cursor++;
}
if (pm_regexp_char_is_eof(parser)) {
return false;
}
if (*parser->cursor == ')') {
if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
parser->extended_mode = true;
}
parser->cursor++;
return true;
}
if (*parser->cursor != '-') break;
PRISM_FALLTHROUGH
case '-':
parser->cursor++;
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
if (!pm_regexp_options_remove(&options, *parser->cursor)) {
return false;
}
parser->cursor++;
}
if (pm_regexp_char_is_eof(parser)) {
return false;
}
if (*parser->cursor == ')') {
switch (pm_regexp_options_state(&options, 'x')) {
case PM_REGEXP_OPTION_STATE_ADDED:
parser->extended_mode = true;
break;
case PM_REGEXP_OPTION_STATE_REMOVED:
parser->extended_mode = false;
break;
}
parser->cursor++;
return true;
}
break;
default:
parser->cursor++;
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
break;
}
}
bool extended_mode = parser->extended_mode;
switch (pm_regexp_options_state(&options, 'x')) {
case PM_REGEXP_OPTION_STATE_ADDED:
parser->extended_mode = true;
break;
case PM_REGEXP_OPTION_STATE_REMOVED:
parser->extended_mode = false;
break;
}
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
parser->extended_mode = extended_mode;
return false;
}
pm_regexp_char_accept(parser, '|');
}
parser->extended_mode = extended_mode;
if (pm_regexp_char_expect(parser, ')')) return true;
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
return false;
}
static bool
pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
switch (*parser->cursor) {
case '^':
case '$':
parser->cursor++;
return pm_regexp_parse_quantifier(parser);
case '\\':
parser->cursor++;
if (!pm_regexp_char_is_eof(parser)) {
parser->cursor++;
}
return pm_regexp_parse_quantifier(parser);
case '(':
parser->cursor++;
return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
case '[':
parser->cursor++;
return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
case '*':
case '?':
case '+':
parser->cursor++;
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
return true;
case ')':
parser->cursor++;
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
return true;
case '#':
if (parser->extended_mode) {
if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
return true;
}
PRISM_FALLTHROUGH
default: {
size_t width;
if (!parser->encoding_changed) {
width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
} else {
width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
}
if (width == 0) return false; parser->cursor += width;
return pm_regexp_parse_quantifier(parser);
}
}
}
static bool
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
return false;
}
if (!pm_regexp_parse_item(parser, depth)) {
return false;
}
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
if (!pm_regexp_parse_item(parser, depth)) {
return false;
}
}
return true;
}
static bool
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
do {
if (pm_regexp_char_is_eof(parser)) return true;
if (!pm_regexp_parse_expression(parser, 0)) return false;
} while (pm_regexp_char_accept(parser, '|'));
return pm_regexp_char_is_eof(parser);
}
PRISM_EXPORTED_FUNCTION void
pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
.parser = parser,
.start = source,
.cursor = source,
.end = source + size,
.extended_mode = extended_mode,
.encoding_changed = parser->encoding_changed,
.encoding = parser->encoding,
.name_callback = name_callback,
.name_data = name_data,
.error_callback = error_callback,
.error_data = error_data
});
}