#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "regexp_provider.h"
/*
 * Match limit applied to every regex compiled by this provider.
 * Written by tsrun_pcre2_provider() and read by pcre2_compile_fn(); it is a
 * plain process-wide variable, so the most recent provider configuration wins.
 */
static uint32_t g_match_limit = TSRUN_PCRE2_DEFAULT_MATCH_LIMIT;
/*
 * Per-thread scratch buffer holding the most recent error message.
 * Pointers handed out through `error_out` refer to this storage, so a message
 * is overwritten by the next error formatted on the same thread.
 */
static __thread char g_error_buffer[256];
static const char* format_pcre2_error(int errorcode) {
pcre2_get_error_message(errorcode, (PCRE2_UCHAR*)g_error_buffer, sizeof(g_error_buffer));
return g_error_buffer;
}
/**
 * Translate a regex flag string into PCRE2 compile options.
 *
 * PCRE2_UTF is always enabled. Recognised flags: 'i' (caseless),
 * 'm' (multiline), 's' (dotall), 'g' (global, reported separately) and
 * 'u' (accepted, no extra option). 'y' and any other character are rejected.
 *
 * @param flags          NUL-terminated flag string, or NULL for no flags.
 * @param options_out    Receives the PCRE2 option bits.
 * @param is_global_out  Receives 1 when 'g' is present, otherwise 0.
 * @return 0 on success; -1 on a rejected flag, with the reason written to
 *         g_error_buffer.
 */
static int parse_flags(const char *flags, uint32_t *options_out, int *is_global_out)
{
    *options_out = PCRE2_UTF;
    *is_global_out = 0;

    if (flags == NULL) {
        return 0;
    }

    for (size_t idx = 0; flags[idx] != '\0'; idx++) {
        const char flag = flags[idx];

        if (flag == 'i') {
            *options_out |= PCRE2_CASELESS;
        } else if (flag == 'm') {
            *options_out |= PCRE2_MULTILINE;
        } else if (flag == 's') {
            *options_out |= PCRE2_DOTALL;
        } else if (flag == 'g') {
            *is_global_out = 1;
        } else if (flag == 'u') {
            /* UTF mode is already on unconditionally; nothing to add. */
        } else if (flag == 'y') {
            snprintf(g_error_buffer, sizeof(g_error_buffer),
                     "sticky flag (y) is not supported by PCRE2 provider");
            return -1;
        } else {
            snprintf(g_error_buffer, sizeof(g_error_buffer),
                     "unknown regex flag: %c", flag);
            return -1;
        }
    }

    return 0;
}
/**
 * Opaque handle returned by pcre2_compile_fn() and released by pcre2_free_fn().
 */
typedef struct CompiledRegex {
    pcre2_code *code;               /* compiled pattern */
    pcre2_match_context *match_ctx; /* carries the match limit; may be NULL */
    int is_global;                  /* 1 when the 'g' flag was supplied */
    uint32_t capture_count;         /* value of PCRE2_INFO_CAPTURECOUNT */
} CompiledRegex;
static void* pcre2_compile_fn(
void* userdata,
const char* pattern,
const char* flags,
const char** error_out
) {
(void)userdata;
uint32_t options;
int is_global;
if (parse_flags(flags, &options, &is_global) < 0) {
*error_out = g_error_buffer;
return NULL;
}
int errorcode;
PCRE2_SIZE erroroffset;
pcre2_code* code = pcre2_compile(
(PCRE2_SPTR)pattern,
PCRE2_ZERO_TERMINATED,
options,
&errorcode,
&erroroffset,
NULL );
if (!code) {
char err_msg[200];
pcre2_get_error_message(errorcode, (PCRE2_UCHAR*)err_msg, sizeof(err_msg));
snprintf(g_error_buffer, sizeof(g_error_buffer),
"regex compile error at offset %zu: %s", (size_t)erroroffset, err_msg);
*error_out = g_error_buffer;
return NULL;
}
uint32_t capture_count;
pcre2_pattern_info(code, PCRE2_INFO_CAPTURECOUNT, &capture_count);
pcre2_match_context* match_ctx = pcre2_match_context_create(NULL);
if (match_ctx && g_match_limit > 0) {
pcre2_set_match_limit(match_ctx, g_match_limit);
}
CompiledRegex* handle = malloc(sizeof(CompiledRegex));
if (!handle) {
pcre2_code_free(code);
if (match_ctx) pcre2_match_context_free(match_ctx);
*error_out = "out of memory";
return NULL;
}
handle->code = code;
handle->match_ctx = match_ctx;
handle->is_global = is_global;
handle->capture_count = capture_count;
return handle;
}
/**
 * Match-test callback: report whether the pattern matches anywhere in `input`,
 * searching from offset 0.
 *
 * @param userdata   Unused.
 * @param handle     CompiledRegex produced by pcre2_compile_fn().
 * @param input      Subject text.
 * @param input_len  Length of `input` as passed to pcre2_match().
 * @param error_out  Receives an error message when -1 is returned.
 * @return 1 on a match, 0 when there is no match, -1 on error.
 */
static int pcre2_is_match_fn(
    void *userdata,
    void *handle,
    const char *input,
    size_t input_len,
    const char **error_out
)
{
    CompiledRegex *regex = (CompiledRegex *)handle;
    pcre2_match_data *scratch;
    int status;

    (void)userdata;

    scratch = pcre2_match_data_create_from_pattern(regex->code, NULL);
    if (scratch == NULL) {
        *error_out = "out of memory";
        return -1;
    }

    status = pcre2_match(regex->code, (PCRE2_SPTR)input, input_len,
                         0, 0, scratch, regex->match_ctx);

    /* Only the return code is needed, so the match data can go immediately. */
    pcre2_match_data_free(scratch);

    if (status >= 0) {
        return 1;
    }

    switch (status) {
    case PCRE2_ERROR_NOMATCH:
        return 0;

    case PCRE2_ERROR_MATCHLIMIT:
        snprintf(g_error_buffer, sizeof(g_error_buffer),
                 "regex match limit exceeded (possible catastrophic backtracking)");
        *error_out = g_error_buffer;
        return -1;

    default:
        *error_out = format_pcre2_error(status);
        return -1;
    }
}
/**
 * Find callback: locate the first match at or after `start_pos` and report the
 * overall match plus every ovector pair (entry 0 is the whole match).
 *
 * @param userdata   Unused.
 * @param handle     CompiledRegex produced by pcre2_compile_fn().
 * @param input      Subject text.
 * @param input_len  Length of `input` as passed to pcre2_match().
 * @param start_pos  Offset at which the search begins.
 * @param match_out  Filled in only when 1 is returned. `captures` is a
 *                   malloc'd array that the caller releases through
 *                   pcre2_free_captures_fn(); unset groups have start and
 *                   end set to -1.
 * @param error_out  Receives an error message when -1 is returned.
 * @return 1 on a match, 0 when there is no match, -1 on error.
 */
static int pcre2_find_fn(
    void *userdata,
    void *handle,
    const char *input,
    size_t input_len,
    size_t start_pos,
    TsRunRegexMatch *match_out,
    const char **error_out
) {
    (void)userdata;
    CompiledRegex *re = (CompiledRegex *)handle;

    /*
     * Nothing can match beyond the end of the subject. Report "no match"
     * instead of letting pcre2_match() reject the offset as an error.
     */
    if (start_pos > input_len) {
        return 0;
    }

    pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re->code, NULL);
    if (!match_data) {
        *error_out = "out of memory";
        return -1;
    }

    int rc = pcre2_match(
        re->code,
        (PCRE2_SPTR)input,
        input_len,
        start_pos,
        0,
        match_data,
        re->match_ctx);
    if (rc < 0) {
        pcre2_match_data_free(match_data);
        if (rc == PCRE2_ERROR_NOMATCH) {
            return 0;
        }
        if (rc == PCRE2_ERROR_MATCHLIMIT) {
            snprintf(g_error_buffer, sizeof(g_error_buffer),
                     "regex match limit exceeded (possible catastrophic backtracking)");
            *error_out = g_error_buffer;
        } else {
            *error_out = format_pcre2_error(rc);
        }
        return -1;
    }

    PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
    uint32_t ovector_count = pcre2_get_ovector_count(match_data);

    /*
     * Build the capture array before touching match_out, so an allocation
     * failure cannot leave the caller with a non-zero capture_count and a
     * NULL captures pointer.
     */
    TsRunRegexCapture *captures = NULL;
    if (ovector_count > 0) {
        captures = malloc(ovector_count * sizeof *captures);
        if (!captures) {
            pcre2_match_data_free(match_data);
            *error_out = "out of memory";
            return -1;
        }
        for (uint32_t i = 0; i < ovector_count; i++) {
            PCRE2_SIZE start = ovector[2 * i];
            PCRE2_SIZE end = ovector[2 * i + 1];
            if (start == PCRE2_UNSET) {
                /* Group did not participate in the match. */
                captures[i].start = -1;
                captures[i].end = -1;
            } else {
                captures[i].start = (intptr_t)start;
                captures[i].end = (intptr_t)end;
            }
        }
    }

    match_out->start = ovector[0];
    match_out->end = ovector[1];
    match_out->captures = captures;
    match_out->capture_count = ovector_count;

    pcre2_match_data_free(match_data);
    return 1;
}
/**
 * Free callback: release a CompiledRegex and the PCRE2 objects it owns.
 *
 * @param userdata  Unused.
 * @param handle    Handle from pcre2_compile_fn(); NULL is ignored.
 */
static void pcre2_free_fn(void *userdata, void *handle)
{
    CompiledRegex *regex = (CompiledRegex *)handle;

    (void)userdata;

    if (regex == NULL) {
        return;
    }

    if (regex->code != NULL) {
        pcre2_code_free(regex->code);
    }
    if (regex->match_ctx != NULL) {
        pcre2_match_context_free(regex->match_ctx);
    }
    free(regex);
}
/**
 * Capture-release callback: free a capture array.
 *
 * @param userdata  Unused.
 * @param captures  Array to release; passed straight to free().
 * @param count     Unused; the array is released in a single free() call.
 */
static void pcre2_free_captures_fn(
    void *userdata,
    TsRunRegexCapture *captures,
    size_t count
)
{
    (void)count;
    (void)userdata;

    free(captures);
}
TsRunRegexCallbacks tsrun_pcre2_provider(const TsRunPcre2Config* config) {
if (config && config->match_limit > 0) {
g_match_limit = config->match_limit;
} else {
g_match_limit = TSRUN_PCRE2_DEFAULT_MATCH_LIMIT;
}
TsRunRegexCallbacks callbacks = {
.compile = pcre2_compile_fn,
.is_match = pcre2_is_match_fn,
.find = pcre2_find_fn,
.free = pcre2_free_fn,
.free_captures = pcre2_free_captures_fn,
.userdata = NULL
};
return callbacks;
}