#include "tree_sitter/array.h"
#include "tree_sitter/parser.h"
#include "tsp_unicode.h"
#include "tsp_keywords.h"
#include "tsp_intuit_more.h"
#include "tsp_intuit_readline.h"
/*
 * Locate the first element of the NUL-terminated string `s` that compares
 * equal to `c`.
 *
 * Unlike the libc strchr(), `c` is NOT narrowed to char before comparing, so
 * a wide code point never aliases an ASCII byte. As with strchr(), asking for
 * c == 0 yields a pointer to the terminator.
 *
 * Returns a pointer into `s`, or a null pointer when nothing matches.
 */
static char *tsp_strchr(register const char *s, int c) {
    for (;; s++) {
        if (*s == c) {
            return (char *)s;
        }
        if (*s == '\0') {
            return (0);
        }
    }
}
/* Debug tracing switch. DEBUGGING is force-undefined here, so the #ifdef below
 * always takes the no-op branch unless this #undef is replaced by a #define. */
#undef DEBUGGING
#ifdef DEBUGGING
#include <stdio.h>
/* Prints a formatted trace line to stderr, prefixed with the source line number.
 * __VA_ARGS__ follows a comma, so at least one argument after fmt is required;
 * call sites with nothing to print pass a dummy 0. */
#define DEBUG(fmt, ...) fprintf(stderr, "scanner.c:%d DEBUG: " fmt, __LINE__, __VA_ARGS__)
#else
/* Tracing disabled: DEBUG() expands to nothing. */
#define DEBUG(fmt, ...)
#endif
/* True when the two C strings compare equal under strcmp(). */
#define streq(a, b) (strcmp(a, b) == 0)
#include <wctype.h>
/*
 * Token kinds this scanner can produce. Each enumerator is used both as an
 * index into the `valid_symbols` array passed to the scan function and as the
 * value stored in `lexer->result_symbol`.
 *
 * NOTE(review): presumably the order must match the grammar's list of external
 * tokens exactly -- confirm against the grammar before reordering or inserting.
 */
enum TokenType {
/* Opening delimiters of plain quotes, backticks and the match slash. */
TOKEN_APOSTROPHE,
TOKEN_DOUBLE_QUOTE,
TOKEN_BACKTICK,
TOKEN_SEARCH_SLASH,
/* Never emitted here; when valid it suppresses TOKEN_SEARCH_SLASH. */
NO_TOKEN_SEARCH_SLASH_PLZ,
TOKEN_OPEN_READLINE_BRACKET,
TOKEN_OPEN_FILEGLOB_BRACKET,
PERLY_SEMICOLON,
PERLY_HEREDOC,
TOKEN_CTRL_Z,
/* Quote-like constructs with an arbitrary delimiter. */
TOKEN_QUOTELIKE_BEGIN,
TOKEN_QUOTELIKE_MIDDLE_CLOSE,
TOKEN_QUOTELIKE_MIDDLE_SKIP,
TOKEN_QUOTELIKE_END_ZW,
TOKEN_QUOTELIKE_END,
TOKEN_Q_STRING_CONTENT,
TOKEN_QQ_STRING_CONTENT,
TOKEN_ESCAPE_SEQUENCE,
TOKEN_ESCAPED_DELIMITER,
TOKEN_DOLLAR_IN_REGEXP,
TOKEN_REGEXP_OPEN_BRACKET,
TOKEN_REGEXP_OPEN_BRACE,
TOKEN_POD,
TOKEN_GOBBLED_CONTENT,
TOKEN_ATTRIBUTE_VALUE_BEGIN,
TOKEN_ATTRIBUTE_VALUE,
TOKEN_PROTOTYPE,
TOKEN_SIGNATURE_START,
/* Heredoc introducer delimiters and body pieces. */
TOKEN_HEREDOC_DELIM,
TOKEN_COMMAND_HEREDOC_DELIM,
TOKEN_HEREDOC_START,
TOKEN_HEREDOC_MIDDLE,
TOKEN_HEREDOC_END,
TOKEN_FAT_COMMA_AUTOQUOTED,
TOKEN_FILETEST,
TOKEN_BRACE_AUTOQUOTED,
TOKEN_BRACE_END_ZW,
TOKEN_DOLLAR_IDENT_ZW,
TOKEN_NO_INTERP_WHITESPACE_ZW,
TOKEN_NONASSOC,
/* Recovery tokens emitted by EMIT_RECOVERY_TOKENS. */
TOKEN_RECOVER_PAREN_CLOSE,
TOKEN_RECOVER_BRACKET_CLOSE,
TOKEN_RECOVER_BRACE_CLOSE,
TOKEN_RECOVER_ARROW,
TOKEN_X_OP,
/* Never emitted; the scan function reads valid_symbols[TOKEN_ERROR] into its
 * is_ERROR flag, which disables most of its branches. */
TOKEN_ERROR
};
/* Number of code points a TSPString actually stores. */
#define MAX_TSPSTRING_LEN 8

/*
 * Fixed-capacity string of code points.
 *
 * `length` counts every code point ever pushed, even those beyond the
 * capacity; only the first MAX_TSPSTRING_LEN of them are kept in `contents`.
 */
typedef struct {
    int length;
    int32_t contents[MAX_TSPSTRING_LEN];
} TSPString;

/* Append `c`: the length always grows, the code point is stored only while
 * there is still room. */
static void tspstring_push(TSPString *s, int32_t c) {
    int slot = s->length;
    s->length = slot + 1;
    if (slot < MAX_TSPSTRING_LEN) {
        s->contents[slot] = c;
    }
}

/* Two strings are equal when their lengths match and their stored prefixes
 * (at most MAX_TSPSTRING_LEN code points) are identical. */
static bool tspstring_eq(TSPString *s1, TSPString *s2) {
    if (s1->length != s2->length) {
        return false;
    }
    int stored = s1->length;
    if (stored > MAX_TSPSTRING_LEN) {
        stored = MAX_TSPSTRING_LEN;
    }
    int i = 0;
    while (i < stored && s1->contents[i] == s2->contents[i]) {
        i++;
    }
    return i >= stored;
}

/* Make the string empty; the stored code points are left untouched. */
static void tspstring_reset(TSPString *s) {
    s->length = 0;
}
/*
 * Map an opening bracket character to its closing partner.
 * Returns 0 for any character that is not one of ( [ { <.
 */
static int32_t close_for_open(int32_t c) {
    static const int32_t pairs[][2] = {
        {'(', ')'},
        {'[', ']'},
        {'{', '}'},
        {'<', '>'},
    };
    for (unsigned i = 0; i < sizeof pairs / sizeof pairs[0]; i++) {
        if (pairs[i][0] == c) {
            return pairs[i][1];
        }
    }
    return 0;
}
/*
 * One active quote-like construct.
 *
 * open  - opening delimiter for bracket-style quotes, 0 when the same
 *         character opens and closes
 * close - closing delimiter
 * count - current nesting depth of `open` inside the body
 * body_leads_with_delim - set when the body itself starts with `open`
 *
 * The field order is part of the serialized layout (the struct is memcpy'd).
 */
typedef struct {
    int32_t open;
    int32_t close;
    int32_t count;
    bool body_leads_with_delim;
} TSPQuote;

/* Produce an all-zero quote record. */
static TSPQuote tspquote_new() {
    TSPQuote blank = {
        .open = 0,
        .close = 0,
        .count = 0,
        .body_leads_with_delim = false,
    };
    return blank;
}
/*
 * Progress of the heredoc at the front of the queue:
 *   HEREDOC_NONE     - no heredoc is pending
 *   HEREDOC_START    - a heredoc is queued and its body has not started yet
 *   HEREDOC_UNKNOWN  - inside the body, at a point where the next line still
 *                      has to be examined
 *   HEREDOC_CONTINUE - inside a body line that contained '$', '@' or '\\' in an
 *                      interpolating heredoc; the line is re-scanned piecewise
 *   HEREDOC_END      - the terminator line was recognised and the preceding
 *                      content token has been emitted; the end token is next
 */
enum HeredocState { HEREDOC_NONE, HEREDOC_START, HEREDOC_UNKNOWN, HEREDOC_CONTINUE, HEREDOC_END };
/* Capacity of the pending-heredoc queue. */
#define HEREDOC_QUEUE_MAX 8
/* One pending heredoc: its terminator text, whether the body interpolates, and
 * whether leading whitespace is skipped before the terminator is compared. */
typedef struct {
TSPString delim;
bool interpolates, indents;
} HeredocEntry;
/* Complete scanner state; everything here is written by the serialize function
 * and restored by the deserialize function. */
typedef struct {
/* Stack of active quote-like constructs, innermost last. */
Array(TSPQuote) quotes;
/* Pending heredocs in the order they were introduced; index 0 is the current one. */
HeredocEntry heredoc_queue[HEREDOC_QUEUE_MAX];
/* Number of valid entries in heredoc_queue. */
uint8_t heredoc_count;
enum HeredocState heredoc_state;
/* Set when the previous scan call emitted one of the TOKEN_RECOVER_* tokens. */
bool recovery_emitted;
} LexerState;
/*
 * Push a new quote record for a construct opened with `opener`.
 *
 * Bracket-style openers get their matching closer and remember the opener so
 * nesting can be tracked; any other delimiter closes with itself and has no
 * distinct opener (open == 0).
 */
static void lexerstate_push_quote(LexerState *state, int32_t opener) {
    TSPQuote quote = tspquote_new();
    int32_t partner = close_for_open(opener);
    if (partner) {
        quote.open = opener;
        quote.close = partner;
    } else {
        quote.open = 0;
        quote.close = opener;
    }
    quote.count = 0;
    array_push(&state->quotes, quote);
}
/*
 * Search the quote stack from the innermost entry outwards for one whose
 * opening delimiter is `check`.
 *
 * Returns the 1-based stack position of the match, or 0 when there is none.
 */
static int32_t lexerstate_is_quote_opener(LexerState *state, int32_t check) {
    int pos = (int)state->quotes.size;
    while (pos-- > 0) {
        TSPQuote *entry = array_get(&state->quotes, pos);
        if (entry->open != 0 && entry->open == check) {
            return pos + 1;
        }
    }
    return 0;
}
/* Record one more nested opener for the quote at 1-based position `idx`. */
static void lexerstate_saw_opener(LexerState *state, int32_t idx) {
    TSPQuote *entry = array_get(&state->quotes, idx - 1);
    entry->count += 1;
    DEBUG("Got a opener for %c, we are at %d \n", entry->open, entry->count);
}
/*
 * Search the quote stack from the innermost entry outwards for one whose
 * closing delimiter is `c`.
 *
 * Returns the 1-based stack position of the match, or 0 when there is none.
 */
static int32_t lexerstate_is_quote_closer(LexerState *state, int32_t c) {
    int pos = (int)state->quotes.size;
    while (pos-- > 0) {
        TSPQuote *entry = array_get(&state->quotes, pos);
        if (entry->close != 0 && entry->close == c) {
            return pos + 1;
        }
    }
    return 0;
}
/* Undo one level of nesting for the quote at 1-based position `idx`; a quote
 * whose nesting count is already zero is left alone. */
static void lexerstate_saw_closer(LexerState *state, int32_t idx) {
    TSPQuote *entry = array_get(&state->quotes, idx - 1);
    if (entry->count == 0) {
        return;
    }
    entry->count -= 1;
    DEBUG("Got a closer, we are at %d \n", entry->count);
}
/* True when the quote at 1-based position `idx` has no unmatched nested
 * openers, i.e. the next closer terminates it. */
static bool lexerstate_is_quote_closed(LexerState *state, int32_t idx) {
    return array_get(&state->quotes, idx - 1)->count == 0;
}
/* Remove the quote at 1-based position `idx` from the stack. */
static void lexerstate_pop_quote(LexerState *state, int32_t idx) {
    int32_t slot = idx - 1;
    array_erase(&state->quotes, slot);
}
/*
 * True when the innermost quote uses a bracket-style pair (distinct opening
 * and closing delimiters).
 *
 * Returns false when no quote is active: calling array_back() on an empty
 * stack would trip its bounds assertion (or read out of bounds when
 * assertions are compiled out). This mirrors the guard in
 * lexerstate_take_body_lead().
 */
static bool lexerstate_is_paired_delimiter(LexerState *state) {
    if (!state->quotes.size) {
        return false;
    }
    TSPQuote *q = array_back(&state->quotes);
    return !!q->open;
}
/*
 * One-shot check for "the body of the innermost quote begins with its own
 * opening delimiter".
 *
 * When the flag is set on the innermost quote and `c` is that quote's opener,
 * the flag is cleared and true is returned; in every other case (including an
 * empty stack) the state is untouched and the result is false.
 */
static bool lexerstate_take_body_lead(LexerState *state, int32_t c) {
    if (state->quotes.size == 0) {
        return false;
    }
    TSPQuote *top = array_back(&state->quotes);
    if (!top->body_leads_with_delim || c != top->open) {
        return false;
    }
    top->body_leads_with_delim = false;
    return true;
}
/* Address of the first queue slot, i.e. the heredoc currently being read.
 * The caller must check heredoc_count to know whether that slot is in use. */
static HeredocEntry *lexerstate_front_heredoc(LexerState *state) {
    HeredocEntry *queue = state->heredoc_queue;
    return queue;
}
/*
 * Queue a heredoc whose body is still to come.
 *
 * While there is room the entry is appended; once the queue is full the last
 * slot is overwritten instead. If the queue holds exactly one entry afterwards
 * the heredoc state becomes HEREDOC_START.
 */
static void lexerstate_add_heredoc(LexerState *state, TSPString *delim, bool interp, bool indent) {
    uint8_t slot;
    if (state->heredoc_count < HEREDOC_QUEUE_MAX) {
        slot = state->heredoc_count;
        state->heredoc_count++;
    } else {
        slot = HEREDOC_QUEUE_MAX - 1;
    }

    HeredocEntry *entry = &state->heredoc_queue[slot];
    entry->indents = indent;
    entry->interpolates = interp;
    entry->delim = *delim;

    if (state->heredoc_count == 1) {
        state->heredoc_state = HEREDOC_START;
    }
}
/*
 * Drop the heredoc at the front of the queue.
 *
 * The remaining entries move down one slot, the slot just past the new end is
 * zeroed, and the heredoc state becomes HEREDOC_START when more heredocs are
 * waiting or HEREDOC_NONE otherwise.
 */
static void lexerstate_finish_heredoc(LexerState *state) {
    HeredocEntry *queue = state->heredoc_queue;

    if (state->heredoc_count != 0) {
        state->heredoc_count--;
        for (int src = 1; src <= state->heredoc_count; src++) {
            queue[src - 1] = queue[src];
        }
    }
    queue[state->heredoc_count] = (HeredocEntry){0};

    if (state->heredoc_count > 0) {
        state->heredoc_state = HEREDOC_START;
    } else {
        state->heredoc_state = HEREDOC_NONE;
    }
}
/* Consume the current lookahead as part of the token (advance with skip=false),
 * trace it, and refresh the local `c` with the new lookahead. Requires
 * variables named `lexer` and `c` in the expanding scope. */
#define ADVANCE_C \
do { \
if (lexer->lookahead == '\r') \
DEBUG("> advance U+%04X = \\r\n", lexer->lookahead); \
else if (lexer->lookahead == '\n') \
DEBUG("> advance U+%04X = \\n\n", lexer->lookahead); \
else \
DEBUG("> advance U+%04X = '%c'\n", lexer->lookahead, lexer->lookahead); \
lexer->advance(lexer, false); \
c = lexer->lookahead; \
} while (0)
/* Store `type` in lexer->result_symbol and return true from the enclosing
 * function. */
#define TOKEN(type) \
do { \
DEBUG("token(%s)\n", #type); \
lexer->result_symbol = type; \
return true; \
} while (0)
/* Call lexer->mark_end() at the current position (and trace it). */
#define MARK_END \
do { \
lexer->mark_end(lexer); \
DEBUG("marking end of token\n", 0); \
} while (0)
/* Emit the first valid recovery token, trying arrow, paren-close,
 * bracket-close and then (only when `brace_ok` is true) brace-close. Emitting
 * sets state->recovery_emitted and returns from the enclosing function via
 * TOKEN(); when none is valid, execution falls through. Requires `state`,
 * `lexer` and `valid_symbols` in the expanding scope. */
#define EMIT_RECOVERY_TOKENS(brace_ok) do { \
if (valid_symbols[TOKEN_RECOVER_ARROW]) { state->recovery_emitted = true; TOKEN(TOKEN_RECOVER_ARROW); } \
if (valid_symbols[TOKEN_RECOVER_PAREN_CLOSE]) { state->recovery_emitted = true; TOKEN(TOKEN_RECOVER_PAREN_CLOSE); } \
if (valid_symbols[TOKEN_RECOVER_BRACKET_CLOSE]) { state->recovery_emitted = true; TOKEN(TOKEN_RECOVER_BRACKET_CLOSE); } \
if ((brace_ok) && valid_symbols[TOKEN_RECOVER_BRACE_CLOSE]) { state->recovery_emitted = true; TOKEN(TOKEN_RECOVER_BRACE_CLOSE); } \
} while(0)
/* Skip over a run of whitespace (advance with skip=true), stopping at the
 * first non-whitespace character or at a zero lookahead. */
static void skip_whitespace(TSLexer *lexer) {
    for (int32_t ch = lexer->lookahead; ch != 0; ch = lexer->lookahead) {
        if (!is_tsp_whitespace(ch)) {
            break;
        }
        lexer->advance(lexer, true);
    }
}
/*
 * Skip whitespace up to and including the first newline.
 *
 * Returns true when a '\n' was skipped; returns false when a non-whitespace
 * character or a zero lookahead is reached first.
 */
static bool skip_ws_to_eol(TSLexer *lexer) {
    for (;;) {
        int32_t ch = lexer->lookahead;
        if (ch == 0 || !is_tsp_whitespace(ch)) {
            return false;
        }
        lexer->advance(lexer, true);
        if (ch == '\n') {
            return true;
        }
    }
}
/* Forward declarations; both predicates are defined further down in this file. */
static bool isidfirst(int32_t c);
static bool isidcont(int32_t c);
/*
 * Outcome of peek_is_statement_keyword():
 *   PEEK_NO_MATCH    - the first character was rejected by
 *                      KEYWORD_FIRST_CHAR_FILTER; nothing was consumed
 *   PEEK_KEYWORD     - a keyword was recognised
 *   PEEK_FAT_COMMA   - the word is followed (after whitespace) by '='
 *   PEEK_NOT_KEYWORD - the word continues as an identifier, or a keyword that
 *                      requires a following name is not followed by one
 * NOTE(review): KEYWORD_MATCH lives in tsp_keywords.h and may produce some of
 * these results itself -- confirm there.
 */
enum PeekResult {
PEEK_NO_MATCH, PEEK_KEYWORD, PEEK_FAT_COMMA, PEEK_NOT_KEYWORD, };
/*
 * Look ahead to decide whether the input continues with a statement keyword.
 *
 * The lexer IS advanced while peeking (word characters with skip=false,
 * trailing whitespace with skip=true), so callers are expected to have marked
 * the token end beforehand.
 *
 * The word is collected into a 15-character buffer (longer words are
 * truncated) and handed to KEYWORD_MATCH.
 * NOTE(review): KEYWORD_FIRST_CHAR_FILTER, KEYWORD_WORD_CHAR and KEYWORD_MATCH
 * are defined in tsp_keywords.h; KEYWORD_MATCH presumably sets `needs_name`
 * and may return early for non-keywords -- confirm there. Local names are kept
 * as-is in case those macros refer to them.
 */
static enum PeekResult peek_is_statement_keyword(TSLexer *lexer) {
    int32_t la = lexer->lookahead;
    if (KEYWORD_FIRST_CHAR_FILTER(la)) {
        return PEEK_NO_MATCH;
    }

    char word[16];
    int len = 0;
    for (; KEYWORD_WORD_CHAR(la); la = lexer->lookahead) {
        if (len < 15) {
            word[len] = (char)la;
            len++;
        }
        lexer->advance(lexer, false);
    }
    word[len] = '\0';

    /* The word runs on as a longer identifier: not a keyword. */
    if (isidcont(la)) {
        return PEEK_NOT_KEYWORD;
    }

    bool needs_name = false;
    KEYWORD_MATCH(word, needs_name);

    for (; is_tsp_whitespace(la); la = lexer->lookahead) {
        lexer->advance(lexer, true);
    }

    if (la == '=') {
        return PEEK_FAT_COMMA;
    }
    if (needs_name && !isidfirst(la)) {
        return PEEK_NOT_KEYWORD;
    }
    return PEEK_KEYWORD;
}
/*
 * Consume characters for as long as they appear in `allow`.
 *
 * maxlen > 0 limits the number of characters consumed, maxlen < 0 means no
 * limit, and maxlen == 0 consumes nothing. A zero lookahead always stops the
 * scan.
 */
static void _skip_chars(TSLexer *lexer, int maxlen, const char *allow) {
    int32_t c = lexer->lookahead;
    while (maxlen != 0 && c != 0 && tsp_strchr(allow, c)) {
        ADVANCE_C;
        if (maxlen > 0) {
            maxlen--;
        }
    }
}
/* Convenience wrappers over _skip_chars for the common digit classes. */
#define skip_hexdigits(lexer, maxlen) _skip_chars(lexer, maxlen, "0123456789ABCDEFabcdef")
#define skip_digits(lexer, maxlen) _skip_chars(lexer, maxlen, "0123456789")
#define skip_octdigits(lexer, maxlen) _skip_chars(lexer, maxlen, "01234567")
/*
 * When the lookahead is '{', consume everything up to and including the next
 * '}' (or up to a zero lookahead, after which one more advance is still
 * issued). Does nothing for any other lookahead.
 */
static void skip_braced(TSLexer *lexer) {
    int32_t c = lexer->lookahead;
    if (c == '{') {
        do {
            ADVANCE_C;
        } while (c && c != '}');
        ADVANCE_C;
    }
}
/* True when `c` may begin an identifier: an underscore or anything accepted
 * by is_tsp_id_start(). */
static bool isidfirst(int32_t c) {
    if (c == '_') {
        return true;
    }
    return is_tsp_id_start(c);
}
/* True when `c` may continue an identifier: an underscore or anything accepted
 * by is_tsp_id_continue(). */
static bool isidcont(int32_t c) {
    if (c == '_') {
        return true;
    }
    return is_tsp_id_continue(c);
}
/*
 * True when `c` is one of the characters that interrupt plain string content
 * in an interpolating context: $ @ - [ { or backslash.
 *
 * Because tsp_strchr() matches the string terminator, c == 0 also yields
 * true; the heredoc body loop in the scan function uses this predicate as its
 * exit test.
 */
static bool is_interpolation_escape(int32_t c) {
    if (c >= 256) {
        return false;
    }
    return tsp_strchr("$@-[{\\", c) != 0;
}
/* Worst-case number of serialized bytes that do not depend on the quote stack,
 * term by term: the quote-count byte; the heredoc_state and heredoc_count
 * bytes; for every queue slot the interpolates and indents bytes plus the raw
 * TSPString; and the recovery_emitted byte. */
#define SER_FIXED_OVERHEAD \
(1 + \
1 + 1 + \
HEREDOC_QUEUE_MAX * (1 + 1 + (int)sizeof(TSPString)) + \
1 )
/* Largest number of quote records that still fits in the serialization buffer
 * next to the fixed overhead above. */
#define MAX_SERIALIZED_QUOTES \
((TREE_SITTER_SERIALIZATION_BUFFER_SIZE - SER_FIXED_OVERHEAD) / (int)sizeof(TSPQuote))
/*
 * Write the scanner state into `buffer` and return the number of bytes used.
 *
 * Layout, in order:
 *   1 byte   number of quote records (capped at MAX_SERIALIZED_QUOTES)
 *   n * sizeof(TSPQuote)  raw quote records, outermost first
 *   1 byte   heredoc_state
 *   1 byte   heredoc_count
 *   per queued heredoc: 1 byte interpolates, 1 byte indents, raw TSPString
 *   1 byte   recovery_emitted
 */
unsigned int tree_sitter_perl_external_scanner_serialize(void *payload, char *buffer) {
    LexerState *state = payload;
    char *out = buffer;

    size_t kept_quotes = state->quotes.size;
    if (kept_quotes > MAX_SERIALIZED_QUOTES) {
        kept_quotes = MAX_SERIALIZED_QUOTES;
    }
    *out++ = (char)kept_quotes;

    size_t quote_bytes = kept_quotes * sizeof(TSPQuote);
    if (kept_quotes > 0) {
        memcpy(out, state->quotes.contents, quote_bytes);
    }
    out += quote_bytes;

    *out++ = (char)state->heredoc_state;
    *out++ = (char)state->heredoc_count;

    for (uint8_t slot = 0; slot < state->heredoc_count; slot++) {
        HeredocEntry *entry = &state->heredoc_queue[slot];
        *out++ = (char)entry->interpolates;
        *out++ = (char)entry->indents;
        memcpy(out, &entry->delim, sizeof(TSPString));
        out += sizeof(TSPString);
    }

    *out++ = (char)state->recovery_emitted;
    return (unsigned int)(size_t)(out - buffer);
}
/*
 * Restore the scanner state from a buffer written by the serialize function.
 *
 * payload - the LexerState to overwrite
 * buffer  - serialized bytes; not read when length is 0
 * length  - number of bytes in buffer; 0 means "initial state"
 *
 * The quote stack is always discarded first. With length == 0 every piece of
 * state is reset, including recovery_emitted: leaving that flag untouched would
 * let a value set by an earlier scan leak into what is supposed to be the
 * pristine initial state. A non-empty buffer is trusted to have the layout the
 * serialize function produces; the counts read from it are clamped to the
 * capacities of the in-memory structures.
 */
void tree_sitter_perl_external_scanner_deserialize(void *payload, const char *buffer,
                                                   unsigned int length) {
    LexerState *state = payload;
    size_t size = 0;

    array_delete(&state->quotes);

    if (length == 0) {
        state->heredoc_count = 0;
        state->heredoc_state = HEREDOC_NONE;
        state->recovery_emitted = false;
        return;
    }

    size_t quote_count = (uint8_t)buffer[size++];
    if (quote_count > MAX_SERIALIZED_QUOTES) quote_count = MAX_SERIALIZED_QUOTES;
    if (quote_count > 0) {
        array_reserve(&state->quotes, quote_count);
        state->quotes.size = quote_count;
        memcpy(state->quotes.contents, &buffer[size], quote_count * sizeof(TSPQuote));
        size += quote_count * sizeof(TSPQuote);
    }

    state->heredoc_state = (enum HeredocState)buffer[size++];
    state->heredoc_count = (uint8_t)buffer[size++];
    if (state->heredoc_count > HEREDOC_QUEUE_MAX) state->heredoc_count = HEREDOC_QUEUE_MAX;
    for (uint8_t i = 0; i < state->heredoc_count; i++) {
        HeredocEntry *e = &state->heredoc_queue[i];
        e->interpolates = (bool)buffer[size++];
        e->indents = (bool)buffer[size++];
        memcpy(&e->delim, &buffer[size], sizeof(TSPString));
        size += sizeof(TSPString);
    }

    state->recovery_emitted = (bool)buffer[size++];
}
/*
 * Scan routine of the external scanner.
 *
 * payload       - the LexerState
 * lexer         - lexer interface (lookahead, advance, mark_end, get_column, eof)
 * valid_symbols - indexed by enum TokenType; true for each token acceptable at
 *                 this position
 *
 * Returns true after storing the recognised token in lexer->result_symbol (via
 * the TOKEN macro), or false when nothing was recognised.
 *
 * The checks below are tried strictly in order and many of them consume input
 * before deciding, so the order of the sections is significant.
 */
bool tree_sitter_perl_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
LexerState *state = payload;
// valid_symbols[TOKEN_ERROR] is cached here; most branches below are disabled when it is set.
bool is_ERROR = valid_symbols[TOKEN_ERROR];
bool skipped_whitespace = false;
bool crossed_newline = false;
// Remember whether the previous call emitted a recovery token, then clear the flag for this call.
bool recovery_emitted = state->recovery_emitted;
state->recovery_emitted = false;
int32_t c = lexer->lookahead;
// Gobbled content: everything up to end of input is one token.
if (!is_ERROR && valid_symbols[TOKEN_GOBBLED_CONTENT]) {
while (!lexer->eof(lexer)) ADVANCE_C;
TOKEN(TOKEN_GOBBLED_CONTENT);
}
// Emitted without consuming any input whenever it is valid.
if (!is_ERROR && valid_symbols[TOKEN_NONASSOC]) TOKEN(TOKEN_NONASSOC);
// 'x' directly followed by a digit: the token covers only the 'x'.
if (!is_ERROR && valid_symbols[TOKEN_X_OP] && c == 'x') {
ADVANCE_C;
if (c >= '0' && c <= '9') { MARK_END; TOKEN(TOKEN_X_OP); }
return false;
}
// ---- Heredoc body ----
if (valid_symbols[TOKEN_HEREDOC_MIDDLE] && !is_ERROR && state->heredoc_count > 0) {
HeredocEntry *front = lexerstate_front_heredoc(state);
DEBUG("Beginning heredoc contents\n", 0);
if (state->heredoc_state != HEREDOC_CONTINUE) {
// Line-by-line mode: read whole lines, looking for the terminator.
TSPString line = {0};
while (!lexer->eof(lexer)) {
tspstring_reset(&line);
// The terminator may only match at column 0, or when re-entering right after it was already recognised.
bool is_valid_start_pos =
state->heredoc_state == HEREDOC_END || lexer->get_column(lexer) == 0;
bool saw_escape = false;
DEBUG("Starting loop at col %d\n", lexer->get_column(lexer));
if (is_valid_start_pos && front->indents) {
DEBUG("Skipping initial whitespace in heredoc\n", 0);
skip_whitespace(lexer);
c = lexer->lookahead;
}
// Mark the token end before this line's text, so a content token returned below stops here.
MARK_END;
// Collect the line (without its line ending) into `line`.
while (c != '\n' && !lexer->eof(lexer)) {
if (c == '\r') {
ADVANCE_C;
// "\r\n" ends the line; a lone '\r' is kept as line content.
if (c == '\n') break;
tspstring_push(&line, '\r');
}
tspstring_push(&line, c);
if (c == '$' || c == '@' || c == '\\') saw_escape = true;
ADVANCE_C;
}
DEBUG("got length %d, want length %d\n", line.length, front->delim.length);
if (is_valid_start_pos && tspstring_eq(&line, &front->delim)) {
// First time the terminator is seen: emit the content before it and remember that the end is next.
if (state->heredoc_state != HEREDOC_END) {
state->heredoc_state = HEREDOC_END;
TOKEN(TOKEN_HEREDOC_MIDDLE);
}
// Second pass over the terminator: the end token covers it, and the heredoc is dequeued.
MARK_END;
lexerstate_finish_heredoc(state);
TOKEN(TOKEN_HEREDOC_END);
}
// A line with '$', '@' or '\\' in an interpolating heredoc is handed back to be re-scanned in continue mode.
if (saw_escape && front->interpolates) {
state->heredoc_state = HEREDOC_CONTINUE;
TOKEN(TOKEN_HEREDOC_MIDDLE);
}
// Consume the line terminator and go on with the next line.
ADVANCE_C;
}
} else {
// Continue mode: consume plain text up to the next interpolation character or the end of the line.
DEBUG("Entering heredoc continue mode\n", 0);
bool saw_chars = false;
while (1) {
// is_interpolation_escape() is also true for c == 0, which ends this loop when the lookahead is 0.
if (is_interpolation_escape(c)) {
MARK_END;
break;
}
if (c == '\n') {
MARK_END;
state->heredoc_state = HEREDOC_UNKNOWN;
TOKEN(TOKEN_HEREDOC_MIDDLE);
}
saw_chars = true;
ADVANCE_C;
}
if (saw_chars) TOKEN(TOKEN_HEREDOC_MIDDLE);
}
}
// Zero-width marker emitted when the lookahead is whitespace.
if (!is_ERROR && iswspace(c) && valid_symbols[TOKEN_NO_INTERP_WHITESPACE_ZW]) {
TOKEN(TOKEN_NO_INTERP_WHITESPACE_ZW);
}
// NOTE(review): `c` is not refreshed after this skip; until the whitespace block
// further down it still holds the lookahead from before the skip.
crossed_newline = skip_ws_to_eol(lexer);
// A queued heredoc starts once the scanner is at column 0.
if (valid_symbols[TOKEN_HEREDOC_START]) {
if (state->heredoc_state == HEREDOC_START && lexer->get_column(lexer) == 0) {
state->heredoc_state = HEREDOC_UNKNOWN;
TOKEN(TOKEN_HEREDOC_START);
}
}
// ---- Attribute values ----
if (!is_ERROR && valid_symbols[TOKEN_ATTRIBUTE_VALUE_BEGIN] && c == '(') {
TOKEN(TOKEN_ATTRIBUTE_VALUE_BEGIN);
}
if (!is_ERROR && valid_symbols[TOKEN_ATTRIBUTE_VALUE]) {
DEBUG("Attribute value started...\n", 0);
// Consume up to (not including) the ')' that is not balanced by an earlier '('.
int delimcount = 0;
while (!lexer->eof(lexer)) {
if (c == '\\') {
// Step over the backslash; the character after it is consumed by the ADVANCE_C at the bottom of the loop.
ADVANCE_C;
} else if (c == '(')
delimcount++;
else if (c == ')') {
if (delimcount)
delimcount--;
else {
break;
}
}
ADVANCE_C;
}
TOKEN(TOKEN_ATTRIBUTE_VALUE);
}
// Skip any remaining whitespace and resynchronise `c` with the lexer.
if (is_tsp_whitespace(c)) {
skipped_whitespace = true;
skip_whitespace(lexer);
c = lexer->lookahead;
}
// Character code 26 (Ctrl-Z).
if (c == 26 && valid_symbols[TOKEN_CTRL_Z]) TOKEN(TOKEN_CTRL_Z);
// ---- Recovery tokens and implicit semicolons ----
bool any_recovery_valid =
valid_symbols[TOKEN_RECOVER_ARROW] ||
valid_symbols[TOKEN_RECOVER_PAREN_CLOSE] ||
valid_symbols[TOKEN_RECOVER_BRACKET_CLOSE] ||
valid_symbols[TOKEN_RECOVER_BRACE_CLOSE] ||
valid_symbols[PERLY_SEMICOLON];
if (!is_ERROR) {
// At '}', ';' or end of input, close whatever is still open; the brace-close token is withheld in front of a real '}'.
if (c == '}' || c == ';' || lexer->eof(lexer)) {
EMIT_RECOVERY_TOKENS(c != '}');
}
}
if (valid_symbols[PERLY_SEMICOLON]) {
if (c == '}' || lexer->eof(lexer)) {
if (is_ERROR || !valid_symbols[TOKEN_BRACE_END_ZW]) {
DEBUG("Fake PERLY_SEMICOLON at end-of-scope\n", 0);
TOKEN(PERLY_SEMICOLON);
}
}
}
if (lexer->eof(lexer)) return false;
// After a newline or a previous recovery token, peek for a statement keyword.
// The token end is marked first because the peek advances the lexer.
if ((crossed_newline || recovery_emitted) && !is_ERROR && any_recovery_valid &&
!KEYWORD_FIRST_CHAR_FILTER(c)) {
MARK_END; enum PeekResult peek = peek_is_statement_keyword(lexer);
if (peek == PEEK_KEYWORD) {
DEBUG("keyword boundary\n", 0);
EMIT_RECOVERY_TOKENS(true);
if (valid_symbols[PERLY_SEMICOLON]) TOKEN(PERLY_SEMICOLON);
}
if (peek == PEEK_FAT_COMMA) {
// The word was already consumed by the peek; continue with the autoquote check at the '='.
MARK_END;
c = lexer->lookahead;
goto fat_comma_check;
}
if (peek == PEEK_NOT_KEYWORD)
return false;
}
// ---- '<': readline bracket, fileglob bracket, or the first half of '<<' ----
if (valid_symbols[TOKEN_OPEN_FILEGLOB_BRACKET] || valid_symbols[TOKEN_OPEN_READLINE_BRACKET] || valid_symbols[PERLY_HEREDOC]) {
if (c == '<') {
ADVANCE_C;
MARK_END;
if (c == '<') goto heredoc_token_handling;
// Collect what follows the '<'; non-ASCII characters are recorded as 0x7f.
char content[256];
size_t clen = 0;
if (c == '$') { content[clen++] = '$'; ADVANCE_C; }
while (isidcont(c)) {
if (clen < sizeof(content)) content[clen++] = (c < 0x80) ? (char)c : (char)0x7f;
ADVANCE_C;
}
// An optional '$' plus identifier characters closed by '>' is a readline bracket.
if (c == '>') TOKEN(TOKEN_OPEN_READLINE_BRACKET);
if (valid_symbols[TOKEN_OPEN_FILEGLOB_BRACKET]) {
while (c != '>' && c != '<' && c != ';' && c != '\n' &&
!lexer->eof(lexer)) {
if (clen < sizeof(content))
content[clen++] = (c < 0x80) ? (char)c : (char)0x7f;
ADVANCE_C;
}
if (c == '>') {
// Also collect the rest of the line after '>' and let tsp_is_fileglob() decide.
ADVANCE_C; char after[256];
size_t alen = 0;
while (alen < sizeof(after) && c != '\n' && !lexer->eof(lexer)) {
after[alen++] = (c < 0x80) ? (char)c : (char)0x7f;
ADVANCE_C;
}
if (tsp_is_fileglob(content, clen, after, alen)) {
lexerstate_push_quote(state, '<');
TOKEN(TOKEN_OPEN_FILEGLOB_BRACKET);
}
}
}
return false;
}
}
// Zero-width token when the lookahead is not '$' or '{' and either whitespace
// was skipped or the lookahead cannot continue an identifier; withheld before "::".
if (!is_ERROR && valid_symbols[TOKEN_DOLLAR_IDENT_ZW]) {
if (!tsp_strchr("${", c) && (skipped_whitespace || !isidcont(c))) {
if (c == ':') {
MARK_END;
ADVANCE_C;
if (c == ':') {
return false;
}
}
TOKEN(TOKEN_DOLLAR_IDENT_ZW);
}
}
// A single '/' opens a match; "//" is rejected here.
if ((valid_symbols[TOKEN_SEARCH_SLASH] && c == '/') &&
!valid_symbols[NO_TOKEN_SEARCH_SLASH_PLZ]) {
ADVANCE_C;
MARK_END;
if (c != '/') {
lexerstate_push_quote(state, '/');
TOKEN(TOKEN_SEARCH_SLASH);
}
return false;
}
// ---- Plain quote openers: each pushes a quote record ----
if (valid_symbols[TOKEN_APOSTROPHE] && c == '\'') {
ADVANCE_C;
lexerstate_push_quote(state, '\'');
TOKEN(TOKEN_APOSTROPHE);
}
if (valid_symbols[TOKEN_DOUBLE_QUOTE] && c == '"') {
ADVANCE_C;
lexerstate_push_quote(state, '"');
TOKEN(TOKEN_DOUBLE_QUOTE);
}
if (valid_symbols[TOKEN_BACKTICK] && c == '`') {
ADVANCE_C;
lexerstate_push_quote(state, '`');
TOKEN(TOKEN_BACKTICK);
}
// '$' followed by a closing delimiter, '(', ')' or '|' gets the dedicated regexp-dollar token.
if (valid_symbols[TOKEN_DOLLAR_IN_REGEXP] && c == '$') {
DEBUG("Dollar in regexp\n", 0);
ADVANCE_C;
if (lexerstate_is_quote_closer(state, c)) TOKEN(TOKEN_DOLLAR_IN_REGEXP);
switch (c) {
case '(':
case ')':
case '|':
TOKEN(TOKEN_DOLLAR_IN_REGEXP);
}
return false;
}
// ---- '[' or '{' inside a regexp ----
if ((valid_symbols[TOKEN_REGEXP_OPEN_BRACKET] && c == '[') ||
(valid_symbols[TOKEN_REGEXP_OPEN_BRACE] && c == '{')) {
// If this character is the body's leading delimiter, fall through to the later checks.
bool leading = lexerstate_take_body_lead(state, c);
if (leading) {
} else {
int32_t open = c;
int32_t close = (open == '[') ? ']' : '}';
// Copy the bracketed text (up to 256 bytes, non-ASCII recorded as 0x7f) for tsp_intuit_more().
char buf[256];
int n = 0;
buf[n++] = (char)open;
ADVANCE_C;
MARK_END;
while (n < (int)sizeof(buf) && c != 0 && !lexer->eof(lexer)) {
buf[n++] = (c < 0x80) ? (char)c : (char)0x7f;
if (c == close) break;
ADVANCE_C;
}
// When tsp_intuit_more() returns false the bracket is emitted as a regexp bracket token.
if (!tsp_intuit_more(buf, n)) {
int32_t qi = lexerstate_is_quote_opener(state, open);
if (qi) lexerstate_saw_opener(state, qi);
TOKEN(open == '[' ? TOKEN_REGEXP_OPEN_BRACKET : TOKEN_REGEXP_OPEN_BRACE);
}
return false;
}
}
// ---- POD: starts with '=' at column 0 and runs through a line starting with "=cut" ----
if (valid_symbols[TOKEN_POD]) {
int column = lexer->get_column(lexer);
if (column == 0 && c == '=') {
DEBUG("POD started...\n", 0);
static const char *cut_marker = "=cut";
// stage: -1 = inside a line, 0..4 = characters of "=cut" matched at line start,
// 5 = "=cut" followed by a space or tab, 6 = "=cut" followed by a newline.
int stage = -1;
while (!lexer->eof(lexer)) {
if (c == '\r')
;
else if (stage < 1 && c == '\n')
stage = 0;
else if (stage >= 0 && stage < 4 && c == cut_marker[stage])
stage++;
else if (stage == 4 && (c == ' ' || c == '\t'))
stage = 5;
else if (stage == 4 && c == '\n')
stage = 6;
else
stage = -1;
if (stage > 4) break;
ADVANCE_C;
}
// Unless the loop stopped on the newline itself, consume the rest of the current line.
if (stage < 6)
while (!lexer->eof(lexer)) {
if (c == '\n') break;
ADVANCE_C;
}
TOKEN(TOKEN_POD);
}
}
// Nothing below is attempted when valid_symbols[TOKEN_ERROR] is set.
if (is_ERROR) return false;
// ---- Heredoc delimiter following '<<' ----
if (valid_symbols[TOKEN_HEREDOC_DELIM] || valid_symbols[TOKEN_COMMAND_HEREDOC_DELIM]) {
bool should_indent = false;
bool should_interpolate = true;
TSPString delim = {0};
tspstring_reset(&delim);
// The '~' / '\\' / bare-identifier forms are only considered when no whitespace was skipped.
if (!skipped_whitespace) {
if (c == '~') {
ADVANCE_C;
should_indent = true;
}
if (c == '\\') {
ADVANCE_C;
should_interpolate = false;
}
if (isidfirst(c)) {
while (isidcont(c)) {
tspstring_push(&delim, c);
ADVANCE_C;
}
lexerstate_add_heredoc(state, &delim, should_interpolate, should_indent);
TOKEN(TOKEN_HEREDOC_DELIM);
}
}
if (should_indent) {
skip_whitespace(lexer);
c = lexer->lookahead;
}
// Quoted delimiter: single quotes disable interpolation; backticks yield the command-heredoc token.
if (should_interpolate && (c == '\'' || c == '"' || c == '`')) {
int delim_open = c;
should_interpolate = c != '\'';
ADVANCE_C;
while (c != delim_open && !lexer->eof(lexer)) {
if (c == '\\') {
// A backslash-escaped quote character contributes the quote itself; otherwise the backslash is kept.
int to_add = c;
ADVANCE_C;
if (c == delim_open) {
to_add = delim_open;
ADVANCE_C;
}
tspstring_push(&delim, to_add);
} else {
tspstring_push(&delim, c);
ADVANCE_C;
}
}
if (c == delim_open) {
ADVANCE_C;
lexerstate_add_heredoc(state, &delim, should_interpolate, should_indent);
if (delim_open == '`') TOKEN(TOKEN_COMMAND_HEREDOC_DELIM);
TOKEN(TOKEN_HEREDOC_DELIM);
}
}
}
// ---- Quote-like constructs ----
// Emitted when the innermost quote does not use a bracket-style pair.
if (valid_symbols[TOKEN_QUOTELIKE_MIDDLE_SKIP]) {
if (!lexerstate_is_paired_delimiter(state)) TOKEN(TOKEN_QUOTELIKE_MIDDLE_SKIP);
}
if (valid_symbols[TOKEN_QUOTELIKE_BEGIN]) {
int delim = c;
// '#' after whitespace is not accepted as a delimiter.
if (skipped_whitespace && c == '#') return false;
MARK_END;
ADVANCE_C;
// The end was marked before the advance, so this brace-end token is zero-width.
if (valid_symbols[TOKEN_BRACE_END_ZW] && delim == '}') {
TOKEN(TOKEN_BRACE_END_ZW);
}
MARK_END;
// When the middle-skip token is valid as well, the innermost quote is replaced rather than nested.
if (valid_symbols[TOKEN_QUOTELIKE_MIDDLE_SKIP] && state->quotes.size) {
lexerstate_pop_quote(state, state->quotes.size);
}
lexerstate_push_quote(state, delim);
// For bracket-style delimiters, look past whitespace to note whether the body starts with the same opener.
if (close_for_open(delim)) {
while (is_tsp_whitespace(c)) ADVANCE_C;
if (c == delim) array_back(&state->quotes)->body_leads_with_delim = true;
}
TOKEN(TOKEN_QUOTELIKE_BEGIN);
}
// ---- Backslash escapes (skipped when backslash itself is the closing delimiter) ----
if (c == '\\' &&
!(valid_symbols[TOKEN_QUOTELIKE_END] && lexerstate_is_quote_closer(state, '\\'))) {
ADVANCE_C;
int esc_c = c;
if (!is_tsp_whitespace(c)) ADVANCE_C;
if (valid_symbols[TOKEN_ESCAPED_DELIMITER]) {
if (lexerstate_is_quote_opener(state, esc_c) || lexerstate_is_quote_closer(state, esc_c)) {
MARK_END;
TOKEN(TOKEN_ESCAPED_DELIMITER);
}
}
if (valid_symbols[TOKEN_ESCAPE_SEQUENCE]) {
MARK_END;
if (esc_c == '\\') TOKEN(TOKEN_ESCAPE_SEQUENCE);
// When q-string content is valid, any other backslash pair is returned as plain content.
if (valid_symbols[TOKEN_Q_STRING_CONTENT]) {
TOKEN(TOKEN_Q_STRING_CONTENT);
}
// NOTE(review): the characters consumed by the skip_* helpers below come after
// the MARK_END above, with no later mark_end before TOKEN -- confirm that the
// resulting token extent is the intended one.
switch (esc_c) {
case 'x':
if (c == '{')
skip_braced(lexer);
else
skip_hexdigits(lexer, 2);
break;
case 'N':
skip_braced(lexer);
break;
case 'o':
skip_braced(lexer);
break;
case '0':
skip_octdigits(lexer, 3);
break;
default:
break;
}
TOKEN(TOKEN_ESCAPE_SEQUENCE);
}
}
// ---- String content ----
if (valid_symbols[TOKEN_Q_STRING_CONTENT] || valid_symbols[TOKEN_QQ_STRING_CONTENT]) {
bool is_qq = valid_symbols[TOKEN_QQ_STRING_CONTENT];
bool valid = false;
// Consume until a backslash, the closer that ends a quote, or (interpolating
// content only) an interpolation character; nested openers and closers are counted.
while (c) {
if (c == '\\') break;
int32_t quote_index = lexerstate_is_quote_opener(state, c);
if (quote_index)
lexerstate_saw_opener(state, quote_index);
else {
quote_index = lexerstate_is_quote_closer(state, c);
if (quote_index) {
if (lexerstate_is_quote_closed(state, quote_index)) {
break;
}
lexerstate_saw_closer(state, quote_index);
} else if (is_qq && is_interpolation_escape(c))
break;
}
valid = true;
ADVANCE_C;
}
// Only emit a content token when at least one character was consumed.
if (valid) {
if (is_qq)
TOKEN(TOKEN_QQ_STRING_CONTENT);
else
TOKEN(TOKEN_Q_STRING_CONTENT);
}
}
// Closer between two parts of a quote-like construct: consumed, but the quote record stays.
if (valid_symbols[TOKEN_QUOTELIKE_MIDDLE_CLOSE]) {
int32_t quote_index = lexerstate_is_quote_closer(state, c);
if (quote_index && lexerstate_is_quote_closed(state, quote_index)) {
ADVANCE_C;
TOKEN(TOKEN_QUOTELIKE_MIDDLE_CLOSE);
}
}
// Final closer: either the zero-width form (nothing consumed, record kept) or the real one (record popped).
if (valid_symbols[TOKEN_QUOTELIKE_END]) {
int32_t quote_index = lexerstate_is_quote_closer(state, c);
if (quote_index) {
if (valid_symbols[TOKEN_QUOTELIKE_END_ZW]) TOKEN(TOKEN_QUOTELIKE_END_ZW);
ADVANCE_C;
lexerstate_pop_quote(state, quote_index);
TOKEN(TOKEN_QUOTELIKE_END);
}
}
// ---- '(' : prototype or signature ----
if (c == '(' && (valid_symbols[TOKEN_PROTOTYPE] || valid_symbols[TOKEN_SIGNATURE_START])) {
ADVANCE_C;
lexer->mark_end(lexer);
int count = 0;
// Any identifier character inside the parentheses makes it a signature (the token
// is just the '('); otherwise the whole balanced group is a prototype.
while (!lexer->eof(lexer)) {
if (c == ')' && !count) {
ADVANCE_C;
break;
} else if (c == ')')
count--;
else if (c == '(')
count++;
else if (is_tsp_id_continue(c))
TOKEN(TOKEN_SIGNATURE_START);
ADVANCE_C;
}
lexer->mark_end(lexer);
TOKEN(TOKEN_PROTOTYPE);
}
// c1 keeps the first character for the two-character checks at the end.
int32_t c1 = c;
// '-' plus one filetest letter that is not followed by an identifier character.
if (c == '-' && valid_symbols[TOKEN_FILETEST]) {
ADVANCE_C;
if (tsp_strchr("rwxoRWXOezsfdlpSbctugkTBMAC", c)) {
ADVANCE_C;
if (!isidcont(c)) TOKEN(TOKEN_FILETEST);
}
return false;
}
// A single non-identifier character followed by optional whitespace and '}' is autoquoted.
if (valid_symbols[TOKEN_BRACE_AUTOQUOTED] && !isidfirst(c) &&
c > ' ' && c != '}' && c != '{' && c != '^' && c != '#' &&
!(c >= '0' && c <= '9')) {
ADVANCE_C;
MARK_END;
while (is_tsp_whitespace(c)) ADVANCE_C;
if (c == '}') TOKEN(TOKEN_BRACE_AUTOQUOTED);
return false;
}
// ---- Identifier autoquoted by a following "=>" or '}' ----
if (isidfirst(c) &&
(valid_symbols[TOKEN_FAT_COMMA_AUTOQUOTED] || valid_symbols[TOKEN_BRACE_AUTOQUOTED])) {
do {
ADVANCE_C;
} while (c && isidcont(c));
// The token, if any, ends after the identifier; the rest is lookahead only.
MARK_END;
// Look past whitespace and '#' comments (a comment is skipped until the column returns to 0).
while (is_tsp_whitespace(c) || c == '#') {
while (is_tsp_whitespace(c)) ADVANCE_C;
if (c == '#') {
ADVANCE_C;
while (lexer->get_column(lexer) && !lexer->eof(lexer)) ADVANCE_C;
}
if (lexer->eof(lexer)) return false;
}
// Also entered by goto from the keyword peek above, once the word and trailing whitespace are consumed.
fat_comma_check:
c1 = lexer->lookahead;
ADVANCE_C;
if (valid_symbols[TOKEN_FAT_COMMA_AUTOQUOTED]) {
if (c1 == '=' && c == '>') TOKEN(TOKEN_FAT_COMMA_AUTOQUOTED);
}
if (valid_symbols[TOKEN_BRACE_AUTOQUOTED]) {
if (c1 == '}') TOKEN(TOKEN_BRACE_AUTOQUOTED);
}
} else {
// ---- Two-character lookahead: "<<" heredoc introducer, or zero-width brace end ----
MARK_END;
ADVANCE_C;
int32_t c2 = c;
if (lexer->eof(lexer)) return false;
// True when (c1, c2) equal the two characters of the literal s.
#define EQ2(s) (c1 == s[0] && c2 == s[1])
if (EQ2("<<")) {
// Also entered by goto from the '<' handling above, with the second '<' as the current lookahead.
heredoc_token_handling:
DEBUG("checking if << is indeed a heredoc\n", 0);
ADVANCE_C;
MARK_END;
// "<<" directly followed by '\\', '~' or an identifier start is a heredoc ...
if (c == '\\' || c == '~' || isidfirst(c)) {
TOKEN(PERLY_HEREDOC);
}
// ... and so is "<<" followed, after optional whitespace, by a quote character.
skip_whitespace(lexer);
c = lexer->lookahead;
if (c == '\'' || c == '"' || c == '`') {
TOKEN(PERLY_HEREDOC);
}
return false;
}
if (valid_symbols[TOKEN_BRACE_END_ZW]) {
DEBUG("ZW-lookahead for brace-end in autoquote\n", 0);
// The end was marked before the first character was consumed, so this token is zero-width.
if (c1 == '}') TOKEN(TOKEN_BRACE_END_ZW);
}
}
return false;
}
/*
 * Allocate a zero-filled scanner state, initialise its quote stack and put it
 * into the initial state by deserializing an empty buffer. The returned
 * pointer is released by the matching destroy function.
 */
void *tree_sitter_perl_external_scanner_create() {
    LexerState *scanner = calloc(1, sizeof *scanner);
    array_init(&scanner->quotes);
    tree_sitter_perl_external_scanner_deserialize(scanner, NULL, 0);
    return scanner;
}
/* Release the quote stack's storage and then the scanner state itself. */
void tree_sitter_perl_external_scanner_destroy(void *payload) {
    LexerState *scanner = payload;
    array_delete(&scanner->quotes);
    free(scanner);
}