#include "tree_sitter/array.h"
#include "tree_sitter/parser.h"
#include <string.h>
#include <wchar.h>
#include <wctype.h>
/*
 * Kinds of token this external scanner can produce.
 *
 * Each constant is used as an index into the `valid_symbols` array passed to
 * scan() and as the value stored in `lexer->result_symbol`, so the numeric
 * values (0..11, in declaration order) must not change.
 */
enum TokenType {
    /* Reported by scan() when the lookahead is '?' followed by '>'. */
    AUTOMATIC_SEMICOLON = 0,
    /* String content scanned with is_heredoc = false, is_execution_string = false. */
    ENCAPSED_STRING_CHARS,
    /* Same, but scanned with is_after_variable = true. */
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE,
    /* String content scanned with is_execution_string = true (ends at '`'). */
    EXECUTION_STRING_CHARS,
    /* Same, but scanned with is_after_variable = true. */
    EXECUTION_STRING_CHARS_AFTER_VARIABLE,
    /* String content scanned with is_heredoc = true. */
    ENCAPSED_STRING_CHARS_HEREDOC,
    /* Same, but scanned with is_after_variable = true. */
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC,
    /* Reported when the lexer is at end of input after whitespace/comments. */
    EOF_TOKEN,
    /* Opening delimiter word; pushes an entry on the heredoc stack. */
    HEREDOC_START,
    /* Closing delimiter word; pops the innermost heredoc. */
    HEREDOC_END,
    /* One line of content scanned by scan_nowdoc_string(). */
    NOWDOC_STRING,
    /* When this symbol is valid, scan() treats it as error recovery and produces nothing. */
    SENTINEL_ERROR,
};
// Growable array of 32-bit values; in this file it holds the lexer lookahead
// characters that make up a heredoc delimiter word.
typedef Array(int32_t) String;
/*
 * Report whether two Strings hold the same sequence of values.
 *
 * Returns true when both have the same length and identical contents.
 * Two empty strings compare equal without touching their `contents`
 * pointers, so memcmp() is only ever called with a non-zero length.
 */
static inline bool string_eq(String *self, String *other) {
    const uint32_t length = self->size;

    if (length != other->size) {
        return false;
    }
    if (length == 0) {
        return true;
    }
    return 0 == memcmp(self->contents, other->contents, length * sizeof(self->contents[0]));
}
// One open heredoc/nowdoc, as tracked on the scanner's stack.
typedef struct {
// Saved and restored by serialize()/deserialize() and cleared by reset_heredoc().
// NOTE(review): nothing in the visible code ever sets this to true or reads it
// to make a decision — confirm whether it is still needed.
bool end_word_indentation_allowed;
// The delimiter word that opened the heredoc; the closing tag must match it.
String word;
} Heredoc;
/*
 * Brace initializer for an empty Heredoc: indentation flag cleared and an
 * empty delimiter word.
 *
 * Intended for use in a declaration, e.g. `Heredoc h = heredoc_new();`.
 * The expansion intentionally carries no trailing semicolon; the caller's own
 * `;` terminates the declaration, so no stray empty statement is produced.
 */
#define heredoc_new()                          \
    {                                          \
        .end_word_indentation_allowed = false, \
        .word = array_new(),                   \
    }
// Persistent state of the external scanner (the `payload` of the external_scanner_* functions).
typedef struct {
// Reset to false by scan() and deserialize().
// NOTE(review): never read in the visible code — confirm whether it is still used.
bool has_leading_whitespace;
// Stack of currently open heredocs; the last element is the innermost one.
Array(Heredoc) heredocs;
} Scanner;
// Result codes for a content scan.
// NOTE(review): not referenced anywhere in the visible part of this file.
typedef enum { Error, End } ScanContentResult;
/*
 * Return a Heredoc to its empty state: clear the indentation flag and release
 * the storage held by its delimiter word.
 */
static inline void reset_heredoc(Heredoc *heredoc) {
    heredoc->end_word_indentation_allowed = false;
    array_delete(&heredoc->word);
}
/* Move the lexer forward one character, passing `false` as the skip flag. */
static inline void advance(TSLexer *lexer) {
    const bool skip_character = false;
    lexer->advance(lexer, skip_character);
}
/* Move the lexer forward one character, passing `true` as the skip flag. */
static inline void skip(TSLexer *lexer) {
    const bool skip_character = true;
    lexer->advance(lexer, skip_character);
}
/*
 * Write the heredoc stack into `buffer`.
 *
 * Layout:
 *   1 byte                  number of open heredocs
 *   then, per heredoc:
 *     1 byte                end_word_indentation_allowed
 *     sizeof(uint32_t)      number of elements in the delimiter word
 *     N * sizeof(int32_t)   the delimiter word itself (omitted when N == 0)
 *
 * Returns the number of bytes written, or 0 if the next entry would reach
 * TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
 */
static unsigned serialize(Scanner *scanner, char *buffer) {
    unsigned written = 0;

    buffer[written] = (char)scanner->heredocs.size;
    written += 1;

    for (unsigned idx = 0; idx < scanner->heredocs.size; idx++) {
        Heredoc *entry = &scanner->heredocs.contents[idx];
        unsigned payload_bytes = entry->word.size * sizeof(entry->word.contents[0]);

        // 5 = one flag byte plus the four-byte word length.
        if (written + 5 + payload_bytes >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
            return 0;
        }

        buffer[written] = (char)entry->end_word_indentation_allowed;
        written += 1;

        memcpy(buffer + written, &entry->word.size, sizeof(uint32_t));
        written += sizeof(uint32_t);

        if (entry->word.size == 0) {
            continue;
        }
        memcpy(buffer + written, entry->word.contents, payload_bytes);
        written += payload_bytes;
    }

    return written;
}
/*
 * Restore the scanner state from a buffer previously produced by serialize().
 *
 * Expected layout (mirrors serialize()):
 *   1 byte                  number of open heredocs
 *   then, per heredoc:
 *     1 byte                end_word_indentation_allowed
 *     sizeof(uint32_t)      number of elements in the delimiter word
 *     N * sizeof(int32_t)   the delimiter word itself (absent when N == 0)
 *
 * Any state held before the call is discarded. A `length` of 0 leaves the
 * scanner with an empty heredoc stack. After the call the stack holds exactly
 * as many entries as the buffer describes.
 */
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    unsigned size = 0;
    scanner->has_leading_whitespace = false;

    // Release every delimiter word, then empty the stack itself. Without the
    // truncation, entries left over from a deeper previous state would remain
    // on the stack with empty words and be mistaken for open heredocs.
    for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
        reset_heredoc(array_get(&scanner->heredocs, i));
    }
    scanner->heredocs.size = 0;

    if (length == 0) {
        return;
    }

    uint8_t open_heredoc_count = (uint8_t)buffer[size++];
    for (unsigned i = 0; i < open_heredoc_count; i++) {
        Heredoc new_heredoc = heredoc_new();
        array_push(&scanner->heredocs, new_heredoc);
        Heredoc *heredoc = array_back(&scanner->heredocs);

        heredoc->end_word_indentation_allowed = buffer[size++];

        // Read the length into a local first so word.size is only updated
        // once storage for that many elements actually exists.
        uint32_t word_length = 0;
        memcpy(&word_length, &buffer[size], sizeof(word_length));
        size += sizeof(word_length);

        if (word_length > 0) {
            unsigned word_size = word_length * sizeof(heredoc->word.contents[0]);
            array_reserve(&heredoc->word, word_length);
            memcpy(heredoc->word.contents, &buffer[size], word_size);
            heredoc->word.size = word_length;
            size += word_size;
        }
    }

    // Every byte of the buffer must have been consumed.
    assert(size == length);
}
/*
 * Consume whitespace and `//` line comments using advance().
 *
 * Returns true once the lookahead is neither whitespace nor '/'.
 * Returns false when a '/' is found that is not followed by a second '/';
 * that first '/' has already been consumed at that point.
 */
static inline bool scan_whitespace(TSLexer *lexer) {
    for (;;) {
        while (iswspace(lexer->lookahead)) {
            advance(lexer);
        }

        if (lexer->lookahead != '/') {
            return true;
        }
        advance(lexer);

        if (lexer->lookahead != '/') {
            return false;
        }
        advance(lexer);

        // Line comment: run to the end of the line (or to a 0 lookahead),
        // then go around again to eat the newline and whatever follows.
        while (lexer->lookahead != 0 && lexer->lookahead != '\n') {
            advance(lexer);
        }
    }
}
/*
 * Report whether the lookahead may appear in a name: an underscore, any value
 * at or above 0x80, or anything iswalnum() accepts.
 */
static inline bool is_valid_name_char(TSLexer *lexer) {
    const int32_t current = lexer->lookahead;

    if (current == '_' || current >= 0x80) {
        return true;
    }
    return iswalnum(current);
}
/*
 * Report whether the lookahead (the character right after a backslash, as
 * called from scan_encapsed_part_string) begins an escape sequence.
 *
 * Side effect: for 'x' the lexer is advanced past the 'x' before the next
 * character is tested for being a hex digit.
 */
static inline bool is_escapable_sequence(TSLexer *lexer) {
    const int32_t current = lexer->lookahead;

    switch (current) {
        case 'n':
        case 'r':
        case 't':
        case 'v':
        case 'e':
        case 'f':
        case '\\':
        case '$':
        case '"':
        case 'u': // accepted without inspecting what follows
            return true;

        case 'x':
            advance(lexer);
            return iswxdigit(lexer->lookahead);

        default:
            // Octal digit. Every character in '0'..'7' is a decimal digit, so
            // the range test alone is equivalent to also requiring iswdigit().
            return current >= '0' && current <= '7';
    }
}
/*
 * Consume the longest run of name characters at the lexer position and return
 * it as a newly built String (empty if the lookahead is not a name character).
 * The caller owns the result and releases it with array_delete().
 */
static String scan_heredoc_word(TSLexer *lexer) {
    String word = array_new();

    for (; is_valid_name_char(lexer); advance(lexer)) {
        array_push(&word, lexer->lookahead);
    }
    return word;
}
/*
 * Scan one chunk of content for the innermost open heredoc, up to (but not
 * including) the next '\n' or '\r'.
 *
 * Returns false when no heredoc is open, when the chunk turns out to be the
 * closing tag (the delimiter word followed by whitespace, ';', ',' or ')'),
 * or when end of input is reached before a line break. Otherwise returns
 * whether at least one character was consumed; the token end is marked just
 * before the line break.
 */
static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) {
    if (scanner->heredocs.size == 0) {
        return false;
    }

    bool consumed_any = false;

    // Leading whitespace of any kind, line breaks included.
    while (iswspace(lexer->lookahead)) {
        advance(lexer);
        consumed_any = true;
    }

    // Match as much of the delimiter word as possible.
    const String *tag = &array_back(&scanner->heredocs)->word;
    uint32_t matched = 0;
    while (matched < tag->size && lexer->lookahead == tag->contents[matched]) {
        advance(lexer);
        consumed_any = true;
        matched++;
    }

    // It only counts as the end tag if the whole word matched and is followed
    // by whitespace or one of ; , )
    bool at_end_tag = false;
    if (tag->size > 0 && matched == tag->size) {
        const int32_t next = lexer->lookahead;
        at_end_tag = iswspace(next) || next == ';' || next == ',' || next == ')';
    }

    if (at_end_tag) {
        // Skip horizontal whitespace after the tag.
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
            advance(lexer);
            consumed_any = true;
        }
        switch (lexer->lookahead) {
            case ';':
            case ',':
            case ')':
            case '\n':
            case '\r':
                return false;
            default:
                break;
        }
    }

    // Consume the rest of the line as content.
    bool has_content = consumed_any;
    for (;;) {
        lexer->mark_end(lexer);

        if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
            return has_content;
        }
        if (lexer->eof(lexer)) {
            return false;
        }
        advance(lexer);
        has_content = true;
    }
}
// Scan a run of literal characters inside an interpolating string.
//
// Parameters:
//   is_after_variable    - when true, a leading "->name" or '[' ends the run
//                          immediately; cleared after the first character that
//                          does not end the run.
//   is_heredoc           - content belongs to a heredoc body: the closing tag is
//                          checked first and the run ends at '\n' or '\r'.
//   is_execution_string  - content belongs to a backtick string: the run ends at '`'.
//
// Returns true if the run is non-empty; the token end is wherever mark_end() was
// last called, i.e. just before the character that ended the run. Returns false
// for an empty run, at end of input, or when a heredoc closing tag is found.
static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc,
bool is_execution_string) {
bool has_consumed_content = false;
if (is_heredoc && scanner->heredocs.size > 0) {
// Skip horizontal whitespace (not line breaks) before a possible closing tag.
while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
advance(lexer);
has_consumed_content = true;
}
String heredoc_tag = array_back(&scanner->heredocs)->word;
bool end_tag_matched = false;
// Match the delimiter word character by character; stop at the first mismatch.
for (uint32_t i = 0; i < heredoc_tag.size; i++) {
if (lexer->lookahead != heredoc_tag.contents[i]) {
break;
}
has_consumed_content = true;
advance(lexer);
// Only the last iteration can set this: the whole word matched and is
// followed by whitespace or one of ; , )
end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
lexer->lookahead == ',' || lexer->lookahead == ')'));
}
if (end_tag_matched) {
while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
advance(lexer);
has_consumed_content = true;
}
// This is the closing tag, not content: produce no token here.
if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' ||
lexer->lookahead == '\n' || lexer->lookahead == '\r') {
return false;
}
}
}
// Main loop. Anything consumed above already counts as content. Each pass marks
// the token end before inspecting the lookahead, so a `return has_content`
// excludes the character(s) examined in that pass.
for (bool has_content = has_consumed_content;; has_content = true) {
lexer->mark_end(lexer);
switch (lexer->lookahead) {
case '"':
// Ends a plain double-quoted string; literal in heredoc and backtick strings.
if (!is_heredoc && !is_execution_string) {
return has_content;
}
advance(lexer);
break;
case '`':
// Ends a backtick string; literal elsewhere.
if (is_execution_string) {
return has_content;
}
advance(lexer);
break;
case '\n':
case '\r':
// Heredoc content is produced one line at a time.
if (is_heredoc) {
return has_content;
}
advance(lexer);
break;
case '\\':
advance(lexer);
// "\{" is kept as literal content.
if (lexer->lookahead == '{') {
advance(lexer);
break;
}
if (is_execution_string && lexer->lookahead == '`') {
return has_content;
}
// In a heredoc, "\\" is consumed as literal content.
if (is_heredoc && lexer->lookahead == '\\') {
advance(lexer);
break;
}
// A real escape sequence ends the run before the backslash.
// Note: is_escapable_sequence() may advance past an 'x'.
if (is_escapable_sequence(lexer)) {
return has_content;
}
// Otherwise the backslash is content; the next pass handles what follows it.
break;
case '$':
advance(lexer);
// '$' followed by a non-digit name character or '{' ends the run before the '$'.
if ((is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) || lexer->lookahead == '{') {
return has_content;
}
break;
case '-':
// Right after a variable, "->" followed by a name character ends the run.
if (is_after_variable) {
advance(lexer);
if (lexer->lookahead == '>') {
advance(lexer);
if (is_valid_name_char(lexer)) {
return has_content;
}
break;
}
break;
}
// Not after a variable: falls through into the '[' case below, where
// is_after_variable is false, so the '-' is simply consumed as content.
case '[':
// Right after a variable, '[' ends the run.
if (is_after_variable) {
return has_content;
}
advance(lexer);
break;
case '{':
advance(lexer);
// "{$" ends the run before the '{'.
if (lexer->lookahead == '$') {
return has_content;
}
break;
default:
// End of input yields no token, even if content was consumed.
if (lexer->eof(lexer)) {
return false;
}
advance(lexer);
}
// Only the very first character can be "after a variable".
is_after_variable = false;
}
return false;
}
/*
 * Main scanning routine: try to produce one external token.
 *
 * `valid_symbols` is indexed by enum TokenType. The first applicable token
 * kind, in the order below, decides the outcome; later kinds are not tried
 * once an earlier one has been selected, even if that attempt fails.
 *
 *   1. nothing at all when SENTINEL_ERROR is valid
 *   2. the six string-content kinds (see the table)
 *   3. NOWDOC_STRING
 *   4. HEREDOC_END
 *   5. after whitespace and line comments have been consumed:
 *      EOF_TOKEN, HEREDOC_START, AUTOMATIC_SEMICOLON
 *
 * Returns true when a token was recognised; `lexer->result_symbol` then holds
 * its kind.
 */
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    // String-content token kinds in priority order, with the flags handed to
    // scan_encapsed_part_string() for each.
    static const struct {
        enum TokenType symbol;
        bool after_variable;
        bool heredoc;
        bool execution_string;
    } string_parts[] = {
        {ENCAPSED_STRING_CHARS_AFTER_VARIABLE, true, false, false},
        {ENCAPSED_STRING_CHARS, false, false, false},
        {EXECUTION_STRING_CHARS_AFTER_VARIABLE, true, false, true},
        {EXECUTION_STRING_CHARS, false, false, true},
        {ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC, true, true, false},
        {ENCAPSED_STRING_CHARS_HEREDOC, false, true, false},
    };

    if (valid_symbols[SENTINEL_ERROR]) {
        return false;
    }

    scanner->has_leading_whitespace = false;
    lexer->mark_end(lexer);

    for (size_t k = 0; k < sizeof(string_parts) / sizeof(string_parts[0]); k++) {
        if (!valid_symbols[string_parts[k].symbol]) {
            continue;
        }
        lexer->result_symbol = string_parts[k].symbol;
        return scan_encapsed_part_string(scanner, lexer, string_parts[k].after_variable, string_parts[k].heredoc,
                                         string_parts[k].execution_string);
    }

    if (valid_symbols[NOWDOC_STRING]) {
        lexer->result_symbol = NOWDOC_STRING;
        return scan_nowdoc_string(scanner, lexer);
    }

    if (valid_symbols[HEREDOC_END]) {
        lexer->result_symbol = HEREDOC_END;
        if (scanner->heredocs.size == 0) {
            return false;
        }

        String *expected = &array_back(&scanner->heredocs)->word;

        while (iswspace(lexer->lookahead)) {
            skip(lexer);
        }

        // The word at the cursor must equal the innermost heredoc's delimiter.
        String candidate = scan_heredoc_word(lexer);
        const bool is_end_tag = string_eq(&candidate, expected);
        array_delete(&candidate);
        if (!is_end_tag) {
            return false;
        }

        lexer->mark_end(lexer);
        // Pop the heredoc and release its delimiter word.
        array_delete(&array_pop(&scanner->heredocs).word);
        return true;
    }

    if (!scan_whitespace(lexer)) {
        return false;
    }

    if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) {
        lexer->result_symbol = EOF_TOKEN;
        return true;
    }

    if (valid_symbols[HEREDOC_START]) {
        lexer->result_symbol = HEREDOC_START;

        while (iswspace(lexer->lookahead)) {
            skip(lexer);
        }

        Heredoc opened = heredoc_new();
        opened.word = scan_heredoc_word(lexer);
        if (opened.word.size == 0) {
            array_delete(&opened.word);
            return false;
        }

        lexer->mark_end(lexer);
        // The stack takes ownership of the delimiter word.
        array_push(&scanner->heredocs, opened);
        return true;
    }

    if (!valid_symbols[AUTOMATIC_SEMICOLON]) {
        return false;
    }

    // Recognised only in front of "?>". mark_end() is not called again on this
    // path, so the token end stays where it was marked at the top of this function.
    lexer->result_symbol = AUTOMATIC_SEMICOLON;
    if (lexer->lookahead == '?') {
        advance(lexer);
        return lexer->lookahead == '>';
    }
    return false;
}
/*
 * Allocate a zero-initialised Scanner with an empty heredoc stack.
 *
 * Takes no arguments. Returns the new scanner as an opaque pointer, suitable
 * as the `payload` of the other external_scanner_* functions; release it with
 * external_scanner_destroy().
 */
static inline void *external_scanner_create(void) {
    Scanner *scanner = ts_calloc(1, sizeof *scanner);
    array_init(&scanner->heredocs);
    return scanner;
}
static inline unsigned external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = (Scanner *)payload;
return serialize(scanner, buffer);
}
static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = (Scanner *)payload;
deserialize(scanner, buffer, length);
}
/*
 * Untyped entry point for scan(): `payload` is the Scanner. Returns whether a
 * token was recognised.
 */
static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
static inline void external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
for (size_t i = 0; i < scanner->heredocs.size; i++) {
array_delete(&scanner->heredocs.contents[i].word);
}
array_delete(&scanner->heredocs);
ts_free(scanner);
}