#include "tree_sitter/parser.h"
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <wctype.h>
/**
 * Token kinds this external scanner reports through `lexer->result_symbol`.
 * The same enumerators index the `valid_symbols` array passed to the scan
 * entry point.
 * NOTE(review): the order presumably has to match the `externals` list of the
 * grammar — confirm against the grammar definition before reordering.
 */
enum TokenType {
// `%` line comment or `%{ ... %}` block comment.
COMMENT,
// `...` followed by the rest of the line.
LINE_CONTINUATION,
// Name of a command-syntax statement, or of a `!` shell escape.
COMMAND_NAME,
// One argument of a command-syntax statement or shell escape.
COMMAND_ARGUMENT,
// Opening `'` of a string.
SINGLE_QUOTE_STRING_START,
// Closing `'` of a string (also emitted zero-width at end of line / EOF).
SINGLE_QUOTE_STRING_END,
// Opening `"` of a string.
DOUBLE_QUOTE_STRING_START,
// Closing `"` of a string (also emitted zero-width at end of line / EOF).
DOUBLE_QUOTE_STRING_END,
// `%%` or a `%`-introduced conversion such as `%d` inside a string.
FORMATTING_SEQUENCE,
// Backslash escape inside a string (`\n`, `\x41`, `\101`, ...).
ESCAPE_SEQUENCE,
// Plain text inside a string.
STRING_CONTENT,
// Separator between entries of a row: a `,` or a zero-width token.
ENTRY_DELIMITER,
// The `[` that opens a `[a, b] = ...` output list.
MULTIOUTPUT_VARIABLE_START,
// Never emitted or tested in this file.
// NOTE(review): presumably only valid during error recovery — confirm.
ERROR_SENTINEL,
};
/**
 * Scanner state carried between tokens. It is written out by the serialize
 * function and restored by the deserialize function as four bytes, one per field.
 */
typedef struct
{
// True while the arguments of a command-syntax statement are being scanned.
bool is_inside_command;
// Set when a command argument is immediately followed by `...`.
// NOTE(review): nothing in this file sets it back to false — confirm that is intended.
bool line_continuation;
// True while the arguments of a `!` shell escape are being scanned.
bool is_shell_scape;
// Quote character of the string currently open (`'` or `"`), 0 when outside a string.
char string_delimiter;
} Scanner;
/**
 * Reserved words. An identifier equal to one of these is never reported as a
 * COMMAND_NAME by scan_command.
 */
static const char* const keywords[] = {
"arguments", "break", "case", "catch", "classdef", "continue", "else",
"elseif", "end", "enumeration", "events", "false", "for", "function",
"global", "if", "methods", "otherwise", "parfor", "persistent", "properties",
"return", "spmd", "switch", "true", "try", "while",
};
/** Consumes the lookahead character, passing `false` as the lexer's skip flag. */
static inline void advance(TSLexer* lexer)
{
    const bool skip_flag = false;
    lexer->advance(lexer, skip_flag);
}
/** Consumes the lookahead character, passing `true` as the lexer's skip flag. */
static inline void skip(TSLexer* lexer)
{
    const bool skip_flag = true;
    lexer->advance(lexer, skip_flag);
}
/**
 * Consumes the lookahead character only if it equals `chr`.
 *
 * @return true when the character matched and was consumed, false otherwise
 *         (the lexer is left untouched in that case).
 */
static inline bool consume_char(char chr, TSLexer* lexer)
{
    const bool matches = lexer->lookahead == chr;
    if (matches) {
        advance(lexer);
    }
    return matches;
}
/**
 * Tells whether `chr` ends a statement: a line feed, a carriage return,
 * a comma or a semicolon.
 */
static inline bool is_eol(const uint32_t chr)
{
    switch (chr) {
        case '\n':
        case '\r':
        case ',':
        case ';':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether `chr` is whitespace other than a line break: anything
 * `iswspace` accepts except line feed and carriage return.
 */
static inline bool iswspace_matlab(const uint32_t chr)
{
    if (chr == '\n' || chr == '\r') {
        return false;
    }
    return iswspace(chr) != 0;
}
/**
 * Tells whether `chr` may appear in an identifier.
 *
 * @param chr    code point to classify; anything at or above 0x80 is rejected.
 * @param start  true when `chr` would be the first character, in which case
 *               digits are not accepted.
 * @return true for ASCII letters, `_`, and (when not at the start) digits.
 */
static inline bool is_identifier(const uint32_t chr, const bool start)
{
    // Only ASCII is accepted; this also keeps the <ctype.h> calls in range.
    if (chr >= 0x80) {
        return false;
    }
    if (chr == '_' || isalpha(chr)) {
        return true;
    }
    if (start) {
        return false;
    }
    return isdigit(chr) != 0;
}
/**
 * Consumes an identifier from the lexer and copies it into `buffer` as a
 * NUL-terminated string.
 *
 * @param lexer   lexer positioned at the candidate identifier.
 * @param buffer  destination of at least 256 bytes.
 *
 * On return `buffer` holds the identifier, or the empty string when the
 * lookahead does not start an identifier or the identifier is longer than
 * 255 characters. In the too-long case the lexer is left in the middle of the
 * identifier. The terminator is always written here, so the result no longer
 * depends on the caller having zeroed the buffer, and a maximum-length
 * identifier can no longer fill the buffer without a terminator.
 */
static inline void consume_identifier(TSLexer* lexer, char* buffer)
{
    // One byte of the 256-byte buffer is reserved for the terminator.
    enum { MAX_IDENTIFIER_LENGTH = 255 };

    if (!is_identifier(lexer->lookahead, true)) {
        buffer[0] = 0;
        return;
    }

    size_t length = 0;
    buffer[length++] = (char) lexer->lookahead;
    advance(lexer);

    while (is_identifier(lexer->lookahead, false)) {
        if (length == MAX_IDENTIFIER_LENGTH) {
            // Too long to store: report "no identifier" to the caller.
            buffer[0] = 0;
            return;
        }
        buffer[length++] = (char) lexer->lookahead;
        advance(lexer);
    }

    buffer[length] = 0;
}
/**
 * Skips every whitespace character (line breaks included) at the current
 * position, stopping at end of input.
 *
 * @return a bit set: bit 0 (value 1) is set when at least one character was
 *         skipped, bit 1 (value 2) when at least one of them was a line feed
 *         or carriage return. 0 means nothing was skipped.
 */
static inline int skip_whitespaces(TSLexer* lexer)
{
    enum { SKIPPED_ANY = 1, SKIPPED_LINE_BREAK = 2 };

    int flags = 0;
    for (;;) {
        if (lexer->eof(lexer) || !iswspace(lexer->lookahead)) {
            break;
        }
        const bool line_break = lexer->lookahead == '\n' || lexer->lookahead == '\r';
        skip(lexer);
        flags |= line_break ? (SKIPPED_ANY | SKIPPED_LINE_BREAK) : SKIPPED_ANY;
    }
    return flags;
}
/**
 * Consumes every whitespace character (line breaks included) at the current
 * position with advance(), i.e. without the skip flag.
 */
static inline void consume_whitespaces(TSLexer* lexer)
{
    for (;;) {
        if (!iswspace(lexer->lookahead)) {
            return;
        }
        advance(lexer);
    }
}
void* tree_sitter_matlab_external_scanner_create()
{
Scanner* scanner = calloc(1, sizeof(Scanner));
return scanner;
}
/**
 * Releases a scanner. A NULL payload is accepted and ignored.
 */
void tree_sitter_matlab_external_scanner_destroy(void* payload)
{
    // free(NULL) is defined to do nothing, so no guard is required.
    free(payload);
}
/**
 * Writes the scanner state into `buffer`, one byte per field, in the order
 * is_inside_command, line_continuation, is_shell_scape, string_delimiter.
 *
 * @return the number of bytes written (always 4).
 */
unsigned tree_sitter_matlab_external_scanner_serialize(void* payload, char* buffer)
{
    const Scanner* state = (const Scanner*) payload;
    unsigned written = 0;

    buffer[written++] = (char) state->is_inside_command;
    buffer[written++] = (char) state->line_continuation;
    buffer[written++] = (char) state->is_shell_scape;
    buffer[written++] = state->string_delimiter;

    return written;
}
/**
 * Restores the scanner state from bytes produced by the serialize function.
 *
 * @param payload  the Scanner to update.
 * @param buffer   serialized state; may be NULL when `length` is 0.
 * @param length   number of bytes in `buffer`.
 *
 * The state is always reset to its initial (all-zero) value first. Tree-sitter
 * calls this function with a length of 0 to request the initial state; without
 * the reset the flags of a previous parse would stay in effect. Only a buffer
 * of exactly 4 bytes is decoded; any other length leaves the initial state.
 */
void tree_sitter_matlab_external_scanner_deserialize(
    void* payload,
    const char* buffer,
    unsigned length)
{
    Scanner* scanner = (Scanner*) payload;

    scanner->is_inside_command = false;
    scanner->line_continuation = false;
    scanner->is_shell_scape = false;
    scanner->string_delimiter = 0;

    if (length == 4 && buffer != NULL) {
        scanner->is_inside_command = buffer[0] != 0;
        scanner->line_continuation = buffer[1] != 0;
        scanner->is_shell_scape = buffer[2] != 0;
        scanner->string_delimiter = buffer[3];
    }
}
/**
 * Consumes characters up to, but not including, the next line feed or
 * carriage return, or up to the end of input.
 */
static inline void consume_comment_line(TSLexer* lexer)
{
    for (;;) {
        const bool at_line_break = lexer->lookahead == '\n' || lexer->lookahead == '\r';
        if (at_line_break || lexer->eof(lexer)) {
            return;
        }
        advance(lexer);
    }
}
/**
 * Scans a comment or a line continuation starting at the lookahead, which the
 * caller has found to be `%` or `.`.
 *
 * @param lexer            lexer positioned on the `%` or `.`.
 * @param entry_delimiter  true when an ENTRY_DELIMITER token is acceptable at
 *                         this position.
 * @return true when a token was recognised; `lexer->result_symbol` is then
 *         COMMENT, LINE_CONTINUATION or ENTRY_DELIMITER.
 *
 * Recognised forms:
 *  - `%{ ... %}` block comment            -> COMMENT
 *  - `% ...` up to the end of the line    -> COMMENT; directly following
 *    comment lines are folded into the same token by the recursive call
 *  - `...` plus the rest of the line and the line break(s) -> LINE_CONTINUATION
 *  - when `entry_delimiter` is set and the text is neither of the above (for
 *    example `.5` after a blank inside a matrix), a zero-width ENTRY_DELIMITER
 *    is reported if a decimal digit follows the dot(s) consumed so far.
 */
static bool scan_comment(TSLexer* lexer, bool entry_delimiter)
{
    // Start with a zero-width token; the end is moved once something is recognised.
    lexer->mark_end(lexer);

    const bool percent = lexer->lookahead == '%';
    const bool block = percent && consume_char('%', lexer) && consume_char('{', lexer);
    // A line that starts with `%` is a comment even if dots follow the percent
    // sign, so `...` is only looked for when no `%` was seen.
    const bool line_continuation = !percent && lexer->lookahead == '.' && consume_char('.', lexer)
        && consume_char('.', lexer) && consume_char('.', lexer);

    if (entry_delimiter && !percent && !line_continuation) {
        lexer->result_symbol = ENTRY_DELIMITER;
        // The lookahead is an arbitrary code point, which <ctype.h> functions
        // must not be given; compare against the digit range directly.
        return lexer->lookahead >= '0' && lexer->lookahead <= '9';
    }

    if (block) {
        // Look for a line whose first non-blank characters are `%}`.
        while (!lexer->eof(lexer)) {
            consume_comment_line(lexer);
            advance(lexer);
            consume_whitespaces(lexer);
            if (consume_char('%', lexer) && consume_char('}', lexer)) {
                lexer->result_symbol = COMMENT;
                lexer->mark_end(lexer);
                return true;
            }
        }
        // Unterminated block comment.
        return false;
    }

    if (percent || line_continuation) {
        consume_comment_line(lexer);
        if (line_continuation) {
            // The line break belongs to the continuation token.
            advance(lexer);
        }
        lexer->mark_end(lexer);

        if (!line_continuation) {
            lexer->result_symbol = COMMENT;
            // Step over the line break without including it in the token.
            advance(lexer);
        } else {
            while (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
                advance(lexer);
            }
            lexer->mark_end(lexer);
            lexer->result_symbol = LINE_CONTINUATION;
        }

        // Peek past the indentation of the next line: if it is another
        // comment, extend the token over it.
        while (!lexer->eof(lexer) && (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
            advance(lexer);
        }
        if (lexer->lookahead == '%') {
            return scan_comment(lexer, false);
        }

        return true;
    }

    return false;
}
/**
 * Tries to recognise the name of a command-syntax statement (`hold on`,
 * `format long`, ...) or of a shell escape (`!ls -l`).
 *
 * @param scanner  scanner state; `is_inside_command` and `is_shell_scape` are
 *                 set when arguments are expected to follow the name.
 * @param lexer    lexer positioned at the candidate name.
 * @return true when a COMMAND_NAME token was recognised, false when the text
 *         is not a command (keyword, assignment, call, binary expression, ...).
 *
 * For the non-shell form the decision is made by looking at what follows the
 * identifier and the blank after it; see the comments on each branch.
 */
static bool scan_command(Scanner* scanner, TSLexer* lexer)
{
    // Shell escape: the name runs from after `!` to the first blank or line end.
    if (lexer->lookahead == '!') {
        advance(lexer);
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        // A carriage return ends the line just like a line feed does, so a
        // CRLF line ending is not swallowed into the command name.
        while (lexer->lookahead != ' ' && lexer->lookahead != '\n' && lexer->lookahead != '\r'
               && !lexer->eof(lexer)) {
            advance(lexer);
        }
        lexer->result_symbol = COMMAND_NAME;
        lexer->mark_end(lexer);
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        // Arguments follow unless the line ends here.
        scanner->is_inside_command = lexer->lookahead != '\n' && lexer->lookahead != '\r';
        scanner->is_shell_scape = scanner->is_inside_command;
        return true;
    }

    if (!is_identifier(lexer->lookahead, true)) {
        return false;
    }

    char buffer[256] = {0};
    consume_identifier(lexer, buffer);
    if (buffer[0] != 0) {
        // Keywords are never command names.
        const size_t keyword_count = sizeof(keywords) / sizeof(keywords[0]);
        for (size_t i = 0; i < keyword_count; i++) {
            if (strcmp(keywords[i], buffer) == 0) {
                return false;
            }
        }
    }

    // A bare identifier on its own is a command without arguments.
    if (is_eol(lexer->lookahead)) {
        lexer->result_symbol = COMMAND_NAME;
        lexer->mark_end(lexer);
        return true;
    }

    // Command syntax requires a blank between the name and its arguments.
    if (lexer->lookahead != ' ') {
        return false;
    }

    lexer->result_symbol = COMMAND_NAME;
    lexer->mark_end(lexer);
    consume_whitespaces(lexer);

    if (is_eol(lexer->lookahead)) {
        return true;
    }

    // `name = ...`, `name (...)` and `name )` are ordinary expressions.
    if (lexer->lookahead == '=' || lexer->lookahead == '(' || lexer->lookahead == ')') {
        return false;
    }

    if (lexer->lookahead == '\'') {
        scanner->is_inside_command = true;
        return true;
    }

    if (is_identifier(lexer->lookahead, false)) {
        scanner->is_inside_command = true;
        return true;
    }

    // Code points from 0xC0 upwards start an argument; this also keeps the
    // ispunct() call below within the range it is defined for.
    if (lexer->lookahead >= 0xC0) {
        scanner->is_inside_command = true;
        return true;
    }

    if (ispunct(lexer->lookahead)) {
        const uint32_t first = lexer->lookahead;
        advance(lexer);
        const uint32_t second = lexer->lookahead;

        // A single punctuation character followed by the end of the statement.
        if (is_eol(second)) {
            scanner->is_inside_command = true;
            return true;
        }

        if (iswspace_matlab(second)) {
            // `name <op> ...` with blanks on both sides of a one-character
            // operator is a binary expression, unless nothing follows the operator.
            const char single_operators[] = {
                '!',
                '&',
                '*',
                '+',
                '-',
                '/',
                '<',
                '>',
                '@',
                '\\',
                '^',
                '|',
            };
            bool is_invalid = false;
            for (size_t i = 0; i < sizeof(single_operators); i++) {
                if (first == single_operators[i]) {
                    is_invalid = true;
                    break;
                }
            }
            if (is_invalid) {
                advance(lexer);
                while (iswspace_matlab(lexer->lookahead)) {
                    advance(lexer);
                }
                scanner->is_inside_command = is_eol(lexer->lookahead);
                return scanner->is_inside_command;
            }
            scanner->is_inside_command = true;
            return true;
        }

        // Two non-blank characters: step over the second one and look at
        // what comes after the pair.
        advance(lexer);
        if (lexer->lookahead != ' ') {
            scanner->is_inside_command = true;
            return true;
        }

        // `name <op2> ...` with a two-character operator followed by a blank
        // is a binary expression.
        const char double_operators[][2] = {
            {'&', '&'},
            {'|', '|'},
            {'=', '='},
            {'~', '='},
            {'<', '='},
            {'>', '='},
            {'.', '+'},
            {'.', '-'},
            {'.', '*'},
            {'.', '/'},
            {'.', '\\'},
            {'.', '^'},
        };
        const size_t pair_count = sizeof(double_operators) / sizeof(double_operators[0]);
        for (size_t i = 0; i < pair_count; i++) {
            if (double_operators[i][0] == first && double_operators[i][1] == second) {
                return false;
            }
        }

        scanner->is_inside_command = true;
        return true;
    }

    return false;
}
/**
 * Scans one argument of a command-syntax statement or of a shell escape.
 *
 * @param scanner  scanner state; `is_inside_command` (and `is_shell_scape`)
 *                 are cleared when the end of the command is reached, and
 *                 `line_continuation` is set when the argument is directly
 *                 followed by `...`.
 * @param lexer    lexer positioned on the first character of the argument
 *                 (the caller has already skipped whitespace).
 * @return true when a token was produced: normally COMMAND_ARGUMENT, but a
 *         leading `%` yields whatever scan_comment recognises and a leading
 *         `...` yields LINE_CONTINUATION.
 */
static bool scan_command_argument(Scanner* scanner, TSLexer* lexer)
{
    // Shell escape: arguments are separated by blanks and end with the line.
    if (scanner->is_shell_scape) {
        if (lexer->eof(lexer)) {
            return false;
        }
        // A carriage return ends the line just like a line feed does, so a
        // CRLF line ending is not swallowed into the argument.
        while (lexer->lookahead != ' ' && lexer->lookahead != '\n' && lexer->lookahead != '\r'
               && !lexer->eof(lexer)) {
            advance(lexer);
        }
        lexer->result_symbol = COMMAND_ARGUMENT;
        lexer->mark_end(lexer);
        while (iswspace_matlab(lexer->lookahead)) {
            advance(lexer);
        }
        if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
            scanner->is_inside_command = false;
            scanner->is_shell_scape = false;
        }
        return true;
    }

    if (lexer->eof(lexer)) {
        return false;
    }

    bool quote = false;    // inside a '...' section
    int32_t parens = 0;    // nesting depth of (), [] and {}
    bool consumed = false; // at least one character belongs to the argument

    while (!lexer->eof(lexer)) {
        // Characters are interpreted (rather than taken literally) when not
        // quoted, or whenever the bracket depth is non-zero.
        const bool interpreted = !quote || parens != 0;

        const bool cond1 = lexer->lookahead == '\n' || lexer->lookahead == '\r';
        const bool cond2 = !quote && parens == 0
            && (is_eol(lexer->lookahead) || iswspace_matlab(lexer->lookahead));
        const bool cond3 = !quote && parens != 0 && lexer->lookahead == ';';

        // End of the argument: line break, an unquoted separator outside
        // brackets, or an unquoted `;` inside brackets.
        if (cond1 || cond2 || cond3) {
            lexer->result_symbol = COMMAND_ARGUMENT;
            lexer->mark_end(lexer);
            while (iswspace_matlab(lexer->lookahead)) {
                advance(lexer);
            }
            if (is_eol(lexer->lookahead)) {
                scanner->is_inside_command = false;
            }
            return true;
        }

        // A comment ends the command. If part of an argument was already
        // read it is returned first; otherwise the comment itself is scanned.
        if (interpreted && lexer->lookahead == '%') {
            scanner->is_inside_command = false;
            if (consumed) {
                lexer->result_symbol = COMMAND_ARGUMENT;
                lexer->mark_end(lexer);
                return true;
            }
            return scan_comment(lexer, false);
        }

        // A dot may be part of the argument (`file.m`, `..`) or start `...`.
        if (interpreted && lexer->lookahead == '.') {
            // Tentatively end the argument before the dot.
            lexer->result_symbol = COMMAND_ARGUMENT;
            lexer->mark_end(lexer);
            advance(lexer);
            if (lexer->lookahead == '.') {
                advance(lexer);
                if (lexer->lookahead == '.') {
                    if (consumed) {
                        // Return the argument read so far; the continuation
                        // is picked up by the next scan.
                        scanner->line_continuation = true;
                    } else {
                        consume_comment_line(lexer);
                        lexer->result_symbol = LINE_CONTINUATION;
                        lexer->mark_end(lexer);
                    }
                    return true;
                }
                consumed = true;
                continue;
            }
            consumed = true;
            continue;
        }

        if (interpreted
            && (lexer->lookahead == '(' || lexer->lookahead == '[' || lexer->lookahead == '{')) {
            parens++;
        }

        if (interpreted
            && (lexer->lookahead == ')' || lexer->lookahead == ']' || lexer->lookahead == '}')) {
            parens--;
        }

        if (lexer->lookahead == '\'') {
            quote = !quote;
        }

        advance(lexer);
        consumed = true;
    }

    // The loop above only ends at end of input: the argument runs up to it.
    if (lexer->eof(lexer)) {
        lexer->result_symbol = COMMAND_ARGUMENT;
        lexer->mark_end(lexer);
        return true;
    }

    return false;
}
/**
 * Scans the opening quote of a string.
 *
 * @param scanner  scanner state; `string_delimiter` is set to the quote found.
 * @param lexer    lexer positioned on the candidate quote.
 * @return true with DOUBLE_QUOTE_STRING_START or SINGLE_QUOTE_STRING_START
 *         when the lookahead is `"` or `'`, false (state untouched) otherwise.
 */
static bool scan_string_open(Scanner* scanner, TSLexer* lexer)
{
    const bool is_double = lexer->lookahead == '"';
    const bool is_single = lexer->lookahead == '\'';
    if (!is_double && !is_single) {
        return false;
    }

    scanner->string_delimiter = is_double ? '"' : '\'';
    advance(lexer);
    lexer->result_symbol = is_double ? DOUBLE_QUOTE_STRING_START : SINGLE_QUOTE_STRING_START;
    lexer->mark_end(lexer);
    return true;
}
/**
 * Scans the next token inside an open string (scanner->string_delimiter is
 * the quote that opened it). Depending on the lookahead this produces the
 * closing quote (*_STRING_END), a FORMATTING_SEQUENCE (`%...`), an
 * ESCAPE_SEQUENCE (`\...`) or a run of STRING_CONTENT.
 *
 * scanner->string_delimiter is reset to 0 when the string ends.
 * Returns true when a token was produced.
 */
static bool scan_string_close(Scanner* scanner, TSLexer* lexer)
{
// Lookahead is the delimiter: either a doubled (escaped) quote or the end of the string.
if (lexer->lookahead == scanner->string_delimiter) {
advance(lexer);
if (lexer->lookahead == scanner->string_delimiter) {
// Doubled delimiter: it is part of the content; keep scanning content.
advance(lexer);
lexer->result_symbol = STRING_CONTENT;
goto content;
}
lexer->result_symbol = scanner->string_delimiter == '"' ? DOUBLE_QUOTE_STRING_END
: SINGLE_QUOTE_STRING_END;
lexer->mark_end(lexer);
scanner->string_delimiter = 0;
return true;
}
// Line break or end of input before the closing quote: nothing is consumed,
// so the end token is zero-width, and the string state is cleared.
if (lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer)) {
lexer->result_symbol = scanner->string_delimiter == '"' ? DOUBLE_QUOTE_STRING_END
: SINGLE_QUOTE_STRING_END;
lexer->mark_end(lexer);
scanner->string_delimiter = 0;
return true;
}
// Formatting sequence introduced by `%`.
if (lexer->lookahead == '%') {
advance(lexer);
// `%%` is a formatting sequence on its own.
if (lexer->lookahead == '%') {
advance(lexer);
lexer->result_symbol = FORMATTING_SEQUENCE;
lexer->mark_end(lexer);
return true;
}
// Characters accepted between `%` and the conversion character, and the
// conversion characters that end the sequence.
const char* valid_tokens = "1234567890.-+ #btcdeEfgGosuxX";
const char* end_tokens = "cdeEfgGosuxX";
while (!lexer->eof(lexer) && lexer->lookahead != '\n' && lexer->lookahead != '\r') {
bool is_valid = false;
for (int i = 0; i < strlen(valid_tokens); i++) {
if (valid_tokens[i] == lexer->lookahead) {
is_valid = true;
break;
}
}
// Not a formatting sequence after all: what was consumed is plain content.
if (!is_valid) {
lexer->result_symbol = STRING_CONTENT;
goto content;
}
// 12 is the length of end_tokens.
for (int i = 0; i < 12; i++) {
if (end_tokens[i] == lexer->lookahead) {
advance(lexer);
lexer->result_symbol = FORMATTING_SEQUENCE;
lexer->mark_end(lexer);
return true;
}
}
advance(lexer);
}
// The line or the input ended before a conversion character was found:
// no token is produced and the string state is cleared.
scanner->string_delimiter = 0;
return false;
}
// Escape sequence introduced by a backslash.
if (lexer->lookahead == '\\') {
advance(lexer);
// `\x` followed by any number of hexadecimal digits.
if (lexer->lookahead == 'x') {
advance(lexer);
while (!lexer->eof(lexer)) {
const char* hexa_chars = "1234567890abcdefABCDEF";
bool is_valid = false;
// 22 is the length of hexa_chars.
for (int i = 0; i < 22; i++) {
if (hexa_chars[i] == lexer->lookahead) {
is_valid = true;
break;
}
}
if (!is_valid) {
lexer->result_symbol = ESCAPE_SEQUENCE;
lexer->mark_end(lexer);
return true;
}
advance(lexer);
}
// Reached only when the input ends inside the hex digits; control
// falls through to the checks below.
}
// Octal escape: a run of digits 0-7.
if (lexer->lookahead >= '0' && lexer->lookahead <= '7') {
while (lexer->lookahead >= '0' && lexer->lookahead <= '7' && !lexer->eof(lexer)) {
advance(lexer);
}
lexer->result_symbol = ESCAPE_SEQUENCE;
lexer->mark_end(lexer);
return true;
}
// Single-character escapes; 8 is the length of the list.
const char* escapes = "abfnrtv\\";
bool is_valid = false;
for (int i = 0; i < 8; i++) {
if (escapes[i] == lexer->lookahead) {
is_valid = true;
break;
}
}
if (is_valid) {
advance(lexer);
lexer->result_symbol = ESCAPE_SEQUENCE;
lexer->mark_end(lexer);
return true;
}
// Unknown escape: the backslash already consumed becomes plain content.
}
// Plain content: runs until the closing delimiter, a `%` or `\` that may start
// a formatting/escape sequence, a line break or the end of input.
content:
while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && !lexer->eof(lexer)) {
if (lexer->lookahead == scanner->string_delimiter) {
// End the content before the delimiter, then check whether it is doubled.
lexer->result_symbol = STRING_CONTENT;
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead != scanner->string_delimiter) {
return true;
}
// Doubled delimiter: both characters stay in the content.
advance(lexer);
continue;
}
if (lexer->lookahead == '%' || lexer->lookahead == '\\') {
// End the content before the `%` or `\` so the next scan can treat it as a
// sequence, unless it is followed by the delimiter or a blank, in which
// case it is kept as content and scanning restarts at the label.
lexer->result_symbol = STRING_CONTENT;
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == scanner->string_delimiter || iswspace_matlab(lexer->lookahead)) {
goto content;
}
return true;
}
advance(lexer);
}
// The loop above only exits on a line break or end of input, so this
// condition always holds here and the two statements after it are unreachable.
if (lexer->lookahead == '\n' || lexer->lookahead == '\r' || lexer->eof(lexer)) {
lexer->result_symbol = STRING_CONTENT;
lexer->mark_end(lexer);
return true;
}
scanner->string_delimiter = 0;
return false;
}
/**
 * Decides whether the `[` at the lookahead opens the output list of a
 * multiple-output assignment such as `[a, b] = f()`.
 *
 * The token itself is just the `[`; everything after it is only looked at.
 * The function scans to the matching-looking `]` (a line break before it is
 * only tolerated as part of a `...` continuation), skips blanks and
 * continuations after it, and then requires a single `=` (not `==`).
 *
 * @return true with MULTIOUTPUT_VARIABLE_START when the pattern is found.
 */
static inline bool scan_multioutput_var_start(TSLexer* lexer)
{
    advance(lexer);
    lexer->result_symbol = MULTIOUTPUT_VARIABLE_START;
    lexer->mark_end(lexer);

    // Find the closing bracket on the same (logical) line.
    while (!lexer->eof(lexer)) {
        const bool continuation = consume_char('.', lexer) && consume_char('.', lexer)
            && consume_char('.', lexer);
        if (continuation) {
            consume_comment_line(lexer);
            advance(lexer);
        }

        const bool stop = lexer->lookahead == ']' || lexer->lookahead == '\n'
            || lexer->lookahead == '\r';
        if (stop) {
            break;
        }
        advance(lexer);
    }

    if (lexer->lookahead != ']') {
        return false;
    }
    advance(lexer);

    // Skip blanks and line continuations between `]` and `=`.
    while (!lexer->eof(lexer)) {
        const bool continuation = consume_char('.', lexer) && consume_char('.', lexer)
            && consume_char('.', lexer);
        if (continuation) {
            consume_comment_line(lexer);
            advance(lexer);
            continue;
        }
        if (!iswspace_matlab(lexer->lookahead)) {
            break;
        }
        advance(lexer);
    }

    if (lexer->lookahead != '=') {
        return false;
    }
    advance(lexer);
    // `==` is a comparison, not an assignment.
    return lexer->lookahead != '=';
}
/**
 * Decides whether an ENTRY_DELIMITER separates the previous entry from what
 * follows. The token is either a `,` or zero-width (blank-separated entries).
 *
 * @param lexer    lexer positioned after any skipped whitespace.
 * @param skipped  result of skip_whitespaces(): non-zero when whitespace was
 *                 skipped, bit 1 (value 2) set when it contained a line break.
 * @return true when a delimiter token is produced.
 */
static bool scan_entry_delimiter(TSLexer* lexer, int skipped)
{
    // Unless a comma is consumed below, the token is zero-width.
    lexer->mark_end(lexer);
    lexer->result_symbol = ENTRY_DELIMITER;

    // A line break was skipped: no entry delimiter here.
    if (skipped & 2) {
        return false;
    }

    if (lexer->lookahead == ',') {
        advance(lexer);
        lexer->mark_end(lexer);
        lexer->result_symbol = ENTRY_DELIMITER;
        return true;
    }

    if (lexer->lookahead == '.') {
        advance(lexer);
        advance(lexer);
        // NOTE(review): this tests the character two positions after the dot;
        // confirm that is the intended lookahead.
        // The lookahead is an arbitrary code point, which <ctype.h> functions
        // must not be given; compare against the digit range directly.
        return lexer->lookahead >= '0' && lexer->lookahead <= '9';
    }

    // These start a new entry only when separated by whitespace; otherwise
    // they index, call or transpose the previous entry.
    if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'') {
        return skipped != 0;
    }

    if (lexer->lookahead == '[') {
        return true;
    }

    // Closing brackets, binary operators and separators never follow a delimiter.
    const char no_end[] = {']', '}', '&', '|', '=', '<', '>', '*', '/', '\\', '^', ';', ':'};
    for (size_t i = 0; i < sizeof(no_end); i++) {
        if (no_end[i] == lexer->lookahead) {
            return false;
        }
    }

    // `~=` is a comparison with the previous entry; a lone `~` starts a new one.
    if (lexer->lookahead == '~') {
        advance(lexer);
        return lexer->lookahead != '=';
    }

    // `+`/`-` followed by a blank is a binary operator; otherwise it is a sign
    // on a new entry, provided whitespace preceded it.
    const char maybe_end[] = {'+', '-'};
    for (size_t i = 0; i < sizeof(maybe_end); i++) {
        if (maybe_end[i] == lexer->lookahead) {
            advance(lexer);
            if (lexer->lookahead == ' ') {
                return false;
            }
            return skipped != 0;
        }
    }

    return skipped != 0;
}
/**
 * Entry point of the external scanner: picks the sub-scanner to run from the
 * scanner state, the set of acceptable tokens and the lookahead character.
 *
 * @param payload        the Scanner created by the create function.
 * @param lexer          lexer to read from; on success `result_symbol` is set.
 * @param valid_symbols  indexed by TokenType; true for tokens acceptable here.
 * @return true when a token was recognised.
 */
bool tree_sitter_matlab_external_scanner_scan(void* payload, TSLexer* lexer, const bool* valid_symbols)
{
    Scanner* state = (Scanner*) payload;

    // Inside a string: whitespace is significant and only string tokens apply.
    if (state->string_delimiter != 0) {
        const bool string_token_expected = valid_symbols[DOUBLE_QUOTE_STRING_END]
            || valid_symbols[SINGLE_QUOTE_STRING_END] || valid_symbols[FORMATTING_SEQUENCE];
        return string_token_expected && scan_string_close(state, lexer);
    }

    const int skipped = skip_whitespaces(lexer);

    // Comments and line continuations. While command arguments are being read
    // they are handled by the argument scanner, unless a continuation is pending.
    const bool comments_allowed = state->line_continuation || !state->is_inside_command;
    const bool at_comment_start = lexer->lookahead == '%' || lexer->lookahead == '.';
    if (comments_allowed && valid_symbols[COMMENT] && at_comment_start) {
        return scan_comment(lexer, valid_symbols[ENTRY_DELIMITER]);
    }

    if (state->is_inside_command) {
        return valid_symbols[COMMAND_ARGUMENT] && scan_command_argument(state, lexer);
    }

    const bool opens_single = valid_symbols[SINGLE_QUOTE_STRING_START] && lexer->lookahead == '\'';
    const bool opens_double = valid_symbols[DOUBLE_QUOTE_STRING_START] && lexer->lookahead == '"';
    if (opens_single || opens_double) {
        return scan_string_open(state, lexer);
    }

    if (!state->line_continuation) {
        if (valid_symbols[MULTIOUTPUT_VARIABLE_START] && lexer->lookahead == '[') {
            return scan_multioutput_var_start(lexer);
        }
        if (valid_symbols[ENTRY_DELIMITER]) {
            return scan_entry_delimiter(lexer, skipped);
        }
    }

    if (!valid_symbols[COMMAND_NAME]) {
        return false;
    }

    // Start from a clean command state; scan_command sets the flags again
    // when arguments follow.
    state->is_inside_command = false;
    state->is_shell_scape = false;
    return scan_command(state, lexer);
}