#include <liblangutil/Common.h>
#include <liblangutil/Exceptions.h>
#include <liblangutil/Scanner.h>
#include <boost/algorithm/string/classification.hpp>
#include <optional>
#include <string_view>
#include <tuple>
#include <array>
using namespace std;
namespace solidity::langutil
{
/// Translates a scanner error code into its human-readable message.
/// An error code without an entry below trips a solAssert.
string to_string(ScannerError _errorCode)
{
	char const* message = "";
	switch (_errorCode)
	{
	case ScannerError::NoError:
		message = "No error.";
		break;
	case ScannerError::IllegalToken:
		message = "Invalid token.";
		break;
	case ScannerError::IllegalHexString:
		message = "Expected even number of hex-nibbles.";
		break;
	case ScannerError::IllegalHexDigit:
		message = "Hexadecimal digit missing or invalid.";
		break;
	case ScannerError::IllegalCommentTerminator:
		message = "Expected multi-line comment-terminator.";
		break;
	case ScannerError::IllegalEscapeSequence:
		message = "Invalid escape sequence.";
		break;
	case ScannerError::IllegalCharacterInString:
		message = "Invalid character in string.";
		break;
	case ScannerError::IllegalStringEndQuote:
		message = "Expected string end-quote.";
		break;
	case ScannerError::IllegalNumberSeparator:
		message = "Invalid use of number separator '_'.";
		break;
	case ScannerError::IllegalExponent:
		message = "Invalid exponent.";
		break;
	case ScannerError::IllegalNumberEnd:
		message = "Identifier-start is not allowed at end of a number.";
		break;
	case ScannerError::OctalNotAllowed:
		message = "Octal numbers not allowed.";
		break;
	case ScannerError::DirectionalOverrideUnderflow:
		message = "Unicode direction override underflow in comment or string literal.";
		break;
	case ScannerError::DirectionalOverrideMismatch:
		message = "Mismatching directional override markers in comment or string literal.";
		break;
	default:
		solAssert(false, "Unhandled case in to_string(ScannerError)");
		break;
	}
	return message;
}
/// Writes the message for @a _errorCode (as produced by to_string) to @a os
/// and returns the stream so that insertions can be chained.
ostream& operator<<(ostream& os, ScannerError _errorCode)
{
	os << to_string(_errorCode);
	return os;
}
/// Kind of literal a LiteralScope records.
enum LiteralType
{
	/// Recorded into the token's literal buffer.
	LITERAL_TYPE_STRING,
	/// Recorded into the token's literal buffer, exactly like LITERAL_TYPE_STRING.
	LITERAL_TYPE_NUMBER,
	/// Recorded into the skipped comment's literal buffer.
	LITERAL_TYPE_COMMENT
};
/// Scoped helper for literal recording.
/// Clears the literal buffer of the entry currently being scanned (the skipped
/// comment for LITERAL_TYPE_COMMENT, the token otherwise) when constructed, and
/// clears it again on destruction unless complete() has been called in between.
class LiteralScope
{
public:
	explicit LiteralScope(Scanner* _self, enum LiteralType _type):
		m_type(_type),
		m_scanner(_self)
	{
		discardLiteral();
	}
	~LiteralScope()
	{
		if (m_complete)
			return;
		discardLiteral();
	}
	/// Marks the literal as finished so that the destructor keeps it.
	void complete() { m_complete = true; }
private:
	/// Empties the literal buffer selected by m_type in the NextNext slot.
	void discardLiteral()
	{
		if (m_type == LITERAL_TYPE_COMMENT)
			m_scanner->m_skippedComments[Scanner::NextNext].literal.clear();
		else
			m_scanner->m_tokens[Scanner::NextNext].literal.clear();
	}

	enum LiteralType m_type;
	Scanner* m_scanner;
	bool m_complete = false;
};
/// Resets the character stream, switches back to ScannerKind::Solidity and
/// rescans from the start so that all three token slots are populated.
void Scanner::reset()
{
	m_source.reset();
	m_kind = ScannerKind::Solidity;
	m_char = m_source.get();
	skipWhitespace();
	// Each call to next() shifts the slots and scans one token; three calls
	// therefore fill Current, Next and NextNext.
	for (int pass = 0; pass < 3; ++pass)
		next();
}
/// Moves the character stream to @a _offset and rescans from there:
/// one direct scanToken() followed by two next() calls.
void Scanner::setPosition(size_t _offset)
{
	m_char = m_source.setPosition(_offset);
	scanToken();
	for (int pass = 0; pass < 2; ++pass)
		next();
}
/// Scans exactly two hexadecimal digits and combines them into one byte.
/// @param o_scannedByte receives the byte; it is only written on success.
/// @returns false if one of the two characters is not a hex digit. In that case
/// rollback() is called with the number of digits consumed so far.
bool Scanner::scanHexByte(char& o_scannedByte)
{
	char value = 0;
	size_t consumed = 0;
	while (consumed < 2)
	{
		int const digit = hexValue(m_char);
		if (digit < 0)
		{
			rollback(consumed);
			return false;
		}
		value = static_cast<char>(value * 16 + digit);
		advance();
		++consumed;
	}
	o_scannedByte = value;
	return true;
}
/// Scans exactly four hexadecimal digits and returns the value they encode.
/// @returns an empty optional if one of the four characters is not a hex digit.
/// In that case rollback() is called with the number of digits consumed so far.
std::optional<unsigned> Scanner::scanUnicode()
{
	unsigned value = 0;
	size_t consumed = 0;
	while (consumed < 4)
	{
		int const digit = hexValue(m_char);
		if (digit < 0)
		{
			rollback(consumed);
			return std::nullopt;
		}
		value = value * 16 + static_cast<unsigned>(digit);
		advance();
		++consumed;
	}
	return value;
}
/// Appends @a codepoint to the current literal as a UTF-8 byte sequence.
///
/// One to three bytes are emitted for code points up to 0xFFFF, exactly as before.
/// Code points above 0xFFFF are emitted in the four-byte form instead of being
/// squeezed into the three-byte form, where the lead byte lost its upper bits.
/// No validity checking is performed here: surrogate values and values above
/// 0x10FFFF are not rejected.
void Scanner::addUnicodeAsUTF8(unsigned codepoint)
{
	if (codepoint <= 0x7f)
		addLiteralChar(char(codepoint));
	else if (codepoint <= 0x7ff)
	{
		addLiteralChar(char(0xc0u | (codepoint >> 6u)));
		addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
	}
	else if (codepoint <= 0xffff)
	{
		addLiteralChar(char(0xe0u | (codepoint >> 12u)));
		addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
		addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
	}
	else
	{
		// Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
		// The lead byte is masked so that it always keeps the 0xF0 prefix.
		addLiteralChar(char(0xf0u | ((codepoint >> 18u) & 0x07u)));
		addLiteralChar(char(0x80u | ((codepoint >> 12u) & 0x3fu)));
		addLiteralChar(char(0x80u | ((codepoint >> 6u) & 0x3fu)));
		addLiteralChar(char(0x80u | (codepoint & 0x3fu)));
	}
}
/// Rewinds the character stream to the start of the current token, or to the start
/// of the current skipped comment if that comment has a non-empty literal, and
/// scans three tokens again from there.
void Scanner::rescan()
{
	size_t const rollbackTo = m_skippedComments[Current].literal.empty() ?
		static_cast<size_t>(m_tokens[Current].location.start) :
		static_cast<size_t>(m_skippedComments[Current].location.start);
	size_t const distance = m_source.position() - rollbackTo;
	m_char = m_source.rollback(distance);
	for (int pass = 0; pass < 3; ++pass)
		next();
}
BOOST_STATIC_ASSERT(TokenTraits::count() <= 0x100);
/// Advances the scanner by one token.
/// The Next entries become Current, the NextNext entries become Next, and a fresh
/// token is scanned into NextNext.
/// @returns the token that is now current.
Token Scanner::next()
{
	// Shift the skipped comments ...
	m_skippedComments[Current] = std::move(m_skippedComments[Next]);
	m_skippedComments[Next] = std::move(m_skippedComments[NextNext]);
	// ... and the tokens by one slot each.
	m_tokens[Current] = std::move(m_tokens[Next]);
	m_tokens[Next] = std::move(m_tokens[NextNext]);

	scanToken();

	return m_tokens[Current].token;
}
/// Advances by one character and picks a token depending on the character found there.
/// @returns the result of selectToken(_then) if the new current character equals
/// @a _next, otherwise @a _else (nothing further is consumed in that case).
Token Scanner::selectToken(char _next, Token _then, Token _else)
{
	advance();
	if (m_char != _next)
		return _else;
	return selectToken(_then);
}
/// Advances past all characters for which isWhiteSpace() holds.
/// @returns true if the source position changed, i.e. at least one character was skipped.
bool Scanner::skipWhitespace()
{
	size_t const positionBefore = sourcePos();
	for (; isWhiteSpace(m_char); advance())
	{
	}
	return positionBefore != sourcePos();
}
/// Advances past whitespace but stops in front of anything isUnicodeLinebreak() reports.
/// @returns true if the source position changed, i.e. at least one character was skipped.
bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
{
	size_t const positionBefore = sourcePos();
	for (; isWhiteSpace(m_char) && !isUnicodeLinebreak(); advance())
	{
	}
	return positionBefore != sourcePos();
}
namespace
{

/// Checks that the Unicode directional formatting characters between
/// @a _startPosition and the stream's current position are balanced.
///
/// Every byte offset in that range is probed for the UTF-8 encoding of one of the
/// four override/embedding openers (depth +1) or of the pop character (depth -1).
///
/// @returns ScannerError::DirectionalOverrideUnderflow as soon as the depth becomes
/// negative. The stream is then left at the offending offset; it is NOT moved back
/// to its original position.
/// Otherwise the stream is restored to its original position and the result is
/// ScannerError::DirectionalOverrideMismatch if openers remain unclosed, else
/// ScannerError::NoError.
ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
{
	using DirectionalSequence = pair<string_view, int>;
	static constexpr array<DirectionalSequence, 5> directionalSequences{
		DirectionalSequence{"\xE2\x80\xAD", 1}, // U+202D (left-to-right override)
		DirectionalSequence{"\xE2\x80\xAE", 1}, // U+202E (right-to-left override)
		DirectionalSequence{"\xE2\x80\xAA", 1}, // U+202A (left-to-right embedding)
		DirectionalSequence{"\xE2\x80\xAB", 1}, // U+202B (right-to-left embedding)
		DirectionalSequence{"\xE2\x80\xAC", -1} // U+202C (pop directional formatting)
	};

	size_t const endPosition = _stream.position();
	_stream.setPosition(_startPosition);

	int depth = 0;
	size_t cursor = _startPosition;
	while (cursor < endPosition)
	{
		_stream.setPosition(cursor);

		for (auto const& entry: directionalSequences)
			if (_stream.prefixMatch(entry.first))
				depth += entry.second;

		if (depth < 0)
			return ScannerError::DirectionalOverrideUnderflow;

		++cursor;
	}

	_stream.setPosition(endPosition);

	if (depth > 0)
		return ScannerError::DirectionalOverrideMismatch;
	return ScannerError::NoError;
}

}
/// Skips the rest of a single-line comment.
/// Scanning stops in front of the first line break reported by isUnicodeLinebreak()
/// or when advance() reports failure. The skipped range is then checked with
/// validateBiDiMarkup().
/// @returns Token::Whitespace, or the result of setError() if the check fails.
Token Scanner::skipSingleLineComment()
{
	size_t const commentStart = m_source.position();
	while (!isUnicodeLinebreak() && advance())
	{
	}

	ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
	if (bidiError == ScannerError::NoError)
		return Token::Whitespace;
	return setError(bidiError);
}
/// @returns true if the current character is a line feed or a carriage return.
bool Scanner::atEndOfLine() const
{
	switch (m_char)
	{
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
bool Scanner::tryScanEndOfLine()
{
if (m_char == '\n')
{
advance();
return true;
}
if (m_char == '\r')
{
if (advance() && m_char == '\n')
advance();
return true;
}
return false;
}
/// Scans a "///" documentation comment into the literal of the skipped comment.
/// Directly following lines that also start with "///" (after optional whitespace)
/// are merged into the same comment, joined by '\n'.
/// A following line that starts with "////" is not a documentation comment and ends it.
/// @returns the source position that is to be used as the end of the comment.
size_t Scanner::scanSingleLineDocComment()
{
	LiteralScope literal(this, LITERAL_TYPE_COMMENT);
	size_t endPosition = m_source.position();

	skipWhitespaceExceptUnicodeLinebreak();

	while (!isSourcePastEndOfInput())
	{
		endPosition = m_source.position();
		if (tryScanEndOfLine())
		{
			// Check whether the next line continues the documentation comment.
			// If leading whitespace had to be skipped, keep the position recorded
			// before the line ending; otherwise use the position after it.
			if (!skipWhitespaceExceptUnicodeLinebreak())
				endPosition = m_source.position();

			if (!m_source.isPastEndOfInput(3) &&
				m_source.get(0) == '/' &&
				m_source.get(1) == '/' &&
				m_source.get(2) == '/')
			{
				// Everywhere else in this file, isPastEndOfInput(N) is the guard for
				// get(N), so the check above already covers offset 3. The previous
				// extra isPastEndOfInput(4) guard missed a "////" line that ends
				// exactly at the end of the input.
				if (m_source.get(3) == '/')
					break; // "////" is not a documentation comment
				m_char = m_source.advanceAndGet(3);
				if (atEndOfLine())
					continue;
				addCommentLiteralChar('\n');
			}
			else
				break; // the next line is not a documentation comment
		}
		else if (isUnicodeLinebreak())
			// Any other line break ends the comment.
			break;
		addCommentLiteralChar(m_char);
		advance();
	}
	literal.complete();
	return endPosition;
}
/// Skips a multi-line comment up to and including its closing "*/".
/// On success the skipped range is checked with validateBiDiMarkup() and the current
/// character is replaced by a space, so the comment is treated as whitespace.
/// @returns Token::Whitespace on success, otherwise the result of setError():
/// ScannerError::IllegalCommentTerminator if the input ends before "*/" is found,
/// or the error reported by validateBiDiMarkup().
Token Scanner::skipMultiLineComment()
{
	size_t const commentStart = m_source.position();
	for (;;)
	{
		if (isSourcePastEndOfInput())
			return setError(ScannerError::IllegalCommentTerminator);

		char const previous = m_char;
		advance();
		if (previous != '*' || m_char != '/')
			continue;

		ScannerError const bidiError = validateBiDiMarkup(m_source, commentStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);

		m_char = ' ';
		return Token::Whitespace;
	}
}
/// Scans the body of a multi-line documentation comment into the literal of the
/// skipped comment.
/// Leading whitespace on the first line is dropped. After each line ending, whitespace
/// and a single leading '*' are dropped, and lines are joined by '\n' once at least
/// one character has been recorded.
/// @returns Token::CommentLiteral if the closing "*/" was found, otherwise the result
/// of setError(ScannerError::IllegalCommentTerminator).
Token Scanner::scanMultiLineDocComment()
{
	LiteralScope literal(this, LITERAL_TYPE_COMMENT);
	bool terminatorSeen = false;
	bool anyCharAdded = false;

	// True if the source is positioned on "*/".
	auto const atTerminator = [this]() {
		return !m_source.isPastEndOfInput(1) && m_source.get(0) == '*' && m_source.get(1) == '/';
	};

	while (isWhiteSpace(m_char) && !atEndOfLine())
		advance();

	while (!isSourcePastEndOfInput())
	{
		if (atEndOfLine())
		{
			skipWhitespace();
			bool const starAhead = !m_source.isPastEndOfInput(1) && m_source.get(0) == '*';
			if (starAhead && m_source.get(1) == '*')
			{
				// "**": record one '*'; the second one is examined below.
				addCommentLiteralChar('*');
				advance();
			}
			else if (starAhead && m_source.get(1) != '/')
			{
				// Drop the leading '*' of a continuation line.
				m_char = m_source.advanceAndGet(1);
				if (atEndOfLine())
					continue; // nothing else on this line
				if (anyCharAdded)
					addCommentLiteralChar('\n');
			}
			else if (starAhead)
			{
				// "*/" right after the line break: the comment ends without a newline.
				m_char = m_source.advanceAndGet(2);
				terminatorSeen = true;
				break;
			}
			else if (anyCharAdded)
				addCommentLiteralChar('\n');
		}

		if (atTerminator())
		{
			m_char = m_source.advanceAndGet(2);
			terminatorSeen = true;
			break;
		}
		addCommentLiteralChar(m_char);
		anyCharAdded = true;
		advance();
	}
	literal.complete();
	if (terminatorSeen)
		return Token::CommentLiteral;
	return setError(ScannerError::IllegalCommentTerminator);
}
/// Scans everything that can start with '/': the operators "/" and "/=", plain
/// comments ("//", "/* */") and documentation comments ("///", "/** */").
/// Documentation comments are recorded in m_skippedComments[NextNext].
/// @returns Token::Div or the result of selectToken(Token::AssignDiv) for the
/// operators, Token::Whitespace for successfully handled comments, and an error
/// token otherwise.
Token Scanner::scanSlash()
{
	int const slashPosition = static_cast<int>(sourcePos());
	advance();

	if (m_char == '=')
		return selectToken(Token::AssignDiv);
	if (m_char != '/' && m_char != '*')
		return Token::Div;

	if (m_char == '/')
	{
		// "//" directly at the end of the input
		if (!advance())
			return Token::Whitespace;
		// plain "//" comment
		if (m_char != '/')
			return skipSingleLineComment();

		advance();
		// "////" is treated like a plain comment
		if (m_char == '/')
			return skipSingleLineComment();

		// "///" documentation comment
		auto& docComment = m_skippedComments[NextNext];
		docComment.location.start = slashPosition;
		docComment.location.sourceName = m_sourceName;
		docComment.token = Token::CommentLiteral;
		docComment.location.end = static_cast<int>(scanSingleLineDocComment());
		return Token::Whitespace;
	}

	// From here on the comment was opened with "/*".
	if (!advance())
		return setError(ScannerError::IllegalCommentTerminator);
	// plain "/*" comment
	if (m_char != '*')
		return skipMultiLineComment();

	advance();
	// "/**/"
	if (m_char == '/')
	{
		advance();
		return Token::Whitespace;
	}
	// "/***" is skipped like a plain comment
	if (m_char == '*')
		return skipMultiLineComment();

	// "/**" documentation comment
	auto& docComment = m_skippedComments[NextNext];
	docComment.location.start = slashPosition;
	docComment.location.sourceName = m_sourceName;
	Token const comment = scanMultiLineDocComment();
	docComment.location.end = static_cast<int>(sourcePos());
	docComment.token = comment;
	return comment == Token::Illegal ? Token::Illegal : Token::Whitespace;
}
/// Scans the next token into m_tokens[NextNext].
/// Both the token and the skipped-comment entry of the NextNext slot are reset first.
/// Whitespace (which includes comments, see scanSlash()) is skipped by looping until
/// a non-whitespace token has been produced. Finally the token's location, kind and
/// extended token info are stored.
void Scanner::scanToken()
{
	m_tokens[NextNext] = {};
	m_skippedComments[NextNext] = {};

	Token token;
	// Extended token info as delivered by scanIdentifierOrKeyword().
	// Zero-initialised because every other token kind leaves them untouched and they
	// are read unconditionally when the token is stored at the end of this function.
	unsigned m = 0;
	unsigned n = 0;
	do
	{
		// Remember where the token starts.
		m_tokens[NextNext].location.start = static_cast<int>(sourcePos());
		switch (m_char)
		{
		case '"':
		case '\'':
			token = scanString(false);
			break;
		case '<':
			// < <= << <<=
			advance();
			if (m_char == '=')
				token = selectToken(Token::LessThanOrEqual);
			else if (m_char == '<')
				token = selectToken('=', Token::AssignShl, Token::SHL);
			else
				token = Token::LessThan;
			break;
		case '>':
			// > >= >> >>= >>> >>>=
			advance();
			if (m_char == '=')
				token = selectToken(Token::GreaterThanOrEqual);
			else if (m_char == '>')
			{
				// >> >>= >>> >>>=
				advance();
				if (m_char == '=')
					token = selectToken(Token::AssignSar);
				else if (m_char == '>')
					token = selectToken('=', Token::AssignShr, Token::SHR);
				else
					token = Token::SAR;
			}
			else
				token = Token::GreaterThan;
			break;
		case '=':
			// = == =>
			advance();
			if (m_char == '=')
				token = selectToken(Token::Equal);
			else if (m_char == '>')
				token = selectToken(Token::DoubleArrow);
			else
				token = Token::Assign;
			break;
		case '!':
			// ! !=
			advance();
			if (m_char == '=')
				token = selectToken(Token::NotEqual);
			else
				token = Token::Not;
			break;
		case '+':
			// + ++ +=
			advance();
			if (m_char == '+')
				token = selectToken(Token::Inc);
			else if (m_char == '=')
				token = selectToken(Token::AssignAdd);
			else
				token = Token::Add;
			break;
		case '-':
			// - -- -= ->
			advance();
			if (m_char == '-')
				token = selectToken(Token::Dec);
			else if (m_char == '=')
				token = selectToken(Token::AssignSub);
			else if (m_char == '>')
				token = selectToken(Token::RightArrow);
			else
				token = Token::Sub;
			break;
		case '*':
			// * ** *=
			advance();
			if (m_char == '*')
				token = selectToken(Token::Exp);
			else if (m_char == '=')
				token = selectToken(Token::AssignMul);
			else
				token = Token::Mul;
			break;
		case '%':
			// % %=
			token = selectToken('=', Token::AssignMod, Token::Mod);
			break;
		case '/':
			// / /= and all comment forms
			token = scanSlash();
			break;
		case '&':
			// & && &=
			advance();
			if (m_char == '&')
				token = selectToken(Token::And);
			else if (m_char == '=')
				token = selectToken(Token::AssignBitAnd);
			else
				token = Token::BitAnd;
			break;
		case '|':
			// | || |=
			advance();
			if (m_char == '|')
				token = selectToken(Token::Or);
			else if (m_char == '=')
				token = selectToken(Token::AssignBitOr);
			else
				token = Token::BitOr;
			break;
		case '^':
			// ^ ^=
			token = selectToken('=', Token::AssignBitXor, Token::BitXor);
			break;
		case '.':
			// . or a number that starts with a period
			advance();
			if (isDecimalDigit(m_char))
				token = scanNumber('.');
			else
				token = Token::Period;
			break;
		case ':':
			// : :=
			advance();
			if (m_char == '=')
				token = selectToken(Token::AssemblyAssign);
			else
				token = Token::Colon;
			break;
		case ';':
			token = selectToken(Token::Semicolon);
			break;
		case ',':
			token = selectToken(Token::Comma);
			break;
		case '(':
			token = selectToken(Token::LParen);
			break;
		case ')':
			token = selectToken(Token::RParen);
			break;
		case '[':
			token = selectToken(Token::LBrack);
			break;
		case ']':
			token = selectToken(Token::RBrack);
			break;
		case '{':
			token = selectToken(Token::LBrace);
			break;
		case '}':
			token = selectToken(Token::RBrace);
			break;
		case '?':
			token = selectToken(Token::Conditional);
			break;
		case '~':
			token = selectToken(Token::BitNot);
			break;
		default:
			if (isIdentifierStart(m_char))
			{
				tie(token, m, n) = scanIdentifierOrKeyword();

				// "hex" must be followed directly by a quoted hex string.
				if (token == Token::Hex)
				{
					m = 0;
					n = 0;

					if (m_char == '"' || m_char == '\'')
						token = scanHexString();
					else
						token = setError(ScannerError::IllegalToken);
				}
				// Outside of Yul, "unicode" must be followed directly by a quoted string.
				else if (token == Token::Unicode && m_kind != ScannerKind::Yul)
				{
					m = 0;
					n = 0;

					if (m_char == '"' || m_char == '\'')
						token = scanString(true);
					else
						token = setError(ScannerError::IllegalToken);
				}
			}
			else if (isDecimalDigit(m_char))
				token = scanNumber();
			else if (skipWhitespace())
				token = Token::Whitespace;
			else if (isSourcePastEndOfInput())
				token = Token::EOS;
			else
				token = selectErrorToken(ScannerError::IllegalToken);
			break;
		}
	}
	while (token == Token::Whitespace);
	m_tokens[NextNext].location.end = static_cast<int>(sourcePos());
	m_tokens[NextNext].location.sourceName = m_sourceName;
	m_tokens[NextNext].token = token;
	m_tokens[NextNext].extendedTokenInfo = make_tuple(m, n);
}
/// Scans the part of an escape sequence that follows the backslash and appends the
/// resulting character(s) to the current literal.
/// An escaped line ending is consumed without adding anything to the literal.
/// Handled escapes: \' \" \\ \n \r \t, \uNNNN (via scanUnicode) and \xNN (via scanHexByte).
/// @returns false for an unknown escape character or a malformed \u or \x sequence.
bool Scanner::scanEscape()
{
	char const escapeChar = m_char;

	if (tryScanEndOfLine())
		return true;
	advance();

	if (escapeChar == 'u')
	{
		auto const codepoint = scanUnicode();
		if (!codepoint.has_value())
			return false;
		addUnicodeAsUTF8(*codepoint);
		return true;
	}

	char decoded = escapeChar;
	if (escapeChar == '\'' || escapeChar == '"' || escapeChar == '\\')
	{
		// These stand for themselves.
	}
	else if (escapeChar == 'n')
		decoded = '\n';
	else if (escapeChar == 'r')
		decoded = '\r';
	else if (escapeChar == 't')
		decoded = '\t';
	else if (escapeChar == 'x')
	{
		if (!scanHexByte(decoded))
			return false;
	}
	else
		return false;

	addLiteralChar(decoded);
	return true;
}
/// @returns true if the scanner is positioned on a line break: a character in the
/// range 0x0a..0x0d, the byte sequence C2 85 (UTF-8 for U+0085), or the byte sequence
/// E2 80 A8 / E2 80 A9 (UTF-8 for U+2028 / U+2029).
bool Scanner::isUnicodeLinebreak()
{
	// line feed, vertical tab, form feed, carriage return
	if (m_char >= 0x0a && m_char <= 0x0d)
		return true;

	// U+0085
	bool const twoBytesAvailable = !m_source.isPastEndOfInput(1);
	if (twoBytesAvailable && uint8_t(m_source.get(0)) == 0xc2 && uint8_t(m_source.get(1)) == 0x85)
		return true;

	// U+2028 and U+2029
	if (m_source.isPastEndOfInput(2))
		return false;
	if (uint8_t(m_source.get(0)) != 0xe2 || uint8_t(m_source.get(1)) != 0x80)
		return false;
	uint8_t const thirdByte = uint8_t(m_source.get(2));
	return thirdByte == 0xa8 || thirdByte == 0xa9;
}
/// Scans a quoted string literal; the current character is taken as the quote character.
/// Escape sequences are decoded through scanEscape(). Unless @a _isUnicode is set,
/// only characters in the range 0x20..0x7e are accepted outside of escape sequences.
/// Unicode literals are additionally checked with validateBiDiMarkup().
/// @returns Token::UnicodeStringLiteral or Token::StringLiteral on success, otherwise
/// the result of setError() with the matching error code.
Token Scanner::scanString(bool const _isUnicode)
{
	size_t const literalStart = m_source.position();
	char const quote = m_char;
	advance(); // opening quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);
	for (;;)
	{
		if (m_char == quote || isSourcePastEndOfInput() || isUnicodeLinebreak())
			break;

		char const current = m_char;
		advance();
		if (current == '\\')
		{
			if (isSourcePastEndOfInput() || !scanEscape())
				return setError(ScannerError::IllegalEscapeSequence);
			continue;
		}

		// A fixed range is used here instead of a locale-dependent classification.
		bool const printableAscii =
			static_cast<unsigned>(current) > 0x1f && static_cast<unsigned>(current) < 0x7f;
		if (!_isUnicode && !printableAscii)
			return setError(ScannerError::IllegalCharacterInString);
		addLiteralChar(current);
	}
	if (m_char != quote)
		return setError(ScannerError::IllegalStringEndQuote);

	if (_isUnicode)
	{
		ScannerError const bidiError = validateBiDiMarkup(m_source, literalStart);
		if (bidiError != ScannerError::NoError)
			return setError(bidiError);
	}

	literal.complete();
	advance(); // closing quote
	if (_isUnicode)
		return Token::UnicodeStringLiteral;
	return Token::StringLiteral;
}
/// Scans a quoted hex string; the current character is taken as the quote character.
/// The content is read as pairs of hex digits, each pair yielding one literal byte.
/// A single '_' is accepted only directly after a byte and not directly in front of
/// the closing quote.
/// @returns Token::HexStringLiteral on success, otherwise the result of setError()
/// with the matching error code.
Token Scanner::scanHexString()
{
	char const quote = m_char;
	advance(); // opening quote
	LiteralScope literal(this, LITERAL_TYPE_STRING);
	bool separatorPermitted = false;
	while (m_char != quote && !isSourcePastEndOfInput())
	{
		// scanHexByte() only overwrites this on success, so after a failure it still
		// holds the character the scanner is positioned on.
		char byteOrChar = m_char;

		if (scanHexByte(byteOrChar))
		{
			addLiteralChar(byteOrChar);
			separatorPermitted = true;
			continue;
		}

		if (byteOrChar != '_')
			return setError(ScannerError::IllegalHexString);

		advance();
		if (!separatorPermitted || m_char == quote)
			return setError(ScannerError::IllegalNumberSeparator);
		separatorPermitted = false;
	}

	if (m_char != quote)
		return setError(ScannerError::IllegalStringEndQuote);

	literal.complete();
	advance(); // closing quote
	return Token::HexStringLiteral;
}
/// Appends a run of decimal digits and underscores to the current literal.
/// The run has to start with a decimal digit; otherwise nothing is consumed.
/// The placement of underscores is not validated here.
void Scanner::scanDecimalDigits()
{
	if (!isDecimalDigit(m_char))
		return;

	// The first character is known to be a digit.
	addLiteralCharAndAdvance();
	while (!m_source.isPastEndOfInput() && (isDecimalDigit(m_char) || m_char == '_'))
		addLiteralCharAndAdvance();
}
/// Scans a number literal (decimal with optional fraction and exponent, or hexadecimal
/// with "0x" prefix) into the current literal.
/// @param _charSeen '.' if the caller has already consumed the leading period of a
/// fraction; otherwise it has to be 0 (asserted).
/// @returns Token::Number on success, otherwise the result of setError() with the
/// matching error code.
Token Scanner::scanNumber(char _charSeen)
{
	enum { DECIMAL, HEX, BINARY } kind = DECIMAL;
	LiteralScope literal(this, LITERAL_TYPE_NUMBER);
	if (_charSeen == '.')
	{
		// The decimal point has already been consumed by the caller.
		addLiteralChar('.');
		if (m_char == '_')
			return setError(ScannerError::IllegalToken);
		scanDecimalDigits();
	}
	else
	{
		solAssert(_charSeen == 0, "");
		// A leading '0' may introduce a hex number; a further decimal digit is rejected.
		if (m_char == '0')
		{
			addLiteralCharAndAdvance();
			if (m_char == 'x')
			{
				kind = HEX;
				addLiteralCharAndAdvance();
				// At least one hex digit has to follow the 'x'.
				if (!isHexDigit(m_char))
					return setError(ScannerError::IllegalHexDigit);

				// Underscores are kept in the literal.
				while (isHexDigit(m_char) || m_char == '_')
					addLiteralCharAndAdvance();
			}
			else if (isDecimalDigit(m_char))
				return setError(ScannerError::OctalNotAllowed);
		}
		// Decimal digits, optionally followed by a fractional part.
		if (kind == DECIMAL)
		{
			scanDecimalDigits();
			if (m_char == '.')
			{
				if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
				{
					// "._" : consume the period, the underscore and any following digits.
					addLiteralCharAndAdvance();
					addLiteralCharAndAdvance();
					scanDecimalDigits();
				}
				// The guard uses offset 1 because offset 1 is what gets read; this
				// matches the "._" check above. The outcome is the same as with the
				// former offset-0 guard, but nothing beyond the input is read.
				if (m_source.isPastEndOfInput(1) || !isDecimalDigit(m_source.get(1)))
				{
					// No digit follows: the literal ends here.
					literal.complete();
					return Token::Number;
				}
				addLiteralCharAndAdvance();
				scanDecimalDigits();
			}
		}
	}
	// Exponent, if any.
	if (m_char == 'e' || m_char == 'E')
	{
		solAssert(kind != HEX, "'e'/'E' must be scanned as part of the hex number");
		if (kind != DECIMAL)
			return setError(ScannerError::IllegalExponent);
		else if (!m_source.isPastEndOfInput(1) && m_source.get(1) == '_')
		{
			// "e_" : consume the 'e', the underscore and any following digits.
			addLiteralCharAndAdvance();
			addLiteralCharAndAdvance();
			scanDecimalDigits();
			literal.complete();
			return Token::Number;
		}
		addLiteralCharAndAdvance(); // 'e' or 'E'
		if (m_char == '+' || m_char == '-')
			addLiteralCharAndAdvance();
		// At least one decimal digit has to follow.
		if (!isDecimalDigit(m_char))
			return setError(ScannerError::IllegalExponent);
		scanDecimalDigits();
	}
	// A number must not be followed directly by a decimal digit or an identifier start.
	if (isDecimalDigit(m_char) || isIdentifierStart(m_char))
		return setError(ScannerError::IllegalNumberEnd);
	literal.complete();
	return Token::Number;
}
/// Scans an identifier or keyword into the current literal and classifies it with
/// TokenTraits::fromIdentifierOrKeyword().
/// In Yul mode, '.' is accepted inside identifiers, "leave" yields Token::Leave, and
/// every result that is not a Yul keyword is turned into Token::Identifier.
/// @returns the token together with the two extra values of its classification.
tuple<Token, unsigned, unsigned> Scanner::scanIdentifierOrKeyword()
{
	solAssert(isIdentifierStart(m_char), "");
	LiteralScope literal(this, LITERAL_TYPE_STRING);
	// The first character is known to be an identifier start; take it and then
	// continue for as long as identifier characters follow.
	do
		addLiteralCharAndAdvance();
	while (isIdentifierPart(m_char) || (m_char == '.' && m_kind == ScannerKind::Yul));
	literal.complete();

	auto const& identifier = m_tokens[NextNext].literal;
	auto const classified = TokenTraits::fromIdentifierOrKeyword(identifier);
	if (m_kind != ScannerKind::Yul)
		return classified;

	if (identifier == "leave")
		return std::make_tuple(Token::Leave, 0, 0);
	if (!TokenTraits::isYulKeyword(std::get<0>(classified)))
		return std::make_tuple(Token::Identifier, 0, 0);
	return classified;
}
}