This commit is contained in:
Alexey Andreev 2023-10-22 21:35:00 +02:00
parent a52a3ade2a
commit 4db36ba827
3 changed files with 544 additions and 40 deletions

View File

@ -17,6 +17,7 @@ package org.teavm.js.lexer;
final class CharacterClassifier {
static final int EOF = -1;
static final int CODEPOINT_COUNT = 0x10f800;
// Private no-op constructor: the class cannot be instantiated from outside.
private CharacterClassifier() {
}
@ -53,6 +54,27 @@ final class CharacterClassifier {
}
/**
 * Tells whether a code point may begin an identifier.
 *
 * <p>Accepts every code point accepted by {@link Character#isUnicodeIdentifierStart(int)}
 * and, in addition, {@code _} and {@code $}.
 *
 * @param codePoint the code point to classify
 * @return {@code true} if the code point can start an identifier
 */
static boolean isIdentifierStart(int codePoint) {
    // A single return: a second statement after an unconditional return is unreachable
    // and is rejected by the Java compiler.
    return Character.isUnicodeIdentifierStart(codePoint) || codePoint == '_' || codePoint == '$';
}
/**
 * Tells whether a code point may appear inside an identifier after its first character.
 *
 * <p>{@code _}, {@code $}, U+200C and U+200D are always accepted; every other code point
 * is delegated to {@link Character#isUnicodeIdentifierPart(int)}.
 *
 * @param codePoint the code point to classify
 * @return {@code true} if the code point can continue an identifier
 */
static boolean isIdentifierPart(int codePoint) {
    switch (codePoint) {
        case '_':
        case '$':
        case 0x200C:
        case 0x200D:
            return true;
        default:
            return Character.isUnicodeIdentifierPart(codePoint);
    }
}
/**
 * Tells whether a code point is one of the ASCII digits {@code 0}..{@code 9}.
 *
 * @param codePoint the code point to classify
 * @return {@code true} for {@code '0'} through {@code '9'}, {@code false} otherwise
 */
static boolean isDecimalDigit(int codePoint) {
    if (codePoint < '0') {
        return false;
    }
    return codePoint <= '9';
}
/**
 * Converts a hexadecimal digit to its numeric value.
 *
 * @param codePoint the code point to convert
 * @return 0..15 for {@code 0-9}, {@code A-F} and {@code a-f}; {@code -1} for any other
 *         code point
 */
static int hexDigit(int codePoint) {
    // The lower bound must be the character '0', not the number 0; otherwise control
    // characters, space and punctuation below '0' would yield values such as -16.
    if (codePoint >= '0' && codePoint <= '9') {
        return codePoint - '0';
    } else if (codePoint >= 'A' && codePoint <= 'F') {
        return 10 + (codePoint - 'A');
    } else if (codePoint >= 'a' && codePoint <= 'f') {
        return 10 + (codePoint - 'a');
    } else {
        return -1;
    }
}
}

View File

@ -26,6 +26,7 @@ public class Lexer {
private int tokenEndLine;
private int tokenEndColumn;
private boolean expectRegex;
private boolean hasEscapeSequence;
private Token token;
private String tokenValue;
@ -40,44 +41,168 @@ public class Lexer {
this.expectRegex = expectRegex;
}
/**
 * Returns the kind of the current token, as last assigned while lexing.
 *
 * @return the current token kind
 */
public Token token() {
    return this.token;
}
/**
 * Returns the text value stored for the current token.
 *
 * @return the token value; {@code null} when none was stored, since {@code next()}
 *         resets it to {@code null} before reading a token
 */
public String tokenValue() {
    return this.tokenValue;
}
/**
 * Tells whether a unicode escape sequence was accepted while reading the current token.
 *
 * @return the flag value; it is cleared before each token is read
 */
public boolean hasEscapeSequence() {
    return this.hasEscapeSequence;
}
/**
 * Returns the reader line captured just before the current token was read.
 *
 * @return the line on which the current token starts
 */
public int tokenStartLine() {
    return this.tokenStartLine;
}
/**
 * Returns the reader column captured just before the current token was read.
 *
 * @return the column at which the current token starts
 */
public int tokenStartColumn() {
    return this.tokenStartColumn;
}
/**
 * Returns the reader offset captured just before the current token was read.
 *
 * @return the offset at which the current token starts
 */
public int tokenStartOffset() {
    return this.tokenStartOffset;
}
/**
 * Returns the stored end line of the current token.
 *
 * @return the value of the {@code tokenEndLine} field
 */
public int tokenEndLine() {
    return this.tokenEndLine;
}
/**
 * Returns the stored end column of the current token.
 *
 * @return the value of the {@code tokenEndColumn} field
 */
public int tokenEndColumn() {
    return this.tokenEndColumn;
}
/**
 * Returns the reader offset captured right after the current token was read.
 *
 * @return the offset at which the current token ends
 */
public int tokenEndOffset() {
    return this.tokenEndOffset;
}
public void next() {
skipSpaces();
tokenStartOffset = reader.offset();
tokenStartLine = reader.line();
tokenStartColumn = reader.column();
tokenValue = null;
token = null;
while (token == null) {
tokenStartOffset = reader.offset();
tokenStartLine = reader.line();
tokenStartColumn = reader.column();
tokenValue = null;
hasEscapeSequence = false;
switch (reader.currentChar()) {
case CharacterClassifier.EOF:
token = Token.EOF;
break;
case 0x000D:
readLineTerminator();
break;
switch (reader.currentChar()) {
case CharacterClassifier.EOF:
token = Token.EOF;
break;
case 0x000D:
readLineTerminator();
break;
case 0x000A:
case 0x2028:
case 0x2029:
reader.next();
token = Token.EOF;
break;
case 0x000A:
case 0x2028:
case 0x2029:
tokenAfterCurrentChar(Token.LINE_TERMINATOR);
break;
case '$':
case '_':
case '\\':
readIdentifier();
break;
case '/':
readSlash();
break;
default:
if (CharacterClassifier.isIdentifierStart(reader.currentChar())) {
case '$':
case '_':
case '\\':
readIdentifier();
}
break;
break;
case '/':
readSlash();
break;
case '{':
tokenAfterCurrentChar(Token.LEFT_CURLY_BRACKET);
break;
case '}':
tokenAfterCurrentChar(Token.RIGHT_CURLY_BRACKET);
break;
case '(':
tokenAfterCurrentChar(Token.LEFT_PARENTHESIS);
break;
case ')':
tokenAfterCurrentChar(Token.RIGHT_PARENTHESIS);
break;
case '[':
tokenAfterCurrentChar(Token.LEFT_SQUARE_BRACKET);
break;
case ']':
tokenAfterCurrentChar(Token.RIGHT_SQUARE_BRACKET);
break;
case '.':
readDot();
break;
case ';':
tokenAfterCurrentChar(Token.SEMICOLON);
break;
case ',':
tokenAfterCurrentChar(Token.COMMA);
break;
case '<':
readLess();
break;
case '>':
readGreater();
break;
case '=':
readEqual();
break;
case '!':
readExclamationSign();
break;
case '+':
readOperationOrAssign(Token.PLUS_ASSIGN, Token.PLUS);
break;
case '-':
readOperationOrAssign(Token.MINUS_ASSIGN, Token.MINUS);
break;
case '*':
readOperationOrAssign(Token.MULTIPLY_ASSIGN, Token.MULTIPLY);
break;
case '%':
readOperationOrAssign(Token.REMAINDER_ASSIGN, Token.REMAINDER);
break;
case '&':
readLogical(Token.AND_ASSIGN, Token.AND, Token.AND_AND);
break;
case '|':
readLogical(Token.OR_ASSIGN, Token.OR, Token.OR_OR);
break;
case '^':
readOperationOrAssign(Token.XOR_ASSIGN, Token.XOR);
break;
case '~':
tokenAfterCurrentChar(Token.INVERT);
break;
case '?':
tokenAfterCurrentChar(Token.QUESTION);
break;
case ':':
tokenAfterCurrentChar(Token.COLON);
break;
case '0':
readNumericLiteral();
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
readDecimalLiteral(new StringBuilder());
break;
default:
if (CharacterClassifier.isIdentifierStart(reader.currentChar())) {
readIdentifier();
}
break;
}
}
tokenEndOffset = reader.offset();
@ -101,7 +226,11 @@ public class Lexer {
if (expectRegex) {
readRegexLiteral();
} else {
token = Token.SLASH;
if (reader.currentChar() == '=') {
tokenAfterCurrentChar(Token.DIVIDE_ASSIGN);
} else {
token = Token.DIVIDE;
}
}
}
@ -159,16 +288,44 @@ public class Lexer {
int column = reader.column();
int offset = reader.offset();
int codePoint = tryReadUnicodeEscapeSequence();
if (codePoint >= 0) {
if (!Character.isUnicodeIdentifierStart(codePoint) && codePoint != '$' && codePoint != '_') {
errorReporter.reportError(line, column, offset, "Invalid ");
if (codePoint < 0) {
return;
}
if (!CharacterClassifier.isIdentifierStart(codePoint)) {
errorReporter.reportError(line, column, offset, "Invalid identifier start character"
+ "represented by escape sequence");
return;
}
sb.appendCodePoint(codePoint);
hasEscapeSequence = true;
} else {
sb.appendCodePoint(reader.currentChar());
reader.next();
}
while (true) {
if (reader.currentChar() == '\\') {
reader.next();
int line = reader.line();
int column = reader.column();
int offset = reader.offset();
int codePoint = tryReadUnicodeEscapeSequence();
if (codePoint < 0) {
break;
}
if (!CharacterClassifier.isIdentifierPart(codePoint)) {
errorReporter.reportError(line, column, offset, "Invalid identifier character "
+ "represented by escape sequence");
} else {
sb.appendCodePoint(codePoint);
hasEscapeSequence = true;
}
} else if (CharacterClassifier.isIdentifierStart(reader.currentChar())) {
sb.appendCodePoint(reader.currentChar());
reader.next();
} else {
break;
}
} else {
sb.append(reader.currentChar());
reader.next();
}
tokenValue = sb.toString();
@ -176,20 +333,295 @@ public class Lexer {
}
/**
 * Reads the remainder of a unicode escape sequence, starting at the character where
 * {@code u} is expected.
 *
 * @return the result of {@code readUnicodeEscapeSequence()} when the current character is
 *         {@code u}; otherwise reports an error, consumes nothing and returns {@code -1}
 */
private int tryReadUnicodeEscapeSequence() {
    if (reader.currentChar() == 'u') {
        reader.next();
        return readUnicodeEscapeSequence();
    }
    reportError("Invalid unicode escape sequence in identifier");
    return -1;
}
/**
 * Reads a token that begins with {@code .}: a single dot, an ellipsis ({@code ...}) or a
 * numeric literal that starts with a fractional part (for example {@code .5}).
 *
 * <p>Two dots not followed by a third are reported as an error and produce
 * {@link Token#DOT}.
 */
private void readDot() {
    reader.next();
    if (reader.currentChar() == '.') {
        reader.next();
        if (reader.currentChar() != '.') {
            token = Token.DOT;
            reportError("One more dot expected to for ellipsis (...)");
        } else {
            // Consume the third dot as well; otherwise it would be lexed again as a
            // separate DOT token right after the ELLIPSIS.
            tokenAfterCurrentChar(Token.ELLIPSIS);
        }
    } else if (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
        token = Token.NUMERIC_LITERAL;
        readFractionalAfterDot(new StringBuilder("."));
        checkCharAfterNumber();
    } else {
        token = Token.DOT;
    }
}
/**
 * Reads a token that begins with {@code <}: {@code <}, {@code <=}, {@code <<} or
 * {@code <<=}.
 */
private void readLess() {
    reader.next();
    var following = reader.currentChar();
    if (following == '=') {
        tokenAfterCurrentChar(Token.LESS_OR_EQUAL);
    } else if (following == '<') {
        reader.next();
        if (reader.currentChar() == '=') {
            tokenAfterCurrentChar(Token.SHIFT_LEFT_ASSIGN);
        } else {
            token = Token.SHIFT_LEFT;
        }
    } else {
        token = Token.LESS;
    }
}
/**
 * Reads a token that begins with {@code >}: {@code >}, {@code >=}, {@code >>},
 * {@code >>=}, {@code >>>} or {@code >>>=}.
 */
private void readGreater() {
    reader.next();
    switch (reader.currentChar()) {
        case '=':
            tokenAfterCurrentChar(Token.GREATER_OR_EQUAL);
            break;
        case '>':
            reader.next();
            switch (reader.currentChar()) {
                case '>':
                    // Step over the third '>' before looking for '='. Without this the
                    // '=' test inspects the '>' itself, can never succeed, and the third
                    // '>' stays in the input to be lexed again.
                    reader.next();
                    if (reader.currentChar() == '=') {
                        tokenAfterCurrentChar(Token.SHIFT_RIGHT_UNSIGNED_ASSIGN);
                    } else {
                        token = Token.SHIFT_RIGHT_UNSIGNED;
                    }
                    break;
                case '=':
                    tokenAfterCurrentChar(Token.SHIFT_RIGHT_ASSIGN);
                    break;
                default:
                    token = Token.SHIFT_RIGHT;
                    break;
            }
            break;
        default:
            token = Token.GREATER;
            break;
    }
}
/**
 * Reads a token that begins with {@code =}: {@code =}, {@code ==}, {@code ===} or
 * {@code =>}.
 */
private void readEqual() {
    reader.next();
    switch (reader.currentChar()) {
        case '=':
            reader.next();
            if (reader.currentChar() == '=') {
                tokenAfterCurrentChar(Token.STRICT_EQUAL);
            } else {
                token = Token.EQUAL;
            }
            break;
        case '>':
            // Consume the '>' of "=>"; otherwise it would be lexed again as GREATER.
            tokenAfterCurrentChar(Token.ARROW);
            break;
        default:
            token = Token.ASSIGN;
            break;
    }
}
/**
 * Reads a token that begins with {@code !}: {@code !}, {@code !=} or {@code !==}.
 */
private void readExclamationSign() {
    reader.next();
    if (reader.currentChar() != '=') {
        token = Token.NOT;
        return;
    }
    reader.next();
    if (reader.currentChar() != '=') {
        token = Token.NOT_EQUAL;
        return;
    }
    tokenAfterCurrentChar(Token.STRICT_NOT_EQUAL);
}
/**
 * Reads a one-character operator that may be followed by {@code =}.
 *
 * @param assign  token produced when the operator is immediately followed by {@code =}
 * @param regular token produced otherwise
 */
private void readOperationOrAssign(Token assign, Token regular) {
    reader.next();
    if (reader.currentChar() != '=') {
        token = regular;
        return;
    }
    tokenAfterCurrentChar(assign);
}
/**
 * Reads an operator character that may be followed by {@code =} or doubled
 * (as {@code &}/{@code &&} and {@code |}/{@code ||} are).
 *
 * @param assign  token produced when the operator is followed by {@code =}
 * @param bitwise token produced for the single operator character
 * @param logical token produced when the operator character is repeated
 */
private void readLogical(Token assign, Token bitwise, Token logical) {
    var operator = reader.currentChar();
    reader.next();
    var following = reader.currentChar();
    if (following == '=') {
        tokenAfterCurrentChar(assign);
        return;
    }
    if (following == operator) {
        tokenAfterCurrentChar(logical);
        return;
    }
    token = bitwise;
}
/**
 * Reads a numeric literal whose first character is {@code 0}.
 *
 * <p>Dispatches on the character after the zero: a fractional part, a hex/binary/octal
 * prefix, an exponent, further decimal digits, or nothing (a bare {@code 0}).
 */
private void readNumericLiteral() {
    // Every path below is a numeric literal; set the token here because the fractional
    // and exponent helpers do not assign it themselves.
    token = Token.NUMERIC_LITERAL;
    var sb = new StringBuilder();
    sb.appendCodePoint(reader.currentChar());
    reader.next();
    switch (reader.currentChar()) {
        case '.':
            readFractional(sb);
            checkCharAfterNumber();
            break;
        case 'x':
        case 'X':
            readHexLiteral();
            break;
        case 'b':
        case 'B':
            readBinaryLiteral();
            break;
        case 'o':
        case 'O':
            readOctalLiteral();
            break;
        case 'E':
        case 'e':
            // Continue the same buffer so the leading "0" is kept in front of the exponent.
            readExponent(sb);
            checkCharAfterNumber();
            break;
        default:
            if (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
                readDecimalLiteral(sb);
            } else {
                // A bare "0": the following character (possibly EOF) belongs to the next
                // token and must not be consumed as part of the number.
                checkCharAfterNumber();
            }
            break;
    }
}
/**
 * Reads the integer digits of a decimal literal and any fractional part or exponent that
 * follows, then runs the post-number check.
 *
 * @param sb buffer receiving the literal's characters; may already hold a prefix
 */
private void readDecimalLiteral(StringBuilder sb) {
    token = Token.NUMERIC_LITERAL;
    // Only digits are consumed here, so a non-digit current character (for example the
    // one following a leading zero) is left for the next token.
    while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
        // appendCodePoint, not append: currentChar() is an int, and append(int) would
        // write the decimal number of the code point ("49" for '1').
        sb.appendCodePoint(reader.currentChar());
        reader.next();
    }
    if (reader.currentChar() == '.') {
        // readFractional appends the dot and steps over it itself.
        readFractional(sb);
    } else if (reader.currentChar() == 'e' || reader.currentChar() == 'E') {
        readExponent(sb);
    }
    checkCharAfterNumber();
}
/**
 * Reads a fractional part, starting at its decimal point: steps over the current
 * character, records a {@code .} in the buffer and reads what follows the dot.
 *
 * @param sb buffer receiving the literal's characters
 */
private void readFractional(StringBuilder sb) {
    reader.next();
    sb.append('.');
    readFractionalAfterDot(sb);
}
/**
 * Reads the digits after a decimal point and an optional exponent.
 *
 * @param sb buffer receiving the literal's characters; the {@code .} is expected to be in
 *           it already
 */
private void readFractionalAfterDot(StringBuilder sb) {
    while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
        // appendCodePoint keeps the character itself (append(int) would write its numeric
        // code); advancing the reader is what lets the loop terminate.
        sb.appendCodePoint(reader.currentChar());
        reader.next();
    }
    if (reader.currentChar() == 'e' || reader.currentChar() == 'E') {
        readExponent(sb);
    }
}
/**
 * Reads the exponent part of a numeric literal: the {@code e}/{@code E} marker at the
 * current position, an optional sign and the exponent digits.
 *
 * <p>Reports an error when no digit follows the marker and optional sign.
 *
 * @param sb buffer receiving the literal's characters
 */
private void readExponent(StringBuilder sb) {
    sb.appendCodePoint(reader.currentChar());
    reader.next();
    if (reader.currentChar() == '+' || reader.currentChar() == '-') {
        sb.appendCodePoint(reader.currentChar());
        reader.next();
    }
    if (!CharacterClassifier.isDecimalDigit(reader.currentChar())) {
        reportError("Digit expected in exponent of numeric literal");
        return;
    }
    while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
        sb.appendCodePoint(reader.currentChar());
        reader.next();
    }
}
// Called by readNumericLiteral when the leading 0 is followed by 'x' or 'X'.
// TODO: not implemented; the body is empty, so nothing is consumed and no token is set.
private void readHexLiteral() {
}
// Called by readNumericLiteral when the leading 0 is followed by 'b' or 'B'.
// TODO: not implemented; the body is empty, so nothing is consumed and no token is set.
private void readBinaryLiteral() {
}
// Called by readNumericLiteral when the leading 0 is followed by 'o' or 'O'.
// TODO: not implemented; the body is empty, so nothing is consumed and no token is set.
private void readOctalLiteral() {
}
// Invoked after a numeric literal has been read (see readDot and readDecimalLiteral).
// TODO: not implemented; the body is empty, so no check is performed yet.
private void checkCharAfterNumber() {
}
/**
 * Reads the body of a unicode escape sequence; the leading {@code \} and {@code u} must
 * already be consumed.
 *
 * <p>Two forms are handled: <code>{H...}</code> (hex digits in braces) and exactly four
 * hex digits.
 *
 * @return the decoded code point, or {@code -1} if an invalid hex digit was met (an
 *         error is reported in that case). A braced value above
 *         {@link Character#MAX_CODE_POINT} is reported once and clamped to that maximum.
 */
private int readUnicodeEscapeSequence() {
    var codePoint = 0;
    if (reader.currentChar() == '{') {
        reader.next();
        var hasError = false;
        var tooBigReported = false;
        var line = reader.line();
        var column = reader.column();
        int offset = reader.offset();
        while (reader.currentChar() != '}') {
            var digit = readHexDigit();
            if (digit < 0) {
                // Also reached at end of input, which is never a hex digit.
                reportError("Invalid hex digit in unicode escape sequence");
                hasError = true;
                break;
            }
            codePoint <<= 4;
            codePoint += digit;
            // 0x10FFFF is the largest code point; clamping after every digit also keeps
            // the accumulator from overflowing on very long digit runs.
            if (codePoint > Character.MAX_CODE_POINT) {
                codePoint = Character.MAX_CODE_POINT;
                if (!tooBigReported) {
                    tooBigReported = true;
                    errorReporter.reportError(line, column, offset,
                            "Too big codepoint value in escape sequence");
                }
            }
        }
        if (hasError) {
            // Recover by skipping to the closing brace, but never past the end of the
            // line or the end of input.
            while (reader.currentChar() != '}'
                    && reader.currentChar() != CharacterClassifier.EOF
                    && !CharacterClassifier.isLineTerminator(reader.currentChar())) {
                reader.next();
            }
        }
        if (reader.currentChar() == '}') {
            // The closing brace is part of the escape sequence; leaving it in the input
            // would make it show up as a separate token.
            reader.next();
        }
        if (hasError) {
            return -1;
        }
    } else {
        for (var i = 0; i < 4; ++i) {
            var digit = readHexDigit();
            if (digit < 0) {
                reportError("Invalid hex digit in unicode escape sequence");
                // Callers treat a negative result as "no code point"; a partially
                // decoded value must not be used.
                return -1;
            }
            codePoint <<= 4;
            codePoint |= digit;
        }
    }
    return codePoint;
}
/**
 * Decodes the current character as a hex digit and steps over it on success.
 *
 * @return the value from {@code CharacterClassifier.hexDigit}; when it is negative the
 *         reader is not advanced
 */
private int readHexDigit() {
    var value = CharacterClassifier.hexDigit(reader.currentChar());
    if (value < 0) {
        return value;
    }
    reader.next();
    return value;
}
// TODO: not implemented; the body is empty, so nothing is consumed and no token is set.
private void readRegexLiteral() {
}
/**
 * Sets the current token and steps over the current character.
 *
 * @param token the token kind to store
 */
private void tokenAfterCurrentChar(Token token) {
    this.token = token;
    reader.next();
}
/**
 * Advances the reader past every character that {@code CharacterClassifier.isWhiteSpace}
 * accepts.
 */
private void skipSpaces() {
    for (;;) {
        if (!CharacterClassifier.isWhiteSpace(reader.currentChar())) {
            return;
        }
        reader.next();
    }
}
/**
 * Reports an error at the reader's current position.
 *
 * @param error the message passed to the error reporter
 */
private void reportError(String error) {
    var line = reader.line();
    var column = reader.column();
    var offset = reader.offset();
    errorReporter.reportError(line, column, offset, error);
}
}

View File

@ -18,8 +18,58 @@ package org.teavm.js.lexer;
/**
 * Kinds of tokens produced by the lexer.
 */
public enum Token {
// End of input.
EOF,
// Line break: the lexer emits this for U+000A, U+2028 and U+2029.
LINE_TERMINATOR,
// NOTE(review): both SLASH and DIVIDE are assigned for '/' in the lexer code visible
// here; they look redundant, confirm which one is intended before removing either.
SLASH,
COMMENT,
// NOTE(review): the name is spelled IDENTIFER (not IDENTIFIER); renaming it would break
// any code that refers to this constant.
IDENTIFER,
// Brackets: { } ( ) [ ]
LEFT_CURLY_BRACKET,
RIGHT_CURLY_BRACKET,
LEFT_PARENTHESIS,
RIGHT_PARENTHESIS,
LEFT_SQUARE_BRACKET,
RIGHT_SQUARE_BRACKET,
// Punctuation: .  ...  ;  ,
DOT,
ELLIPSIS,
SEMICOLON,
COMMA,
// Comparison: <  >  <=  >=  ==  !=  ===  !==
LESS,
GREATER,
LESS_OR_EQUAL,
GREATER_OR_EQUAL,
EQUAL,
NOT_EQUAL,
STRICT_EQUAL,
STRICT_NOT_EQUAL,
// Shifts: <<  >>  >>>
SHIFT_LEFT,
SHIFT_RIGHT,
SHIFT_RIGHT_UNSIGNED,
// Arithmetic: +  -  *  /  %
PLUS,
MINUS,
MULTIPLY,
DIVIDE,
REMAINDER,
// Bitwise and logical: &  |  ^  !  ~  &&  ||
AND,
OR,
XOR,
NOT,
INVERT,
AND_AND,
OR_OR,
// Conditional operator parts: ?  :
QUESTION,
COLON,
// Assignment: =  +=  -=  *=  /=  %=  &=  |=  ^=  <<=  >>=  >>>=
ASSIGN,
PLUS_ASSIGN,
MINUS_ASSIGN,
MULTIPLY_ASSIGN,
DIVIDE_ASSIGN,
REMAINDER_ASSIGN,
AND_ASSIGN,
OR_ASSIGN,
XOR_ASSIGN,
SHIFT_LEFT_ASSIGN,
SHIFT_RIGHT_ASSIGN,
SHIFT_RIGHT_UNSIGNED_ASSIGN,
// =>
ARROW,
// Literals.
NUMERIC_LITERAL,
REGEX_LITERAL
}