From 4db36ba82727c751e44f07a682df6ef1124763be Mon Sep 17 00:00:00 2001
From: Alexey Andreev
Date: Sun, 22 Oct 2023 21:35:00 +0200
Subject: [PATCH] ~WIP

Work in progress on the JavaScript lexer:

* CharacterClassifier: identifier-part, decimal-digit and hex-digit
  helpers; '_' and '$' are accepted as identifier characters.
* Lexer: token accessors, punctuator and operator tokens, escape
  sequences in identifiers, unicode escape parsing. Numeric literals are
  only partially implemented (hex/binary/octal readers are empty stubs).
* Token: punctuator, operator and NUMERIC_LITERAL kinds; SLASH is
  replaced by DIVIDE.

---
 .../teavm/js/lexer/CharacterClassifier.java   |  24 +-
 .../main/java/org/teavm/js/lexer/Lexer.java   | 508 ++++++++++++++++--
 .../main/java/org/teavm/js/lexer/Token.java   |  52 +-
 3 files changed, 544 insertions(+), 40 deletions(-)

diff --git a/js-parser/src/main/java/org/teavm/js/lexer/CharacterClassifier.java b/js-parser/src/main/java/org/teavm/js/lexer/CharacterClassifier.java
index 446c07a09..6e05ee58a 100644
--- a/js-parser/src/main/java/org/teavm/js/lexer/CharacterClassifier.java
+++ b/js-parser/src/main/java/org/teavm/js/lexer/CharacterClassifier.java
@@ -17,6 +17,7 @@ package org.teavm.js.lexer;
 
 final class CharacterClassifier {
     static final int EOF = -1;
+    static final int CODEPOINT_COUNT = 0x110000; // code points run from 0 to 0x10FFFF inclusive
 
     private CharacterClassifier() {
     }
@@ -53,6 +54,27 @@ final class CharacterClassifier {
     }
 
     static boolean isIdentifierStart(int codePoint) {
-        return Character.isUnicodeIdentifierStart(codePoint);
+        return Character.isUnicodeIdentifierStart(codePoint) || codePoint == '_' || codePoint == '$';
+    }
+
+    static boolean isIdentifierPart(int codePoint) {
+        return Character.isUnicodeIdentifierPart(codePoint) || codePoint == '_' || codePoint == '$'
+                || codePoint == 0x200C || codePoint == 0x200D;
+    }
+
+    static boolean isDecimalDigit(int codePoint) {
+        return codePoint >= '0' && codePoint <= '9';
+    }
+
+    static int hexDigit(int codePoint) { // value 0..15 of a hex digit, or -1 for any other input
+        if (codePoint >= '0' && codePoint <= '9') {
+            return codePoint - '0';
+        } else if (codePoint >= 'A' && codePoint <= 'F') {
+            return 10 + (codePoint - 'A');
+        } else if (codePoint >= 'a' && codePoint <= 'f') {
+            return 10 + (codePoint - 'a');
+        } else {
+            return -1;
+        }
     }
 }
diff --git a/js-parser/src/main/java/org/teavm/js/lexer/Lexer.java b/js-parser/src/main/java/org/teavm/js/lexer/Lexer.java
index 261419f11..074c6dd12 100644
--- a/js-parser/src/main/java/org/teavm/js/lexer/Lexer.java
+++ b/js-parser/src/main/java/org/teavm/js/lexer/Lexer.java
@@ -26,6 +26,7 @@ public class Lexer {
     private int tokenEndLine;
     private int tokenEndColumn;
     private boolean expectRegex;
+    private boolean hasEscapeSequence;
     private Token token;
     private String tokenValue;
 
@@ -40,44 +41,171 @@ public class Lexer {
         this.expectRegex = expectRegex;
     }
 
+    public Token token() {
+        return token;
+    }
+
+    public String tokenValue() {
+        return tokenValue;
+    }
+
+    public boolean hasEscapeSequence() {
+        return hasEscapeSequence;
+    }
+
+    public int tokenStartLine() {
+        return tokenStartLine;
+    }
+
+    public int tokenStartColumn() {
+        return tokenStartColumn;
+    }
+
+    public int tokenStartOffset() {
+        return tokenStartOffset;
+    }
+
+    public int tokenEndLine() {
+        return tokenEndLine;
+    }
+
+    public int tokenEndColumn() {
+        return tokenEndColumn;
+    }
+
+    public int tokenEndOffset() {
+        return tokenEndOffset;
+    }
+
     public void next() {
         skipSpaces();
 
-        tokenStartOffset = reader.offset();
-        tokenStartLine = reader.line();
-        tokenStartColumn = reader.column();
-        tokenValue = null;
+        token = null;
+        while (token == null) {
+            tokenStartOffset = reader.offset();
+            tokenStartLine = reader.line();
+            tokenStartColumn = reader.column();
+            tokenValue = null;
+            hasEscapeSequence = false;
 
-        switch (reader.currentChar()) {
-            case CharacterClassifier.EOF:
-                token = Token.EOF;
-                break;
-            case 0x000D:
-                readLineTerminator();
-                break;
+            switch (reader.currentChar()) {
+                case CharacterClassifier.EOF:
+                    token = Token.EOF;
+                    break;
+                case 0x000D:
+                    readLineTerminator();
+                    break;
 
-            case 0x000A:
-            case 0x2028:
-            case 0x2029:
-                reader.next();
-                token = Token.EOF;
-                break;
+                case 0x000A:
+                case 0x2028:
+                case 0x2029:
+                    tokenAfterCurrentChar(Token.LINE_TERMINATOR);
+                    break;
 
-            case '$':
-            case '_':
-            case '\\':
-                readIdentifier();
-                break;
-
-            case '/':
-                readSlash();
-                break;
-
-            default:
-                if (CharacterClassifier.isIdentifierStart(reader.currentChar())) {
+                case '$':
+                case '_':
+                case '\\':
                     readIdentifier();
-                }
-                break;
+                    break;
+
+                case '/':
+                    readSlash();
+                    break;
+
+                case '{':
+                    tokenAfterCurrentChar(Token.LEFT_CURLY_BRACKET);
+                    break;
+                case '}':
+                    tokenAfterCurrentChar(Token.RIGHT_CURLY_BRACKET);
+                    break;
+                case '(':
+                    tokenAfterCurrentChar(Token.LEFT_PARENTHESIS);
+                    break;
+                case ')':
+                    tokenAfterCurrentChar(Token.RIGHT_PARENTHESIS);
+                    break;
+                case '[':
+                    tokenAfterCurrentChar(Token.LEFT_SQUARE_BRACKET);
+                    break;
+                case ']':
+                    tokenAfterCurrentChar(Token.RIGHT_SQUARE_BRACKET);
+                    break;
+                case '.':
+                    readDot();
+                    break;
+                case ';':
+                    tokenAfterCurrentChar(Token.SEMICOLON);
+                    break;
+                case ',':
+                    tokenAfterCurrentChar(Token.COMMA);
+                    break;
+                case '<':
+                    readLess();
+                    break;
+                case '>':
+                    readGreater();
+                    break;
+                case '=':
+                    readEqual();
+                    break;
+                case '!':
+                    readExclamationSign();
+                    break;
+                case '+':
+                    readOperationOrAssign(Token.PLUS_ASSIGN, Token.PLUS);
+                    break;
+                case '-':
+                    readOperationOrAssign(Token.MINUS_ASSIGN, Token.MINUS);
+                    break;
+                case '*':
+                    readOperationOrAssign(Token.MULTIPLY_ASSIGN, Token.MULTIPLY);
+                    break;
+                case '%':
+                    readOperationOrAssign(Token.REMAINDER_ASSIGN, Token.REMAINDER);
+                    break;
+                case '&':
+                    readLogical(Token.AND_ASSIGN, Token.AND, Token.AND_AND);
+                    break;
+                case '|':
+                    readLogical(Token.OR_ASSIGN, Token.OR, Token.OR_OR);
+                    break;
+                case '^':
+                    readOperationOrAssign(Token.XOR_ASSIGN, Token.XOR);
+                    break;
+                case '~':
+                    tokenAfterCurrentChar(Token.INVERT);
+                    break;
+                case '?':
+                    tokenAfterCurrentChar(Token.QUESTION);
+                    break;
+                case ':':
+                    tokenAfterCurrentChar(Token.COLON);
+                    break;
+
+                case '0':
+                    readNumericLiteral();
+                    break;
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                case '9':
+                    readDecimalLiteral(new StringBuilder());
+                    break;
+
+                default:
+                    if (CharacterClassifier.isIdentifierStart(reader.currentChar())) {
+                        readIdentifier();
+                    } else {
+                        reportError("Unexpected character");
+                        reader.next(); // consume it, otherwise this loop would never advance
+                    }
+                    break;
+            }
         }
 
         tokenEndOffset = reader.offset();
@@ -101,7 +229,11 @@ public class Lexer {
         if (expectRegex) {
             readRegexLiteral();
         } else {
-            token = Token.SLASH;
+            if (reader.currentChar() == '=') {
+                tokenAfterCurrentChar(Token.DIVIDE_ASSIGN);
+            } else {
+                token = Token.DIVIDE;
+            }
         }
     }
 
@@ -159,16 +291,44 @@ public class Lexer {
             int column = reader.column();
             int offset = reader.offset();
             int codePoint = tryReadUnicodeEscapeSequence();
-            if (codePoint >= 0) {
-                if (!Character.isUnicodeIdentifierStart(codePoint) && codePoint != '$' && codePoint != '_') {
-                    errorReporter.reportError(line, column, offset, "Invalid ");
+            if (codePoint < 0) {
+                return;
+            }
+            if (!CharacterClassifier.isIdentifierStart(codePoint)) {
+                errorReporter.reportError(line, column, offset, "Invalid identifier start character "
+                        + "represented by escape sequence");
+                return;
+            }
+            sb.appendCodePoint(codePoint);
+            hasEscapeSequence = true;
+        } else {
+            sb.appendCodePoint(reader.currentChar());
+            reader.next();
+        }
+
+        while (true) {
+            if (reader.currentChar() == '\\') {
+                reader.next();
+                int line = reader.line();
+                int column = reader.column();
+                int offset = reader.offset();
+                int codePoint = tryReadUnicodeEscapeSequence();
+                if (codePoint < 0) {
+                    break;
+                }
+                if (!CharacterClassifier.isIdentifierPart(codePoint)) {
+                    errorReporter.reportError(line, column, offset, "Invalid identifier character "
+                            + "represented by escape sequence");
                 } else {
                     sb.appendCodePoint(codePoint);
+                    hasEscapeSequence = true;
                 }
+            } else if (CharacterClassifier.isIdentifierPart(reader.currentChar())) {
+                sb.appendCodePoint(reader.currentChar());
+                reader.next();
+            } else {
+                break;
             }
-        } else {
-            sb.append(reader.currentChar());
-            reader.next();
         }
 
         tokenValue = sb.toString();
@@ -176,20 +336,292 @@ public class Lexer {
     }
 
     private int tryReadUnicodeEscapeSequence() {
+        if (reader.currentChar() != 'u') {
+            reportError("Invalid unicode escape sequence in identifier");
+            return -1;
+        }
+        reader.next();
+        return readUnicodeEscapeSequence();
+    }
+
+    private void readDot() {
+        reader.next();
+        if (reader.currentChar() == '.') {
+            reader.next();
+            if (reader.currentChar() != '.') {
+                token = Token.DOT;
+                reportError("One more dot expected for ellipsis (...)");
+            } else {
+                tokenAfterCurrentChar(Token.ELLIPSIS); // the third dot is part of the token
+            }
+        } else if (reader.currentChar() >= '0' && reader.currentChar() <= '9') {
+            token = Token.NUMERIC_LITERAL;
+            readFractionalAfterDot(new StringBuilder("."));
+            checkCharAfterNumber();
+        } else {
+            token = Token.DOT;
+        }
+    }
+
+    private void readLess() {
+        reader.next();
+        switch (reader.currentChar()) {
+            case '=':
+                tokenAfterCurrentChar(Token.LESS_OR_EQUAL);
+                break;
+            case '<':
+                readOperationOrAssign(Token.SHIFT_LEFT_ASSIGN, Token.SHIFT_LEFT);
+                break;
+            default:
+                token = Token.LESS;
+                break;
+        }
+    }
+
+    private void readGreater() {
+        reader.next();
+        switch (reader.currentChar()) {
+            case '=':
+                tokenAfterCurrentChar(Token.GREATER_OR_EQUAL);
+                break;
+            case '>':
+                reader.next();
+                switch (reader.currentChar()) {
+                    case '>':
+                        // the helper consumes the third '>' before looking for '='
+                        readOperationOrAssign(Token.SHIFT_RIGHT_UNSIGNED_ASSIGN, Token.SHIFT_RIGHT_UNSIGNED);
+                        break;
+                    case '=':
+                        tokenAfterCurrentChar(Token.SHIFT_RIGHT_ASSIGN);
+                        break;
+                    default:
+                        token = Token.SHIFT_RIGHT;
+                        break;
+                }
+                break;
+            default:
+                token = Token.GREATER;
+                break;
+        }
+    }
+
+    private void readEqual() {
+        reader.next();
+        switch (reader.currentChar()) {
+            case '=':
+                reader.next();
+                if (reader.currentChar() == '=') {
+                    tokenAfterCurrentChar(Token.STRICT_EQUAL);
+                } else {
+                    token = Token.EQUAL;
+                }
+                break;
+            case '>':
+                tokenAfterCurrentChar(Token.ARROW); // '>' is part of the arrow token
+                break;
+            default:
+                token = Token.ASSIGN;
+                break;
+        }
+    }
+
+    private void readExclamationSign() {
+        reader.next();
+        if (reader.currentChar() == '=') {
+            reader.next();
+            if (reader.currentChar() == '=') {
+                tokenAfterCurrentChar(Token.STRICT_NOT_EQUAL);
+            } else {
+                token = Token.NOT_EQUAL;
+            }
+        } else {
+            token = Token.NOT;
+        }
+    }
+
+    private void readOperationOrAssign(Token assign, Token regular) {
+        reader.next();
+        if (reader.currentChar() == '=') {
+            tokenAfterCurrentChar(assign);
+        } else {
+            token = regular;
+        }
+    }
+
+    private void readLogical(Token assign, Token bitwise, Token logical) {
+        var c = reader.currentChar();
+        reader.next();
+        if (reader.currentChar() == '=') {
+            tokenAfterCurrentChar(assign);
+        } else if (reader.currentChar() == c) {
+            tokenAfterCurrentChar(logical);
+        } else {
+            token = bitwise;
+        }
+    }
+
+    private void readNumericLiteral() {
+        token = Token.NUMERIC_LITERAL; // the '.', exponent and radix branches do not set it themselves
+        var sb = new StringBuilder();
+        sb.appendCodePoint(reader.currentChar());
+        reader.next();
+        switch (reader.currentChar()) {
+            case '.':
+                readFractional(sb);
+                break;
+            case 'x':
+            case 'X':
+                readHexLiteral();
+                break;
+            case 'b':
+            case 'B':
+                readBinaryLiteral();
+                break;
+            case 'o':
+            case 'O':
+                readOctalLiteral();
+                break;
+            case 'E':
+            case 'e':
+                readExponent(sb);
+                break;
+            default:
+                readDecimalLiteral(sb);
+                break;
+        }
+    }
+
+    private void readDecimalLiteral(StringBuilder sb) {
+        token = Token.NUMERIC_LITERAL;
+        // the loop also takes the first digit; after a leading '0' there may be no digit at all
+        while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
+            sb.appendCodePoint(reader.currentChar());
+            reader.next();
+        }
+        if (reader.currentChar() == '.') {
+            readFractional(sb);
+        } else if (reader.currentChar() == 'e' || reader.currentChar() == 'E') {
+            readExponent(sb);
+        }
+        checkCharAfterNumber();
+    }
+
+    private void readFractional(StringBuilder sb) {
+        sb.append('.');
+        reader.next();
+        readFractionalAfterDot(sb);
+    }
+
+    private void readFractionalAfterDot(StringBuilder sb) {
+        while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
+            sb.appendCodePoint(reader.currentChar());
+            reader.next();
+        }
+        if (reader.currentChar() == 'e' || reader.currentChar() == 'E') {
+            readExponent(sb);
+        }
+    }
+
+    private void readExponent(StringBuilder sb) {
+        sb.appendCodePoint(reader.currentChar());
+        reader.next();
+        if (reader.currentChar() == '+' || reader.currentChar() == '-') {
+            sb.appendCodePoint(reader.currentChar());
+            reader.next();
+        }
+        while (CharacterClassifier.isDecimalDigit(reader.currentChar())) {
+            sb.appendCodePoint(reader.currentChar());
+            reader.next();
+        }
+    }
+
+    private void readHexLiteral() {
+
+    }
+
+    private void readBinaryLiteral() {
+
+    }
+
+    private void readOctalLiteral() {
+
+    }
+
+    private void checkCharAfterNumber() {
 
     }
 
     private int readUnicodeEscapeSequence() {
+        var codePoint = 0;
+        if (reader.currentChar() == '{') {
+            reader.next();
+            var hasError = false;
+            var line = reader.line();
+            var column = reader.column();
+            int offset = reader.offset();
+            while (reader.currentChar() != '}') {
+                var digit = readHexDigit();
+                if (digit < 0) {
+                    reportError("Invalid hex digit in unicode escape sequence");
+                    hasError = true;
+                    break;
+                }
+                codePoint <<= 4;
+                codePoint += digit;
+                if (codePoint >= CharacterClassifier.CODEPOINT_COUNT) {
+                    codePoint = CharacterClassifier.CODEPOINT_COUNT - 1;
+                    errorReporter.reportError(line, column, offset, "Too big codepoint value in escape sequence");
+                }
+            }
+            if (hasError) {
+                while (reader.currentChar() != '}' && reader.currentChar() != CharacterClassifier.EOF
+                        && !CharacterClassifier.isLineTerminator(reader.currentChar())) {
+                    reader.next();
+                }
+                codePoint = -1; // callers treat a negative result as "no code point"
+            }
+            // the closing brace belongs to the escape sequence, not to the next token
+            if (reader.currentChar() == '}') {
+                reader.next();
+            }
+        } else {
+            for (var i = 0; i < 4; ++i) {
+                var digit = readHexDigit();
+                if (digit < 0) {
+                    reportError("Invalid hex digit in unicode escape sequence");
+                    return -1;
+                }
+                codePoint <<= 4;
+                codePoint |= digit;
+            }
+        }
+        return codePoint;
+    }
 
+    private int readHexDigit() {
+        var digit = CharacterClassifier.hexDigit(reader.currentChar());
+        if (digit >= 0) {
+            reader.next();
+        }
+        return digit;
     }
 
     private void readRegexLiteral() {
 
     }
 
+    private void tokenAfterCurrentChar(Token token) {
+        reader.next();
+        this.token = token;
+    }
+
     private void skipSpaces() {
         while (CharacterClassifier.isWhiteSpace(reader.currentChar())) {
             reader.next();
         }
     }
+
+    private void reportError(String error) {
+        errorReporter.reportError(reader.line(), reader.column(), reader.offset(), error);
+    }
 }
diff --git a/js-parser/src/main/java/org/teavm/js/lexer/Token.java b/js-parser/src/main/java/org/teavm/js/lexer/Token.java
index f6da16214..7c657e9e2 100644
--- a/js-parser/src/main/java/org/teavm/js/lexer/Token.java
+++ b/js-parser/src/main/java/org/teavm/js/lexer/Token.java
@@ -18,8 +18,58 @@ package org.teavm.js.lexer;
 public enum Token {
     EOF,
     LINE_TERMINATOR,
-    SLASH,
     COMMENT,
     IDENTIFER,
+
+    LEFT_CURLY_BRACKET,
+    RIGHT_CURLY_BRACKET,
+    LEFT_PARENTHESIS,
+    RIGHT_PARENTHESIS,
+    LEFT_SQUARE_BRACKET,
+    RIGHT_SQUARE_BRACKET,
+    DOT,
+    ELLIPSIS,
+    SEMICOLON,
+    COMMA,
+    LESS,
+    GREATER,
+    LESS_OR_EQUAL,
+    GREATER_OR_EQUAL,
+    EQUAL,
+    NOT_EQUAL,
+    STRICT_EQUAL,
+    STRICT_NOT_EQUAL,
+    SHIFT_LEFT,
+    SHIFT_RIGHT,
+    SHIFT_RIGHT_UNSIGNED,
+    PLUS,
+    MINUS,
+    MULTIPLY,
+    DIVIDE,
+    REMAINDER,
+    AND,
+    OR,
+    XOR,
+    NOT,
+    INVERT,
+    AND_AND,
+    OR_OR,
+    QUESTION,
+    COLON,
+    ASSIGN,
+    PLUS_ASSIGN,
+    MINUS_ASSIGN,
+    MULTIPLY_ASSIGN,
+    DIVIDE_ASSIGN,
+    REMAINDER_ASSIGN,
+    AND_ASSIGN,
+    OR_ASSIGN,
+    XOR_ASSIGN,
+    SHIFT_LEFT_ASSIGN,
+    SHIFT_RIGHT_ASSIGN,
+    SHIFT_RIGHT_UNSIGNED_ASSIGN,
+    ARROW,
+
+    NUMERIC_LITERAL,
     REGEX_LITERAL
 }