rum-goggles/v1/vendor/github.com/robertkrimen/otto/parser/lexer.go
2024-04-04 10:46:14 -04:00

855 lines
18 KiB
Go

package parser
import (
"bytes"
"errors"
"fmt"
"regexp"
"strconv"
"strings"
"unicode"
"unicode/utf8"
"github.com/robertkrimen/otto/ast"
"github.com/robertkrimen/otto/file"
"github.com/robertkrimen/otto/token"
)
// chr pairs a decoded rune with the number of bytes it occupies in the
// source string; it is the value produced by (*parser).chrAt.
type chr struct { //nolint: unused
// value is the decoded code point.
value rune
// width is the encoded size of value in bytes.
width int
}
// matchIdentifier reports whether a whole string is a plain identifier:
// a leading '$', '_' or Unicode letter followed by any number of '$', '_',
// Unicode letters or ASCII digits.
//
// The previous pattern carried a stray '}' inside the second character class,
// which made strings such as "a}" match.
var matchIdentifier = regexp.MustCompile(`^[$_\p{L}][$_\p{L}\d]*$`)
// isDecimalDigit reports whether chr is one of the ASCII digits '0' through '9'.
func isDecimalDigit(chr rune) bool {
	if chr < '0' {
		return false
	}
	return chr <= '9'
}
// digitValue returns the numeric value of chr interpreted as a hexadecimal
// digit (0-15, either letter case). Any other rune yields 16, which is larger
// than every legal digit value and therefore fails a "< base" check for every
// base up to 16.
func digitValue(chr rune) int {
	if chr >= '0' && chr <= '9' {
		return int(chr - '0')
	}
	if chr >= 'a' && chr <= 'f' {
		return 10 + int(chr-'a')
	}
	if chr >= 'A' && chr <= 'F' {
		return 10 + int(chr-'A')
	}
	return 16
}
// isDigit reports whether chr is a valid digit in the given base, using
// digitValue to map the rune to its numeric value.
func isDigit(chr rune, base int) bool {
	value := digitValue(chr)
	return value < base
}
// isIdentifierStart reports whether chr may begin an identifier: '$', '_',
// a backslash (the start of a \uXXXX escape), an ASCII letter, or a
// non-ASCII Unicode letter.
func isIdentifierStart(chr rune) bool {
	switch {
	case chr == '$', chr == '_', chr == '\\':
		return true
	case chr >= 'a' && chr <= 'z', chr >= 'A' && chr <= 'Z':
		return true
	case chr < utf8.RuneSelf:
		// Every remaining ASCII rune (and the negative EOF marker) is rejected.
		return false
	}
	return unicode.IsLetter(chr)
}
// isIdentifierPart reports whether chr may appear after the first character
// of an identifier: '$', '_', a backslash (the start of a \uXXXX escape), an
// ASCII letter or digit, or a non-ASCII Unicode letter or digit.
func isIdentifierPart(chr rune) bool {
	switch {
	case chr == '$', chr == '_', chr == '\\':
		return true
	case chr >= 'a' && chr <= 'z', chr >= 'A' && chr <= 'Z', chr >= '0' && chr <= '9':
		return true
	case chr < utf8.RuneSelf:
		// Every remaining ASCII rune (and the negative EOF marker) is rejected.
		return false
	}
	return unicode.IsLetter(chr) || unicode.IsDigit(chr)
}
// scanIdentifier consumes an identifier starting at the current character and
// returns its text. \uXXXX escapes are validated while scanning: the escaped
// value must not be a backslash, must be a valid identifier start when the
// escape is the first thing in the identifier, and a valid identifier part
// otherwise. When at least one escape was seen the raw text is decoded with
// parseStringLiteral before being returned.
func (p *parser) scanIdentifier() (string, error) {
	start := p.chrOffset
	sawEscape := false

	for isIdentifierPart(p.chr) {
		if p.chr != '\\' {
			p.read()
			continue
		}

		// Remember whether the escape opens the identifier before moving on.
		leading := p.chrOffset == start

		p.read()
		if p.chr != 'u' {
			return "", fmt.Errorf("invalid identifier escape character: %c (%s)", p.chr, string(p.chr))
		}
		sawEscape = true

		// Exactly four hexadecimal digits follow the 'u'.
		var escaped rune
		for remaining := 4; remaining > 0; remaining-- {
			p.read()
			nibble, ok := hex2decimal(byte(p.chr))
			if !ok {
				return "", fmt.Errorf("invalid identifier escape character: %c (%s)", p.chr, string(p.chr))
			}
			escaped = escaped<<4 | nibble
		}

		acceptable := escaped != '\\'
		if acceptable {
			if leading {
				acceptable = isIdentifierStart(escaped)
			} else {
				acceptable = isIdentifierPart(escaped)
			}
		}
		if !acceptable {
			return "", fmt.Errorf("invalid identifier escape value: %c (%s)", escaped, string(escaped))
		}

		// Step past the final hex digit.
		p.read()
	}

	raw := p.str[start:p.chrOffset]
	if !sawEscape {
		return raw, nil
	}
	return parseStringLiteral(raw)
}
// isLineWhiteSpace reports whether chr is white space that does not end a
// line (ECMAScript section 7.2). The line terminators LF, CR, U+2028 and
// U+2029, as well as U+0085, are explicitly excluded; any other rune is
// classified by unicode.IsSpace.
func isLineWhiteSpace(chr rune) bool { //nolint: unused, deadcode
	switch chr {
	case '\t', '\v', '\f', ' ', '\u00a0', '\ufeff':
		return true
	case '\n', '\r', '\u2028', '\u2029', '\u0085':
		return false
	default:
		return unicode.IsSpace(chr)
	}
}
// isLineTerminator reports whether chr is one of the four line terminators
// (ECMAScript section 7.3): LF, CR, U+2028 or U+2029.
func isLineTerminator(chr rune) bool {
	return chr == '\n' || chr == '\r' || chr == '\u2028' || chr == '\u2029'
}
// scan returns the next token, its literal text (when the token carries one)
// and the position at which it starts.
//
// Two parser flags are maintained here:
//   - p.implicitSemicolon is cleared on entry and set when a line break (or
//     EOF while p.insertSemicolon is set) is encountered before a token is
//     returned.
//   - p.insertSemicolon is updated on every return to record whether the
//     token just produced is one after which a line break should be reported
//     as an implicit semicolon; skipWhiteSpace consults it to decide whether
//     to stop at a line terminator.
//
// Comments never produce a token: they are skipped, or stored through
// p.comments when the StoreComments mode bit is set, and scanning continues.
func (p *parser) scan() (tkn token.Token, literal string, idx file.Idx) { //nolint: nonamedreturns
p.implicitSemicolon = false
for {
p.skipWhiteSpace()
// Position of the first character of the token about to be scanned.
idx = p.idxOf(p.chrOffset)
insertSemicolon := false
switch chr := p.chr; {
case isIdentifierStart(chr):
var err error
literal, err = p.scanIdentifier()
if err != nil {
tkn = token.ILLEGAL
// Leaves the outer switch; the ILLEGAL token is returned at the bottom of the loop.
break
}
if len(literal) > 1 {
// Keywords are longer than 1 character, avoid lookup otherwise
var strict bool
tkn, strict = token.IsKeyword(literal)
switch tkn {
case 0: // Not a keyword
switch literal {
case "true", "false":
p.insertSemicolon = true
return token.BOOLEAN, literal, idx
case "null":
p.insertSemicolon = true
return token.NULL, literal, idx
}
case token.KEYWORD:
if strict {
// TODO If strict and in strict mode, then this is not a break
// Leaves the inner switch only, so the literal is returned as an IDENTIFIER below.
break
}
return token.KEYWORD, literal, idx
case
token.THIS,
token.BREAK,
token.THROW, // A newline after a throw is not allowed, but we need to detect it
token.RETURN,
token.CONTINUE,
token.DEBUGGER:
p.insertSemicolon = true
return tkn, literal, idx
default:
return tkn, literal, idx
}
}
p.insertSemicolon = true
return token.IDENTIFIER, literal, idx
case '0' <= chr && chr <= '9':
p.insertSemicolon = true
tkn, literal = p.scanNumericLiteral(false)
return tkn, literal, idx
default:
// Consume chr; from here on p.chr is the character that follows it.
p.read()
switch chr {
case -1:
if p.insertSemicolon {
p.insertSemicolon = false
p.implicitSemicolon = true
}
tkn = token.EOF
case '\r', '\n', '\u2028', '\u2029':
p.insertSemicolon = false
p.implicitSemicolon = true
p.comments.AtLineBreak()
continue
case ':':
tkn = token.COLON
case '.':
// A '.' directly followed by a decimal digit starts a number such as ".5".
if digitValue(p.chr) < 10 {
insertSemicolon = true
tkn, literal = p.scanNumericLiteral(true)
} else {
tkn = token.PERIOD
}
case ',':
tkn = token.COMMA
case ';':
tkn = token.SEMICOLON
case '(':
tkn = token.LEFT_PARENTHESIS
case ')':
tkn = token.RIGHT_PARENTHESIS
insertSemicolon = true
case '[':
tkn = token.LEFT_BRACKET
case ']':
tkn = token.RIGHT_BRACKET
insertSemicolon = true
case '{':
tkn = token.LEFT_BRACE
case '}':
tkn = token.RIGHT_BRACE
insertSemicolon = true
case '+':
tkn = p.switch3(token.PLUS, token.ADD_ASSIGN, '+', token.INCREMENT)
if tkn == token.INCREMENT {
insertSemicolon = true
}
case '-':
tkn = p.switch3(token.MINUS, token.SUBTRACT_ASSIGN, '-', token.DECREMENT)
if tkn == token.DECREMENT {
insertSemicolon = true
}
case '*':
tkn = p.switch2(token.MULTIPLY, token.MULTIPLY_ASSIGN)
case '/':
switch p.chr {
case '/':
// Single-line comment: store it when requested, otherwise skip it, then rescan.
if p.mode&StoreComments != 0 {
literal := string(p.readSingleLineComment())
p.comments.AddComment(ast.NewComment(literal, idx))
continue
}
p.skipSingleLineComment()
continue
case '*':
// Multi-line comment: store it when requested, otherwise skip it, then rescan.
if p.mode&StoreComments != 0 {
literal = string(p.readMultiLineComment())
p.comments.AddComment(ast.NewComment(literal, idx))
continue
}
p.skipMultiLineComment()
continue
default:
// Could be division, could be RegExp literal
tkn = p.switch2(token.SLASH, token.QUOTIENT_ASSIGN)
insertSemicolon = true
}
case '%':
tkn = p.switch2(token.REMAINDER, token.REMAINDER_ASSIGN)
case '^':
tkn = p.switch2(token.EXCLUSIVE_OR, token.EXCLUSIVE_OR_ASSIGN)
case '<':
tkn = p.switch4(token.LESS, token.LESS_OR_EQUAL, '<', token.SHIFT_LEFT, token.SHIFT_LEFT_ASSIGN)
case '>':
tkn = p.switch6(token.GREATER, token.GREATER_OR_EQUAL, '>', token.SHIFT_RIGHT, token.SHIFT_RIGHT_ASSIGN, '>', token.UNSIGNED_SHIFT_RIGHT, token.UNSIGNED_SHIFT_RIGHT_ASSIGN)
case '=':
// "=" / "==" via switch2, then one more '=' upgrades "==" to "===".
tkn = p.switch2(token.ASSIGN, token.EQUAL)
if tkn == token.EQUAL && p.chr == '=' {
p.read()
tkn = token.STRICT_EQUAL
}
case '!':
// "!" / "!=" via switch2, then one more '=' upgrades "!=" to "!==".
tkn = p.switch2(token.NOT, token.NOT_EQUAL)
if tkn == token.NOT_EQUAL && p.chr == '=' {
p.read()
tkn = token.STRICT_NOT_EQUAL
}
case '&':
if p.chr == '^' {
p.read()
tkn = p.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
} else {
tkn = p.switch3(token.AND, token.AND_ASSIGN, '&', token.LOGICAL_AND)
}
case '|':
tkn = p.switch3(token.OR, token.OR_ASSIGN, '|', token.LOGICAL_OR)
case '~':
tkn = token.BITWISE_NOT
case '?':
tkn = token.QUESTION_MARK
case '"', '\'':
insertSemicolon = true
tkn = token.STRING
var err error
// The opening quote was consumed by the read above, hence chrOffset - 1.
literal, err = p.scanString(p.chrOffset - 1)
if err != nil {
tkn = token.ILLEGAL
}
default:
p.errorUnexpected(idx, chr)
tkn = token.ILLEGAL
}
}
// Reached by every branch that did not return or continue above.
p.insertSemicolon = insertSemicolon
return tkn, literal, idx
}
}
// switch2 chooses between a one-character operator and its "<op>=" form.
// When the current character is '=' it is consumed and tkn1 is returned;
// otherwise nothing is consumed and tkn0 is returned.
func (p *parser) switch2(tkn0, tkn1 token.Token) token.Token {
	if p.chr != '=' {
		return tkn0
	}
	p.read()
	return tkn1
}
// switch3 chooses between an operator (tkn0), its "<op>=" form (tkn1) and a
// two-character form whose second character is chr2 (tkn2). The matching
// second character, if any, is consumed; '=' is checked before chr2.
func (p *parser) switch3(tkn0, tkn1 token.Token, chr2 rune, tkn2 token.Token) token.Token {
	switch p.chr {
	case '=':
		p.read()
		return tkn1
	case chr2:
		p.read()
		return tkn2
	}
	return tkn0
}
// switch4 resolves operators of the shape op, op=, op chr2 and op chr2 =,
// returning tkn0, tkn1, tkn2 and tkn3 respectively. Every character that
// contributes to the returned token is consumed.
func (p *parser) switch4(tkn0, tkn1 token.Token, chr2 rune, tkn2, tkn3 token.Token) token.Token {
	if p.chr == '=' {
		p.read()
		return tkn1
	}
	if p.chr != chr2 {
		return tkn0
	}

	// Saw chr2; an optional trailing '=' selects the assignment form.
	p.read()
	if p.chr != '=' {
		return tkn2
	}
	p.read()
	return tkn3
}
// switch6 resolves operators of the shape op, op=, op chr2, op chr2 =,
// op chr2 chr3 and op chr2 chr3 =, returning tkn0 through tkn5 respectively.
// Every character that contributes to the returned token is consumed.
func (p *parser) switch6(tkn0, tkn1 token.Token, chr2 rune, tkn2, tkn3 token.Token, chr3 rune, tkn4, tkn5 token.Token) token.Token {
	if p.chr == '=' {
		p.read()
		return tkn1
	}
	if p.chr != chr2 {
		return tkn0
	}

	// Saw chr2.
	p.read()
	if p.chr == '=' {
		p.read()
		return tkn3
	}
	if p.chr != chr3 {
		return tkn2
	}

	// Saw chr2 followed by chr3; an optional trailing '=' selects the assignment form.
	p.read()
	if p.chr != '=' {
		return tkn4
	}
	p.read()
	return tkn5
}
// chrAt decodes the UTF-8 sequence that begins at byte offset index of the
// source and returns the rune together with its encoded width.
func (p *parser) chrAt(index int) chr { //nolint: unused
	var decoded chr
	decoded.value, decoded.width = utf8.DecodeRuneInString(p.str[index:])
	return decoded
}
// peek returns the byte at p.offset+1 as a rune without advancing the parser,
// or -1 when that position lies beyond the end of the source. The byte is not
// UTF-8 decoded.
func (p *parser) peek() rune {
	next := p.offset + 1
	if next >= p.length {
		return -1
	}
	return rune(p.str[next])
}
// read advances the parser by one character: p.chrOffset becomes the byte
// offset of the new current character, p.chr its decoded rune and p.offset the
// offset just past it. At the end of input p.chr is set to -1 and p.chrOffset
// to p.length. An undecodable byte is reported through p.error and scanning
// continues with utf8.RuneError as the current character.
func (p *parser) read() {
	if p.offset >= p.length {
		p.chrOffset = p.length
		p.chr = -1 // EOF
		return
	}

	p.chrOffset = p.offset
	next, size := rune(p.str[p.offset]), 1
	if next >= utf8.RuneSelf { // !ASCII
		next, size = utf8.DecodeRuneInString(p.str[p.offset:])
		if size == 1 && next == utf8.RuneError {
			p.error(p.chrOffset, "Invalid UTF-8 character")
		}
	}
	p.offset += size
	p.chr = next
}
// read is the regExpParser counterpart of (*parser).read; it lives next to it
// because the two bodies are nearly identical. It advances by one character,
// updating p.chrOffset, p.chr and p.offset, sets p.chr to -1 at the end of
// input, and reports an undecodable byte through p.error.
func (p *regExpParser) read() {
	if p.offset >= p.length {
		p.chrOffset = p.length
		p.chr = -1 // EOF
		return
	}

	p.chrOffset = p.offset
	next, size := rune(p.str[p.offset]), 1
	if next >= utf8.RuneSelf { // !ASCII
		next, size = utf8.DecodeRuneInString(p.str[p.offset:])
		if size == 1 && next == utf8.RuneError {
			p.error(p.chrOffset, "Invalid UTF-8 character")
		}
	}
	p.offset += size
	p.chr = next
}
// readSingleLineComment collects the text of a "//" comment and returns it
// as runes. It is entered with the second '/' as the current character, so
// the first read steps past it; collection stops at a line terminator (left
// as the current character, not consumed) or at the end of input.
//
// The EOF marker is never appended to the result, so there is no trailing
// element to trim and the function cannot panic by slicing an empty result
// when it is invoked with the parser already at the end of input.
func (p *parser) readSingleLineComment() []rune {
	var result []rune
	for p.chr != -1 {
		p.read()
		if p.chr == -1 || isLineTerminator(p.chr) {
			break
		}
		result = append(result, p.chr)
	}
	return result
}
// readMultiLineComment collects the text between "/*" and "*/" and returns it
// as runes, leaving the parser just past the closing "*/". It is entered with
// the '*' of the opening delimiter as the current character. If the input
// ends before the comment is closed, the problem is reported through
// p.errorUnexpected and whatever was collected is returned.
func (p *parser) readMultiLineComment() []rune {
	var text []rune
	p.read() // step past the '*' of "/*"
	for p.chr >= 0 {
		prev := p.chr
		p.read()
		if prev != '*' || p.chr != '/' {
			text = append(text, prev)
			continue
		}
		p.read() // step past the '/' of "*/"
		return text
	}
	p.errorUnexpected(0, p.chr)
	return text
}
// skipSingleLineComment advances past a "//" comment, stopping with the line
// terminator (or the end of input) as the current character.
func (p *parser) skipSingleLineComment() {
	for p.chr != -1 {
		if p.read(); isLineTerminator(p.chr) {
			return
		}
	}
}
// skipMultiLineComment advances past a "/* ... */" comment. It is entered
// with the '*' of the opening delimiter as the current character and leaves
// the parser just past the closing "*/". Reaching the end of input first is
// reported through p.errorUnexpected.
func (p *parser) skipMultiLineComment() {
	p.read() // step past the '*' of "/*"
	for p.chr >= 0 {
		prev := p.chr
		p.read()
		if prev != '*' || p.chr != '/' {
			continue
		}
		p.read() // step past the '/' of "*/"
		return
	}
	p.errorUnexpected(0, p.chr)
}
// skipWhiteSpace advances past white space. Line terminators are skipped as
// well (notifying p.comments of each line break) unless p.insertSemicolon is
// set, in which case the function stops with the terminator as the current
// character so the caller can see it. For a CR LF pair the CR is consumed
// before that check, so the function may stop on the LF.
func (p *parser) skipWhiteSpace() {
	for {
		switch p.chr {
		case ' ', '\t', '\f', '\v', '\u00a0', '\ufeff':
			p.read()
			continue
		case '\r', '\n', '\u2028', '\u2029':
			if p.chr == '\r' && p.peek() == '\n' {
				p.comments.AtLineBreak()
				p.read()
			}
			if p.insertSemicolon {
				return
			}
			p.comments.AtLineBreak()
			p.read()
			continue
		}

		// Anything else only counts as white space when it is a non-ASCII
		// Unicode space.
		if p.chr < utf8.RuneSelf || !unicode.IsSpace(p.chr) {
			return
		}
		p.read()
	}
}
// scanMantissa consumes consecutive characters that are valid digits in the
// given base, stopping at the first character that is not.
func (p *parser) scanMantissa(base int) {
	for {
		if digitValue(p.chr) >= base {
			return
		}
		p.read()
	}
}
// scanEscape consumes the remainder of an escape sequence inside a string
// literal. It is entered with the character following the backslash as the
// current character; quote is the delimiter of the enclosing literal.
//
// Only the extent of the escape is determined here; nothing is decoded. The
// previous version accumulated the numeric value of \x and \u escapes into a
// local that was never read, together with a base variable that was always
// 16; both are removed.
//
// Octal escapes are not scanned as a unit: a leading '0' is consumed like any
// single-character escape.
func (p *parser) scanEscape(quote rune) {
	// Maximum number of hexadecimal digits that may follow \x or \u.
	var length int
	switch p.chr {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"', '\'', '0':
		p.read()
		return
	case '\r', '\n', '\u2028', '\u2029':
		// Backslash followed by a line terminator: consume the terminator.
		p.scanNewline()
		return
	case 'x':
		p.read()
		length = 2
	case 'u':
		p.read()
		length = 4
	default:
		p.read() // Always make progress
		return
	}

	// Consume up to length hex digits, stopping early at the closing quote,
	// at the end of input, or at the first non-hex character.
	for ; length > 0 && p.chr != quote && p.chr >= 0; length-- {
		if digitValue(p.chr) >= 16 {
			break
		}
		p.read()
	}
}
// scanString scans a delimited literal whose opening delimiter sits at byte
// offset `offset` of the source and has already been consumed. The delimiter
// is read from the source itself and may be '"', '\'' or '/' (a regular
// expression). On success the returned text runs from the opening delimiter
// through the closing one, inclusive.
//
// A line terminator or the end of input before the closing delimiter is an
// error. For regular expressions the error is additionally reported through
// p.error; for strings it is only returned.
//
// Inside a regular expression a '/' within a bracket character class ([...])
// does not terminate the literal. This is tracked with a dedicated flag. The
// earlier approach of temporarily setting the delimiter to -1 collided with
// the EOF marker: an unterminated "/[..." at the end of input was returned as
// a successful literal, and a line break inside a class produced the string
// error message without calling p.error.
func (p *parser) scanString(offset int) (string, error) {
	// " ' /
	quote := rune(p.str[offset])
	inClass := false // inside [...] of a regular expression
	for inClass || p.chr != quote {
		chr := p.chr
		if chr == '\n' || chr == '\r' || chr == '\u2028' || chr == '\u2029' || chr < 0 {
			goto newline
		}
		p.read()
		switch {
		case chr == '\\':
			if quote == '/' {
				// In a regular expression a backslash escapes exactly one
				// character, which must not be a line terminator or EOF.
				if p.chr == '\n' || p.chr == '\r' || p.chr == '\u2028' || p.chr == '\u2029' || p.chr < 0 {
					goto newline
				}
				p.read()
			} else {
				p.scanEscape(quote)
			}
		case chr == '[' && quote == '/':
			// Allow a slash (/) in a bracket character class ([...])
			inClass = true
		case chr == ']' && inClass:
			inClass = false
		}
	}

	// " ' /
	p.read()

	return p.str[offset:p.chrOffset], nil

newline:
	p.scanNewline()
	err := "String not terminated"
	if quote == '/' {
		err = "Invalid regular expression: missing /"
		p.error(p.idxOf(offset), err)
	}
	return "", errors.New(err)
}
// scanNewline consumes one line break. A carriage return followed by a line
// feed is consumed as a single unit; in every other case exactly one
// character is consumed.
func (p *parser) scanNewline() {
	if p.chr != '\r' {
		p.read()
		return
	}
	p.read()
	if p.chr == '\n' {
		p.read()
	}
}
// hex2decimal converts a single ASCII hexadecimal digit (either letter case)
// to its numeric value. The boolean result is false, and the value 0, when
// chr is not a hexadecimal digit.
func hex2decimal(chr byte) (rune, bool) {
	digit := rune(chr)
	if digit >= '0' && digit <= '9' {
		return digit - '0', true
	}
	if digit >= 'a' && digit <= 'f' {
		return digit - 'a' + 10, true
	}
	if digit >= 'A' && digit <= 'F' {
		return digit - 'A' + 10, true
	}
	return 0, false
}
// parseNumberLiteral converts the text of a numeric literal to a Go value.
//
// The literal is first parsed as a 64-bit integer with base detection
// (strconv.ParseInt with base 0); on success an int64 is returned. Otherwise
// it is parsed as a float64, and a range error from that parse is accepted,
// returning the value strconv produced (e.g. an infinity). As a last resort,
// a "0x"/"0X" literal that merely overflowed int64 is accumulated digit by
// digit into a float64. Anything else is an error.
func parseNumberLiteral(literal string) (value interface{}, err error) { //nolint: nonamedreturns
	// TODO Is Uint okay? What about -MAX_UINT
	asInt, intErr := strconv.ParseInt(literal, 0, 64)
	if intErr == nil {
		return asInt, nil
	}

	asFloat, floatErr := strconv.ParseFloat(literal, 64)
	if floatErr == nil || errors.Is(floatErr, strconv.ErrRange) {
		// A range error still carries a usable value: Infinity, etc.
		return asFloat, nil
	}

	// Only an integer overflow on a hexadecimal literal can still be rescued.
	hexPrefixed := len(literal) > 2 && literal[0] == '0' && (literal[1] == 'X' || literal[1] == 'x')
	if !errors.Is(intErr, strconv.ErrRange) || !hexPrefixed {
		return nil, errors.New("illegal numeric literal")
	}

	// Could just be a very large number (e.g. 0x8000000000000000)
	var total float64
	for _, hexChr := range literal[2:] {
		digit := digitValue(hexChr)
		if digit >= 16 {
			return nil, fmt.Errorf("illegal numeric literal: %v (>= 16)", digit)
		}
		total = total*16 + float64(digit)
	}
	return total, nil
}
// parseStringLiteral decodes the escape sequences in literal and returns the
// resulting string. The surrounding quote characters are not treated
// specially; the input is expected to be the body of a literal (or an
// identifier containing \uXXXX escapes).
//
// Supported escapes: \b \f \n \r \t \v, \xHH, \uHHHH, octal escapes of up to
// three digits, \\ \' \", and line continuations (a backslash followed by
// LF, CR or CR LF, which produce nothing). A backslash followed by any other
// character yields that character.
//
// An error is returned for a truncated or non-hexadecimal \x / \u escape and
// for a backslash that is the last byte of the literal. The latter used to
// panic, which is inappropriate for a function that already reports
// malformed input through its error result.
func parseStringLiteral(literal string) (string, error) {
	// Best case scenario...
	if literal == "" {
		return "", nil
	}

	// Slightly less-best case scenario...
	if !strings.ContainsRune(literal, '\\') {
		return literal, nil
	}

	str := literal
	buffer := bytes.NewBuffer(make([]byte, 0, 3*len(literal)/2))

	for len(str) > 0 {
		switch chr := str[0]; {
		// We do not explicitly handle the case of the quote
		// value, which can be: " ' /
		// This assumes we're already passed a partially well-formed literal
		case chr >= utf8.RuneSelf:
			chr, size := utf8.DecodeRuneInString(str)
			buffer.WriteRune(chr)
			str = str[size:]
			continue
		case chr != '\\':
			buffer.WriteByte(chr)
			str = str[1:]
			continue
		}

		// str[0] is a backslash from here on.
		if len(str) <= 1 {
			return "", fmt.Errorf("invalid escape: trailing backslash in %q", literal)
		}
		chr := str[1]
		var value rune
		if chr >= utf8.RuneSelf {
			str = str[1:]
			var size int
			value, size = utf8.DecodeRuneInString(str)
			str = str[size:] // \ + <character>
		} else {
			str = str[2:] // \<character>
			switch chr {
			case 'b':
				value = '\b'
			case 'f':
				value = '\f'
			case 'n':
				value = '\n'
			case 'r':
				value = '\r'
			case 't':
				value = '\t'
			case 'v':
				value = '\v'
			case 'x', 'u':
				size := 0
				switch chr {
				case 'x':
					size = 2
				case 'u':
					size = 4
				}
				if len(str) < size {
					return "", fmt.Errorf("invalid escape: \\%s: len(%q) != %d", string(chr), str, size)
				}
				for j := 0; j < size; j++ {
					decimal, ok := hex2decimal(str[j])
					if !ok {
						return "", fmt.Errorf("invalid escape: \\%s: %q", string(chr), str[:size])
					}
					value = value<<4 | decimal
				}
				// At most four hex digits were read, so value <= 0xFFFF and
				// no range check against utf8.MaxRune is needed.
				str = str[size:]
			case '0':
				// "\0" not followed by an octal digit is the NUL character;
				// otherwise it starts an octal escape.
				if len(str) == 0 || '0' > str[0] || str[0] > '7' {
					value = 0
					break
				}
				fallthrough
			case '1', '2', '3', '4', '5', '6', '7':
				// TODO strict
				value = rune(chr) - '0'
				j := 0
				for ; j < 2; j++ {
					if len(str) < j+1 {
						break
					}
					chr := str[j]
					if '0' > chr || chr > '7' {
						break
					}
					decimal := rune(str[j]) - '0'
					value = (value << 3) | decimal
				}
				str = str[j:]
			case '\\':
				value = '\\'
			case '\'', '"':
				value = rune(chr)
			case '\r':
				// Line continuation: swallow an LF that follows the CR.
				if len(str) > 0 {
					if str[0] == '\n' {
						str = str[1:]
					}
				}
				fallthrough
			case '\n':
				// Line continuation contributes nothing to the result.
				continue
			default:
				value = rune(chr)
			}
		}
		buffer.WriteRune(value)
	}

	return buffer.String(), nil
}
// scanNumericLiteral scans a numeric literal and returns token.NUMBER with
// the literal's source text, or token.ILLEGAL with the text scanned so far
// when the literal is malformed.
//
// decimalPoint is true when the caller has already consumed a leading '.'
// (a literal such as ".5"); the start offset is moved back by one so the
// returned text includes that dot.
//
// A leading '0' selects hexadecimal ("0x"/"0X"), a float ("0."), an exponent
// form ("0e"), or otherwise an octal literal. Any other start is scanned as a
// decimal integer with optional fraction and exponent.
func (p *parser) scanNumericLiteral(decimalPoint bool) (token.Token, string) {
offset := p.chrOffset
tkn := token.NUMBER
if decimalPoint {
// The '.' was consumed by the caller; include it in the literal.
offset--
p.scanMantissa(10)
goto exponent
}
if p.chr == '0' {
// Shadows the outer offset with the same value for use within this block.
offset := p.chrOffset
p.read()
switch p.chr {
case 'x', 'X':
// Hexadecimal
p.read()
// At least one hex digit is required after the prefix.
if isDigit(p.chr, 16) {
p.read()
} else {
return token.ILLEGAL, p.str[offset:p.chrOffset]
}
p.scanMantissa(16)
if p.chrOffset-offset <= 2 {
// Only "0x" or "0X"
p.error(0, "Illegal hexadecimal number")
}
goto hexadecimal
case '.':
// Float
goto float
default:
// Octal, Float
if p.chr == 'e' || p.chr == 'E' {
goto exponent
}
p.scanMantissa(8)
// An '8' or '9' directly after the octal digits makes the literal illegal.
if p.chr == '8' || p.chr == '9' {
return token.ILLEGAL, p.str[offset:p.chrOffset]
}
goto octal
}
}
p.scanMantissa(10)
float:
// Optional fractional part.
if p.chr == '.' {
p.read()
p.scanMantissa(10)
}
exponent:
// Optional exponent: e/E, optional sign, then at least one decimal digit.
if p.chr == 'e' || p.chr == 'E' {
p.read()
if p.chr == '-' || p.chr == '+' {
p.read()
}
if isDecimalDigit(p.chr) {
p.read()
p.scanMantissa(10)
} else {
return token.ILLEGAL, p.str[offset:p.chrOffset]
}
}
hexadecimal:
octal:
// A numeric literal must not be immediately followed by an identifier start or another digit.
if isIdentifierStart(p.chr) || isDecimalDigit(p.chr) {
return token.ILLEGAL, p.str[offset:p.chrOffset]
}
return tkn, p.str[offset:p.chrOffset]
}