implement string escapes

This commit is contained in:
Andrew Kelley
2016-05-01 14:53:48 -07:00
parent 037283c3b3
commit 9ccd0ba961
4 changed files with 180 additions and 34 deletions

View File

@@ -103,6 +103,21 @@
ALPHA: \
case '_'
// Case-label list for a single hexadecimal digit character: the letters
// a-f in both lower and upper case, plus everything DIGIT expands to
// (DIGIT is defined elsewhere in this file; presumably '0'-'9').
// The expansion deliberately omits the leading `case` and the trailing
// colon so it can be written as `case HEX_DIGIT:` inside a switch.
#define HEX_DIGIT \
         'a': case 'A': \
    case 'b': case 'B': \
    case 'c': case 'C': \
    case 'd': case 'D': \
    case 'e': case 'E': \
    case 'f': case 'F': \
    case DIGIT
const char * zig_keywords[] = {
"true", "false", "null", "fn", "return", "var", "const", "extern",
"pub", "export", "use", "if", "else", "goto", "asm",
@@ -132,11 +147,11 @@ enum TokenizeState {
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateStringEscape,
TokenizeStateRawString,
TokenizeStateRawStringContents,
TokenizeStateRawStringMaybeEnd,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEscape,
TokenizeStateCharLiteralEnd,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@@ -162,6 +177,7 @@ enum TokenizeState {
TokenizeStateSawDotDot,
TokenizeStateSawQuestionMark,
TokenizeStateSawAtSign,
TokenizeStateHex,
TokenizeStateError,
};
@@ -179,6 +195,7 @@ struct Tokenize {
int raw_string_id_start;
int raw_string_id_end;
int raw_string_id_cmp_pos;
int hex_chars_left;
};
__attribute__ ((format (printf, 2, 3)))
@@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
case '\n':
tokenize_error(&t, "use raw string for multiline string literal");
break;
case '\\':
t.state = TokenizeStateStringEscape;
break;
default:
break;
}
break;
case TokenizeStateStringEscape:
switch (c) {
case 'x':
t.state = TokenizeStateHex;
t.hex_chars_left = 2;
break;
case 'u':
t.state = TokenizeStateHex;
t.hex_chars_left = 4;
break;
case 'U':
t.state = TokenizeStateHex;
t.hex_chars_left = 6;
break;
case 'n':
case 'r':
case '\\':
case 't':
case '\'':
case '"':
if (t.cur_tok->id == TokenIdCharLiteral) {
t.state = TokenizeStateCharLiteralEnd;
} else if (t.cur_tok->id == TokenIdStringLiteral) {
t.state = TokenizeStateString;
} else {
zig_unreachable();
}
break;
default:
tokenize_error(&t, "invalid character: '%c'", c);
}
break;
case TokenizeStateHex:
switch (c) {
case HEX_DIGIT:
t.hex_chars_left -= 1;
if (t.hex_chars_left == 0) {
if (t.cur_tok->id == TokenIdCharLiteral) {
t.state = TokenizeStateCharLiteralEnd;
} else if (t.cur_tok->id == TokenIdStringLiteral) {
t.state = TokenizeStateString;
} else {
zig_unreachable();
}
}
break;
default:
tokenize_error(&t, "invalid character: '%c'", c);
}
break;
case TokenizeStateRawString:
if (c == '(') {
t.raw_string_id_end = t.pos;
@@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateStart;
break;
case '\\':
t.state = TokenizeStateCharLiteralEscape;
t.state = TokenizeStateStringEscape;
break;
default:
t.state = TokenizeStateCharLiteralEnd;
break;
}
break;
case TokenizeStateCharLiteralEscape:
t.state = TokenizeStateCharLiteralEnd;
break;
case TokenizeStateCharLiteralEnd:
switch (c) {
case '\'':
@@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateStringEscape:
case TokenizeStateHex:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
} else if (t.cur_tok->id == TokenIdCharLiteral) {
tokenize_error(&t, "unterminated character literal");
} else {
zig_unreachable();
}
break;
case TokenizeStateRawString:
case TokenizeStateRawStringContents:
case TokenizeStateRawStringMaybeEnd:
tokenize_error(&t, "unterminated raw string");
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEscape:
case TokenizeStateCharLiteralEnd:
tokenize_error(&t, "unterminated character literal");
break;