diff --git a/doc/langref.html.in b/doc/langref.html.in index a716336015..645c03dcbb 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -10447,13 +10447,40 @@ fn readU32Be() u32 {} {#header_close#} {#header_open|Source Encoding#}

Zig source code is encoded in UTF-8. An invalid UTF-8 byte sequence results in a compile error.

-

Throughout all zig source code (including in comments), some codepoints are never allowed:

+

Throughout all Zig source code (including in comments), some code points are never allowed:

-

The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code (except possibly the last line of the file).

-

For some discussion on the rationale behind these design decisions, see issue #663

+

+ LF (byte value 0x0a, code point U+000a, {#syntax#}'\n'{#endsyntax#}) is the line terminator in Zig source code. + This byte value terminates every line of Zig source code except the last line of the file. + It is recommended that non-empty source files end with an empty line, which means the last byte would be 0x0a (LF). +

+

+ Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#}) + to form a Windows-style line ending, but this is discouraged. + A CR in any other context is not allowed. +

+

+ HT hard tabs (byte value 0x09, code point U+0009, {#syntax#}'\t'{#endsyntax#}) are interchangeable with + SP spaces (byte value 0x20, code point U+0020, {#syntax#}' '{#endsyntax#}) as token separators, + but use of hard tabs is discouraged. See {#link|Grammar#}. +

+

+ Note that running zig fmt on a source file applies all of the recommendations mentioned here. + Note also that the stage1 compiler does not yet support CR or HT control characters. +

+

+ Note that a tool reading Zig source code can make simplifying assumptions when its input is taken to be correct Zig code. + For example, when identifying the ends of lines, a tool can use a naive search such as /\n/, + or an advanced + search such as /\r\n?|[\n\u0085\u2028\u2029]/, and in either case line endings will be correctly identified. + For another example, when identifying the whitespace before the first token on a line, + a tool can either use a naive search such as /[ \t]/, + or an advanced search such as /\s/, + and in either case whitespace will be correctly identified. +

{#header_close#} {#header_open|Keyword Reference#} @@ -11373,6 +11400,7 @@ ExprList <- (Expr COMMA)* Expr? # *** Tokens *** eof <- !. +eol <- ('\r'? '\n') | eof hex <- [0-9a-fA-F] hex_ <- ('_'/hex) dec <- [0-9] @@ -11382,39 +11410,39 @@ dec_int <- dec (dec_* dec)? hex_int <- hex (hex_* dec)? char_escape - <- "\\x" hex hex - / "\\u{" hex+ "}" - / "\\" [nr\\t'"] + <- '\\x' hex hex + / '\\u{' hex+ '}' + / '\\' [nr\\t'"] char_char <- char_escape - / [^\\'\n] + / [^\\'\r\n] string_char <- char_escape - / [^\\"\n] + / [^\\"\r\n] -line_comment <- '//'[^\n]* -line_string <- ("\\\\" [^\n]* [ \n]*)+ -skip <- ([ \n] / line_comment)* +line_comment <- '//'[^\r\n]* eol +line_string <- ('\\\\' [^\r\n]* eol skip)+ +skip <- ([ \t] / eol / line_comment)* CHAR_LITERAL <- "'" char_char "'" skip FLOAT - <- "0x" hex_* hex "." hex_int ([pP] [-+]? hex_int)? skip - / dec_int "." dec_int ([eE] [-+]? dec_int)? skip - / "0x" hex_* hex "."? [pP] [-+]? hex_int skip - / dec_int "."? [eE] [-+]? dec_int skip + <- '0x' hex_* hex '.' hex_int ([pP] [-+]? hex_int)? skip + / dec_int '.' dec_int ([eE] [-+]? dec_int)? skip + / '0x' hex_* hex '.'? [pP] [-+]? hex_int skip + / dec_int '.'? [eE] [-+]? dec_int skip INTEGER - <- "0b" [_01]* [01] skip - / "0o" [_0-7]* [0-7] skip - / "0x" hex_* hex skip + <- '0b' [_01]* [01] skip + / '0o' [_0-7]* [0-7] skip + / '0x' hex_* hex skip / dec_int skip -STRINGLITERALSINGLE <- "\"" string_char* "\"" skip +STRINGLITERALSINGLE <- '"' string_char* '"' skip STRINGLITERAL <- STRINGLITERALSINGLE - / line_string skip + / line_string skip IDENTIFIER <- !keyword [A-Za-z_] [A-Za-z0-9_]* skip - / "@\"" string_char* "\"" skip -BUILTINIDENTIFIER <- "@"[A-Za-z_][A-Za-z0-9_]* skip + / '@"' string_char* '"' skip +BUILTINIDENTIFIER <- '@'[A-Za-z_][A-Za-z0-9_]* skip AMPERSAND <- '&' ![=] skip