zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

commit e357134f0f02cbd458b59eaf8de3efa0cf473b35 (tree)
parent 09bb2315e22c50a4aa9900102fcbde0302717110
Author: Andrew Kelley <andrew@ziglang.org>
Date:   Wed, 10 Jun 2026 12:43:22 -0700

std: delete tests that rely on embedded copyrighted files

rfc1951.txt wasn't even being used anywhere.

closes #35709

Diffstat:
M build.zig | 15 ---------------
D lib/std/compress/flate/testdata/rfc1951.txt | 955 -------------------------------------------------------------------------------
D lib/std/compress/testdata/rfc8478.txt | 3027 -------------------------------------------------------------------------------
D lib/std/compress/testdata/rfc8478.txt.zst.19 | 0
D lib/std/compress/testdata/rfc8478.txt.zst.3 | 0
M lib/std/compress/zstd.zig | 11 -----------
6 files changed, 0 insertions(+), 4008 deletions(-)

diff --git a/build.zig b/build.zig @@ -142,24 +142,9 @@ pub fn build(b: *std.Build) !void { .install_dir = if (flat) .prefix else .lib, .install_subdir = if (flat) "lib" else "zig", .exclude_extensions = &[_][]const u8{ - // exclude files from lib/std/compress/testdata - ".gz", - ".z.0", - ".z.9", - ".zst.3", - ".zst.19", - "rfc1951.txt", - "rfc1952.txt", - "rfc8478.txt", // exclude files from lib/std/compress/flate/testdata ".expect", - ".expect-noinput", - ".golden", ".input", - "compress-e.txt", - "compress-gettysburg.txt", - "compress-pi.txt", - "rfc1951.txt", // exclude files from lib/std/compress/lzma/testdata ".lzma", // exclude files from lib/std/compress/xz/testdata diff --git a/lib/std/compress/flate/testdata/rfc1951.txt b/lib/std/compress/flate/testdata/rfc1951.txt @@ -1,955 +0,0 @@ - - - - - - -Network Working Group P. Deutsch -Request for Comments: 1951 Aladdin Enterprises -Category: Informational May 1996 - - - DEFLATE Compressed Data Format Specification version 1.3 - -Status of This Memo - - This memo provides information for the Internet community. This memo - does not specify an Internet standard of any kind. Distribution of - this memo is unlimited. - -IESG Note: - - The IESG takes no position on the validity of any Intellectual - Property Rights statements contained in this document. - -Notices - - Copyright (c) 1996 L. Peter Deutsch - - Permission is granted to copy and distribute this document for any - purpose and without charge, including translations into other - languages and incorporation into compilations, provided that the - copyright notice and this notice are preserved, and that any - substantive changes or deletions from the original are clearly - marked. - - A pointer to the latest version of this and related documentation in - HTML format can be found at the URL - <ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>. 
- -Abstract - - This specification defines a lossless compressed data format that - compresses data using a combination of the LZ77 algorithm and Huffman - coding, with efficiency comparable to the best currently available - general-purpose compression methods. The data can be produced or - consumed, even for an arbitrarily long sequentially presented input - data stream, using only an a priori bounded amount of intermediate - storage. The format can be implemented readily in a manner not - covered by patents. - - - - - - - - -Deutsch Informational [Page 1] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - -Table of Contents - - 1. Introduction ................................................... 2 - 1.1. Purpose ................................................... 2 - 1.2. Intended audience ......................................... 3 - 1.3. Scope ..................................................... 3 - 1.4. Compliance ................................................ 3 - 1.5. Definitions of terms and conventions used ................ 3 - 1.6. Changes from previous versions ............................ 4 - 2. Compressed representation overview ............................. 4 - 3. Detailed specification ......................................... 5 - 3.1. Overall conventions ....................................... 5 - 3.1.1. Packing into bytes .................................. 5 - 3.2. Compressed block format ................................... 6 - 3.2.1. Synopsis of prefix and Huffman coding ............... 6 - 3.2.2. Use of Huffman coding in the "deflate" format ....... 7 - 3.2.3. Details of block format ............................. 9 - 3.2.4. Non-compressed blocks (BTYPE=00) ................... 11 - 3.2.5. Compressed blocks (length and distance codes) ...... 11 - 3.2.6. Compression with fixed Huffman codes (BTYPE=01) .... 12 - 3.2.7. Compression with dynamic Huffman codes (BTYPE=10) .. 13 - 3.3. 
Compliance ............................................... 14 - 4. Compression algorithm details ................................. 14 - 5. References .................................................... 16 - 6. Security Considerations ....................................... 16 - 7. Source code ................................................... 16 - 8. Acknowledgements .............................................. 16 - 9. Author's Address .............................................. 17 - -1. Introduction - - 1.1. Purpose - - The purpose of this specification is to define a lossless - compressed data format that: - * Is independent of CPU type, operating system, file system, - and character set, and hence can be used for interchange; - * Can be produced or consumed, even for an arbitrarily long - sequentially presented input data stream, using only an a - priori bounded amount of intermediate storage, and hence - can be used in data communications or similar structures - such as Unix filters; - * Compresses data with efficiency comparable to the best - currently available general-purpose compression methods, - and in particular considerably better than the "compress" - program; - * Can be implemented readily in a manner not covered by - patents, and hence can be practiced freely; - - - -Deutsch Informational [Page 2] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - * Is compatible with the file format produced by the current - widely used gzip utility, in that conforming decompressors - will be able to read data produced by the existing gzip - compressor. - - The data format defined by this specification does not attempt to: - - * Allow random access to compressed data; - * Compress specialized data (e.g., raster graphics) as well - as the best currently available specialized algorithms. - - A simple counting argument shows that no lossless compression - algorithm can compress every possible input data set. 
For the - format defined here, the worst case expansion is 5 bytes per 32K- - byte block, i.e., a size increase of 0.015% for large data sets. - English text usually compresses by a factor of 2.5 to 3; - executable files usually compress somewhat less; graphical data - such as raster images may compress much more. - - 1.2. Intended audience - - This specification is intended for use by implementors of software - to compress data into "deflate" format and/or decompress data from - "deflate" format. - - The text of the specification assumes a basic background in - programming at the level of bits and other primitive data - representations. Familiarity with the technique of Huffman coding - is helpful but not required. - - 1.3. Scope - - The specification specifies a method for representing a sequence - of bytes as a (usually shorter) sequence of bits, and a method for - packing the latter bit sequence into bytes. - - 1.4. Compliance - - Unless otherwise indicated below, a compliant decompressor must be - able to accept and decompress any data set that conforms to all - the specifications presented here; a compliant compressor must - produce data sets that conform to all the specifications presented - here. - - 1.5. Definitions of terms and conventions used - - Byte: 8 bits stored or transmitted as a unit (same as an octet). - For this specification, a byte is exactly 8 bits, even on machines - - - -Deutsch Informational [Page 3] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - which store a character on a number of bits different from eight. - See below, for the numbering of bits within a byte. - - String: a sequence of arbitrary bytes. - - 1.6. Changes from previous versions - - There have been no technical changes to the deflate format since - version 1.1 of this specification. In version 1.2, some - terminology was changed. Version 1.3 is a conversion of the - specification to RFC style. - -2. 
Compressed representation overview - - A compressed data set consists of a series of blocks, corresponding - to successive blocks of input data. The block sizes are arbitrary, - except that non-compressible blocks are limited to 65,535 bytes. - - Each block is compressed using a combination of the LZ77 algorithm - and Huffman coding. The Huffman trees for each block are independent - of those for previous or subsequent blocks; the LZ77 algorithm may - use a reference to a duplicated string occurring in a previous block, - up to 32K input bytes before. - - Each block consists of two parts: a pair of Huffman code trees that - describe the representation of the compressed data part, and a - compressed data part. (The Huffman trees themselves are compressed - using Huffman encoding.) The compressed data consists of a series of - elements of two types: literal bytes (of strings that have not been - detected as duplicated within the previous 32K input bytes), and - pointers to duplicated strings, where a pointer is represented as a - pair <length, backward distance>. The representation used in the - "deflate" format limits distances to 32K bytes and lengths to 258 - bytes, but does not limit the size of a block, except for - uncompressible blocks, which are limited as noted above. - - Each type of value (literals, distances, and lengths) in the - compressed data is represented using a Huffman code, using one code - tree for literals and lengths and a separate code tree for distances. - The code trees for each block appear in a compact form just before - the compressed data for that block. - - - - - - - - - - -Deutsch Informational [Page 4] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - -3. Detailed specification - - 3.1. 
Overall conventions In the diagrams below, a box like this: - - +---+ - | | <-- the vertical bars might be missing - +---+ - - represents one byte; a box like this: - - +==============+ - | | - +==============+ - - represents a variable number of bytes. - - Bytes stored within a computer do not have a "bit order", since - they are always treated as a unit. However, a byte considered as - an integer between 0 and 255 does have a most- and least- - significant bit, and since we write numbers with the most- - significant digit on the left, we also write bytes with the most- - significant bit on the left. In the diagrams below, we number the - bits of a byte so that bit 0 is the least-significant bit, i.e., - the bits are numbered: - - +--------+ - |76543210| - +--------+ - - Within a computer, a number may occupy multiple bytes. All - multi-byte numbers in the format described here are stored with - the least-significant byte first (at the lower memory address). - For example, the decimal number 520 is stored as: - - 0 1 - +--------+--------+ - |00001000|00000010| - +--------+--------+ - ^ ^ - | | - | + more significant byte = 2 x 256 - + less significant byte = 8 - - 3.1.1. Packing into bytes - - This document does not address the issue of the order in which - bits of a byte are transmitted on a bit-sequential medium, - since the final data format described here is byte- rather than - - - -Deutsch Informational [Page 5] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - bit-oriented. However, we describe the compressed block format - in below, as a sequence of data elements of various bit - lengths, not a sequence of bytes. We must therefore specify - how to pack these data elements into bytes to form the final - compressed byte sequence: - - * Data elements are packed into bytes in order of - increasing bit number within the byte, i.e., starting - with the least-significant bit of the byte. 
- * Data elements other than Huffman codes are packed - starting with the least-significant bit of the data - element. - * Huffman codes are packed starting with the most- - significant bit of the code. - - In other words, if one were to print out the compressed data as - a sequence of bytes, starting with the first byte at the - *right* margin and proceeding to the *left*, with the most- - significant bit of each byte on the left as usual, one would be - able to parse the result from right to left, with fixed-width - elements in the correct MSB-to-LSB order and Huffman codes in - bit-reversed order (i.e., with the first bit of the code in the - relative LSB position). - - 3.2. Compressed block format - - 3.2.1. Synopsis of prefix and Huffman coding - - Prefix coding represents symbols from an a priori known - alphabet by bit sequences (codes), one code for each symbol, in - a manner such that different symbols may be represented by bit - sequences of different lengths, but a parser can always parse - an encoded string unambiguously symbol-by-symbol. - - We define a prefix code in terms of a binary tree in which the - two edges descending from each non-leaf node are labeled 0 and - 1 and in which the leaf nodes correspond one-for-one with (are - labeled with) the symbols of the alphabet; then the code for a - symbol is the sequence of 0's and 1's on the edges leading from - the root to the leaf labeled with that symbol. For example: - - - - - - - - - - - -Deutsch Informational [Page 6] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - /\ Symbol Code - 0 1 ------ ---- - / \ A 00 - /\ B B 1 - 0 1 C 011 - / \ D 010 - A /\ - 0 1 - / \ - D C - - A parser can decode the next symbol from an encoded input - stream by walking down the tree from the root, at each step - choosing the edge corresponding to the next input bit. 
- - Given an alphabet with known symbol frequencies, the Huffman - algorithm allows the construction of an optimal prefix code - (one which represents strings with those symbol frequencies - using the fewest bits of any possible prefix codes for that - alphabet). Such a code is called a Huffman code. (See - reference [1] in Chapter 5, references for additional - information on Huffman codes.) - - Note that in the "deflate" format, the Huffman codes for the - various alphabets must not exceed certain maximum code lengths. - This constraint complicates the algorithm for computing code - lengths from symbol frequencies. Again, see Chapter 5, - references for details. - - 3.2.2. Use of Huffman coding in the "deflate" format - - The Huffman codes used for each alphabet in the "deflate" - format have two additional rules: - - * All codes of a given bit length have lexicographically - consecutive values, in the same order as the symbols - they represent; - - * Shorter codes lexicographically precede longer codes. - - - - - - - - - - - - -Deutsch Informational [Page 7] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - We could recode the example above to follow this rule as - follows, assuming that the order of the alphabet is ABCD: - - Symbol Code - ------ ---- - A 10 - B 0 - C 110 - D 111 - - I.e., 0 precedes 10 which precedes 11x, and 110 and 111 are - lexicographically consecutive. - - Given this rule, we can define the Huffman code for an alphabet - just by giving the bit lengths of the codes for each symbol of - the alphabet in order; this is sufficient to determine the - actual codes. In our example, the code is completely defined - by the sequence of bit lengths (2, 1, 3, 3). The following - algorithm generates the codes as integers, intended to be read - from most- to least-significant bit. The code lengths are - initially in tree[I].Len; the codes are produced in - tree[I].Code. - - 1) Count the number of codes for each code length. 
Let - bl_count[N] be the number of codes of length N, N >= 1. - - 2) Find the numerical value of the smallest code for each - code length: - - code = 0; - bl_count[0] = 0; - for (bits = 1; bits <= MAX_BITS; bits++) { - code = (code + bl_count[bits-1]) << 1; - next_code[bits] = code; - } - - 3) Assign numerical values to all codes, using consecutive - values for all codes of the same length with the base - values determined at step 2. Codes that are never used - (which have a bit length of zero) must not be assigned a - value. - - for (n = 0; n <= max_code; n++) { - len = tree[n].Len; - if (len != 0) { - tree[n].Code = next_code[len]; - next_code[len]++; - } - - - -Deutsch Informational [Page 8] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - } - - Example: - - Consider the alphabet ABCDEFGH, with bit lengths (3, 3, 3, 3, - 3, 2, 4, 4). After step 1, we have: - - N bl_count[N] - - ----------- - 2 1 - 3 5 - 4 2 - - Step 2 computes the following next_code values: - - N next_code[N] - - ------------ - 1 0 - 2 0 - 3 2 - 4 14 - - Step 3 produces the following code values: - - Symbol Length Code - ------ ------ ---- - A 3 010 - B 3 011 - C 3 100 - D 3 101 - E 3 110 - F 2 00 - G 4 1110 - H 4 1111 - - 3.2.3. Details of block format - - Each block of compressed data begins with 3 header bits - containing the following data: - - first bit BFINAL - next 2 bits BTYPE - - Note that the header bits do not necessarily begin on a byte - boundary, since a block does not necessarily occupy an integral - number of bytes. - - - - - -Deutsch Informational [Page 9] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - BFINAL is set if and only if this is the last block of the data - set. 
- - BTYPE specifies how the data are compressed, as follows: - - 00 - no compression - 01 - compressed with fixed Huffman codes - 10 - compressed with dynamic Huffman codes - 11 - reserved (error) - - The only difference between the two compressed cases is how the - Huffman codes for the literal/length and distance alphabets are - defined. - - In all cases, the decoding algorithm for the actual data is as - follows: - - do - read block header from input stream. - if stored with no compression - skip any remaining bits in current partially - processed byte - read LEN and NLEN (see next section) - copy LEN bytes of data to output - otherwise - if compressed with dynamic Huffman codes - read representation of code trees (see - subsection below) - loop (until end of block code recognized) - decode literal/length value from input stream - if value < 256 - copy value (literal byte) to output stream - otherwise - if value = end of block (256) - break from loop - otherwise (value = 257..285) - decode distance from input stream - - move backwards distance bytes in the output - stream, and copy length bytes from this - position to the output stream. - end loop - while not last block - - Note that a duplicated string reference may refer to a string - in a previous block; i.e., the backward distance may cross one - or more block boundaries. However a distance cannot refer past - the beginning of the output stream. (An application using a - - - -Deutsch Informational [Page 10] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - preset dictionary might discard part of the output stream; a - distance can refer to that part of the output stream anyway) - Note also that the referenced string may overlap the current - position; for example, if the last 2 bytes decoded have values - X and Y, a string reference with <length = 5, distance = 2> - adds X,Y,X,Y,X to the output stream. - - We now specify each compression method in turn. - - 3.2.4. 
Non-compressed blocks (BTYPE=00) - - Any bits of input up to the next byte boundary are ignored. - The rest of the block consists of the following information: - - 0 1 2 3 4... - +---+---+---+---+================================+ - | LEN | NLEN |... LEN bytes of literal data...| - +---+---+---+---+================================+ - - LEN is the number of data bytes in the block. NLEN is the - one's complement of LEN. - - 3.2.5. Compressed blocks (length and distance codes) - - As noted above, encoded data blocks in the "deflate" format - consist of sequences of symbols drawn from three conceptually - distinct alphabets: either literal bytes, from the alphabet of - byte values (0..255), or <length, backward distance> pairs, - where the length is drawn from (3..258) and the distance is - drawn from (1..32,768). In fact, the literal and length - alphabets are merged into a single alphabet (0..285), where - values 0..255 represent literal bytes, the value 256 indicates - end-of-block, and values 257..285 represent length codes - (possibly in conjunction with extra bits following the symbol - code) as follows: - - - - - - - - - - - - - - - - -Deutsch Informational [Page 11] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - Extra Extra Extra - Code Bits Length(s) Code Bits Lengths Code Bits Length(s) - ---- ---- ------ ---- ---- ------- ---- ---- ------- - 257 0 3 267 1 15,16 277 4 67-82 - 258 0 4 268 1 17,18 278 4 83-98 - 259 0 5 269 2 19-22 279 4 99-114 - 260 0 6 270 2 23-26 280 4 115-130 - 261 0 7 271 2 27-30 281 5 131-162 - 262 0 8 272 2 31-34 282 5 163-194 - 263 0 9 273 3 35-42 283 5 195-226 - 264 0 10 274 3 43-50 284 5 227-257 - 265 1 11,12 275 3 51-58 285 0 258 - 266 1 13,14 276 3 59-66 - - The extra bits should be interpreted as a machine integer - stored with the most-significant bit first, e.g., bits 1110 - represent the value 14. 
- - Extra Extra Extra - Code Bits Dist Code Bits Dist Code Bits Distance - ---- ---- ---- ---- ---- ------ ---- ---- -------- - 0 0 1 10 4 33-48 20 9 1025-1536 - 1 0 2 11 4 49-64 21 9 1537-2048 - 2 0 3 12 5 65-96 22 10 2049-3072 - 3 0 4 13 5 97-128 23 10 3073-4096 - 4 1 5,6 14 6 129-192 24 11 4097-6144 - 5 1 7,8 15 6 193-256 25 11 6145-8192 - 6 2 9-12 16 7 257-384 26 12 8193-12288 - 7 2 13-16 17 7 385-512 27 12 12289-16384 - 8 3 17-24 18 8 513-768 28 13 16385-24576 - 9 3 25-32 19 8 769-1024 29 13 24577-32768 - - 3.2.6. Compression with fixed Huffman codes (BTYPE=01) - - The Huffman codes for the two alphabets are fixed, and are not - represented explicitly in the data. The Huffman code lengths - for the literal/length alphabet are: - - Lit Value Bits Codes - --------- ---- ----- - 0 - 143 8 00110000 through - 10111111 - 144 - 255 9 110010000 through - 111111111 - 256 - 279 7 0000000 through - 0010111 - 280 - 287 8 11000000 through - 11000111 - - - -Deutsch Informational [Page 12] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - The code lengths are sufficient to generate the actual codes, - as described above; we show the codes in the table for added - clarity. Literal/length values 286-287 will never actually - occur in the compressed data, but participate in the code - construction. - - Distance codes 0-31 are represented by (fixed-length) 5-bit - codes, with possible additional bits as shown in the table - shown in Paragraph 3.2.5, above. Note that distance codes 30- - 31 will never actually occur in the compressed data. - - 3.2.7. Compression with dynamic Huffman codes (BTYPE=10) - - The Huffman codes for the two alphabets appear in the block - immediately after the header bits and before the actual - compressed data, first the literal/length code and then the - distance code. Each code is defined by a sequence of code - lengths, as discussed in Paragraph 3.2.2, above. 
For even - greater compactness, the code length sequences themselves are - compressed using a Huffman code. The alphabet for code lengths - is as follows: - - 0 - 15: Represent code lengths of 0 - 15 - 16: Copy the previous code length 3 - 6 times. - The next 2 bits indicate repeat length - (0 = 3, ... , 3 = 6) - Example: Codes 8, 16 (+2 bits 11), - 16 (+2 bits 10) will expand to - 12 code lengths of 8 (1 + 6 + 5) - 17: Repeat a code length of 0 for 3 - 10 times. - (3 bits of length) - 18: Repeat a code length of 0 for 11 - 138 times - (7 bits of length) - - A code length of 0 indicates that the corresponding symbol in - the literal/length or distance alphabet will not occur in the - block, and should not participate in the Huffman code - construction algorithm given earlier. If only one distance - code is used, it is encoded using one bit, not zero bits; in - this case there is a single code length of one, with one unused - code. One distance code of zero bits means that there are no - distance codes used at all (the data is all literals). - - We can now define the format of the block: - - 5 Bits: HLIT, # of Literal/Length codes - 257 (257 - 286) - 5 Bits: HDIST, # of Distance codes - 1 (1 - 32) - 4 Bits: HCLEN, # of Code Length codes - 4 (4 - 19) - - - -Deutsch Informational [Page 13] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - (HCLEN + 4) x 3 bits: code lengths for the code length - alphabet given just above, in the order: 16, 17, 18, - 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - - These code lengths are interpreted as 3-bit integers - (0-7); as above, a code length of 0 means the - corresponding symbol (literal/length or distance code - length) is not used. 
- - HLIT + 257 code lengths for the literal/length alphabet, - encoded using the code length Huffman code - - HDIST + 1 code lengths for the distance alphabet, - encoded using the code length Huffman code - - The actual compressed data of the block, - encoded using the literal/length and distance Huffman - codes - - The literal/length symbol 256 (end of data), - encoded using the literal/length Huffman code - - The code length repeat codes can cross from HLIT + 257 to the - HDIST + 1 code lengths. In other words, all code lengths form - a single sequence of HLIT + HDIST + 258 values. - - 3.3. Compliance - - A compressor may limit further the ranges of values specified in - the previous section and still be compliant; for example, it may - limit the range of backward pointers to some value smaller than - 32K. Similarly, a compressor may limit the size of blocks so that - a compressible block fits in memory. - - A compliant decompressor must accept the full range of possible - values defined in the previous section, and must accept blocks of - arbitrary size. - -4. Compression algorithm details - - While it is the intent of this document to define the "deflate" - compressed data format without reference to any particular - compression algorithm, the format is related to the compressed - formats produced by LZ77 (Lempel-Ziv 1977, see reference [2] below); - since many variations of LZ77 are patented, it is strongly - recommended that the implementor of a compressor follow the general - algorithm presented here, which is known not to be patented per se. - The material in this section is not part of the definition of the - - - -Deutsch Informational [Page 14] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - - specification per se, and a compressor need not follow it in order to - be compliant. 
- - The compressor terminates a block when it determines that starting a - new block with fresh trees would be useful, or when the block size - fills up the compressor's block buffer. - - The compressor uses a chained hash table to find duplicated strings, - using a hash function that operates on 3-byte sequences. At any - given point during compression, let XYZ be the next 3 input bytes to - be examined (not necessarily all different, of course). First, the - compressor examines the hash chain for XYZ. If the chain is empty, - the compressor simply writes out X as a literal byte and advances one - byte in the input. If the hash chain is not empty, indicating that - the sequence XYZ (or, if we are unlucky, some other 3 bytes with the - same hash function value) has occurred recently, the compressor - compares all strings on the XYZ hash chain with the actual input data - sequence starting at the current point, and selects the longest - match. - - The compressor searches the hash chains starting with the most recent - strings, to favor small distances and thus take advantage of the - Huffman encoding. The hash chains are singly linked. There are no - deletions from the hash chains; the algorithm simply discards matches - that are too old. To avoid a worst-case situation, very long hash - chains are arbitrarily truncated at a certain length, determined by a - run-time parameter. - - To improve overall compression, the compressor optionally defers the - selection of matches ("lazy matching"): after a match of length N has - been found, the compressor searches for a longer match starting at - the next input byte. If it finds a longer match, it truncates the - previous match to a length of one (thus producing a single literal - byte) and then emits the longer match. Otherwise, it emits the - original match, and, as described above, advances N bytes before - continuing. - - Run-time parameters also control this "lazy match" procedure. 
If - compression ratio is most important, the compressor attempts a - complete second search regardless of the length of the first match. - In the normal case, if the current match is "long enough", the - compressor reduces the search for a longer match, thus speeding up - the process. If speed is most important, the compressor inserts new - strings in the hash table only when no match was found, or when the - match is not "too long". This degrades the compression ratio but - saves time since there are both fewer insertions and fewer searches. - - - - - -Deutsch Informational [Page 15] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - -5. References - - [1] Huffman, D. A., "A Method for the Construction of Minimum - Redundancy Codes", Proceedings of the Institute of Radio - Engineers, September 1952, Volume 40, Number 9, pp. 1098-1101. - - [2] Ziv J., Lempel A., "A Universal Algorithm for Sequential Data - Compression", IEEE Transactions on Information Theory, Vol. 23, - No. 3, pp. 337-343. - - [3] Gailly, J.-L., and Adler, M., ZLIB documentation and sources, - available in ftp://ftp.uu.net/pub/archiving/zip/doc/ - - [4] Gailly, J.-L., and Adler, M., GZIP documentation and sources, - available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/ - - [5] Schwartz, E. S., and Kallick, B. "Generating a canonical prefix - encoding." Comm. ACM, 7,3 (Mar. 1964), pp. 166-169. - - [6] Hirschberg and Lelewer, "Efficient decoding of prefix codes," - Comm. ACM, 33,4, April 1990, pp. 449-459. - -6. Security Considerations - - Any data compression method involves the reduction of redundancy in - the data. Consequently, any corruption of the data is likely to have - severe effects and be difficult to correct. Uncompressed text, on - the other hand, will probably still be readable despite the presence - of some corrupted bytes. - - It is recommended that systems using this data format provide some - means of validating the integrity of the compressed data. 
See - reference [3], for example. - -7. Source code - - Source code for a C language implementation of a "deflate" compliant - compressor and decompressor is available within the zlib package at - ftp://ftp.uu.net/pub/archiving/zip/zlib/. - -8. Acknowledgements - - Trademarks cited in this document are the property of their - respective owners. - - Phil Katz designed the deflate format. Jean-Loup Gailly and Mark - Adler wrote the related software described in this specification. - Glenn Randers-Pehrson converted this document to RFC and HTML format. - - - -Deutsch Informational [Page 16] - -RFC 1951 DEFLATE Compressed Data Format Specification May 1996 - - -9. Author's Address - - L. Peter Deutsch - Aladdin Enterprises - 203 Santa Margarita Ave. - Menlo Park, CA 94025 - - Phone: (415) 322-0103 (AM only) - FAX: (415) 322-1734 - EMail: <ghost@aladdin.com> - - Questions about the technical content of this specification can be - sent by email to: - - Jean-Loup Gailly <gzip@prep.ai.mit.edu> and - Mark Adler <madler@alumni.caltech.edu> - - Editorial comments on this specification can be sent by email to: - - L. Peter Deutsch <ghost@aladdin.com> and - Glenn Randers-Pehrson <randeg@alumni.rpi.edu> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Deutsch Informational [Page 17] - diff --git a/lib/std/compress/testdata/rfc8478.txt b/lib/std/compress/testdata/rfc8478.txt @@ -1,3027 +0,0 @@ - - - - - - -Internet Engineering Task Force (IETF) Y. Collet -Request for Comments: 8478 M. Kucherawy, Ed. -Category: Informational Facebook -ISSN: 2070-1721 October 2018 - - - Zstandard Compression and the application/zstd Media Type - -Abstract - - Zstandard, or "zstd" (pronounced "zee standard"), is a data - compression mechanism. This document describes the mechanism and - registers a media type and content encoding to be used when - transporting zstd-compressed content via Multipurpose Internet Mail - Extensions (MIME). 
- - Despite use of the word "standard" as part of its name, readers are - advised that this document is not an Internet Standards Track - specification; it is being published for informational purposes only. - -Status of This Memo - - This document is not an Internet Standards Track specification; it is - published for informational purposes. - - This document is a product of the Internet Engineering Task Force - (IETF). It represents the consensus of the IETF community. It has - received public review and has been approved for publication by the - Internet Engineering Steering Group (IESG). Not all documents - approved by the IESG are candidates for any level of Internet - Standard; see Section 2 of RFC 7841. - - Information about the current status of this document, any errata, - and how to provide feedback on it may be obtained at - https://www.rfc-editor.org/info/rfc8478. - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 1] - -RFC 8478 application/zstd October 2018 - - -Copyright Notice - - Copyright (c) 2018 IETF Trust and the persons identified as the - document authors. All rights reserved. - - This document is subject to BCP 78 and the IETF Trust's Legal - Provisions Relating to IETF Documents - (https://trustee.ietf.org/license-info) in effect on the date of - publication of this document. Please review these documents - carefully, as they describe your rights and restrictions with respect - to this document. Code Components extracted from this document must - include Simplified BSD License text as described in Section 4.e of - the Trust Legal Provisions and are provided without warranty as - described in the Simplified BSD License. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 2] - -RFC 8478 application/zstd October 2018 - - -Table of Contents - - 1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . 4 - 2. Definitions . . . . . . . . . . . . . . . . . . . 
. . . . . . 4 - 3. Compression Algorithm . . . . . . . . . . . . . . . . . . . . 5 - 3.1. Frames . . . . . . . . . . . . . . . . . . . . . . . . . 6 - 3.1.1. Zstandard Frames . . . . . . . . . . . . . . . . . . 6 - 3.1.1.1. Frame Header . . . . . . . . . . . . . . . . . . 7 - 3.1.1.2. Blocks . . . . . . . . . . . . . . . . . . . . . 12 - 3.1.1.3. Compressed Blocks . . . . . . . . . . . . . . . . 14 - 3.1.1.4. Sequence Execution . . . . . . . . . . . . . . . 28 - 3.1.1.5. Repeat Offsets . . . . . . . . . . . . . . . . . 29 - 3.1.2. Skippable Frames . . . . . . . . . . . . . . . . . . 30 - 4. Entropy Encoding . . . . . . . . . . . . . . . . . . . . . . 30 - 4.1. FSE . . . . . . . . . . . . . . . . . . . . . . . . . . . 31 - 4.1.1. FSE Table Description . . . . . . . . . . . . . . . . 31 - 4.2. Huffman Coding . . . . . . . . . . . . . . . . . . . . . 34 - 4.2.1. Huffman Tree Description . . . . . . . . . . . . . . 35 - 4.2.1.1. Huffman Tree Header . . . . . . . . . . . . . . . 36 - 4.2.1.2. FSE Compression of Huffman Weights . . . . . . . 37 - 4.2.1.3. Conversion from Weights to Huffman Prefix Codes . 38 - 4.2.2. Huffman-Coded Streams . . . . . . . . . . . . . . . . 39 - 5. Dictionary Format . . . . . . . . . . . . . . . . . . . . . . 40 - 6. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 42 - 6.1. The 'application/zstd' Media Type . . . . . . . . . . . . 42 - 6.2. Content Encoding . . . . . . . . . . . . . . . . . . . . 43 - 6.3. Dictionaries . . . . . . . . . . . . . . . . . . . . . . 43 - 7. Security Considerations . . . . . . . . . . . . . . . . . . . 43 - 8. Implementation Status . . . . . . . . . . . . . . . . . . . . 44 - 9. References . . . . . . . . . . . . . . . . . . . . . . . . . 45 - 9.1. Normative References . . . . . . . . . . . . . . . . . . 45 - 9.2. Informative References . . . . . . . . . . . . . . . . . 45 - Appendix A. Decoding Tables for Predefined Codes . . . . . . . . 46 - A.1. Literal Length Code Table . . . . . . . . . . . . . 
. . . 46 - A.2. Match Length Code Table . . . . . . . . . . . . . . . . . 49 - A.3. Offset Code Table . . . . . . . . . . . . . . . . . . . . 52 - Acknowledgments . . . . . . . . . . . . . . . . . . . . . . . . . 53 - Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . 54 - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 3] - -RFC 8478 application/zstd October 2018 - - -1. Introduction - - Zstandard, or "zstd" (pronounced "zee standard"), is a data - compression mechanism, akin to gzip [RFC1952]. - - Despite use of the word "standard" as part of its name, readers are - advised that this document is not an Internet Standards Track - specification; it is being published for informational purposes only. - - This document describes the Zstandard format. Also, to enable the - transport of a data object compressed with Zstandard, this document - registers a media type that can be used to identify such content when - it is used in a payload encoded using Multipurpose Internet Mail - Extensions (MIME). - -2. Definitions - - Some terms used elsewhere in this document are defined here for - clarity. - - uncompressed: Describes an arbitrary set of bytes in their original - form, prior to being subjected to compression. - - compress, compression: The act of processing a set of bytes via the - compression mechanism described here. - - compressed: Describes the result of passing a set of bytes through - this mechanism. The original input has thus been compressed. - - decompress, decompression: The act of processing a set of bytes - through the inverse of the compression mechanism described here, - in an attempt to recover the original set of bytes prior to - compression. - - decompressed: Describes the result of passing a set of bytes through - the reverse of this mechanism. When this is successful, the - decompressed payload and the uncompressed payload are - indistinguishable. 
- - encode: The process of translating data from one form to another; - this may include compression or it may refer to other translations - done as part of this specification. - - decode: The reverse of "encode"; describes a process of reversing a - prior encoding to recover the original content. - - - - - - -Collet & Kucherawy Informational [Page 4] - -RFC 8478 application/zstd October 2018 - - - frame: Content compressed by Zstandard is transformed into a - Zstandard frame. Multiple frames can be appended into a single - file or stream. A frame is completely independent, has a defined - beginning and end, and has a set of parameters that tells the - decoder how to decompress it. - - block: A frame encapsulates one or multiple blocks. Each block - contains arbitrary content, which is described by its header, and - has a guaranteed maximum content size that depends upon frame - parameters. Unlike frames, each block depends on previous blocks - for proper decoding. However, each block can be decompressed - without waiting for its successor, allowing streaming operations. - - natural order: A sequence or ordering of objects or values that is - typical of that type of object or value. A set of unique - integers, for example, is in "natural order" if when progressing - from one element in the set or sequence to the next, there is - never a decrease in value. - - The naming convention for identifiers within the specification is - Mixed_Case_With_Underscores. Identifiers inside square brackets - indicate that the identifier is optional in the presented context. - -3. Compression Algorithm - - This section describes the Zstandard algorithm. - - The purpose of this document is to define a lossless compressed data - format that is a) independent of the CPU type, operating system, file - system, and character set and b) is suitable for file compression and - pipe and streaming compression, using the Zstandard algorithm. 
The - text of the specification assumes a basic background in programming - at the level of bits and other primitive data representations. - - The data can be produced or consumed, even for an arbitrarily long - sequentially presented input data stream, using only an a priori - bounded amount of intermediate storage, and hence can be used in data - communications. The format uses the Zstandard compression method, - and an optional xxHash-64 checksum method [XXHASH], for detection of - data corruption. - - The data format defined by this specification does not attempt to - allow random access to compressed data. - - Unless otherwise indicated below, a compliant compressor must produce - data sets that conform to the specifications presented here. - However, it does not need to support all options. - - - - -Collet & Kucherawy Informational [Page 5] - -RFC 8478 application/zstd October 2018 - - - A compliant decompressor must be able to decompress at least one - working set of parameters that conforms to the specifications - presented here. It may also ignore informative fields, such as the - checksum. Whenever it does not support a parameter defined in the - compressed stream, it must produce a non-ambiguous error code and - associated error message explaining which parameter is unsupported. - - This specification is intended for use by implementers of software to - compress data into Zstandard format and/or decompress data from - Zstandard format. The Zstandard format is supported by an open - source reference implementation, written in portable C, and available - at [ZSTD]. - -3.1. Frames - - Zstandard compressed data is made up of one or more frames. Each - frame is independent and can be decompressed independently of other - frames. The decompressed content of multiple concatenated frames is - the concatenation of each frame's decompressed content. - - There are two frame formats defined for Zstandard: Zstandard frames - and skippable frames. 
Zstandard frames contain compressed data, - while skippable frames contain custom user metadata. - -3.1.1. Zstandard Frames - - The structure of a single Zstandard frame is as follows: - - +--------------------+------------+ - | Magic_Number | 4 bytes | - +--------------------+------------+ - | Frame_Header | 2-14 bytes | - +--------------------+------------+ - | Data_Block | n bytes | - +--------------------+------------+ - | [More Data_Blocks] | | - +--------------------+------------+ - | [Content_Checksum] | 0-4 bytes | - +--------------------+------------+ - - Magic_Number: 4 bytes, little-endian format. Value: 0xFD2FB528. - - Frame_Header: 2 to 14 bytes, detailed in Section 3.1.1.1. - - Data_Block: Detailed in Section 3.1.1.2. This is where data - appears. - - - - - -Collet & Kucherawy Informational [Page 6] - -RFC 8478 application/zstd October 2018 - - - Content_Checksum: An optional 32-bit checksum, only present if - Content_Checksum_Flag is set. The content checksum is the result - of the XXH64() hash function [XXHASH] digesting the original - (decoded) data as input, and a seed of zero. The low 4 bytes of - the checksum are stored in little-endian format. - - The magic number was selected to be less probable to find at the - beginning of an arbitrary file. It avoids trivial patterns (0x00, - 0xFF, repeated bytes, increasing bytes, etc.), contains byte values - outside of ASCII range, and doesn't map into UTF-8 space, all of - which reduce the likelihood of its appearance at the top of a text - file. - -3.1.1.1. Frame Header - - The frame header has a variable size, with a minimum of 2 bytes and - up to 14 bytes depending on optional parameters. 
The structure of - Frame_Header is as follows: - - +-------------------------+-----------+ - | Frame_Header_Descriptor | 1 byte | - +-------------------------+-----------+ - | [Window_Descriptor] | 0-1 byte | - +-------------------------+-----------+ - | [Dictionary_ID] | 0-4 bytes | - +-------------------------+-----------+ - | [Frame_Content_Size] | 0-8 bytes | - +-------------------------+-----------+ - - - - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 7] - -RFC 8478 application/zstd October 2018 - - -3.1.1.1.1. Frame_Header_Descriptor - - The first header's byte is called the Frame_Header_Descriptor. It - describes which other fields are present. Decoding this byte is - enough to tell the size of Frame_Header. - - +------------+-------------------------+ - | Bit Number | Field Name | - +------------+-------------------------+ - | 7-6 | Frame_Content_Size_Flag | - +------------+-------------------------+ - | 5 | Single_Segment_Flag | - +------------+-------------------------+ - | 4 | (unused) | - +------------+-------------------------+ - | 3 | (reserved) | - +------------+-------------------------+ - | 2 | Content_Checksum_Flag | - +------------+-------------------------+ - | 1-0 | Dictionary_ID_Flag | - +------------+-------------------------+ - - In this table, bit 7 is the highest bit, while bit 0 is the lowest - one. - -3.1.1.1.1.1. Frame_Content_Size_Flag - - This is a 2-bit flag (equivalent to Frame_Header_Descriptor right- - shifted 6 bits) specifying whether Frame_Content_Size (the - decompressed data size) is provided within the header. 
Flag_Value - provides FCS_Field_Size, which is the number of bytes used by - Frame_Content_Size according to the following table: - - +----------------+--------+---+---+---+ - | Flag_Value | 0 | 1 | 2 | 3 | - +----------------+--------+---+---+---+ - | FCS_Field_Size | 0 or 1 | 2 | 4 | 8 | - +----------------+--------+---+---+---+ - - When Flag_Value is 0, FCS_Field_Size depends on Single_Segment_Flag: - If Single_Segment_Flag is set, FCS_Field_Size is 1. Otherwise, - FCS_Field_Size is 0; Frame_Content_Size is not provided. - - - - - - - - - -Collet & Kucherawy Informational [Page 8] - -RFC 8478 application/zstd October 2018 - - -3.1.1.1.1.2. Single_Segment_Flag - - If this flag is set, data must be regenerated within a single - continuous memory segment. - - In this case, Window_Descriptor byte is skipped, but - Frame_Content_Size is necessarily present. As a consequence, the - decoder must allocate a memory segment of size equal or larger than - Frame_Content_Size. - - In order to protect the decoder from unreasonable memory - requirements, a decoder is allowed to reject a compressed frame that - requests a memory size beyond the decoder's authorized range. - - For broader compatibility, decoders are recommended to support memory - sizes of at least 8 MB. This is only a recommendation; each decoder - is free to support higher or lower limits, depending on local - limitations. - -3.1.1.1.1.3. Unused Bit - - A decoder compliant with this specification version shall not - interpret this bit. It might be used in a future version, to signal - a property that is not mandatory to properly decode the frame. An - encoder compliant with this specification must set this bit to zero. - -3.1.1.1.1.4. Reserved Bit - - This bit is reserved for some future feature. Its value must be - zero. A decoder compliant with this specification version must - ensure it is not set. 
This bit may be used in a future revision, to - signal a feature that must be interpreted to decode the frame - correctly. - -3.1.1.1.1.5. Content_Checksum_Flag - - If this flag is set, a 32-bit Content_Checksum will be present at the - frame's end. See the description of Content_Checksum above. - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 9] - -RFC 8478 application/zstd October 2018 - - -3.1.1.1.1.6. Dictionary_ID_Flag - - This is a 2-bit flag (= Frame_Header_Descriptor & 0x3) indicating - whether a dictionary ID is provided within the header. It also - specifies the size of this field as DID_Field_Size: - - +----------------+---+---+---+---+ - | Flag_Value | 0 | 1 | 2 | 3 | - +----------------+---+---+---+---+ - | DID_Field_Size | 0 | 1 | 2 | 4 | - +----------------+---+---+---+---+ - -3.1.1.1.2. Window Descriptor - - This provides guarantees about the minimum memory buffer required to - decompress a frame. This information is important for decoders to - allocate enough memory. - - The Window_Descriptor byte is optional. When Single_Segment_Flag is - set, Window_Descriptor is not present. In this case, Window_Size is - Frame_Content_Size, which can be any value from 0 to 2^64-1 bytes (16 - ExaBytes). - - +------------+----------+----------+ - | Bit Number | 7-3 | 2-0 | - +------------+----------+----------+ - | Field Name | Exponent | Mantissa | - +------------+----------+----------+ - - The minimum memory buffer size is called Window_Size. It is - described by the following formulae: - - windowLog = 10 + Exponent; - windowBase = 1 << windowLog; - windowAdd = (windowBase / 8) * Mantissa; - Window_Size = windowBase + windowAdd; - - The minimum Window_Size is 1 KB. The maximum Window_Size is (1<<41) - + 7*(1<<38) bytes, which is 3.75 TB. - - In general, larger Window_Size values tend to improve the compression - ratio, but at the cost of increased memory usage. 
- - To properly decode compressed data, a decoder will need to allocate a - buffer of at least Window_Size bytes. - - - - - - -Collet & Kucherawy Informational [Page 10] - -RFC 8478 application/zstd October 2018 - - - In order to protect decoders from unreasonable memory requirements, a - decoder is allowed to reject a compressed frame that requests a - memory size beyond decoder's authorized range. - - For improved interoperability, it's recommended for decoders to - support values of Window_Size up to 8 MB and for encoders not to - generate frames requiring a Window_Size larger than 8 MB. It's - merely a recommendation though, and decoders are free to support - larger or lower limits, depending on local limitations. - -3.1.1.1.3. Dictionary_ID - - This is a variable size field, which contains the ID of the - dictionary required to properly decode the frame. This field is - optional. When it's not present, it's up to the decoder to know - which dictionary to use. - - Dictionary_ID field size is provided by DID_Field_Size. - DID_Field_Size is directly derived from the value of - Dictionary_ID_Flag. One byte can represent an ID 0-255; 2 bytes can - represent an ID 0-65535; 4 bytes can represent an ID 0-4294967295. - Format is little-endian. - - It is permitted to represent a small ID (for example, 13) with a - large 4-byte dictionary ID, even if it is less efficient. - - Within private environments, any dictionary ID can be used. However, - for frames and dictionaries distributed in public space, - Dictionary_ID must be attributed carefully. The following ranges are - reserved for use only with dictionaries that have been registered - with IANA (see Section 6.3): - - low range: <= 32767 - high range: >= (1 << 31) - - Any other value for Dictionary_ID can be used by private arrangement - between participants. - - Any payload presented for decompression that references an - unregistered reserved dictionary ID results in an error. 
- - - - - - - - - - - -Collet & Kucherawy Informational [Page 11] - -RFC 8478 application/zstd October 2018 - - -3.1.1.1.4. Frame Content Size - - This is the original (uncompressed) size. This information is - optional. Frame_Content_Size uses a variable number of bytes, - provided by FCS_Field_Size. FCS_Field_Size is provided by the value - of Frame_Content_Size_Flag. FCS_Field_Size can be equal to 0 (not - present), 1, 2, 4, or 8 bytes. - - +----------------+--------------+ - | FCS Field Size | Range | - +----------------+--------------+ - | 0 | unknown | - +----------------+--------------+ - | 1 | 0 - 255 | - +----------------+--------------+ - | 2 | 256 - 65791 | - +----------------+--------------+ - | 4 | 0 - 2^32 - 1 | - +----------------+--------------+ - | 8 | 0 - 2^64 - 1 | - +----------------+--------------+ - - Frame_Content_Size format is little-endian. When FCS_Field_Size is - 1, 4, or 8 bytes, the value is read directly. When FCS_Field_Size is - 2, the offset of 256 is added. It's allowed to represent a small - size (for example 18) using any compatible variant. - -3.1.1.2. Blocks - - After Magic_Number and Frame_Header, there are some number of blocks. - Each frame must have at least 1 block, but there is no upper limit on - the number of blocks per frame. - - The structure of a block is as follows: - - +--------------+---------------+ - | Block_Header | Block_Content | - +--------------+---------------+ - | 3 bytes | n bytes | - +--------------+---------------+ - - - - - - - - - - - -Collet & Kucherawy Informational [Page 12] - -RFC 8478 application/zstd October 2018 - - - Block_Header uses 3 bytes, written using little-endian convention. - It contains three fields: - - +------------+------------+------------+ - | Last_Block | Block_Type | Block_Size | - +------------+------------+------------+ - | bit 0 | bits 1-2 | bits 3-23 | - +------------+------------+------------+ - -3.1.1.2.1. 
Last_Block - - The lowest bit (Last_Block) signals whether this block is the last - one. The frame will end after this last block. It may be followed - by an optional Content_Checksum (see Section 3.1.1). - -3.1.1.2.2. Block_Type - - The next 2 bits represent the Block_Type. There are four block - types: - - +-----------+------------------+ - | Value | Block_Type | - +-----------+------------------+ - | 0 | Raw_Block | - +-----------+------------------+ - | 1 | RLE_Block | - +-----------+------------------+ - | 2 | Compressed_Block | - +-----------+------------------+ - | 3 | Reserved | - +-----------+------------------+ - - Raw_Block: This is an uncompressed block. Block_Content contains - Block_Size bytes. - - RLE_Block: This is a single byte, repeated Block_Size times. - Block_Content consists of a single byte. On the decompression - side, this byte must be repeated Block_Size times. - - Compressed_Block: This is a compressed block as described in - Section 3.1.1.3. Block_Size is the length of Block_Content, - namely the compressed data. The decompressed size is not known, - but its maximum possible value is guaranteed (see below). - - Reserved: This is not a block. This value cannot be used with the - current specification. If such a value is present, it is - considered to be corrupt data. - - - - -Collet & Kucherawy Informational [Page 13] - -RFC 8478 application/zstd October 2018 - - -3.1.1.2.3. Block_Size - - The upper 21 bits of Block_Header represent the Block_Size. - Block_Size is the size of the block excluding the header. A block - can contain any number of bytes (even zero), up to - Block_Maximum_Decompressed_Size, which is the smallest of: - - o Window_Size - - o 128 KB - - A Compressed_Block has the extra restriction that Block_Size is - always strictly less than the decompressed size. If this condition - cannot be respected, the block must be sent uncompressed instead - (i.e., treated as a Raw_Block). - -3.1.1.3. 
Compressed Blocks - - To decompress a compressed block, the compressed size must be - provided from the Block_Size field within Block_Header. - - A compressed block consists of two sections: a Literals - Section (Section 3.1.1.3.1) and a - Sequences_Section (Section 3.1.1.3.2). The results of the two - sections are then combined to produce the decompressed data in - Sequence Execution (Section 3.1.1.4). - - To decode a compressed block, the following elements are necessary: - - o Previous decoded data, up to a distance of Window_Size, or the - beginning of the Frame, whichever is smaller. Single_Segment_Flag - will be set in the latter case. - - o List of "recent offsets" from the previous Compressed_Block. - - o The previous Huffman tree, required by Treeless_Literals_Block - type. - - o Previous Finite State Entropy (FSE) decoding tables, required by - Repeat_Mode, for each symbol type (literals lengths, match - lengths, offsets). - - Note that decoding tables are not always from the previous - Compressed_Block: - - o Every decoding table can come from a dictionary. - - - - - -Collet & Kucherawy Informational [Page 14] - -RFC 8478 application/zstd October 2018 - - - o The Huffman tree comes from the previous - Compressed_Literals_Block. - -3.1.1.3.1. Literals_Section_Header - - All literals are regrouped in the first part of the block. They can - be decoded first and then copied during Sequence Execution (see - Section 3.1.1.4), or they can be decoded on the flow during Sequence - Execution. - - Literals can be stored uncompressed or compressed using Huffman - prefix codes. When compressed, an optional tree description can be - present, followed by 1 or 4 streams. 
- - +----------------------------+ - | Literals_Section_Header | - +----------------------------+ - | [Huffman_Tree_Description] | - +----------------------------+ - | [Jump_Table] | - +----------------------------+ - | Stream_1 | - +----------------------------+ - | [Stream_2] | - +----------------------------+ - | [Stream_3] | - +----------------------------+ - | [Stream_4] | - +----------------------------+ - -3.1.1.3.1.1. Literals_Section_Header - - This field describes how literals are packed. It's a byte-aligned - variable-size bit field, ranging from 1 to 5 bytes, using little- - endian convention. - - +---------------------+-----------+ - | Literals_Block_Type | 2 bits | - +---------------------+-----------+ - | Size_Format | 1-2 bits | - +---------------------+-----------+ - | Regenerated_Size | 5-20 bits | - +---------------------+-----------+ - | [Compressed_Size] | 0-18 bits | - +---------------------+-----------+ - - In this representation, bits at the top are the lowest bits. - - - - -Collet & Kucherawy Informational [Page 15] - -RFC 8478 application/zstd October 2018 - - - The Literals_Block_Type field uses the two lowest bits of the first - byte, describing four different block types: - - +---------------------------+-------+ - | Literals_Block_Type | Value | - +---------------------------+-------+ - | Raw_Literals_Block | 0 | - +---------------------------+-------+ - | RLE_Literals_Block | 1 | - +---------------------------+-------+ - | Compressed_Literals_Block | 2 | - +---------------------------+-------+ - | Treeless_Literals_Block | 3 | - +---------------------------+-------+ - - Raw_Literals_Block: Literals are stored uncompressed. - Literals_Section_Content is Regenerated_Size. - - RLE_Literals_Block: Literals consist of a single-byte value repeated - Regenerated_Size times. Literals_Section_Content is 1. - - Compressed_Literals_Block: This is a standard Huffman-compressed - block, starting with a Huffman tree description. 
See details - below. Literals_Section_Content is Compressed_Size. - - Treeless_Literals_Block: This is a Huffman-compressed block, using - the Huffman tree from the previous Compressed_Literals_Block, or a - dictionary if there is no previous Huffman-compressed literals - block. Huffman_Tree_Description will be skipped. Note that if - this mode is triggered without any previous Huffman-table in the - frame (or dictionary, per Section 5), it should be treated as data - corruption. Literals_Section_Content is Compressed_Size. - - The Size_Format is divided into two families: - - o For Raw_Literals_Block and RLE_Literals_Block, it's only necessary - to decode Regenerated_Size. There is no Compressed_Size field. - - o For Compressed_Literals_Block and Treeless_Literals_Block, it's required to - decode both Compressed_Size and Regenerated_Size (the decompressed - size). It's also necessary to decode the number of streams (1 or - 4). - - For values spanning several bytes, the convention is little endian. - - Size_Format for Raw_Literals_Block and RLE_Literals_Block uses 1 or 2 - bits. Its value is (Literals_Section_Header[0]>>2) & 0x3. - - - - -Collet & Kucherawy Informational [Page 16] - -RFC 8478 application/zstd October 2018 - - - Size_Format == 00 or 10: Size_Format uses 1 bit. Regenerated_Size - uses 5 bits (values 0-31). Literals_Section_Header uses 1 byte. - Regenerated_Size = Literals_Section_Header[0]>>3. - - Size_Format == 01: Size_Format uses 2 bits. Regenerated_Size uses - 12 bits (values 0-4095). Literals_Section_Header uses 2 bytes. - Regenerated_Size = (Literals_Section_Header[0]>>4) + - (Literals_Section_Header[1]<<4). - - Size_Format == 11: Size_Format uses 2 bits. Regenerated_Size uses - 20 bits (values 0-1048575). Literals_Section_Header uses 3 bytes. - Regenerated_Size = (Literals_Section_Header[0]>>4) + - (Literals_Section_Header[1]<<4) + (Literals_Section_Header[2]<<12) - - Only Stream_1 is present for these cases.
Note that it is permitted - to represent a short value (for example, 13) using a long format, - even if it's less efficient. - - Size_Format for Compressed_Literals_Block and Treeless_Literals_Block - always uses 2 bits. - - Size_Format == 00: A single stream. Both Regenerated_Size and - Compressed_Size use 10 bits (values 0-1023). - Literals_Section_Header uses 3 bytes. - - Size_Format == 01: 4 streams. Both Regenerated_Size and - Compressed_Size use 10 bits (values 0-1023). - Literals_Section_Header uses 3 bytes. - - Size_Format == 10: 4 streams. Both Regenerated_Size and - Compressed_Size use 14 bits (values 0-16383). - Literals_Section_Header uses 4 bytes. - - Size_Format == 11: 4 streams. Both Regenerated_Size and - Compressed_Size use 18 bits (values 0-262143). - Literals_Section_Header uses 5 bytes. - - Both the Compressed_Size and Regenerated_Size fields follow little- - endian convention. Note that Compressed_Size includes the size of - the Huffman_Tree_Description when it is present. - -3.1.1.3.1.2. Raw_Literals_Block - - The data in Stream_1 is Regenerated_Size bytes long. It contains the - raw literals data to be used during Sequence Execution - (Section 3.1.1.3.2). - - - - - -Collet & Kucherawy Informational [Page 17] - -RFC 8478 application/zstd October 2018 - - -3.1.1.3.1.3. RLE_Literals_Block - - Stream_1 consists of a single byte that should be repeated - Regenerated_Size times to generate the decoded literals. - -3.1.1.3.1.4. Compressed_Literals_Block and Treeless_Literals_Block - - Both of these modes contain Huffman-encoded data. For - Treeless_Literals_Block, the Huffman table comes from the previously - compressed literals block, or from a dictionary; see Section 5. - -3.1.1.3.1.5. Huffman_Tree_Description - - This section is only present when the Literals_Block_Type type is - Compressed_Literals_Block (2). The format of - Huffman_Tree_Description can be found in Section 4.2.1. 
The size of - Huffman_Tree_Description is determined during the decoding process. - It must be used to determine where streams begin. - - Total_Streams_Size = Compressed_Size - - Huffman_Tree_Description_Size - -3.1.1.3.1.6. Jump_Table - - The Jump_Table is only present when there are 4 Huffman-coded - streams. - - (Reminder: Huffman-compressed data consists of either 1 or 4 Huffman- - coded streams.) - - If only 1 stream is present, it is a single bitstream occupying the - entire remaining portion of the literals block, encoded as described - within Section 4.2.2. - - If there are 4 streams, Literals_Section_Header only provides enough - information to know the decompressed and compressed sizes of all 4 - streams combined. The decompressed size of each stream is equal to - (Regenerated_Size+3)/4, except for the last stream, which may be up - to 3 bytes smaller, to reach a total decompressed size as specified - in Regenerated_Size. - - The compressed size of each stream is provided explicitly in the - Jump_Table. The Jump_Table is 6 bytes long and consists of three - 2-byte little-endian fields, describing the compressed sizes of the - first 3 streams. Stream4_Size is computed from Total_Streams_Size - minus sizes of other streams. - - - - - -Collet & Kucherawy Informational [Page 18] - -RFC 8478 application/zstd October 2018 - - - Stream4_Size = Total_Streams_Size - 6 - - Stream1_Size - Stream2_Size - - Stream3_Size - - Note that if Stream1_Size + Stream2_Size + Stream3_Size exceeds - Total_Streams_Size, the data are considered corrupted. - - Each of these 4 bitstreams is then decoded independently as a - Huffman-Coded stream, as described in Section 4.2.2. - -3.1.1.3.2. Sequences_Section - - A compressed block is a succession of sequences. A sequence is a - literal copy command, followed by a match copy command. A literal - copy command specifies a length. It is the number of bytes to be - copied (or extracted) from the Literals Section. 
A match copy - command specifies an offset and a length. - - When all sequences are decoded, if there are literals left in the - literals section, these bytes are added at the end of the block. - - This is described in more detail in Section 3.1.1.4. - - The Sequences_Section regroups all symbols required to decode - commands. There are three symbol types: literals lengths, offsets, - and match lengths. They are encoded together, interleaved, in a - single "bitstream". - - The Sequences_Section starts by a header, followed by optional - probability tables for each symbol type, followed by the bitstream. - - Sequences_Section_Header - [Literals_Length_Table] - [Offset_Table] - [Match_Length_Table] - bitStream - - To decode the Sequences_Section, it's necessary to know its size. - This size is deduced from the size of the Literals_Section: - Sequences_Section_Size = Block_Size - Literals_Section_Header - - Literals_Section_Content - - - - - - - - - - -Collet & Kucherawy Informational [Page 19] - -RFC 8478 application/zstd October 2018 - - -3.1.1.3.2.1. Sequences_Section_Header - - This header consists of two items: - - o Number_of_Sequences - - o Symbol_Compression_Modes - - Number_of_Sequences is a variable size field using between 1 and 3 - bytes. If the first byte is "byte0": - - o if (byte0 == 0): there are no sequences. The sequence section - stops here. Decompressed content is defined entirely as Literals - Section content. The FSE tables used in Repeat_Mode are not - updated. - - o if (byte0 < 128): Number_of_Sequences = byte0. Uses 1 byte. - - o if (byte0 < 255): Number_of_Sequences = ((byte0 - 128) << 8) + - byte1. Uses 2 bytes. - - o if (byte0 == 255): Number_of_Sequences = byte1 + (byte2 << 8) + - 0x7F00. Uses 3 bytes. - - Symbol_Compression_Modes is a single byte, defining the compression - mode of each symbol type. 
- - +-------------+----------------------+ - | Bit Number | Field Name | - +-------------+----------------------+ - | 7-6 | Literals_Lengths_Mode | - +-------------+----------------------+ - | 5-4 | Offsets_Mode | - +-------------+----------------------+ - | 3-2 | Match_Lengths_Mode | - +-------------+----------------------+ - | 1-0 | Reserved | - +-------------+----------------------+ - - The last field, Reserved, must be all zeroes. - - - - - - - - - - - -Collet & Kucherawy Informational [Page 20] - -RFC 8478 application/zstd October 2018 - - - Literals_Lengths_Mode, Offsets_Mode, and Match_Lengths_Mode define - the Compression_Mode of literals lengths, offsets, and match lengths - symbols, respectively. They follow the same enumeration: - - +-------+---------------------+ - | Value | Compression_Mode | - +-------+---------------------+ - | 0 | Predefined_Mode | - +-------+---------------------+ - | 1 | RLE_Mode | - +-------+---------------------+ - | 2 | FSE_Compressed_Mode | - +-------+---------------------+ - | 3 | Repeat_Mode | - +-------+---------------------+ - - Predefined_Mode: A predefined FSE (see Section 4.1) distribution - table is used, as defined in Section 3.1.1.3.2.2. No distribution - table will be present. - - RLE_Mode: The table description consists of a single byte, which - contains the symbol's value. This symbol will be used for all - sequences. - - FSE_Compressed_Mode: Standard FSE compression. A distribution table - will be present. The format of this distribution table is - described in Section 4.1.1. Note that the maximum allowed - accuracy log for literals length and match length tables is 9, and - the maximum accuracy log for the offsets table is 8. This mode - must not be used when only one symbol is present; RLE_Mode should - be used instead (although any other mode will work). 
- - Repeat_Mode: The table used in the previous Compressed_Block with - Number_Of_Sequences > 0 will be used again, or if this is the - first block, the table in the dictionary will be used. Note that - this includes RLE_Mode, so if Repeat_Mode follows RLE_Mode, the - same symbol will be repeated. It also includes Predefined_Mode, - in which case Repeat_Mode will have the same outcome as - Predefined_Mode. No distribution table will be present. If this - mode is used without any previous sequence table in the frame (or - dictionary; see Section 5) to repeat, this should be treated as - corruption. - - - - - - - - - -Collet & Kucherawy Informational [Page 21] - -RFC 8478 application/zstd October 2018 - - -3.1.1.3.2.1.1. Sequence Codes for Lengths and Offsets - - Each symbol is a code in its own context, which specifies Baseline - and Number_of_Bits to add. Codes are FSE compressed and interleaved - with raw additional bits in the same bitstream. - - Literals length codes are values ranging from 0 to 35 inclusive. - They define lengths from 0 to 131071 bytes. The literals length is - equal to the decoded Baseline plus the result of reading - Number_of_Bits bits from the bitstream, as a little-endian value. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 22] - -RFC 8478 application/zstd October 2018 - - - +----------------------+----------+----------------+ - | Literals_Length_Code | Baseline | Number_of_Bits | - +----------------------+----------+----------------+ - | 0-15 | length | 0 | - +----------------------+----------+----------------+ - | 16 | 16 | 1 | - +----------------------+----------+----------------+ - | 17 | 18 | 1 | - +----------------------+----------+----------------+ - | 18 | 20 | 1 | - +----------------------+----------+----------------+ - | 19 | 22 | 1 | - +----------------------+----------+----------------+ - | 20 | 24 | 2 | - +----------------------+----------+----------------+ - | 21 | 28 | 2 | - +----------------------+----------+----------------+ - | 22 | 32 | 3 | - +----------------------+----------+----------------+ - | 23 | 40 | 3 | - +----------------------+----------+----------------+ - | 24 | 48 | 4 | - +----------------------+----------+----------------+ - | 25 | 64 | 6 | - +----------------------+----------+----------------+ - | 26 | 128 | 7 | - +----------------------+----------+----------------+ - | 27 | 256 | 8 | - +----------------------+----------+----------------+ - | 28 | 512 | 9 | - +----------------------+----------+----------------+ - | 29 | 1024 | 10 | - +----------------------+----------+----------------+ - | 30 | 2048 | 11 | - +----------------------+----------+----------------+ - | 31 | 4096 | 12 | - +----------------------+----------+----------------+ - | 32 | 8192 | 13 | - +----------------------+----------+----------------+ - | 33 | 16384 | 14 | - +----------------------+----------+----------------+ - | 34 | 32768 | 15 | - +----------------------+----------+----------------+ - | 35 | 65536 | 16 | - +----------------------+----------+----------------+ - - - - - - -Collet & Kucherawy Informational [Page 23] - -RFC 8478 application/zstd October 
2018 - - - Match length codes are values ranging from 0 to 52 inclusive. They - define lengths from 3 to 131074 bytes. The match length is equal to - the decoded Baseline plus the result of reading Number_of_Bits bits - from the bitstream, as a little-endian value. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 24] - -RFC 8478 application/zstd October 2018 - - - +-------------------+-----------------------+----------------+ - | Match_Length_Code | Baseline | Number_of_Bits | - +-------------------+-----------------------+----------------+ - | 0-31 | Match_Length_Code + 3 | 0 | - +-------------------+-----------------------+----------------+ - | 32 | 35 | 1 | - +-------------------+-----------------------+----------------+ - | 33 | 37 | 1 | - +-------------------+-----------------------+----------------+ - | 34 | 39 | 1 | - +-------------------+-----------------------+----------------+ - | 35 | 41 | 1 | - +-------------------+-----------------------+----------------+ - | 36 | 43 | 2 | - +-------------------+-----------------------+----------------+ - | 37 | 47 | 2 | - +-------------------+-----------------------+----------------+ - | 38 | 51 | 3 | - +-------------------+-----------------------+----------------+ - | 39 | 59 | 3 | - +-------------------+-----------------------+----------------+ - | 40 | 67 | 4 | - +-------------------+-----------------------+----------------+ - | 41 | 83 | 4 | - +-------------------+-----------------------+----------------+ - | 42 | 99 | 5 | - +-------------------+-----------------------+----------------+ - | 43 | 131 | 7 | - +-------------------+-----------------------+----------------+ - | 44 | 259 | 8 | - +-------------------+-----------------------+----------------+ - | 45 | 515 | 9 | - +-------------------+-----------------------+----------------+ - | 46 | 1027 | 10 | - +-------------------+-----------------------+----------------+ - | 47 | 
2051 | 11 | - +-------------------+-----------------------+----------------+ - | 48 | 4099 | 12 | - +-------------------+-----------------------+----------------+ - | 49 | 8195 | 13 | - +-------------------+-----------------------+----------------+ - | 50 | 16387 | 14 | - +-------------------+-----------------------+----------------+ - | 51 | 32771 | 15 | - +-------------------+-----------------------+----------------+ - | 52 | 65539 | 16 | - +-------------------+-----------------------+----------------+ - - - - -Collet & Kucherawy Informational [Page 25] - -RFC 8478 application/zstd October 2018 - - - Offset codes are values ranging from 0 to N. - - A decoder is free to limit its maximum supported value for N. - Support for values of at least 22 is recommended. At the time of - this writing, the reference decoder supports a maximum N value of 31. - - An offset code is also the number of additional bits to read in - little-endian fashion and can be translated into an Offset_Value - using the following formulas: - - Offset_Value = (1 << offsetCode) + readNBits(offsetCode); - if (Offset_Value > 3) Offset = Offset_Value - 3; - - This means that maximum Offset_Value is (2^(N+1))-1, supporting back- - reference distance up to (2^(N+1))-4, but it is limited by the - maximum back-reference distance (see Section 3.1.1.1.2). - - Offset_Value from 1 to 3 are special: they define "repeat codes". - This is described in more detail in Section 3.1.1.5. - -3.1.1.3.2.1.2. Decoding Sequences - - FSE bitstreams are read in reverse of the direction they are written. - In zstd, the compressor writes bits forward into a block, and the - decompressor must read the bitstream backwards. - - To find the start of the bitstream, it is therefore necessary to know - the offset of the last byte of the block, which can be found by - counting Block_Size bytes after the block header. 
- - After writing the last bit containing information, the compressor - writes a single 1 bit and then fills the byte with 0-7 zero bits of - padding. The last byte of the compressed bitstream cannot be zero - for that reason. - - When decompressing, the last byte containing the padding is the first - byte to read. The decompressor needs to skip 0-7 initial zero bits - until the first 1 bit occurs. Afterwards, the useful part of the - bitstream begins. - - FSE decoding requires a 'state' to be carried from symbol to symbol. - For more explanation on FSE decoding, see Section 4.1. - - For sequence decoding, a separate state keeps track of each literal - lengths, offsets, and match lengths symbols. Some FSE primitives are - also used. For more details on the operation of these primitives, - see Section 4.1. - - - - -Collet & Kucherawy Informational [Page 26] - -RFC 8478 application/zstd October 2018 - - - The bitstream starts with initial FSE state values, each using the - required number of bits in their respective accuracy, decoded - previously from their normalized distribution. It starts with - Literals_Length_State, followed by Offset_State, and finally - Match_Length_State. - - Note that all values are read backward, so the 'start' of the - bitstream is at the highest position in memory, immediately before - the last 1 bit for padding. - - After decoding the starting states, a single sequence is decoded - Number_Of_Sequences times. These sequences are decoded in order from - first to last. Since the compressor writes the bitstream in the - forward direction, this means the compressor must encode the - sequences starting with the last one and ending with the first. - - For each of the symbol types, the FSE state can be used to determine - the appropriate code. The code then defines the Baseline and - Number_of_Bits to read for each type. The description of the codes - for how to determine these values can be found in - Section 3.1.1.3.2.1. 
- - Decoding starts by reading the Number_of_Bits required to decode - offset. It does the same for Match_Length and then for - Literals_Length. This sequence is then used for Sequence Execution - (see Section 3.1.1.4). - - If it is not the last sequence in the block, the next operation is to - update states. Using the rules pre-calculated in the decoding - tables, Literals_Length_State is updated, followed by - Match_Length_State, and then Offset_State. See Section 4.1 for - details on how to update states from the bitstream. - - This operation will be repeated Number_of_Sequences times. At the - end, the bitstream shall be entirely consumed; otherwise, the - bitstream is considered corrupted. - -3.1.1.3.2.2. Default Distributions - - If Predefined_Mode is selected for a symbol type, its FSE decoding - table is generated from a predefined distribution table defined here. - For details on how to convert this distribution into a decoding - table, see Section 4.1. - - - - - - - - -Collet & Kucherawy Informational [Page 27] - -RFC 8478 application/zstd October 2018 - - -3.1.1.3.2.2.1. Literals Length - - The decoding table uses an accuracy log of 6 bits (64 states). - - short literalsLength_defaultDistribution[36] = - { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, - -1,-1,-1,-1 - }; - -3.1.1.3.2.2.2. Match Length - - The decoding table uses an accuracy log of 6 bits (64 states). - - short matchLengths_defaultDistribution[53] = - { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1, - -1,-1,-1,-1,-1 - }; - -3.1.1.3.2.2.3. Offset Codes - - The decoding table uses an accuracy log of 5 bits (32 states), and - supports a maximum N value of 28, allowing offset values up to - 536,870,908. 
- - If any sequence in the compressed block requires a larger offset than - this, it's not possible to use the default distribution to represent - it. - - short offsetCodes_defaultDistribution[29] = - { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 - }; - -3.1.1.4. Sequence Execution - - Once literals and sequences have been decoded, they are combined to - produce the decoded content of a block. - - Each sequence consists of a tuple of (literals_length, offset_value, - match_length), decoded as described in the - Sequences_Section (Section 3.1.1.3.2). To execute a sequence, first - copy literals_length bytes from the decoded literals to the output. - - - - - - -Collet & Kucherawy Informational [Page 28] - -RFC 8478 application/zstd October 2018 - - - Then, match_length bytes are copied from previous decoded data. The - offset to copy from is determined by offset_value: - - o if Offset_Value > 3, then the offset is Offset_Value - 3; - - o if Offset_Value is from 1-3, the offset is a special repeat offset - value. See Section 3.1.1.5 for how the offset is determined in - this case. - - The offset is defined as from the current position (after copying the - literals), so an offset of 6 and a match length of 3 means that 3 - bytes should be copied from 6 bytes back. Note that all offsets - leading to previously decoded data must be smaller than Window_Size - defined in Frame_Header_Descriptor (Section 3.1.1.1.1). - -3.1.1.5. Repeat Offsets - - As seen above, the first three values define a repeated offset; we - will call them Repeated_Offset1, Repeated_Offset2, and - Repeated_Offset3. They are sorted in recency order, with - Repeated_Offset1 meaning "most recent one". - - If offset_value is 1, then the offset used is Repeated_Offset1, etc. 
- - There is one exception: When the current sequence's literals_length - is 0, repeated offsets are shifted by 1, so an offset_value of 1 - means Repeated_Offset2, an offset_value of 2 means Repeated_Offset3, - and an offset_value of 3 means Repeated_Offset1 - 1_byte. - - For the first block, the starting offset history is populated with - the following values: Repeated_Offset1 (1), Repeated_Offset2 (4), and - Repeated_Offset3 (8), unless a dictionary is used, in which case they - come from the dictionary. - - Then each block gets its starting offset history from the ending - values of the most recent Compressed_Block. Note that blocks that - are not Compressed_Block are skipped; they do not contribute to - offset history. - - The newest offset takes the lead in offset history, shifting others - back (up to its previous place if it was already present). This - means that when Repeated_Offset1 (most recent) is used, history is - unmodified. When Repeated_Offset2 is used, it is swapped with - Repeated_Offset1. If any other offset is used, it becomes - Repeated_Offset1, and the rest are shifted back by 1. - - - - - - -Collet & Kucherawy Informational [Page 29] - -RFC 8478 application/zstd October 2018 - - -3.1.2. Skippable Frames - - +--------------+------------+-----------+ - | Magic_Number | Frame_Size | User_Data | - +--------------+------------+-----------+ - | 4 bytes | 4 bytes | n bytes | - +--------------+------------+-----------+ - - Skippable frames allow the insertion of user-defined metadata into a - flow of concatenated frames. - - Skippable frames defined in this specification are compatible with - skippable frames in [LZ4]. - - From a compliant decoder perspective, skippable frames simply need to - be skipped, and their content ignored, resuming decoding after the - skippable frame. 
- - It should be noted that a skippable frame can be used to watermark a - stream of concatenated frames embedding any kind of tracking - information (even just a Universally Unique Identifier (UUID)). - Users wary of such possibility should scan the stream of concatenated - frames in an attempt to detect such frames for analysis or removal. - - The fields are: - - Magic_Number: 4 bytes, little-endian format. Value: 0x184D2A5?, - which means any value from 0x184D2A50 to 0x184D2A5F. All 16 - values are valid to identify a skippable frame. This - specification does not detail any specific tagging methods for - skippable frames. - - Frame_Size: This is the size, in bytes, of the following User_Data - (without including the magic number nor the size field itself). - This field is represented using 4 bytes, little-endian format, - unsigned 32 bits. This means User_Data can't be bigger than - (2^32-1) bytes. - - User_Data: This field can be anything. Data will just be skipped by - the decoder. - -4. Entropy Encoding - - Two types of entropy encoding are used by the Zstandard format: FSE - and Huffman coding. Huffman is used to compress literals, while FSE - is used for all other symbols (Literals_Length_Code, - Match_Length_Code, and offset codes) and to compress Huffman headers. - - - - -Collet & Kucherawy Informational [Page 30] - -RFC 8478 application/zstd October 2018 - - -4.1. FSE - - FSE, short for Finite State Entropy, is an entropy codec based on - [ANS]. FSE encoding/decoding involves a state that is carried over - between symbols, so decoding must be done in the opposite direction - as encoding. Therefore, all FSE bitstreams are read from end to - beginning. Note that the order of the bits in the stream is not - reversed; they are simply read in the reverse order from which they - were written. - - For additional details on FSE, see Finite State Entropy [FSE]. 
- - FSE decoding involves a decoding table that has a power of 2 size and - contains three elements: Symbol, Num_Bits, and Baseline. The base 2 - logarithm of the table size is its Accuracy_Log. An FSE state value - represents an index in this table. - - To obtain the initial state value, consume Accuracy_Log bits from the - stream as a little-endian value. The next symbol in the stream is - the Symbol indicated in the table for that state. To obtain the next - state value, the decoder should consume Num_Bits bits from the stream - as a little-endian value and add it to Baseline. - -4.1.1. FSE Table Description - - To decode FSE streams, it is necessary to construct the decoding - table. The Zstandard format encodes FSE table descriptions as - described here. - - An FSE distribution table describes the probabilities of all symbols - from 0 to the last present one (included) on a normalized scale of - (1 << Accuracy_Log). Note that there must be two or more symbols - with non-zero probability. - - A bitstream is read forward, in little-endian fashion. It is not - necessary to know its exact size, since the size will be discovered - and reported by the decoding process. The bitstream starts by - reporting on which scale it operates. If low4bits designates the - lowest 4 bits of the first byte, then Accuracy_Log = low4bits + 5. - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 31] - -RFC 8478 application/zstd October 2018 - - - This is followed by each symbol value, from 0 to the last present - one. The number of bits used by each field is variable and depends - on: - - Remaining probabilities + 1: For example, presuming an Accuracy_Log - of 8, and presuming 100 probabilities points have already been - distributed, the decoder may read any value from 0 to - (256 - 100 + 1) == 157, inclusive. Therefore, it must read - log2sup(157) == 8 bits. - - Value decoded: Small values use 1 fewer bit. 
For example, presuming - values from 0 to 157 (inclusive) are possible, 255 - 157 = 98 - values are remaining in an 8-bit field. The first 98 values - (hence from 0 to 97) use only 7 bits, and values from 98 to 157 - use 8 bits. This is achieved through this scheme: - - +------------+---------------+-----------+ - | Value Read | Value Decoded | Bits Used | - +------------+---------------+-----------+ - | 0 - 97 | 0 - 97 | 7 | - +------------+---------------+-----------+ - | 98 - 127 | 98 - 127 | 8 | - +------------+---------------+-----------+ - | 128 - 225 | 0 - 97 | 7 | - +------------+---------------+-----------+ - | 226 - 255 | 128 - 157 | 8 | - +------------+---------------+-----------+ - - Symbol probabilities are read one by one, in order. The probability - is obtained from Value decoded using the formula P = Value - 1. This - means the value 0 becomes the negative probability -1. This is a - special probability that means "less than 1". Its effect on the - distribution table is described below. For the purpose of - calculating total allocated probability points, it counts as 1. - - When a symbol has a probability of zero, it is followed by a 2-bit - repeat flag. This repeat flag tells how many probabilities of zeroes - follow the current one. It provides a number ranging from 0 to 3. - If it is a 3, another 2-bit repeat flag follows, and so on. - - When the last symbol reaches a cumulated total of - (1 << Accuracy_Log), decoding is complete. If the last symbol makes - the cumulated total go above (1 << Accuracy_Log), distribution is - considered corrupted. - - - - - - - -Collet & Kucherawy Informational [Page 32] - -RFC 8478 application/zstd October 2018 - - - Finally, the decoder can tell how many bytes were used in this - process and how many symbols are present. The bitstream consumes a - round number of bytes. Any remaining bit within the last byte is - simply unused. 
- - The distribution of normalized probabilities is enough to create a - unique decoding table. The table has a size of (1 << Accuracy_Log). - Each cell describes the symbol decoded and instructions to get the - next state. - - Symbols are scanned in their natural order for "less than 1" - probabilities as described above. Symbols with this probability are - being attributed a single cell, starting from the end of the table - and retreating. These symbols define a full state reset, reading - Accuracy_Log bits. - - All remaining symbols are allocated in their natural order. Starting - from symbol 0 and table position 0, each symbol gets allocated as - many cells as its probability. Cell allocation is spread, not - linear; each successor position follows this rule: - - position += (tableSize >> 1) + (tableSize >> 3) + 3; - position &= tableSize - 1; - - A position is skipped if it is already occupied by a "less than 1" - probability symbol. Position does not reset between symbols; it - simply iterates through each position in the table, switching to the - next symbol when enough states have been allocated to the current - one. - - The result is a list of state values. Each state will decode the - current symbol. - - To get the Number_of_Bits and Baseline required for the next state, - it is first necessary to sort all states in their natural order. The - lower states will need 1 more bit than higher ones. The process is - repeated for each symbol. - - For example, presuming a symbol has a probability of 5, it receives - five state values. States are sorted in natural order. The next - power of 2 is 8. The space of probabilities is divided into 8 equal - parts. Presuming the Accuracy_Log is 7, this defines 128 states, and - each share (divided by 8) is 16 in size. In order to reach 8, 8 - 5 - = 3 lowest states will count "double", doubling the number of shares - (32 in width), requiring 1 more bit in the process. 
- - - - - - -Collet & Kucherawy Informational [Page 33] - -RFC 8478 application/zstd October 2018 - - - Baseline is assigned starting from the higher states using fewer - bits, and proceeding naturally, then resuming at the first state, - each taking its allocated width from Baseline. - - +----------------+-------+-------+--------+------+-------+ - | state order | 0 | 1 | 2 | 3 | 4 | - +----------------+-------+-------+--------+------+-------+ - | width | 32 | 32 | 32 | 16 | 16 | - +----------------+-------+-------+--------+------+-------+ - | Number_of_Bits | 5 | 5 | 5 | 4 | 4 | - +----------------+-------+-------+--------+------+-------+ - | range number | 2 | 4 | 6 | 0 | 1 | - +----------------+-------+-------+--------+------+-------+ - | Baseline | 32 | 64 | 96 | 0 | 16 | - +----------------+-------+-------+--------+------+-------+ - | range | 32-63 | 64-95 | 96-127 | 0-15 | 16-31 | - +----------------+-------+-------+--------+------+-------+ - - The next state is determined from the current state by reading the - required Number_of_Bits and adding the specified Baseline. - - See Appendix A for the results of this process that are applied to - the default distributions. - -4.2. Huffman Coding - - Zstandard Huffman-coded streams are read backwards, similar to the - FSE bitstreams. Therefore, to find the start of the bitstream, it is - necessary to know the offset of the last byte of the Huffman-coded - stream. - - After writing the last bit containing information, the compressor - writes a single 1 bit and then fills the byte with 0-7 0 bits of - padding. The last byte of the compressed bitstream cannot be 0 for - that reason. - - When decompressing, the last byte containing the padding is the first - byte to read. The decompressor needs to skip 0-7 initial 0 bits and - the first 1 bit that occurs. Afterwards, the useful part of the - bitstream begins. 
- - The bitstream contains Huffman-coded symbols in little-endian order, - with the codes defined by the method below. - - - - - - - - -Collet & Kucherawy Informational [Page 34] - -RFC 8478 application/zstd October 2018 - - -4.2.1. Huffman Tree Description - - Prefix coding represents symbols from an a priori known alphabet by - bit sequences (codewords), one codeword for each symbol, in a manner - such that different symbols may be represented by bit sequences of - different lengths, but a parser can always parse an encoded string - unambiguously symbol by symbol. - - Given an alphabet with known symbol frequencies, the Huffman - algorithm allows the construction of an optimal prefix code using the - fewest bits of any possible prefix codes for that alphabet. - - The prefix code must not exceed a maximum code length. More bits - improve accuracy but yield a larger header size and require more - memory or more complex decoding operations. This specification - limits the maximum code length to 11 bits. - - All literal values from zero (included) to the last present one - (excluded) are represented by Weight with values from 0 to - Max_Number_of_Bits. Transformation from Weight to Number_of_Bits - follows this pseudocode: - - if Weight == 0 - Number_of_Bits = 0 - else - Number_of_Bits = Max_Number_of_Bits + 1 - Weight - - The last symbol's Weight is deduced from previously decoded ones, by - completing to the nearest power of 2. This power of 2 gives - Max_Number_of_Bits the depth of the current tree. 
- - For example, presume the following Huffman tree must be described: - - +---------------+----------------+ - | Literal Value | Number_of_Bits | - +---------------+----------------+ - | 0 | 1 | - +---------------+----------------+ - | 1 | 2 | - +---------------+----------------+ - | 2 | 3 | - +---------------+----------------+ - | 3 | 0 | - +---------------+----------------+ - | 4 | 4 | - +---------------+----------------+ - | 5 | 4 | - +---------------+----------------+ - - - -Collet & Kucherawy Informational [Page 35] - -RFC 8478 application/zstd October 2018 - - - The tree depth is 4, since its longest element uses 4 bits. (The - longest elements are those with the smallest frequencies.) Value 5 - will not be listed as it can be determined from the values for 0-4, - nor will values above 5 as they are all 0. Values from 0 to 4 will - be listed using Weight instead of Number_of_Bits. The pseudocode to - determine Weight is: - - if Number_of_Bits == 0 - Weight = 0 - else - Weight = Max_Number_of_Bits + 1 - Number_of_Bits - - It gives the following series of weights: - - +---------------+--------+ - | Literal Value | Weight | - +---------------+--------+ - | 0 | 4 | - +---------------+--------+ - | 1 | 3 | - +---------------+--------+ - | 2 | 2 | - +---------------+--------+ - | 3 | 0 | - +---------------+--------+ - | 4 | 1 | - +---------------+--------+ - - The decoder will do the inverse operation: having collected weights - of literals from 0 to 4, it knows the last literal, 5, is present - with a non-zero Weight. The Weight of 5 can be determined by - advancing to the next power of 2. The sum of 2^(Weight-1) (excluding - 0's) is 15. The nearest power of 2 is 16. Therefore, - Max_Number_of_Bits = 4 and Weight[5] = 16 - 15 = 1. - -4.2.1.1. Huffman Tree Header - - This is a single byte value (0-255), which describes how the series - of weights is encoded. - - headerByte < 128: The series of weights is compressed using FSE (see - below). 
The length of the FSE-compressed series is equal to - headerByte (0-127). - - - - - - - - -Collet & Kucherawy Informational [Page 36] - -RFC 8478 application/zstd October 2018 - - - headerByte >= 128: This is a direct representation, where each - Weight is written directly as a 4-bit field (0-15). They are - encoded forward, 2 weights to a byte with the first weight taking - the top 4 bits and the second taking the bottom 4; for example, - the following operations could be used to read the weights: - - Weight[0] = (Byte[0] >> 4) - Weight[1] = (Byte[0] & 0xf), - etc. - - The full representation occupies ceiling(Number_of_Symbols/2) - bytes, meaning it uses only full bytes even if Number_of_Symbols - is odd. Number_of_Symbols = headerByte - 127. Note that maximum - Number_of_Symbols is 255 - 127 = 128. If any literal has a value - over 128, raw header mode is not possible, and it is necessary to - use FSE compression. - -4.2.1.2. FSE Compression of Huffman Weights - - In this case, the series of Huffman weights is compressed using FSE - compression. It is a single bitstream with two interleaved states, - sharing a single distribution table. - - To decode an FSE bitstream, it is necessary to know its compressed - size. Compressed size is provided by headerByte. It's also - necessary to know its maximum possible decompressed size, which is - 255, since literal values span from 0 to 255, and the last symbol's - Weight is not represented. - - An FSE bitstream starts by a header, describing probabilities - distribution. It will create a decoding table. For a list of - Huffman weights, the maximum accuracy log is 6 bits. For more - details, see Section 4.1.1. - - The Huffman header compression uses two states, which share the same - FSE distribution table. The first state (State1) encodes the even- - numbered index symbols, and the second (State2) encodes the odd- - numbered index symbols. 
State1 is initialized first, and then - State2, and they take turns decoding a single symbol and updating - their state. For more details on these FSE operations, see - Section 4.1. - - The number of symbols to be decoded is determined by tracking the - bitStream overflow condition: If updating state after decoding a - symbol would require more bits than remain in the stream, it is - assumed that extra bits are zero. Then, symbols for each of the - final states are decoded and the process is complete. - - - - -Collet & Kucherawy Informational [Page 37] - -RFC 8478 application/zstd October 2018 - - -4.2.1.3. Conversion from Weights to Huffman Prefix Codes - - All present symbols will now have a Weight value. It is possible to - transform weights into Number_of_Bits, using this formula: - - if Weight > 0 - Number_of_Bits = Max_Number_of_Bits + 1 - Weight - else - Number_of_Bits = 0 - - Symbols are sorted by Weight. Within the same Weight, symbols keep - natural sequential order. Symbols with a Weight of zero are removed. - Then, starting from the lowest Weight, prefix codes are distributed - in sequential order. 
- - For example, assume the following list of weights has been decoded: - - +---------+--------+ - | Literal | Weight | - +---------+--------+ - | 0 | 4 | - +---------+--------+ - | 1 | 3 | - +---------+--------+ - | 2 | 2 | - +---------+--------+ - | 3 | 0 | - +---------+--------+ - | 4 | 1 | - +---------+--------+ - | 5 | 1 | - +---------+--------+ - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 38] - -RFC 8478 application/zstd October 2018 - - - Sorting by weight and then the natural sequential order yields the - following distribution: - - +---------+--------+----------------+--------------+ - | Literal | Weight | Number_Of_Bits | Prefix Codes | - +---------+--------+----------------|--------------+ - | 3 | 0 | 0 | N/A | - +---------+--------+----------------|--------------+ - | 4 | 1 | 4 | 0000 | - +---------+--------+----------------|--------------+ - | 5 | 1 | 4 | 0001 | - +---------+--------+----------------|--------------+ - | 2 | 2 | 3 | 001 | - +---------+--------+----------------|--------------+ - | 1 | 3 | 2 | 01 | - +---------+--------+----------------|--------------+ - | 0 | 4 | 1 | 1 | - +---------+--------+----------------|--------------+ - -4.2.2. Huffman-Coded Streams - - Given a Huffman decoding table, it is possible to decode a Huffman- - coded stream. - - Each bitstream must be read backward, which starts from the end and - goes up to the beginning. Therefore, it is necessary to know the - size of each bitstream. - - It is also necessary to know exactly which bit is the last. This is - detected by a final bit flag: the highest bit of the last byte is a - final-bit-flag. Consequently, a last byte of 0 is not possible. And - the final-bit-flag itself is not part of the useful bitstream. - Hence, the last byte contains between 0 and 7 useful bits. - - Starting from the end, it is possible to read the bitstream in a - little-endian fashion, keeping track of already used bits. 
Since the - bitstream is encoded in reverse order, starting from the end, read - symbols in forward order. - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 39] - -RFC 8478 application/zstd October 2018 - - - For example, if the literal sequence "0145" was encoded using the - above prefix code, it would be encoded (in reverse order) as: - - +---------+----------+ - | Symbol | Encoding | - +---------+----------+ - | 5 | 0000 | - +---------+----------+ - | 4 | 0001 | - +---------+----------+ - | 1 | 01 | - +---------+----------+ - | 0 | 1 | - +---------+----------+ - | Padding | 00001 | - +---------+----------+ - - This results in the following 2-byte bitstream: - - 00010000 00001101 - - Here is an alternative representation with the symbol codes separated - by underscores: - - 0001_0000 00001_1_01 - - Reading the highest Max_Number_of_Bits bits, it's possible to compare - the extracted value to the decoding table, determining the symbol to - decode and number of bits to discard. - - The process continues reading up to the required number of symbols - per stream. If a bitstream is not entirely and exactly consumed, - hence reaching exactly its beginning position with all bits consumed, - the decoding process is considered faulty. - -5. Dictionary Format - - Zstandard is compatible with "raw content" dictionaries, free of any - format restriction, except that they must be at least 8 bytes. These - dictionaries function as if they were just the content part of a - formatted dictionary. - - However, dictionaries created by "zstd --train" in the reference - implementation follow a specific format, described here. - - Dictionaries are not included in the compressed content but rather - are provided out of band. 
That is, the Dictionary_ID identifies - which should be used, but this specification does not describe the - - - -Collet & Kucherawy Informational [Page 40] - -RFC 8478 application/zstd October 2018 - - - mechanism by which the dictionary is obtained prior to use during - compression or decompression. - - A dictionary has a size, defined either by a buffer limit or a file - size. The general format is: - - +--------------+---------------+----------------+---------+ - | Magic_Number | Dictionary_ID | Entropy_Tables | Content | - +--------------+---------------+----------------+---------+ - - Magic_Number: 4 bytes ID, value 0xEC30A437, little-endian format. - - Dictionary_ID: 4 bytes, stored in little-endian format. - Dictionary_ID can be any value, except 0 (which means no - Dictionary_ID). It is used by decoders to check if they use the - correct dictionary. If the frame is going to be distributed in a - private environment, any Dictionary_ID can be used. However, for - public distribution of compressed frames, the following ranges are - reserved and shall not be used: - - low range: <= 32767 - high range: >= (2^31) - - Entropy_Tables: Follow the same format as the tables in compressed - blocks. See the relevant FSE and Huffman sections for how to - decode these tables. They are stored in the following order: - Huffman table for literals, FSE table for offsets, FSE table for - match lengths, and FSE table for literals lengths. These tables - populate the Repeat Stats literals mode and Repeat distribution - mode for sequence decoding. It is finally followed by 3 offset - values, populating repeat offsets (instead of using {1,4,8}), - stored in order, 4-bytes little-endian each, for a total of 12 - bytes. Each repeat offset must have a value less than the - dictionary size. - - Content: The rest of the dictionary is its content. The content - acts as a "past" in front of data to be compressed or - decompressed, so it can be referenced in sequence commands. 
As - long as the amount of data decoded from this frame is less than or - equal to Window_Size, sequence commands may specify offsets longer - than the total length of decoded output so far to reference back - to the dictionary, even parts of the dictionary with offsets - larger than Window_Size. After the total output has surpassed - Window_Size, however, this is no longer allowed, and the - dictionary is no longer accessible. - - - - - - -Collet & Kucherawy Informational [Page 41] - -RFC 8478 application/zstd October 2018 - - -6. IANA Considerations - - IANA has made two registrations, as described below. - -6.1. The 'application/zstd' Media Type - - The 'application/zstd' media type identifies a block of data that is - compressed using zstd compression. The data is a stream of bytes as - described in this document. IANA has added the following to the - "Media Types" registry: - - Type name: application - - Subtype name: zstd - - Required parameters: N/A - - Optional parameters: N/A - - Encoding considerations: binary - - Security considerations: See Section 7 of RFC 8478 - - Interoperability considerations: N/A - - Published specification: RFC 8478 - - Applications that use this media type: anywhere data size is an - issue - - Additional information: - - Magic number(s): 4 bytes, little-endian format. - Value: 0xFD2FB528 - - File extension(s): zst - - Macintosh file type code(s): N/A - - For further information: See [ZSTD] - - Intended usage: common - - Restrictions on usage: N/A - - Author: Murray S. Kucherawy - - Change Controller: IETF - - - -Collet & Kucherawy Informational [Page 42] - -RFC 8478 application/zstd October 2018 - - - Provisional registration: no - -6.2. 
Content Encoding - - IANA has added the following entry to the "HTTP Content Coding - Registry" within the "Hypertext Transfer Protocol (HTTP) Parameters" - registry: - - Name: zstd - - Description: A stream of bytes compressed using the Zstandard - protocol - - Pointer to specification text: RFC 8478 - -6.3. Dictionaries - - Work in progress includes development of dictionaries that will - optimize compression and decompression of particular types of data. - Specification of such dictionaries for public use will necessitate - registration of a code point from the reserved range described in - Section 3.1.1.1.3 and its association with a specific dictionary. - - However, there are at present no such dictionaries published for - public use, so this document makes no immediate request of IANA to - create such a registry. - -7. Security Considerations - - Any data compression method involves the reduction of redundancy in - the data. Zstandard is no exception, and the usual precautions - apply. - - One should never compress a message whose content must remain secret - with a message generated by a third party. Such a compression can be - used to guess the content of the secret message through analysis of - entropy reduction. This was demonstrated in the Compression Ratio - Info-leak Made Easy (CRIME) attack [CRIME], for example. - - A decoder has to demonstrate capabilities to detect and prevent any - kind of data tampering in the compressed frame from triggering system - faults, such as reading or writing beyond allowed memory ranges. - This can be guaranteed by either the implementation language or - careful bound checkings. Of particular note is the encoding of - Number_of_Sequences values that cause the decoder to read into the - block header (and beyond), as well as the indication of a - Frame_Content_Size that is smaller than the actual decompressed data, - in an attempt to trigger a buffer overflow. 
It is highly recommended - - - -Collet & Kucherawy Informational [Page 43] - -RFC 8478 application/zstd October 2018 - - - to fuzz-test (i.e., provide invalid, unexpected, or random input and - verify safe operation of) decoder implementations to test and harden - their capability to detect bad frames and deal with them without any - adverse system side effect. - - An attacker may provide correctly formed compressed frames with - unreasonable memory requirements. A decoder must always control - memory requirements and enforce some (system-specific) limits in - order to protect memory usage from such scenarios. - - Compression can be optimized by training a dictionary on a variety of - related content payloads. This dictionary must then be available at - the decoder for decompression of the payload to be possible. While - this document does not specify how to acquire a dictionary for a - given compressed payload, it is worth noting that third-party - dictionaries may interact unexpectedly with a decoder, leading to - possible memory or other resource exhaustion attacks. We expect such - topics to be discussed in further detail in the Security - Considerations section of a forthcoming RFC for dictionary - acquisition and transmission, but highlight this issue now out of an - abundance of caution. - - As discussed in Section 3.1.2, it is possible to store arbitrary user - metadata in skippable frames. While such frames are ignored during - decompression of the data, they can be used as a watermark to track - the path of the compressed payload. - -8. Implementation Status - - Source code for a C language implementation of a Zstandard-compliant - library is available at [ZSTD-GITHUB]. This implementation is - considered to be the reference implementation and is production - ready; it implements the full range of the specification. It is - routinely tested against security hazards and widely deployed within - Facebook infrastructure. 
- - The reference version is optimized for speed and is highly portable. - It has been proven to run safely on multiple architectures (e.g., - x86, x64, ARM, MIPS, PowerPC, IA64) featuring 32- or 64-bit - addressing schemes, a little- or big-endian storage scheme, a number - of different operating systems (e.g., UNIX (including Linux, BSD, - OS-X, and Solaris) and Windows), and a number of compilers (e.g., - gcc, clang, visual, and icc). - - - - - - - - -Collet & Kucherawy Informational [Page 44] - -RFC 8478 application/zstd October 2018 - - -9. References - -9.1. Normative References - - [ZSTD] "Zstandard", <http://www.zstd.net>. - -9.2. Informative References - - [ANS] Duda, J., "Asymmetric numeral systems: entropy coding - combining speed of Huffman coding with compression rate of - arithmetic coding", January 2014, - <https://arxiv.org/pdf/1311.2540>. - - [CRIME] "CRIME", June 2018, <https://en.wikipedia.org/w/ - index.php?title=CRIME&oldid=844538656>. - - [FSE] "FiniteStateEntropy", commit 6efa78a, June 2018, - <https://github.com/Cyan4973/FiniteStateEntropy/>. - - [LZ4] "LZ4 Frame Format Description", commit d03224b, January - 2018, <https://github.com/lz4/lz4/blob/master/doc/ - lz4_Frame_format.md>. - - [RFC1952] Deutsch, P., "GZIP file format specification version 4.3", - RFC 1952, DOI 10.17487/RFC1952, May 1996, - <https://www.rfc-editor.org/info/rfc1952>. - - [XXHASH] "XXHASH Algorithm", <http://www.xxhash.org>. - - [ZSTD-GITHUB] - "zstd", commit 8514bd8, August 2018, - <https://github.com/facebook/zstd>. - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 45] - -RFC 8478 application/zstd October 2018 - - -Appendix A. Decoding Tables for Predefined Codes - - This appendix contains FSE decoding tables for the predefined literal - length, match length, and offset codes. The tables have been - constructed using the algorithm as given above in Section 4.1.1. 
The - tables here can be used as examples to crosscheck that an - implementation has built its decoding tables correctly. - -A.1. Literal Length Code Table - - +-------+--------+----------------+------+ - | State | Symbol | Number_Of_Bits | Base | - +-------+--------+----------------+------+ - | 0 | 0 | 0 | 0 | - +-------+--------+----------------+------+ - | 0 | 0 | 4 | 0 | - +-------+--------+----------------+------+ - | 1 | 0 | 4 | 16 | - +-------+--------+----------------+------+ - | 2 | 1 | 5 | 32 | - +-------+--------+----------------+------+ - | 3 | 3 | 5 | 0 | - +-------+--------+----------------+------+ - | 4 | 4 | 5 | 0 | - +-------+--------+----------------+------+ - | 5 | 6 | 5 | 0 | - +-------+--------+----------------+------+ - | 6 | 7 | 5 | 0 | - +-------+--------+----------------+------+ - | 7 | 9 | 5 | 0 | - +-------+--------+----------------+------+ - | 8 | 10 | 5 | 0 | - +-------+--------+----------------+------+ - | 9 | 12 | 5 | 0 | - +-------+--------+----------------+------+ - | 10 | 14 | 6 | 0 | - +-------+--------+----------------+------+ - | 11 | 16 | 5 | 0 | - +-------+--------+----------------+------+ - | 12 | 18 | 5 | 0 | - +-------+--------+----------------+------+ - | 13 | 19 | 5 | 0 | - +-------+--------+----------------+------+ - | 14 | 21 | 5 | 0 | - +-------+--------+----------------+------+ - | 15 | 22 | 5 | 0 | - +-------+--------+----------------+------+ - | 16 | 24 | 5 | 0 | - - - -Collet & Kucherawy Informational [Page 46] - -RFC 8478 application/zstd October 2018 - - - +-------+--------+----------------+------+ - | 17 | 25 | 5 | 32 | - +-------+--------+----------------+------+ - | 18 | 26 | 5 | 0 | - +-------+--------+----------------+------+ - | 19 | 27 | 6 | 0 | - +-------+--------+----------------+------+ - | 20 | 29 | 6 | 0 | - +-------+--------+----------------+------+ - | 21 | 31 | 6 | 0 | - +-------+--------+----------------+------+ - | 22 | 0 | 4 | 32 | - +-------+--------+----------------+------+ - | 23 | 1 | 4 | 0 
| - +-------+--------+----------------+------+ - | 24 | 2 | 5 | 0 | - +-------+--------+----------------+------+ - | 25 | 4 | 5 | 32 | - +-------+--------+----------------+------+ - | 26 | 5 | 5 | 0 | - +-------+--------+----------------+------+ - | 27 | 7 | 5 | 32 | - +-------+--------+----------------+------+ - | 28 | 8 | 5 | 0 | - +-------+--------+----------------+------+ - | 29 | 10 | 5 | 32 | - +-------+--------+----------------+------+ - | 30 | 11 | 5 | 0 | - +-------+--------+----------------+------+ - | 31 | 13 | 6 | 0 | - +-------+--------+----------------+------+ - | 32 | 16 | 5 | 32 | - +-------+--------+----------------+------+ - | 33 | 17 | 5 | 0 | - +-------+--------+----------------+------+ - | 34 | 19 | 5 | 32 | - +-------+--------+----------------+------+ - | 35 | 20 | 5 | 0 | - +-------+--------+----------------+------+ - | 36 | 22 | 5 | 32 | - +-------+--------+----------------+------+ - | 37 | 23 | 5 | 0 | - +-------+--------+----------------+------+ - | 38 | 25 | 4 | 0 | - +-------+--------+----------------+------+ - | 39 | 25 | 4 | 16 | - +-------+--------+----------------+------+ - | 40 | 26 | 5 | 32 | - - - -Collet & Kucherawy Informational [Page 47] - -RFC 8478 application/zstd October 2018 - - - +-------+--------+----------------+------+ - | 41 | 28 | 6 | 0 | - +-------+--------+----------------+------+ - | 42 | 30 | 6 | 0 | - +-------+--------+----------------+------+ - | 43 | 0 | 4 | 48 | - +-------+--------+----------------+------+ - | 44 | 1 | 4 | 16 | - +-------+--------+----------------+------+ - | 45 | 2 | 5 | 32 | - +-------+--------+----------------+------+ - | 46 | 3 | 5 | 32 | - +-------+--------+----------------+------+ - | 47 | 5 | 5 | 32 | - +-------+--------+----------------+------+ - | 48 | 6 | 5 | 32 | - +-------+--------+----------------+------+ - | 49 | 8 | 5 | 32 | - +-------+--------+----------------+------+ - | 50 | 9 | 5 | 32 | - +-------+--------+----------------+------+ - | 51 | 11 | 5 | 32 | - 
+-------+--------+----------------+------+ - | 52 | 12 | 5 | 32 | - +-------+--------+----------------+------+ - | 53 | 15 | 6 | 0 | - +-------+--------+----------------+------+ - | 54 | 17 | 5 | 32 | - +-------+--------+----------------+------+ - | 55 | 18 | 5 | 32 | - +-------+--------+----------------+------+ - | 56 | 20 | 5 | 32 | - +-------+--------+----------------+------+ - | 57 | 21 | 5 | 32 | - +-------+--------+----------------+------+ - | 58 | 23 | 5 | 32 | - +-------+--------+----------------+------+ - | 59 | 24 | 5 | 32 | - +-------+--------+----------------+------+ - | 60 | 35 | 6 | 0 | - +-------+--------+----------------+------+ - | 61 | 34 | 6 | 0 | - +-------+--------+----------------+------+ - | 62 | 33 | 6 | 0 | - +-------+--------+----------------+------+ - | 63 | 32 | 6 | 0 | - +-------+--------+----------------+------+ - - - - -Collet & Kucherawy Informational [Page 48] - -RFC 8478 application/zstd October 2018 - - -A.2. Match Length Code Table - - +-------+--------+----------------+------+ - | State | Symbol | Number_Of_Bits | Base | - +-------+--------+----------------+------+ - | 0 | 0 | 0 | 0 | - +-------+--------+----------------+------+ - | 0 | 0 | 6 | 0 | - +-------+--------+----------------+------+ - | 1 | 1 | 4 | 0 | - +-------+--------+----------------+------+ - | 2 | 2 | 5 | 32 | - +-------+--------+----------------+------+ - | 3 | 3 | 5 | 0 | - +-------+--------+----------------+------+ - | 4 | 5 | 5 | 0 | - +-------+--------+----------------+------+ - | 5 | 6 | 5 | 0 | - +-------+--------+----------------+------+ - | 6 | 8 | 5 | 0 | - +-------+--------+----------------+------+ - | 7 | 10 | 6 | 0 | - +-------+--------+----------------+------+ - | 8 | 13 | 6 | 0 | - +-------+--------+----------------+------+ - | 9 | 16 | 6 | 0 | - +-------+--------+----------------+------+ - | 10 | 19 | 6 | 0 | - +-------+--------+----------------+------+ - | 11 | 22 | 6 | 0 | - +-------+--------+----------------+------+ - | 12 | 25 | 6 | 0 | - 
+-------+--------+----------------+------+ - | 13 | 28 | 6 | 0 | - +-------+--------+----------------+------+ - | 14 | 31 | 6 | 0 | - +-------+--------+----------------+------+ - | 15 | 33 | 6 | 0 | - +-------+--------+----------------+------+ - | 16 | 35 | 6 | 0 | - +-------+--------+----------------+------+ - | 17 | 37 | 6 | 0 | - +-------+--------+----------------+------+ - | 18 | 39 | 6 | 0 | - +-------+--------+----------------+------+ - | 19 | 41 | 6 | 0 | - +-------+--------+----------------+------+ - | 20 | 43 | 6 | 0 | - - - -Collet & Kucherawy Informational [Page 49] - -RFC 8478 application/zstd October 2018 - - - +-------+--------+----------------+------+ - | 21 | 45 | 6 | 0 | - +-------+--------+----------------+------+ - | 22 | 1 | 4 | 16 | - +-------+--------+----------------+------+ - | 23 | 2 | 4 | 0 | - +-------+--------+----------------+------+ - | 24 | 3 | 5 | 32 | - +-------+--------+----------------+------+ - | 25 | 4 | 5 | 0 | - +-------+--------+----------------+------+ - | 26 | 6 | 5 | 32 | - +-------+--------+----------------+------+ - | 27 | 7 | 5 | 0 | - +-------+--------+----------------+------+ - | 28 | 9 | 6 | 0 | - +-------+--------+----------------+------+ - | 29 | 12 | 6 | 0 | - +-------+--------+----------------+------+ - | 30 | 15 | 6 | 0 | - +-------+--------+----------------+------+ - | 31 | 18 | 6 | 0 | - +-------+--------+----------------+------+ - | 32 | 21 | 6 | 0 | - +-------+--------+----------------+------+ - | 33 | 24 | 6 | 0 | - +-------+--------+----------------+------+ - | 34 | 27 | 6 | 0 | - +-------+--------+----------------+------+ - | 35 | 30 | 6 | 0 | - +-------+--------+----------------+------+ - | 36 | 32 | 6 | 0 | - +-------+--------+----------------+------+ - | 37 | 34 | 6 | 0 | - +-------+--------+----------------+------+ - | 38 | 36 | 6 | 0 | - +-------+--------+----------------+------+ - | 39 | 38 | 6 | 0 | - +-------+--------+----------------+------+ - | 40 | 40 | 6 | 0 | - 
+-------+--------+----------------+------+ - | 41 | 42 | 6 | 0 | - +-------+--------+----------------+------+ - | 42 | 44 | 6 | 0 | - +-------+--------+----------------+------+ - | 43 | 1 | 4 | 32 | - +-------+--------+----------------+------+ - | 44 | 1 | 4 | 48 | - - - -Collet & Kucherawy Informational [Page 50] - -RFC 8478 application/zstd October 2018 - - - +-------+--------+----------------+------+ - | 45 | 2 | 4 | 16 | - +-------+--------+----------------+------+ - | 46 | 4 | 5 | 32 | - +-------+--------+----------------+------+ - | 47 | 5 | 5 | 32 | - +-------+--------+----------------+------+ - | 48 | 7 | 5 | 32 | - +-------+--------+----------------+------+ - | 49 | 8 | 5 | 32 | - +-------+--------+----------------+------+ - | 50 | 11 | 6 | 0 | - +-------+--------+----------------+------+ - | 51 | 14 | 6 | 0 | - +-------+--------+----------------+------+ - | 52 | 17 | 6 | 0 | - +-------+--------+----------------+------+ - | 53 | 20 | 6 | 0 | - +-------+--------+----------------+------+ - | 54 | 23 | 6 | 0 | - +-------+--------+----------------+------+ - | 55 | 26 | 6 | 0 | - +-------+--------+----------------+------+ - | 56 | 29 | 6 | 0 | - +-------+--------+----------------+------+ - | 57 | 52 | 6 | 0 | - +-------+--------+----------------+------+ - | 58 | 51 | 6 | 0 | - +-------+--------+----------------+------+ - | 59 | 50 | 6 | 0 | - +-------+--------+----------------+------+ - | 60 | 49 | 6 | 0 | - +-------+--------+----------------+------+ - | 61 | 48 | 6 | 0 | - +-------+--------+----------------+------+ - | 62 | 47 | 6 | 0 | - +-------+--------+----------------+------+ - | 63 | 46 | 6 | 0 | - +-------+--------+----------------+------+ - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 51] - -RFC 8478 application/zstd October 2018 - - -A.3. 
Offset Code Table - - +-------+--------+----------------+------+ - | State | Symbol | Number_Of_Bits | Base | - +-------+--------+----------------+------+ - | 0 | 0 | 0 | 0 | - +-------+--------+----------------+------+ - | 0 | 0 | 5 | 0 | - +-------+--------+----------------+------+ - | 1 | 6 | 4 | 0 | - +-------+--------+----------------+------+ - | 2 | 9 | 5 | 0 | - +-------+--------+----------------+------+ - | 3 | 15 | 5 | 0 | - +-------+--------+----------------+------+ - | 4 | 21 | 5 | 0 | - +-------+--------+----------------+------+ - | 5 | 3 | 5 | 0 | - +-------+--------+----------------+------+ - | 6 | 7 | 4 | 0 | - +-------+--------+----------------+------+ - | 7 | 12 | 5 | 0 | - +-------+--------+----------------+------+ - | 8 | 18 | 5 | 0 | - +-------+--------+----------------+------+ - | 9 | 23 | 5 | 0 | - +-------+--------+----------------+------+ - | 10 | 5 | 5 | 0 | - +-------+--------+----------------+------+ - | 11 | 8 | 4 | 0 | - +-------+--------+----------------+------+ - | 12 | 14 | 5 | 0 | - +-------+--------+----------------+------+ - | 13 | 20 | 5 | 0 | - +-------+--------+----------------+------+ - | 14 | 2 | 5 | 0 | - +-------+--------+----------------+------+ - | 15 | 7 | 4 | 16 | - +-------+--------+----------------+------+ - | 16 | 11 | 5 | 0 | - +-------+--------+----------------+------+ - | 17 | 17 | 5 | 0 | - +-------+--------+----------------+------+ - | 18 | 22 | 5 | 0 | - +-------+--------+----------------+------+ - | 19 | 4 | 5 | 0 | - +-------+--------+----------------+------+ - | 20 | 8 | 4 | 16 | - - - -Collet & Kucherawy Informational [Page 52] - -RFC 8478 application/zstd October 2018 - - - +-------+--------+----------------+------+ - | 21 | 13 | 5 | 0 | - +-------+--------+----------------+------+ - | 22 | 19 | 5 | 0 | - +-------+--------+----------------+------+ - | 23 | 1 | 5 | 0 | - +-------+--------+----------------+------+ - | 24 | 6 | 4 | 16 | - +-------+--------+----------------+------+ - | 25 | 10 | 5 | 0 | - 
+-------+--------+----------------+------+ - | 26 | 16 | 5 | 0 | - +-------+--------+----------------+------+ - | 27 | 28 | 5 | 0 | - +-------+--------+----------------+------+ - | 28 | 27 | 5 | 0 | - +-------+--------+----------------+------+ - | 29 | 26 | 5 | 0 | - +-------+--------+----------------+------+ - | 30 | 25 | 5 | 0 | - +-------+--------+----------------+------+ - | 31 | 24 | 5 | 0 | - +-------+--------+----------------+------+ - -Acknowledgments - - zstd was developed by Yann Collet. - - Bobo Bose-Kolanu, Felix Handte, Kyle Nekritz, Nick Terrell, and David - Schleimer provided helpful feedback during the development of this - document. - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 53] - -RFC 8478 application/zstd October 2018 - - -Authors' Addresses - - Yann Collet - Facebook - 1 Hacker Way - Menlo Park, CA 94025 - United States of America - - Email: cyan@fb.com - - - Murray S. Kucherawy (editor) - Facebook - 1 Hacker Way - Menlo Park, CA 94025 - United States of America - - Email: msk@fb.com - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Collet & Kucherawy Informational [Page 54] - diff --git a/lib/std/compress/testdata/rfc8478.txt.zst.19 b/lib/std/compress/testdata/rfc8478.txt.zst.19 Binary files differ. diff --git a/lib/std/compress/testdata/rfc8478.txt.zst.3 b/lib/std/compress/testdata/rfc8478.txt.zst.3 Binary files differ. 
diff --git a/lib/std/compress/zstd.zig b/lib/std/compress/zstd.zig @@ -121,17 +121,6 @@ fn testExpectDecompressError(err: anyerror, compressed: []const u8) !void { try std.testing.expectError(err, zstd_stream.err orelse {}); } -test Decompress { - const uncompressed = @embedFile("testdata/rfc8478.txt"); - const compressed3 = @embedFile("testdata/rfc8478.txt.zst.3"); - const compressed19 = @embedFile("testdata/rfc8478.txt.zst.19"); - - try testExpectDecompress(uncompressed, compressed3); - try testExpectDecompress(uncompressed, compressed19); - try std.testing.expectEqual(uncompressed.len, testDiscard(std.testing.allocator, compressed3)); - try std.testing.expectEqual(uncompressed.len, testDiscard(std.testing.allocator, compressed19)); -} - test "partial magic number" { const input_raw = "\x28\xb5\x2f"; // 3 bytes of the 4-byte zstandard frame magic number