Add an xz decoder to the standard library

Author: fn ⌃ ⌥
Date: 2023-01-23 11:46:40 -08:00
Committed by: Andrew Kelley
Parent: dfcedfdca0
Commit: 06ce15e8f7
32 changed files with 1230 additions and 0 deletions

View File

@@ -122,6 +122,8 @@ pub fn build(b: *Builder) !void {
"compress-gettysburg.txt",
"compress-pi.txt",
"rfc1951.txt",
// exclude files from lib/std/compress/xz/testdata
".xz",
// exclude files from lib/std/tz/
".tzif",
// others

5
lib/std/compress/xz.zig Normal file
View File

@@ -0,0 +1,5 @@
pub usingnamespace @import("xz/stream.zig");
test {
_ = @import("xz/stream.zig");
}
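For orientation, here is a minimal usage sketch of the new decoder. It is not part of this commit: it assumes lib/std/compress.zig exposes this module as std.compress.xz, and the file handling around it is purely illustrative.

const std = @import("std");

// Decompress foo.xz and write the result to stdout.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var file = try std.fs.cwd().openFile("foo.xz", .{});
    defer file.close();

    // stream() reads and validates the stream header, then exposes a reader
    // that decompresses the stream block by block.
    var xz_stream = try std.compress.xz.stream(allocator, file.reader());
    defer xz_stream.deinit();

    const data = try xz_stream.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(data);
    try std.io.getStdOut().writeAll(data);
}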

lib/std/compress/xz/block.zig Normal file
View File

@@ -0,0 +1,319 @@
const std = @import("std");
const check = @import("check.zig");
const lzma = @import("lzma.zig");
const multibyte = @import("multibyte.zig");
const Allocator = std.mem.Allocator;
const Crc32 = std.hash.Crc32;
const Crc64 = std.hash.crc.Crc64Xz;
const Sha256 = std.crypto.hash.sha2.Sha256;
const DecodeError = error{
CorruptInput,
EndOfStream,
EndOfStreamWithNoError,
WrongChecksum,
Unsupported,
Overflow,
};
pub fn decoder(allocator: Allocator, reader: anytype, check_kind: check.Kind) !Decoder(@TypeOf(reader)) {
return Decoder(@TypeOf(reader)).init(allocator, reader, check_kind);
}
pub fn Decoder(comptime ReaderType: type) type {
return struct {
const Self = @This();
pub const Error =
ReaderType.Error ||
DecodeError ||
Allocator.Error;
pub const Reader = std.io.Reader(*Self, Error, read);
allocator: Allocator,
inner_reader: ReaderType,
check_kind: check.Kind,
err: ?Error,
accum: lzma.LzAccumBuffer,
lzma_state: lzma.DecoderState,
block_count: usize,
fn init(allocator: Allocator, in_reader: ReaderType, check_kind: check.Kind) !Self {
return Self{
.allocator = allocator,
.inner_reader = in_reader,
.check_kind = check_kind,
.err = null,
.accum = .{},
.lzma_state = try lzma.DecoderState.init(allocator),
.block_count = 0,
};
}
pub fn deinit(self: *Self) void {
self.accum.deinit(self.allocator);
self.lzma_state.deinit(self.allocator);
}
pub fn reader(self: *Self) Reader {
return .{ .context = self };
}
pub fn read(self: *Self, output: []u8) Error!usize {
while (true) {
if (self.accum.to_read.items.len > 0) {
const n = self.accum.read(output);
if (self.accum.to_read.items.len == 0 and self.err != null) {
if (self.err.? == DecodeError.EndOfStreamWithNoError) {
return n;
}
return self.err.?;
}
return n;
}
if (self.err != null) {
if (self.err.? == DecodeError.EndOfStreamWithNoError) {
return 0;
}
return self.err.?;
}
self.readBlock() catch |e| {
self.err = e;
if (self.accum.to_read.items.len == 0) {
try self.accum.reset(self.allocator);
}
};
}
}
fn readBlock(self: *Self) Error!void {
const unpacked_pos = self.accum.to_read.items.len;
var block_counter = std.io.countingReader(self.inner_reader);
const block_reader = block_counter.reader();
var packed_size: ?u64 = null;
var unpacked_size: ?u64 = null;
// Block Header
{
var header_hasher = std.compress.hashedReader(block_reader, Crc32.init());
const header_reader = header_hasher.reader();
const header_size = try header_reader.readByte() * 4;
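// A size byte of 0x00 is the Index indicator: there are no more blocks.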
if (header_size == 0)
return error.EndOfStreamWithNoError;
const Flags = packed struct(u8) {
last_filter_index: u2,
reserved: u4,
has_packed_size: bool,
has_unpacked_size: bool,
};
const flags = try header_reader.readStruct(Flags);
const filter_count = @as(u3, flags.last_filter_index) + 1;
if (filter_count > 1)
return error.Unsupported;
if (flags.has_packed_size)
packed_size = try multibyte.readInt(header_reader);
if (flags.has_unpacked_size)
unpacked_size = try multibyte.readInt(header_reader);
const FilterId = enum(u64) {
lzma2 = 0x21,
_,
};
const filter_id = @intToEnum(
FilterId,
try multibyte.readInt(header_reader),
);
if (@enumToInt(filter_id) >= 0x4000_0000_0000_0000)
return error.CorruptInput;
if (filter_id != .lzma2)
return error.Unsupported;
const properties_size = try multibyte.readInt(header_reader);
if (properties_size != 1)
return error.CorruptInput;
// TODO: use filter properties
_ = try header_reader.readByte();
while (block_counter.bytes_read != header_size) {
if (try header_reader.readByte() != 0)
return error.CorruptInput;
}
const hash_a = header_hasher.hasher.final();
const hash_b = try header_reader.readIntLittle(u32);
if (hash_a != hash_b)
return error.WrongChecksum;
}
// Compressed Data
var packed_counter = std.io.countingReader(block_reader);
const packed_reader = packed_counter.reader();
while (try self.readLzma2Chunk(packed_reader)) {}
if (packed_size) |s| {
if (s != packed_counter.bytes_read)
return error.CorruptInput;
}
const unpacked_bytes = self.accum.to_read.items[unpacked_pos..];
if (unpacked_size) |s| {
if (s != unpacked_bytes.len)
return error.CorruptInput;
}
// Block Padding
while (block_counter.bytes_read % 4 != 0) {
if (try block_reader.readByte() != 0)
return error.CorruptInput;
}
// Check
switch (self.check_kind) {
.none => {},
.crc32 => {
const hash_a = Crc32.hash(unpacked_bytes);
const hash_b = try self.inner_reader.readIntLittle(u32);
if (hash_a != hash_b)
return error.WrongChecksum;
},
.crc64 => {
const hash_a = Crc64.hash(unpacked_bytes);
const hash_b = try self.inner_reader.readIntLittle(u64);
if (hash_a != hash_b)
return error.WrongChecksum;
},
.sha256 => {
var hash_a: [Sha256.digest_length]u8 = undefined;
Sha256.hash(unpacked_bytes, &hash_a, .{});
var hash_b: [Sha256.digest_length]u8 = undefined;
try self.inner_reader.readNoEof(&hash_b);
if (!std.mem.eql(u8, &hash_a, &hash_b))
return error.WrongChecksum;
},
else => return error.Unsupported,
}
self.block_count += 1;
}
fn readLzma2Chunk(self: *Self, packed_reader: anytype) Error!bool {
const status = try packed_reader.readByte();
switch (status) {
0 => {
try self.accum.reset(self.allocator);
return false;
},
1, 2 => {
if (status == 1)
try self.accum.reset(self.allocator);
const size = try packed_reader.readIntBig(u16) + 1;
try self.accum.ensureUnusedCapacity(self.allocator, size);
var i: usize = 0;
while (i < size) : (i += 1)
self.accum.appendAssumeCapacity(try packed_reader.readByte());
return true;
},
else => {
if (status & 0x80 == 0)
return error.CorruptInput;
const Reset = struct {
dict: bool,
state: bool,
props: bool,
};
const reset = switch ((status >> 5) & 0x3) {
0 => Reset{
.dict = false,
.state = false,
.props = false,
},
1 => Reset{
.dict = false,
.state = true,
.props = false,
},
2 => Reset{
.dict = false,
.state = true,
.props = true,
},
3 => Reset{
.dict = true,
.state = true,
.props = true,
},
else => unreachable,
};
const unpacked_size = blk: {
var tmp: u64 = status & 0x1F;
tmp <<= 16;
tmp |= try packed_reader.readIntBig(u16);
break :blk tmp + 1;
};
const packed_size = blk: {
var tmp: u64 = try packed_reader.readIntBig(u16);
break :blk tmp + 1;
};
if (reset.dict)
try self.accum.reset(self.allocator);
if (reset.state) {
var new_props = self.lzma_state.lzma_props;
if (reset.props) {
var props = try packed_reader.readByte();
if (props >= 225)
return error.CorruptInput;
const lc = @intCast(u4, props % 9);
props /= 9;
const lp = @intCast(u3, props % 5);
props /= 5;
const pb = @intCast(u3, props);
if (lc + lp > 4)
return error.CorruptInput;
new_props = .{ .lc = lc, .lp = lp, .pb = pb };
}
try self.lzma_state.reset_state(self.allocator, new_props);
}
self.lzma_state.unpacked_size = unpacked_size + self.accum.len();
const buffer = try self.allocator.alloc(u8, packed_size);
defer self.allocator.free(buffer);
for (buffer) |*b|
b.* = try packed_reader.readByte();
var rangecoder = try lzma.RangeDecoder.init(buffer);
try self.lzma_state.process(self.allocator, &self.accum, &rangecoder);
return true;
},
}
}
};
}
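To make the bit-twiddling in readLzma2Chunk above easier to follow, here is a small self-contained test that decodes one hypothetical LZMA chunk control byte and the classic LZMA properties byte by hand, using the same arithmetic as the decoder. It is an illustration, not part of this commit.

const std = @import("std");

test "lzma2 control byte and props byte, decoded by hand" {
    const status: u8 = 0xE0;
    try std.testing.expect(status & 0x80 != 0); // bit 7 set: an LZMA chunk, not uncompressed data or the end marker
    try std.testing.expectEqual(@as(u8, 3), (status >> 5) & 0x3); // bits 6..5: reset dictionary, state, and properties
    try std.testing.expectEqual(@as(u8, 0), status & 0x1F); // bits 4..0: high bits of (unpacked size - 1)

    // The common LZMA properties byte 0x5D unpacks to lc=3, lp=0, pb=2.
    var props: u8 = 0x5D;
    const lc = props % 9;
    props /= 9;
    const lp = props % 5;
    props /= 5;
    const pb = props;
    try std.testing.expectEqual(@as(u8, 3), lc);
    try std.testing.expectEqual(@as(u8, 0), lp);
    try std.testing.expectEqual(@as(u8, 2), pb);
}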

lib/std/compress/xz/check.zig Normal file
View File

@@ -0,0 +1,7 @@
pub const Kind = enum(u4) {
none = 0x00,
crc32 = 0x01,
crc64 = 0x04,
sha256 = 0x0A,
_,
};
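The enum is non-exhaustive on purpose: the check ID arrives in the low 4 bits of the stream flags, and IDs this decoder does not know land in the `_` range, which the block decoder then rejects as Unsupported. A tiny illustrative check (not part of this commit):

const std = @import("std");
const check = @import("check.zig");

test "check kind from a raw stream-flags nibble" {
    try std.testing.expectEqual(check.Kind.crc64, @intToEnum(check.Kind, 0x04));
    try std.testing.expectEqual(check.Kind.sha256, @intToEnum(check.Kind, 0x0A));
}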

lib/std/compress/xz/lzma.zig Normal file
View File

@@ -0,0 +1,658 @@
// Ported from https://github.com/gendx/lzma-rs
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ArrayListUnmanaged = std.ArrayListUnmanaged;
const LzmaProperties = struct {
lc: u4,
lp: u3,
pb: u3,
fn validate(self: LzmaProperties) void {
assert(self.lc <= 8);
assert(self.lp <= 4);
assert(self.pb <= 4);
}
};
pub const DecoderState = struct {
lzma_props: LzmaProperties,
unpacked_size: ?u64,
literal_probs: Vec2D(u16),
pos_slot_decoder: [4]BitTree,
align_decoder: BitTree,
pos_decoders: [115]u16,
is_match: [192]u16,
is_rep: [12]u16,
is_rep_g0: [12]u16,
is_rep_g1: [12]u16,
is_rep_g2: [12]u16,
is_rep_0long: [192]u16,
state: usize,
rep: [4]usize,
len_decoder: LenDecoder,
rep_len_decoder: LenDecoder,
pub fn init(allocator: Allocator) !DecoderState {
return .{
.lzma_props = LzmaProperties{ .lc = 0, .lp = 0, .pb = 0 },
.unpacked_size = null,
.literal_probs = try Vec2D(u16).init(allocator, 0x400, 1, 0x300),
.pos_slot_decoder = .{
try BitTree.init(allocator, 6),
try BitTree.init(allocator, 6),
try BitTree.init(allocator, 6),
try BitTree.init(allocator, 6),
},
.align_decoder = try BitTree.init(allocator, 4),
.pos_decoders = .{0x400} ** 115,
.is_match = .{0x400} ** 192,
.is_rep = .{0x400} ** 12,
.is_rep_g0 = .{0x400} ** 12,
.is_rep_g1 = .{0x400} ** 12,
.is_rep_g2 = .{0x400} ** 12,
.is_rep_0long = .{0x400} ** 192,
.state = 0,
.rep = .{0} ** 4,
.len_decoder = try LenDecoder.init(allocator),
.rep_len_decoder = try LenDecoder.init(allocator),
};
}
pub fn deinit(self: *DecoderState, allocator: Allocator) void {
self.literal_probs.deinit(allocator);
for (self.pos_slot_decoder) |*t| t.deinit(allocator);
self.align_decoder.deinit(allocator);
self.len_decoder.deinit(allocator);
self.rep_len_decoder.deinit(allocator);
}
pub fn reset_state(self: *DecoderState, allocator: Allocator, new_props: LzmaProperties) !void {
new_props.validate();
if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) {
self.literal_probs.fill(0x400);
} else {
self.literal_probs.deinit(allocator);
self.literal_probs = try Vec2D(u16).init(allocator, 0x400, @as(usize, 1) << (new_props.lc + new_props.lp), 0x300);
}
self.lzma_props = new_props;
for (self.pos_slot_decoder) |*t| t.reset();
self.align_decoder.reset();
self.pos_decoders = .{0x400} ** 115;
self.is_match = .{0x400} ** 192;
self.is_rep = .{0x400} ** 12;
self.is_rep_g0 = .{0x400} ** 12;
self.is_rep_g1 = .{0x400} ** 12;
self.is_rep_g2 = .{0x400} ** 12;
self.is_rep_0long = .{0x400} ** 192;
self.state = 0;
self.rep = .{0} ** 4;
self.len_decoder.reset();
self.rep_len_decoder.reset();
}
fn processNextInner(
self: *DecoderState,
allocator: Allocator,
output: *LzAccumBuffer,
rangecoder: *RangeDecoder,
update: bool,
) !ProcessingStatus {
const pos_state = output.len() & ((@as(usize, 1) << self.lzma_props.pb) - 1);
if (!try rangecoder.decodeBit(
&self.is_match[(self.state << 4) + pos_state],
update,
)) {
const byte: u8 = try self.decodeLiteral(output, rangecoder, update);
if (update) {
try output.appendLiteral(allocator, byte);
self.state = if (self.state < 4)
0
else if (self.state < 10)
self.state - 3
else
self.state - 6;
}
return .continue_;
}
var len: usize = undefined;
if (try rangecoder.decodeBit(&self.is_rep[self.state], update)) {
if (!try rangecoder.decodeBit(&self.is_rep_g0[self.state], update)) {
if (!try rangecoder.decodeBit(
&self.is_rep_0long[(self.state << 4) + pos_state],
update,
)) {
if (update) {
self.state = if (self.state < 7) 9 else 11;
const dist = self.rep[0] + 1;
try output.appendLz(allocator, 1, dist);
}
return .continue_;
}
} else {
const idx: usize = if (!try rangecoder.decodeBit(&self.is_rep_g1[self.state], update))
1
else if (!try rangecoder.decodeBit(&self.is_rep_g2[self.state], update))
2
else
3;
if (update) {
const dist = self.rep[idx];
var i = idx;
while (i > 0) : (i -= 1) {
self.rep[i] = self.rep[i - 1];
}
self.rep[0] = dist;
}
}
len = try self.rep_len_decoder.decode(rangecoder, pos_state, update);
if (update) {
self.state = if (self.state < 7) 8 else 11;
}
} else {
if (update) {
self.rep[3] = self.rep[2];
self.rep[2] = self.rep[1];
self.rep[1] = self.rep[0];
}
len = try self.len_decoder.decode(rangecoder, pos_state, update);
if (update) {
self.state = if (self.state < 7) 7 else 10;
}
const rep_0 = try self.decodeDistance(rangecoder, len, update);
if (update) {
self.rep[0] = rep_0;
if (self.rep[0] == 0xFFFF_FFFF) {
if (rangecoder.isFinished()) {
return .finished;
}
return error.CorruptInput;
}
}
}
if (update) {
len += 2;
const dist = self.rep[0] + 1;
try output.appendLz(allocator, len, dist);
}
return .continue_;
}
fn processNext(
self: *DecoderState,
allocator: Allocator,
output: *LzAccumBuffer,
rangecoder: *RangeDecoder,
) !ProcessingStatus {
return self.processNextInner(allocator, output, rangecoder, true);
}
pub fn process(
self: *DecoderState,
allocator: Allocator,
output: *LzAccumBuffer,
rangecoder: *RangeDecoder,
) !void {
while (true) {
if (self.unpacked_size) |unpacked_size| {
if (output.len() >= unpacked_size) {
break;
}
} else if (rangecoder.isFinished()) {
break;
}
if (try self.processNext(allocator, output, rangecoder) == .finished) {
break;
}
}
if (self.unpacked_size) |len| {
if (len != output.len()) {
return error.CorruptInput;
}
}
}
fn decodeLiteral(
self: *DecoderState,
output: *LzAccumBuffer,
rangecoder: *RangeDecoder,
update: bool,
) !u8 {
const def_prev_byte = 0;
const prev_byte = @as(usize, output.lastOr(def_prev_byte));
var result: usize = 1;
const lit_state = ((output.len() & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) +
(prev_byte >> (8 - self.lzma_props.lc));
const probs = try self.literal_probs.get(lit_state);
if (self.state >= 7) {
var match_byte = @as(usize, try output.lastN(self.rep[0] + 1));
while (result < 0x100) {
const match_bit = (match_byte >> 7) & 1;
match_byte <<= 1;
const bit = @boolToInt(try rangecoder.decodeBit(
&probs[((@as(usize, 1) + match_bit) << 8) + result],
update,
));
result = (result << 1) ^ bit;
if (match_bit != bit) {
break;
}
}
}
while (result < 0x100) {
result = (result << 1) ^ @boolToInt(try rangecoder.decodeBit(&probs[result], update));
}
return @truncate(u8, result - 0x100);
}
fn decodeDistance(
self: *DecoderState,
rangecoder: *RangeDecoder,
length: usize,
update: bool,
) !usize {
const len_state = if (length > 3) 3 else length;
const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(rangecoder, update));
if (pos_slot < 4)
return pos_slot;
const num_direct_bits = @intCast(u5, (pos_slot >> 1) - 1);
var result = (2 ^ (pos_slot & 1)) << num_direct_bits;
if (pos_slot < 14) {
result += try rangecoder.parseReverseBitTree(
num_direct_bits,
&self.pos_decoders,
result - pos_slot,
update,
);
} else {
result += @as(usize, try rangecoder.get(num_direct_bits - 4)) << 4;
result += try self.align_decoder.parseReverse(rangecoder, update);
}
return result;
}
};
const ProcessingStatus = enum {
continue_,
finished,
};
pub const LzAccumBuffer = struct {
to_read: ArrayListUnmanaged(u8) = .{},
buf: ArrayListUnmanaged(u8) = .{},
pub fn deinit(self: *LzAccumBuffer, allocator: Allocator) void {
self.to_read.deinit(allocator);
self.buf.deinit(allocator);
}
pub fn read(self: *LzAccumBuffer, output: []u8) usize {
const input = self.to_read.items;
const n = std.math.min(input.len, output.len);
std.mem.copy(u8, output[0..n], input[0..n]);
std.mem.copy(u8, input, input[n..]);
self.to_read.shrinkRetainingCapacity(input.len - n);
return n;
}
pub fn ensureUnusedCapacity(
self: *LzAccumBuffer,
allocator: Allocator,
additional_count: usize,
) !void {
try self.buf.ensureUnusedCapacity(allocator, additional_count);
}
pub fn appendAssumeCapacity(self: *LzAccumBuffer, byte: u8) void {
self.buf.appendAssumeCapacity(byte);
}
pub fn reset(self: *LzAccumBuffer, allocator: Allocator) !void {
try self.to_read.appendSlice(allocator, self.buf.items);
self.buf.clearRetainingCapacity();
}
pub fn len(self: *const LzAccumBuffer) usize {
return self.buf.items.len;
}
pub fn lastOr(self: *const LzAccumBuffer, lit: u8) u8 {
const buf_len = self.buf.items.len;
return if (buf_len == 0)
lit
else
self.buf.items[buf_len - 1];
}
pub fn lastN(self: *const LzAccumBuffer, dist: usize) !u8 {
const buf_len = self.buf.items.len;
if (dist > buf_len) {
return error.CorruptInput;
}
return self.buf.items[buf_len - dist];
}
pub fn appendLiteral(self: *LzAccumBuffer, allocator: Allocator, lit: u8) !void {
try self.buf.append(allocator, lit);
}
pub fn appendLz(self: *LzAccumBuffer, allocator: Allocator, length: usize, dist: usize) !void {
const buf_len = self.buf.items.len;
if (dist > buf_len) {
return error.CorruptInput;
}
var offset = buf_len - dist;
var i: usize = 0;
while (i < length) : (i += 1) {
const x = self.buf.items[offset];
try self.buf.append(allocator, x);
offset += 1;
}
}
};
pub const RangeDecoder = struct {
stream: std.io.FixedBufferStream([]const u8),
range: u32,
code: u32,
pub fn init(buffer: []const u8) !RangeDecoder {
var dec = RangeDecoder{
.stream = std.io.fixedBufferStream(buffer),
.range = 0xFFFF_FFFF,
.code = 0,
};
const reader = dec.stream.reader();
_ = try reader.readByte();
dec.code = try reader.readIntBig(u32);
return dec;
}
pub fn fromParts(
buffer: []const u8,
range: u32,
code: u32,
) RangeDecoder {
return .{
.stream = std.io.fixedBufferStream(buffer),
.range = range,
.code = code,
};
}
pub fn set(self: *RangeDecoder, range: u32, code: u32) void {
self.range = range;
self.code = code;
}
pub fn readInto(self: *RangeDecoder, dest: []u8) !usize {
return self.stream.read(dest);
}
pub inline fn isFinished(self: *const RangeDecoder) bool {
return self.code == 0 and self.isEof();
}
pub inline fn isEof(self: *const RangeDecoder) bool {
return self.stream.pos == self.stream.buffer.len;
}
inline fn normalize(self: *RangeDecoder) !void {
if (self.range < 0x0100_0000) {
self.range <<= 8;
self.code = (self.code << 8) ^ @as(u32, try self.stream.reader().readByte());
}
}
inline fn getBit(self: *RangeDecoder) !bool {
self.range >>= 1;
const bit = self.code >= self.range;
if (bit)
self.code -= self.range;
try self.normalize();
return bit;
}
fn get(self: *RangeDecoder, count: usize) !u32 {
var result: u32 = 0;
var i: usize = 0;
while (i < count) : (i += 1)
result = (result << 1) ^ @boolToInt(try self.getBit());
return result;
}
pub inline fn decodeBit(self: *RangeDecoder, prob: *u16, update: bool) !bool {
const bound = (self.range >> 11) * prob.*;
if (self.code < bound) {
if (update)
prob.* += (0x800 - prob.*) >> 5;
self.range = bound;
try self.normalize();
return false;
} else {
if (update)
prob.* -= prob.* >> 5;
self.code -= bound;
self.range -= bound;
try self.normalize();
return true;
}
}
fn parseBitTree(
self: *RangeDecoder,
num_bits: u5,
probs: []u16,
update: bool,
) !u32 {
var tmp: u32 = 1;
var i: u5 = 0;
while (i < num_bits) : (i += 1) {
const bit = try self.decodeBit(&probs[tmp], update);
tmp = (tmp << 1) ^ @boolToInt(bit);
}
return tmp - (@as(u32, 1) << num_bits);
}
pub fn parseReverseBitTree(
self: *RangeDecoder,
num_bits: u5,
probs: []u16,
offset: usize,
update: bool,
) !u32 {
var result: u32 = 0;
var tmp: usize = 1;
var i: u5 = 0;
while (i < num_bits) : (i += 1) {
const bit = @boolToInt(try self.decodeBit(&probs[offset + tmp], update));
tmp = (tmp << 1) ^ bit;
result ^= @as(u32, bit) << i;
}
return result;
}
};
fn Vec2D(comptime T: type) type {
return struct {
data: []T,
cols: usize,
const Self = @This();
pub fn init(allocator: Allocator, data: T, rows: usize, cols: usize) !Self {
const len = try std.math.mul(usize, rows, cols);
var vec2d = Self{
.data = try allocator.alloc(T, len),
.cols = cols,
};
vec2d.fill(data);
return vec2d;
}
pub fn deinit(self: *Self, allocator: Allocator) void {
allocator.free(self.data);
}
pub fn fill(self: *Self, value: T) void {
std.mem.set(T, self.data, value);
}
pub fn get(self: *Self, row: usize) ![]T {
const start_row = try std.math.mul(usize, row, self.cols);
return self.data[start_row .. start_row + self.cols];
}
};
}
const BitTree = struct {
num_bits: u5,
probs: ArrayListUnmanaged(u16),
pub fn init(allocator: Allocator, num_bits: u5) !BitTree {
var probs_len = @as(usize, 1) << num_bits;
var probs = try ArrayListUnmanaged(u16).initCapacity(allocator, probs_len);
while (probs_len > 0) : (probs_len -= 1)
probs.appendAssumeCapacity(0x400);
return .{ .num_bits = num_bits, .probs = probs };
}
pub fn deinit(self: *BitTree, allocator: Allocator) void {
self.probs.deinit(allocator);
}
pub fn parse(
self: *BitTree,
rangecoder: *RangeDecoder,
update: bool,
) !u32 {
return rangecoder.parseBitTree(self.num_bits, self.probs.items, update);
}
pub fn parseReverse(
self: *BitTree,
rangecoder: *RangeDecoder,
update: bool,
) !u32 {
return rangecoder.parseReverseBitTree(self.num_bits, self.probs.items, 0, update);
}
pub fn reset(self: *BitTree) void {
std.mem.set(u16, self.probs.items, 0x400);
}
};
const LenDecoder = struct {
choice: u16,
choice2: u16,
low_coder: [16]BitTree,
mid_coder: [16]BitTree,
high_coder: BitTree,
pub fn init(allocator: Allocator) !LenDecoder {
return .{
.choice = 0x400,
.choice2 = 0x400,
.low_coder = .{
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
},
.mid_coder = .{
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
try BitTree.init(allocator, 3),
},
.high_coder = try BitTree.init(allocator, 8),
};
}
pub fn deinit(self: *LenDecoder, allocator: Allocator) void {
for (self.low_coder) |*t| t.deinit(allocator);
for (self.mid_coder) |*t| t.deinit(allocator);
self.high_coder.deinit(allocator);
}
pub fn decode(
self: *LenDecoder,
rangecoder: *RangeDecoder,
pos_state: usize,
update: bool,
) !usize {
if (!try rangecoder.decodeBit(&self.choice, update)) {
return @as(usize, try self.low_coder[pos_state].parse(rangecoder, update));
} else if (!try rangecoder.decodeBit(&self.choice2, update)) {
return @as(usize, try self.mid_coder[pos_state].parse(rangecoder, update)) + 8;
} else {
return @as(usize, try self.high_coder.parse(rangecoder, update)) + 16;
}
}
pub fn reset(self: *LenDecoder) void {
self.choice = 0x400;
self.choice2 = 0x400;
for (self.low_coder) |*t| t.reset();
for (self.mid_coder) |*t| t.reset();
self.high_coder.reset();
}
};
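One note on the probability model used throughout this file: probabilities are 11-bit fixed-point values in [0, 0x800], initialized to the midpoint 0x400, and decodeBit nudges them by 1/32 of the remaining distance after every decoded bit. A self-contained arithmetic check of the two update branches (illustrative only, not part of this commit):

const std = @import("std");

test "adaptive probability update step" {
    var prob: u16 = 0x400;
    prob += (0x800 - prob) >> 5; // branch taken after decoding a 0 bit
    try std.testing.expectEqual(@as(u16, 0x420), prob);

    prob = 0x400;
    prob -= prob >> 5; // branch taken after decoding a 1 bit
    try std.testing.expectEqual(@as(u16, 0x3E0), prob);
}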

lib/std/compress/xz/multibyte.zig Normal file
View File

@@ -0,0 +1,23 @@
const Multibyte = packed struct(u8) {
value: u7,
more: bool,
};
pub fn readInt(reader: anytype) !u64 {
const max_size = 9;
var chunk = try reader.readStruct(Multibyte);
var num: u64 = chunk.value;
var i: u6 = 0;
while (chunk.more) {
chunk = try reader.readStruct(Multibyte);
i += 1;
if (i >= max_size or @bitCast(u8, chunk) == 0x00)
return error.CorruptInput;
num |= @as(u64, chunk.value) << (i * 7);
}
return num;
}
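The encoding decoded here stores 7 bits per byte, least significant group first, with the top bit flagging a continuation; at most 9 bytes are accepted. A hypothetical test one could drop next to multibyte.zig (not part of this commit), decoding 0x1234 from its two-byte encoding:

const std = @import("std");
const multibyte = @import("multibyte.zig");

test "multibyte integer decoding" {
    // 0x1234 = 0b0100100_0110100 -> 0x34 | 0x80 (continuation), then 0x24.
    var fbs = std.io.fixedBufferStream(&[_]u8{ 0xB4, 0x24 });
    try std.testing.expectEqual(@as(u64, 0x1234), try multibyte.readInt(fbs.reader()));
}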

lib/std/compress/xz/stream.zig Normal file
View File

@@ -0,0 +1,136 @@
const std = @import("std");
const block = @import("block.zig");
const check = @import("check.zig");
const multibyte = @import("multibyte.zig");
const Allocator = std.mem.Allocator;
const Crc32 = std.hash.Crc32;
test {
_ = @import("stream_test.zig");
}
const Flags = packed struct(u16) {
reserved1: u8,
check_kind: check.Kind,
reserved2: u4,
};
pub fn stream(allocator: Allocator, reader: anytype) !Stream(@TypeOf(reader)) {
return Stream(@TypeOf(reader)).init(allocator, reader);
}
pub fn Stream(comptime ReaderType: type) type {
return struct {
const Self = @This();
pub const Error = ReaderType.Error || block.Decoder(ReaderType).Error;
pub const Reader = std.io.Reader(*Self, Error, read);
allocator: Allocator,
block_decoder: block.Decoder(ReaderType),
in_reader: ReaderType,
fn init(allocator: Allocator, source: ReaderType) !Self {
const Header = extern struct {
magic: [6]u8,
flags: Flags,
crc32: u32,
};
const header = try source.readStruct(Header);
if (!std.mem.eql(u8, &header.magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 }))
return error.BadHeader;
if (header.flags.reserved1 != 0 or header.flags.reserved2 != 0)
return error.BadHeader;
const hash = Crc32.hash(std.mem.asBytes(&header.flags));
if (hash != header.crc32)
return error.WrongChecksum;
return Self{
.allocator = allocator,
.block_decoder = try block.decoder(allocator, source, header.flags.check_kind),
.in_reader = source,
};
}
pub fn deinit(self: *Self) void {
self.block_decoder.deinit();
}
pub fn reader(self: *Self) Reader {
return .{ .context = self };
}
pub fn read(self: *Self, buffer: []u8) Error!usize {
if (buffer.len == 0)
return 0;
const r = try self.block_decoder.read(buffer);
if (r != 0)
return r;
const index_size = blk: {
var hasher = std.compress.hashedReader(self.in_reader, Crc32.init());
hasher.hasher.update(&[1]u8{0x00});
var counter = std.io.countingReader(hasher.reader());
counter.bytes_read += 1;
const counting_reader = counter.reader();
const record_count = try multibyte.readInt(counting_reader);
if (record_count != self.block_decoder.block_count)
return error.CorruptInput;
var i: usize = 0;
while (i < record_count) : (i += 1) {
// TODO: validate records
_ = try multibyte.readInt(counting_reader);
_ = try multibyte.readInt(counting_reader);
}
while (counter.bytes_read % 4 != 0) {
if (try counting_reader.readByte() != 0)
return error.CorruptInput;
}
const hash_a = hasher.hasher.final();
const hash_b = try counting_reader.readIntLittle(u32);
if (hash_a != hash_b)
return error.WrongChecksum;
break :blk counter.bytes_read;
};
const Footer = extern struct {
crc32: u32,
backward_size: u32,
flags: Flags,
magic: [2]u8,
};
const footer = try self.in_reader.readStruct(Footer);
const backward_size = (footer.backward_size + 1) * 4;
if (backward_size != index_size)
return error.CorruptInput;
if (footer.flags.reserved1 != 0 or footer.flags.reserved2 != 0)
return error.CorruptInput;
var hasher = Crc32.init();
hasher.update(std.mem.asBytes(&footer.backward_size));
hasher.update(std.mem.asBytes(&footer.flags));
const hash = hasher.final();
if (hash != footer.crc32)
return error.WrongChecksum;
if (!std.mem.eql(u8, &footer.magic, &.{ 'Y', 'Z' }))
return error.CorruptInput;
return 0;
}
};
}
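For reference, the fixed-size pieces parsed above, laid out byte by byte (this just restates what init and read check):

// Stream header, 12 bytes, read in init:
//   [0..6)   magic          FD 37 7A 58 5A 00  ("\xFD" ++ "7zXZ" ++ "\x00")
//   [6..8)   stream flags   0x00, then the check ID in the low 4 bits of the next byte
//   [8..12)  crc32          CRC32 of the two flag bytes, little-endian
//
// Stream footer, 12 bytes, read at the end of read():
//   [0..4)   crc32          CRC32 of backward_size and the flag bytes
//   [4..8)   backward_size  (stored value + 1) * 4 must equal the size of the Index
//   [8..10)  stream flags   same layout as the header; reserved bits must be zero
//   [10..12) magic          'Y' 'Z'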

lib/std/compress/xz/stream_test.zig Normal file
View File

@@ -0,0 +1,80 @@
const std = @import("std");
const testing = std.testing;
const stream = @import("stream.zig").stream;
fn decompress(data: []const u8) ![]u8 {
var in_stream = std.io.fixedBufferStream(data);
var xz_stream = try stream(testing.allocator, in_stream.reader());
defer xz_stream.deinit();
return xz_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize));
}
fn testReader(data: []const u8, comptime expected: []const u8) !void {
const buf = try decompress(data);
defer testing.allocator.free(buf);
try testing.expectEqualSlices(u8, expected, buf);
}
test "compressed data" {
try testReader(@embedFile("testdata/good-0-empty.xz"), "");
inline for ([_][]const u8{
"good-1-check-none.xz",
"good-1-check-crc32.xz",
"good-1-check-crc64.xz",
"good-1-check-sha256.xz",
"good-2-lzma2.xz",
"good-1-block_header-1.xz",
"good-1-block_header-2.xz",
"good-1-block_header-3.xz",
}) |filename| {
try testReader(@embedFile("testdata/" ++ filename),
\\Hello
\\World!
\\
);
}
inline for ([_][]const u8{
"good-1-lzma2-1.xz",
"good-1-lzma2-2.xz",
"good-1-lzma2-3.xz",
"good-1-lzma2-4.xz",
}) |filename| {
try testReader(@embedFile("testdata/" ++ filename),
\\Lorem ipsum dolor sit amet, consectetur adipisicing
\\elit, sed do eiusmod tempor incididunt ut
\\labore et dolore magna aliqua. Ut enim
\\ad minim veniam, quis nostrud exercitation ullamco
\\laboris nisi ut aliquip ex ea commodo
\\consequat. Duis aute irure dolor in reprehenderit
\\in voluptate velit esse cillum dolore eu
\\fugiat nulla pariatur. Excepteur sint occaecat cupidatat
\\non proident, sunt in culpa qui officia
\\deserunt mollit anim id est laborum.
\\
);
}
try testReader(@embedFile("testdata/good-1-lzma2-5.xz"), "");
}
test "unsupported" {
inline for ([_][]const u8{
"good-1-delta-lzma2.tiff.xz",
"good-1-x86-lzma2.xz",
"good-1-sparc-lzma2.xz",
"good-1-arm64-lzma2-1.xz",
"good-1-arm64-lzma2-2.xz",
"good-1-3delta-lzma2.xz",
"good-1-empty-bcj-lzma2.xz",
}) |filename| {
try testing.expectError(
error.Unsupported,
decompress(@embedFile("testdata/" ++ filename)),
);
}
}

24 binary files not shown (the .xz test inputs under lib/std/compress/xz/testdata).