PackObjectSizeIndex: interface and impl for the object-size index

Operations like "clone --filter=blob:limit=N" or the "object-info"
command need to read the size of the objects from the storage. An
index would provide those sizes at once rather than having to seek in
the packfile.

Introduce an interface for the Object-size index. This index returns
the inflated size of an object. Not all objects could be indexed (to
limit memory usage).

This implementation indexes only blobs (no trees, nor
commits) *above* certain size threshold (configurable). Lower
threshold adds more objects to the index, consumes more memory and
provides better performance. 0 means "all blobs" and -1 "disabled".

If we don't index everything, for the filter use case is more
efficient to index the biggest objects first: the set is small and
most objects are filtered by NOT being in the index. For the
object-size, the more objects in the index the better, regardless
their size. All together, it is more helpful to index above threshold.

Change-Id: I9ed608ac240677e199b90ca40d420bcad9231489
This commit is contained in:
Ivan Frade 2021-12-14 16:36:54 -08:00
parent 62d0e7be7c
commit c9552abaf3
5 changed files with 989 additions and 0 deletions

View File

@ -0,0 +1,392 @@
/*
* Copyright (C) 2022, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.transport.PackedObjectInfo;
import org.eclipse.jgit.util.BlockList;
import org.junit.Test;
public class PackObjectSizeIndexV1Test {
private static final ObjectId OBJ_ID = ObjectId
.fromString("b8b1d53172fb3fb19647adce4b38fab4371c2454");
private static final long GB = 1 << 30;
private static final int MAX_24BITS_UINT = 0xffffff;
@Test
public void write_24bPositions_32bSizes() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(blobWithSize(100));
objs.add(blobWithSize(400));
objs.add(blobWithSize(200));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x03, // obj count
0x18, // Unsigned 3 bytes
0x00, 0x00, 0x00, 0x03, // 3 positions
0x00, 0x00, 0x00, // positions
0x00, 0x00, 0x01, //
0x00, 0x00, 0x02, //
0x00, // No more positions
0x00, 0x00, 0x00, 0x64, // size 100
0x00, 0x00, 0x01, (byte) 0x90, // size 400
0x00, 0x00, 0x00, (byte) 0xc8, // size 200
0x00, 0x00, 0x00, 0x00 // 64bit sizes counter
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_32bPositions_32bSizes() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new BlockList<>(9_000_000);
// The 24 bit range is full of commits and trees
PackedObjectInfo commit = objInfo(Constants.OBJ_COMMIT, 100);
for (int i = 0; i <= MAX_24BITS_UINT; i++) {
objs.add(commit);
}
objs.add(blobWithSize(100));
objs.add(blobWithSize(400));
objs.add(blobWithSize(200));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x03, // obj count
(byte) 0x20, // Signed 4 bytes
0x00, 0x00, 0x00, 0x03, // 3 positions
0x01, 0x00, 0x00, 0x00, // positions
0x01, 0x00, 0x00, 0x01, //
0x01, 0x00, 0x00, 0x02, //
0x00, // No more positions
0x00, 0x00, 0x00, 0x64, // size 100
0x00, 0x00, 0x01, (byte) 0x90, // size 400
0x00, 0x00, 0x00, (byte) 0xc8, // size 200
0x00, 0x00, 0x00, 0x00 // 64bit sizes counter
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_24b32bPositions_32bSizes() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new BlockList<>(9_000_000);
// The 24 bit range is full of commits and trees
PackedObjectInfo commit = objInfo(Constants.OBJ_COMMIT, 100);
for (int i = 0; i < MAX_24BITS_UINT; i++) {
objs.add(commit);
}
objs.add(blobWithSize(100));
objs.add(blobWithSize(400));
objs.add(blobWithSize(200));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x03, // obj count
0x18, // 3 bytes
0x00, 0x00, 0x00, 0x01, // 1 position
(byte) 0xff, (byte) 0xff, (byte) 0xff,
(byte) 0x20, // 4 bytes (32 bits)
0x00, 0x00, 0x00, 0x02, // 2 positions
0x01, 0x00, 0x00, 0x00, // positions
0x01, 0x00, 0x00, 0x01, //
0x00, // No more positions
0x00, 0x00, 0x00, 0x64, // size 100
0x00, 0x00, 0x01, (byte) 0x90, // size 400
0x00, 0x00, 0x00, (byte) 0xc8, // size 200
0x00, 0x00, 0x00, 0x00 // 64bit sizes counter
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_64bitsSize() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(blobWithSize(100));
objs.add(blobWithSize(3 * GB));
objs.add(blobWithSize(4 * GB));
objs.add(blobWithSize(400));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x04, // Object count
0x18, // Unsigned 3 byte positions
0x00, 0x00, 0x00, 0x04, // 4 positions
0x00, 0x00, 0x00, // positions
0x00, 0x00, 0x01, //
0x00, 0x00, 0x02, //
0x00, 0x00, 0x03, //
0x00, // No more positions
0x00, 0x00, 0x00, 0x64, // size 100
(byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xff, // -1 (3GB)
(byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xfe, // -2 (4GB)
0x00, 0x00, 0x01, (byte) 0x90, // size 400
0x00, 0x00, 0x00, (byte) 0x02, // 64bit sizes counter (2)
0x00, 0x00, 0x00, 0x00, // size 3Gb
(byte) 0xc0, 0x00, 0x00, 0x00, //
0x00, 0x00, 0x00, 0x01, // size 4GB
(byte) 0x00, 0x00, 0x00, 0x00, //
0x00, 0x00, 0x00, 0x00 // 128bit sizes counter
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_allObjectsTooSmall() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 1 << 10);
List<PackedObjectInfo> objs = new ArrayList<>();
// too small blobs
objs.add(blobWithSize(100));
objs.add(blobWithSize(200));
objs.add(blobWithSize(400));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x04, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x00, // Object count
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_onlyBlobsIndexed() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(objInfo(Constants.OBJ_COMMIT, 1000));
objs.add(blobWithSize(100));
objs.add(objInfo(Constants.OBJ_TAG, 1000));
objs.add(blobWithSize(400));
objs.add(blobWithSize(200));
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x03, // Object count
0x18, // Positions in 3 bytes
0x00, 0x00, 0x00, 0x03, // 3 entries
0x00, 0x00, 0x01, // positions
0x00, 0x00, 0x03, //
0x00, 0x00, 0x04, //
0x00, // No more positions
0x00, 0x00, 0x00, 0x64, // size 100
0x00, 0x00, 0x01, (byte) 0x90, // size 400
0x00, 0x00, 0x00, (byte) 0xc8, // size 200
0x00, 0x00, 0x00, 0x00 // 64bit sizes counter
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void write_noObjects() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
writer.write(objs);
byte[] expected = new byte[] { -1, 's', 'i', 'z', // header
0x01, // version
0x00, 0x00, 0x00, 0x00, // minimum object size
0x00, 0x00, 0x00, 0x00, // Object count
};
byte[] output = out.toByteArray();
assertArrayEquals(expected, output);
}
@Test
public void read_empty() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
assertEquals(-1, index.getSize(0));
assertEquals(-1, index.getSize(1));
assertEquals(-1, index.getSize(1 << 30));
assertEquals(0, index.getThreshold());
}
@Test
public void read_only24bitsPositions() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(blobWithSize(100));
objs.add(blobWithSize(200));
objs.add(blobWithSize(400));
objs.add(blobWithSize(1500));
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
assertEquals(100, index.getSize(0));
assertEquals(200, index.getSize(1));
assertEquals(400, index.getSize(2));
assertEquals(1500, index.getSize(3));
assertEquals(0, index.getThreshold());
}
@Test
public void read_only32bitsPositions() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 2000);
List<PackedObjectInfo> objs = new ArrayList<>();
PackedObjectInfo smallObj = blobWithSize(100);
for (int i = 0; i <= MAX_24BITS_UINT; i++) {
objs.add(smallObj);
}
objs.add(blobWithSize(1000));
objs.add(blobWithSize(3000));
objs.add(blobWithSize(2500));
objs.add(blobWithSize(1000));
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
assertEquals(-1, index.getSize(5));
assertEquals(-1, index.getSize(MAX_24BITS_UINT+1));
assertEquals(3000, index.getSize(MAX_24BITS_UINT+2));
assertEquals(2500, index.getSize(MAX_24BITS_UINT+3));
assertEquals(-1, index.getSize(MAX_24BITS_UINT+4)); // Not indexed
assertEquals(2000, index.getThreshold());
}
@Test
public void read_24and32BitsPositions() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 2000);
List<PackedObjectInfo> objs = new ArrayList<>();
PackedObjectInfo smallObj = blobWithSize(100);
for (int i = 0; i <= MAX_24BITS_UINT; i++) {
if (i == 500 || i == 1000 || i == 1500) {
objs.add(blobWithSize(2500));
continue;
}
objs.add(smallObj);
}
objs.add(blobWithSize(3000));
objs.add(blobWithSize(1000));
objs.add(blobWithSize(2500));
objs.add(blobWithSize(1000));
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
// 24 bit positions
assertEquals(-1, index.getSize(5));
assertEquals(2500, index.getSize(500));
// 32 bit positions
assertEquals(3000, index.getSize(MAX_24BITS_UINT+1));
assertEquals(-1, index.getSize(MAX_24BITS_UINT+2));
assertEquals(2500, index.getSize(MAX_24BITS_UINT+3));
assertEquals(-1, index.getSize(MAX_24BITS_UINT+4)); // Not indexed
assertEquals(2000, index.getThreshold());
}
@Test
public void read_only64bits() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, 0);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(blobWithSize(3 * GB));
objs.add(blobWithSize(8 * GB));
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
assertEquals(3 * GB, index.getSize(0));
assertEquals(8 * GB, index.getSize(1));
assertEquals(0, index.getThreshold());
}
@Test
public void read_withMinSize() throws IOException {
int minSize = 1000;
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter
.createWriter(out, minSize);
List<PackedObjectInfo> objs = new ArrayList<>();
objs.add(blobWithSize(3 * GB));
objs.add(blobWithSize(1500));
objs.add(blobWithSize(500));
objs.add(blobWithSize(1000));
objs.add(blobWithSize(2000));
writer.write(objs);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in);
assertEquals(3 * GB, index.getSize(0));
assertEquals(1500, index.getSize(1));
assertEquals(-1, index.getSize(2));
assertEquals(1000, index.getSize(3));
assertEquals(2000, index.getSize(4));
assertEquals(minSize, index.getThreshold());
}
private static PackedObjectInfo blobWithSize(long size) {
return objInfo(Constants.OBJ_BLOB, size);
}
private static PackedObjectInfo objInfo(int type, long size) {
PackedObjectInfo objectInfo = new PackedObjectInfo(OBJ_ID);
objectInfo.setType(type);
objectInfo.setFullSize(size);
return objectInfo;
}
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (C) 2022, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
/**
* Index of object sizes in a pack
*
* It is not guaranteed that the implementation contains the sizes of all
* objects (e.g. it could store only objects over certain threshold).
*/
public interface PackObjectSizeIndex {
/**
* Returns the inflated size of the object.
*
* @param idxOffset
* position in the pack (as returned from PackIndex)
* @return size of the object, -1 if not found in the index.
*/
long getSize(int idxOffset);
/**
* Number of objects in the index
*
* @return number of objects in the index
*/
long getObjectCount();
/**
* Minimal size of an object to be included in this index
*
* Cut-off value used at generation time to decide what objects to index.
*
* @return size in bytes
*/
int getThreshold();
}

View File

@ -0,0 +1,43 @@
/*
* Copyright (C) 2022, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
/**
* Chooses the specific implementation of the object-size index based on the
* file version.
*/
public class PackObjectSizeIndexLoader {
/**
* Read an object size index from the stream
*
* @param in
* input stream at the beginning of the object size data
* @return an implementation of the object size index
* @throws IOException
* error reading the streams
*/
public static PackObjectSizeIndex load(InputStream in) throws IOException {
byte[] header = in.readNBytes(4);
if (!Arrays.equals(header, PackObjectSizeIndexWriter.HEADER)) {
throw new IOException("Stream is not an object index"); //$NON-NLS-1$
}
int version = in.readNBytes(1)[0];
if (version != 1) {
throw new IOException("Unknown object size version: " + version); //$NON-NLS-1$
}
return PackObjectSizeIndexV1.parse(in);
}
}

View File

@ -0,0 +1,224 @@
/*
* Copyright (C) 2022, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import org.eclipse.jgit.util.NB;
/**
* Memory representation of the object-size index
*
* The object size index is a map from position in the primary idx (i.e.
* position of the object-id in lexicographical order) to size.
*
* Most of the positions fit in unsigned 3 bytes (up to 16 million)
*/
class PackObjectSizeIndexV1 implements PackObjectSizeIndex {
private static final byte BITS_24 = 0x18;
private static final byte BITS_32 = 0x20;
private final int threshold;
private final UInt24Array positions24;
private final int[] positions32;
/**
* Parallel array to concat(positions24, positions32) with the size of the
* objects.
*
* A value >= 0 is the size of the object. A negative value means the size
* doesn't fit in an int and |value|-1 is the position for the size in the
* size64 array e.g. a value of -1 is sizes64[0], -2 = sizes64[1], ...
*/
private final int[] sizes32;
private final long[] sizes64;
static PackObjectSizeIndex parse(InputStream in) throws IOException {
/** Header and version already out of the input */
IndexInputStreamReader stream = new IndexInputStreamReader(in);
int threshold = stream.readInt(); // minSize
int objCount = stream.readInt();
if (objCount == 0) {
return new EmptyPackObjectSizeIndex(threshold);
}
return new PackObjectSizeIndexV1(stream, threshold, objCount);
}
private PackObjectSizeIndexV1(IndexInputStreamReader stream, int threshold,
int objCount) throws IOException {
this.threshold = threshold;
UInt24Array pos24 = null;
int[] pos32 = null;
byte positionEncoding;
while ((positionEncoding = stream.readByte()) != 0) {
if (Byte.compareUnsigned(positionEncoding, BITS_24) == 0) {
int sz = stream.readInt();
pos24 = new UInt24Array(stream.readNBytes(sz * 3));
} else if (Byte.compareUnsigned(positionEncoding, BITS_32) == 0) {
int sz = stream.readInt();
pos32 = stream.readIntArray(sz);
} else {
throw new UnsupportedEncodingException(
String.format("Unknown position encoding %s",
Integer.toHexString(positionEncoding)));
}
}
positions24 = pos24 != null ? pos24 : UInt24Array.EMPTY;
positions32 = pos32 != null ? pos32 : new int[0];
sizes32 = stream.readIntArray(objCount);
int c64sizes = stream.readInt();
if (c64sizes == 0) {
sizes64 = new long[0];
return;
}
sizes64 = stream.readLongArray(c64sizes);
int c128sizes = stream.readInt();
if (c128sizes != 0) {
// this MUST be 0 (we don't support 128 bits sizes yet)
throw new IOException("Unsupported sizes in object-size-index");
}
}
@Override
public long getSize(int idxOffset) {
int pos = -1;
if (!positions24.isEmpty() && idxOffset <= positions24.getLastValue()) {
pos = positions24.binarySearch(idxOffset);
} else if (positions32.length > 0 && idxOffset >= positions32[0]) {
int pos32 = Arrays.binarySearch(positions32, idxOffset);
if (pos32 >= 0) {
pos = pos32 + positions24.size();
}
}
if (pos < 0) {
return -1;
}
int objSize = sizes32[pos];
if (objSize < 0) {
int secondPos = Math.abs(objSize) - 1;
return sizes64[secondPos];
}
return objSize;
}
@Override
public long getObjectCount() {
return positions24.size() + positions32.length;
}
@Override
public int getThreshold() {
return threshold;
}
/**
* Wrapper to read parsed content from the byte stream
*/
private static class IndexInputStreamReader {
private final byte[] buffer = new byte[8];
private final InputStream in;
IndexInputStreamReader(InputStream in) {
this.in = in;
}
int readInt() throws IOException {
int n = in.readNBytes(buffer, 0, 4);
if (n < 4) {
throw new IOException(
"Unable to read a full int from the stream");
}
return NB.decodeInt32(buffer, 0);
}
int[] readIntArray(int intsCount) throws IOException {
if (intsCount == 0) {
return new int[0];
}
int[] dest = new int[intsCount];
for (int i = 0; i < intsCount; i++) {
dest[i] = readInt();
}
return dest;
}
long readLong() throws IOException {
int n = in.readNBytes(buffer, 0, 8);
if (n < 8) {
throw new IOException(
"Unable to read a full int from the stream");
}
return NB.decodeInt64(buffer, 0);
}
long[] readLongArray(int longsCount) throws IOException {
if (longsCount == 0) {
return new long[0];
}
long[] dest = new long[longsCount];
for (int i = 0; i < longsCount; i++) {
dest[i] = readLong();
}
return dest;
}
byte readByte() throws IOException {
int n = in.readNBytes(buffer, 0, 1);
if (n != 1) {
throw new IOException("Cannot read byte from stream");
}
return buffer[0];
}
byte[] readNBytes(int sz) throws IOException {
return in.readNBytes(sz);
}
}
private static class EmptyPackObjectSizeIndex
implements PackObjectSizeIndex {
private final int threshold;
EmptyPackObjectSizeIndex(int threshold) {
this.threshold = threshold;
}
@Override
public long getSize(int idxOffset) {
return -1;
}
@Override
public long getObjectCount() {
return 0;
}
@Override
public int getThreshold() {
return threshold;
}
}
}

View File

@ -0,0 +1,285 @@
/*
* Copyright (C) 2022, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.transport.PackedObjectInfo;
import org.eclipse.jgit.util.NB;
/**
* Write an object index in the output stream
*/
public abstract class PackObjectSizeIndexWriter {
private static final int MAX_24BITS_UINT = 0xffffff;
private static final PackObjectSizeIndexWriter NULL_WRITER = new PackObjectSizeIndexWriter() {
@Override
public void write(List<? extends PackedObjectInfo> objs) {
// Do nothing
}
};
/** Magic constant for the object size index file */
protected static final byte[] HEADER = { -1, 's', 'i', 'z' };
/**
* Returns a writer for the latest index version
*
* @param os
* Output stream where to write the index
* @param minSize
* objects strictly smaller than this size won't be added to the
* index. Negative size won't write AT ALL. Other sizes could write
* an empty index.
* @return the index writer
*/
public static PackObjectSizeIndexWriter createWriter(OutputStream os,
int minSize) {
if (minSize < 0) {
return NULL_WRITER;
}
return new PackObjectSizeWriterV1(os, minSize);
}
/**
* Add the objects to the index
*
* @param objs
* objects in the pack, in sha1 order. Their position in the list
* matches their position in the primary index.
* @throws IOException
* problem writing to the stream
*/
public abstract void write(List<? extends PackedObjectInfo> objs)
throws IOException;
/**
* Object size index v1.
*
* Store position (in the main index) to size as parallel arrays.
*
* <p>Positions in the main index fit well in unsigned 24 bits (16M) for most
* repositories, but some outliers have even more objects, so we need to
* store also 32 bits positions.
*
* <p>Sizes are stored as a first array parallel to positions. If a size
* doesn't fit in an element of that array, then we encode there a position
* on the next-size array. This "overflow" array doesn't have entries for
* all positions.
*
* <pre>
*
* positions [10, 500, 1000, 1001]
* sizes (32bits) [15MB, -1, 6MB, -2]
* ___/ ______/
* / /
* sizes (64 bits) [3GB, 6GB]
* </pre>
*
* <p>For sizes we use 32 bits as the first level and 64 for the rare objects
* over 2GB.
*
* <p>A 24/32/64 bits hierarchy of arrays saves space if we have a lot of small
* objects, but wastes space if we have only big ones. The min size to index is
* controlled by conf and in principle we want to index only rather
* big objects (e.g. > 10MB). We could support more dynamics read/write of sizes
* (e.g. 24 only if the threshold will include many of those objects) but it
* complicates a lot code and spec. If needed it could go for a v2 of the protocol.
*
* <p>Format:
*
* <li>A header with the magic number (4 bytes)
* <li>The index version (1 byte)
* <li>The minimum object size (4 bytes)
* <li>Total count of objects indexed (C, 4 bytes)
* (if count == 0, stop here)
*
* Blocks of
* <li>Size per entry in bits (1 byte, either 24 (0x18) or 32 (0x20))
* <li>Count of entries (4 bytes) (c, as a signed int)
* <li>positions encoded in s bytes each (i.e s*c bytes)
*
* <li>0 (as a "size-per-entry = 0", marking end of the section)
*
* <li>32 bit sizes (C * 4 bytes). Negative size means
* nextLevel[abs(size)-1]
* <li>Count of 64 bit sizes (s64) (or 0 if no more indirections)
* <li>64 bit sizes (s64 * 8 bytes)
* <li>0 (end)
*/
static class PackObjectSizeWriterV1 extends PackObjectSizeIndexWriter {
private final OutputStream os;
private final int minObjSize;
private final byte[] intBuffer = new byte[4];
PackObjectSizeWriterV1(OutputStream os, int minSize) {
this.os = new BufferedOutputStream(os);
this.minObjSize = minSize;
}
@Override
public void write(List<? extends PackedObjectInfo> allObjects)
throws IOException {
os.write(HEADER);
writeUInt8(1); // Version
writeInt32(minObjSize);
PackedObjectStats stats = countIndexableObjects(allObjects);
int[] indexablePositions = findIndexablePositions(allObjects,
stats.indexableObjs);
writeInt32(indexablePositions.length); // Total # of objects
if (indexablePositions.length == 0) {
os.flush();
return;
}
// Positions that fit in 3 bytes
if (stats.pos24Bits > 0) {
writeUInt8(24);
writeInt32(stats.pos24Bits);
applyToRange(indexablePositions, 0, stats.pos24Bits,
this::writeInt24);
}
// Positions that fit in 4 bytes
// We only use 31 bits due to sign,
// but that covers 2 billion objs
if (stats.pos31Bits > 0) {
writeUInt8(32);
writeInt32(stats.pos31Bits);
applyToRange(indexablePositions, stats.pos24Bits,
stats.pos24Bits + stats.pos31Bits, this::writeInt32);
}
writeUInt8(0);
writeSizes(allObjects, indexablePositions, stats.sizeOver2GB);
os.flush();
}
private void writeUInt8(int i) throws IOException {
if (i > 255) {
throw new IllegalStateException(
"Number doesn't fit in a single byte");
}
NB.encodeInt32(intBuffer, 0, i);
os.write(intBuffer, 3, 1);
}
private void writeInt24(int i) throws IOException {
NB.encodeInt24(intBuffer, 1, i);
os.write(intBuffer, 1, 3);
}
private void writeInt32(int i) throws IOException {
NB.encodeInt32(intBuffer, 0, i);
os.write(intBuffer);
}
private void writeSizes(List<? extends PackedObjectInfo> allObjects,
int[] indexablePositions, int objsBiggerThan2Gb)
throws IOException {
if (indexablePositions.length == 0) {
writeInt32(0);
return;
}
byte[] sizes64bits = new byte[8 * objsBiggerThan2Gb];
int s64 = 0;
for (int i = 0; i < indexablePositions.length; i++) {
PackedObjectInfo info = allObjects.get(indexablePositions[i]);
if (info.getFullSize() < Integer.MAX_VALUE) {
writeInt32((int) info.getFullSize());
} else {
// Size needs more than 32 bits. Store -1 * offset in the
// next table as size.
writeInt32(-1 * (s64 + 1));
NB.encodeInt64(sizes64bits, s64 * 8, info.getFullSize());
s64++;
}
}
if (objsBiggerThan2Gb > 0) {
writeInt32(objsBiggerThan2Gb);
os.write(sizes64bits);
}
writeInt32(0);
}
private int[] findIndexablePositions(
List<? extends PackedObjectInfo> allObjects,
int indexableObjs) {
int[] positions = new int[indexableObjs];
int positionIdx = 0;
for (int i = 0; i < allObjects.size(); i++) {
PackedObjectInfo o = allObjects.get(i);
if (!shouldIndex(o)) {
continue;
}
positions[positionIdx++] = i;
}
return positions;
}
private PackedObjectStats countIndexableObjects(
List<? extends PackedObjectInfo> objs) {
PackedObjectStats stats = new PackedObjectStats();
for (int i = 0; i < objs.size(); i++) {
PackedObjectInfo o = objs.get(i);
if (!shouldIndex(o)) {
continue;
}
stats.indexableObjs++;
if (o.getFullSize() > Integer.MAX_VALUE) {
stats.sizeOver2GB++;
}
if (i <= MAX_24BITS_UINT) {
stats.pos24Bits++;
} else {
stats.pos31Bits++;
// i is a positive int, cannot be bigger than this
}
}
return stats;
}
private boolean shouldIndex(PackedObjectInfo o) {
return (o.getType() == Constants.OBJ_BLOB)
&& (o.getFullSize() >= minObjSize);
}
private static class PackedObjectStats {
int indexableObjs;
int pos24Bits;
int pos31Bits;
int sizeOver2GB;
}
@FunctionalInterface
interface IntEncoder {
void encode(int i) throws IOException;
}
private static void applyToRange(int[] allPositions, int start, int end,
IntEncoder encoder) throws IOException {
for (int i = start; i < end; i++) {
encoder.encode(allPositions[i]);
}
}
}
}