PackReverseIndexWriter: write out version 1 reverse index file

The reverse index for a pack is used to quickly find an object's
position in the pack's forward index based on that object's pack offset.
It is currently computed from the forward index by sorting the index
entries by the corresponding pack offset. This computation uses
bucket sort with insertion sort, which has an average runtime of
O(n log n) and worst case runtime of O(n^2); and memory usage of
3*size(int)*n because it maintains 3 int arrays, even after sorting is
completed. The computation must be performed every time that the reverse
index object is created in memory.

In contrast, Cgit persists a pack reverse index file to avoid
recomputing the reverse index ordering every time that it is needed.
Instead they write a file with format
https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format
which can later be read and parsed into an in-memory reverse index each
time it is needed.

Introduce these reverse index files to JGit. PackReverseIndexWriter
writes out a reverse index file to be read later when needed. Subclass
PackReverseIndexWriterV1 writes a file with the official version 1
format.

To avoid temporarily allocating an Integer collection while sorting and
writing out the contents, using memory 4*size(Integer)*n, use an
IntList and its #sort method, which uses quicksort.

Change-Id: I6437745777a16f723e2f1cfcce4e0d94e599dcee
Signed-off-by: Anna Papitto <annapapitto@google.com>
This commit is contained in:
Anna Papitto 2023-04-27 11:01:30 -07:00
parent 7d3f893d31
commit 64615b44e6
6 changed files with 391 additions and 0 deletions

View File

@ -0,0 +1,43 @@
/*
* Copyright (C) 2023, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import org.junit.Test;
public class PackReverseIndexWriterTest {
@Test
public void createWriter_defaultVersion() {
PackReverseIndexWriter version1 = PackReverseIndexWriter
.createWriter(new ByteArrayOutputStream());
assertTrue(version1 instanceof PackReverseIndexWriterV1);
}
@Test
public void createWriter_version1() {
PackReverseIndexWriter version1 = PackReverseIndexWriter
.createWriter(new ByteArrayOutputStream(), 1);
assertTrue(version1 instanceof PackReverseIndexWriterV1);
}
@Test
public void createWriter_unsupportedVersion() {
assertThrows(IllegalArgumentException.class,
() -> PackReverseIndexWriter
.createWriter(new ByteArrayOutputStream(), 2));
}
}

View File

@ -0,0 +1,122 @@
/*
* Copyright (C) 2023, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import static org.eclipse.jgit.lib.Constants.OBJECT_ID_STRING_LENGTH;
import static org.eclipse.jgit.lib.Constants.OBJ_BLOB;
import static org.eclipse.jgit.lib.Constants.OBJ_COMMIT;
import static org.eclipse.jgit.lib.Constants.OBJ_TREE;
import static org.junit.Assert.assertArrayEquals;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.transport.PackedObjectInfo;
import org.junit.Test;
public class PackReverseIndexWriterV1Test {
private static byte[] PACK_CHECKSUM = new byte[] { 'P', 'A', 'C', 'K', 'C',
'H', 'E', 'C', 'K', 'S', 'U', 'M', '3', '4', '5', '6', '7', '8',
'9', '0', };
@Test
public void write_noObjects() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out,
1);
List<PackedObjectInfo> objectsSortedByName = new ArrayList<>();
writer.write(objectsSortedByName, PACK_CHECKSUM);
byte[] expected = new byte[] { 'R', 'I', 'D', 'X', // magic
0x00, 0x00, 0x00, 0x01, // file version
0x00, 0x00, 0x00, 0x01, // oid version
// pack checksum to copy into at byte 12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// checksum
(byte) 0xd1, 0x1d, 0x17, (byte) 0xd5, (byte) 0xa1, 0x5c,
(byte) 0x8f, 0x45, 0x7e, 0x06, (byte) 0x91, (byte) 0xf2, 0x7e,
0x20, 0x35, 0x2c, (byte) 0xdc, 0x4c, 0x46, (byte) 0xe4, };
System.arraycopy(PACK_CHECKSUM, 0, expected, 12, PACK_CHECKSUM.length);
assertArrayEquals(expected, out.toByteArray());
}
@Test
public void write_oneObject() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out,
1);
PackedObjectInfo a = objectInfo("a", OBJ_COMMIT, 0);
List<PackedObjectInfo> objectsSortedByName = List.of(a);
writer.write(objectsSortedByName, PACK_CHECKSUM);
byte[] expected = new byte[] { 'R', 'I', 'D', 'X', // magic
0x00, 0x00, 0x00, 0x01, // file version
0x00, 0x00, 0x00, 0x01, // oid version
0x00, 0x00, 0x00, 0x00, // 0: "a" -> 0
// pack checksum to copy into at byte 16
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// checksum
0x48, 0x04, 0x29, 0x69, 0x2f, (byte) 0xf3, (byte) 0x80,
(byte) 0xa1, (byte) 0xf5, (byte) 0x92, (byte) 0xc2, 0x21, 0x46,
(byte) 0xd5, 0x08, 0x44, (byte) 0xa4, (byte) 0xf4, (byte) 0xc0,
(byte) 0xec, };
System.arraycopy(PACK_CHECKSUM, 0, expected, 16, PACK_CHECKSUM.length);
assertArrayEquals(expected, out.toByteArray());
}
@Test
public void write_multipleObjects() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out,
1);
PackedObjectInfo a = objectInfo("a", OBJ_BLOB, 20);
PackedObjectInfo b = objectInfo("b", OBJ_COMMIT, 0);
PackedObjectInfo c = objectInfo("c", OBJ_COMMIT, 52);
PackedObjectInfo d = objectInfo("d", OBJ_TREE, 7);
PackedObjectInfo e = objectInfo("e", OBJ_COMMIT, 38);
List<PackedObjectInfo> objectsSortedByName = List.of(a, b, c, d, e);
writer.write(objectsSortedByName, PACK_CHECKSUM);
byte[] expected = new byte[] { 'R', 'I', 'D', 'X', // magic
0x00, 0x00, 0x00, 0x01, // file version
0x00, 0x00, 0x00, 0x01, // oid version
0x00, 0x00, 0x00, 0x01, // offset 0: "b" -> index @ 1
0x00, 0x00, 0x00, 0x03, // offset 7: "d" -> index @ 3
0x00, 0x00, 0x00, 0x00, // offset 20: "a" -> index @ 0
0x00, 0x00, 0x00, 0x04, // offset 38: "e" -> index @ 4
0x00, 0x00, 0x00, 0x02, // offset 52: "c" -> index @ 2
// pack checksum to copy into at byte 32
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// checksum
(byte) 0xe3, 0x1c, (byte) 0xd3, (byte) 0xeb, (byte) 0x83, 0x47,
(byte) 0x80, (byte) 0xb1, (byte) 0xe0, 0x3c, 0x2b, 0x25,
(byte) 0xd6, 0x3e, (byte) 0xdc, (byte) 0xde, (byte) 0xbe, 0x4b,
0x0a, (byte) 0xe2, };
System.arraycopy(PACK_CHECKSUM, 0, expected, 32, PACK_CHECKSUM.length);
assertArrayEquals(expected, out.toByteArray());
}
private static PackedObjectInfo objectInfo(String objectId, int type,
long offset) {
assert (objectId.length() == 1);
PackedObjectInfo objectInfo = new PackedObjectInfo(
ObjectId.fromString(objectId.repeat(OBJECT_ID_STRING_LENGTH)));
objectInfo.setType(type);
objectInfo.setOffset(offset);
return objectInfo;
}
}

View File

@ -839,6 +839,7 @@ unsupportedGC=Unsupported garbage collector for repository type: {0}
unsupportedMark=Mark not supported
unsupportedOperationNotAddAtEnd=Not add-at-end: {0}
unsupportedPackIndexVersion=Unsupported pack index version {0}
unsupportedPackReverseIndexVersion=Unsupported pack reverse index version {0}
unsupportedPackVersion=Unsupported pack version {0}.
unsupportedReftableVersion=Unsupported reftable version {0}.
unsupportedRepositoryDescription=Repository description not supported

View File

@ -868,6 +868,7 @@ public static JGitText get() {
/***/ public String unsupportedMark;
/***/ public String unsupportedOperationNotAddAtEnd;
/***/ public String unsupportedPackIndexVersion;
/***/ public String unsupportedPackReverseIndexVersion;
/***/ public String unsupportedPackVersion;
/***/ public String unsupportedReftableVersion;
/***/ public String unsupportedRepositoryDescription;

View File

@ -0,0 +1,149 @@
/*
* Copyright (C) 2023, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import java.io.BufferedOutputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
import java.security.DigestOutputStream;
import java.text.MessageFormat;
import java.util.List;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.transport.PackedObjectInfo;
/**
* Writes reverse index files conforming to the requested version.
* <p>
* The reverse index file format is specified at
* https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format.
*/
public abstract class PackReverseIndexWriter {
/**
* Magic bytes that uniquely identify git reverse index files.
*/
protected static byte[] MAGIC = { 'R', 'I', 'D', 'X' };
/**
* The first reverse index file version.
*/
protected static final int VERSION_1 = 1;
/**
* Stream to write contents to while maintaining a checksum.
*/
protected final DigestOutputStream out;
/**
* Stream to write primitive type contents to while maintaining a checksum.
*/
protected final DataOutput dataOutput;
private static final int DEFAULT_VERSION = VERSION_1;
/**
* Construct the components of a PackReverseIndexWriter that are shared
* between subclasses.
*
* @param dst
* the OutputStream that the instance will write contents to
*/
protected PackReverseIndexWriter(OutputStream dst) {
out = new DigestOutputStream(
dst instanceof BufferedOutputStream ? dst
: new BufferedOutputStream(dst),
Constants.newMessageDigest());
dataOutput = new SimpleDataOutput(out);
}
/**
* Create a writer instance for the default file format version.
*
* @param dst
* the OutputStream that contents will be written to
* @return the new writer instance
*/
public static PackReverseIndexWriter createWriter(OutputStream dst) {
return createWriter(dst, DEFAULT_VERSION);
}
/**
* Create a writer instance for the specified file format version.
*
* @param dst
* the OutputStream that contents will be written to
* @param version
* the reverse index format version to write contents as
* @return the new writer instance
*/
public static PackReverseIndexWriter createWriter(OutputStream dst,
int version) {
if (version == VERSION_1) {
return new PackReverseIndexWriterV1(dst);
}
throw new IllegalArgumentException(MessageFormat.format(
JGitText.get().unsupportedPackReverseIndexVersion,
Integer.toString(version)));
}
/**
* Write the contents of a reverse index file for the given objects.
*
* @param objectsByIndexPos
* the objects whose forward index file positions should be
* written, sorted by forward index file position (currently SHA1
* ordering)
* @param packChecksum
* the checksum of the corresponding pack file
* @throws IOException
* if writing the output fails
*/
public void write(
List<? extends PackedObjectInfo> objectsByIndexPos,
byte[] packChecksum) throws IOException {
writeHeader();
writeBody(objectsByIndexPos);
writeFooter(packChecksum);
out.flush();
}
/**
* Write the header of a reverse index file, usually the magic bytes and the
* file format version.
*
* @throws IOException
* if writing the output fails
*/
protected abstract void writeHeader() throws IOException;
/**
* Write the body of a reverse index file, usually the forward index
* positions of the given objects, sorted by those objects' pack file
* offsets.
*
* @param objectsSortedByIndexPosition
* the objects whose forward index file positions should be
* written, sorted by forward index file position; not modified
* during method
* @throws IOException
* if writing the output fails
*/
protected abstract void writeBody(
List<? extends PackedObjectInfo> objectsSortedByIndexPosition)
throws IOException;
private void writeFooter(byte[] packChecksum) throws IOException {
out.write(packChecksum);
byte[] selfChecksum = out.getMessageDigest().digest();
out.write(selfChecksum);
}
}

View File

@ -0,0 +1,75 @@
/*
* Copyright (C) 2023, Google LLC and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.internal.storage.file;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.eclipse.jgit.transport.PackedObjectInfo;
import org.eclipse.jgit.util.IntList;
import org.eclipse.jgit.util.IntList.IntComparator;
/**
* Writes reverse index files following the version 1 format.
* <p>
* The file format is specified at
* https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format.
*/
final class PackReverseIndexWriterV1 extends PackReverseIndexWriter {
private static final int OID_VERSION_SHA1 = 1;
private static final int DEFAULT_OID_VERSION = OID_VERSION_SHA1;
PackReverseIndexWriterV1(final OutputStream dst) {
super(dst);
}
@Override
protected void writeHeader() throws IOException {
out.write(MAGIC);
dataOutput.writeInt(VERSION_1);
dataOutput.writeInt(DEFAULT_OID_VERSION);
}
@Override
protected void writeBody(List<? extends PackedObjectInfo> objectsByIndexPos)
throws IOException {
IntList positionsByOffset = IntList.filledWithRange(0,
objectsByIndexPos.size());
positionsByOffset
.sort(new IndexPositionsByOffsetComparator(objectsByIndexPos));
for (int i = 0; i < positionsByOffset.size(); i++) {
int indexPosition = positionsByOffset.get(i);
dataOutput.writeInt(indexPosition);
}
}
private static class IndexPositionsByOffsetComparator
implements IntComparator {
private List<? extends PackedObjectInfo> objectsByIndexPos;
private IndexPositionsByOffsetComparator(
List<? extends PackedObjectInfo> objectsByIndexPos) {
this.objectsByIndexPos = objectsByIndexPos;
}
@Override
public int compare(int firstIndexPosition, int secondIndexPosition) {
return Long.compare(getOffset(firstIndexPosition),
getOffset(secondIndexPosition));
}
private long getOffset(int indexPosition) {
return objectsByIndexPos.get(indexPosition).getOffset();
}
}
}