From 7d2669587f475f3fba1acce9df257aed3c0714cd Mon Sep 17 00:00:00 2001 From: Anna Papitto Date: Fri, 14 Jul 2023 12:19:27 -0700 Subject: [PATCH 1/2] ComputedPackReverseIndex: Clarify custom bucket sort algorithm The ComputedPackReverseIndex uses a custom sorting algorithm, based on bucket sort with insertion sort but with the data managed as a linked list across two int arrays. This custom algorithm relies on the set of values being sorted being exactly 0, ..., n-1; so that they can serve a second purpose of being indexes into a second equally sized list. This custom algorithm was introduced ~10 years ago in https://eclipse.googlesource.com/jgit/jgit/+/6cc532a43cf28403cb623d3df8600a2542a40a43. The original author is no longer an active contributor, so it is valuable for the code to be readable, especially as there is currently active work on reverse indexes. Rename variables and add comments to clarify the algorithm and improve readability. There are no functional changes to the algorithm. Change-Id: Ic3b682203f20e06f9f865f81259e034230f9720a Signed-off-by: Anna Papitto --- .../file/PackReverseIndexComputed.java | 118 +++++++++++------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexComputed.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexComputed.java index 33bdb0974..d6eaa965d 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexComputed.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexComputed.java @@ -30,25 +30,30 @@ final class PackReverseIndexComputed implements PackReverseIndex { private final PackIndex index; /** - * The number of bytes per entry in the offsetIndex. + * The difference in offset between the start of an offset bucket and the + * start of its succeeding bucket. */ private final long bucketSize; /** - * An index into the nth mapping, where the value is the position after the - * the last index that contains the values of the bucket. For example given - * offset o (and bucket = o / bucketSize), the offset will be contained in - * the range nth[offsetIndex[bucket - 1]] inclusive to - * nth[offsetIndex[bucket]] exclusive. + * The indexes into indexPosInOffsetOrder at which the next bucket starts. + *

+ * For example, given offset o (and therefore bucket = o / bucketSize), the + * indexPos corresponding to o will be contained in the range + * indexPosInOffsetOrder[nextBucketStart[bucket - 1]] inclusive to + * indexPosInOffsetOrder[nextBucketStart[bucket]] exclusive. + *

+ * This range information can speed up #binarySearch by identifying the + * relevant bucket and only searching within its range. *

* See {@link #binarySearch} */ - private final int[] offsetIndex; + private final int[] nextBucketStart; /** * Mapping from indices in offset order to indices in SHA-1 order. */ - private final int[] nth; + private final int[] indexPosInOffsetOrder; /** * Create reverse index from straight/forward pack index, by indexing all @@ -60,62 +65,81 @@ final class PackReverseIndexComputed implements PackReverseIndex { PackReverseIndexComputed(PackIndex packIndex) { index = packIndex; - final long cnt = index.getObjectCount(); - if (cnt + 1 > Integer.MAX_VALUE) { + long rawCnt = index.getObjectCount(); + if (rawCnt + 1 > Integer.MAX_VALUE) { throw new IllegalArgumentException( JGitText.get().hugeIndexesAreNotSupportedByJgitYet); } + int cnt = (int) rawCnt; if (cnt == 0) { bucketSize = Long.MAX_VALUE; - offsetIndex = new int[1]; - nth = new int[0]; + nextBucketStart = new int[1]; + indexPosInOffsetOrder = new int[0]; return; } - final long[] offsetsBySha1 = new long[(int) cnt]; - + // Sort the index positions according to the corresponding pack offsets. + // Use bucket sort since the offsets are somewhat uniformly distributed + // over the range (0, pack size). + long[] offsetsInIndexOrder = new long[cnt]; long maxOffset = 0; - int ith = 0; - for (MutableEntry me : index) { - final long o = me.getOffset(); - offsetsBySha1[ith++] = o; - if (o > maxOffset) { - maxOffset = o; + int i = 0; + for (MutableEntry entry : index) { + long offset = entry.getOffset(); + offsetsInIndexOrder[i++] = offset; + if (offset > maxOffset) { + maxOffset = offset; } } bucketSize = maxOffset / cnt + 1; - int[] bucketIndex = new int[(int) cnt]; - int[] bucketValues = new int[(int) cnt + 1]; - for (int oi = 0; oi < offsetsBySha1.length; oi++) { - final long o = offsetsBySha1[oi]; - final int bucket = (int) (o / bucketSize); - final int bucketValuesPos = oi + 1; - final int current = bucketIndex[bucket]; - bucketIndex[bucket] = bucketValuesPos; - bucketValues[bucketValuesPos] = current; + // The values in each bucket, stored as a linked list. Given a bucket, + // headValues[bucket] contains the first value, + // furtherValues[headValues[bucket]] contains the second, + // furtherValues[furtherValues[headValues[bucket]]] the third, and so + // on. The linked list stops when a value is 0. The values themselves + // are shifted index positions. There won't be any + // collisions because every index position is unique. + int[] headValues = new int[cnt]; + int[] furtherValues = new int[cnt + 1]; + for (int indexPos = 0; indexPos < cnt; indexPos++) { + // The offset determines which bucket this index position falls + // into, since the goal is sort into offset order. + long offset = offsetsInIndexOrder[indexPos]; + int bucket = (int) (offset / bucketSize); + // Store the index positions as 1-indexed so that default + // initialized value 0 can be interpreted as the end of the bucket + // values. + int asBucketValue = indexPos + 1; + // If there is an existing value in this bucket, push the value to + // the front of the linked list. + int current = headValues[bucket]; + headValues[bucket] = asBucketValue; + furtherValues[asBucketValue] = current; } int nthByOffset = 0; - nth = new int[offsetsBySha1.length]; - offsetIndex = bucketIndex; // Reuse the allocation - for (int bi = 0; bi < bucketIndex.length; bi++) { - final int start = nthByOffset; + indexPosInOffsetOrder = new int[cnt]; + nextBucketStart = headValues; // Reuse the allocation + for (int bi = 0; bi < headValues.length; bi++) { // Insertion sort of the values in the bucket. - for (int vi = bucketIndex[bi]; vi > 0; vi = bucketValues[vi]) { - final int nthBySha1 = vi - 1; - final long o = offsetsBySha1[nthBySha1]; + int start = nthByOffset; + for (int vi = headValues[bi]; vi > 0; vi = furtherValues[vi]) { + int nthBySha1 = vi - 1; + long o = offsetsInIndexOrder[nthBySha1]; int insertion = nthByOffset++; for (; start < insertion; insertion--) { - if (o > offsetsBySha1[nth[insertion - 1]]) { + if (o > offsetsInIndexOrder[indexPosInOffsetOrder[insertion + - 1]]) { break; } - nth[insertion] = nth[insertion - 1]; + indexPosInOffsetOrder[insertion] = indexPosInOffsetOrder[insertion + - 1]; } - nth[insertion] = nthBySha1; + indexPosInOffsetOrder[insertion] = nthBySha1; } - offsetIndex[bi] = nthByOffset; + nextBucketStart[bi] = nthByOffset; } } @@ -125,7 +149,7 @@ public ObjectId findObject(long offset) { if (ith < 0) { return null; } - return index.getObjectId(nth[ith]); + return index.getObjectId(indexPosInOffsetOrder[ith]); } @Override @@ -138,10 +162,10 @@ public long findNextOffset(long offset, long maxOffset) Long.valueOf(offset))); } - if (ith + 1 == nth.length) { + if (ith + 1 == indexPosInOffsetOrder.length) { return maxOffset; } - return index.getOffset(nth[ith + 1]); + return index.getOffset(indexPosInOffsetOrder[ith + 1]); } @Override @@ -151,11 +175,11 @@ public int findPosition(long offset) { private int binarySearch(long offset) { int bucket = (int) (offset / bucketSize); - int low = bucket == 0 ? 0 : offsetIndex[bucket - 1]; - int high = offsetIndex[bucket]; + int low = bucket == 0 ? 0 : nextBucketStart[bucket - 1]; + int high = nextBucketStart[bucket]; while (low < high) { final int mid = (low + high) >>> 1; - final long o = index.getOffset(nth[mid]); + final long o = index.getOffset(indexPosInOffsetOrder[mid]); if (offset < o) { high = mid; } else if (offset == o) { @@ -169,6 +193,6 @@ private int binarySearch(long offset) { @Override public ObjectId findObjectByPosition(int nthPosition) { - return index.getObjectId(nth[nthPosition]); + return index.getObjectId(indexPosInOffsetOrder[nthPosition]); } } From 000e7caf5e35ac0fb357aa10f13329f03b4dbee0 Mon Sep 17 00:00:00 2001 From: Anna Papitto Date: Fri, 14 Jul 2023 12:19:27 -0700 Subject: [PATCH 2/2] PackReverseIndexV1: reverse index parsed from version 1 file The reverse index for a pack is used to quickly find an object's position in the pack's forward index based on that object's pack offset. It is currently computed from the forward index by sorting the index entries by the corresponding pack offset. This computation uses insertion sort, which has an average runtime of O(n^2). Cgit persists a pack reverse index file to avoid recomputing the reverse index ordering. Instead they write a file with format https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format which can later be read and parsed into the in-memory reverse index each time it is needed. PackReverseIndexV1 parses a reverse index file with the official version 1 format into an in-memory representation of the reverse index which implements methods to find an object's forward index position from its offset in logorithmic time. Change-Id: I60a92463fbd6a8cc9c1c7451df1c14d0a21a0f64 Signed-off-by: Anna Papitto --- .../storage/file/PackReverseIndexTest.java | 56 ++++ .../storage/file/PackReverseIndexV1Test.java | 246 ++++++++++++++++++ .../file/PackReverseIndexV1WriteReadTest.java | 119 +++++++++ .../eclipse/jgit/internal/JGitText.properties | 1 + .../org/eclipse/jgit/internal/JGitText.java | 1 + .../storage/file/PackReverseIndex.java | 10 + .../storage/file/PackReverseIndexFactory.java | 55 ++++ .../storage/file/PackReverseIndexV1.java | 182 +++++++++++++ .../storage/file/PackReverseIndexWriter.java | 12 +- .../file/PackReverseIndexWriterV1.java | 6 +- 10 files changed, 676 insertions(+), 12 deletions(-) create mode 100644 org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexTest.java create mode 100644 org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1Test.java create mode 100644 org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1WriteReadTest.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1.java diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexTest.java new file mode 100644 index 000000000..84fc58e05 --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexTest.java @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import static org.junit.Assert.assertThrows; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import org.junit.Test; + +public class PackReverseIndexTest { + @Test + public void read_badMagic() { + byte[] badMagic = new byte[] { 'R', 'B', 'A', 'D', // magic + 0x00, 0x00, 0x00, 0x01, // file version + 0x00, 0x00, 0x00, 0x01, // oid version + // pack checksum + 'P', 'A', 'C', 'K', 'C', 'H', 'E', 'C', 'K', 'S', 'U', 'M', '3', + '4', '5', '6', '7', '8', '9', '0', + // checksum + 0x66, 0x01, (byte) 0xbc, (byte) 0xe8, 0x51, 0x4b, 0x2f, + (byte) 0xa1, (byte) 0xa9, (byte) 0xcd, (byte) 0xbe, (byte) 0xd6, + 0x4f, (byte) 0xa8, 0x7d, (byte) 0xab, 0x50, (byte) 0xa3, + (byte) 0xf7, (byte) 0xcc, }; + ByteArrayInputStream in = new ByteArrayInputStream(badMagic); + + assertThrows(IOException.class, + () -> PackReverseIndexFactory.readFromFile(in, 0, () -> null)); + } + + @Test + public void read_unsupportedVersion2() { + byte[] version2 = new byte[] { 'R', 'I', 'D', 'X', // magic + 0x00, 0x00, 0x00, 0x02, // file version + 0x00, 0x00, 0x00, 0x01, // oid version + // pack checksum + 'P', 'A', 'C', 'K', 'C', 'H', 'E', 'C', 'K', 'S', 'U', 'M', '3', + '4', '5', '6', '7', '8', '9', '0', + // checksum + 0x70, 0x17, 0x10, 0x51, (byte) 0xfe, (byte) 0xab, (byte) 0x9b, + 0x68, (byte) 0xed, 0x3a, 0x3f, 0x27, 0x1d, (byte) 0xce, + (byte) 0xff, 0x38, 0x09, (byte) 0x9b, 0x29, 0x58, }; + ByteArrayInputStream in = new ByteArrayInputStream(version2); + + assertThrows(IOException.class, + () -> PackReverseIndexFactory.readFromFile(in, 0, () -> null)); + } +} diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1Test.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1Test.java new file mode 100644 index 000000000..b4849f99f --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1Test.java @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import static org.eclipse.jgit.lib.Constants.OBJ_BLOB; +import static org.eclipse.jgit.lib.Constants.OBJ_COMMIT; +import static org.eclipse.jgit.lib.Constants.OBJ_TREE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; + +import java.io.ByteArrayInputStream; +import java.io.IOException; + +import org.eclipse.jgit.errors.CorruptObjectException; +import org.eclipse.jgit.junit.JGitTestUtil; +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.transport.PackedObjectInfo; +import org.junit.Before; +import org.junit.Test; + +public class PackReverseIndexV1Test { + private static final byte[] FAKE_PACK_CHECKSUM = new byte[] { 'P', 'A', 'C', + 'K', 'C', 'H', 'E', 'C', 'K', 'S', 'U', 'M', '3', '4', '5', '6', + '7', '8', '9', '0', }; + + private static final byte[] NO_OBJECTS = new byte[] { 'R', 'I', 'D', 'X', // magic + 0x00, 0x00, 0x00, 0x01, // file version + 0x00, 0x00, 0x00, 0x01, // oid version + // pack checksum to copy into at byte 12 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // checksum + (byte) 0xd1, 0x1d, 0x17, (byte) 0xd5, (byte) 0xa1, 0x5c, + (byte) 0x8f, 0x45, 0x7e, 0x06, (byte) 0x91, (byte) 0xf2, 0x7e, 0x20, + 0x35, 0x2c, (byte) 0xdc, 0x4c, 0x46, (byte) 0xe4, }; + + private static final byte[] SMALL_PACK_CHECKSUM = new byte[] { (byte) 0xbb, + 0x1d, 0x25, 0x3d, (byte) 0xd3, (byte) 0xf0, 0x08, 0x75, (byte) 0xc8, + 0x04, (byte) 0xd0, 0x6f, 0x73, (byte) 0xe9, 0x00, (byte) 0x82, + (byte) 0xdb, 0x09, (byte) 0xc8, 0x13, }; + + private static final byte[] SMALL_CONTENTS = new byte[] { 'R', 'I', 'D', + 'X', // magic + 0x00, 0x00, 0x00, 0x01, // file version + 0x00, 0x00, 0x00, 0x01, // oid version + 0x00, 0x00, 0x00, 0x04, // offset 12: "68" -> index @ 4 + 0x00, 0x00, 0x00, 0x02, // offset 165: "5c" -> index @ 2 + 0x00, 0x00, 0x00, 0x03, // offset 257: "62" -> index @ 3 + 0x00, 0x00, 0x00, 0x01, // offset 450: "58" -> index @ 1 + 0x00, 0x00, 0x00, 0x05, // offset 556: "c5" -> index @ 5 + 0x00, 0x00, 0x00, 0x00, // offset 614: "2d" -> index @ 0 + // pack checksum to copy into at byte 36 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // checksum + (byte) 0xf0, 0x6d, 0x03, (byte) 0xd7, 0x6f, (byte) 0x9f, + (byte) 0xc1, 0x36, 0x26, (byte) 0xbc, (byte) 0xcb, 0x75, 0x36, + (byte) 0xa1, 0x26, 0x6a, 0x2b, (byte) 0x84, 0x16, (byte) 0x83, }; + + private PackReverseIndex emptyReverseIndex; + + /** + * Reverse index for the pack-cbdeda40019ae0e6e789088ea0f51f164f489d14.idx + * with contents `SHA-1 type size size-in-packfile offset-in-packfile` as + * shown by `verify-pack`: + * 2d04ee74dba30078c2dcdb713ddb8be4bc084d76 blob 8 17 614 + * 58728c938a9a8b9970cc09236caf94ada4689923 blob 140 106 450 + * 5ce00008cf3fb8f194f52742020bd40d78f3f1b3 commit 81 92 165 1 68cb1f232964f3cd698afc1dafe583937203c587 + * 62299a7ae290d685196e948a2fcb7d8c07f95c7d tree 198 193 257 + * 68cb1f232964f3cd698afc1dafe583937203c587 commit 220 153 12 + * c5ab27309491cf641eb11bb4b7a78641f280b482 tree 46 58 556 1 62299a7ae290d685196e948a2fcb7d8c07f95c7d + */ + private PackReverseIndex smallReverseIndex; + + private final PackedObjectInfo object614 = objectInfo( + "2d04ee74dba30078c2dcdb713ddb8be4bc084d76", OBJ_BLOB, 614); + + private final PackedObjectInfo object450 = objectInfo( + "58728c938a9a8b9970cc09236caf94ada4689923", OBJ_BLOB, 450); + + private final PackedObjectInfo object165 = objectInfo( + "5ce00008cf3fb8f194f52742020bd40d78f3f1b3", OBJ_COMMIT, 165); + + private final PackedObjectInfo object257 = objectInfo( + "62299a7ae290d685196e948a2fcb7d8c07f95c7d", OBJ_TREE, 257); + + private final PackedObjectInfo object12 = objectInfo( + "68cb1f232964f3cd698afc1dafe583937203c587", OBJ_COMMIT, 12); + + private final PackedObjectInfo object556 = objectInfo( + "c5ab27309491cf641eb11bb4b7a78641f280b482", OBJ_TREE, 556); + + // last object's offset + last object's length + private final long smallMaxOffset = 631; + + @Before + public void setUp() throws Exception { + System.arraycopy(SMALL_PACK_CHECKSUM, 0, SMALL_CONTENTS, 36, + SMALL_PACK_CHECKSUM.length); + ByteArrayInputStream smallIn = new ByteArrayInputStream(SMALL_CONTENTS); + smallReverseIndex = PackReverseIndexFactory.readFromFile(smallIn, 6, + () -> PackIndex.open(JGitTestUtil.getTestResourceFile( + "pack-cbdeda40019ae0e6e789088ea0f51f164f489d14.idx"))); + + System.arraycopy(FAKE_PACK_CHECKSUM, 0, NO_OBJECTS, 12, + FAKE_PACK_CHECKSUM.length); + ByteArrayInputStream emptyIn = new ByteArrayInputStream(NO_OBJECTS); + emptyReverseIndex = PackReverseIndexFactory.readFromFile(emptyIn, 0, + () -> null); + } + + @Test + public void read_unsupportedOidSHA256() { + byte[] version2 = new byte[] { 'R', 'I', 'D', 'X', // magic + 0x00, 0x00, 0x00, 0x01, // file version + 0x00, 0x00, 0x00, 0x02, // oid version + // pack checksum to copy into at byte 12 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // checksum + 0x6e, 0x78, 0x75, 0x67, (byte) 0x84, (byte) 0x89, (byte) 0xde, + (byte) 0xe3, (byte) 0x86, 0x6a, 0x3b, (byte) 0x98, 0x51, + (byte) 0xd8, (byte) 0x8c, (byte) 0xec, 0x50, (byte) 0xe7, + (byte) 0xfb, 0x22, }; + System.arraycopy(FAKE_PACK_CHECKSUM, 0, version2, 12, + FAKE_PACK_CHECKSUM.length); + ByteArrayInputStream in = new ByteArrayInputStream(version2); + + assertThrows(IOException.class, + () -> PackReverseIndexFactory.readFromFile(in, 0, () -> null)); + } + + @Test + public void read_objectCountTooLarge() { + ByteArrayInputStream dummyInput = new ByteArrayInputStream(NO_OBJECTS); + long biggerThanInt = ((long) Integer.MAX_VALUE) + 1; + + assertThrows(IllegalArgumentException.class, + () -> PackReverseIndexFactory.readFromFile(dummyInput, + biggerThanInt, + () -> null)); + } + + @Test + public void findObject_noObjects() { + assertNull(emptyReverseIndex.findObject(0)); + } + + @Test + public void findObject_multipleObjects() { + assertEquals(object614, smallReverseIndex.findObject(614)); + assertEquals(object450, smallReverseIndex.findObject(450)); + assertEquals(object165, smallReverseIndex.findObject(165)); + assertEquals(object257, smallReverseIndex.findObject(257)); + assertEquals(object12, smallReverseIndex.findObject(12)); + assertEquals(object556, smallReverseIndex.findObject(556)); + } + + @Test + public void findObject_badOffset() { + assertNull(smallReverseIndex.findObject(0)); + } + + @Test + public void findNextOffset_noObjects() { + assertThrows(IOException.class, + () -> emptyReverseIndex.findNextOffset(0, Long.MAX_VALUE)); + } + + @Test + public void findNextOffset_multipleObjects() throws CorruptObjectException { + assertEquals(smallMaxOffset, + smallReverseIndex.findNextOffset(614, smallMaxOffset)); + assertEquals(614, + smallReverseIndex.findNextOffset(556, smallMaxOffset)); + assertEquals(556, + smallReverseIndex.findNextOffset(450, smallMaxOffset)); + assertEquals(450, + smallReverseIndex.findNextOffset(257, smallMaxOffset)); + assertEquals(257, + smallReverseIndex.findNextOffset(165, smallMaxOffset)); + assertEquals(165, smallReverseIndex.findNextOffset(12, smallMaxOffset)); + } + + @Test + public void findNextOffset_badOffset() { + assertThrows(IOException.class, + () -> smallReverseIndex.findNextOffset(0, Long.MAX_VALUE)); + } + + @Test + public void findPosition_noObjects() { + assertEquals(-1, emptyReverseIndex.findPosition(0)); + } + + @Test + public void findPosition_multipleObjects() { + assertEquals(0, smallReverseIndex.findPosition(12)); + assertEquals(1, smallReverseIndex.findPosition(165)); + assertEquals(2, smallReverseIndex.findPosition(257)); + assertEquals(3, smallReverseIndex.findPosition(450)); + assertEquals(4, smallReverseIndex.findPosition(556)); + assertEquals(5, smallReverseIndex.findPosition(614)); + } + + @Test + public void findPosition_badOffset() { + assertEquals(-1, smallReverseIndex.findPosition(10)); + } + + @Test + public void findObjectByPosition_noObjects() { + assertThrows(AssertionError.class, + () -> emptyReverseIndex.findObjectByPosition(0)); + } + + @Test + public void findObjectByPosition_multipleObjects() { + assertEquals(object12, smallReverseIndex.findObjectByPosition(0)); + assertEquals(object165, smallReverseIndex.findObjectByPosition(1)); + assertEquals(object257, smallReverseIndex.findObjectByPosition(2)); + assertEquals(object450, smallReverseIndex.findObjectByPosition(3)); + assertEquals(object556, smallReverseIndex.findObjectByPosition(4)); + assertEquals(object614, smallReverseIndex.findObjectByPosition(5)); + } + + @Test + public void findObjectByPosition_badOffset() { + assertThrows(AssertionError.class, + () -> smallReverseIndex.findObjectByPosition(10)); + } + + private static PackedObjectInfo objectInfo(String objectId, int type, + long offset) { + PackedObjectInfo objectInfo = new PackedObjectInfo( + ObjectId.fromString(objectId)); + objectInfo.setType(type); + objectInfo.setOffset(offset); + return objectInfo; + } +} diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1WriteReadTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1WriteReadTest.java new file mode 100644 index 000000000..372a4c7cb --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1WriteReadTest.java @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import static org.eclipse.jgit.lib.Constants.OBJECT_ID_STRING_LENGTH; +import static org.eclipse.jgit.lib.Constants.OBJ_BLOB; +import static org.eclipse.jgit.lib.Constants.OBJ_COMMIT; +import static org.eclipse.jgit.lib.Constants.OBJ_TREE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.transport.PackedObjectInfo; +import org.junit.Test; + +public class PackReverseIndexV1WriteReadTest { + + private static byte[] PACK_CHECKSUM = new byte[] { 'P', 'A', 'C', 'K', 'C', + 'H', 'E', 'C', 'K', 'S', 'U', 'M', '3', '4', '5', '6', '7', '8', + '9', '0', }; + + @Test + public void writeThenRead_noObjects() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out, + 1); + List objectsSortedByName = new ArrayList<>(); + + // write + writer.write(objectsSortedByName, PACK_CHECKSUM); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + + // read + PackReverseIndex noObjectsReverseIndex = PackReverseIndexFactory + .readFromFile(in, 0, () -> null); + + // use + assertThrows(AssertionError.class, + () -> noObjectsReverseIndex.findObjectByPosition(0)); + } + + @Test + public void writeThenRead_oneObject() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out, + 1); + PackedObjectInfo a = objectInfo("a", OBJ_COMMIT, 0); + List objectsSortedByName = List.of(a); + + // write + writer.write(objectsSortedByName, PACK_CHECKSUM); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackIndex mockForwardIndex = mock(PackIndex.class); + when(mockForwardIndex.getObjectId(0)).thenReturn(a); + + // read + PackReverseIndex oneObjectReverseIndex = PackReverseIndexFactory + .readFromFile(in, 1, () -> mockForwardIndex); + + // use + assertEquals(a, oneObjectReverseIndex.findObjectByPosition(0)); + } + + @Test + public void writeThenRead_multipleObjectsLargeOffsets() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackReverseIndexWriter writer = PackReverseIndexWriter.createWriter(out, + 1); + PackedObjectInfo a = objectInfo("a", OBJ_BLOB, 200000000); + PackedObjectInfo b = objectInfo("b", OBJ_COMMIT, 0); + PackedObjectInfo c = objectInfo("c", OBJ_COMMIT, 52000000000L); + PackedObjectInfo d = objectInfo("d", OBJ_TREE, 7); + PackedObjectInfo e = objectInfo("e", OBJ_COMMIT, 38000000000L); + List objectsSortedByName = List.of(a, b, c, d, e); + + writer.write(objectsSortedByName, PACK_CHECKSUM); + + // write + writer.write(objectsSortedByName, PACK_CHECKSUM); + + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackIndex mockForwardIndex = mock(PackIndex.class); + when(mockForwardIndex.getObjectId(4)).thenReturn(e); + + // read + PackReverseIndex multipleObjectsReverseIndex = PackReverseIndexFactory + .readFromFile(in, 5, () -> mockForwardIndex); + + // use with minimal mocked forward index use + assertEquals(e, multipleObjectsReverseIndex.findObjectByPosition(3)); + } + + private static PackedObjectInfo objectInfo(String objectId, int type, + long offset) { + assert (objectId.length() == 1); + PackedObjectInfo objectInfo = new PackedObjectInfo( + ObjectId.fromString(objectId.repeat(OBJECT_ID_STRING_LENGTH))); + objectInfo.setType(type); + objectInfo.setOffset(offset); + return objectInfo; + } +} diff --git a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties index bc8144f1c..bf8a1ef5d 100644 --- a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties +++ b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties @@ -689,6 +689,7 @@ searchForReachableBranches=Finding reachable branches saveFileStoreAttributesFailed=Saving measured FileStore attributes to user config failed searchForReuse=Finding sources searchForReuseTimeout=Search for reuse timed out after {0} seconds +unsupportedObjectIdVersion=Object id version {0} is not supported searchForSizes=Getting sizes secondsAgo={0} seconds ago selectingCommits=Selecting commits diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java index 518e0b7d9..a032d24c7 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java @@ -866,6 +866,7 @@ public static JGitText get() { /***/ public String unsupportedEncryptionVersion; /***/ public String unsupportedGC; /***/ public String unsupportedMark; + /***/ public String unsupportedObjectIdVersion; /***/ public String unsupportedOperationNotAddAtEnd; /***/ public String unsupportedPackIndexVersion; /***/ public String unsupportedPackReverseIndexVersion; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndex.java index c4290c371..74ea89025 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndex.java @@ -24,6 +24,16 @@ * @see Pack */ public interface PackReverseIndex { + /** + * Magic bytes that uniquely identify git reverse index files. + */ + byte[] MAGIC = { 'R', 'I', 'D', 'X' }; + + /** + * The first reverse index file version. + */ + int VERSION_1 = 1; + /** * Search for object id with the specified start offset in this pack * (reverse) index. diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexFactory.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexFactory.java index 2bf2f79cc..b16da5ae8 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexFactory.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexFactory.java @@ -10,6 +10,20 @@ package org.eclipse.jgit.internal.storage.file; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndex.MAGIC; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndex.VERSION_1; + +import java.io.DataInput; +import java.io.IOException; +import java.io.InputStream; +import java.security.DigestInputStream; +import java.text.MessageFormat; +import java.util.Arrays; + +import org.eclipse.jgit.internal.JGitText; +import org.eclipse.jgit.lib.Constants; +import org.eclipse.jgit.util.IO; + /** * Factory for creating instances of {@link PackReverseIndex}. */ @@ -26,4 +40,45 @@ public final class PackReverseIndexFactory { public static PackReverseIndex computeFromIndex(PackIndex packIndex) { return new PackReverseIndexComputed(packIndex); } + + /** + * Read an in-memory pack reverse index from the given input stream. This + * has a linear runtime. + * + * @param src + * the input stream to read the contents from + * @param objectCount + * the number of objects in the corresponding pack + * @param packIndexSupplier + * a function to lazily get the corresponding forward index + * @return the reverse index instance + * @throws IOException + * if reading from the input stream fails + */ + static PackReverseIndex readFromFile(InputStream src, long objectCount, + PackBitmapIndex.SupplierWithIOException packIndexSupplier) + throws IOException { + final DigestInputStream digestIn = new DigestInputStream(src, + Constants.newMessageDigest()); + + final byte[] magic = new byte[MAGIC.length]; + IO.readFully(digestIn, magic); + if (!Arrays.equals(magic, MAGIC)) { + throw new IOException( + MessageFormat.format(JGitText.get().expectedGot, + Arrays.toString(MAGIC), Arrays.toString(magic))); + } + + DataInput dataIn = new SimpleDataInput(digestIn); + int version = dataIn.readInt(); + switch (version) { + case VERSION_1: + return new PackReverseIndexV1(digestIn, objectCount, + packIndexSupplier); + default: + throw new IOException(MessageFormat.format( + JGitText.get().unsupportedPackReverseIndexVersion, + String.valueOf(version))); + } + } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1.java new file mode 100644 index 000000000..76f0793cc --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexV1.java @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import static org.eclipse.jgit.lib.Constants.OBJECT_ID_LENGTH; + +import java.io.DataInput; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.security.DigestInputStream; +import java.text.MessageFormat; + +import org.eclipse.jgit.errors.CorruptObjectException; +import org.eclipse.jgit.internal.JGitText; +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.util.IO; + +/** + * Reverse index for forward pack index which is parsed from a version 1 file. + *

+ * The file format is specified at + * https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format. + */ +final class PackReverseIndexV1 implements PackReverseIndex { + static final int OID_VERSION_SHA1 = 1; + + static final int OID_VERSION_SHA256 = 2; + + private static final int SHA1_BYTES = OBJECT_ID_LENGTH; + + private final DigestInputStream inputStream; + + private final DataInput dataIn; + + /** + * A lazy supplier for the corresponding PackIndex. The PackIndex is not + * needed during instantiation and parsing, only later when querying the + * reverse index. Allow lazy loading so that the parsing of the forward and + * reverse indices could happen in parallel. + */ + private final PackBitmapIndex.SupplierWithIOException packIndexSupplier; + + private int objectCount; + + private byte[] packChecksum; + + private int[] indexPositionsSortedByOffset; + + private PackIndex packIndex; + + PackReverseIndexV1(DigestInputStream inputStream, long objectCount, + PackBitmapIndex.SupplierWithIOException packIndexSupplier) + throws IOException { + try { + this.objectCount = Math.toIntExact(objectCount); + } catch (ArithmeticException e) { + throw new IllegalArgumentException( + JGitText.get().hugeIndexesAreNotSupportedByJgitYet); + } + + this.inputStream = inputStream; + dataIn = new SimpleDataInput(inputStream); + + int oid_version = dataIn.readInt(); + switch (oid_version) { + case OID_VERSION_SHA1: + // JGit Pack only supports AnyObjectId, which represents SHA1. + break; + case OID_VERSION_SHA256: + throw new IOException(MessageFormat.format( + JGitText.get().unsupportedObjectIdVersion, "SHA256")); //$NON-NLS-1$ + default: + throw new IOException(MessageFormat.format( + JGitText.get().unsupportedObjectIdVersion, + String.valueOf(oid_version))); + } + + indexPositionsSortedByOffset = new int[this.objectCount]; + this.packIndexSupplier = packIndexSupplier; + + parseBody(); + parseChecksums(); + } + + private void parseBody() throws IOException { + for (int i = 0; i < objectCount; i++) { + indexPositionsSortedByOffset[i] = dataIn.readInt(); + } + } + + private void parseChecksums() throws IOException { + packChecksum = new byte[SHA1_BYTES]; + IO.readFully(inputStream, packChecksum); + // TODO: verify checksum + + byte[] readSelfChecksum = new byte[SHA1_BYTES]; + IO.readFully(inputStream, readSelfChecksum); + } + + @Override + public ObjectId findObject(long offset) { + int reversePosition = findPosition(offset); + if (reversePosition < 0) { + return null; + } + int forwardPosition = findForwardPositionByReversePosition( + reversePosition); + return getPackIndex().getObjectId(forwardPosition); + } + + @Override + public long findNextOffset(long offset, long maxOffset) + throws CorruptObjectException { + int position = findPosition(offset); + if (position < 0) { + throw new CorruptObjectException(MessageFormat.format(JGitText + .get().cantFindObjectInReversePackIndexForTheSpecifiedOffset, + Long.valueOf(offset))); + } + if (position + 1 == objectCount) { + return maxOffset; + } + return findOffsetByReversePosition(position + 1); + } + + @Override + public int findPosition(long offset) { + return binarySearchByOffset(offset); + } + + @Override + public ObjectId findObjectByPosition(int position) { + return getPackIndex() + .getObjectId(findForwardPositionByReversePosition(position)); + } + + private long findOffsetByReversePosition(int position) { + return getPackIndex() + .getOffset(findForwardPositionByReversePosition(position)); + } + + private int findForwardPositionByReversePosition(int reversePosition) { + assert (reversePosition >= 0); + assert (reversePosition < indexPositionsSortedByOffset.length); + return indexPositionsSortedByOffset[reversePosition]; + } + + private int binarySearchByOffset(long wantedOffset) { + int low = 0; + int high = objectCount - 1; + while (low <= high) { + int mid = (low + high) >>> 1; + long offsetAtMid = findOffsetByReversePosition(mid); + if (offsetAtMid == wantedOffset) { + return mid; + } else if (offsetAtMid > wantedOffset) { + high = mid - 1; + } else { + low = mid + 1; + } + } + return -1; + } + + private PackIndex getPackIndex() { + if (packIndex == null) { + try { + packIndex = packIndexSupplier.get(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + return packIndex; + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriter.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriter.java index 4c8417b11..94a4e9c12 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriter.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriter.java @@ -9,6 +9,8 @@ */ package org.eclipse.jgit.internal.storage.file; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndex.VERSION_1; + import java.io.BufferedOutputStream; import java.io.DataOutput; import java.io.IOException; @@ -28,16 +30,6 @@ * https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format. */ public abstract class PackReverseIndexWriter { - /** - * Magic bytes that uniquely identify git reverse index files. - */ - protected static byte[] MAGIC = { 'R', 'I', 'D', 'X' }; - - /** - * The first reverse index file version. - */ - protected static final int VERSION_1 = 1; - /** * Stream to write contents to while maintaining a checksum. */ diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriterV1.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriterV1.java index 7630724d0..ff1809f44 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriterV1.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackReverseIndexWriterV1.java @@ -9,6 +9,10 @@ */ package org.eclipse.jgit.internal.storage.file; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndex.MAGIC; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndex.VERSION_1; +import static org.eclipse.jgit.internal.storage.file.PackReverseIndexV1.OID_VERSION_SHA1; + import java.io.IOException; import java.io.OutputStream; import java.util.List; @@ -24,8 +28,6 @@ * https://git-scm.com/docs/pack-format#_pack_rev_files_have_the_format. */ final class PackReverseIndexWriterV1 extends PackReverseIndexWriter { - private static final int OID_VERSION_SHA1 = 1; - private static final int DEFAULT_OID_VERSION = OID_VERSION_SHA1; PackReverseIndexWriterV1(final OutputStream dst) {