PackReverseIndexComputed: Clarify custom bucket sort algorithm

The PackReverseIndexComputed uses a custom sorting algorithm, based on
bucket sort with insertion sort but with the data managed as a linked
list across two int arrays. This custom algorithm relies on the set of
values being sorted being exactly 0, ..., n-1, so that they can serve a
second purpose of being indexes into a second equally sized list.

This custom algorithm was introduced ~10 years ago in
6cc532a43c.
The original author is no longer an active contributor, so it is
valuable for the code to be readable, especially as there is currently
active work on reverse indexes.

Rename variables and add comments to clarify the algorithm and improve
readability. There are no functional changes to the algorithm.

Change-Id: Ic3b682203f20e06f9f865f81259e034230f9720a
Signed-off-by: Anna Papitto <annapapitto@google.com>
This commit is contained in:
Anna Papitto 2023-07-14 12:19:27 -07:00
parent 5dc63514d0
commit 7d2669587f
1 changed files with 71 additions and 47 deletions

View File

@ -30,25 +30,30 @@ final class PackReverseIndexComputed implements PackReverseIndex {
private final PackIndex index; private final PackIndex index;
/** /**
* The number of bytes per entry in the offsetIndex. * The difference in offset between the start of an offset bucket and the
* start of its succeeding bucket.
*/ */
private final long bucketSize; private final long bucketSize;
/** /**
* An index into the nth mapping, where the value is the position after the * The indexes into indexPosInOffsetOrder at which the next bucket starts.
* the last index that contains the values of the bucket. For example given * <p>
* offset o (and bucket = o / bucketSize), the offset will be contained in * For example, given offset o (and therefore bucket = o / bucketSize), the
* the range nth[offsetIndex[bucket - 1]] inclusive to * indexPos corresponding to o will be contained in the range
* nth[offsetIndex[bucket]] exclusive. * indexPosInOffsetOrder[nextBucketStart[bucket - 1]] inclusive to
* indexPosInOffsetOrder[nextBucketStart[bucket]] exclusive.
* <p>
* This range information can speed up #binarySearch by identifying the
* relevant bucket and only searching within its range.
* <p> * <p>
* See {@link #binarySearch} * See {@link #binarySearch}
*/ */
private final int[] offsetIndex; private final int[] nextBucketStart;
/** /**
* Mapping from indices in offset order to indices in SHA-1 order. * Mapping from indices in offset order to indices in SHA-1 order.
*/ */
private final int[] nth; private final int[] indexPosInOffsetOrder;
/** /**
* Create reverse index from straight/forward pack index, by indexing all * Create reverse index from straight/forward pack index, by indexing all
@ -60,62 +65,81 @@ final class PackReverseIndexComputed implements PackReverseIndex {
PackReverseIndexComputed(PackIndex packIndex) { PackReverseIndexComputed(PackIndex packIndex) {
index = packIndex; index = packIndex;
final long cnt = index.getObjectCount(); long rawCnt = index.getObjectCount();
if (cnt + 1 > Integer.MAX_VALUE) { if (rawCnt + 1 > Integer.MAX_VALUE) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
JGitText.get().hugeIndexesAreNotSupportedByJgitYet); JGitText.get().hugeIndexesAreNotSupportedByJgitYet);
} }
int cnt = (int) rawCnt;
if (cnt == 0) { if (cnt == 0) {
bucketSize = Long.MAX_VALUE; bucketSize = Long.MAX_VALUE;
offsetIndex = new int[1]; nextBucketStart = new int[1];
nth = new int[0]; indexPosInOffsetOrder = new int[0];
return; return;
} }
final long[] offsetsBySha1 = new long[(int) cnt]; // Sort the index positions according to the corresponding pack offsets.
// Use bucket sort since the offsets are somewhat uniformly distributed
// over the range (0, pack size).
long[] offsetsInIndexOrder = new long[cnt];
long maxOffset = 0; long maxOffset = 0;
int ith = 0; int i = 0;
for (MutableEntry me : index) { for (MutableEntry entry : index) {
final long o = me.getOffset(); long offset = entry.getOffset();
offsetsBySha1[ith++] = o; offsetsInIndexOrder[i++] = offset;
if (o > maxOffset) { if (offset > maxOffset) {
maxOffset = o; maxOffset = offset;
} }
} }
bucketSize = maxOffset / cnt + 1; bucketSize = maxOffset / cnt + 1;
int[] bucketIndex = new int[(int) cnt]; // The values in each bucket, stored as a linked list. Given a bucket,
int[] bucketValues = new int[(int) cnt + 1]; // headValues[bucket] contains the first value,
for (int oi = 0; oi < offsetsBySha1.length; oi++) { // furtherValues[headValues[bucket]] contains the second,
final long o = offsetsBySha1[oi]; // furtherValues[furtherValues[headValues[bucket]]] the third, and so
final int bucket = (int) (o / bucketSize); // on. The linked list stops when a value is 0. The values themselves
final int bucketValuesPos = oi + 1; // are shifted index positions. There won't be any
final int current = bucketIndex[bucket]; // collisions because every index position is unique.
bucketIndex[bucket] = bucketValuesPos; int[] headValues = new int[cnt];
bucketValues[bucketValuesPos] = current; int[] furtherValues = new int[cnt + 1];
for (int indexPos = 0; indexPos < cnt; indexPos++) {
// The offset determines which bucket this index position falls
// into, since the goal is to sort into offset order.
long offset = offsetsInIndexOrder[indexPos];
int bucket = (int) (offset / bucketSize);
// Store the index positions as 1-indexed so that default
// initialized value 0 can be interpreted as the end of the bucket
// values.
int asBucketValue = indexPos + 1;
// If there is an existing value in this bucket, push the value to
// the front of the linked list.
int current = headValues[bucket];
headValues[bucket] = asBucketValue;
furtherValues[asBucketValue] = current;
} }
int nthByOffset = 0; int nthByOffset = 0;
nth = new int[offsetsBySha1.length]; indexPosInOffsetOrder = new int[cnt];
offsetIndex = bucketIndex; // Reuse the allocation nextBucketStart = headValues; // Reuse the allocation
for (int bi = 0; bi < bucketIndex.length; bi++) { for (int bi = 0; bi < headValues.length; bi++) {
final int start = nthByOffset;
// Insertion sort of the values in the bucket. // Insertion sort of the values in the bucket.
for (int vi = bucketIndex[bi]; vi > 0; vi = bucketValues[vi]) { int start = nthByOffset;
final int nthBySha1 = vi - 1; for (int vi = headValues[bi]; vi > 0; vi = furtherValues[vi]) {
final long o = offsetsBySha1[nthBySha1]; int nthBySha1 = vi - 1;
long o = offsetsInIndexOrder[nthBySha1];
int insertion = nthByOffset++; int insertion = nthByOffset++;
for (; start < insertion; insertion--) { for (; start < insertion; insertion--) {
if (o > offsetsBySha1[nth[insertion - 1]]) { if (o > offsetsInIndexOrder[indexPosInOffsetOrder[insertion
- 1]]) {
break; break;
} }
nth[insertion] = nth[insertion - 1]; indexPosInOffsetOrder[insertion] = indexPosInOffsetOrder[insertion
- 1];
} }
nth[insertion] = nthBySha1; indexPosInOffsetOrder[insertion] = nthBySha1;
} }
offsetIndex[bi] = nthByOffset; nextBucketStart[bi] = nthByOffset;
} }
} }
@ -125,7 +149,7 @@ public ObjectId findObject(long offset) {
if (ith < 0) { if (ith < 0) {
return null; return null;
} }
return index.getObjectId(nth[ith]); return index.getObjectId(indexPosInOffsetOrder[ith]);
} }
@Override @Override
@ -138,10 +162,10 @@ public long findNextOffset(long offset, long maxOffset)
Long.valueOf(offset))); Long.valueOf(offset)));
} }
if (ith + 1 == nth.length) { if (ith + 1 == indexPosInOffsetOrder.length) {
return maxOffset; return maxOffset;
} }
return index.getOffset(nth[ith + 1]); return index.getOffset(indexPosInOffsetOrder[ith + 1]);
} }
@Override @Override
@ -151,11 +175,11 @@ public int findPosition(long offset) {
private int binarySearch(long offset) { private int binarySearch(long offset) {
int bucket = (int) (offset / bucketSize); int bucket = (int) (offset / bucketSize);
int low = bucket == 0 ? 0 : offsetIndex[bucket - 1]; int low = bucket == 0 ? 0 : nextBucketStart[bucket - 1];
int high = offsetIndex[bucket]; int high = nextBucketStart[bucket];
while (low < high) { while (low < high) {
final int mid = (low + high) >>> 1; final int mid = (low + high) >>> 1;
final long o = index.getOffset(nth[mid]); final long o = index.getOffset(indexPosInOffsetOrder[mid]);
if (offset < o) { if (offset < o) {
high = mid; high = mid;
} else if (offset == o) { } else if (offset == o) {
@ -169,6 +193,6 @@ private int binarySearch(long offset) {
@Override @Override
public ObjectId findObjectByPosition(int nthPosition) { public ObjectId findObjectByPosition(int nthPosition) {
return index.getObjectId(nth[nthPosition]); return index.getObjectId(indexPosInOffsetOrder[nthPosition]);
} }
} }