From 11f99fecfd4b839d258a77092e06daedd5660821 Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 23 Sep 2010 17:13:20 -0700
Subject: [PATCH] Reduce content hash function collisions

The hash code returned by RawTextComparator (or that is used
by the SimilarityIndex) play an important role in the speed of
any algorithm that is based upon them.  The lower the number of
collisions produced by the hash function, the shorter the hash
chains within hash tables will be, and the less likely we are to
fall into O(N^2) runtime behaviors for algorithms like PatienceDiff.

Our prior hash function was absolutely horrid, so replace it with
the proper definition of the DJB hash that was originally published
by Professor Daniel J. Bernstein.

To support this assertion, below is a table listing the maximum
number of collisions that result when hashing the unique lines in
each source code file of 3 randomly chosen projects:

  test_jgit: 931 files; 122 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash            418
   djb                     5
   sha1                    6
   string_hash31          11

  test_linux26: 30198 files; 258 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           8675
   djb                    32
   sha1                    8
   string_hash31          32

  test_frameworks_base: 8381 files; 184 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           4615
   djb                    10
   sha1                    6
   string_hash31          13

We can clearly see that prior_hash performed very poorly, resulting
in 8,675 collisions (elements in the same hash bucket) for at least
one file in the Linux kernel repository.  This leads to some very
bad O(N) style insertion and lookup performance, even though the
hash table was sized to be the next power-of-2 larger than the
total number of unique lines in the file.

The djb hash we are replacing prior_hash with performs closer to
SHA-1 in terms of having very few collisions.  This indicates it
provides a reasonably distributed output for this type of input,
despite being a much simpler algorithm (and therefore will be much
faster to execute).

The string_hash31 function is provided just to compare results with,
it is the algorithm commonly used by java.lang.String hashCode().

However, life isn't quite this simple.

djb produces a 32 bit hash code, but our hash tables are always
smaller than 2^32 buckets.  Mashing the 32 bit code into an array
index used to be done by simply taking the lower bits of the hash
code by a bitwise and operator.  This unfortuntely still produces
many collisions, e.g. 32 on the linux-2.6 repository files.

From [1] we can apply a final "cleanup" step to the hash code to
mix the bits together a little better, and give priority to the
higher order bits as they include data from more bytes of input:

  test_jgit: 931 files; 122 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash            418
   djb                     5
   djb + cleanup           6

  test_linux26: 30198 files; 258 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           8675
   djb                    32
   djb + cleanup           7

  test_frameworks_base: 8381 files; 184 avg. unique lines/file
   Algorithm    | Collisions
   -------------+-----------
   prior_hash           4615
   djb                    10
   djb + cleanup           7

This is a massive improvement, as the number of collisions for
common inputs drops to acceptable levels, and we haven't really
made the hash functions any more complex than they were before.

[1] http://lkml.org/lkml/2009/10/27/404

Change-Id: Ia753b695de9526a157ddba265824240bd05dead1
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../eclipse/jgit/diff/PatienceDiffIndex.java  | 31 ++++++++-----
 .../eclipse/jgit/diff/RawTextComparator.java  | 16 +++----
 .../eclipse/jgit/diff/SimilarityIndex.java    | 44 ++++++++-----------
 3 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java
index da21c483b..27cf9252e 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/PatienceDiffIndex.java
@@ -91,10 +91,11 @@ final class PatienceDiffIndex<S extends Sequence> {
 	/** 1 past the last valid entry in {@link #pCommon}. */
 	private final int pEnd;
 
-	/** Keyed by {@code cmp.hash() & tableMask} to yield an entry offset. */
+	/** Keyed by {@link #hash(HashedSequence, int)} to get an entry offset. */
 	private final int[] table;
 
-	private final int tableMask;
+	/** Number of low bits to discard from a key to index {@link #table}. */
+	private final int keyShift;
 
 	// To save memory the buckets for hash chains are stored in correlated
 	// arrays. This permits us to get 3 values per entry, without paying
@@ -158,8 +159,9 @@ final class PatienceDiffIndex<S extends Sequence> {
 		this.pEnd = pCnt;
 
 		final int sz = region.getLengthB();
-		table = new int[tableSize(sz)];
-		tableMask = table.length - 1;
+		final int tableBits = tableBits(sz);
+		table = new int[1 << tableBits];
+		keyShift = 32 - tableBits;
 
 		// As we insert elements we preincrement so that 0 is never a
 		// valid entry. Therefore we have to allocate one extra space.
@@ -187,7 +189,7 @@ private void scanB() {
 		final int end = region.endB;
 		int pIdx = pBegin;
 		SCAN: while (ptr < end) {
-			final int tIdx = cmp.hash(b, ptr) & tableMask;
+			final int tIdx = hash(b, ptr);
 
 			if (pIdx < pEnd) {
 				final long priorRec = pCommon[pIdx];
@@ -244,7 +246,7 @@ private void scanA() {
 		final int end = region.endA;
 		int pLast = pBegin - 1;
 		SCAN: while (ptr < end) {
-			final int tIdx = cmp.hash(a, ptr) & tableMask;
+			final int tIdx = hash(a, ptr);
 
 			for (int eIdx = table[tIdx]; eIdx != 0; eIdx = next[eIdx]) {
 				final long rec = ptrs[eIdx];
@@ -391,6 +393,10 @@ Edit findLongestCommonSequence() {
 		return lcs;
 	}
 
+	private int hash(HashedSequence<S> s, int idx) {
+		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
+	}
+
 	private static boolean isDuplicate(long rec) {
 		return (((int) rec) & DUPLICATE_MASK) != 0;
 	}
@@ -407,11 +413,12 @@ private static int bOf(long rec) {
 		return (int) (rec >>> B_SHIFT);
 	}
 
-	private static int tableSize(final int worstCaseBlockCnt) {
-		int shift = 32 - Integer.numberOfLeadingZeros(worstCaseBlockCnt);
-		int sz = 1 << (shift - 1);
-		if (sz < worstCaseBlockCnt)
-			sz <<= 1;
-		return sz;
+	private static int tableBits(final int sz) {
+		int bits = 31 - Integer.numberOfLeadingZeros(sz);
+		if (bits == 0)
+			bits = 1;
+		if (1 << bits < sz)
+			bits++;
+		return bits;
 	}
 }
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
index a1e2e6cec..f9cf376ab 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
@@ -78,7 +78,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
 		protected int hashRegion(final byte[] raw, int ptr, final int end) {
 			int hash = 5381;
 			for (; ptr < end; ptr++)
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			return hash;
 		}
 	};
@@ -128,7 +128,7 @@ protected int hashRegion(byte[] raw, int ptr, int end) {
 			for (; ptr < end; ptr++) {
 				byte c = raw[ptr];
 				if (!isWhitespace(c))
-					hash = (hash << 5) ^ (c & 0xff);
+					hash = ((hash << 5) + hash) + (c & 0xff);
 			}
 			return hash;
 		}
@@ -163,9 +163,8 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
 		protected int hashRegion(final byte[] raw, int ptr, int end) {
 			int hash = 5381;
 			ptr = trimLeadingWhitespace(raw, ptr, end);
-			for (; ptr < end; ptr++) {
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
-			}
+			for (; ptr < end; ptr++)
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			return hash;
 		}
 	};
@@ -199,9 +198,8 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
 		protected int hashRegion(final byte[] raw, int ptr, int end) {
 			int hash = 5381;
 			end = trimTrailingWhitespace(raw, ptr, end);
-			for (; ptr < end; ptr++) {
-				hash = (hash << 5) ^ (raw[ptr] & 0xff);
-			}
+			for (; ptr < end; ptr++)
+				hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
 			return hash;
 		}
 	};
@@ -247,7 +245,7 @@ protected int hashRegion(final byte[] raw, int ptr, int end) {
 			end = trimTrailingWhitespace(raw, ptr, end);
 			while (ptr < end) {
 				byte c = raw[ptr];
-				hash = (hash << 5) ^ (c & 0xff);
+				hash = ((hash << 5) + hash) + (c & 0xff);
 				if (isWhitespace(c))
 					ptr = trimLeadingWhitespace(raw, ptr, end);
 				else
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 39bcebb4c..6627268e4 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -68,20 +68,13 @@ class SimilarityIndex {
 	/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
 	private static final int MAX_HASH_BITS = 17;
 
-	/** The {@link #idHash} table will not grow bigger than this, ever. */
-	private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS;
-
-	/** Prime just before {@link #MAX_HASH_SIZE}. */
-	private static final int P = 131071;
-
 	/**
 	 * Shift to apply before storing a key.
 	 * <p>
 	 * Within the 64 bit table record space, we leave the highest bit unset so
-	 * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the
-	 * keys. The lower 32 bits are used to count bytes impacted.
+	 * all values are positive. The lower 32 bits to count bytes.
 	 */
-	private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS;
+	private static final int KEY_SHIFT = 32;
 
 	/** Total size of the file we hashed into the structure. */
 	private long fileSize;
@@ -100,8 +93,12 @@ class SimilarityIndex {
 	 */
 	private long[] idHash;
 
+	/** {@code idHash.length == 1 << idHashBits}. */
+	private int idHashBits;
+
 	SimilarityIndex() {
-		idHash = new long[256];
+		idHashBits = 8;
+		idHash = new long[1 << idHashBits];
 	}
 
 	long getFileSize() {
@@ -138,7 +135,7 @@ void hash(byte[] raw, int ptr, final int end) {
 				int c = raw[ptr++] & 0xff;
 				if (c == '\n')
 					break;
-				hash = (hash << 5) ^ c;
+				hash = (hash << 5) + hash + c;
 			} while (ptr < end && ptr - start < 64);
 			add(hash, ptr - start);
 		}
@@ -166,7 +163,7 @@ void hash(InputStream in, long remaining) throws IOException {
 				int c = buf[ptr++] & 0xff;
 				if (c == '\n')
 					break;
-				hash = (hash << 5) ^ c;
+				hash = (hash << 5) + hash + c;
 			} while (n < 64 && n < remaining);
 			add(hash, n);
 			remaining -= n;
@@ -272,7 +269,8 @@ private int packedIndex(int idx) {
 	}
 
 	void add(int key, int cnt) {
-		key = hash(key);
+		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
+
 		int j = slot(key);
 		for (;;) {
 			long v = idHash[j];
@@ -298,28 +296,24 @@ void add(int key, int cnt) {
 		}
 	}
 
-	private static int hash(int key) {
-		// Make the key fit into our table. Since we have a maximum size
-		// that we cap the table at, all keys get squashed before going
-		// into the table. This prevents overflow.
-		//
-		return (key >>> 1) % P;
-	}
-
 	private int slot(int key) {
-		return key % idHash.length;
+		// We use 31 - idHashBits because the upper bit was already forced
+		// to be 0 and we want the remaining high bits to be used as the
+		// table slot.
+		//
+		return key >>> (31 - idHashBits);
 	}
 
 	private boolean shouldGrow() {
-		int n = idHash.length;
-		return n < MAX_HASH_SIZE && n <= idSize * 2;
+		return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
 	}
 
 	private void grow() {
 		long[] oldHash = idHash;
 		int oldSize = idHash.length;
 
-		idHash = new long[2 * oldSize];
+		idHashBits++;
+		idHash = new long[1 << idHashBits];
 		for (int i = 0; i < oldSize; i++) {
 			long v = oldHash[i];
 			if (v != 0) {