From f3b511568b5f6af219bd9c948092bbec46098484 Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Fri, 12 Nov 2010 11:50:38 -0800
Subject: [PATCH 1/7] SimilarityIndex: Correct comment explaining the logic

This comment was wrong, due to a copy-and-paste error.  Here the
code is looking at records of dst that do not exist in src, and
is skipping past them to find another match.

Change-Id: I07c1fba7dee093a1eeffcf7e0c7ec85446777ffb
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../src/org/eclipse/jgit/diff/SimilarityIndex.java            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 6627268e4..853132589 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -230,8 +230,8 @@ private static int common(long[] srcHash, int srcIdx, //
 					break;
 				srcKey = keyOf(srcHash[srcIdx]);
 
-			} else /* if (srcKey > dstKey) */{
-				// Regions of dst which do not appear in dst.
+			} else /* if (dstKey < srcKey) */{
+				// Regions of dst which do not appear in src.
 				if (++dstIdx == dstHash.length)
 					break;
 				dstKey = keyOf(dstHash[dstIdx]);

From d63887127e20c0a70c53c48a9aa5ffbdb1cf8873 Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 11 Nov 2010 14:10:32 -0800
Subject: [PATCH 2/7] SimilarityIndex: Accept files larger than 8 MB

Files bigger than 8 MB (2^23 bytes) tended to overflow the internal
hashtable, as the table was capped in size to 2^17 records.  If a
file contained 2^17 unique data blocks/lines, the table insertion
got stuck in an infinite loop as the table couldn't grow, and there
was no open slot for the new item.

Remove the artificial 2^17 table limit and instead allow the table
to grow to be as big as 2^30.  With a 64 byte block size, this
permits hashing inputs as large as 64 GB.

If the table reaches 2^30 (or cannot be allocated) hashing is
aborted.  RenameDetector no longer tries to break a modify file pair,
and it does not try to match the file for rename or copy detection.

Change-Id: Ibb4d756844f4667e181e24a34a468dc3655863ac
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../jgit/diff/SimilarityIndexTest.java        | 22 ++++----
 .../org/eclipse/jgit/diff/RenameDetector.java | 25 +++++++---
 .../eclipse/jgit/diff/SimilarityIndex.java    | 50 +++++++++++++------
 .../jgit/diff/SimilarityRenameDetector.java   | 29 +++++++++--
 4 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
index 7e42e5358..1da5828b3 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
@@ -48,10 +48,11 @@
 
 import junit.framework.TestCase;
 
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.Constants;
 
 public class SimilarityIndexTest extends TestCase {
-	public void testIndexingSmallObject() {
+	public void testIndexingSmallObject() throws TableFullException {
 		SimilarityIndex si = hash("" //
 				+ "A\n" //
 				+ "B\n" //
@@ -70,7 +71,8 @@ public void testIndexingSmallObject() {
 		assertEquals(2, si.count(si.findIndex(key_D)));
 	}
 
-	public void testIndexingLargeObject() throws IOException {
+	public void testIndexingLargeObject() throws IOException,
+			TableFullException {
 		byte[] in = ("" //
 				+ "A\n" //
 				+ "B\n" //
@@ -81,7 +83,7 @@ public void testIndexingLargeObject() throws IOException {
 		assertEquals(2, si.size());
 	}
 
-	public void testCommonScore_SameFiles() {
+	public void testCommonScore_SameFiles() throws TableFullException {
 		String text = "" //
 				+ "A\n" //
 				+ "B\n" //
@@ -96,21 +98,22 @@ public void testCommonScore_SameFiles() {
 		assertEquals(100, dst.score(src, 100));
 	}
 
-	public void testCommonScore_EmptyFiles() {
+	public void testCommonScore_EmptyFiles() throws TableFullException {
 		SimilarityIndex src = hash("");
 		SimilarityIndex dst = hash("");
 		assertEquals(0, src.common(dst));
 		assertEquals(0, dst.common(src));
 	}
 
-	public void testCommonScore_TotallyDifferentFiles() {
+	public void testCommonScore_TotallyDifferentFiles()
+			throws TableFullException {
 		SimilarityIndex src = hash("A\n");
 		SimilarityIndex dst = hash("D\n");
 		assertEquals(0, src.common(dst));
 		assertEquals(0, dst.common(src));
 	}
 
-	public void testCommonScore_SimiliarBy75() {
+	public void testCommonScore_SimiliarBy75() throws TableFullException {
 		SimilarityIndex src = hash("A\nB\nC\nD\n");
 		SimilarityIndex dst = hash("A\nB\nC\nQ\n");
 		assertEquals(6, src.common(dst));
@@ -120,10 +123,11 @@ public void testCommonScore_SimiliarBy75() {
 		assertEquals(75, dst.score(src, 100));
 	}
 
-	private static SimilarityIndex hash(String text) {
+	private static SimilarityIndex hash(String text) throws TableFullException {
 		SimilarityIndex src = new SimilarityIndex() {
 			@Override
-			void hash(byte[] raw, int ptr, final int end) {
+			void hash(byte[] raw, int ptr, final int end)
+					throws TableFullException {
 				while (ptr < end) {
 					int hash = raw[ptr] & 0xff;
 					int start = ptr;
@@ -143,7 +147,7 @@ void hash(byte[] raw, int ptr, final int end) {
 		return src;
 	}
 
-	private static int keyFor(String line) {
+	private static int keyFor(String line) throws TableFullException {
 		SimilarityIndex si = hash(line);
 		assertEquals("single line scored", 1, si.size());
 		return si.key(0);
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
index 66218f640..9d9a96d8d 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
@@ -57,6 +57,7 @@
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.AbbreviatedObjectId;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
@@ -445,14 +446,23 @@ private void rejoinModifies(ProgressMonitor pm) {
 
 	private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
 			throws IOException {
-		SimilarityIndex src = new SimilarityIndex();
-		src.hash(reader.open(OLD, d));
-		src.sort();
+		try {
+			SimilarityIndex src = new SimilarityIndex();
+			src.hash(reader.open(OLD, d));
+			src.sort();
 
-		SimilarityIndex dst = new SimilarityIndex();
-		dst.hash(reader.open(NEW, d));
-		dst.sort();
-		return src.score(dst, 100);
+			SimilarityIndex dst = new SimilarityIndex();
+			dst.hash(reader.open(NEW, d));
+			dst.sort();
+			return src.score(dst, 100);
+		} catch (TableFullException tableFull) {
+			// If either table overflowed while being constructed, don't allow
+			// the pair to be broken. Returning 1 higher than breakScore will
+			// ensure its not similar, but not quite dissimilar enough to break.
+			//
+			overRenameLimit = true;
+			return breakScore + 1;
+		}
 	}
 
 	private void findContentRenames(ContentSource.Pair reader,
@@ -468,6 +478,7 @@ private void findContentRenames(ContentSource.Pair reader,
 			d = new SimilarityRenameDetector(reader, deleted, added);
 			d.setRenameScore(getRenameScore());
 			d.compute(pm);
+			overRenameLimit |= d.isTableOverflow();
 			deleted = d.getLeftOverSources();
 			added = d.getLeftOverDestinations();
 			entries.addAll(d.getMatches());
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 853132589..045300613 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -65,8 +65,8 @@
  * file are discovered.
  */
 class SimilarityIndex {
-	/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
-	private static final int MAX_HASH_BITS = 17;
+	/** A special {@link TableFullException} used in place of OutOfMemoryError. */
+	private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
 
 	/**
 	 * Shift to apply before storing a key.
@@ -82,14 +82,17 @@ class SimilarityIndex {
 	/** Number of non-zero entries in {@link #idHash}. */
 	private int idSize;
 
+	/** {@link #idSize} that triggers {@link #idHash} to double in size. */
+	private int idGrowAt;
+
 	/**
 	 * Pairings of content keys and counters.
 	 * <p>
 	 * Slots in the table are actually two ints wedged into a single long. The
-	 * upper {@link #MAX_HASH_BITS} bits stores the content key, and the
-	 * remaining lower bits stores the number of bytes associated with that key.
-	 * Empty slots are denoted by 0, which cannot occur because the count cannot
-	 * be 0. Values can only be positive, which we enforce during key addition.
+	 * upper 32 bits stores the content key, and the remaining lower bits stores
+	 * the number of bytes associated with that key. Empty slots are denoted by
+	 * 0, which cannot occur because the count cannot be 0. Values can only be
+	 * positive, which we enforce during key addition.
 	 */
 	private long[] idHash;
 
@@ -99,6 +102,7 @@ class SimilarityIndex {
 	SimilarityIndex() {
 		idHashBits = 8;
 		idHash = new long[1 << idHashBits];
+		idGrowAt = growAt(idHashBits);
 	}
 
 	long getFileSize() {
@@ -109,7 +113,8 @@ void setFileSize(long size) {
 		fileSize = size;
 	}
 
-	void hash(ObjectLoader obj) throws MissingObjectException, IOException {
+	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
+			TableFullException {
 		if (obj.isLarge()) {
 			ObjectStream in = obj.openStream();
 			try {
@@ -125,7 +130,7 @@ void hash(ObjectLoader obj) throws MissingObjectException, IOException {
 		}
 	}
 
-	void hash(byte[] raw, int ptr, final int end) {
+	void hash(byte[] raw, int ptr, final int end) throws TableFullException {
 		while (ptr < end) {
 			int hash = 5381;
 			int start = ptr;
@@ -141,7 +146,8 @@ void hash(byte[] raw, int ptr, final int end) {
 		}
 	}
 
-	void hash(InputStream in, long remaining) throws IOException {
+	void hash(InputStream in, long remaining) throws IOException,
+			TableFullException {
 		byte[] buf = new byte[4096];
 		int ptr = 0;
 		int cnt = 0;
@@ -268,7 +274,7 @@ private int packedIndex(int idx) {
 		return (idHash.length - idSize) + idx;
 	}
 
-	void add(int key, int cnt) {
+	void add(int key, int cnt) throws TableFullException {
 		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
 
 		int j = slot(key);
@@ -276,7 +282,7 @@ void add(int key, int cnt) {
 			long v = idHash[j];
 			if (v == 0) {
 				// Empty slot in the table, store here.
-				if (shouldGrow()) {
+				if (idGrowAt <= idSize) {
 					grow();
 					j = slot(key);
 					continue;
@@ -304,16 +310,26 @@ private int slot(int key) {
 		return key >>> (31 - idHashBits);
 	}
 
-	private boolean shouldGrow() {
-		return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
+	private static int growAt(int idHashBits) {
+		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
 	}
 
-	private void grow() {
+	private void grow() throws TableFullException {
+		if (idHashBits == 30)
+			throw new TableFullException();
+
 		long[] oldHash = idHash;
 		int oldSize = idHash.length;
 
 		idHashBits++;
-		idHash = new long[1 << idHashBits];
+		idGrowAt = growAt(idHashBits);
+
+		try {
+			idHash = new long[1 << idHashBits];
+		} catch (OutOfMemoryError noMemory) {
+			throw TABLE_FULL_OUT_OF_MEMORY;
+		}
+
 		for (int i = 0; i < oldSize; i++) {
 			long v = oldHash[i];
 			if (v != 0) {
@@ -333,4 +349,8 @@ private static int keyOf(long v) {
 	private static int countOf(long v) {
 		return (int) v;
 	}
+
+	static class TableFullException extends Exception {
+		private static final long serialVersionUID = 1L;
+	}
 }
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
index 3075c223a..89e71e666 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -53,6 +53,7 @@
 
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
 import org.eclipse.jgit.lib.ProgressMonitor;
@@ -110,6 +111,9 @@ class SimilarityRenameDetector {
 	/** Score a pair must exceed to be considered a rename. */
 	private int renameScore = 60;
 
+	/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
+	private boolean tableOverflow;
+
 	private List<DiffEntry> out;
 
 	SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
@@ -182,6 +186,10 @@ List<DiffEntry> getLeftOverDestinations() {
 		return dsts;
 	}
 
+	boolean isTableOverflow() {
+		return tableOverflow;
+	}
+
 	private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
 		ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
 		for (DiffEntry e : in) {
@@ -226,7 +234,14 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 				continue;
 			}
 
-			SimilarityIndex s = hash(OLD, srcEnt);
+			SimilarityIndex s;
+			try {
+				s = hash(OLD, srcEnt);
+			} catch (TableFullException tableFull) {
+				tableOverflow = true;
+				continue;
+			}
+
 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
 				DiffEntry dstEnt = dsts.get(dstIdx);
 
@@ -260,7 +275,15 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 					continue;
 				}
 
-				SimilarityIndex d = hash(NEW, dstEnt);
+				SimilarityIndex d;
+				try {
+					d = hash(NEW, dstEnt);
+				} catch (TableFullException tableFull) {
+					tableOverflow = true;
+					pm.update(1);
+					continue;
+				}
+
 				int contentScore = s.score(d, 10000);
 
 				// nameScore returns a value between 0 and 100, but we want it
@@ -336,7 +359,7 @@ static int nameScore(String a, String b) {
 	}
 
 	private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
-			throws IOException {
+			throws IOException, TableFullException {
 		SimilarityIndex r = new SimilarityIndex();
 		r.hash(reader.open(side, ent));
 		r.sort();

From 0e307a6afddbb564ea6c34b3766d749f80e4442a Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Fri, 12 Nov 2010 11:56:40 -0800
Subject: [PATCH 3/7] SimilarityIndex: Don't overflow internal counter fields

The counter portion of each pair is only 32 bits wide, but is part
of a larger 64 bit integer.  If the file size was larger than 4 GB
the counter could overflow and impact the key, changing the hash,
and later resulting in an incorrect similarity score.

Guard against this overflow condition by capping the count for each
record at 2^32-1.  If any record contains more than that many bytes
the table aborts hashing and throws TableFullException.

This permits the index to scan and work on files that exceed 4 GB
in size, but only if the file contains more than one unique block.
The index throws TableFullException on a 4 GB file containing all
zeros, but should succeed on a 6 GB file containing unique lines.

The index now uses a 64 bit accumulator during the common scoring
algorithm, possibly resulting in slower summations.  However this
index is already heavily dependent upon 64 bit integer operations
being efficient, so increasing from 32 bits to 64 bits allows us
to correctly handle 6 GB files.

Change-Id: I14e6dbc88d54ead19336a4c0c25eae18e73e6ec2
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../eclipse/jgit/diff/SimilarityIndex.java    | 29 +++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
index 045300613..17ccb9726 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -76,6 +76,9 @@ class SimilarityIndex {
 	 */
 	private static final int KEY_SHIFT = 32;
 
+	/** Maximum value of the count field, also mask to extract the count. */
+	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
+
 	/** Total size of the file we hashed into the structure. */
 	private long fileSize;
 
@@ -196,11 +199,11 @@ int score(SimilarityIndex dst, int maxScore) {
 		return (int) ((common(dst) * maxScore) / max);
 	}
 
-	int common(SimilarityIndex dst) {
+	long common(SimilarityIndex dst) {
 		return common(this, dst);
 	}
 
-	private static int common(SimilarityIndex src, SimilarityIndex dst) {
+	private static long common(SimilarityIndex src, SimilarityIndex dst) {
 		int srcIdx = src.packedIndex(0);
 		int dstIdx = dst.packedIndex(0);
 		long[] srcHash = src.idHash;
@@ -208,12 +211,12 @@ private static int common(SimilarityIndex src, SimilarityIndex dst) {
 		return common(srcHash, srcIdx, dstHash, dstIdx);
 	}
 
-	private static int common(long[] srcHash, int srcIdx, //
+	private static long common(long[] srcHash, int srcIdx, //
 			long[] dstHash, int dstIdx) {
 		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
 			return 0;
 
-		int common = 0;
+		long common = 0;
 		int srcKey = keyOf(srcHash[srcIdx]);
 		int dstKey = keyOf(dstHash[dstIdx]);
 
@@ -287,13 +290,15 @@ void add(int key, int cnt) throws TableFullException {
 					j = slot(key);
 					continue;
 				}
-				idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
+				idHash[j] = pair(key, cnt);
 				idSize++;
 				return;
 
 			} else if (keyOf(v) == key) {
-				// Same key, increment the counter.
-				idHash[j] = v + cnt;
+				// Same key, increment the counter. If it overflows, fail
+				// indexing to prevent the key from being impacted.
+				//
+				idHash[j] = pair(key, countOf(v) + cnt);
 				return;
 
 			} else if (++j >= idHash.length) {
@@ -302,6 +307,12 @@ void add(int key, int cnt) throws TableFullException {
 		}
 	}
 
+	private static long pair(int key, long cnt) throws TableFullException {
+		if (MAX_COUNT < cnt)
+			throw new TableFullException();
+		return (((long) key) << KEY_SHIFT) | cnt;
+	}
+
 	private int slot(int key) {
 		// We use 31 - idHashBits because the upper bit was already forced
 		// to be 0 and we want the remaining high bits to be used as the
@@ -346,8 +357,8 @@ private static int keyOf(long v) {
 		return (int) (v >>> KEY_SHIFT);
 	}
 
-	private static int countOf(long v) {
-		return (int) v;
+	private static long countOf(long v) {
+		return v & MAX_COUNT;
 	}
 
 	static class TableFullException extends Exception {

From 918e6e20f04350557579add806f0deb2a59ba837 Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 11 Nov 2010 14:25:01 -0800
Subject: [PATCH 4/7] SimilarityRenameDetector: Only attempt to index large
 files once

If a file fails to index the first time the loop encounters it, the
file is likely to fail to index again on the next row.  Rather than
wasting a huge amount of CPU to index it again and fail, remember
which destination files failed to index and skip over them on each
subsequent row.

Because this condition is very unlikely, avoid allocating the BitSet
until its actually needed.  This keeps the memory usage unaffected
for the common case.

Change-Id: I93509b28b61a9bba8f681a7b4df4c6127bca2a09
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../eclipse/jgit/diff/SimilarityRenameDetector.java    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
index 89e71e666..bf1bbda63 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -49,6 +49,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.BitSet;
 import java.util.List;
 
 import org.eclipse.jgit.JGitText;
@@ -216,6 +217,7 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 
 		long[] srcSizes = new long[srcs.size()];
 		long[] dstSizes = new long[dsts.size()];
+		BitSet dstTooLarge = null;
 
 		// Init the size arrays to some value that indicates that we haven't
 		// calculated the size yet. Since sizes cannot be negative, -1 will work
@@ -255,6 +257,11 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 					continue;
 				}
 
+				if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
+					pm.update(1);
+					continue;
+				}
+
 				long srcSize = srcSizes[srcIdx];
 				if (srcSize < 0) {
 					srcSize = size(OLD, srcEnt);
@@ -279,6 +286,9 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 				try {
 					d = hash(NEW, dstEnt);
 				} catch (TableFullException tableFull) {
+					if (dstTooLarge == null)
+						dstTooLarge = new BitSet(dsts.size());
+					dstTooLarge.set(dstIdx);
 					tableOverflow = true;
 					pm.update(1);
 					continue;

From 68baa3097e721cec42e6a52b72e7a2fe3ea57b18 Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 11 Nov 2010 14:29:11 -0800
Subject: [PATCH 5/7] SimilarityRenameDetector: Avoid allocating source index

If the only file added is really small, and all of the deleted
files are really big, none of the permutations will match up due
to the sizes being too far apart to fit the current rename score.

Avoid allocating the really big deleted SimilarityIndex by deferring
its construction until at least one add along that row has a
reasonable chance of matching it.

This avoids expending a lot of CPU time looking at big deleted
binary files when a small modified text file was broken due to a
high percentage of changed lines.

Change-Id: I11ae37edb80a7be1eef8cc01d79412017c2fc075
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../jgit/diff/SimilarityRenameDetector.java   | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
index bf1bbda63..f47caf97f 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -229,20 +229,14 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 		// later find the best matches.
 		//
 		int mNext = 0;
-		for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
+		SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
 			DiffEntry srcEnt = srcs.get(srcIdx);
 			if (!isFile(srcEnt.oldMode)) {
 				pm.update(dsts.size());
 				continue;
 			}
 
-			SimilarityIndex s;
-			try {
-				s = hash(OLD, srcEnt);
-			} catch (TableFullException tableFull) {
-				tableOverflow = true;
-				continue;
-			}
+			SimilarityIndex s = null;
 
 			for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
 				DiffEntry dstEnt = dsts.get(dstIdx);
@@ -282,6 +276,15 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 					continue;
 				}
 
+				if (s == null) {
+					try {
+						s = hash(OLD, srcEnt);
+					} catch (TableFullException tableFull) {
+						tableOverflow = true;
+						continue SRC;
+					}
+				}
+
 				SimilarityIndex d;
 				try {
 					d = hash(NEW, dstEnt);

From 05653bda04a8199fceacd7f8b26c8af4dd8a8f3a Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 11 Nov 2010 14:43:22 -0800
Subject: [PATCH 6/7] SimilarityRenameDetector: Initialize sizes to 0

Setting the array elements to -1 is more expensive than relying on
the allocator to zero the array for us first.  Shifting the code to
always add 1 to the size (so an empty file is actually 1 byte long)
allows us to detect an unloaded size by comparing to 0, thus saving
the array fill calls.

Change-Id: Iad859e910655675b53ba70de8e6fceaef7cfcdd1
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../eclipse/jgit/diff/SimilarityRenameDetector.java | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
index f47caf97f..3a9847545 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -219,11 +219,6 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 		long[] dstSizes = new long[dsts.size()];
 		BitSet dstTooLarge = null;
 
-		// Init the size arrays to some value that indicates that we haven't
-		// calculated the size yet. Since sizes cannot be negative, -1 will work
-		Arrays.fill(srcSizes, -1);
-		Arrays.fill(dstSizes, -1);
-
 		// Consider each pair of files, if the score is above the minimum
 		// threshold we need record that scoring in the matrix so we can
 		// later find the best matches.
@@ -257,14 +252,14 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
 				}
 
 				long srcSize = srcSizes[srcIdx];
-				if (srcSize < 0) {
-					srcSize = size(OLD, srcEnt);
+				if (srcSize == 0) {
+					srcSize = size(OLD, srcEnt) + 1;
 					srcSizes[srcIdx] = srcSize;
 				}
 
 				long dstSize = dstSizes[dstIdx];
-				if (dstSize < 0) {
-					dstSize = size(NEW, dstEnt);
+				if (dstSize == 0) {
+					dstSize = size(NEW, dstEnt) + 1;
 					dstSizes[dstIdx] = dstSize;
 				}
 

From bc9bca064d9d41f11763a383aaca9c1710c61b4f Mon Sep 17 00:00:00 2001
From: "Shawn O. Pearce" <spearce@spearce.org>
Date: Thu, 11 Nov 2010 15:02:57 -0800
Subject: [PATCH 7/7] RenameDetector: Only scan deletes if adds exist

If there are only deletes, there is no need to perform rename or
copy detection.  There are no adds (aka destinations) for the
deletes to match against.

Change-Id: I00fb90c509fa26a053de561dd8506cc1e0f5799a
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
 .../org/eclipse/jgit/diff/RenameDetector.java   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
index 9d9a96d8d..dfaf5886e 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
@@ -357,9 +357,17 @@ public List<DiffEntry> compute(ContentSource.Pair reader, ProgressMonitor pm)
 
 			if (pm == null)
 				pm = NullProgressMonitor.INSTANCE;
+
+			if (0 < breakScore)
 				breakModifies(reader, pm);
+
+			if (!added.isEmpty() && !deleted.isEmpty())
 				findExactRenames(pm);
+
+			if (!added.isEmpty() && !deleted.isEmpty())
 				findContentRenames(reader, pm);
+
+			if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
 				rejoinModifies(pm);
 
 			entries.addAll(added);
@@ -383,9 +391,6 @@ public void reset() {
 
 	private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
 			throws IOException {
-		if (breakScore <= 0)
-			return;
-
 		ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
 
 		pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
@@ -469,9 +474,6 @@ private void findContentRenames(ContentSource.Pair reader,
 			ProgressMonitor pm)
 			throws IOException {
 		int cnt = Math.max(added.size(), deleted.size());
-		if (cnt == 0)
-			return;
-
 		if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
 			SimilarityRenameDetector d;
 
@@ -489,9 +491,6 @@ private void findContentRenames(ContentSource.Pair reader,
 
 	@SuppressWarnings("unchecked")
 	private void findExactRenames(ProgressMonitor pm) {
-		if (added.isEmpty() || deleted.isEmpty())
-			return;
-
 		pm.beginTask(JGitText.get().renamesFindingExact, //
 				added.size() + added.size() + deleted.size()
 						+ added.size() * deleted.size());