SimilarityIndex: Accept files larger than 8 MB
Files bigger than 8 MB (2^23 bytes) tended to overflow the internal hashtable, as the table was capped in size to 2^17 records. If a file contained 2^17 unique data blocks/lines, the table insertion got stuck in an infinite loop as the table couldn't grow, and there was no open slot for the new item. Remove the artificial 2^17 table limit and instead allow the table to grow to be as big as 2^30. With a 64-byte block size, this permits hashing inputs as large as 64 GB. If the table reaches 2^30 (or cannot be allocated), hashing is aborted. RenameDetector no longer tries to break a modify file pair, and it does not try to match the file for rename or copy detection. Change-Id: Ibb4d756844f4667e181e24a34a468dc3655863ac Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
This commit is contained in:
parent
f3b511568b
commit
d63887127e
|
@ -48,10 +48,11 @@
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||||
import org.eclipse.jgit.lib.Constants;
|
import org.eclipse.jgit.lib.Constants;
|
||||||
|
|
||||||
public class SimilarityIndexTest extends TestCase {
|
public class SimilarityIndexTest extends TestCase {
|
||||||
public void testIndexingSmallObject() {
|
public void testIndexingSmallObject() throws TableFullException {
|
||||||
SimilarityIndex si = hash("" //
|
SimilarityIndex si = hash("" //
|
||||||
+ "A\n" //
|
+ "A\n" //
|
||||||
+ "B\n" //
|
+ "B\n" //
|
||||||
|
@ -70,7 +71,8 @@ public void testIndexingSmallObject() {
|
||||||
assertEquals(2, si.count(si.findIndex(key_D)));
|
assertEquals(2, si.count(si.findIndex(key_D)));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testIndexingLargeObject() throws IOException {
|
public void testIndexingLargeObject() throws IOException,
|
||||||
|
TableFullException {
|
||||||
byte[] in = ("" //
|
byte[] in = ("" //
|
||||||
+ "A\n" //
|
+ "A\n" //
|
||||||
+ "B\n" //
|
+ "B\n" //
|
||||||
|
@ -81,7 +83,7 @@ public void testIndexingLargeObject() throws IOException {
|
||||||
assertEquals(2, si.size());
|
assertEquals(2, si.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommonScore_SameFiles() {
|
public void testCommonScore_SameFiles() throws TableFullException {
|
||||||
String text = "" //
|
String text = "" //
|
||||||
+ "A\n" //
|
+ "A\n" //
|
||||||
+ "B\n" //
|
+ "B\n" //
|
||||||
|
@ -96,21 +98,22 @@ public void testCommonScore_SameFiles() {
|
||||||
assertEquals(100, dst.score(src, 100));
|
assertEquals(100, dst.score(src, 100));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommonScore_EmptyFiles() {
|
public void testCommonScore_EmptyFiles() throws TableFullException {
|
||||||
SimilarityIndex src = hash("");
|
SimilarityIndex src = hash("");
|
||||||
SimilarityIndex dst = hash("");
|
SimilarityIndex dst = hash("");
|
||||||
assertEquals(0, src.common(dst));
|
assertEquals(0, src.common(dst));
|
||||||
assertEquals(0, dst.common(src));
|
assertEquals(0, dst.common(src));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommonScore_TotallyDifferentFiles() {
|
public void testCommonScore_TotallyDifferentFiles()
|
||||||
|
throws TableFullException {
|
||||||
SimilarityIndex src = hash("A\n");
|
SimilarityIndex src = hash("A\n");
|
||||||
SimilarityIndex dst = hash("D\n");
|
SimilarityIndex dst = hash("D\n");
|
||||||
assertEquals(0, src.common(dst));
|
assertEquals(0, src.common(dst));
|
||||||
assertEquals(0, dst.common(src));
|
assertEquals(0, dst.common(src));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCommonScore_SimiliarBy75() {
|
public void testCommonScore_SimiliarBy75() throws TableFullException {
|
||||||
SimilarityIndex src = hash("A\nB\nC\nD\n");
|
SimilarityIndex src = hash("A\nB\nC\nD\n");
|
||||||
SimilarityIndex dst = hash("A\nB\nC\nQ\n");
|
SimilarityIndex dst = hash("A\nB\nC\nQ\n");
|
||||||
assertEquals(6, src.common(dst));
|
assertEquals(6, src.common(dst));
|
||||||
|
@ -120,10 +123,11 @@ public void testCommonScore_SimiliarBy75() {
|
||||||
assertEquals(75, dst.score(src, 100));
|
assertEquals(75, dst.score(src, 100));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static SimilarityIndex hash(String text) {
|
private static SimilarityIndex hash(String text) throws TableFullException {
|
||||||
SimilarityIndex src = new SimilarityIndex() {
|
SimilarityIndex src = new SimilarityIndex() {
|
||||||
@Override
|
@Override
|
||||||
void hash(byte[] raw, int ptr, final int end) {
|
void hash(byte[] raw, int ptr, final int end)
|
||||||
|
throws TableFullException {
|
||||||
while (ptr < end) {
|
while (ptr < end) {
|
||||||
int hash = raw[ptr] & 0xff;
|
int hash = raw[ptr] & 0xff;
|
||||||
int start = ptr;
|
int start = ptr;
|
||||||
|
@ -143,7 +147,7 @@ void hash(byte[] raw, int ptr, final int end) {
|
||||||
return src;
|
return src;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int keyFor(String line) {
|
private static int keyFor(String line) throws TableFullException {
|
||||||
SimilarityIndex si = hash(line);
|
SimilarityIndex si = hash(line);
|
||||||
assertEquals("single line scored", 1, si.size());
|
assertEquals("single line scored", 1, si.size());
|
||||||
return si.key(0);
|
return si.key(0);
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
|
|
||||||
import org.eclipse.jgit.JGitText;
|
import org.eclipse.jgit.JGitText;
|
||||||
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
||||||
|
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||||
import org.eclipse.jgit.lib.AbbreviatedObjectId;
|
import org.eclipse.jgit.lib.AbbreviatedObjectId;
|
||||||
import org.eclipse.jgit.lib.FileMode;
|
import org.eclipse.jgit.lib.FileMode;
|
||||||
import org.eclipse.jgit.lib.NullProgressMonitor;
|
import org.eclipse.jgit.lib.NullProgressMonitor;
|
||||||
|
@ -445,14 +446,23 @@ private void rejoinModifies(ProgressMonitor pm) {
|
||||||
|
|
||||||
private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
|
private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
SimilarityIndex src = new SimilarityIndex();
|
try {
|
||||||
src.hash(reader.open(OLD, d));
|
SimilarityIndex src = new SimilarityIndex();
|
||||||
src.sort();
|
src.hash(reader.open(OLD, d));
|
||||||
|
src.sort();
|
||||||
|
|
||||||
SimilarityIndex dst = new SimilarityIndex();
|
SimilarityIndex dst = new SimilarityIndex();
|
||||||
dst.hash(reader.open(NEW, d));
|
dst.hash(reader.open(NEW, d));
|
||||||
dst.sort();
|
dst.sort();
|
||||||
return src.score(dst, 100);
|
return src.score(dst, 100);
|
||||||
|
} catch (TableFullException tableFull) {
|
||||||
|
// If either table overflowed while being constructed, don't allow
|
||||||
|
// the pair to be broken. Returning 1 higher than breakScore will
|
||||||
|
// ensure its not similar, but not quite dissimilar enough to break.
|
||||||
|
//
|
||||||
|
overRenameLimit = true;
|
||||||
|
return breakScore + 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void findContentRenames(ContentSource.Pair reader,
|
private void findContentRenames(ContentSource.Pair reader,
|
||||||
|
@ -468,6 +478,7 @@ private void findContentRenames(ContentSource.Pair reader,
|
||||||
d = new SimilarityRenameDetector(reader, deleted, added);
|
d = new SimilarityRenameDetector(reader, deleted, added);
|
||||||
d.setRenameScore(getRenameScore());
|
d.setRenameScore(getRenameScore());
|
||||||
d.compute(pm);
|
d.compute(pm);
|
||||||
|
overRenameLimit |= d.isTableOverflow();
|
||||||
deleted = d.getLeftOverSources();
|
deleted = d.getLeftOverSources();
|
||||||
added = d.getLeftOverDestinations();
|
added = d.getLeftOverDestinations();
|
||||||
entries.addAll(d.getMatches());
|
entries.addAll(d.getMatches());
|
||||||
|
|
|
@ -65,8 +65,8 @@
|
||||||
* file are discovered.
|
* file are discovered.
|
||||||
*/
|
*/
|
||||||
class SimilarityIndex {
|
class SimilarityIndex {
|
||||||
/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
|
/** A special {@link TableFullException} used in place of OutOfMemoryError. */
|
||||||
private static final int MAX_HASH_BITS = 17;
|
private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Shift to apply before storing a key.
|
* Shift to apply before storing a key.
|
||||||
|
@ -82,14 +82,17 @@ class SimilarityIndex {
|
||||||
/** Number of non-zero entries in {@link #idHash}. */
|
/** Number of non-zero entries in {@link #idHash}. */
|
||||||
private int idSize;
|
private int idSize;
|
||||||
|
|
||||||
|
/** {@link #idSize} that triggers {@link #idHash} to double in size. */
|
||||||
|
private int idGrowAt;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pairings of content keys and counters.
|
* Pairings of content keys and counters.
|
||||||
* <p>
|
* <p>
|
||||||
* Slots in the table are actually two ints wedged into a single long. The
|
* Slots in the table are actually two ints wedged into a single long. The
|
||||||
* upper {@link #MAX_HASH_BITS} bits stores the content key, and the
|
* upper 32 bits stores the content key, and the remaining lower bits stores
|
||||||
* remaining lower bits stores the number of bytes associated with that key.
|
* the number of bytes associated with that key. Empty slots are denoted by
|
||||||
* Empty slots are denoted by 0, which cannot occur because the count cannot
|
* 0, which cannot occur because the count cannot be 0. Values can only be
|
||||||
* be 0. Values can only be positive, which we enforce during key addition.
|
* positive, which we enforce during key addition.
|
||||||
*/
|
*/
|
||||||
private long[] idHash;
|
private long[] idHash;
|
||||||
|
|
||||||
|
@ -99,6 +102,7 @@ class SimilarityIndex {
|
||||||
SimilarityIndex() {
|
SimilarityIndex() {
|
||||||
idHashBits = 8;
|
idHashBits = 8;
|
||||||
idHash = new long[1 << idHashBits];
|
idHash = new long[1 << idHashBits];
|
||||||
|
idGrowAt = growAt(idHashBits);
|
||||||
}
|
}
|
||||||
|
|
||||||
long getFileSize() {
|
long getFileSize() {
|
||||||
|
@ -109,7 +113,8 @@ void setFileSize(long size) {
|
||||||
fileSize = size;
|
fileSize = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void hash(ObjectLoader obj) throws MissingObjectException, IOException {
|
void hash(ObjectLoader obj) throws MissingObjectException, IOException,
|
||||||
|
TableFullException {
|
||||||
if (obj.isLarge()) {
|
if (obj.isLarge()) {
|
||||||
ObjectStream in = obj.openStream();
|
ObjectStream in = obj.openStream();
|
||||||
try {
|
try {
|
||||||
|
@ -125,7 +130,7 @@ void hash(ObjectLoader obj) throws MissingObjectException, IOException {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void hash(byte[] raw, int ptr, final int end) {
|
void hash(byte[] raw, int ptr, final int end) throws TableFullException {
|
||||||
while (ptr < end) {
|
while (ptr < end) {
|
||||||
int hash = 5381;
|
int hash = 5381;
|
||||||
int start = ptr;
|
int start = ptr;
|
||||||
|
@ -141,7 +146,8 @@ void hash(byte[] raw, int ptr, final int end) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void hash(InputStream in, long remaining) throws IOException {
|
void hash(InputStream in, long remaining) throws IOException,
|
||||||
|
TableFullException {
|
||||||
byte[] buf = new byte[4096];
|
byte[] buf = new byte[4096];
|
||||||
int ptr = 0;
|
int ptr = 0;
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
@ -268,7 +274,7 @@ private int packedIndex(int idx) {
|
||||||
return (idHash.length - idSize) + idx;
|
return (idHash.length - idSize) + idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add(int key, int cnt) {
|
void add(int key, int cnt) throws TableFullException {
|
||||||
key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
|
key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
|
||||||
|
|
||||||
int j = slot(key);
|
int j = slot(key);
|
||||||
|
@ -276,7 +282,7 @@ void add(int key, int cnt) {
|
||||||
long v = idHash[j];
|
long v = idHash[j];
|
||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
// Empty slot in the table, store here.
|
// Empty slot in the table, store here.
|
||||||
if (shouldGrow()) {
|
if (idGrowAt <= idSize) {
|
||||||
grow();
|
grow();
|
||||||
j = slot(key);
|
j = slot(key);
|
||||||
continue;
|
continue;
|
||||||
|
@ -304,16 +310,26 @@ private int slot(int key) {
|
||||||
return key >>> (31 - idHashBits);
|
return key >>> (31 - idHashBits);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean shouldGrow() {
|
private static int growAt(int idHashBits) {
|
||||||
return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
|
return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void grow() {
|
private void grow() throws TableFullException {
|
||||||
|
if (idHashBits == 30)
|
||||||
|
throw new TableFullException();
|
||||||
|
|
||||||
long[] oldHash = idHash;
|
long[] oldHash = idHash;
|
||||||
int oldSize = idHash.length;
|
int oldSize = idHash.length;
|
||||||
|
|
||||||
idHashBits++;
|
idHashBits++;
|
||||||
idHash = new long[1 << idHashBits];
|
idGrowAt = growAt(idHashBits);
|
||||||
|
|
||||||
|
try {
|
||||||
|
idHash = new long[1 << idHashBits];
|
||||||
|
} catch (OutOfMemoryError noMemory) {
|
||||||
|
throw TABLE_FULL_OUT_OF_MEMORY;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < oldSize; i++) {
|
for (int i = 0; i < oldSize; i++) {
|
||||||
long v = oldHash[i];
|
long v = oldHash[i];
|
||||||
if (v != 0) {
|
if (v != 0) {
|
||||||
|
@ -333,4 +349,8 @@ private static int keyOf(long v) {
|
||||||
private static int countOf(long v) {
|
private static int countOf(long v) {
|
||||||
return (int) v;
|
return (int) v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static class TableFullException extends Exception {
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,6 +53,7 @@
|
||||||
|
|
||||||
import org.eclipse.jgit.JGitText;
|
import org.eclipse.jgit.JGitText;
|
||||||
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
||||||
|
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||||
import org.eclipse.jgit.lib.FileMode;
|
import org.eclipse.jgit.lib.FileMode;
|
||||||
import org.eclipse.jgit.lib.NullProgressMonitor;
|
import org.eclipse.jgit.lib.NullProgressMonitor;
|
||||||
import org.eclipse.jgit.lib.ProgressMonitor;
|
import org.eclipse.jgit.lib.ProgressMonitor;
|
||||||
|
@ -110,6 +111,9 @@ class SimilarityRenameDetector {
|
||||||
/** Score a pair must exceed to be considered a rename. */
|
/** Score a pair must exceed to be considered a rename. */
|
||||||
private int renameScore = 60;
|
private int renameScore = 60;
|
||||||
|
|
||||||
|
/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
|
||||||
|
private boolean tableOverflow;
|
||||||
|
|
||||||
private List<DiffEntry> out;
|
private List<DiffEntry> out;
|
||||||
|
|
||||||
SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
|
SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
|
||||||
|
@ -182,6 +186,10 @@ List<DiffEntry> getLeftOverDestinations() {
|
||||||
return dsts;
|
return dsts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boolean isTableOverflow() {
|
||||||
|
return tableOverflow;
|
||||||
|
}
|
||||||
|
|
||||||
private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
|
private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
|
||||||
ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
|
ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
|
||||||
for (DiffEntry e : in) {
|
for (DiffEntry e : in) {
|
||||||
|
@ -226,7 +234,14 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
SimilarityIndex s = hash(OLD, srcEnt);
|
SimilarityIndex s;
|
||||||
|
try {
|
||||||
|
s = hash(OLD, srcEnt);
|
||||||
|
} catch (TableFullException tableFull) {
|
||||||
|
tableOverflow = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
|
for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
|
||||||
DiffEntry dstEnt = dsts.get(dstIdx);
|
DiffEntry dstEnt = dsts.get(dstIdx);
|
||||||
|
|
||||||
|
@ -260,7 +275,15 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
SimilarityIndex d = hash(NEW, dstEnt);
|
SimilarityIndex d;
|
||||||
|
try {
|
||||||
|
d = hash(NEW, dstEnt);
|
||||||
|
} catch (TableFullException tableFull) {
|
||||||
|
tableOverflow = true;
|
||||||
|
pm.update(1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
int contentScore = s.score(d, 10000);
|
int contentScore = s.score(d, 10000);
|
||||||
|
|
||||||
// nameScore returns a value between 0 and 100, but we want it
|
// nameScore returns a value between 0 and 100, but we want it
|
||||||
|
@ -336,7 +359,7 @@ static int nameScore(String a, String b) {
|
||||||
}
|
}
|
||||||
|
|
||||||
private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
|
private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
|
||||||
throws IOException {
|
throws IOException, TableFullException {
|
||||||
SimilarityIndex r = new SimilarityIndex();
|
SimilarityIndex r = new SimilarityIndex();
|
||||||
r.hash(reader.open(side, ent));
|
r.hash(reader.open(side, ent));
|
||||||
r.sort();
|
r.sort();
|
||||||
|
|
Loading…
Reference in New Issue