Merge branch 'rename-detection'
* rename-detection: RenameDetector: Only scan deletes if adds exist SimilarityRenameDetector: Initialize sizes to 0 SimilarityRenameDetector: Avoid allocating source index SimilarityRenameDetector: Only attempt to index large files once SimilarityIndex: Don't overflow internal counter fields SimilarityIndex: Accept files larger than 8 MB SimilarityIndex: Correct comment explaining the logic
This commit is contained in:
commit
51bf8ea2a4
|
@ -48,10 +48,11 @@
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||
import org.eclipse.jgit.lib.Constants;
|
||||
|
||||
public class SimilarityIndexTest extends TestCase {
|
||||
public void testIndexingSmallObject() {
|
||||
public void testIndexingSmallObject() throws TableFullException {
|
||||
SimilarityIndex si = hash("" //
|
||||
+ "A\n" //
|
||||
+ "B\n" //
|
||||
|
@ -70,7 +71,8 @@ public void testIndexingSmallObject() {
|
|||
assertEquals(2, si.count(si.findIndex(key_D)));
|
||||
}
|
||||
|
||||
public void testIndexingLargeObject() throws IOException {
|
||||
public void testIndexingLargeObject() throws IOException,
|
||||
TableFullException {
|
||||
byte[] in = ("" //
|
||||
+ "A\n" //
|
||||
+ "B\n" //
|
||||
|
@ -81,7 +83,7 @@ public void testIndexingLargeObject() throws IOException {
|
|||
assertEquals(2, si.size());
|
||||
}
|
||||
|
||||
public void testCommonScore_SameFiles() {
|
||||
public void testCommonScore_SameFiles() throws TableFullException {
|
||||
String text = "" //
|
||||
+ "A\n" //
|
||||
+ "B\n" //
|
||||
|
@ -96,21 +98,22 @@ public void testCommonScore_SameFiles() {
|
|||
assertEquals(100, dst.score(src, 100));
|
||||
}
|
||||
|
||||
public void testCommonScore_EmptyFiles() {
|
||||
public void testCommonScore_EmptyFiles() throws TableFullException {
|
||||
SimilarityIndex src = hash("");
|
||||
SimilarityIndex dst = hash("");
|
||||
assertEquals(0, src.common(dst));
|
||||
assertEquals(0, dst.common(src));
|
||||
}
|
||||
|
||||
public void testCommonScore_TotallyDifferentFiles() {
|
||||
public void testCommonScore_TotallyDifferentFiles()
|
||||
throws TableFullException {
|
||||
SimilarityIndex src = hash("A\n");
|
||||
SimilarityIndex dst = hash("D\n");
|
||||
assertEquals(0, src.common(dst));
|
||||
assertEquals(0, dst.common(src));
|
||||
}
|
||||
|
||||
public void testCommonScore_SimiliarBy75() {
|
||||
public void testCommonScore_SimiliarBy75() throws TableFullException {
|
||||
SimilarityIndex src = hash("A\nB\nC\nD\n");
|
||||
SimilarityIndex dst = hash("A\nB\nC\nQ\n");
|
||||
assertEquals(6, src.common(dst));
|
||||
|
@ -120,10 +123,11 @@ public void testCommonScore_SimiliarBy75() {
|
|||
assertEquals(75, dst.score(src, 100));
|
||||
}
|
||||
|
||||
private static SimilarityIndex hash(String text) {
|
||||
private static SimilarityIndex hash(String text) throws TableFullException {
|
||||
SimilarityIndex src = new SimilarityIndex() {
|
||||
@Override
|
||||
void hash(byte[] raw, int ptr, final int end) {
|
||||
void hash(byte[] raw, int ptr, final int end)
|
||||
throws TableFullException {
|
||||
while (ptr < end) {
|
||||
int hash = raw[ptr] & 0xff;
|
||||
int start = ptr;
|
||||
|
@ -143,7 +147,7 @@ void hash(byte[] raw, int ptr, final int end) {
|
|||
return src;
|
||||
}
|
||||
|
||||
private static int keyFor(String line) {
|
||||
private static int keyFor(String line) throws TableFullException {
|
||||
SimilarityIndex si = hash(line);
|
||||
assertEquals("single line scored", 1, si.size());
|
||||
return si.key(0);
|
||||
|
|
|
@ -57,6 +57,7 @@
|
|||
|
||||
import org.eclipse.jgit.JGitText;
|
||||
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
||||
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||
import org.eclipse.jgit.lib.AbbreviatedObjectId;
|
||||
import org.eclipse.jgit.lib.FileMode;
|
||||
import org.eclipse.jgit.lib.NullProgressMonitor;
|
||||
|
@ -356,9 +357,17 @@ public List<DiffEntry> compute(ContentSource.Pair reader, ProgressMonitor pm)
|
|||
|
||||
if (pm == null)
|
||||
pm = NullProgressMonitor.INSTANCE;
|
||||
|
||||
if (0 < breakScore)
|
||||
breakModifies(reader, pm);
|
||||
|
||||
if (!added.isEmpty() && !deleted.isEmpty())
|
||||
findExactRenames(pm);
|
||||
|
||||
if (!added.isEmpty() && !deleted.isEmpty())
|
||||
findContentRenames(reader, pm);
|
||||
|
||||
if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
|
||||
rejoinModifies(pm);
|
||||
|
||||
entries.addAll(added);
|
||||
|
@ -382,9 +391,6 @@ public void reset() {
|
|||
|
||||
private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
|
||||
throws IOException {
|
||||
if (breakScore <= 0)
|
||||
return;
|
||||
|
||||
ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
|
||||
|
||||
pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
|
||||
|
@ -445,29 +451,36 @@ private void rejoinModifies(ProgressMonitor pm) {
|
|||
|
||||
private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
|
||||
throws IOException {
|
||||
SimilarityIndex src = new SimilarityIndex();
|
||||
src.hash(reader.open(OLD, d));
|
||||
src.sort();
|
||||
try {
|
||||
SimilarityIndex src = new SimilarityIndex();
|
||||
src.hash(reader.open(OLD, d));
|
||||
src.sort();
|
||||
|
||||
SimilarityIndex dst = new SimilarityIndex();
|
||||
dst.hash(reader.open(NEW, d));
|
||||
dst.sort();
|
||||
return src.score(dst, 100);
|
||||
SimilarityIndex dst = new SimilarityIndex();
|
||||
dst.hash(reader.open(NEW, d));
|
||||
dst.sort();
|
||||
return src.score(dst, 100);
|
||||
} catch (TableFullException tableFull) {
|
||||
// If either table overflowed while being constructed, don't allow
|
||||
// the pair to be broken. Returning 1 higher than breakScore will
|
||||
// ensure its not similar, but not quite dissimilar enough to break.
|
||||
//
|
||||
overRenameLimit = true;
|
||||
return breakScore + 1;
|
||||
}
|
||||
}
|
||||
|
||||
private void findContentRenames(ContentSource.Pair reader,
|
||||
ProgressMonitor pm)
|
||||
throws IOException {
|
||||
int cnt = Math.max(added.size(), deleted.size());
|
||||
if (cnt == 0)
|
||||
return;
|
||||
|
||||
if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
|
||||
SimilarityRenameDetector d;
|
||||
|
||||
d = new SimilarityRenameDetector(reader, deleted, added);
|
||||
d.setRenameScore(getRenameScore());
|
||||
d.compute(pm);
|
||||
overRenameLimit |= d.isTableOverflow();
|
||||
deleted = d.getLeftOverSources();
|
||||
added = d.getLeftOverDestinations();
|
||||
entries.addAll(d.getMatches());
|
||||
|
@ -478,9 +491,6 @@ private void findContentRenames(ContentSource.Pair reader,
|
|||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void findExactRenames(ProgressMonitor pm) {
|
||||
if (added.isEmpty() || deleted.isEmpty())
|
||||
return;
|
||||
|
||||
pm.beginTask(JGitText.get().renamesFindingExact, //
|
||||
added.size() + added.size() + deleted.size()
|
||||
+ added.size() * deleted.size());
|
||||
|
|
|
@ -65,8 +65,8 @@
|
|||
* file are discovered.
|
||||
*/
|
||||
class SimilarityIndex {
|
||||
/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
|
||||
private static final int MAX_HASH_BITS = 17;
|
||||
/** A special {@link TableFullException} used in place of OutOfMemoryError. */
|
||||
private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
|
||||
|
||||
/**
|
||||
* Shift to apply before storing a key.
|
||||
|
@ -76,20 +76,26 @@ class SimilarityIndex {
|
|||
*/
|
||||
private static final int KEY_SHIFT = 32;
|
||||
|
||||
/** Maximum value of the count field, also mask to extract the count. */
|
||||
private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
|
||||
|
||||
/** Total size of the file we hashed into the structure. */
|
||||
private long fileSize;
|
||||
|
||||
/** Number of non-zero entries in {@link #idHash}. */
|
||||
private int idSize;
|
||||
|
||||
/** {@link #idSize} that triggers {@link #idHash} to double in size. */
|
||||
private int idGrowAt;
|
||||
|
||||
/**
|
||||
* Pairings of content keys and counters.
|
||||
* <p>
|
||||
* Slots in the table are actually two ints wedged into a single long. The
|
||||
* upper {@link #MAX_HASH_BITS} bits stores the content key, and the
|
||||
* remaining lower bits stores the number of bytes associated with that key.
|
||||
* Empty slots are denoted by 0, which cannot occur because the count cannot
|
||||
* be 0. Values can only be positive, which we enforce during key addition.
|
||||
* upper 32 bits stores the content key, and the remaining lower bits stores
|
||||
* the number of bytes associated with that key. Empty slots are denoted by
|
||||
* 0, which cannot occur because the count cannot be 0. Values can only be
|
||||
* positive, which we enforce during key addition.
|
||||
*/
|
||||
private long[] idHash;
|
||||
|
||||
|
@ -99,6 +105,7 @@ class SimilarityIndex {
|
|||
SimilarityIndex() {
|
||||
idHashBits = 8;
|
||||
idHash = new long[1 << idHashBits];
|
||||
idGrowAt = growAt(idHashBits);
|
||||
}
|
||||
|
||||
long getFileSize() {
|
||||
|
@ -109,7 +116,8 @@ void setFileSize(long size) {
|
|||
fileSize = size;
|
||||
}
|
||||
|
||||
void hash(ObjectLoader obj) throws MissingObjectException, IOException {
|
||||
void hash(ObjectLoader obj) throws MissingObjectException, IOException,
|
||||
TableFullException {
|
||||
if (obj.isLarge()) {
|
||||
ObjectStream in = obj.openStream();
|
||||
try {
|
||||
|
@ -125,7 +133,7 @@ void hash(ObjectLoader obj) throws MissingObjectException, IOException {
|
|||
}
|
||||
}
|
||||
|
||||
void hash(byte[] raw, int ptr, final int end) {
|
||||
void hash(byte[] raw, int ptr, final int end) throws TableFullException {
|
||||
while (ptr < end) {
|
||||
int hash = 5381;
|
||||
int start = ptr;
|
||||
|
@ -141,7 +149,8 @@ void hash(byte[] raw, int ptr, final int end) {
|
|||
}
|
||||
}
|
||||
|
||||
void hash(InputStream in, long remaining) throws IOException {
|
||||
void hash(InputStream in, long remaining) throws IOException,
|
||||
TableFullException {
|
||||
byte[] buf = new byte[4096];
|
||||
int ptr = 0;
|
||||
int cnt = 0;
|
||||
|
@ -190,11 +199,11 @@ int score(SimilarityIndex dst, int maxScore) {
|
|||
return (int) ((common(dst) * maxScore) / max);
|
||||
}
|
||||
|
||||
int common(SimilarityIndex dst) {
|
||||
long common(SimilarityIndex dst) {
|
||||
return common(this, dst);
|
||||
}
|
||||
|
||||
private static int common(SimilarityIndex src, SimilarityIndex dst) {
|
||||
private static long common(SimilarityIndex src, SimilarityIndex dst) {
|
||||
int srcIdx = src.packedIndex(0);
|
||||
int dstIdx = dst.packedIndex(0);
|
||||
long[] srcHash = src.idHash;
|
||||
|
@ -202,12 +211,12 @@ private static int common(SimilarityIndex src, SimilarityIndex dst) {
|
|||
return common(srcHash, srcIdx, dstHash, dstIdx);
|
||||
}
|
||||
|
||||
private static int common(long[] srcHash, int srcIdx, //
|
||||
private static long common(long[] srcHash, int srcIdx, //
|
||||
long[] dstHash, int dstIdx) {
|
||||
if (srcIdx == srcHash.length || dstIdx == dstHash.length)
|
||||
return 0;
|
||||
|
||||
int common = 0;
|
||||
long common = 0;
|
||||
int srcKey = keyOf(srcHash[srcIdx]);
|
||||
int dstKey = keyOf(dstHash[dstIdx]);
|
||||
|
||||
|
@ -230,8 +239,8 @@ private static int common(long[] srcHash, int srcIdx, //
|
|||
break;
|
||||
srcKey = keyOf(srcHash[srcIdx]);
|
||||
|
||||
} else /* if (srcKey > dstKey) */{
|
||||
// Regions of dst which do not appear in dst.
|
||||
} else /* if (dstKey < srcKey) */{
|
||||
// Regions of dst which do not appear in src.
|
||||
if (++dstIdx == dstHash.length)
|
||||
break;
|
||||
dstKey = keyOf(dstHash[dstIdx]);
|
||||
|
@ -268,7 +277,7 @@ private int packedIndex(int idx) {
|
|||
return (idHash.length - idSize) + idx;
|
||||
}
|
||||
|
||||
void add(int key, int cnt) {
|
||||
void add(int key, int cnt) throws TableFullException {
|
||||
key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.
|
||||
|
||||
int j = slot(key);
|
||||
|
@ -276,18 +285,20 @@ void add(int key, int cnt) {
|
|||
long v = idHash[j];
|
||||
if (v == 0) {
|
||||
// Empty slot in the table, store here.
|
||||
if (shouldGrow()) {
|
||||
if (idGrowAt <= idSize) {
|
||||
grow();
|
||||
j = slot(key);
|
||||
continue;
|
||||
}
|
||||
idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
|
||||
idHash[j] = pair(key, cnt);
|
||||
idSize++;
|
||||
return;
|
||||
|
||||
} else if (keyOf(v) == key) {
|
||||
// Same key, increment the counter.
|
||||
idHash[j] = v + cnt;
|
||||
// Same key, increment the counter. If it overflows, fail
|
||||
// indexing to prevent the key from being impacted.
|
||||
//
|
||||
idHash[j] = pair(key, countOf(v) + cnt);
|
||||
return;
|
||||
|
||||
} else if (++j >= idHash.length) {
|
||||
|
@ -296,6 +307,12 @@ void add(int key, int cnt) {
|
|||
}
|
||||
}
|
||||
|
||||
private static long pair(int key, long cnt) throws TableFullException {
|
||||
if (MAX_COUNT < cnt)
|
||||
throw new TableFullException();
|
||||
return (((long) key) << KEY_SHIFT) | cnt;
|
||||
}
|
||||
|
||||
private int slot(int key) {
|
||||
// We use 31 - idHashBits because the upper bit was already forced
|
||||
// to be 0 and we want the remaining high bits to be used as the
|
||||
|
@ -304,16 +321,26 @@ private int slot(int key) {
|
|||
return key >>> (31 - idHashBits);
|
||||
}
|
||||
|
||||
private boolean shouldGrow() {
|
||||
return idHashBits < MAX_HASH_BITS && idHash.length <= idSize * 2;
|
||||
private static int growAt(int idHashBits) {
|
||||
return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
|
||||
}
|
||||
|
||||
private void grow() {
|
||||
private void grow() throws TableFullException {
|
||||
if (idHashBits == 30)
|
||||
throw new TableFullException();
|
||||
|
||||
long[] oldHash = idHash;
|
||||
int oldSize = idHash.length;
|
||||
|
||||
idHashBits++;
|
||||
idHash = new long[1 << idHashBits];
|
||||
idGrowAt = growAt(idHashBits);
|
||||
|
||||
try {
|
||||
idHash = new long[1 << idHashBits];
|
||||
} catch (OutOfMemoryError noMemory) {
|
||||
throw TABLE_FULL_OUT_OF_MEMORY;
|
||||
}
|
||||
|
||||
for (int i = 0; i < oldSize; i++) {
|
||||
long v = oldHash[i];
|
||||
if (v != 0) {
|
||||
|
@ -330,7 +357,11 @@ private static int keyOf(long v) {
|
|||
return (int) (v >>> KEY_SHIFT);
|
||||
}
|
||||
|
||||
private static int countOf(long v) {
|
||||
return (int) v;
|
||||
private static long countOf(long v) {
|
||||
return v & MAX_COUNT;
|
||||
}
|
||||
|
||||
static class TableFullException extends Exception {
|
||||
private static final long serialVersionUID = 1L;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,10 +49,12 @@
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.eclipse.jgit.JGitText;
|
||||
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
|
||||
import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
|
||||
import org.eclipse.jgit.lib.FileMode;
|
||||
import org.eclipse.jgit.lib.NullProgressMonitor;
|
||||
import org.eclipse.jgit.lib.ProgressMonitor;
|
||||
|
@ -110,6 +112,9 @@ class SimilarityRenameDetector {
|
|||
/** Score a pair must exceed to be considered a rename. */
|
||||
private int renameScore = 60;
|
||||
|
||||
/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
|
||||
private boolean tableOverflow;
|
||||
|
||||
private List<DiffEntry> out;
|
||||
|
||||
SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
|
||||
|
@ -182,6 +187,10 @@ List<DiffEntry> getLeftOverDestinations() {
|
|||
return dsts;
|
||||
}
|
||||
|
||||
boolean isTableOverflow() {
|
||||
return tableOverflow;
|
||||
}
|
||||
|
||||
private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
|
||||
ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
|
||||
for (DiffEntry e : in) {
|
||||
|
@ -208,25 +217,22 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
|||
|
||||
long[] srcSizes = new long[srcs.size()];
|
||||
long[] dstSizes = new long[dsts.size()];
|
||||
|
||||
// Init the size arrays to some value that indicates that we haven't
|
||||
// calculated the size yet. Since sizes cannot be negative, -1 will work
|
||||
Arrays.fill(srcSizes, -1);
|
||||
Arrays.fill(dstSizes, -1);
|
||||
BitSet dstTooLarge = null;
|
||||
|
||||
// Consider each pair of files, if the score is above the minimum
|
||||
// threshold we need record that scoring in the matrix so we can
|
||||
// later find the best matches.
|
||||
//
|
||||
int mNext = 0;
|
||||
for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
|
||||
SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
|
||||
DiffEntry srcEnt = srcs.get(srcIdx);
|
||||
if (!isFile(srcEnt.oldMode)) {
|
||||
pm.update(dsts.size());
|
||||
continue;
|
||||
}
|
||||
|
||||
SimilarityIndex s = hash(OLD, srcEnt);
|
||||
SimilarityIndex s = null;
|
||||
|
||||
for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
|
||||
DiffEntry dstEnt = dsts.get(dstIdx);
|
||||
|
||||
|
@ -240,15 +246,20 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
|
||||
pm.update(1);
|
||||
continue;
|
||||
}
|
||||
|
||||
long srcSize = srcSizes[srcIdx];
|
||||
if (srcSize < 0) {
|
||||
srcSize = size(OLD, srcEnt);
|
||||
if (srcSize == 0) {
|
||||
srcSize = size(OLD, srcEnt) + 1;
|
||||
srcSizes[srcIdx] = srcSize;
|
||||
}
|
||||
|
||||
long dstSize = dstSizes[dstIdx];
|
||||
if (dstSize < 0) {
|
||||
dstSize = size(NEW, dstEnt);
|
||||
if (dstSize == 0) {
|
||||
dstSize = size(NEW, dstEnt) + 1;
|
||||
dstSizes[dstIdx] = dstSize;
|
||||
}
|
||||
|
||||
|
@ -260,7 +271,27 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
|||
continue;
|
||||
}
|
||||
|
||||
SimilarityIndex d = hash(NEW, dstEnt);
|
||||
if (s == null) {
|
||||
try {
|
||||
s = hash(OLD, srcEnt);
|
||||
} catch (TableFullException tableFull) {
|
||||
tableOverflow = true;
|
||||
continue SRC;
|
||||
}
|
||||
}
|
||||
|
||||
SimilarityIndex d;
|
||||
try {
|
||||
d = hash(NEW, dstEnt);
|
||||
} catch (TableFullException tableFull) {
|
||||
if (dstTooLarge == null)
|
||||
dstTooLarge = new BitSet(dsts.size());
|
||||
dstTooLarge.set(dstIdx);
|
||||
tableOverflow = true;
|
||||
pm.update(1);
|
||||
continue;
|
||||
}
|
||||
|
||||
int contentScore = s.score(d, 10000);
|
||||
|
||||
// nameScore returns a value between 0 and 100, but we want it
|
||||
|
@ -336,7 +367,7 @@ static int nameScore(String a, String b) {
|
|||
}
|
||||
|
||||
private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
|
||||
throws IOException {
|
||||
throws IOException, TableFullException {
|
||||
SimilarityIndex r = new SimilarityIndex();
|
||||
r.hash(reader.open(side, ent));
|
||||
r.sort();
|
||||
|
|
Loading…
Reference in New Issue