Implemented file path based tie breaking to exact rename detection

During the exact rename detection phase in RenameDetector, ties were
resolved on a first-found basis. I added support for file path based
tie breaking during that phase. Basically, there are four situations
that have to be handled:

One add matching one delete:
	In this simple case, we pair them as a rename.

One add matching many deletes:
	Find the delete whos path matches the add the closest, and
	pair them as a rename.

Many adds matching one delete:
	Similar to the above case, we find the add that matches the
	delete the closest, and pair them as a rename. The other adds
	are marked as copies of the delete.

Many adds matching many deletes:
	Build a scoring matrix similar to the one used for content-
	based matching, scoring instead by file path. Some of the
	utility functions in SimilarityRenameDetector are used in
	this case, as we use the same encoding scheme. Once the
	matrix is built, scan it for the best matches, marking them
	as renames. The rest are marked as copies.

I don't particularly like the idea of using utility functions right
out of SimilarityRenameDetector, but it works for the moment. A later
commit will likely refactor this into a common utility class, as well
as bringing exact rename detection out of RenameDetector and into a
separate class, much like SimilarityRenameDetector.

Change-Id: I1fb08390aebdcbf20d049aecf402a36506e55611
This commit is contained in:
Jeff Schumacher 2010-07-13 12:21:12 -07:00
parent f666cc755b
commit 31311cacfd
4 changed files with 258 additions and 48 deletions

View File

@ -146,6 +146,48 @@ public void testExactRename_MultipleIdenticalDeletes() throws Exception {
assertRename(a, d, 100, entries.get(2));
}
public void testExactRename_PathBreaksTie() throws Exception {
ObjectId foo = blob("foo");
DiffEntry a = DiffEntry.add("src/com/foo/a.java", foo);
DiffEntry b = DiffEntry.delete("src/com/foo/b.java", foo);
DiffEntry c = DiffEntry.add("c.txt", foo);
DiffEntry d = DiffEntry.delete("d.txt", foo);
// Add out of order to avoid first-match succeeding
rd.add(a);
rd.add(d);
rd.add(b);
rd.add(c);
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertRename(d, c, 100, entries.get(0));
assertRename(b, a, 100, entries.get(1));
}
public void testExactRename_OneDeleteManyAdds() throws Exception {
ObjectId foo = blob("foo");
DiffEntry a = DiffEntry.add("src/com/foo/a.java", foo);
DiffEntry b = DiffEntry.add("src/com/foo/b.java", foo);
DiffEntry c = DiffEntry.add("c.txt", foo);
DiffEntry d = DiffEntry.delete("d.txt", foo);
rd.add(a);
rd.add(b);
rd.add(c);
rd.add(d);
List<DiffEntry> entries = rd.compute();
assertEquals(3, entries.size());
assertRename(d, c, 100, entries.get(0));
assertCopy(d, a, 100, entries.get(1));
assertCopy(d, b, 100, entries.get(2));
}
public void testInexactRename_OnePair() throws Exception {
ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
@ -385,4 +427,20 @@ private static void assertRename(DiffEntry o, DiffEntry n, int score,
assertEquals(score, rename.getScore());
}
private static void assertCopy(DiffEntry o, DiffEntry n, int score,
DiffEntry copy) {
assertEquals(ChangeType.COPY, copy.getChangeType());
assertEquals(o.getOldName(), copy.getOldName());
assertEquals(n.getNewName(), copy.getNewName());
assertEquals(o.getOldMode(), copy.getOldMode());
assertEquals(n.getNewMode(), copy.getNewMode());
assertEquals(o.getOldId(), copy.getOldId());
assertEquals(n.getNewId(), copy.getNewId());
assertEquals(score, copy.getScore());
}
}

View File

@ -315,4 +315,31 @@ public AbbreviatedObjectId getOldId() {
public AbbreviatedObjectId getNewId() {
return newId;
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("DiffEntry[");
buf.append(changeType);
buf.append(" ");
switch (changeType) {
case ADD:
buf.append(newName);
break;
case COPY:
buf.append(oldName + "->" + newName);
break;
case DELETE:
buf.append(oldName);
break;
case MODIFY:
buf.append(oldName);
break;
case RENAME:
buf.append(oldName + "->" + newName);
break;
}
buf.append("]");
return buf.toString();
}
}

View File

@ -45,6 +45,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
@ -312,70 +313,131 @@ private void findExactRenames(ProgressMonitor pm) {
return;
pm.beginTask(JGitText.get().renamesFindingExact, //
added.size() + deleted.size());
added.size() + added.size() + deleted.size()
+ added.size() * deleted.size());
HashMap<AbbreviatedObjectId, Object> map = new HashMap<AbbreviatedObjectId, Object>();
for (DiffEntry del : deleted) {
Object old = map.put(del.oldId, del);
if (old instanceof DiffEntry) {
ArrayList<DiffEntry> list = new ArrayList<DiffEntry>(2);
list.add((DiffEntry) old);
list.add(del);
map.put(del.oldId, list);
HashMap<AbbreviatedObjectId, Object> deletedMap = populateMap(deleted, pm);
HashMap<AbbreviatedObjectId, Object> addedMap = populateMap(added, pm);
} else if (old != null) {
// Must be a list of DiffEntries
((List) old).add(del);
map.put(del.oldId, old);
}
pm.update(1);
ArrayList<DiffEntry> uniqueAdds = new ArrayList<DiffEntry>(added.size());
ArrayList<List<DiffEntry>> nonUniqueAdds = new ArrayList<List<DiffEntry>>();
for (Object o : addedMap.values()) {
if (o instanceof DiffEntry)
uniqueAdds.add((DiffEntry) o);
else
nonUniqueAdds.add((List<DiffEntry>) o);
}
ArrayList<DiffEntry> left = new ArrayList<DiffEntry>(added.size());
for (DiffEntry dst : added) {
Object del = map.get(dst.newId);
for (DiffEntry a : uniqueAdds) {
Object del = deletedMap.get(a.newId);
if (del instanceof DiffEntry) {
// We have one add to one delete: pair them if they are the same
// type
DiffEntry e = (DiffEntry) del;
if (sameType(e.oldMode, dst.newMode)) {
if (e.changeType == ChangeType.DELETE) {
e.changeType = ChangeType.RENAME;
entries.add(exactRename(e, dst));
} else {
entries.add(exactCopy(e, dst));
}
if (sameType(e.oldMode, a.newMode)) {
e.changeType = ChangeType.RENAME;
entries.add(exactRename(e, a));
} else {
left.add(dst);
left.add(a);
}
} else if (del != null) {
// We have one add to many deletes: find the delete with the
// same type and closest name to the add, then pair them
List<DiffEntry> list = (List<DiffEntry>) del;
DiffEntry best = null;
for (DiffEntry e : list) {
if (sameType(e.oldMode, dst.newMode)) {
best = e;
break;
}
}
DiffEntry best = bestPathMatch(a, list);
if (best != null) {
if (best.changeType == ChangeType.DELETE) {
best.changeType = ChangeType.RENAME;
entries.add(exactRename(best, dst));
} else {
entries.add(exactCopy(best, dst));
}
best.changeType = ChangeType.RENAME;
entries.add(exactRename(best, a));
} else {
left.add(dst);
left.add(a);
}
} else {
left.add(dst);
left.add(a);
}
pm.update(1);
}
for (List<DiffEntry> adds : nonUniqueAdds) {
Object o = deletedMap.get(adds.get(0).newId);
if (o instanceof DiffEntry) {
// We have many adds to one delete: find the add with the same
// type and closest name to the delete, then pair them. Mark the
// rest as copies of the delete.
DiffEntry d = (DiffEntry) o;
DiffEntry best = bestPathMatch(d, adds);
if (best != null) {
d.changeType = ChangeType.RENAME;
entries.add(exactRename(d, best));
for (DiffEntry a : adds) {
if (a != best) {
if (sameType(d.oldMode, a.newMode)) {
entries.add(exactCopy(d, a));
} else {
left.add(a);
}
}
}
} else {
left.addAll(adds);
}
} else {
// We have many adds to many deletes: score all the adds against
// all the deletes by path name, take the best matches, pair
// them as renames, then call the rest copies
List<DiffEntry> dels = (List<DiffEntry>) o;
long[] matrix = new long[dels.size() * adds.size()];
int mNext = 0;
for (int addIdx = 0; addIdx < adds.size(); addIdx++) {
String addedName = adds.get(addIdx).newName;
for (int delIdx = 0; delIdx < dels.size(); delIdx++) {
String deletedName = dels.get(delIdx).oldName;
int score = SimilarityRenameDetector.nameScore(addedName, deletedName);
matrix[mNext] = SimilarityRenameDetector.encode(score, addIdx, delIdx);
mNext++;
}
}
Arrays.sort(matrix);
for (--mNext; mNext >= 0; mNext--) {
long ent = matrix[mNext];
int delIdx = SimilarityRenameDetector.srcFile(ent);
int addIdx = SimilarityRenameDetector.dstFile(ent);
DiffEntry d = dels.get(delIdx);
DiffEntry a = adds.get(addIdx);
if (a == null) {
pm.update(1);
continue; // was already matched earlier
}
ChangeType type;
if (d.changeType == ChangeType.DELETE) {
// First use of this source file. Tag it as a rename so we
// later know it is already been used as a rename, other
// matches (if any) will claim themselves as copies instead.
//
d.changeType = ChangeType.RENAME;
type = ChangeType.RENAME;
} else {
type = ChangeType.COPY;
}
entries.add(DiffEntry.pair(type, d, a, 100));
adds.set(addIdx, null); // Claim the destination was matched.
pm.update(1);
}
}
}
added = left;
deleted = new ArrayList<DiffEntry>(map.size());
for (Object o : map.values()) {
deleted = new ArrayList<DiffEntry>(deletedMap.size());
for (Object o : deletedMap.values()) {
if (o instanceof DiffEntry) {
DiffEntry e = (DiffEntry) o;
if (e.changeType == ChangeType.DELETE)
@ -391,6 +453,69 @@ private void findExactRenames(ProgressMonitor pm) {
pm.endTask();
}
/**
* Find the best match by file path for a given DiffEntry from a list of
* DiffEntrys. The returned DiffEntry will be of the same type as <src>. If
* no DiffEntry can be found that has the same type, this method will return
* null.
*
* @param src
* the DiffEntry to try to find a match for
* @param list
* a list of DiffEntrys to search through
* @return the DiffEntry from <list> who's file path best matches <src>
*/
private static DiffEntry bestPathMatch(DiffEntry src, List<DiffEntry> list) {
DiffEntry best = null;
int score = -1;
for (DiffEntry d : list) {
if (sameType(mode(d), mode(src))) {
int tmp = SimilarityRenameDetector
.nameScore(path(d), path(src));
if (tmp > score) {
best = d;
score = tmp;
}
}
}
return best;
}
@SuppressWarnings("unchecked")
private HashMap<AbbreviatedObjectId, Object> populateMap(
List<DiffEntry> diffEntries, ProgressMonitor pm) {
HashMap<AbbreviatedObjectId, Object> map = new HashMap<AbbreviatedObjectId, Object>();
for (DiffEntry de : diffEntries) {
Object old = map.put(id(de), de);
if (old instanceof DiffEntry) {
ArrayList<DiffEntry> list = new ArrayList<DiffEntry>(2);
list.add((DiffEntry) old);
list.add(de);
map.put(id(de), list);
} else if (old != null) {
// Must be a list of DiffEntries
((List<DiffEntry>) old).add(de);
map.put(id(de), old);
}
pm.update(1);
}
return map;
}
private static String path(DiffEntry de) {
return de.changeType == ChangeType.DELETE ? de.oldName : de.newName;
}
private static FileMode mode(DiffEntry de) {
return de.changeType == ChangeType.DELETE ? de.oldMode : de.newMode;
}
private static AbbreviatedObjectId id(DiffEntry de) {
return de.changeType == ChangeType.DELETE ? de.oldId : de.newId;
}
static boolean sameType(FileMode a, FileMode b) {
// Files have to be of the same type in order to rename them.
// We would never want to rename a file to a gitlink, or a

View File

@ -287,7 +287,7 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
return mNext;
}
private int nameScore(String a, String b) {
static int nameScore(String a, String b) {
int aDirLen = a.lastIndexOf("/") + 1;
int bDirLen = b.lastIndexOf("/") + 1;
@ -349,15 +349,15 @@ private static int score(long value) {
return (int) (value >>> SCORE_SHIFT);
}
private static int srcFile(long value) {
static int srcFile(long value) {
return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
}
private static int dstFile(long value) {
static int dstFile(long value) {
return decodeFile(((int) value) & INDEX_MASK);
}
private static long encode(int score, int srcIdx, int dstIdx) {
static long encode(int score, int srcIdx, int dstIdx) {
return (((long) score) << SCORE_SHIFT) //
| (encodeFile(srcIdx) << BITS_PER_INDEX) //
| encodeFile(dstIdx);