Added file path similarity to scoring metric in rename detection
The scoring method did not take the similarity of file paths and file names into account. The metric is now 99% based on content (content used to be 100% of the old metric) and 1% based on path similarity. Of that 1%, half (0.5% of the final score) is based on the actual file names (e.g. "foo.java"), and half on the directory (e.g. "src/com/foo/bar/").

Change-Id: I94f0c23bf6413c491b10d5625f6ad7d2ecfb4def
This commit is contained in:
parent
4c14b7869d
commit
9a48de86d8
|
@ -124,8 +124,8 @@ public void testExactRename_ManyRenames() throws Exception {
|
|||
}
|
||||
|
||||
public void testInexactRename_OnePair() throws Exception {
|
||||
ObjectId aId = blob("foo\nbar\nbaz\n");
|
||||
ObjectId bId = blob("foo\nbar\nblah\n");
|
||||
ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
|
||||
ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
|
||||
|
||||
DiffEntry a = DiffEntry.add(PATH_A, aId);
|
||||
DiffEntry b = DiffEntry.delete(PATH_Q, bId);
|
||||
|
@ -135,12 +135,12 @@ public void testInexactRename_OnePair() throws Exception {
|
|||
|
||||
List<DiffEntry> entries = rd.compute();
|
||||
assertEquals(1, entries.size());
|
||||
assertRename(b, a, 61, entries.get(0));
|
||||
assertRename(b, a, 66, entries.get(0));
|
||||
}
|
||||
|
||||
public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception {
|
||||
ObjectId aId = blob("foo\nbar\nbaz\n");
|
||||
ObjectId bId = blob("foo\nbar\nblah\n");
|
||||
ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
|
||||
ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
|
||||
DiffEntry a = DiffEntry.add(PATH_A, aId);
|
||||
DiffEntry b = DiffEntry.delete(PATH_Q, bId);
|
||||
|
||||
|
@ -158,7 +158,7 @@ public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception {
|
|||
assertEquals(3, entries.size());
|
||||
assertSame(c, entries.get(0));
|
||||
assertSame(d, entries.get(1));
|
||||
assertRename(b, a, 61, entries.get(2));
|
||||
assertRename(b, a, 66, entries.get(2));
|
||||
}
|
||||
|
||||
public void testInexactRename_LastByteDifferent() throws Exception {
|
||||
|
|
|
@ -78,8 +78,8 @@ public void testCommonScore_SameFiles() {
|
|||
assertEquals(8, src.common(dst));
|
||||
assertEquals(8, dst.common(src));
|
||||
|
||||
assertEquals(100, src.score(dst));
|
||||
assertEquals(100, dst.score(src));
|
||||
assertEquals(100, src.score(dst, 100));
|
||||
assertEquals(100, dst.score(src, 100));
|
||||
}
|
||||
|
||||
public void testCommonScore_EmptyFiles() {
|
||||
|
@ -102,8 +102,8 @@ public void testCommonScore_SimiliarBy75() {
|
|||
assertEquals(6, src.common(dst));
|
||||
assertEquals(6, dst.common(src));
|
||||
|
||||
assertEquals(75, src.score(dst));
|
||||
assertEquals(75, dst.score(src));
|
||||
assertEquals(75, src.score(dst, 100));
|
||||
assertEquals(75, dst.score(src, 100));
|
||||
}
|
||||
|
||||
private static SimilarityIndex hash(String text) {
|
||||
|
|
|
@ -142,11 +142,11 @@ void sort() {
|
|||
Arrays.sort(idHash);
|
||||
}
|
||||
|
||||
int score(SimilarityIndex dst) {
|
||||
int score(SimilarityIndex dst, int maxScore) {
|
||||
long max = Math.max(fileSize, dst.fileSize);
|
||||
if (max == 0)
|
||||
return 100;
|
||||
return (int) ((common(dst) * 100L) / max);
|
||||
return maxScore;
|
||||
return (int) ((common(dst) * maxScore) / max);
|
||||
}
|
||||
|
||||
int common(SimilarityIndex dst) {
|
||||
|
|
|
@ -260,7 +260,14 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
|||
}
|
||||
|
||||
SimilarityIndex d = hash(dstEnt.newId.toObjectId());
|
||||
int score = s.score(d);
|
||||
int contentScore = s.score(d, 10000);
|
||||
|
||||
// nameScore returns a value between 0 and 100, but we want it
|
||||
// to be in the same range as the content score. This allows it
|
||||
// to be dropped into the pretty formula for the final score.
|
||||
int nameScore = nameScore(srcEnt.oldName, dstEnt.newName) * 100;
|
||||
|
||||
int score = (contentScore * 99 + nameScore * 1) / 10000;
|
||||
|
||||
if (score < renameScore) {
|
||||
pm.update(1);
|
||||
|
@ -280,6 +287,53 @@ private int buildMatrix(ProgressMonitor pm) throws IOException {
|
|||
return mNext;
|
||||
}
|
||||
|
||||
private int nameScore(String a, String b) {
|
||||
int aDirLen = a.lastIndexOf("/") + 1;
|
||||
int bDirLen = b.lastIndexOf("/") + 1;
|
||||
|
||||
int dirMin = Math.min(aDirLen, bDirLen);
|
||||
int dirMax = Math.max(aDirLen, bDirLen);
|
||||
|
||||
final int dirScoreLtr;
|
||||
final int dirScoreRtl;
|
||||
|
||||
if (dirMax == 0) {
|
||||
dirScoreLtr = 100;
|
||||
dirScoreRtl = 100;
|
||||
} else {
|
||||
int dirSim = 0;
|
||||
for (; dirSim < dirMin; dirSim++) {
|
||||
if (a.charAt(dirSim) != b.charAt(dirSim))
|
||||
break;
|
||||
}
|
||||
dirScoreLtr = (dirSim * 100) / dirMax;
|
||||
|
||||
if (dirScoreLtr == 100) {
|
||||
dirScoreRtl = 100;
|
||||
} else {
|
||||
for (dirSim = 0; dirSim < dirMin; dirSim++) {
|
||||
if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
|
||||
- dirSim))
|
||||
break;
|
||||
}
|
||||
dirScoreRtl = (dirSim * 100) / dirMax;
|
||||
}
|
||||
}
|
||||
|
||||
int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
|
||||
int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
|
||||
|
||||
int fileSim = 0;
|
||||
for (; fileSim < fileMin; fileSim++) {
|
||||
if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
|
||||
- fileSim))
|
||||
break;
|
||||
}
|
||||
int fileScore = (fileSim * 100) / fileMax;
|
||||
|
||||
return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
|
||||
}
|
||||
|
||||
private SimilarityIndex hash(ObjectId objectId) throws IOException {
|
||||
SimilarityIndex r = new SimilarityIndex();
|
||||
r.hash(repo.openObject(objectId));
|
||||
|
|
Loading…
Reference in New Issue