Rename detection should canonicalize line endings
Native Git canonicalizes line endings when detecting renames; more specifically, it replaces CRLF with LF. See hash_chars in diffcore-delta.c.

Bug: 449545
Change-Id: Iec2aab12ae9e67074cccb7fbd4d9defe176a0130
Signed-off-by: Marc Strapetz <marc.strapetz@syntevo.com>
Signed-off-by: Matthias Sohn <matthias.sohn@sap.com>
This commit is contained in:
parent
c053900c5b
commit
1cb5668441
|
@ -83,7 +83,7 @@ public void testIndexingLargeObject() throws IOException,
|
|||
+ "B\n" //
|
||||
+ "B\n").getBytes("UTF-8");
|
||||
SimilarityIndex si = new SimilarityIndex();
|
||||
si.hash(new ByteArrayInputStream(in), in.length);
|
||||
si.hash(new ByteArrayInputStream(in), in.length, false);
|
||||
assertEquals(2, si.size());
|
||||
}
|
||||
|
||||
|
@ -103,6 +103,48 @@ public void testCommonScore_SameFiles() throws TableFullException {
|
|||
assertEquals(100, dst.score(src, 100));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommonScore_SameFiles_CR_canonicalization()
|
||||
throws TableFullException {
|
||||
String text = "" //
|
||||
+ "A\r\n" //
|
||||
+ "B\r\n" //
|
||||
+ "D\r\n" //
|
||||
+ "B\r\n";
|
||||
SimilarityIndex src = hash(text);
|
||||
SimilarityIndex dst = hash(text.replace("\r", ""));
|
||||
assertEquals(8, src.common(dst));
|
||||
assertEquals(8, dst.common(src));
|
||||
|
||||
assertEquals(100, src.score(dst, 100));
|
||||
assertEquals(100, dst.score(src, 100));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommonScoreLargeObject_SameFiles_CR_canonicalization()
|
||||
throws TableFullException, IOException {
|
||||
String text = "" //
|
||||
+ "A\r\n" //
|
||||
+ "B\r\n" //
|
||||
+ "D\r\n" //
|
||||
+ "B\r\n";
|
||||
SimilarityIndex src = new SimilarityIndex();
|
||||
byte[] bytes1 = text.getBytes("UTF-8");
|
||||
src.hash(new ByteArrayInputStream(bytes1), bytes1.length, true);
|
||||
src.sort();
|
||||
|
||||
SimilarityIndex dst = new SimilarityIndex();
|
||||
byte[] bytes2 = text.replace("\r", "").getBytes("UTF-8");
|
||||
dst.hash(new ByteArrayInputStream(bytes2), bytes2.length, true);
|
||||
dst.sort();
|
||||
|
||||
assertEquals(8, src.common(dst));
|
||||
assertEquals(8, dst.common(src));
|
||||
|
||||
assertEquals(100, src.score(dst, 100));
|
||||
assertEquals(100, dst.score(src, 100));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommonScore_EmptyFiles() throws TableFullException {
|
||||
SimilarityIndex src = hash("");
|
||||
|
@ -132,24 +174,8 @@ public void testCommonScore_SimiliarBy75() throws TableFullException {
|
|||
}
|
||||
|
||||
private static SimilarityIndex hash(String text) throws TableFullException {
|
||||
SimilarityIndex src = new SimilarityIndex() {
|
||||
@Override
|
||||
void hash(byte[] raw, int ptr, final int end)
|
||||
throws TableFullException {
|
||||
while (ptr < end) {
|
||||
int hash = raw[ptr] & 0xff;
|
||||
int start = ptr;
|
||||
do {
|
||||
int c = raw[ptr++] & 0xff;
|
||||
if (c == '\n')
|
||||
break;
|
||||
} while (ptr < end && ptr - start < 64);
|
||||
add(hash, ptr - start);
|
||||
}
|
||||
}
|
||||
};
|
||||
SimilarityIndex src = new SimilarityIndex();
|
||||
byte[] raw = Constants.encode(text);
|
||||
src.setFileSize(raw.length);
|
||||
src.hash(raw, 0, raw.length);
|
||||
src.sort();
|
||||
return src;
|
||||
|
|
|
@ -79,8 +79,11 @@ class SimilarityIndex {
|
|||
/** Maximum value of the count field, also mask to extract the count. */
|
||||
private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
|
||||
|
||||
/** Total size of the file we hashed into the structure. */
|
||||
private long fileSize;
|
||||
/**
|
||||
* Total number of bytes hashed into the structure, including \n. This is
|
||||
* usually the size of the file minus the number of CRLF sequences encountered.
|
||||
*/
|
||||
private long hashedCnt;
|
||||
|
||||
/** Number of non-zero entries in {@link #idHash}. */
|
||||
private int idSize;
|
||||
|
@ -108,48 +111,59 @@ class SimilarityIndex {
|
|||
idGrowAt = growAt(idHashBits);
|
||||
}
|
||||
|
||||
long getFileSize() {
|
||||
return fileSize;
|
||||
}
|
||||
|
||||
void setFileSize(long size) {
|
||||
fileSize = size;
|
||||
}
|
||||
|
||||
void hash(ObjectLoader obj) throws MissingObjectException, IOException,
|
||||
TableFullException {
|
||||
if (obj.isLarge()) {
|
||||
ObjectStream in = obj.openStream();
|
||||
try {
|
||||
setFileSize(in.getSize());
|
||||
hash(in, fileSize);
|
||||
} finally {
|
||||
in.close();
|
||||
}
|
||||
hashLargeObject(obj);
|
||||
} else {
|
||||
byte[] raw = obj.getCachedBytes();
|
||||
setFileSize(raw.length);
|
||||
hash(raw, 0, raw.length);
|
||||
}
|
||||
}
|
||||
|
||||
private void hashLargeObject(ObjectLoader obj) throws IOException,
|
||||
TableFullException {
|
||||
ObjectStream in1 = obj.openStream();
|
||||
boolean text;
|
||||
try {
|
||||
text = !RawText.isBinary(in1);
|
||||
} finally {
|
||||
in1.close();
|
||||
}
|
||||
|
||||
ObjectStream in2 = obj.openStream();
|
||||
try {
|
||||
hash(in2, in2.getSize(), text);
|
||||
} finally {
|
||||
in2.close();
|
||||
}
|
||||
}
|
||||
|
||||
void hash(byte[] raw, int ptr, final int end) throws TableFullException {
|
||||
final boolean text = !RawText.isBinary(raw);
|
||||
hashedCnt = 0;
|
||||
while (ptr < end) {
|
||||
int hash = 5381;
|
||||
int blockHashedCnt = 0;
|
||||
int start = ptr;
|
||||
|
||||
// Hash one line, or one block, whichever occurs first.
|
||||
do {
|
||||
int c = raw[ptr++] & 0xff;
|
||||
// Ignore CR in CRLF sequence if text
|
||||
if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
|
||||
continue;
|
||||
blockHashedCnt++;
|
||||
if (c == '\n')
|
||||
break;
|
||||
hash = (hash << 5) + hash + c;
|
||||
} while (ptr < end && ptr - start < 64);
|
||||
add(hash, ptr - start);
|
||||
hashedCnt += blockHashedCnt;
|
||||
add(hash, blockHashedCnt);
|
||||
}
|
||||
}
|
||||
|
||||
void hash(InputStream in, long remaining) throws IOException,
|
||||
void hash(InputStream in, long remaining, boolean text) throws IOException,
|
||||
TableFullException {
|
||||
byte[] buf = new byte[4096];
|
||||
int ptr = 0;
|
||||
|
@ -157,6 +171,7 @@ void hash(InputStream in, long remaining) throws IOException,
|
|||
|
||||
while (0 < remaining) {
|
||||
int hash = 5381;
|
||||
int blockHashedCnt = 0;
|
||||
|
||||
// Hash one line, or one block, whichever occurs first.
|
||||
int n = 0;
|
||||
|
@ -170,11 +185,16 @@ void hash(InputStream in, long remaining) throws IOException,
|
|||
|
||||
n++;
|
||||
int c = buf[ptr++] & 0xff;
|
||||
// Ignore CR in CRLF sequence if text
|
||||
if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
|
||||
continue;
|
||||
blockHashedCnt++;
|
||||
if (c == '\n')
|
||||
break;
|
||||
hash = (hash << 5) + hash + c;
|
||||
} while (n < 64 && n < remaining);
|
||||
add(hash, n);
|
||||
hashedCnt += blockHashedCnt;
|
||||
add(hash, blockHashedCnt);
|
||||
remaining -= n;
|
||||
}
|
||||
}
|
||||
|
@ -193,7 +213,7 @@ void sort() {
|
|||
}
|
||||
|
||||
int score(SimilarityIndex dst, int maxScore) {
|
||||
long max = Math.max(fileSize, dst.fileSize);
|
||||
long max = Math.max(hashedCnt, dst.hashedCnt);
|
||||
if (max == 0)
|
||||
return maxScore;
|
||||
return (int) ((common(dst) * maxScore) / max);
|
||||
|
|
Loading…
Reference in New Issue