Rename detection should canonicalize line endings

Native Git canonicalizes line endings when detecting
renames; more specifically, it replaces CRLF with LF.
See: hash_chars in diffcore-delta.c

Bug: 449545
Change-Id: Iec2aab12ae9e67074cccb7fbd4d9defe176a0130
Signed-off-by: Marc Strapetz <marc.strapetz@syntevo.com>
Signed-off-by: Matthias Sohn <matthias.sohn@sap.com>
This commit is contained in:
Marc Strapetz 2014-10-30 19:36:49 +01:00 committed by Matthias Sohn
parent c053900c5b
commit 1cb5668441
2 changed files with 86 additions and 40 deletions

View File

@ -83,7 +83,7 @@ public void testIndexingLargeObject() throws IOException,
+ "B\n" //
+ "B\n").getBytes("UTF-8");
SimilarityIndex si = new SimilarityIndex();
si.hash(new ByteArrayInputStream(in), in.length);
si.hash(new ByteArrayInputStream(in), in.length, false);
assertEquals(2, si.size());
}
@ -103,6 +103,48 @@ public void testCommonScore_SameFiles() throws TableFullException {
assertEquals(100, dst.score(src, 100));
}
@Test
public void testCommonScore_SameFiles_CR_canonicalization()
		throws TableFullException {
	// The same four lines, once CRLF-terminated and once LF-terminated.
	final String crlfText = "A\r\n" + "B\r\n" + "D\r\n" + "B\r\n";
	final String lfText = crlfText.replace("\r", "");

	final SimilarityIndex crlfIndex = hash(crlfText);
	final SimilarityIndex lfIndex = hash(lfText);

	// The LF form is 8 bytes long; both directions must report all of
	// them as common and therefore score the pair as identical.
	assertEquals(8, crlfIndex.common(lfIndex));
	assertEquals(8, lfIndex.common(crlfIndex));

	assertEquals(100, crlfIndex.score(lfIndex, 100));
	assertEquals(100, lfIndex.score(crlfIndex, 100));
}
@Test
public void testCommonScoreLargeObject_SameFiles_CR_canonicalization()
		throws TableFullException, IOException {
	// The same four lines, once CRLF-terminated and once LF-terminated,
	// fed through the stream-based hash with the text flag set.
	final String crlfText = "A\r\n" + "B\r\n" + "D\r\n" + "B\r\n";
	final byte[] crlfBytes = crlfText.getBytes("UTF-8");
	final byte[] lfBytes = crlfText.replace("\r", "").getBytes("UTF-8");

	final SimilarityIndex crlfIndex = new SimilarityIndex();
	crlfIndex.hash(new ByteArrayInputStream(crlfBytes), crlfBytes.length,
			true);
	crlfIndex.sort();

	final SimilarityIndex lfIndex = new SimilarityIndex();
	lfIndex.hash(new ByteArrayInputStream(lfBytes), lfBytes.length, true);
	lfIndex.sort();

	// The LF form is 8 bytes long; both directions must report all of
	// them as common and therefore score the pair as identical.
	assertEquals(8, crlfIndex.common(lfIndex));
	assertEquals(8, lfIndex.common(crlfIndex));

	assertEquals(100, crlfIndex.score(lfIndex, 100));
	assertEquals(100, lfIndex.score(crlfIndex, 100));
}
@Test
public void testCommonScore_EmptyFiles() throws TableFullException {
SimilarityIndex src = hash("");
@ -132,24 +174,8 @@ public void testCommonScore_SimiliarBy75() throws TableFullException {
}
private static SimilarityIndex hash(String text) throws TableFullException {
SimilarityIndex src = new SimilarityIndex() {
@Override
void hash(byte[] raw, int ptr, final int end)
throws TableFullException {
while (ptr < end) {
int hash = raw[ptr] & 0xff;
int start = ptr;
do {
int c = raw[ptr++] & 0xff;
if (c == '\n')
break;
} while (ptr < end && ptr - start < 64);
add(hash, ptr - start);
}
}
};
SimilarityIndex src = new SimilarityIndex();
byte[] raw = Constants.encode(text);
src.setFileSize(raw.length);
src.hash(raw, 0, raw.length);
src.sort();
return src;

View File

@ -79,8 +79,11 @@ class SimilarityIndex {
/** Maximum value of the count field, also mask to extract the count. */
private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;
/** Total size of the file we hashed into the structure. */
private long fileSize;
/**
* Total number of bytes hashed into the structure, including \n. This is
* usually the size of the file minus the number of CRLF sequences
* encountered.
*/
private long hashedCnt;
/** Number of non-zero entries in {@link #idHash}. */
private int idSize;
@ -108,48 +111,59 @@ class SimilarityIndex {
idGrowAt = growAt(idHashBits);
}
/**
 * @return the current value of the {@code fileSize} field.
 */
long getFileSize() {
return fileSize;
}
/**
 * Store a new value in the {@code fileSize} field.
 *
 * @param size
 *            value to assign to {@code fileSize}.
 */
void setFileSize(long size) {
fileSize = size;
}
/**
 * Hash the content of an object into this index.
 * <p>
 * Objects reported as large by {@code obj.isLarge()} are read through a
 * stream; all other objects are hashed directly from the byte array
 * returned by {@code obj.getCachedBytes()}.
 *
 * @param obj
 *            loader giving access to the object content.
 * @throws MissingObjectException
 *             declared for the object access performed here.
 * @throws IOException
 *             declared for the stream access performed here.
 * @throws TableFullException
 *             propagated from the hash methods this one delegates to.
 */
void hash(ObjectLoader obj) throws MissingObjectException, IOException,
TableFullException {
if (obj.isLarge()) {
ObjectStream in = obj.openStream();
try {
setFileSize(in.getSize());
hash(in, fileSize);
} finally {
// Close the stream whether or not hashing succeeded.
in.close();
}
hashLargeObject(obj);
} else {
byte[] raw = obj.getCachedBytes();
setFileSize(raw.length);
hash(raw, 0, raw.length);
}
}
/**
 * Hash a large object by streaming it instead of loading it into memory.
 * <p>
 * The object stream is opened twice: a first pass lets
 * {@code RawText.isBinary} classify the content, and a second pass feeds
 * the content to the stream-based hash together with that classification.
 *
 * @param obj
 *            loader giving access to the object content.
 * @throws IOException
 *             declared for opening, reading and closing the streams.
 * @throws TableFullException
 *             propagated from the stream-based hash method.
 */
private void hashLargeObject(ObjectLoader obj) throws IOException,
		TableFullException {
	// Pass 1: decide whether the content should be treated as text.
	final boolean isText;
	final ObjectStream probe = obj.openStream();
	try {
		isText = !RawText.isBinary(probe);
	} finally {
		probe.close();
	}

	// Pass 2: hash the content from a fresh stream.
	final ObjectStream content = obj.openStream();
	try {
		final long size = content.getSize();
		hash(content, size, isText);
	} finally {
		content.close();
	}
}
/**
 * Hash the bytes {@code raw[ptr]} up to, but not including,
 * {@code raw[end]} into this index.
 * <p>
 * The input is consumed in blocks. A block ends right after a '\n' or once
 * 64 input bytes have been consumed, whichever occurs first. When
 * {@code RawText.isBinary(raw)} reports the array as text, a '\r' that is
 * directly followed by '\n' is skipped: it is neither mixed into the block
 * hash nor counted. {@code hashedCnt} is reset to 0 on entry and then grows
 * by the number of counted bytes of every block.
 *
 * @param raw
 *            buffer holding the content.
 * @param ptr
 *            index of the first byte to hash.
 * @param end
 *            index one past the last byte to hash.
 * @throws TableFullException
 *             declared here; presumably raised by {@code add} - confirm
 *             against its definition.
 */
void hash(byte[] raw, int ptr, final int end) throws TableFullException {
// The text/binary decision is made on the whole array, not only on [ptr, end).
final boolean text = !RawText.isBinary(raw);
hashedCnt = 0;
while (ptr < end) {
// Seed value of the per-block hash.
int hash = 5381;
int blockHashedCnt = 0;
int start = ptr;
// Hash one line, or one block, whichever occurs first.
do {
int c = raw[ptr++] & 0xff;
// Ignore CR in CRLF sequence if text
if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
continue;
blockHashedCnt++;
// '\n' is counted above but not mixed into the hash value.
if (c == '\n')
break;
// Equivalent to hash * 33 + c.
hash = (hash << 5) + hash + c;
} while (ptr < end && ptr - start < 64);
add(hash, ptr - start);
hashedCnt += blockHashedCnt;
add(hash, blockHashedCnt);
}
}
void hash(InputStream in, long remaining) throws IOException,
void hash(InputStream in, long remaining, boolean text) throws IOException,
TableFullException {
byte[] buf = new byte[4096];
int ptr = 0;
@ -157,6 +171,7 @@ void hash(InputStream in, long remaining) throws IOException,
while (0 < remaining) {
int hash = 5381;
int blockHashedCnt = 0;
// Hash one line, or one block, whichever occurs first.
int n = 0;
@ -170,11 +185,16 @@ void hash(InputStream in, long remaining) throws IOException,
n++;
int c = buf[ptr++] & 0xff;
// Ignore CR in CRLF sequence if text
if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
continue;
blockHashedCnt++;
if (c == '\n')
break;
hash = (hash << 5) + hash + c;
} while (n < 64 && n < remaining);
add(hash, n);
hashedCnt += blockHashedCnt;
add(hash, blockHashedCnt);
remaining -= n;
}
}
@ -193,7 +213,7 @@ void sort() {
}
int score(SimilarityIndex dst, int maxScore) {
long max = Math.max(fileSize, dst.fileSize);
long max = Math.max(hashedCnt, dst.hashedCnt);
if (max == 0)
return maxScore;
return (int) ((common(dst) * maxScore) / max);