diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java index 95423609a..4724677bb 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java @@ -83,7 +83,7 @@ public void testIndexingLargeObject() throws IOException, + "B\n" // + "B\n").getBytes("UTF-8"); SimilarityIndex si = new SimilarityIndex(); - si.hash(new ByteArrayInputStream(in), in.length); + si.hash(new ByteArrayInputStream(in), in.length, false); assertEquals(2, si.size()); } @@ -103,6 +103,48 @@ public void testCommonScore_SameFiles() throws TableFullException { assertEquals(100, dst.score(src, 100)); } + @Test + public void testCommonScore_SameFiles_CR_canonicalization() + throws TableFullException { + String text = "" // + + "A\r\n" // + + "B\r\n" // + + "D\r\n" // + + "B\r\n"; + SimilarityIndex src = hash(text); + SimilarityIndex dst = hash(text.replace("\r", "")); + assertEquals(8, src.common(dst)); + assertEquals(8, dst.common(src)); + + assertEquals(100, src.score(dst, 100)); + assertEquals(100, dst.score(src, 100)); + } + + @Test + public void testCommonScoreLargeObject_SameFiles_CR_canonicalization() + throws TableFullException, IOException { + String text = "" // + + "A\r\n" // + + "B\r\n" // + + "D\r\n" // + + "B\r\n"; + SimilarityIndex src = new SimilarityIndex(); + byte[] bytes1 = text.getBytes("UTF-8"); + src.hash(new ByteArrayInputStream(bytes1), bytes1.length, true); + src.sort(); + + SimilarityIndex dst = new SimilarityIndex(); + byte[] bytes2 = text.replace("\r", "").getBytes("UTF-8"); + dst.hash(new ByteArrayInputStream(bytes2), bytes2.length, true); + dst.sort(); + + assertEquals(8, src.common(dst)); + assertEquals(8, dst.common(src)); + + assertEquals(100, src.score(dst, 100)); + assertEquals(100, dst.score(src, 100)); + } + @Test public void testCommonScore_EmptyFiles() throws TableFullException { SimilarityIndex src = hash(""); @@ -132,24 +174,8 @@ public void testCommonScore_SimiliarBy75() throws TableFullException { } private static SimilarityIndex hash(String text) throws TableFullException { - SimilarityIndex src = new SimilarityIndex() { - @Override - void hash(byte[] raw, int ptr, final int end) - throws TableFullException { - while (ptr < end) { - int hash = raw[ptr] & 0xff; - int start = ptr; - do { - int c = raw[ptr++] & 0xff; - if (c == '\n') - break; - } while (ptr < end && ptr - start < 64); - add(hash, ptr - start); - } - } - }; + SimilarityIndex src = new SimilarityIndex(); byte[] raw = Constants.encode(text); - src.setFileSize(raw.length); src.hash(raw, 0, raw.length); src.sort(); return src; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java index 17ccb9726..f376b8e36 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java @@ -79,8 +79,11 @@ class SimilarityIndex { /** Maximum value of the count field, also mask to extract the count. */ private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1; - /** Total size of the file we hashed into the structure. */ - private long fileSize; + /** + * Total amount of bytes hashed into the structure, including \n. This is + * usually the size of the file minus number of CRLF encounters. + */ + private long hashedCnt; /** Number of non-zero entries in {@link #idHash}. */ private int idSize; @@ -108,48 +111,59 @@ class SimilarityIndex { idGrowAt = growAt(idHashBits); } - long getFileSize() { - return fileSize; - } - - void setFileSize(long size) { - fileSize = size; - } - void hash(ObjectLoader obj) throws MissingObjectException, IOException, TableFullException { if (obj.isLarge()) { - ObjectStream in = obj.openStream(); - try { - setFileSize(in.getSize()); - hash(in, fileSize); - } finally { - in.close(); - } + hashLargeObject(obj); } else { byte[] raw = obj.getCachedBytes(); - setFileSize(raw.length); hash(raw, 0, raw.length); } } + private void hashLargeObject(ObjectLoader obj) throws IOException, + TableFullException { + ObjectStream in1 = obj.openStream(); + boolean text; + try { + text = !RawText.isBinary(in1); + } finally { + in1.close(); + } + + ObjectStream in2 = obj.openStream(); + try { + hash(in2, in2.getSize(), text); + } finally { + in2.close(); + } + } + void hash(byte[] raw, int ptr, final int end) throws TableFullException { + final boolean text = !RawText.isBinary(raw); + hashedCnt = 0; while (ptr < end) { int hash = 5381; + int blockHashedCnt = 0; int start = ptr; // Hash one line, or one block, whichever occurs first. do { int c = raw[ptr++] & 0xff; + // Ignore CR in CRLF sequence if text + if (text && c == '\r' && ptr < end && raw[ptr] == '\n') + continue; + blockHashedCnt++; if (c == '\n') break; hash = (hash << 5) + hash + c; } while (ptr < end && ptr - start < 64); - add(hash, ptr - start); + hashedCnt += blockHashedCnt; + add(hash, blockHashedCnt); } } - void hash(InputStream in, long remaining) throws IOException, + void hash(InputStream in, long remaining, boolean text) throws IOException, TableFullException { byte[] buf = new byte[4096]; int ptr = 0; @@ -157,6 +171,7 @@ void hash(InputStream in, long remaining) throws IOException, while (0 < remaining) { int hash = 5381; + int blockHashedCnt = 0; // Hash one line, or one block, whichever occurs first. int n = 0; @@ -170,11 +185,16 @@ void hash(InputStream in, long remaining) throws IOException, n++; int c = buf[ptr++] & 0xff; + // Ignore CR in CRLF sequence if text + if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n') + continue; + blockHashedCnt++; if (c == '\n') break; hash = (hash << 5) + hash + c; } while (n < 64 && n < remaining); - add(hash, n); + hashedCnt += blockHashedCnt; + add(hash, blockHashedCnt); remaining -= n; } } @@ -193,7 +213,7 @@ void sort() { } int score(SimilarityIndex dst, int maxScore) { - long max = Math.max(fileSize, dst.fileSize); + long max = Math.max(hashedCnt, dst.hashedCnt); if (max == 0) return maxScore; return (int) ((common(dst) * maxScore) / max);