Micro-optimize reduceCommonStartEnd for RawText

This is a faster exact match based form that tries to improve performance for the common case of the header and trailer of a text file not changing at all. After this fast path we use the slower path based on the super class' implementation, which uses equals(), to allow for whitespace ignore modes to still work. Some simple performance testing showed a major improvement over the older implementation for a common edit we see in JGit. The test compared blob 29a89bc and 372a978, which is the ObjectDirectory.java file difference in commit 41dd9ed1c0. The two text files are approximately 22 KiB in size. DEFAULT old: 203900 ns; DEFAULT new: 100400 ns. This new version is 2x faster for the DEFAULT comparator, which does not treat space specially. This is because we can now examine a larger swath of text with fewer instructions per byte compared. The older algorithm had to stop at each line break and recompute how to examine the next line, while the new algorithm only stops when the first difference is found. WS_IGNORE_ALL old: 298500 ns; WS_IGNORE_ALL new: 63300 ns. It's 4.7x faster for the whitespace ignore comparator, as the common header and footer do not have a whitespace difference. Avoiding the special case handling for whitespace on each byte considered saves a lot of time. Since most edits to source code (and other text-like files) appear in the interior of the file, fast elimination of common header/footer means faster diff throughput. In the less common case of an actual header or footer edit, the common header/footer elimination is stopped rather quickly either way, so there is very little downside to the optimization applied here. Change-Id: I1d501b4c3ff80ed086b20bf12faf51ae62167db7 Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
2010-09-10 22:14:57 -07:00 · 2010-09-10 22:14:57 -07:00 · e0970cd1b4
parent 590a9f94a1
commit e0970cd1b4
2 changed files with 109 additions and 0 deletions
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RawTextTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RawTextTest.java
@ -46,6 +46,7 @@

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;

 import junit.framework.TestCase;

@ -98,4 +99,51 @@ public void testWriteLine3() throws IOException {
 		final byte[] r = o.toByteArray();
 		assertEquals("", RawParseUtils.decode(r));
 	}
+
+	/**
+	 * Exercise {@code reduceCommonStartEnd} on small texts built from one
+	 * character per line, then once more with a whitespace ignoring
+	 * comparator.
+	 *
+	 * @throws UnsupportedEncodingException
+	 *             the UTF-8 encoding is not available.
+	 */
+	public void testComparatorReduceCommonStartEnd()
+			throws UnsupportedEncodingException {
+		final RawTextComparator cmp = RawTextComparator.DEFAULT;
+
+		// Both sides empty, nothing to trim.
+		assertEquals(new Edit(0, 0, 0, 0), //
+				cmp.reduceCommonStartEnd(t(""), t(""), new Edit(0, 0, 0, 0)));
+
+		// A single differing line stays in the edit.
+		assertEquals(new Edit(0, 1, 0, 1), //
+				cmp.reduceCommonStartEnd(t("a"), t("b"), new Edit(0, 1, 0, 1)));
+
+		// A single identical line is removed completely.
+		assertEquals(new Edit(1, 1, 1, 1), //
+				cmp.reduceCommonStartEnd(t("a"), t("a"), new Edit(0, 1, 0, 1)));
+
+		// Common header only.
+		assertEquals(new Edit(2, 3, 2, 3), //
+				cmp.reduceCommonStartEnd(t("axB"), t("axC"), new Edit(0, 3, 0, 3)));
+
+		// Common trailer only.
+		assertEquals(new Edit(0, 1, 0, 1), //
+				cmp.reduceCommonStartEnd(t("Bxy"), t("Cxy"), new Edit(0, 3, 0, 3)));
+
+		// All of A is the trailer of B.
+		assertEquals(new Edit(0, 0, 0, 1), //
+				cmp.reduceCommonStartEnd(t("bc"), t("Abc"), new Edit(0, 2, 0, 3)));
+
+		// Common header and trailer around one changed line.
+		final Edit both = new Edit(0, 5, 0, 5);
+		assertEquals(new Edit(2, 3, 2, 3), //
+				cmp.reduceCommonStartEnd(t("abQxy"), t("abRxy"), both));
+
+		// Lines differing only in whitespace still count as common when
+		// the comparator ignores whitespace.
+		final RawText left = new RawText("p\na b\nQ\nc d\n".getBytes("UTF-8"));
+		final RawText right = new RawText("p\na  b \nR\n c  d \n".getBytes("UTF-8"));
+		final Edit region = new Edit(0, 4, 0, 4);
+		assertEquals(new Edit(2, 3, 2, 3), //
+				RawTextComparator.WS_IGNORE_ALL.reduceCommonStartEnd(left, right, region));
+	}
+
+	/**
+	 * Build a text holding one line per character of the input.
+	 *
+	 * @param text
+	 *            characters to expand; each one is followed by an LF.
+	 * @return the UTF-8 encoded text wrapped in a RawText.
+	 */
+	private static RawText t(String text) {
+		final StringBuilder lines = new StringBuilder();
+		for (final char ch : text.toCharArray())
+			lines.append(ch).append('\n');
+
+		final byte[] raw;
+		try {
+			raw = lines.toString().getBytes("UTF-8");
+		} catch (UnsupportedEncodingException e) {
+			throw new RuntimeException(e);
+		}
+		return new RawText(raw);
+	}
 }
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java
@ -48,6 +48,8 @@
 import static org.eclipse.jgit.util.RawCharUtil.trimLeadingWhitespace;
 import static org.eclipse.jgit.util.RawCharUtil.trimTrailingWhitespace;

+import org.eclipse.jgit.util.IntList;
+
 /** Equivalence function for {@link RawText}. */
 public abstract class RawTextComparator extends SequenceComparator<RawText> {
 	/** No special treatment. */
@ -275,6 +277,65 @@ public int hash(RawText seq, int ptr) {
 		return seq.hashes[ptr + 1];
 	}

+	/**
+	 * Shrink an edit by dropping lines common to the start and end of both
+	 * texts.
+	 * <p>
+	 * The raw bytes are first compared directly, which quickly removes an
+	 * unchanged header and trailer. The remaining region is then handed to
+	 * the super class' implementation so whitespace ignoring comparators can
+	 * remove further lines using {@code equals()}.
+	 *
+	 * @param a
+	 *            the first text.
+	 * @param b
+	 *            the second text.
+	 * @param e
+	 *            region to reduce; its begin and end fields are updated in
+	 *            place.
+	 * @return the edit returned by the super class' implementation for the
+	 *         reduced region. If either side of {@code e} is empty on
+	 *         entry, {@code e} is returned unmodified.
+	 */
+	@Override
+	public Edit reduceCommonStartEnd(RawText a, RawText b, Edit e) {
+		// This is a faster exact match based form that tries to improve
+		// performance for the common case of the header and trailer of
+		// a text file not changing at all. After this fast path we use
+		// the slower path based on the super class' using equals() to
+		// allow for whitespace ignore modes to still work.
+
+		if (e.beginA == e.endA || e.beginB == e.endB)
+			return e;
+
+		byte[] aRaw = a.content;
+		byte[] bRaw = b.content;
+
+		// Each pointer must come from the line table of its own text;
+		// reading B's start from A's table yields a wrong offset (or an
+		// out of range index) whenever beginA and beginB differ.
+		int aPtr = a.lines.get(e.beginA + 1);
+		int bPtr = b.lines.get(e.beginB + 1);
+
+		int aEnd = a.lines.get(e.endA + 1);
+		int bEnd = b.lines.get(e.endB + 1);
+
+		// This can never happen, but the JIT doesn't know that. If we
+		// define this assertion before the tight while loops below it
+		// should be able to skip the array bound checks on access.
+		//
+		if (aPtr < 0 || bPtr < 0 || aEnd > aRaw.length || bEnd > bRaw.length)
+			throw new ArrayIndexOutOfBoundsException();
+
+		// Skip the bytes common to the start of both regions.
+		while (aPtr < aEnd && bPtr < bEnd && aRaw[aPtr] == bRaw[bPtr]) {
+			aPtr++;
+			bPtr++;
+		}
+
+		// Skip the bytes common to the end, never crossing the prefix.
+		while (aPtr < aEnd && bPtr < bEnd && aRaw[aEnd - 1] == bRaw[bEnd - 1]) {
+			aEnd--;
+			bEnd--;
+		}
+
+		e.beginA = findForwardLine(a.lines, e.beginA, aPtr);
+		e.beginB = findForwardLine(b.lines, e.beginB, bPtr);
+
+		e.endA = findReverseLine(a.lines, e.endA, aEnd);
+
+		// The common suffix was measured in bytes, so it may stop in the
+		// middle of a line on one side while sitting on a line boundary
+		// on the other. Without correction the two sides would drop a
+		// different number of trailing lines. If A stopped inside a line,
+		// give back to B the bytes matching the rest of that line so the
+		// corresponding line of B stays in the edit too.
+		final boolean partialA = aEnd < a.lines.get(e.endA + 1);
+		if (partialA)
+			bEnd += a.lines.get(e.endA + 1) - aEnd;
+
+		e.endB = findReverseLine(b.lines, e.endB, bEnd);
+
+		// Conversely, if only B stopped inside a line then the last line
+		// dropped from A has no complete partner in B; put it back.
+		if (!partialA && bEnd < b.lines.get(e.endB + 1))
+			e.endA++;
+
+		return super.reduceCommonStartEnd(a, b, e);
+	}
+
+	/**
+	 * Advance a line index past the lines that end strictly before a byte
+	 * position.
+	 * <p>
+	 * A line whose end is exactly at {@code ptr} is not skipped. The byte
+	 * scan may have stopped there only because one text ran out (for
+	 * example a final line without a trailing LF), in which case the line
+	 * in the other text can be longer. Such a line is left for the
+	 * {@code equals()} based comparison to decide.
+	 *
+	 * @param lines
+	 *            line table of the text; as used by the caller, entry
+	 *            {@code idx + 1} is the starting offset of line {@code idx}.
+	 * @param idx
+	 *            line index to start from.
+	 * @param ptr
+	 *            byte offset where the common prefix scan stopped.
+	 * @return index of the first line that does not end before {@code ptr}.
+	 */
+	private static int findForwardLine(IntList lines, int idx, int ptr) {
+		final int end = lines.size() - 2;
+		// lines.get(idx + 2) is where line idx ends (the next line starts).
+		while (idx < end && lines.get(idx + 2) < ptr)
+			idx++;
+		return idx;
+	}
+
+	/**
+	 * Step a line index backwards while the table entry at that index is at
+	 * or after a byte position.
+	 *
+	 * @param lines
+	 *            line table of the text.
+	 * @param idx
+	 *            line index to start from.
+	 * @param ptr
+	 *            byte offset where the common suffix scan stopped.
+	 * @return the largest index not above {@code idx} whose table entry is
+	 *         below {@code ptr}, or 0 if there is none.
+	 */
+	private static int findReverseLine(IntList lines, int idx, int ptr) {
+		int line = idx;
+		while (line > 0) {
+			if (lines.get(line) < ptr)
+				break;
+			line--;
+		}
+		return line;
+	}
+
 	/**
 	 * Compute a hash code for a region.
 	 *