From baaddd51f1ffcda73640579af47983da8f7ce5f9 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 10 Sep 2010 17:48:24 -0700 Subject: [PATCH] Move cached element hash codes to HashedSequence Most diff implementations really want to use cached hash codes for elements, rather than element equality, as they need to perform many compares and unique hash codes for elements can really speed that process up. To make it easier to define element hash functions, move the caching of hash codes into a wrapper sequence type, so that individual sequence types like RawText don't need to do this themselves. This has a nice property of also allowing the sequence to no longer care about the specific SequenceComparator that is going to be used, and permits the caching to only examine the middle region that isn't common to the two inputs. Change-Id: If8623556da9419117b07c5073e8bce39de02570e Signed-off-by: Shawn O. Pearce --- .../org/eclipse/jgit/diff/DiffFormatter.java | 4 +- .../org/eclipse/jgit/diff/HashedSequence.java | 72 ++++++++++++ .../jgit/diff/HashedSequenceComparator.java | 77 +++++++++++++ .../eclipse/jgit/diff/HashedSequencePair.java | 108 ++++++++++++++++++ .../src/org/eclipse/jgit/diff/MyersDiff.java | 47 ++++++-- .../src/org/eclipse/jgit/diff/RawText.java | 14 --- .../eclipse/jgit/diff/RawTextComparator.java | 33 ++---- 7 files changed, 306 insertions(+), 49 deletions(-) create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequence.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequenceComparator.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequencePair.java diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java index ec1ced62a..1aa76bc43 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java @@ -882,8 +882,8 @@ private FormatResult createFormatResult(DiffEntry ent) throws IOException, type = PatchType.BINARY; } else { - res.a = new RawText(comparator, aRaw); - res.b = new RawText(comparator, bRaw); + res.a = new RawText(aRaw); + res.b = new RawText(bRaw); editList = diff(res.a, res.b); type = PatchType.UNIFIED; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequence.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequence.java new file mode 100644 index 000000000..2a26700c1 --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequence.java @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2010, Google Inc. + * and other copyright owners as documented in the project's IP log. + * + * This program and the accompanying materials are made available + * under the terms of the Eclipse Distribution License v1.0 which + * accompanies this distribution, is reproduced below, and is + * available at http://www.eclipse.org/org/documents/edl-v10.php + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * - Neither the name of the Eclipse Foundation, Inc. nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.eclipse.jgit.diff; + +/** + * Wraps a {@link Sequence} to assign hash codes to elements. + * + * This sequence acts as a proxy for the real sequence, caching element hash + * codes so they don't need to be recomputed each time. Sequences of this type + * must be used with a {@link HashedSequenceComparator}. + * + * To construct an instance of this type use {@link HashedSequencePair}. + * + * @param + * the base sequence type. + */ +public final class HashedSequence extends Sequence { + final S base; + + final int[] hashes; + + HashedSequence(S base, int[] hashes) { + this.base = base; + this.hashes = hashes; + } + + @Override + public int size() { + return base.size(); + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequenceComparator.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequenceComparator.java new file mode 100644 index 000000000..2aa84e49e --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequenceComparator.java @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2010, Google Inc. + * and other copyright owners as documented in the project's IP log. + * + * This program and the accompanying materials are made available + * under the terms of the Eclipse Distribution License v1.0 which + * accompanies this distribution, is reproduced below, and is + * available at http://www.eclipse.org/org/documents/edl-v10.php + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * - Neither the name of the Eclipse Foundation, Inc. nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.eclipse.jgit.diff; + +/** + * Wrap another comparator for use with {@link HashedSequence}. + * + * This comparator acts as a proxy for the real comparator, evaluating the + * cached hash code before testing the underlying comparator's equality. + * Comparators of this type must be used with a {@link HashedSequence}. + * + * To construct an instance of this type use {@link HashedSequencePair}. + * + * @param + * the base sequence type. + */ +public final class HashedSequenceComparator extends + SequenceComparator> { + private final SequenceComparator cmp; + + HashedSequenceComparator(SequenceComparator cmp) { + this.cmp = cmp; + } + + @Override + public boolean equals(HashedSequence a, int ai, // + HashedSequence b, int bi) { + return a.hashes[ai] == b.hashes[bi] + && cmp.equals(a.base, ai, b.base, bi); + } + + @Override + public int hash(HashedSequence seq, int ptr) { + return seq.hashes[ptr]; + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequencePair.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequencePair.java new file mode 100644 index 000000000..b72202ffc --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/HashedSequencePair.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2010, Google Inc. + * and other copyright owners as documented in the project's IP log. + * + * This program and the accompanying materials are made available + * under the terms of the Eclipse Distribution License v1.0 which + * accompanies this distribution, is reproduced below, and is + * available at http://www.eclipse.org/org/documents/edl-v10.php + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * - Neither the name of the Eclipse Foundation, Inc. nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.eclipse.jgit.diff; + +/** + * Wraps two {@link Sequence} instances to cache their element hash codes. + * + * This pair wraps two sequences that contain cached hash codes for the input + * sequences. + * + * @param + * the base sequence type. + */ +public class HashedSequencePair { + private final SequenceComparator cmp; + + private final S baseA; + + private final S baseB; + + private HashedSequence cachedA; + + private HashedSequence cachedB; + + /** + * Construct a pair to provide fast hash codes. + * + * @param cmp + * the base comparator for the sequence elements. + * @param a + * the A sequence. + * @param b + * the B sequence. + */ + public HashedSequencePair(SequenceComparator cmp, S a, S b) { + this.cmp = cmp; + this.baseA = a; + this.baseB = b; + } + + /** @return obtain a comparator that uses the cached hash codes. */ + public HashedSequenceComparator getComparator() { + return new HashedSequenceComparator(cmp); + } + + /** @return wrapper around A that includes cached hash codes. */ + public HashedSequence getA() { + if (cachedA == null) + cachedA = wrap(baseA); + return cachedA; + } + + /** @return wrapper around B that includes cached hash codes. */ + public HashedSequence getB() { + if (cachedB == null) + cachedB = wrap(baseB); + return cachedB; + } + + private HashedSequence wrap(S base) { + final int end = base.size(); + final int[] hashes = new int[end]; + for (int ptr = 0; ptr < end; ptr++) + hashes[ptr] = cmp.hash(base, ptr); + return new HashedSequence(base, hashes); + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/MyersDiff.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/MyersDiff.java index 45abb58cd..3fad2c349 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/MyersDiff.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/MyersDiff.java @@ -110,7 +110,26 @@ public class MyersDiff { public static final DiffAlgorithm INSTANCE = new DiffAlgorithm() { public > EditList diff( C cmp, S a, S b) { - return new MyersDiff(cmp, a, b).getEdits(); + Edit region = new Edit(0, a.size(), 0, b.size()); + region = cmp.reduceCommonStartEnd(a, b, region); + + switch (region.getType()) { + case INSERT: + case DELETE: { + EditList r = new EditList(); + r.add(region); + return r; + } + + case REPLACE: + return new MyersDiff(cmp, a, b, region).getEdits(); + + case EMPTY: + return new EditList(); + + default: + throw new IllegalStateException(); + } } }; @@ -120,23 +139,31 @@ public > EditList di protected EditList edits; /** Comparison function for sequences. */ - protected SequenceComparator cmp; + protected HashedSequenceComparator> cmp; /** * The first text to be compared. Referred to as "Text A" in the comments */ - protected S a; + protected HashedSequence> a; /** * The second text to be compared. Referred to as "Text B" in the comments */ - protected S b; + protected HashedSequence> b; + + private MyersDiff(SequenceComparator cmp, S a, S b, Edit region) { + Subsequence as = Subsequence.a(a, region); + Subsequence bs = Subsequence.b(b, region); + + HashedSequencePair> pair = new HashedSequencePair>( + new SubsequenceComparator(cmp), as, bs); + + this.cmp = pair.getComparator(); + this.a = pair.getA(); + this.b = pair.getB(); - private MyersDiff(SequenceComparator cmp, S a, S b) { - this.cmp = cmp; - this.a = a; - this.b = b; calculateEdits(); + Subsequence.toBase(edits, as, bs); } /** @@ -538,8 +565,8 @@ public static void main(String[] args) { try { RawText a = new RawText(new java.io.File(args[0])); RawText b = new RawText(new java.io.File(args[1])); - MyersDiff diff = new MyersDiff(RawTextComparator.DEFAULT, a, b); - System.out.println(diff.getEdits().toString()); + EditList res = INSTANCE.diff(RawTextComparator.DEFAULT, a, b); + System.out.println(res.toString()); } catch (Exception e) { e.printStackTrace(); } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java index 07d28572a..7d0987450 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java @@ -75,9 +75,6 @@ public class RawText extends Sequence { /** Map of line number to starting position within {@link #content}. */ protected final IntList lines; - /** Hash code for each line, for fast equality elimination. */ - protected final int[] hashes; - /** * Create a new sequence from an existing content byte array. *

@@ -105,7 +102,6 @@ public RawText(final byte[] input) { public RawText(RawTextComparator cmp, byte[] input) { content = input; lines = RawParseUtils.lineMap(content, 0, content.length); - hashes = computeHashes(cmp); } /** @@ -170,16 +166,6 @@ public boolean isMissingNewlineAtEnd() { return content[end - 1] != '\n'; } - private int[] computeHashes(RawTextComparator cmp) { - final int[] r = new int[lines.size()]; - for (int lno = 1; lno < lines.size() - 1; lno++) { - final int ptr = lines.get(lno); - final int end = lines.get(lno + 1); - r[lno] = cmp.hashRegion(content, ptr, end); - } - return r; - } - /** * Determine heuristically whether a byte array represents binary (as * opposed to text) content. diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java index 767bf61f7..a1e2e6cec 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawTextComparator.java @@ -59,9 +59,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { ai++; bi++; - if (a.hashes[ai] != b.hashes[bi]) - return false; - int as = a.lines.get(ai); int bs = b.lines.get(bi); final int ae = a.lines.get(ai + 1); @@ -78,7 +75,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { } @Override - public int hashRegion(final byte[] raw, int ptr, final int end) { + protected int hashRegion(final byte[] raw, int ptr, final int end) { int hash = 5381; for (; ptr < end; ptr++) hash = (hash << 5) ^ (raw[ptr] & 0xff); @@ -93,9 +90,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { ai++; bi++; - if (a.hashes[ai] != b.hashes[bi]) - return false; - int as = a.lines.get(ai); int bs = b.lines.get(bi); int ae = a.lines.get(ai + 1); @@ -129,7 +123,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { } @Override - public int hashRegion(byte[] raw, int ptr, int end) { + protected int hashRegion(byte[] raw, int ptr, int end) { int hash = 5381; for (; ptr < end; ptr++) { byte c = raw[ptr]; @@ -147,9 +141,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { ai++; bi++; - if (a.hashes[ai] != b.hashes[bi]) - return false; - int as = a.lines.get(ai); int bs = b.lines.get(bi); int ae = a.lines.get(ai + 1); @@ -169,7 +160,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { } @Override - public int hashRegion(final byte[] raw, int ptr, int end) { + protected int hashRegion(final byte[] raw, int ptr, int end) { int hash = 5381; ptr = trimLeadingWhitespace(raw, ptr, end); for (; ptr < end; ptr++) { @@ -186,9 +177,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { ai++; bi++; - if (a.hashes[ai] != b.hashes[bi]) - return false; - int as = a.lines.get(ai); int bs = b.lines.get(bi); int ae = a.lines.get(ai + 1); @@ -208,7 +196,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { } @Override - public int hashRegion(final byte[] raw, int ptr, int end) { + protected int hashRegion(final byte[] raw, int ptr, int end) { int hash = 5381; end = trimTrailingWhitespace(raw, ptr, end); for (; ptr < end; ptr++) { @@ -225,9 +213,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { ai++; bi++; - if (a.hashes[ai] != b.hashes[bi]) - return false; - int as = a.lines.get(ai); int bs = b.lines.get(bi); int ae = a.lines.get(ai + 1); @@ -257,7 +242,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) { } @Override - public int hashRegion(final byte[] raw, int ptr, int end) { + protected int hashRegion(final byte[] raw, int ptr, int end) { int hash = 5381; end = trimTrailingWhitespace(raw, ptr, end); while (ptr < end) { @@ -273,8 +258,10 @@ public int hashRegion(final byte[] raw, int ptr, int end) { }; @Override - public int hash(RawText seq, int ptr) { - return seq.hashes[ptr + 1]; + public int hash(RawText seq, int lno) { + final int begin = seq.lines.get(lno + 1); + final int end = seq.lines.get(lno + 2); + return hashRegion(seq.content, begin, end); } @Override @@ -347,5 +334,5 @@ private static int findReverseLine(IntList lines, int idx, int ptr) { * 1 past the last byte of the region. * @return hash code for the region [ptr, end) of raw. */ - public abstract int hashRegion(byte[] raw, int ptr, int end); + protected abstract int hashRegion(byte[] raw, int ptr, int end); }