Move cached element hash codes to HashedSequence

Most diff implementations really want to use cached hash codes for
elements, rather than element equality, as they need to perform many
compares and unique hash codes for elements can really speed that
process up.

To make it easier to define element hash functions, move the caching
of hash codes into a wrapper sequence type, so that individual
sequence types like RawText don't need to do this themselves.  This
has a nice property of also allowing the sequence to no longer care
about the specific SequenceComparator that is going to be used, and
permits the caching to only examine the middle region that isn't
common to the two inputs.

Change-Id: If8623556da9419117b07c5073e8bce39de02570e
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
This commit is contained in:
Shawn O. Pearce 2010-09-10 17:48:24 -07:00
parent e0970cd1b4
commit baaddd51f1
7 changed files with 306 additions and 49 deletions

View File

@ -882,8 +882,8 @@ private FormatResult createFormatResult(DiffEntry ent) throws IOException,
type = PatchType.BINARY;
} else {
res.a = new RawText(comparator, aRaw);
res.b = new RawText(comparator, bRaw);
res.a = new RawText(aRaw);
res.b = new RawText(bRaw);
editList = diff(res.a, res.b);
type = PatchType.UNIFIED;

View File

@ -0,0 +1,72 @@
/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.diff;
/**
* Wraps a {@link Sequence} to assign hash codes to elements.
*
* This sequence acts as a proxy for the real sequence, caching element hash
* codes so they don't need to be recomputed each time. Sequences of this type
* must be used with a {@link HashedSequenceComparator}.
*
* To construct an instance of this type use {@link HashedSequencePair}.
*
* @param <S>
* the base sequence type.
*/
public final class HashedSequence<S extends Sequence> extends Sequence {
final S base;
final int[] hashes;
HashedSequence(S base, int[] hashes) {
this.base = base;
this.hashes = hashes;
}
@Override
public int size() {
return base.size();
}
}

View File

@ -0,0 +1,77 @@
/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.diff;
/**
* Wrap another comparator for use with {@link HashedSequence}.
*
* This comparator acts as a proxy for the real comparator, evaluating the
* cached hash code before testing the underlying comparator's equality.
* Comparators of this type must be used with a {@link HashedSequence}.
*
* To construct an instance of this type use {@link HashedSequencePair}.
*
* @param <S>
* the base sequence type.
*/
public final class HashedSequenceComparator<S extends Sequence> extends
SequenceComparator<HashedSequence<S>> {
private final SequenceComparator<? super S> cmp;
HashedSequenceComparator(SequenceComparator<? super S> cmp) {
this.cmp = cmp;
}
@Override
public boolean equals(HashedSequence<S> a, int ai, //
HashedSequence<S> b, int bi) {
return a.hashes[ai] == b.hashes[bi]
&& cmp.equals(a.base, ai, b.base, bi);
}
@Override
public int hash(HashedSequence<S> seq, int ptr) {
return seq.hashes[ptr];
}
}

View File

@ -0,0 +1,108 @@
/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.diff;
/**
* Wraps two {@link Sequence} instances to cache their element hash codes.
*
* This pair wraps two sequences that contain cached hash codes for the input
* sequences.
*
* @param <S>
* the base sequence type.
*/
public class HashedSequencePair<S extends Sequence> {
private final SequenceComparator<? super S> cmp;
private final S baseA;
private final S baseB;
private HashedSequence<S> cachedA;
private HashedSequence<S> cachedB;
/**
* Construct a pair to provide fast hash codes.
*
* @param cmp
* the base comparator for the sequence elements.
* @param a
* the A sequence.
* @param b
* the B sequence.
*/
public HashedSequencePair(SequenceComparator<? super S> cmp, S a, S b) {
this.cmp = cmp;
this.baseA = a;
this.baseB = b;
}
/** @return obtain a comparator that uses the cached hash codes. */
public HashedSequenceComparator<S> getComparator() {
return new HashedSequenceComparator<S>(cmp);
}
/** @return wrapper around A that includes cached hash codes. */
public HashedSequence<S> getA() {
if (cachedA == null)
cachedA = wrap(baseA);
return cachedA;
}
/** @return wrapper around B that includes cached hash codes. */
public HashedSequence<S> getB() {
if (cachedB == null)
cachedB = wrap(baseB);
return cachedB;
}
private HashedSequence<S> wrap(S base) {
final int end = base.size();
final int[] hashes = new int[end];
for (int ptr = 0; ptr < end; ptr++)
hashes[ptr] = cmp.hash(base, ptr);
return new HashedSequence<S>(base, hashes);
}
}

View File

@ -110,7 +110,26 @@ public class MyersDiff<S extends Sequence> {
public static final DiffAlgorithm INSTANCE = new DiffAlgorithm() {
public <S extends Sequence, C extends SequenceComparator<? super S>> EditList diff(
C cmp, S a, S b) {
return new MyersDiff<S>(cmp, a, b).getEdits();
Edit region = new Edit(0, a.size(), 0, b.size());
region = cmp.reduceCommonStartEnd(a, b, region);
switch (region.getType()) {
case INSERT:
case DELETE: {
EditList r = new EditList();
r.add(region);
return r;
}
case REPLACE:
return new MyersDiff<S>(cmp, a, b, region).getEdits();
case EMPTY:
return new EditList();
default:
throw new IllegalStateException();
}
}
};
@ -120,23 +139,31 @@ public <S extends Sequence, C extends SequenceComparator<? super S>> EditList di
protected EditList edits;
/** Comparison function for sequences. */
protected SequenceComparator<? super S> cmp;
protected HashedSequenceComparator<Subsequence<S>> cmp;
/**
* The first text to be compared. Referred to as "Text A" in the comments
*/
protected S a;
protected HashedSequence<Subsequence<S>> a;
/**
* The second text to be compared. Referred to as "Text B" in the comments
*/
protected S b;
protected HashedSequence<Subsequence<S>> b;
private MyersDiff(SequenceComparator<? super S> cmp, S a, S b, Edit region) {
Subsequence<S> as = Subsequence.a(a, region);
Subsequence<S> bs = Subsequence.b(b, region);
HashedSequencePair<Subsequence<S>> pair = new HashedSequencePair<Subsequence<S>>(
new SubsequenceComparator<S>(cmp), as, bs);
this.cmp = pair.getComparator();
this.a = pair.getA();
this.b = pair.getB();
private MyersDiff(SequenceComparator<? super S> cmp, S a, S b) {
this.cmp = cmp;
this.a = a;
this.b = b;
calculateEdits();
Subsequence.toBase(edits, as, bs);
}
/**
@ -538,8 +565,8 @@ public static void main(String[] args) {
try {
RawText a = new RawText(new java.io.File(args[0]));
RawText b = new RawText(new java.io.File(args[1]));
MyersDiff diff = new MyersDiff(RawTextComparator.DEFAULT, a, b);
System.out.println(diff.getEdits().toString());
EditList res = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
System.out.println(res.toString());
} catch (Exception e) {
e.printStackTrace();
}

View File

@ -75,9 +75,6 @@ public class RawText extends Sequence {
/** Map of line number to starting position within {@link #content}. */
protected final IntList lines;
/** Hash code for each line, for fast equality elimination. */
protected final int[] hashes;
/**
* Create a new sequence from an existing content byte array.
* <p>
@ -105,7 +102,6 @@ public RawText(final byte[] input) {
public RawText(RawTextComparator cmp, byte[] input) {
content = input;
lines = RawParseUtils.lineMap(content, 0, content.length);
hashes = computeHashes(cmp);
}
/**
@ -170,16 +166,6 @@ public boolean isMissingNewlineAtEnd() {
return content[end - 1] != '\n';
}
private int[] computeHashes(RawTextComparator cmp) {
final int[] r = new int[lines.size()];
for (int lno = 1; lno < lines.size() - 1; lno++) {
final int ptr = lines.get(lno);
final int end = lines.get(lno + 1);
r[lno] = cmp.hashRegion(content, ptr, end);
}
return r;
}
/**
* Determine heuristically whether a byte array represents binary (as
* opposed to text) content.

View File

@ -59,9 +59,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
ai++;
bi++;
if (a.hashes[ai] != b.hashes[bi])
return false;
int as = a.lines.get(ai);
int bs = b.lines.get(bi);
final int ae = a.lines.get(ai + 1);
@ -78,7 +75,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
}
@Override
public int hashRegion(final byte[] raw, int ptr, final int end) {
protected int hashRegion(final byte[] raw, int ptr, final int end) {
int hash = 5381;
for (; ptr < end; ptr++)
hash = (hash << 5) ^ (raw[ptr] & 0xff);
@ -93,9 +90,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
ai++;
bi++;
if (a.hashes[ai] != b.hashes[bi])
return false;
int as = a.lines.get(ai);
int bs = b.lines.get(bi);
int ae = a.lines.get(ai + 1);
@ -129,7 +123,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
}
@Override
public int hashRegion(byte[] raw, int ptr, int end) {
protected int hashRegion(byte[] raw, int ptr, int end) {
int hash = 5381;
for (; ptr < end; ptr++) {
byte c = raw[ptr];
@ -147,9 +141,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
ai++;
bi++;
if (a.hashes[ai] != b.hashes[bi])
return false;
int as = a.lines.get(ai);
int bs = b.lines.get(bi);
int ae = a.lines.get(ai + 1);
@ -169,7 +160,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
}
@Override
public int hashRegion(final byte[] raw, int ptr, int end) {
protected int hashRegion(final byte[] raw, int ptr, int end) {
int hash = 5381;
ptr = trimLeadingWhitespace(raw, ptr, end);
for (; ptr < end; ptr++) {
@ -186,9 +177,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
ai++;
bi++;
if (a.hashes[ai] != b.hashes[bi])
return false;
int as = a.lines.get(ai);
int bs = b.lines.get(bi);
int ae = a.lines.get(ai + 1);
@ -208,7 +196,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
}
@Override
public int hashRegion(final byte[] raw, int ptr, int end) {
protected int hashRegion(final byte[] raw, int ptr, int end) {
int hash = 5381;
end = trimTrailingWhitespace(raw, ptr, end);
for (; ptr < end; ptr++) {
@ -225,9 +213,6 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
ai++;
bi++;
if (a.hashes[ai] != b.hashes[bi])
return false;
int as = a.lines.get(ai);
int bs = b.lines.get(bi);
int ae = a.lines.get(ai + 1);
@ -257,7 +242,7 @@ public boolean equals(RawText a, int ai, RawText b, int bi) {
}
@Override
public int hashRegion(final byte[] raw, int ptr, int end) {
protected int hashRegion(final byte[] raw, int ptr, int end) {
int hash = 5381;
end = trimTrailingWhitespace(raw, ptr, end);
while (ptr < end) {
@ -273,8 +258,10 @@ public int hashRegion(final byte[] raw, int ptr, int end) {
};
@Override
public int hash(RawText seq, int ptr) {
return seq.hashes[ptr + 1];
public int hash(RawText seq, int lno) {
final int begin = seq.lines.get(lno + 1);
final int end = seq.lines.get(lno + 2);
return hashRegion(seq.content, begin, end);
}
@Override
@ -347,5 +334,5 @@ private static int findReverseLine(IntList lines, int idx, int ptr) {
* 1 past the last byte of the region.
* @return hash code for the region <code>[ptr, end)</code> of raw.
*/
public abstract int hashRegion(byte[] raw, int ptr, int end);
protected abstract int hashRegion(byte[] raw, int ptr, int end);
}