Perform common start/end elimination by default for DiffAlgorithm

As it turns out, every single diff algorithm we might try to
implement can benfit from using the SequenceComparator's native
concept of the simple reduceCommonStartEnd() step.  For most inputs,
there can be a significant number of elements that can be removed
from the space the DiffAlgorithm needs to consider, which will
reduce the overall running time for the final solution.

Pool this logic inside of DiffAlgorithm itself as a default, but
permit a specific algorithm to override it when necessary.

Convert MyersDiff to use this reduction to reduce the space it
needs to search, making it perform slightly better on common inputs.

Change-Id: I14004d771117e4a4ab2a02cace8deaeda9814bc1
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
This commit is contained in:
Shawn O. Pearce 2010-09-21 08:12:51 -07:00
parent e84d826eb6
commit 857d68d173
4 changed files with 122 additions and 100 deletions

View File

@ -52,14 +52,12 @@
* algorithms may support parameterization, in which case the caller can create
* a unique instance per thread.
*/
public interface DiffAlgorithm {
public abstract class DiffAlgorithm {
/**
* Compare two sequences and identify a list of edits between them.
*
*
* @param <S>
* type of sequence being compared.
* @param <C>
* type of comparator to evaluate the sequence elements.
* @param cmp
* the comparator supplying the element equivalence function.
* @param a
@ -74,6 +72,57 @@ public interface DiffAlgorithm {
* sequences are identical according to {@code cmp}'s rules. The
* result list is never null.
*/
public <S extends Sequence, C extends SequenceComparator<? super S>> EditList diff(
C cmp, S a, S b);
public <S extends Sequence> EditList diff(
SequenceComparator<? super S> cmp, S a, S b) {
Edit region = cmp.reduceCommonStartEnd(a, b, coverEdit(a, b));
switch (region.getType()) {
case INSERT:
case DELETE:
return EditList.singleton(region);
case REPLACE: {
SubsequenceComparator<S> cs = new SubsequenceComparator<S>(cmp);
Subsequence<S> as = Subsequence.a(a, region);
Subsequence<S> bs = Subsequence.b(b, region);
return Subsequence.toBase(diffNonCommon(cs, as, bs), as, bs);
}
case EMPTY:
return new EditList(0);
default:
throw new IllegalStateException();
}
}
private static <S extends Sequence> Edit coverEdit(S a, S b) {
return new Edit(0, a.size(), 0, b.size());
}
/**
* Compare two sequences and identify a list of edits between them.
*
* This method should be invoked only after the two sequences have been
* proven to have no common starting or ending elements. The expected
* elimination of common starting and ending elements is automatically
* performed by the {@link #diff(SequenceComparator, Sequence, Sequence)}
* method, which invokes this method using {@link Subsequence}s.
*
* @param <S>
* type of sequence being compared.
* @param cmp
* the comparator supplying the element equivalence function.
* @param a
* the first (also known as old or pre-image) sequence. Edits
* returned by this algorithm will reference indexes using the
* 'A' side: {@link Edit#getBeginA()}, {@link Edit#getEndA()}.
* @param b
* the second (also known as new or post-image) sequence. Edits
* returned by this algorithm will reference indexes using the
* 'B' side: {@link Edit#getBeginB()}, {@link Edit#getEndB()}.
* @return a modifiable edit list comparing the two sequences.
*/
public abstract <S extends Sequence> EditList diffNonCommon(
SequenceComparator<? super S> cmp, S a, S b);
}

View File

@ -48,6 +48,19 @@
/** Specialized list of {@link Edit}s in a document. */
public class EditList extends AbstractList<Edit> {
/**
* Construct an edit list containing a single edit.
*
* @param edit
* the edit to return in the list.
* @return list containing only {@code edit}.
*/
public static EditList singleton(Edit edit) {
EditList res = new EditList(1);
res.add(edit);
return res;
}
private final ArrayList<Edit> container;
/** Create a new, empty edit list. */
@ -55,6 +68,17 @@ public EditList() {
container = new ArrayList<Edit>();
}
/**
* Create an empty edit list with the specified capacity.
*
* @param capacity
* the initial capacity of the edit list. If additional edits are
* added to the list, it will be grown to support them.
*/
public EditList(int capacity) {
container = new ArrayList<Edit>(capacity);
}
@Override
public int size() {
return container.size();

View File

@ -108,28 +108,9 @@
public class MyersDiff<S extends Sequence> {
/** Singleton instance of MyersDiff. */
public static final DiffAlgorithm INSTANCE = new DiffAlgorithm() {
public <S extends Sequence, C extends SequenceComparator<? super S>> EditList diff(
C cmp, S a, S b) {
Edit region = new Edit(0, a.size(), 0, b.size());
region = cmp.reduceCommonStartEnd(a, b, region);
switch (region.getType()) {
case INSERT:
case DELETE: {
EditList r = new EditList();
r.add(region);
return r;
}
case REPLACE:
return new MyersDiff<S>(cmp, a, b, region).getEdits();
case EMPTY:
return new EditList();
default:
throw new IllegalStateException();
}
public <S extends Sequence> EditList diffNonCommon(
SequenceComparator<? super S> cmp, S a, S b) {
return new MyersDiff<S>(cmp, a, b).edits;
}
};
@ -139,38 +120,27 @@ public <S extends Sequence, C extends SequenceComparator<? super S>> EditList di
protected EditList edits;
/** Comparison function for sequences. */
protected HashedSequenceComparator<Subsequence<S>> cmp;
protected HashedSequenceComparator<S> cmp;
/**
* The first text to be compared. Referred to as "Text A" in the comments
*/
protected HashedSequence<Subsequence<S>> a;
protected HashedSequence<S> a;
/**
* The second text to be compared. Referred to as "Text B" in the comments
*/
protected HashedSequence<Subsequence<S>> b;
protected HashedSequence<S> b;
private MyersDiff(SequenceComparator<? super S> cmp, S a, S b, Edit region) {
Subsequence<S> as = Subsequence.a(a, region);
Subsequence<S> bs = Subsequence.b(b, region);
HashedSequencePair<Subsequence<S>> pair = new HashedSequencePair<Subsequence<S>>(
new SubsequenceComparator<S>(cmp), as, bs);
private MyersDiff(SequenceComparator<? super S> cmp, S a, S b) {
HashedSequencePair<S> pair;
pair = new HashedSequencePair<S>(cmp, a, b);
this.cmp = pair.getComparator();
this.a = pair.getA();
this.b = pair.getB();
calculateEdits();
Subsequence.toBase(edits, as, bs);
}
/**
* @return the list of edits found during the last call to {@link #calculateEdits()}
*/
public EditList getEdits() {
return edits;
}
// TODO: use ThreadLocal for future multi-threaded operations
@ -565,8 +535,8 @@ public static void main(String[] args) {
try {
RawText a = new RawText(new java.io.File(args[0]));
RawText b = new RawText(new java.io.File(args[1]));
EditList res = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
System.out.println(res.toString());
EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
System.out.println(r.toString());
} catch (Exception e) {
e.printStackTrace();
}

View File

@ -99,7 +99,7 @@
* by the prior step 2 or 5.</li>
* </ol>
*/
public class PatienceDiff implements DiffAlgorithm {
public class PatienceDiff extends DiffAlgorithm {
/** Algorithm we use when there are no common unique lines in a region. */
private DiffAlgorithm fallback;
@ -114,38 +114,10 @@ public void setFallbackAlgorithm(DiffAlgorithm alg) {
fallback = alg;
}
public <S extends Sequence, C extends SequenceComparator<? super S>> EditList diff(
C cmp, S a, S b) {
Edit region = new Edit(0, a.size(), 0, b.size());
region = cmp.reduceCommonStartEnd(a, b, region);
switch (region.getType()) {
case INSERT:
case DELETE: {
EditList r = new EditList();
r.add(region);
return r;
}
case REPLACE: {
SubsequenceComparator<S> cs = new SubsequenceComparator<S>(cmp);
Subsequence<S> as = Subsequence.a(a, region);
Subsequence<S> bs = Subsequence.b(b, region);
return Subsequence.toBase(diffImpl(cs, as, bs), as, bs);
}
case EMPTY:
return new EditList();
default:
throw new IllegalStateException();
}
}
private <S extends Sequence, C extends SequenceComparator<? super S>> EditList diffImpl(
C cmp, S a, S b) {
public <S extends Sequence> EditList diffNonCommon(
SequenceComparator<? super S> cmp, S a, S b) {
State<S> s = new State<S>(new HashedSequencePair<S>(cmp, a, b));
s.diff(new Edit(0, s.a.size(), 0, s.b.size()), null, 0, 0);
s.diffReplace(new Edit(0, s.a.size(), 0, s.b.size()), null, 0, 0);
return s.edits;
}
@ -166,25 +138,12 @@ private class State<S extends Sequence> {
this.edits = new EditList();
}
private void diff(Edit r, long[] pCommon, int pIdx, int pEnd) {
switch (r.getType()) {
case INSERT:
case DELETE:
edits.add(r);
return;
case REPLACE:
break;
case EMPTY:
default:
throw new IllegalStateException();
}
void diffReplace(Edit r, long[] pCommon, int pIdx, int pEnd) {
PatienceDiffIndex<S> p;
Edit lcs;
p = new PatienceDiffIndex<S>(cmp, a, b, r, pCommon, pIdx, pEnd);
Edit lcs = p.findLongestCommonSequence();
lcs = p.findLongestCommonSequence();
if (lcs != null) {
pCommon = p.nCommon;
@ -196,20 +155,40 @@ private void diff(Edit r, long[] pCommon, int pIdx, int pEnd) {
diff(r.after(lcs), pCommon, pIdx + 1, pEnd);
} else if (fallback != null) {
p = null;
pCommon = null;
p = null;
SubsequenceComparator<HashedSequence<S>> cs;
cs = new SubsequenceComparator<HashedSequence<S>>(cmp);
SubsequenceComparator<HashedSequence<S>> cs = subcmp();
Subsequence<HashedSequence<S>> as = Subsequence.a(a, r);
Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r);
EditList res = fallback.diff(cs, as, bs);
EditList res = fallback.diffNonCommon(cs, as, bs);
edits.addAll(Subsequence.toBase(res, as, bs));
} else {
edits.add(r);
}
}
private void diff(Edit r, long[] pCommon, int pIdx, int pEnd) {
switch (r.getType()) {
case INSERT:
case DELETE:
edits.add(r);
break;
case REPLACE:
diffReplace(r, pCommon, pIdx, pEnd);
break;
case EMPTY:
default:
throw new IllegalStateException();
}
}
private SubsequenceComparator<HashedSequence<S>> subcmp() {
return new SubsequenceComparator<HashedSequence<S>>(cmp);
}
}
}