RevWalk: use changed path filters

Teach RevWalk, TreeRevFilter, PathFilter, and FollowFilter to use
changed path filters, whenever available, to speed revision walks by
skipping commits that fail the changed path filter.

This work is based on earlier work by Kyle Zhao
(I441be984b609669cff77617ecfc838b080ce0816).

Change-Id: I7396f70241e571c63aabe337f6de1b8b9800f7ed
Signed-off-by: kylezhao <kylezhao@tencent.com>
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
This commit is contained in:
Jonathan Tan 2023-05-02 10:44:16 -07:00
parent ff0f7c174f
commit d3b40e72ac
8 changed files with 211 additions and 13 deletions

View File

@ -26,6 +26,7 @@
import java.util.List;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.diff.DiffConfig;
import org.eclipse.jgit.internal.storage.file.GC;
import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.ConfigConstants;
@ -166,6 +167,72 @@ public void testTreeFilter() throws Exception {
assertNull(rw.next());
}
@Test
public void testChangedPathFilter() throws Exception {
RevCommit c1 = commitFile("file1", "1", "master");
commitFile("file2", "2", "master");
RevCommit c3 = commitFile("file1", "3", "master");
RevCommit c4 = commitFile("file2", "4", "master");
enableAndWriteCommitGraph();
TreeRevFilter trf = new TreeRevFilter(rw, PathFilter.create("file1"));
rw.markStart(rw.lookupCommit(c4));
rw.setRevFilter(trf);
assertEquals(c3, rw.next());
assertEquals(c1, rw.next());
assertNull(rw.next());
// 1 commit that has exactly one parent and matches path
assertEquals(1, trf.getChangedPathFilterTruePositive());
// No false positives
assertEquals(0, trf.getChangedPathFilterFalsePositive());
// 2 commits that have exactly one parent and don't match path
assertEquals(2, trf.getChangedPathFilterNegative());
}
@Test
public void testChangedPathFilterWithFollowFilter() throws Exception {
RevCommit c0 = commit(tree());
RevCommit c1 = commit(tree(file("file", blob("contents"))), c0);
RevCommit c2 = commit(tree(file("file", blob("contents")),
file("unrelated", blob("unrelated change"))), c1);
RevCommit c3 = commit(tree(file("renamed-file", blob("contents")),
file("unrelated", blob("unrelated change"))), c2);
RevCommit c4 = commit(
tree(file("renamed-file", blob("contents")),
file("unrelated", blob("another unrelated change"))),
c3);
branch(c4, "master");
enableAndWriteCommitGraph();
db.getConfig().setString(ConfigConstants.CONFIG_DIFF_SECTION, null,
ConfigConstants.CONFIG_KEY_RENAMES, "true");
TreeRevFilter trf = new TreeRevFilter(rw,
new FollowFilter(PathFilter.create("renamed-file"),
db.getConfig().get(DiffConfig.KEY)));
rw.markStart(rw.lookupCommit(c4));
rw.setRevFilter(trf);
assertEquals(c3, rw.next());
assertEquals(c1, rw.next());
assertNull(rw.next());
// Path "renamed-file" is in c3's bloom filter, and another path "file"
// is in c1's bloom filter (we know of "file" because the rev walk
// detected that "renamed-file" is a renaming of "file")
assertEquals(2, trf.getChangedPathFilterTruePositive());
// No false positives
assertEquals(0, trf.getChangedPathFilterFalsePositive());
// 2 commits that have exactly one parent and don't match path
assertEquals(2, trf.getChangedPathFilterNegative());
}
@Test
public void testWalkWithCommitMessageFilter() throws Exception {
RevCommit a = commit();

View File

@ -11,6 +11,8 @@
package org.eclipse.jgit.revwalk;
import java.io.IOException;
import java.util.Optional;
import java.util.Set;
import org.eclipse.jgit.diff.DiffConfig;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@ -89,6 +91,12 @@ public boolean shouldBeRecursive() {
return path.shouldBeRecursive() || ANY_DIFF.shouldBeRecursive();
}
@Override
public Optional<Set<byte[]>> getPathsBestEffort() {
return path.getPathsBestEffort();
}
/** {@inheritDoc} */
@Override
public TreeFilter clone() {
return new FollowFilter(path.clone(), cfg);

View File

@ -25,6 +25,7 @@
import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.internal.storage.commitgraph.ChangedPathFilter;
import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.MutableObjectId;
@ -686,6 +687,20 @@ int getGeneration() {
return Constants.COMMIT_GENERATION_UNKNOWN;
}
/**
* Get the changed path filter of the commit.
* <p>
* This is null when there is no commit graph file, the commit is not in the
* commit graph file, or the commit graph file was generated without changed
* path filters.
*
* @return the changed path filter
* @since 6.7
*/
ChangedPathFilter getChangedPathFilter() {
return null;
}
/**
* Reset this commit to allow another RevWalk with the same instances.
* <p>

View File

@ -14,6 +14,7 @@
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.internal.storage.commitgraph.ChangedPathFilter;
import org.eclipse.jgit.internal.storage.commitgraph.CommitGraph;
import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.Constants;
@ -29,6 +30,8 @@ class RevCommitCG extends RevCommit {
private final int graphPosition;
private final ChangedPathFilter changedPathFilter;
private int generation = Constants.COMMIT_GENERATION_UNKNOWN;
/**
@ -38,10 +41,14 @@ class RevCommitCG extends RevCommit {
* object name for the commit.
* @param graphPosition
* the position in the commit-graph of the object.
* @param changedPathFilter
* the changed path filter if one exists
*/
protected RevCommitCG(AnyObjectId id, int graphPosition) {
protected RevCommitCG(AnyObjectId id, int graphPosition,
ChangedPathFilter changedPathFilter) {
super(id);
this.graphPosition = graphPosition;
this.changedPathFilter = changedPathFilter;
}
@Override
@ -100,4 +107,10 @@ private void parseInGraph(RevWalk walk) throws IOException {
int getGeneration() {
return generation;
}
/** {@inheritDoc} */
@Override
ChangedPathFilter getChangedPathFilter() {
return changedPathFilter;
}
}

View File

@ -1713,7 +1713,8 @@ protected RevCommit createCommit(AnyObjectId id) {
private RevCommit createCommit(AnyObjectId id, int graphPos) {
if (graphPos >= 0) {
return new RevCommitCG(id, graphPos);
return new RevCommitCG(id, graphPos,
commitGraph().getChangedPathFilter(graphPos));
}
return new RevCommit(id);
}

View File

@ -12,7 +12,10 @@
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.eclipse.jgit.internal.storage.commitgraph.ChangedPathFilter;
import org.eclipse.jgit.diff.DiffConfig;
import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
@ -47,6 +50,12 @@ public class TreeRevFilter extends RevFilter {
private final TreeWalk pathFilter;
private long changedPathFilterTruePositive = 0;
private long changedPathFilterFalsePositive = 0;
private long changedPathFilterNegative = 0;
/**
* Create a {@link org.eclipse.jgit.revwalk.filter.RevFilter} from a
* {@link org.eclipse.jgit.treewalk.filter.TreeFilter}.
@ -122,12 +131,41 @@ public boolean include(RevWalk walker, RevCommit c)
// We have exactly one parent. This is a very common case.
//
int chgs = 0, adds = 0;
while (tw.next()) {
chgs++;
if (tw.getRawMode(0) == 0 && tw.getRawMode(1) != 0) {
adds++;
} else {
break; // no point in looking at this further.
boolean changedPathFilterUsed = false;
boolean mustCalculateChgs = true;
ChangedPathFilter cpf = c.getChangedPathFilter();
if (cpf != null) {
Optional<Set<byte[]>> paths = pathFilter.getFilter()
.getPathsBestEffort();
if (paths.isPresent()) {
changedPathFilterUsed = true;
for (byte[] path : paths.get()) {
if (!cpf.maybeContains(path)) {
mustCalculateChgs = false;
break;
}
}
}
}
if (mustCalculateChgs) {
while (tw.next()) {
chgs++;
if (tw.getRawMode(0) == 0 && tw.getRawMode(1) != 0) {
adds++;
} else {
break; // no point in looking at this further.
}
}
if (changedPathFilterUsed) {
if (chgs > 0) {
changedPathFilterTruePositive++;
} else {
changedPathFilterFalsePositive++;
}
}
} else {
if (changedPathFilterUsed) {
changedPathFilterNegative++;
}
}
@ -244,6 +282,37 @@ public boolean requiresCommitBody() {
return false;
}
/**
* Return how many times a changed path filter correctly predicted that a
* path was changed in a commit, for statistics gathering purposes.
*
* @return count of true positives
*/
public long getChangedPathFilterTruePositive() {
return changedPathFilterTruePositive;
}
/**
* Return how many times a changed path filter wrongly predicted that a path
* was changed in a commit, for statistics gathering purposes.
*
* @return count of false positives
*/
public long getChangedPathFilterFalsePositive() {
return changedPathFilterFalsePositive;
}
/**
* Return how many times a changed path filter predicted that a path was not
* changed in a commit (allowing that commit to be skipped), for statistics
* gathering purposes.
*
* @return count of negatives
*/
public long getChangedPathFilterNegative() {
return changedPathFilterNegative;
}
private void updateFollowFilter(ObjectId[] trees, DiffConfig cfg)
throws MissingObjectException, IncorrectObjectTypeException,
CorruptObjectException, IOException {

View File

@ -11,6 +11,9 @@
package org.eclipse.jgit.treewalk.filter;
import java.util.Optional;
import java.util.Set;
import java.util.HashSet;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.treewalk.TreeWalk;
@ -45,7 +48,8 @@ public static PathFilter create(String path) {
while (path.endsWith("/")) //$NON-NLS-1$
path = path.substring(0, path.length() - 1);
if (path.length() == 0)
throw new IllegalArgumentException(JGitText.get().emptyPathNotPermitted);
throw new IllegalArgumentException(
JGitText.get().emptyPathNotPermitted);
return new PathFilter(path);
}
@ -85,6 +89,14 @@ public boolean shouldBeRecursive() {
return false;
}
@Override
public Optional<Set<byte[]>> getPathsBestEffort() {
HashSet<byte[]> s = new HashSet<>();
s.add(pathRaw);
return Optional.of(s);
}
/** {@inheritDoc} */
@Override
public PathFilter clone() {
return this;

View File

@ -11,6 +11,8 @@
package org.eclipse.jgit.treewalk.filter;
import java.io.IOException;
import java.util.Optional;
import java.util.Set;
import org.eclipse.jgit.dircache.DirCacheIterator;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@ -188,10 +190,8 @@ public abstract boolean include(TreeWalk walker)
* as thrown by {@link #include(TreeWalk)}
* @since 4.7
*/
public int matchFilter(TreeWalk walker)
throws MissingObjectException, IncorrectObjectTypeException,
IOException
{
public int matchFilter(TreeWalk walker) throws MissingObjectException,
IncorrectObjectTypeException, IOException {
return include(walker) ? 0 : 1;
}
@ -209,6 +209,19 @@ public int matchFilter(TreeWalk walker)
*/
public abstract boolean shouldBeRecursive();
/**
* If this filter checks that a specific set of paths have all been
* modified, returns that set of paths to be checked against a changed path
* filter. Otherwise, returns empty.
*
* @return a set of paths, or empty
*
* @since 6.7
*/
public Optional<Set<byte[]>> getPathsBestEffort() {
return Optional.empty();
}
/**
* {@inheritDoc}
*