Speed up PathFilterGroup for large sets of paths

The ByteArraySet is a distorted copy of ObjectIdSubclassMap.

Bug: 397185
Change-Id: Ife0cc2da6cc46431be94a6b14f64f6adbe983afb
This commit is contained in:
Robin Rosenberg 2013-01-20 12:56:25 +01:00
parent 05180574a9
commit 522fc6a9c6
3 changed files with 643 additions and 30 deletions

View File

@ -0,0 +1,256 @@
/*
* Copyright (C) 2013, Robin Rosenberg
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.treewalk.filter;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.lessThan;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import org.eclipse.jgit.dircache.DirCache;
import org.eclipse.jgit.dircache.DirCacheEditor;
import org.eclipse.jgit.dircache.DirCacheEntry;
import org.eclipse.jgit.dircache.DirCacheIterator;
import org.eclipse.jgit.errors.IncorrectObjectTypeException;
import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.junit.Test;
/**
* Performance test suite for PathFilterGroup
*/
public class PathFilterGroupTest2 {
private DirCache dc;
private String[] paths = new String[100000];
private TreeFilter pf;
private String data;
private long tInit;
public void setup(int pathsize, int filtersize, String[] filters) {
long t1 = System.nanoTime();
String[] seed = { "abc", "def", "ghi", "jkl", "mno", "pqr", "stu",
"vwx", "xyz", "\u00e5\u00e4\u00f6" };
dc = DirCache.newInCore();
DirCacheEditor ed = dc.editor();
int pi = 0;
B: for (String a : seed) {
for (String b : seed) {
for (String c : seed) {
for (String d : seed) {
for (String e : seed) {
if (pi >= pathsize)
break B;
String p1 = a + "/" + b + "/" + c + "/" + d + "/"
+ e;
paths[pi] = p1;
ed.add(new DirCacheEditor.PathEdit(p1) {
@Override
public void apply(DirCacheEntry ent) {
ent.setFileMode(FileMode.REGULAR_FILE);
}
});
++pi;
}
}
}
}
}
ed.finish();
long t2 = System.nanoTime();
if (filters != null)
pf = PathFilterGroup.createFromStrings(filters);
else {
// System.out.println(dc.getEntryCount());
String[] filterPaths = new String[filtersize];
System.arraycopy(paths, pathsize - filtersize, filterPaths, 0,
filtersize);
pf = PathFilterGroup.createFromStrings(filterPaths);
}
long t3 = System.nanoTime();
data = "PX\t" + (t2 - t1) / 1E9 + "\t" + (t3 - t2) / 1E9 + "\t";
tInit = t2-t1;
}
public void test(int pathSize, int filterSize)
throws MissingObjectException, IncorrectObjectTypeException,
IOException {
setup(pathSize, filterSize, null);
data += pathSize + "\t" + filterSize + "\t";
long t1 = System.nanoTime();
TreeWalk tw = new TreeWalk((ObjectReader) null);
tw.reset();
tw.addTree(new DirCacheIterator(dc));
tw.setFilter(pf);
tw.setRecursive(true);
int n = 0;
while (tw.next())
n++;
long t2 = System.nanoTime();
data += (t2 - t1) / 1E9;
System.out.println(data);
assertEquals(filterSize, n);
}
@SuppressWarnings("boxing")
public void test(int pathSize, int expectedWalkCount, String... filters)
throws MissingObjectException, IncorrectObjectTypeException,
IOException {
setup(pathSize, -1, filters);
data += pathSize + "\t" + filters.length + "\t";
long t1 = System.nanoTime();
TreeWalk tw = new TreeWalk((ObjectReader) null);
tw.reset();
tw.addTree(new DirCacheIterator(dc));
tw.setFilter(pf);
tw.setRecursive(true);
int n = 0;
while (tw.next())
n++;
long t2 = System.nanoTime();
data += (t2 - t1) / 1E9;
System.out.println(data);
assertEquals(expectedWalkCount, n);
// Walk time should be (much) faster then setup time
assertThat(t2 - t1, lessThan(tInit / 2));
}
@Test
public void test1_1() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(1, 1);
}
@Test
public void test2_2() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(2, 2);
}
@Test
public void test10_10() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(10, 10);
}
@Test
public void test100_100() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100, 100);
}
@Test
public void test1000_1000() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(1000, 1000);
}
@Test
public void test10000_10000() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(10000, 10000);
}
@Test
public void test100000_100000() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 100000);
}
@Test
public void test100000_10000() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 10000);
}
@Test
public void test100000_1000() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 1000);
}
@Test
public void test100000_100() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 100);
}
@Test
public void test100000_10() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 10);
}
@Test
public void test100000_1() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 1);
}
@Test
public void test10000_1F() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 10000, "abc");
}
@Test
public void test10000_L2() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 20000, "abc", "def");
}
@Test
public void test10000_L10() throws MissingObjectException,
IncorrectObjectTypeException, IOException {
test(100000, 10000, "\u00e5\u00e4\u00f6");
}
}

View File

@ -0,0 +1,318 @@
/*
* Copyright (C) 2009, Google Inc.
* Copyright (C) 2008, Marek Zawirski <marek.zawirski@gmail.com>
* Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
* Copyright (C) 2013, Robin Rosenberg
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.treewalk.filter;
import org.eclipse.jgit.util.RawParseUtils;
/**
* Specialized set for byte arrays, interpreted as strings for use in
* {@link PathFilterGroup.Group}. Most methods assume the hash is already know
* and therefore requires the caller to supply it beforehand. The implementation
* is a loose derivative of ObjectIdSubclassMap.
*/
class ByteArraySet {
private int size;
private int grow;
private int mask;
private byte[][] table;
/**
* Create an empty set.
*
* @param capacity
*/
ByteArraySet(int capacity) {
initTable(1 << Integer.highestOneBit((capacity * 2) - 1));
}
private byte[] get(final byte[] toFind, int length, int hash) {
final int msk = mask;
int i = hash & msk;
final byte[][] tbl = table;
byte[] obj;
while ((obj = tbl[i]) != null) {
if (equals(obj, toFind, length))
return obj;
i = (i + 1) & msk;
}
return null;
}
private static boolean equals(byte[] a, byte[] b, int length) {
if (a.length < length || b.length < length)
return false;
for (int i = 0; i < length; ++i) {
if (a[i] != b[i])
return false;
}
return true;
}
/**
* Returns true if this set contains the specified array.
*
* @param toFind
* array to find.
* @param length
* The number of bytes in toFind that are used
* @param hash
* pre-computed hash of toFind
* @return true if the mapping exists for this byte array; false otherwise.
*/
boolean contains(final byte[] toFind, int length, int hash) {
return get(toFind, length, hash) != null;
}
/**
* Store a byte array for future lookup.
* <p>
* Stores {@code newValue}, but only if it does not already exist in the
* set. Callers can tell if the value is new by checking the return value
* with reference equality:
*
* <pre>
* byte[] obj = ...;
* boolean wasNew = map.addIfAbsent(array, length, hash) == array;
* </pre>
*
* @param newValue
* the array to store.
* @param length
* The number of bytes in newValue that are used
* @param hash
* pre-computed hash of toFind
* @return {@code newValue} if stored, or the prior value already stored and
* that would have been returned had the caller used
* {@code get(newValue)} first.
*/
byte[] addIfAbsent(final byte[] newValue, int length, int hash) {
final int msk = mask;
int i = hash & msk;
final byte[][] tbl = table;
byte[] obj;
while ((obj = tbl[i]) != null) {
if (equals(obj, newValue, length))
return obj;
i = (i + 1) & msk;
}
byte[] valueToInsert = copyIfNotSameSize(newValue, length);
if (++size == grow) {
grow();
insert(valueToInsert, hash);
} else
tbl[i] = valueToInsert;
return valueToInsert;
}
private static byte[] copyIfNotSameSize(byte[] newValue, int length) {
if (newValue.length == length)
return newValue;
byte[] ret = new byte[length];
System.arraycopy(newValue, 0, ret, 0, length);
return ret;
}
/**
* @return number of arrays in the set
*/
int size() {
return size;
}
/** @return true if {@link #size()} is 0. */
boolean isEmpty() {
return size == 0;
}
private void insert(final byte[] newValue, int hash) {
final int msk = mask;
int j = hash & msk;
final byte[][] tbl = table;
while (tbl[j] != null)
j = (j + 1) & msk;
tbl[j] = newValue;
}
private Hasher hasher = new Hasher(null, 0);
private void grow() {
final byte[][] oldTable = table;
final int oldSize = table.length;
initTable(oldSize << 1);
for (int i = 0; i < oldSize; i++) {
final byte[] obj = oldTable[i];
if (obj != null) {
hasher.init(obj, obj.length);
insert(obj, hasher.hash());
}
}
}
private void initTable(int sz) {
if (sz < 2)
sz = 2;
grow = sz >> 1;
mask = sz - 1;
table = new byte[sz][];
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('[');
for (byte[] b : table) {
if (b == null)
continue;
if (sb.length() > 1)
sb.append(" , "); //$NON-NLS-1$
sb.append('"');
sb.append(RawParseUtils.decode(b));
sb.append('"');
sb.append('(');
sb.append(chainlength(b));
sb.append(')');
}
sb.append(']');
return sb.toString();
}
private int chainlength(byte[] b) {
Hasher h = new Hasher(b, b.length);
int hash = h.hash();
final int msk = mask;
int i = hash & msk;
final byte[][] tbl = table;
byte[] obj;
int n = 0;
while ((obj = tbl[i]) != null) {
if (equals(obj, b, b.length))
return n;
i = (i + 1) & msk;
++n;
}
return -1;
}
static class Hasher {
private int hash;
private int pos;
private byte[] data;
private int length;
Hasher(byte[] data, int length) {
init(data, length);
}
void init(byte[] d, int l) {
this.data = d;
this.length = l;
pos = 0;
hash = 0;
}
int hash() {
while (pos < length)
hash = hash * 31 + data[pos++];
return hash;
}
int nextHash() {
for (;;) {
hash = hash * 31 + data[pos];
++pos;
if (pos == length || data[pos] == '/')
return hash;
}
}
int getHash() {
return hash;
}
boolean hasNext() {
return pos < length;
}
public int length() {
return pos;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < pos; ++i)
sb.append((char) data[i]);
sb.append(" | "); //$NON-NLS-1$
for (int i = pos; i < length; ++i)
sb.append((char) data[i]);
return sb.toString();
}
}
byte[][] toArray() {
byte[][] ret = new byte[size][];
int i = 0;
for (byte[] entry : table) {
if (entry != null)
ret[i++] = entry;
}
return ret;
}
}

View File

@ -44,13 +44,13 @@
package org.eclipse.jgit.treewalk.filter;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import org.eclipse.jgit.errors.StopWalkException;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.treewalk.filter.ByteArraySet.Hasher;
import org.eclipse.jgit.util.RawParseUtils;
/**
* Includes tree entries only if they match one or more configured paths.
@ -83,7 +83,8 @@ public class PathFilterGroup {
*/
public static TreeFilter createFromStrings(final Collection<String> paths) {
if (paths.isEmpty())
throw new IllegalArgumentException(JGitText.get().atLeastOnePathIsRequired);
throw new IllegalArgumentException(
JGitText.get().atLeastOnePathIsRequired);
final PathFilter[] p = new PathFilter[paths.size()];
int i = 0;
for (final String s : paths)
@ -131,7 +132,8 @@ public static TreeFilter createFromStrings(final String... paths) {
*/
public static TreeFilter create(final Collection<PathFilter> paths) {
if (paths.isEmpty())
throw new IllegalArgumentException(JGitText.get().atLeastOnePathIsRequired);
throw new IllegalArgumentException(
JGitText.get().atLeastOnePathIsRequired);
final PathFilter[] p = new PathFilter[paths.size()];
paths.toArray(p);
return create(p);
@ -177,41 +179,74 @@ public String toString() {
}
static class Group extends TreeFilter {
private static final Comparator<PathFilter> PATH_SORT = new Comparator<PathFilter>() {
public int compare(final PathFilter o1, final PathFilter o2) {
return o1.pathStr.compareTo(o2.pathStr);
private ByteArraySet fullpaths;
private ByteArraySet prefixes;
private byte[] max;
private Group(final PathFilter[] pathFilters) {
fullpaths = new ByteArraySet(pathFilters.length);
prefixes = new ByteArraySet(pathFilters.length / 5);
// 5 is an empirically derived ratio of #paths/#prefixes from:
// egit/jgit: 8
// git: 5
// linux kernel: 13
// eclipse.platform.ui: 7
max = pathFilters[0].pathRaw;
Hasher hasher = new Hasher(null, 0);
for (PathFilter pf : pathFilters) {
hasher.init(pf.pathRaw, pf.pathRaw.length);
while (hasher.hasNext()) {
int hash = hasher.nextHash();
if (hasher.hasNext())
prefixes.addIfAbsent(pf.pathRaw, hasher.length(), hash);
}
fullpaths.addIfAbsent(pf.pathRaw, pf.pathRaw.length,
hasher.getHash());
if (compare(max, pf.pathRaw) < 0)
max = pf.pathRaw;
}
};
}
private final PathFilter[] paths;
private Group(final PathFilter[] p) {
paths = p;
Arrays.sort(paths, PATH_SORT);
private static int compare(byte[] a, byte[] b) {
int i = 0;
while (i < a.length && i < b.length) {
int ba = a[i] & 0xFF;
int bb = b[i] & 0xFF;
int cmp = ba - bb;
if (cmp != 0)
return cmp;
++i;
}
return a.length - b.length;
}
@Override
public boolean include(final TreeWalk walker) {
final int n = paths.length;
for (int i = 0;;) {
final byte[] r = paths[i].pathRaw;
final int cmp = walker.isPathPrefix(r, r.length);
if (cmp == 0)
byte[] rp = walker.getRawPath();
Hasher hasher = new Hasher(rp, walker.getPathLength());
while (hasher.hasNext()) {
int hash = hasher.nextHash();
if (fullpaths.contains(rp, hasher.length(), hash))
return true;
if (++i < n)
continue;
if (cmp > 0)
throw StopWalkException.INSTANCE;
return false;
if (!hasher.hasNext())
if (prefixes.contains(rp, hasher.length(), hash))
return true;
}
final int cmp = walker.isPathPrefix(max, max.length);
if (cmp > 0)
throw StopWalkException.INSTANCE;
return false;
}
@Override
public boolean shouldBeRecursive() {
for (final PathFilter p : paths)
if (p.shouldBeRecursive())
return true;
return false;
return !prefixes.isEmpty();
}
@Override
@ -222,13 +257,17 @@ public TreeFilter clone() {
public String toString() {
final StringBuilder r = new StringBuilder();
r.append("FAST("); //$NON-NLS-1$
for (int i = 0; i < paths.length; i++) {
if (i > 0)
boolean first = true;
for (byte[] p : fullpaths.toArray()) {
if (!first) {
r.append(" OR "); //$NON-NLS-1$
r.append(paths[i].toString());
}
r.append(RawParseUtils.decode(p));
first = false;
}
r.append(")"); //$NON-NLS-1$
return r.toString();
}
}
}