Merge changes from topic 'dfs-gc'

* changes:
  Prune UNREACHABLE_GARBAGE packs when they expire
  Use try-with-resources in DfsGarbageCollector.writePack
  Fix lastModified to be consistent in DfsGarbageCollector
  Add GC_REST PackSource to better order DFS packs
This commit is contained in:
Shawn Pearce 2016-06-27 11:52:00 -04:00 committed by Gerrit Code Review @ Eclipse.org
commit ec5190e2e3
3 changed files with 377 additions and 46 deletions

View File

@ -0,0 +1,239 @@
package org.eclipse.jgit.internal.storage.dfs;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.INSERT;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.UNREACHABLE_GARBAGE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource;
import org.eclipse.jgit.junit.TestRepository;
import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.Ref;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevWalk;
import org.junit.Before;
import org.junit.Test;
public class DfsGarbageCollectorTest {
private TestRepository<InMemoryRepository> git;
private InMemoryRepository repo;
private DfsObjDatabase odb;
/**
 * Creates a fresh in-memory DFS repository named "test" and caches the
 * repository and its object database for use by each test.
 */
@Before
public void setUp() throws IOException {
	InMemoryRepository backing = new InMemoryRepository(
			new DfsRepositoryDescription("test"));
	git = new TestRepository<>(backing);
	repo = git.getRepository();
	odb = repo.getObjectDatabase();
}
/**
 * With every commit reachable from a ref, GC must replace the INSERT packs
 * with a single GC pack that holds all of the commits.
 */
@Test
public void testCollectionWithNoGarbage() throws Exception {
	RevCommit commit0 = commit().message("0").create();
	RevCommit commit1 = commit().message("1").parent(commit0).create();
	git.update("master", commit1);

	assertTrue("commit0 reachable", isReachable(repo, commit0));
	assertTrue("commit1 reachable", isReachable(repo, commit1));

	// Before any GC runs, every pack came from an inserter.
	assertEquals(2, odb.getPacks().length);
	for (DfsPackFile inserted : odb.getPacks()) {
		PackSource src = inserted.getPackDescription().getPackSource();
		assertEquals(INSERT, src);
	}

	gcNoTtl();

	// After GC exactly one pack remains, and it carries both commits.
	assertEquals(1, odb.getPacks().length);
	DfsPackFile only = odb.getPacks()[0];
	PackSource onlySource = only.getPackDescription().getPackSource();
	assertEquals(GC, onlySource);
	assertTrue("commit0 in pack", isObjectInPack(commit0, only));
	assertTrue("commit1 in pack", isObjectInPack(commit1, only));
}
/**
 * With one commit unreachable, GC must produce exactly two packs: a GC
 * pack with the reachable commit and an UNREACHABLE_GARBAGE pack with the
 * unreachable one.
 */
@Test
public void testCollectionWithGarbage() throws Exception {
	RevCommit commit0 = commit().message("0").create();
	RevCommit commit1 = commit().message("1").parent(commit0).create();
	git.update("master", commit0);

	assertTrue("commit0 reachable", isReachable(repo, commit0));
	assertFalse("commit1 garbage", isReachable(repo, commit1));
	gcNoTtl();

	assertEquals(2, odb.getPacks().length);

	// Sort the surviving packs by source; any other source is a failure.
	DfsPackFile gc = null;
	DfsPackFile garbage = null;
	for (DfsPackFile candidate : odb.getPacks()) {
		PackSource src = candidate.getPackDescription().getPackSource();
		if (src == GC) {
			gc = candidate;
			continue;
		}
		if (src == UNREACHABLE_GARBAGE) {
			garbage = candidate;
			continue;
		}
		fail("unexpected " + src);
	}

	assertNotNull("created GC pack", gc);
	assertTrue(isObjectInPack(commit0, gc));

	assertNotNull("created UNREACHABLE_GARBAGE pack", garbage);
	assertTrue(isObjectInPack(commit1, garbage));
}
/**
 * Runs one GC without a TTL and two with a TTL, checking that the garbage
 * commit stays readable after the first TTL run and that the oldest
 * UNREACHABLE_GARBAGE pack is gone after the second.
 */
@Test
public void testCollectionWithGarbageAndGarbagePacksPurged()
		throws Exception {
	RevCommit commit0 = commit().message("0").create();
	RevCommit commit1 = commit().message("1").parent(commit0).create();
	git.update("master", commit0);

	gcNoTtl();
	gcWithTtl();

	// The repository has an UNREACHABLE_GARBAGE pack that could have
	// expired, but since we never purge the most recent UNREACHABLE_GARBAGE
	// pack, it must have survived the GC.
	boolean garbageStillReadable = false;
	for (DfsPackFile pack : odb.getPacks()) {
		PackSource src = pack.getPackDescription().getPackSource();
		if (src == GC) {
			assertTrue("has commit0", isObjectInPack(commit0, pack));
			assertFalse("no commit1", isObjectInPack(commit1, pack));
			continue;
		}
		if (src == UNREACHABLE_GARBAGE) {
			if (isObjectInPack(commit1, pack)) {
				garbageStillReadable = true;
			}
			continue;
		}
		fail("unexpected " + src);
	}
	assertTrue("garbage commit1 still readable", garbageStillReadable);

	// Find oldest UNREACHABLE_GARBAGE; it will be pruned by next GC.
	DfsPackDescription oldestGarbagePack = null;
	for (DfsPackFile pack : odb.getPacks()) {
		DfsPackDescription desc = pack.getPackDescription();
		if (desc.getPackSource() != UNREACHABLE_GARBAGE) {
			continue;
		}
		oldestGarbagePack = oldestPack(oldestGarbagePack, desc);
	}
	assertNotNull("has UNREACHABLE_GARBAGE", oldestGarbagePack);

	gcWithTtl();

	// The pack identified above must no longer be listed.
	assertTrue("has packs", odb.getPacks().length > 0);
	for (DfsPackFile survivor : odb.getPacks()) {
		assertNotEquals(oldestGarbagePack, survivor.getPackDescription());
	}
}
/**
 * Adds unreachable commits over three GC rounds with the TTL disabled and
 * checks that the garbage is always folded into one UNREACHABLE_GARBAGE
 * pack.
 */
@Test
public void testCollectionWithGarbageCoalescence() throws Exception {
	RevCommit commit0 = commit().message("0").create();
	RevCommit garbageTip = commit().message("1").parent(commit0).create();
	git.update("master", commit0);

	int round = 0;
	while (round < 3) {
		garbageTip = commit().message("g" + round).parent(garbageTip)
				.create();

		// Make sure we don't have more than 1 UNREACHABLE_GARBAGE pack
		// because they're coalesced.
		gcNoTtl();
		assertEquals(1, countPacks(UNREACHABLE_GARBAGE));
		round++;
	}
}
/**
 * With the coalesce limit set to 0, each GC round must leave one more
 * UNREACHABLE_GARBAGE pack than the round before.
 */
@Test
public void testCollectionWithGarbageNoCoalescence() throws Exception {
	RevCommit commit0 = commit().message("0").create();
	RevCommit garbageTip = commit().message("1").parent(commit0).create();
	git.update("master", commit0);

	for (int round = 0; round < 3; round++) {
		garbageTip = commit().message("g" + round).parent(garbageTip)
				.create();

		// Coalescing off, TTL off: nothing is merged and nothing is pruned.
		DfsGarbageCollector collector = new DfsGarbageCollector(repo)
				.setCoalesceGarbageLimit(0)
				.setGarbageTtl(0, TimeUnit.MILLISECONDS);
		run(collector);

		assertEquals(1 + round, countPacks(UNREACHABLE_GARBAGE));
	}
}
/**
 * Shorthand for {@code git.commit()}.
 *
 * @return the commit builder returned by the test repository.
 */
private TestRepository<InMemoryRepository>.CommitBuilder commit() {
return git.commit();
}
/**
 * Runs one garbage collection with the garbage TTL set to 0, which
 * disables TTL-based pruning.
 */
private void gcNoTtl() throws IOException {
	// A TTL of 0 disables the TTL.
	run(new DfsGarbageCollector(repo)
			.setGarbageTtl(0, TimeUnit.MILLISECONDS));
}
/**
 * Runs one garbage collection with a 1 millisecond garbage TTL, after
 * first waiting for the wall clock to advance.
 */
private void gcWithTtl() throws InterruptedException, IOException {
	// Wait for the system clock to move by at least 1 millisecond.
	// This allows the DfsGarbageCollector to recognize the boundary.
	final long before = System.currentTimeMillis();
	for (;;) {
		Thread.sleep(10);
		if (System.currentTimeMillis() > before) {
			break;
		}
	}

	run(new DfsGarbageCollector(repo)
			.setGarbageTtl(1, TimeUnit.MILLISECONDS));
}
/**
 * Executes the given collector, asserts that it reports success, then
 * clears the object database cache so later reads see the new pack list.
 *
 * @param gc
 *            the configured collector to execute.
 */
private void run(DfsGarbageCollector gc) throws IOException {
	boolean repacked = gc.pack(null);
	assertTrue("gc repacked", repacked);
	odb.clearCache();
}
/**
 * Walks the commit graph from every ref in the repository and reports
 * whether the given object is encountered.
 *
 * @param repo
 *            repository whose refs seed the walk.
 * @param id
 *            object to look for among the walked commits.
 * @return {@code true} if the walk yields a commit equal to {@code id}.
 */
private static boolean isReachable(Repository repo, AnyObjectId id)
		throws IOException {
	try (RevWalk walk = new RevWalk(repo)) {
		for (Ref ref : repo.getAllRefs().values()) {
			walk.markStart(walk.parseCommit(ref.getObjectId()));
		}
		RevCommit current = walk.next();
		while (current != null) {
			if (AnyObjectId.equals(current, id)) {
				return true;
			}
			current = walk.next();
		}
		return false;
	}
}
/**
 * Checks whether a pack contains an object, using a short-lived reader on
 * the test object database.
 *
 * @param id
 *            object to look up.
 * @param pack
 *            pack to search.
 * @return the result of {@code pack.hasObject(reader, id)}.
 */
private boolean isObjectInPack(AnyObjectId id, DfsPackFile pack)
		throws IOException {
	boolean present;
	try (DfsReader ctx = new DfsReader(odb)) {
		present = pack.hasObject(ctx, id);
	}
	return present;
}
/**
 * Picks the pack description with the smaller last-modified time.
 *
 * @param a
 *            current candidate, or {@code null} if there is none yet.
 * @param b
 *            description to compare against; returned on ties and when
 *            {@code a} is {@code null}.
 * @return {@code a} only if it is strictly older than {@code b}.
 */
private static DfsPackDescription oldestPack(DfsPackDescription a,
		DfsPackDescription b) {
	if (a == null) {
		return b;
	}
	return a.getLastModified() < b.getLastModified() ? a : b;
}
/**
 * Counts the packs in the object database that have the given source.
 *
 * @param source
 *            pack source to match.
 * @return number of matching packs.
 */
private int countPacks(PackSource source) throws IOException {
	int matches = 0;
	for (DfsPackFile candidate : odb.getPacks()) {
		PackSource actual = candidate.getPackDescription().getPackSource();
		if (actual != source) {
			continue;
		}
		matches++;
	}
	return matches;
}
}

View File

@ -44,6 +44,7 @@
package org.eclipse.jgit.internal.storage.dfs;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC_REST;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC_TXN;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.UNREACHABLE_GARBAGE;
import static org.eclipse.jgit.internal.storage.pack.PackExt.BITMAP_INDEX;
@ -53,9 +54,11 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource;
@ -92,9 +95,13 @@ public class DfsGarbageCollector {
private PackConfig packConfig;
// See pack(), below, for how these two variables interact.
private long coalesceGarbageLimit = 50 << 20;
private long garbageTtlMillis = TimeUnit.DAYS.toMillis(1);
private long startTimeMillis;
private List<DfsPackFile> packsBefore;
private List<DfsPackFile> expiredGarbagePacks;
private Set<ObjectId> allHeads;
private Set<ObjectId> nonHeads;
@ -166,6 +173,34 @@ public DfsGarbageCollector setCoalesceGarbageLimit(long limit) {
return this;
}
/**
 * Get the time to live for garbage packs.
 *
 * @return garbage packs older than this limit (in milliseconds) will be
 *         pruned as part of the garbage collection process if the value is
 *         {@code > 0}, otherwise garbage packs are retained.
 */
public long getGarbageTtlMillis() {
return garbageTtlMillis;
}
/**
 * Set the time to live for garbage objects.
 * <p>
 * Any UNREACHABLE_GARBAGE older than this limit will be pruned at the end
 * of the run.
 * <p>
 * If {@code ttl} is set to 0, UNREACHABLE_GARBAGE purging is disabled.
 *
 * @param ttl
 *            time to live, expressed in {@code unit}.
 * @param unit
 *            the time unit of {@code ttl}; the value is stored after
 *            conversion to milliseconds.
 * @return {@code this}
 */
public DfsGarbageCollector setGarbageTtl(long ttl, TimeUnit unit) {
garbageTtlMillis = unit.toMillis(ttl);
return this;
}
/**
* Create a single new pack file containing all of the live objects.
* <p>
@ -188,16 +223,28 @@ public boolean pack(ProgressMonitor pm) throws IOException {
if (packConfig.getIndexVersion() != 2)
throw new IllegalStateException(
JGitText.get().supportOnlyPackIndexVersion2);
if (garbageTtlMillis > 0) {
// We disable coalescing because the coalescing step will keep
// refreshing the UNREACHABLE_GARBAGE pack and we wouldn't
// actually prune anything.
coalesceGarbageLimit = 0;
}
startTimeMillis = System.currentTimeMillis();
ctx = (DfsReader) objdb.newReader();
try {
refdb.refresh();
objdb.clearCache();
Collection<Ref> refsBefore = getAllRefs();
packsBefore = packsToRebuild();
if (packsBefore.isEmpty())
readPacksBefore();
if (packsBefore.isEmpty()) {
if (!expiredGarbagePacks.isEmpty()) {
objdb.commitPack(noPacks(), toPrune());
}
return true;
}
allHeads = new HashSet<ObjectId>();
nonHeads = new HashSet<ObjectId>();
@ -252,17 +299,60 @@ private Collection<Ref> getAllRefs() throws IOException {
return refs;
}
private List<DfsPackFile> packsToRebuild() throws IOException {
private void readPacksBefore() throws IOException {
DfsPackFile[] packs = objdb.getPacks();
List<DfsPackFile> out = new ArrayList<DfsPackFile>(packs.length);
packsBefore = new ArrayList<DfsPackFile>(packs.length);
expiredGarbagePacks = new ArrayList<DfsPackFile>(packs.length);
long mostRecentGC = mostRecentGC(packs);
long now = System.currentTimeMillis();
for (DfsPackFile p : packs) {
DfsPackDescription d = p.getPackDescription();
if (d.getPackSource() != UNREACHABLE_GARBAGE)
out.add(p);
else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit)
out.add(p);
if (d.getPackSource() != UNREACHABLE_GARBAGE) {
packsBefore.add(p);
} else if (packIsExpiredGarbage(d, mostRecentGC, now)) {
expiredGarbagePacks.add(p);
} else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit) {
packsBefore.add(p);
}
}
return out;
}
/**
 * Finds the newest last-modified time among the GC and GC_REST packs.
 *
 * @param packs
 *            packs to inspect.
 * @return the largest last-modified value seen on a GC or GC_REST pack, or
 *         0 if there is no such pack.
 */
private static long mostRecentGC(DfsPackFile[] packs) {
	long newest = 0;
	for (DfsPackFile pack : packs) {
		DfsPackDescription desc = pack.getPackDescription();
		if (desc.getPackSource() != GC
				&& desc.getPackSource() != GC_REST) {
			continue;
		}
		long modified = desc.getLastModified();
		if (modified > newest) {
			newest = modified;
		}
	}
	return newest;
}
/**
 * Decides whether an UNREACHABLE_GARBAGE pack may be removed.
 *
 * @param d
 *            description of the pack under consideration.
 * @param mostRecentGC
 *            last-modified time of the newest GC or GC_REST pack.
 * @param now
 *            current time in milliseconds.
 * @return {@code true} only if the pack is UNREACHABLE_GARBAGE, predates
 *         {@code mostRecentGC}, the TTL is enabled, and the pack is at
 *         least {@code garbageTtlMillis} old.
 */
private boolean packIsExpiredGarbage(DfsPackDescription d,
		long mostRecentGC, long now) {
	// Only garbage packs are candidates for expiry.
	if (d.getPackSource() != UNREACHABLE_GARBAGE) {
		return false;
	}

	// (a) The pack must predate the most recent prior run of this class.
	// That ensures the graph traversal had a chance to consider every
	// object in the pack and copy it into a GC or GC_REST pack if the
	// graph still had live edges to it.
	//
	// This is safe because of the packing order: the GC packs are
	// written first and the UNREACHABLE_GARBAGE pack afterwards, so any
	// garbage pack dated earlier than the GC was input to that GC's
	// traversal.
	if (d.getLastModified() >= mostRecentGC) {
		return false;
	}

	// A TTL of zero or less means pruning is disabled.
	if (garbageTtlMillis <= 0) {
		return false;
	}

	// (b) The pack must be at least garbageTtlMillis old. That gives
	// concurrent inserter threads enough time to notice an object is not
	// in the graph and write a new copy, rather than relying on one held
	// in an UNREACHABLE_GARBAGE pack.
	//
	// Both (a) and (b) must hold before the pack can be removed.
	long age = now - d.getLastModified();
	return age >= garbageTtlMillis;
}
/** @return all of the source packs that fed into this compaction. */
@ -283,8 +373,12 @@ public List<PackStatistics> getNewPackStatistics() {
/**
 * Collects the descriptions of every pack this run will remove.
 *
 * @return descriptions of the packs in {@code packsBefore} followed by
 *         those in {@code expiredGarbagePacks}.
 */
private List<DfsPackDescription> toPrune() {
	// Size the list for both sources so adding the expired packs does not
	// force a resize.
	int cnt = packsBefore.size() + expiredGarbagePacks.size();
	List<DfsPackDescription> all = new ArrayList<DfsPackDescription>(cnt);
	for (DfsPackFile pack : packsBefore) {
		all.add(pack.getPackDescription());
	}
	for (DfsPackFile pack : expiredGarbagePacks) {
		all.add(pack.getPackDescription());
	}
	return all;
}
@ -299,6 +393,7 @@ private void packHeads(ProgressMonitor pm) throws IOException {
writePack(GC, pw, pm);
}
}
private void packRest(ProgressMonitor pm) throws IOException {
if (nonHeads.isEmpty())
return;
@ -308,7 +403,7 @@ private void packRest(ProgressMonitor pm) throws IOException {
pw.excludeObjects(packedObjs);
pw.preparePack(pm, nonHeads, allHeads);
if (0 < pw.getObjectCount())
writePack(GC, pw, pm);
writePack(GC_REST, pw, pm);
}
}
@ -326,7 +421,6 @@ private void packRefTreeGraph(ProgressMonitor pm) throws IOException {
}
private void packGarbage(ProgressMonitor pm) throws IOException {
// TODO(sop) This is ugly. The garbage pack needs to be deleted.
PackConfig cfg = new PackConfig(packConfig);
cfg.setReuseDeltas(true);
cfg.setReuseObjects(true);
@ -383,47 +477,42 @@ private PackWriter newPackWriter() {
private DfsPackDescription writePack(PackSource source, PackWriter pw,
ProgressMonitor pm) throws IOException {
DfsOutputStream out;
DfsPackDescription pack = repo.getObjectDatabase().newPack(source);
newPackDesc.add(pack);
out = objdb.writeFile(pack, PACK);
try {
try (DfsOutputStream out = objdb.writeFile(pack, PACK)) {
pw.writePack(pm, pm, out);
pack.addFileExt(PACK);
} finally {
out.close();
}
out = objdb.writeFile(pack, INDEX);
try {
CountingOutputStream cnt = new CountingOutputStream(out);
try (DfsOutputStream out = objdb.writeFile(pack, INDEX);
CountingOutputStream cnt = new CountingOutputStream(out)) {
pw.writeIndex(cnt);
pack.addFileExt(INDEX);
pack.setFileSize(INDEX, cnt.getCount());
pack.setIndexVersion(pw.getIndexVersion());
} finally {
out.close();
}
if (pw.prepareBitmapIndex(pm)) {
out = objdb.writeFile(pack, BITMAP_INDEX);
try {
CountingOutputStream cnt = new CountingOutputStream(out);
try (DfsOutputStream out = objdb.writeFile(pack, BITMAP_INDEX);
CountingOutputStream cnt = new CountingOutputStream(out)) {
pw.writeBitmapIndex(cnt);
pack.addFileExt(BITMAP_INDEX);
pack.setFileSize(BITMAP_INDEX, cnt.getCount());
} finally {
out.close();
}
}
PackStatistics stats = pw.getStatistics();
pack.setPackStats(stats);
pack.setLastModified(startTimeMillis);
newPackStats.add(stats);
newPackObj.add(pw.getObjectSet());
DfsBlockCache.getInstance().getOrCreate(pack, null);
return pack;
}
/**
 * @return an empty list of pack descriptions, for commits that add no new
 *         packs.
 */
private static List<DfsPackDescription> noPacks() {
	return Collections.<DfsPackDescription> emptyList();
}
}

View File

@ -79,24 +79,6 @@ public static enum PackSource {
*/
RECEIVE(0),
/**
* Pack was created by Git garbage collection by this implementation.
* <p>
* This source is only used by the {@link DfsGarbageCollector} when it
* builds a pack file by traversing the object graph and copying all
* reachable objects into a new pack stream.
*
* @see DfsGarbageCollector
*/
GC(1),
/**
* RefTreeGraph pack was created by Git garbage collection.
*
* @see DfsGarbageCollector
*/
GC_TXN(1),
/**
* The pack was created by compacting multiple packs together.
* <p>
@ -108,6 +90,27 @@ public static enum PackSource {
*/
COMPACT(1),
/**
* Pack was created by Git garbage collection by this implementation.
* <p>
* This source is only used by the {@link DfsGarbageCollector} when it
* builds a pack file by traversing the object graph and copying all
* reachable objects into a new pack stream.
*
* @see DfsGarbageCollector
*/
GC(2),
/** Created from non-heads by {@link DfsGarbageCollector}. */
GC_REST(3),
/**
* RefTreeGraph pack was created by Git garbage collection.
*
* @see DfsGarbageCollector
*/
GC_TXN(4),
/**
* Pack was created by Git garbage collection.
* <p>
@ -115,7 +118,7 @@ public static enum PackSource {
* last GC pass. It is retained in a new pack until it is safe to prune
* these objects from the repository.
*/
UNREACHABLE_GARBAGE(2);
UNREACHABLE_GARBAGE(5);
final int category;