From c2204bb6835e4e6dc666bb34eaea910fb1484092 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Wed, 13 Oct 2021 00:28:38 +0200 Subject: [PATCH] Make the buffer size for text/binary detection configurable The various streams used in JGit for text/binary and CR-LF detection used different buffer sizes. Most used 8000, but one used 8KiB, and one used 8096 (SIC!) bytes. Considering only the first 8kB of a file/blob is not sufficient; it may give behavior incompatible with C git. C git considers the whole blob; since it uses memory-mapped files it can do so with acceptable performance. Doing this in JGit would most likely incur a noticeable performance penalty. But 8kB is a bit small; in the file in bug 576971 the limit was hit before the first CR-LF, which occurred on line 155 at offset 9759 in the file. Make RawText.FIRST_FEW_BYTES only a default and minimum setting, and set it to 8KiB. Make the actual buffer size configurable: provide static methods getBufferSize() and setBuffersize(), and use getBufferSize() throughout instead of the constant. This enables users of the JGit library to set their own possibly larger buffer size. Bug: 576971 Change-Id: I447762c9a5147a521f73d2864ba59ed89f555d54 Signed-off-by: Thomas Wolf --- .../eclipse/jgit/diff/DiffFormatterTest.java | 16 ++--- .../org/eclipse/jgit/merge/MergerTest.java | 8 ++- .../jgit/util/io/AutoCRLFInputStreamTest.java | 4 +- .../util/io/AutoCRLFOutputStreamTest.java | 4 +- .../src/org/eclipse/jgit/diff/RawText.java | 67 +++++++++++++++---- .../jgit/util/io/AutoCRLFInputStream.java | 9 ++- .../jgit/util/io/AutoCRLFOutputStream.java | 9 ++- .../jgit/util/io/AutoLFInputStream.java | 11 +-- .../jgit/util/io/AutoLFOutputStream.java | 12 ++-- 9 files changed, 94 insertions(+), 46 deletions(-) diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java index b694f4aaf..a093cc78d 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java @@ -608,7 +608,7 @@ public void testDiffAutoCrlfSmallFile() throws Exception { public void testDiffAutoCrlfMediumFile() throws Exception { String content = mediumCrLfString(); String expectedDiff = "diff --git a/test.txt b/test.txt\n" - + "index 215c502..c10f08c 100644\n" // + + "index 6d9ffed..50d7b5a 100644\n" // + "--- a/test.txt\n" // + "+++ b/test.txt\n" // + "@@ -1,4 +1,5 @@\n" // @@ -624,7 +624,7 @@ public void testDiffAutoCrlfMediumFile() throws Exception { public void testDiffAutoCrlfLargeFile() throws Exception { String content = largeCrLfString(); String expectedDiff = "diff --git a/test.txt b/test.txt\n" - + "index 7014942..c0487a7 100644\n" // + + "index d6399a1..de26ce5 100644\n" // + "--- a/test.txt\n" // + "+++ b/test.txt\n" // + "@@ -1,4 +1,5 @@\n" @@ -665,9 +665,9 @@ private void doAutoCrLfTest(String content, String expectedDiff) private static String largeCrLfString() { String line = "012345678901234567890123456789012345678901234567\r\n"; - StringBuilder builder = new StringBuilder( - 2 * RawText.FIRST_FEW_BYTES); - while (builder.length() < 2 * RawText.FIRST_FEW_BYTES) { + int bufferSize = RawText.getBufferSize(); + StringBuilder builder = new StringBuilder(2 * bufferSize); + while (builder.length() < 2 * bufferSize) { builder.append(line); } return builder.toString(); @@ -677,9 +677,9 @@ private static String mediumCrLfString() { // Create a CR-LF string longer than RawText.FIRST_FEW_BYTES whose // canonical representation is shorter than RawText.FIRST_FEW_BYTES. String line = "01234567\r\n"; // 10 characters - StringBuilder builder = new StringBuilder( - RawText.FIRST_FEW_BYTES + line.length()); - while (builder.length() <= RawText.FIRST_FEW_BYTES) { + int bufferSize = RawText.getBufferSize(); + StringBuilder builder = new StringBuilder(bufferSize + line.length()); + while (builder.length() <= bufferSize) { builder.append(line); } return builder.toString(); diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java index 6cbb4a89b..dd8573d2b 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java @@ -33,6 +33,7 @@ import org.eclipse.jgit.api.errors.CheckoutConflictException; import org.eclipse.jgit.api.errors.GitAPIException; import org.eclipse.jgit.api.errors.JGitInternalException; +import org.eclipse.jgit.diff.RawText; import org.eclipse.jgit.dircache.DirCache; import org.eclipse.jgit.dircache.DirCacheEditor; import org.eclipse.jgit.dircache.DirCacheEntry; @@ -826,6 +827,8 @@ public void checkContentMergeLargeBinaries(MergeStrategy strategy) throws Except RevCommit sideCommit = git.commit().setAll(true) .setMessage("modified file l 1500").call(); + int originalBufferSize = RawText.getBufferSize(); + int smallBufferSize = RawText.setBufferSize(8000); try (ObjectInserter ins = db.newObjectInserter()) { // Check that we don't read the large blobs. ObjectInserter forbidInserter = new ObjectInserter.Filter() { @@ -836,7 +839,8 @@ protected ObjectInserter delegate() { @Override public ObjectReader newReader() { - return new BigReadForbiddenReader(super.newReader(), 8000); + return new BigReadForbiddenReader(super.newReader(), + smallBufferSize); } }; @@ -844,6 +848,8 @@ public ObjectReader newReader() { (ResolveMerger) strategy.newMerger(forbidInserter, db.getConfig()); boolean noProblems = merger.merge(masterCommit, sideCommit); assertFalse(noProblems); + } finally { + RawText.setBufferSize(originalBufferSize); } } diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java index ae8c7ec7a..cd4e50339 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.io.InputStream; +import org.eclipse.jgit.diff.RawText; import org.junit.Assert; import org.junit.Test; @@ -38,7 +39,8 @@ public void test() throws IOException { @Test public void testBoundary() throws IOException { - for (int i = AutoCRLFInputStream.BUFFER_SIZE - 10; i < AutoCRLFInputStream.BUFFER_SIZE + 10; i++) { + int boundary = RawText.getBufferSize(); + for (int i = boundary - 10; i < boundary + 10; i++) { String s1 = Strings.repeat("a", i); assertNoCrLf(s1, s1); String s2 = Strings.repeat("\0", i); diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java index db2f6da31..150df0845 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java @@ -19,6 +19,7 @@ import java.io.InputStream; import java.io.OutputStream; +import org.eclipse.jgit.diff.RawText; import org.junit.Assert; import org.junit.Test; @@ -40,7 +41,8 @@ public void test() throws IOException { @Test public void testBoundary() throws IOException { - for (int i = AutoCRLFOutputStream.BUFFER_SIZE - 10; i < AutoCRLFOutputStream.BUFFER_SIZE + 10; i++) { + int bufferSize = RawText.getBufferSize(); + for (int i = bufferSize - 10; i < bufferSize + 10; i++) { String s1 = Strings.repeat("a", i); assertNoCrLf(s1, s1); String s2 = Strings.repeat("\0", i); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java index d09da019d..914fa5f6f 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java @@ -17,6 +17,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.util.concurrent.atomic.AtomicInteger; import org.eclipse.jgit.errors.BinaryBlobException; import org.eclipse.jgit.errors.LargeObjectException; @@ -38,11 +39,20 @@ * they are converting from "line number" to "element index". */ public class RawText extends Sequence { + /** A RawText of length 0 */ public static final RawText EMPTY_TEXT = new RawText(new byte[0]); - /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */ - static final int FIRST_FEW_BYTES = 8000; + /** + * Default and minimum for {@link #BUFFER_SIZE}. + */ + private static final int FIRST_FEW_BYTES = 8 * 1024; + + /** + * Number of bytes to check for heuristics in {@link #isBinary(byte[])}. + */ + private static final AtomicInteger BUFFER_SIZE = new AtomicInteger( + FIRST_FEW_BYTES); /** The file content for this sequence. */ protected final byte[] content; @@ -247,6 +257,33 @@ public static boolean isBinary(byte[] raw) { return isBinary(raw, raw.length); } + /** + * Obtains the buffer size to use for analyzing whether certain content is + * text or binary, or what line endings are used if it's text. + * + * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes + * @since 6.0 + */ + public static int getBufferSize() { + return BUFFER_SIZE.get(); + } + + /** + * Sets the buffer size to use for analyzing whether certain content is text + * or binary, or what line endings are used if it's text. If the given + * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the + * buffer size to {@link #FIRST_FEW_BYTES}. + * + * @param bufferSize + * Size to set + * @return the size actually set + * @since 6.0 + */ + public static int setBufferSize(int bufferSize) { + int newSize = Math.max(FIRST_FEW_BYTES, bufferSize); + return BUFFER_SIZE.updateAndGet(curr -> newSize); + } + /** * Determine heuristically whether the bytes contained in a stream * represents binary (as opposed to text) content. @@ -263,7 +300,7 @@ public static boolean isBinary(byte[] raw) { * if input stream could not be read */ public static boolean isBinary(InputStream raw) throws IOException { - final byte[] buffer = new byte[FIRST_FEW_BYTES]; + final byte[] buffer = new byte[getBufferSize()]; int cnt = 0; while (cnt < buffer.length) { final int n = raw.read(buffer, cnt, buffer.length - cnt); @@ -287,13 +324,16 @@ public static boolean isBinary(InputStream raw) throws IOException { * @return true if raw is likely to be a binary file, false otherwise */ public static boolean isBinary(byte[] raw, int length) { - // Same heuristic as C Git - if (length > FIRST_FEW_BYTES) - length = FIRST_FEW_BYTES; - for (int ptr = 0; ptr < length; ptr++) - if (raw[ptr] == '\0') + // Same heuristic as C Git (except for the buffer size) + int maxLength = getBufferSize(); + if (length > maxLength) { + length = maxLength; + } + for (int ptr = 0; ptr < length; ptr++) { + if (raw[ptr] == '\0') { return true; - + } + } return false; } @@ -329,7 +369,7 @@ public static boolean isCrLfText(byte[] raw) { * @since 5.3 */ public static boolean isCrLfText(InputStream raw) throws IOException { - byte[] buffer = new byte[FIRST_FEW_BYTES]; + byte[] buffer = new byte[getBufferSize()]; int cnt = 0; while (cnt < buffer.length) { int n = raw.read(buffer, cnt, buffer.length - cnt); @@ -409,15 +449,16 @@ public static RawText load(ObjectLoader ldr, int threshold) throw new BinaryBlobException(); } - if (sz <= FIRST_FEW_BYTES) { - byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES); + int bufferSize = getBufferSize(); + if (sz <= bufferSize) { + byte[] data = ldr.getCachedBytes(bufferSize); if (isBinary(data)) { throw new BinaryBlobException(); } return new RawText(data); } - byte[] head = new byte[FIRST_FEW_BYTES]; + byte[] head = new byte[bufferSize]; try (InputStream stream = ldr.openStream()) { int off = 0; int left = head.length; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java index 9da890343..1b03d097b 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java @@ -21,16 +21,15 @@ * * Existing CRLF are not expanded to CRCRLF, but retained as is. * - * Optionally, a binary check on the first 8000 bytes is performed and in case - * of binary files, canonicalization is turned off (for the complete file). + * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes + * is performed and in case of binary files, canonicalization is turned off (for + * the complete file). */ public class AutoCRLFInputStream extends InputStream { - static final int BUFFER_SIZE = 8096; - private final byte[] single = new byte[1]; - private final byte[] buf = new byte[BUFFER_SIZE]; + private final byte[] buf = new byte[RawText.getBufferSize()]; private final InputStream in; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java index 97fe01e5d..05e271feb 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java @@ -20,18 +20,17 @@ * * Existing CRLF are not expanded to CRCRLF, but retained as is. * - * A binary check on the first 8000 bytes is performed and in case of binary - * files, canonicalization is turned off (for the complete file). + * A binary check on the first {@link RawText#getBufferSize()} bytes is + * performed and in case of binary files, canonicalization is turned off (for + * the complete file). */ public class AutoCRLFOutputStream extends OutputStream { - static final int BUFFER_SIZE = 8000; - private final OutputStream out; private int buf = -1; - private byte[] binbuf = new byte[BUFFER_SIZE]; + private byte[] binbuf = new byte[RawText.getBufferSize()]; private byte[] onebytebuf = new byte[1]; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java index 0e335a9dc..b6d1848b3 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java @@ -25,10 +25,11 @@ * Existing single CR are not changed to LF but are retained as is. *

*

- * Optionally, a binary check on the first 8kB is performed and in case of - * binary files, canonicalization is turned off (for the complete file). If - * binary checking determines that the input is CR/LF-delimited text and the - * stream has been created for checkout, canonicalization is also turned off. + * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes + * is performed and in case of binary files, canonicalization is turned off (for + * the complete file). If binary checking determines that the input is + * CR/LF-delimited text and the stream has been created for checkout, + * canonicalization is also turned off. *

* * @since 4.3 @@ -64,7 +65,7 @@ public enum StreamFlag { private final byte[] single = new byte[1]; - private final byte[] buf = new byte[8 * 1024]; + private final byte[] buf = new byte[RawText.getBufferSize()]; private final InputStream in; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java index 195fdb421..e08a53f50 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java @@ -22,23 +22,21 @@ * Existing single CR are not changed to LF, but retained as is. *

*

- * A binary check on the first 8000 bytes is performed and in case of binary - * files, canonicalization is turned off (for the complete file). If the binary - * check determines that the input is not binary but text with CR/LF, - * canonicalization is also turned off. + * A binary check on the first {@link RawText#getBufferSize()} bytes is + * performed and in case of binary files, canonicalization is turned off (for + * the complete file). If the binary check determines that the input is not + * binary but text with CR/LF, canonicalization is also turned off. *

* * @since 4.3 */ public class AutoLFOutputStream extends OutputStream { - static final int BUFFER_SIZE = 8000; - private final OutputStream out; private int buf = -1; - private byte[] binbuf = new byte[BUFFER_SIZE]; + private byte[] binbuf = new byte[RawText.getBufferSize()]; private byte[] onebytebuf = new byte[1];