From c2204bb6835e4e6dc666bb34eaea910fb1484092 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomas.wolf@paranor.ch>
Date: Wed, 13 Oct 2021 00:28:38 +0200
Subject: [PATCH] Make the buffer size for text/binary detection configurable

The various streams used in JGit for text/binary and CR-LF detection
used different buffer sizes. Most used 8000, but one used 8KiB, and one
used 8096 (SIC!) bytes.

Considering only the first 8kB of a file/blob is not sufficient; it
may give behavior incompatible with C git. C git considers the whole
blob; since it uses memory-mapped files it can do so with acceptable
performance. Doing this in JGit would most likely incur a noticeable
performance penalty. But 8kB is a bit small; in the file in bug 576971
the limit was hit before the first CR-LF, which occurred on line 155
at offset 9759 in the file.

Make RawText.FIRST_FEW_BYTES only a default and minimum setting, and
set it to 8KiB. Make the actual buffer size configurable: provide
static methods getBufferSize() and setBuffersize(), and use
getBufferSize() throughout instead of the constant.

This enables users of the JGit library to set their own possibly larger
buffer size.

Bug: 576971
Change-Id: I447762c9a5147a521f73d2864ba59ed89f555d54
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
---
 .../eclipse/jgit/diff/DiffFormatterTest.java  | 16 ++---
 .../org/eclipse/jgit/merge/MergerTest.java    |  8 ++-
 .../jgit/util/io/AutoCRLFInputStreamTest.java |  4 +-
 .../util/io/AutoCRLFOutputStreamTest.java     |  4 +-
 .../src/org/eclipse/jgit/diff/RawText.java    | 67 +++++++++++++++----
 .../jgit/util/io/AutoCRLFInputStream.java     |  9 ++-
 .../jgit/util/io/AutoCRLFOutputStream.java    |  9 ++-
 .../jgit/util/io/AutoLFInputStream.java       | 11 +--
 .../jgit/util/io/AutoLFOutputStream.java      | 12 ++--
 9 files changed, 94 insertions(+), 46 deletions(-)

diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java
index b694f4aaf..a093cc78d 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/DiffFormatterTest.java
@@ -608,7 +608,7 @@ public void testDiffAutoCrlfSmallFile() throws Exception {
 	public void testDiffAutoCrlfMediumFile() throws Exception {
 		String content = mediumCrLfString();
 		String expectedDiff = "diff --git a/test.txt b/test.txt\n"
-				+ "index 215c502..c10f08c 100644\n" //
+				+ "index 6d9ffed..50d7b5a 100644\n" //
 				+ "--- a/test.txt\n" //
 				+ "+++ b/test.txt\n" //
 				+ "@@ -1,4 +1,5 @@\n" //
@@ -624,7 +624,7 @@ public void testDiffAutoCrlfMediumFile() throws Exception {
 	public void testDiffAutoCrlfLargeFile() throws Exception {
 		String content = largeCrLfString();
 		String expectedDiff = "diff --git a/test.txt b/test.txt\n"
-				+ "index 7014942..c0487a7 100644\n" //
+				+ "index d6399a1..de26ce5 100644\n" //
 				+ "--- a/test.txt\n" //
 				+ "+++ b/test.txt\n" //
 				+ "@@ -1,4 +1,5 @@\n"
@@ -665,9 +665,9 @@ private void doAutoCrLfTest(String content, String expectedDiff)
 
 	private static String largeCrLfString() {
 		String line = "012345678901234567890123456789012345678901234567\r\n";
-		StringBuilder builder = new StringBuilder(
-				2 * RawText.FIRST_FEW_BYTES);
-		while (builder.length() < 2 * RawText.FIRST_FEW_BYTES) {
+		int bufferSize = RawText.getBufferSize();
+		StringBuilder builder = new StringBuilder(2 * bufferSize);
+		while (builder.length() < 2 * bufferSize) {
 			builder.append(line);
 		}
 		return builder.toString();
@@ -677,9 +677,9 @@ private static String mediumCrLfString() {
 		// Create a CR-LF string longer than RawText.FIRST_FEW_BYTES whose
 		// canonical representation is shorter than RawText.FIRST_FEW_BYTES.
 		String line = "01234567\r\n"; // 10 characters
-		StringBuilder builder = new StringBuilder(
-				RawText.FIRST_FEW_BYTES + line.length());
-		while (builder.length() <= RawText.FIRST_FEW_BYTES) {
+		int bufferSize = RawText.getBufferSize();
+		StringBuilder builder = new StringBuilder(bufferSize + line.length());
+		while (builder.length() <= bufferSize) {
 			builder.append(line);
 		}
 		return builder.toString();
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java
index 6cbb4a89b..dd8573d2b 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/merge/MergerTest.java
@@ -33,6 +33,7 @@
 import org.eclipse.jgit.api.errors.CheckoutConflictException;
 import org.eclipse.jgit.api.errors.GitAPIException;
 import org.eclipse.jgit.api.errors.JGitInternalException;
+import org.eclipse.jgit.diff.RawText;
 import org.eclipse.jgit.dircache.DirCache;
 import org.eclipse.jgit.dircache.DirCacheEditor;
 import org.eclipse.jgit.dircache.DirCacheEntry;
@@ -826,6 +827,8 @@ public void checkContentMergeLargeBinaries(MergeStrategy strategy) throws Except
 		RevCommit sideCommit = git.commit().setAll(true)
 			.setMessage("modified file l 1500").call();
 
+		int originalBufferSize = RawText.getBufferSize();
+		int smallBufferSize = RawText.setBufferSize(8000);
 		try (ObjectInserter ins = db.newObjectInserter()) {
 			// Check that we don't read the large blobs.
 			ObjectInserter forbidInserter = new ObjectInserter.Filter() {
@@ -836,7 +839,8 @@ protected ObjectInserter delegate() {
 
 				@Override
 				public ObjectReader newReader() {
-					return new BigReadForbiddenReader(super.newReader(), 8000);
+					return new BigReadForbiddenReader(super.newReader(),
+							smallBufferSize);
 				}
 			};
 
@@ -844,6 +848,8 @@ public ObjectReader newReader() {
 				(ResolveMerger) strategy.newMerger(forbidInserter, db.getConfig());
 			boolean noProblems = merger.merge(masterCommit, sideCommit);
 			assertFalse(noProblems);
+		} finally {
+			RawText.setBufferSize(originalBufferSize);
 		}
 	}
 
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java
index ae8c7ec7a..cd4e50339 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFInputStreamTest.java
@@ -17,6 +17,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.eclipse.jgit.diff.RawText;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -38,7 +39,8 @@ public void test() throws IOException {
 
 	@Test
 	public void testBoundary() throws IOException {
-		for (int i = AutoCRLFInputStream.BUFFER_SIZE - 10; i < AutoCRLFInputStream.BUFFER_SIZE + 10; i++) {
+		int boundary = RawText.getBufferSize();
+		for (int i = boundary - 10; i < boundary + 10; i++) {
 			String s1 = Strings.repeat("a", i);
 			assertNoCrLf(s1, s1);
 			String s2 = Strings.repeat("\0", i);
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java
index db2f6da31..150df0845 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/util/io/AutoCRLFOutputStreamTest.java
@@ -19,6 +19,7 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 
+import org.eclipse.jgit.diff.RawText;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -40,7 +41,8 @@ public void test() throws IOException {
 
 	@Test
 	public void testBoundary() throws IOException {
-		for (int i = AutoCRLFOutputStream.BUFFER_SIZE - 10; i < AutoCRLFOutputStream.BUFFER_SIZE + 10; i++) {
+		int bufferSize = RawText.getBufferSize();
+		for (int i = bufferSize - 10; i < bufferSize + 10; i++) {
 			String s1 = Strings.repeat("a", i);
 			assertNoCrLf(s1, s1);
 			String s2 = Strings.repeat("\0", i);
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
index d09da019d..914fa5f6f 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
@@ -17,6 +17,7 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.eclipse.jgit.errors.BinaryBlobException;
 import org.eclipse.jgit.errors.LargeObjectException;
@@ -38,11 +39,20 @@
  * they are converting from "line number" to "element index".
  */
 public class RawText extends Sequence {
+
 	/** A RawText of length 0 */
 	public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
 
-	/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
-	static final int FIRST_FEW_BYTES = 8000;
+	/**
+	 * Default and minimum for {@link #BUFFER_SIZE}.
+	 */
+	private static final int FIRST_FEW_BYTES = 8 * 1024;
+
+	/**
+	 * Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
+	 */
+	private static final AtomicInteger BUFFER_SIZE = new AtomicInteger(
+			FIRST_FEW_BYTES);
 
 	/** The file content for this sequence. */
 	protected final byte[] content;
@@ -247,6 +257,33 @@ public static boolean isBinary(byte[] raw) {
 		return isBinary(raw, raw.length);
 	}
 
+	/**
+	 * Obtains the buffer size to use for analyzing whether certain content is
+	 * text or binary, or what line endings are used if it's text.
+	 *
+	 * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes
+	 * @since 6.0
+	 */
+	public static int getBufferSize() {
+		return BUFFER_SIZE.get();
+	}
+
+	/**
+	 * Sets the buffer size to use for analyzing whether certain content is text
+	 * or binary, or what line endings are used if it's text. If the given
+	 * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the
+	 * buffer size to {@link #FIRST_FEW_BYTES}.
+	 *
+	 * @param bufferSize
+	 *            Size to set
+	 * @return the size actually set
+	 * @since 6.0
+	 */
+	public static int setBufferSize(int bufferSize) {
+		int newSize = Math.max(FIRST_FEW_BYTES, bufferSize);
+		return BUFFER_SIZE.updateAndGet(curr -> newSize);
+	}
+
 	/**
 	 * Determine heuristically whether the bytes contained in a stream
 	 * represents binary (as opposed to text) content.
@@ -263,7 +300,7 @@ public static boolean isBinary(byte[] raw) {
 	 *             if input stream could not be read
 	 */
 	public static boolean isBinary(InputStream raw) throws IOException {
-		final byte[] buffer = new byte[FIRST_FEW_BYTES];
+		final byte[] buffer = new byte[getBufferSize()];
 		int cnt = 0;
 		while (cnt < buffer.length) {
 			final int n = raw.read(buffer, cnt, buffer.length - cnt);
@@ -287,13 +324,16 @@ public static boolean isBinary(InputStream raw) throws IOException {
 	 * @return true if raw is likely to be a binary file, false otherwise
 	 */
 	public static boolean isBinary(byte[] raw, int length) {
-		// Same heuristic as C Git
-		if (length > FIRST_FEW_BYTES)
-			length = FIRST_FEW_BYTES;
-		for (int ptr = 0; ptr < length; ptr++)
-			if (raw[ptr] == '\0')
+		// Same heuristic as C Git (except for the buffer size)
+		int maxLength = getBufferSize();
+		if (length > maxLength) {
+			length = maxLength;
+		}
+		for (int ptr = 0; ptr < length; ptr++) {
+			if (raw[ptr] == '\0') {
 				return true;
-
+			}
+		}
 		return false;
 	}
 
@@ -329,7 +369,7 @@ public static boolean isCrLfText(byte[] raw) {
 	 * @since 5.3
 	 */
 	public static boolean isCrLfText(InputStream raw) throws IOException {
-		byte[] buffer = new byte[FIRST_FEW_BYTES];
+		byte[] buffer = new byte[getBufferSize()];
 		int cnt = 0;
 		while (cnt < buffer.length) {
 			int n = raw.read(buffer, cnt, buffer.length - cnt);
@@ -409,15 +449,16 @@ public static RawText load(ObjectLoader ldr, int threshold)
 			throw new BinaryBlobException();
 		}
 
-		if (sz <= FIRST_FEW_BYTES) {
-			byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
+		int bufferSize = getBufferSize();
+		if (sz <= bufferSize) {
+			byte[] data = ldr.getCachedBytes(bufferSize);
 			if (isBinary(data)) {
 				throw new BinaryBlobException();
 			}
 			return new RawText(data);
 		}
 
-		byte[] head = new byte[FIRST_FEW_BYTES];
+		byte[] head = new byte[bufferSize];
 		try (InputStream stream = ldr.openStream()) {
 			int off = 0;
 			int left = head.length;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
index 9da890343..1b03d097b 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFInputStream.java
@@ -21,16 +21,15 @@
  *
  * Existing CRLF are not expanded to CRCRLF, but retained as is.
  *
- * Optionally, a binary check on the first 8000 bytes is performed and in case
- * of binary files, canonicalization is turned off (for the complete file).
+ * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
+ * is performed and in case of binary files, canonicalization is turned off (for
+ * the complete file).
  */
 public class AutoCRLFInputStream extends InputStream {
 
-	static final int BUFFER_SIZE = 8096;
-
 	private final byte[] single = new byte[1];
 
-	private final byte[] buf = new byte[BUFFER_SIZE];
+	private final byte[] buf = new byte[RawText.getBufferSize()];
 
 	private final InputStream in;
 
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
index 97fe01e5d..05e271feb 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoCRLFOutputStream.java
@@ -20,18 +20,17 @@
  *
  * Existing CRLF are not expanded to CRCRLF, but retained as is.
  *
- * A binary check on the first 8000 bytes is performed and in case of binary
- * files, canonicalization is turned off (for the complete file).
+ * A binary check on the first {@link RawText#getBufferSize()} bytes is
+ * performed and in case of binary files, canonicalization is turned off (for
+ * the complete file).
  */
 public class AutoCRLFOutputStream extends OutputStream {
 
-	static final int BUFFER_SIZE = 8000;
-
 	private final OutputStream out;
 
 	private int buf = -1;
 
-	private byte[] binbuf = new byte[BUFFER_SIZE];
+	private byte[] binbuf = new byte[RawText.getBufferSize()];
 
 	private byte[] onebytebuf = new byte[1];
 
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
index 0e335a9dc..b6d1848b3 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFInputStream.java
@@ -25,10 +25,11 @@
  * Existing single CR are not changed to LF but are retained as is.
  * </p>
  * <p>
- * Optionally, a binary check on the first 8kB is performed and in case of
- * binary files, canonicalization is turned off (for the complete file). If
- * binary checking determines that the input is CR/LF-delimited text and the
- * stream has been created for checkout, canonicalization is also turned off.
+ * Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
+ * is performed and in case of binary files, canonicalization is turned off (for
+ * the complete file). If binary checking determines that the input is
+ * CR/LF-delimited text and the stream has been created for checkout,
+ * canonicalization is also turned off.
  * </p>
  *
  * @since 4.3
@@ -64,7 +65,7 @@ public enum StreamFlag {
 
 	private final byte[] single = new byte[1];
 
-	private final byte[] buf = new byte[8 * 1024];
+	private final byte[] buf = new byte[RawText.getBufferSize()];
 
 	private final InputStream in;
 
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
index 195fdb421..e08a53f50 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/io/AutoLFOutputStream.java
@@ -22,23 +22,21 @@
  * Existing single CR are not changed to LF, but retained as is.
  * </p>
  * <p>
- * A binary check on the first 8000 bytes is performed and in case of binary
- * files, canonicalization is turned off (for the complete file). If the binary
- * check determines that the input is not binary but text with CR/LF,
- * canonicalization is also turned off.
+ * A binary check on the first {@link RawText#getBufferSize()} bytes is
+ * performed and in case of binary files, canonicalization is turned off (for
+ * the complete file). If the binary check determines that the input is not
+ * binary but text with CR/LF, canonicalization is also turned off.
  * </p>
  *
  * @since 4.3
  */
 public class AutoLFOutputStream extends OutputStream {
 
-	static final int BUFFER_SIZE = 8000;
-
 	private final OutputStream out;
 
 	private int buf = -1;
 
-	private byte[] binbuf = new byte[BUFFER_SIZE];
+	private byte[] binbuf = new byte[RawText.getBufferSize()];
 
 	private byte[] onebytebuf = new byte[1];