Make the buffer size for text/binary detection configurable
The various streams used in JGit for text/binary and CR-LF detection used different buffer sizes. Most used 8000, but one used 8KiB, and one used 8096 (SIC!) bytes. Considering only the first 8kB of a file/blob is not sufficient; it may give behavior incompatible with C git. C git considers the whole blob; since it uses memory-mapped files it can do so with acceptable performance. Doing this in JGit would most likely incur a noticeable performance penalty. But 8kB is a bit small; in the file in bug 576971 the limit was hit before the first CR-LF, which occurred on line 155 at offset 9759 in the file. Make RawText.FIRST_FEW_BYTES only a default and minimum setting, and set it to 8KiB. Make the actual buffer size configurable: provide static methods getBufferSize() and setBuffersize(), and use getBufferSize() throughout instead of the constant. This enables users of the JGit library to set their own possibly larger buffer size. Bug: 576971 Change-Id: I447762c9a5147a521f73d2864ba59ed89f555d54 Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
This commit is contained in:
parent
331a489c7f
commit
c2204bb683
|
@ -608,7 +608,7 @@ public void testDiffAutoCrlfSmallFile() throws Exception {
|
||||||
public void testDiffAutoCrlfMediumFile() throws Exception {
|
public void testDiffAutoCrlfMediumFile() throws Exception {
|
||||||
String content = mediumCrLfString();
|
String content = mediumCrLfString();
|
||||||
String expectedDiff = "diff --git a/test.txt b/test.txt\n"
|
String expectedDiff = "diff --git a/test.txt b/test.txt\n"
|
||||||
+ "index 215c502..c10f08c 100644\n" //
|
+ "index 6d9ffed..50d7b5a 100644\n" //
|
||||||
+ "--- a/test.txt\n" //
|
+ "--- a/test.txt\n" //
|
||||||
+ "+++ b/test.txt\n" //
|
+ "+++ b/test.txt\n" //
|
||||||
+ "@@ -1,4 +1,5 @@\n" //
|
+ "@@ -1,4 +1,5 @@\n" //
|
||||||
|
@ -624,7 +624,7 @@ public void testDiffAutoCrlfMediumFile() throws Exception {
|
||||||
public void testDiffAutoCrlfLargeFile() throws Exception {
|
public void testDiffAutoCrlfLargeFile() throws Exception {
|
||||||
String content = largeCrLfString();
|
String content = largeCrLfString();
|
||||||
String expectedDiff = "diff --git a/test.txt b/test.txt\n"
|
String expectedDiff = "diff --git a/test.txt b/test.txt\n"
|
||||||
+ "index 7014942..c0487a7 100644\n" //
|
+ "index d6399a1..de26ce5 100644\n" //
|
||||||
+ "--- a/test.txt\n" //
|
+ "--- a/test.txt\n" //
|
||||||
+ "+++ b/test.txt\n" //
|
+ "+++ b/test.txt\n" //
|
||||||
+ "@@ -1,4 +1,5 @@\n"
|
+ "@@ -1,4 +1,5 @@\n"
|
||||||
|
@ -665,9 +665,9 @@ private void doAutoCrLfTest(String content, String expectedDiff)
|
||||||
|
|
||||||
private static String largeCrLfString() {
|
private static String largeCrLfString() {
|
||||||
String line = "012345678901234567890123456789012345678901234567\r\n";
|
String line = "012345678901234567890123456789012345678901234567\r\n";
|
||||||
StringBuilder builder = new StringBuilder(
|
int bufferSize = RawText.getBufferSize();
|
||||||
2 * RawText.FIRST_FEW_BYTES);
|
StringBuilder builder = new StringBuilder(2 * bufferSize);
|
||||||
while (builder.length() < 2 * RawText.FIRST_FEW_BYTES) {
|
while (builder.length() < 2 * bufferSize) {
|
||||||
builder.append(line);
|
builder.append(line);
|
||||||
}
|
}
|
||||||
return builder.toString();
|
return builder.toString();
|
||||||
|
@ -677,9 +677,9 @@ private static String mediumCrLfString() {
|
||||||
// Create a CR-LF string longer than RawText.FIRST_FEW_BYTES whose
|
// Create a CR-LF string longer than RawText.FIRST_FEW_BYTES whose
|
||||||
// canonical representation is shorter than RawText.FIRST_FEW_BYTES.
|
// canonical representation is shorter than RawText.FIRST_FEW_BYTES.
|
||||||
String line = "01234567\r\n"; // 10 characters
|
String line = "01234567\r\n"; // 10 characters
|
||||||
StringBuilder builder = new StringBuilder(
|
int bufferSize = RawText.getBufferSize();
|
||||||
RawText.FIRST_FEW_BYTES + line.length());
|
StringBuilder builder = new StringBuilder(bufferSize + line.length());
|
||||||
while (builder.length() <= RawText.FIRST_FEW_BYTES) {
|
while (builder.length() <= bufferSize) {
|
||||||
builder.append(line);
|
builder.append(line);
|
||||||
}
|
}
|
||||||
return builder.toString();
|
return builder.toString();
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
import org.eclipse.jgit.api.errors.CheckoutConflictException;
|
import org.eclipse.jgit.api.errors.CheckoutConflictException;
|
||||||
import org.eclipse.jgit.api.errors.GitAPIException;
|
import org.eclipse.jgit.api.errors.GitAPIException;
|
||||||
import org.eclipse.jgit.api.errors.JGitInternalException;
|
import org.eclipse.jgit.api.errors.JGitInternalException;
|
||||||
|
import org.eclipse.jgit.diff.RawText;
|
||||||
import org.eclipse.jgit.dircache.DirCache;
|
import org.eclipse.jgit.dircache.DirCache;
|
||||||
import org.eclipse.jgit.dircache.DirCacheEditor;
|
import org.eclipse.jgit.dircache.DirCacheEditor;
|
||||||
import org.eclipse.jgit.dircache.DirCacheEntry;
|
import org.eclipse.jgit.dircache.DirCacheEntry;
|
||||||
|
@ -826,6 +827,8 @@ public void checkContentMergeLargeBinaries(MergeStrategy strategy) throws Except
|
||||||
RevCommit sideCommit = git.commit().setAll(true)
|
RevCommit sideCommit = git.commit().setAll(true)
|
||||||
.setMessage("modified file l 1500").call();
|
.setMessage("modified file l 1500").call();
|
||||||
|
|
||||||
|
int originalBufferSize = RawText.getBufferSize();
|
||||||
|
int smallBufferSize = RawText.setBufferSize(8000);
|
||||||
try (ObjectInserter ins = db.newObjectInserter()) {
|
try (ObjectInserter ins = db.newObjectInserter()) {
|
||||||
// Check that we don't read the large blobs.
|
// Check that we don't read the large blobs.
|
||||||
ObjectInserter forbidInserter = new ObjectInserter.Filter() {
|
ObjectInserter forbidInserter = new ObjectInserter.Filter() {
|
||||||
|
@ -836,7 +839,8 @@ protected ObjectInserter delegate() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ObjectReader newReader() {
|
public ObjectReader newReader() {
|
||||||
return new BigReadForbiddenReader(super.newReader(), 8000);
|
return new BigReadForbiddenReader(super.newReader(),
|
||||||
|
smallBufferSize);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -844,6 +848,8 @@ public ObjectReader newReader() {
|
||||||
(ResolveMerger) strategy.newMerger(forbidInserter, db.getConfig());
|
(ResolveMerger) strategy.newMerger(forbidInserter, db.getConfig());
|
||||||
boolean noProblems = merger.merge(masterCommit, sideCommit);
|
boolean noProblems = merger.merge(masterCommit, sideCommit);
|
||||||
assertFalse(noProblems);
|
assertFalse(noProblems);
|
||||||
|
} finally {
|
||||||
|
RawText.setBufferSize(originalBufferSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.eclipse.jgit.diff.RawText;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -38,7 +39,8 @@ public void test() throws IOException {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testBoundary() throws IOException {
|
public void testBoundary() throws IOException {
|
||||||
for (int i = AutoCRLFInputStream.BUFFER_SIZE - 10; i < AutoCRLFInputStream.BUFFER_SIZE + 10; i++) {
|
int boundary = RawText.getBufferSize();
|
||||||
|
for (int i = boundary - 10; i < boundary + 10; i++) {
|
||||||
String s1 = Strings.repeat("a", i);
|
String s1 = Strings.repeat("a", i);
|
||||||
assertNoCrLf(s1, s1);
|
assertNoCrLf(s1, s1);
|
||||||
String s2 = Strings.repeat("\0", i);
|
String s2 = Strings.repeat("\0", i);
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
|
||||||
|
import org.eclipse.jgit.diff.RawText;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -40,7 +41,8 @@ public void test() throws IOException {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testBoundary() throws IOException {
|
public void testBoundary() throws IOException {
|
||||||
for (int i = AutoCRLFOutputStream.BUFFER_SIZE - 10; i < AutoCRLFOutputStream.BUFFER_SIZE + 10; i++) {
|
int bufferSize = RawText.getBufferSize();
|
||||||
|
for (int i = bufferSize - 10; i < bufferSize + 10; i++) {
|
||||||
String s1 = Strings.repeat("a", i);
|
String s1 = Strings.repeat("a", i);
|
||||||
assertNoCrLf(s1, s1);
|
assertNoCrLf(s1, s1);
|
||||||
String s2 = Strings.repeat("\0", i);
|
String s2 = Strings.repeat("\0", i);
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.eclipse.jgit.errors.BinaryBlobException;
|
import org.eclipse.jgit.errors.BinaryBlobException;
|
||||||
import org.eclipse.jgit.errors.LargeObjectException;
|
import org.eclipse.jgit.errors.LargeObjectException;
|
||||||
|
@ -38,11 +39,20 @@
|
||||||
* they are converting from "line number" to "element index".
|
* they are converting from "line number" to "element index".
|
||||||
*/
|
*/
|
||||||
public class RawText extends Sequence {
|
public class RawText extends Sequence {
|
||||||
|
|
||||||
/** A RawText of length 0 */
|
/** A RawText of length 0 */
|
||||||
public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
|
public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
|
||||||
|
|
||||||
/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
|
/**
|
||||||
static final int FIRST_FEW_BYTES = 8000;
|
* Default and minimum for {@link #BUFFER_SIZE}.
|
||||||
|
*/
|
||||||
|
private static final int FIRST_FEW_BYTES = 8 * 1024;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
|
||||||
|
*/
|
||||||
|
private static final AtomicInteger BUFFER_SIZE = new AtomicInteger(
|
||||||
|
FIRST_FEW_BYTES);
|
||||||
|
|
||||||
/** The file content for this sequence. */
|
/** The file content for this sequence. */
|
||||||
protected final byte[] content;
|
protected final byte[] content;
|
||||||
|
@ -247,6 +257,33 @@ public static boolean isBinary(byte[] raw) {
|
||||||
return isBinary(raw, raw.length);
|
return isBinary(raw, raw.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Obtains the buffer size to use for analyzing whether certain content is
|
||||||
|
* text or binary, or what line endings are used if it's text.
|
||||||
|
*
|
||||||
|
* @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes
|
||||||
|
* @since 6.0
|
||||||
|
*/
|
||||||
|
public static int getBufferSize() {
|
||||||
|
return BUFFER_SIZE.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the buffer size to use for analyzing whether certain content is text
|
||||||
|
* or binary, or what line endings are used if it's text. If the given
|
||||||
|
* {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the
|
||||||
|
* buffer size to {@link #FIRST_FEW_BYTES}.
|
||||||
|
*
|
||||||
|
* @param bufferSize
|
||||||
|
* Size to set
|
||||||
|
* @return the size actually set
|
||||||
|
* @since 6.0
|
||||||
|
*/
|
||||||
|
public static int setBufferSize(int bufferSize) {
|
||||||
|
int newSize = Math.max(FIRST_FEW_BYTES, bufferSize);
|
||||||
|
return BUFFER_SIZE.updateAndGet(curr -> newSize);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine heuristically whether the bytes contained in a stream
|
* Determine heuristically whether the bytes contained in a stream
|
||||||
* represents binary (as opposed to text) content.
|
* represents binary (as opposed to text) content.
|
||||||
|
@ -263,7 +300,7 @@ public static boolean isBinary(byte[] raw) {
|
||||||
* if input stream could not be read
|
* if input stream could not be read
|
||||||
*/
|
*/
|
||||||
public static boolean isBinary(InputStream raw) throws IOException {
|
public static boolean isBinary(InputStream raw) throws IOException {
|
||||||
final byte[] buffer = new byte[FIRST_FEW_BYTES];
|
final byte[] buffer = new byte[getBufferSize()];
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
while (cnt < buffer.length) {
|
while (cnt < buffer.length) {
|
||||||
final int n = raw.read(buffer, cnt, buffer.length - cnt);
|
final int n = raw.read(buffer, cnt, buffer.length - cnt);
|
||||||
|
@ -287,13 +324,16 @@ public static boolean isBinary(InputStream raw) throws IOException {
|
||||||
* @return true if raw is likely to be a binary file, false otherwise
|
* @return true if raw is likely to be a binary file, false otherwise
|
||||||
*/
|
*/
|
||||||
public static boolean isBinary(byte[] raw, int length) {
|
public static boolean isBinary(byte[] raw, int length) {
|
||||||
// Same heuristic as C Git
|
// Same heuristic as C Git (except for the buffer size)
|
||||||
if (length > FIRST_FEW_BYTES)
|
int maxLength = getBufferSize();
|
||||||
length = FIRST_FEW_BYTES;
|
if (length > maxLength) {
|
||||||
for (int ptr = 0; ptr < length; ptr++)
|
length = maxLength;
|
||||||
if (raw[ptr] == '\0')
|
}
|
||||||
|
for (int ptr = 0; ptr < length; ptr++) {
|
||||||
|
if (raw[ptr] == '\0') {
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -329,7 +369,7 @@ public static boolean isCrLfText(byte[] raw) {
|
||||||
* @since 5.3
|
* @since 5.3
|
||||||
*/
|
*/
|
||||||
public static boolean isCrLfText(InputStream raw) throws IOException {
|
public static boolean isCrLfText(InputStream raw) throws IOException {
|
||||||
byte[] buffer = new byte[FIRST_FEW_BYTES];
|
byte[] buffer = new byte[getBufferSize()];
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
while (cnt < buffer.length) {
|
while (cnt < buffer.length) {
|
||||||
int n = raw.read(buffer, cnt, buffer.length - cnt);
|
int n = raw.read(buffer, cnt, buffer.length - cnt);
|
||||||
|
@ -409,15 +449,16 @@ public static RawText load(ObjectLoader ldr, int threshold)
|
||||||
throw new BinaryBlobException();
|
throw new BinaryBlobException();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sz <= FIRST_FEW_BYTES) {
|
int bufferSize = getBufferSize();
|
||||||
byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
|
if (sz <= bufferSize) {
|
||||||
|
byte[] data = ldr.getCachedBytes(bufferSize);
|
||||||
if (isBinary(data)) {
|
if (isBinary(data)) {
|
||||||
throw new BinaryBlobException();
|
throw new BinaryBlobException();
|
||||||
}
|
}
|
||||||
return new RawText(data);
|
return new RawText(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] head = new byte[FIRST_FEW_BYTES];
|
byte[] head = new byte[bufferSize];
|
||||||
try (InputStream stream = ldr.openStream()) {
|
try (InputStream stream = ldr.openStream()) {
|
||||||
int off = 0;
|
int off = 0;
|
||||||
int left = head.length;
|
int left = head.length;
|
||||||
|
|
|
@ -21,16 +21,15 @@
|
||||||
*
|
*
|
||||||
* Existing CRLF are not expanded to CRCRLF, but retained as is.
|
* Existing CRLF are not expanded to CRCRLF, but retained as is.
|
||||||
*
|
*
|
||||||
* Optionally, a binary check on the first 8000 bytes is performed and in case
|
* Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
|
||||||
* of binary files, canonicalization is turned off (for the complete file).
|
* is performed and in case of binary files, canonicalization is turned off (for
|
||||||
|
* the complete file).
|
||||||
*/
|
*/
|
||||||
public class AutoCRLFInputStream extends InputStream {
|
public class AutoCRLFInputStream extends InputStream {
|
||||||
|
|
||||||
static final int BUFFER_SIZE = 8096;
|
|
||||||
|
|
||||||
private final byte[] single = new byte[1];
|
private final byte[] single = new byte[1];
|
||||||
|
|
||||||
private final byte[] buf = new byte[BUFFER_SIZE];
|
private final byte[] buf = new byte[RawText.getBufferSize()];
|
||||||
|
|
||||||
private final InputStream in;
|
private final InputStream in;
|
||||||
|
|
||||||
|
|
|
@ -20,18 +20,17 @@
|
||||||
*
|
*
|
||||||
* Existing CRLF are not expanded to CRCRLF, but retained as is.
|
* Existing CRLF are not expanded to CRCRLF, but retained as is.
|
||||||
*
|
*
|
||||||
* A binary check on the first 8000 bytes is performed and in case of binary
|
* A binary check on the first {@link RawText#getBufferSize()} bytes is
|
||||||
* files, canonicalization is turned off (for the complete file).
|
* performed and in case of binary files, canonicalization is turned off (for
|
||||||
|
* the complete file).
|
||||||
*/
|
*/
|
||||||
public class AutoCRLFOutputStream extends OutputStream {
|
public class AutoCRLFOutputStream extends OutputStream {
|
||||||
|
|
||||||
static final int BUFFER_SIZE = 8000;
|
|
||||||
|
|
||||||
private final OutputStream out;
|
private final OutputStream out;
|
||||||
|
|
||||||
private int buf = -1;
|
private int buf = -1;
|
||||||
|
|
||||||
private byte[] binbuf = new byte[BUFFER_SIZE];
|
private byte[] binbuf = new byte[RawText.getBufferSize()];
|
||||||
|
|
||||||
private byte[] onebytebuf = new byte[1];
|
private byte[] onebytebuf = new byte[1];
|
||||||
|
|
||||||
|
|
|
@ -25,10 +25,11 @@
|
||||||
* Existing single CR are not changed to LF but are retained as is.
|
* Existing single CR are not changed to LF but are retained as is.
|
||||||
* </p>
|
* </p>
|
||||||
* <p>
|
* <p>
|
||||||
* Optionally, a binary check on the first 8kB is performed and in case of
|
* Optionally, a binary check on the first {@link RawText#getBufferSize()} bytes
|
||||||
* binary files, canonicalization is turned off (for the complete file). If
|
* is performed and in case of binary files, canonicalization is turned off (for
|
||||||
* binary checking determines that the input is CR/LF-delimited text and the
|
* the complete file). If binary checking determines that the input is
|
||||||
* stream has been created for checkout, canonicalization is also turned off.
|
* CR/LF-delimited text and the stream has been created for checkout,
|
||||||
|
* canonicalization is also turned off.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @since 4.3
|
* @since 4.3
|
||||||
|
@ -64,7 +65,7 @@ public enum StreamFlag {
|
||||||
|
|
||||||
private final byte[] single = new byte[1];
|
private final byte[] single = new byte[1];
|
||||||
|
|
||||||
private final byte[] buf = new byte[8 * 1024];
|
private final byte[] buf = new byte[RawText.getBufferSize()];
|
||||||
|
|
||||||
private final InputStream in;
|
private final InputStream in;
|
||||||
|
|
||||||
|
|
|
@ -22,23 +22,21 @@
|
||||||
* Existing single CR are not changed to LF, but retained as is.
|
* Existing single CR are not changed to LF, but retained as is.
|
||||||
* </p>
|
* </p>
|
||||||
* <p>
|
* <p>
|
||||||
* A binary check on the first 8000 bytes is performed and in case of binary
|
* A binary check on the first {@link RawText#getBufferSize()} bytes is
|
||||||
* files, canonicalization is turned off (for the complete file). If the binary
|
* performed and in case of binary files, canonicalization is turned off (for
|
||||||
* check determines that the input is not binary but text with CR/LF,
|
* the complete file). If the binary check determines that the input is not
|
||||||
* canonicalization is also turned off.
|
* binary but text with CR/LF, canonicalization is also turned off.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @since 4.3
|
* @since 4.3
|
||||||
*/
|
*/
|
||||||
public class AutoLFOutputStream extends OutputStream {
|
public class AutoLFOutputStream extends OutputStream {
|
||||||
|
|
||||||
static final int BUFFER_SIZE = 8000;
|
|
||||||
|
|
||||||
private final OutputStream out;
|
private final OutputStream out;
|
||||||
|
|
||||||
private int buf = -1;
|
private int buf = -1;
|
||||||
|
|
||||||
private byte[] binbuf = new byte[BUFFER_SIZE];
|
private byte[] binbuf = new byte[RawText.getBufferSize()];
|
||||||
|
|
||||||
private byte[] onebytebuf = new byte[1];
|
private byte[] onebytebuf = new byte[1];
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue