Added check for binary files while diffing
Added a check in Diff so that files that are most likely not text are not diffed line by line. A file is considered binary if a NUL byte is found within its first 8000 bytes. This is similar to the heuristic that C Git uses.

Change-Id: I2b6f05674c88d89b3f549a5db483f850f7f46c26
This commit is contained in:
parent
730b708dae
commit
9f2249bd26
|
@ -132,16 +132,28 @@ protected void outputDiff(PrintStream out, String path,
|
|||
+ (mode1.equals(mode2) ? " " + mode1 : ""));
|
||||
out.println("--- " + (isNew ? "/dev/null" : name1));
|
||||
out.println("+++ " + (isDelete ? "/dev/null" : name2));
|
||||
RawText a = getRawText(id1);
|
||||
RawText b = getRawText(id2);
|
||||
|
||||
byte[] aRaw = getRawBytes(id1);
|
||||
byte[] bRaw = getRawBytes(id2);
|
||||
|
||||
if (RawText.isBinary(aRaw) || RawText.isBinary(bRaw)) {
|
||||
out.println("Binary files differ");
|
||||
return;
|
||||
}
|
||||
|
||||
RawText a = getRawText(aRaw);
|
||||
RawText b = getRawText(bRaw);
|
||||
MyersDiff diff = new MyersDiff(a, b);
|
||||
fmt.formatEdits(out, a, b, diff.getEdits());
|
||||
}
|
||||
|
||||
private RawText getRawText(ObjectId id) throws IOException {
|
||||
private byte[] getRawBytes(ObjectId id) throws IOException {
|
||||
if (id.equals(ObjectId.zeroId()))
|
||||
return new RawText(new byte[] {});
|
||||
byte[] raw = db.openBlob(id).getCachedBytes();
|
||||
return new byte[] {};
|
||||
return db.openBlob(id).getCachedBytes();
|
||||
}
|
||||
|
||||
private RawText getRawText(byte[] raw) {
|
||||
if (ignoreWsAll)
|
||||
return new RawTextIgnoreAllWhitespace(raw);
|
||||
else if (ignoreWsTrailing)
|
||||
|
@ -154,4 +166,3 @@ else if (ignoreWsLeading)
|
|||
return new RawText(raw);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -65,6 +65,9 @@
|
|||
* they are converting from "line number" to "element index".
|
||||
*/
|
||||
public class RawText implements Sequence {
|
||||
/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
|
||||
private static final int FIRST_FEW_BYTES = 8000;
|
||||
|
||||
/** The file content for this sequence. */
|
||||
protected final byte[] content;
|
||||
|
||||
|
@ -202,4 +205,22 @@ protected int hashLine(final byte[] raw, int ptr, final int end) {
|
|||
hash = (hash << 5) ^ (raw[ptr] & 0xff);
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine heuristically whether a byte array represents binary (as
|
||||
* opposed to text) content.
|
||||
*
|
||||
* @param raw
|
||||
* the raw file content.
|
||||
* @return true if raw is likely to be a binary file, false otherwise
|
||||
*/
|
||||
public static boolean isBinary(byte[] raw) {
|
||||
// Same heuristic as C Git
|
||||
int size = raw.length > FIRST_FEW_BYTES ? FIRST_FEW_BYTES : raw.length;
|
||||
for (int ptr = 0; ptr < size; ptr++)
|
||||
if (raw[ptr] == '\0')
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue