From 76b76a6048b9b3dbca965463d4f6f5732ffb784d Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomas.wolf@paranor.ch>
Date: Wed, 10 Mar 2021 14:25:37 +0100
Subject: [PATCH] ApplyCommand: use byte arrays for text patches, not strings

Instead of converting the patch bytes to strings apply the patch on
byte level, like C git does. Converting the input lines and the hunk
lines from bytes to strings and then applying the patch based on
strings may give surprising results if a patch converts a text file
from one encoding to another. Moreover, in the end we don't know which
encoding to use to write the result.

Previous code just wrote the result as UTF-8, which forcibly changed
the encoding if the original input had some other encoding (even if the
patch had the same non-UTF-8 encoding). It was also wrong if the input
was UTF-8, and the patch should have changed the encoding to something
else.

So use ByteBuffers instead of Strings. This has the additional advantage
that all these ByteBuffers can share the underlying byte arrays of the
input and of the patch, so it also reduces memory consumption.

Change-Id: I450975f2ba0e7d0bec8973e3113cc2e7aea187ee
Signed-off-by: Thomas Wolf <thomas.wolf@paranor.ch>
---
 .../org.eclipse.core.resources.prefs          |   3 +-
 .../org/eclipse/jgit/diff/umlaut.patch        | Bin 0 -> 110 bytes
 .../org/eclipse/jgit/diff/umlaut_PostImage    | Bin 0 -> 4 bytes
 .../org/eclipse/jgit/diff/umlaut_PreImage     | Bin 0 -> 7 bytes
 .../eclipse/jgit/api/ApplyCommandTest.java    |   8 ++
 .../org/eclipse/jgit/api/ApplyCommand.java    |  70 ++++++++----------
 .../src/org/eclipse/jgit/diff/RawText.java    |  24 +++++-
 7 files changed, 65 insertions(+), 40 deletions(-)
 create mode 100644 org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut.patch
 create mode 100644 org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PostImage
 create mode 100644 org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PreImage

diff --git a/org.eclipse.jgit.test/.settings/org.eclipse.core.resources.prefs b/org.eclipse.jgit.test/.settings/org.eclipse.core.resources.prefs
index 6a9621db1..cddb99d1d 100644
--- a/org.eclipse.jgit.test/.settings/org.eclipse.core.resources.prefs
+++ b/org.eclipse.jgit.test/.settings/org.eclipse.core.resources.prefs
@@ -1,5 +1,6 @@
-#Sat Dec 20 21:21:24 CET 2008
 eclipse.preferences.version=1
+encoding//tst-rsrc/org/eclipse/jgit/diff/umlaut.patch=ISO-8859-1
+encoding//tst-rsrc/org/eclipse/jgit/diff/umlaut_PostImage=ISO-8859-1
 encoding//tst-rsrc/org/eclipse/jgit/patch/testGetText_BothISO88591.patch=ISO-8859-1
 encoding//tst-rsrc/org/eclipse/jgit/patch/testGetText_Convert.patch=ISO-8859-1
 encoding//tst-rsrc/org/eclipse/jgit/patch/testGetText_DiffCc.patch=ISO-8859-1
diff --git a/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut.patch b/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut.patch
new file mode 100644
index 0000000000000000000000000000000000000000..7380dbed822eaf9ef3016fb181f19286d0c8bd48
GIT binary patch
literal 110
zcmYevOiNSH)lJVVQApG;&CN+HEm25<GPpAHQc^1v3=E7D4NOh+^h{07)69+16buav
t%uGzUbai#%dbqT;wGsLp929g76|@Z%92~fG54Rkic6bh#_K|CMxBx4NA@u+N

literal 0
HcmV?d00001

diff --git a/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PostImage b/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PostImage
new file mode 100644
index 0000000000000000000000000000000000000000..557f72f513dab99472edb3bbe92c1fa84fdf5805
GIT binary patch
literal 4
LcmX?d?G6_J2Ur4u

literal 0
HcmV?d00001

diff --git a/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PreImage b/org.eclipse.jgit.test/tst-rsrc/org/eclipse/jgit/diff/umlaut_PreImage
new file mode 100644
index 0000000000000000000000000000000000000000..003a054114319b5e78fd7476b949a16e4ff26a09
GIT binary patch
literal 7
OcmX@ya(LR|Ia~k|CIkuq

literal 0
HcmV?d00001

diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/api/ApplyCommandTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/api/ApplyCommandTest.java
index e9b8924e6..b997ac009 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/api/ApplyCommandTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/api/ApplyCommandTest.java
@@ -287,6 +287,14 @@ public void testBinaryLiteralAdd() throws Exception {
 		checkBinary("literal_add", false);
 	}
 
+	@Test
+	public void testEncodingChange() throws Exception {
+		// This is a text patch that changes a file containing ÄÖÜ in UTF-8 to
+		// the same characters in ISO-8859-1. The patch file itself uses mixed
+		// encoding. Since checkFile() works with strings use the binary check.
+		checkBinary("umlaut", true);
+	}
+
 	@Test
 	public void testAddA1() throws Exception {
 		ApplyResult result = init("A1", false, true);
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/api/ApplyCommand.java b/org.eclipse.jgit/src/org/eclipse/jgit/api/ApplyCommand.java
index f7eba88ac..ec53412fc 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/api/ApplyCommand.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/api/ApplyCommand.java
@@ -10,7 +10,6 @@
 package org.eclipse.jgit.api;
 
 import java.io.BufferedInputStream;
-import java.io.BufferedWriter;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
@@ -18,9 +17,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
+import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
 import java.text.MessageFormat;
@@ -549,11 +546,11 @@ private void applyBinary(Repository repository, String path, File f,
 	private void applyText(Repository repository, String path, RawText rt,
 			File f, FileHeader fh, CheckoutMetadata checkOut)
 			throws IOException, PatchApplyException {
-		List<String> oldLines = new ArrayList<>(rt.size());
+		List<ByteBuffer> oldLines = new ArrayList<>(rt.size());
 		for (int i = 0; i < rt.size(); i++) {
-			oldLines.add(rt.getString(i));
+			oldLines.add(rt.getRawString(i));
 		}
-		List<String> newLines = new ArrayList<>(oldLines);
+		List<ByteBuffer> newLines = new ArrayList<>(oldLines);
 		int afterLastHunk = 0;
 		int lineNumberShift = 0;
 		int lastHunkNewLine = -1;
@@ -571,9 +568,9 @@ private void applyText(Repository repository, String path, RawText rt,
 					b.length);
 			RawText hrt = new RawText(b);
 
-			List<String> hunkLines = new ArrayList<>(hrt.size());
+			List<ByteBuffer> hunkLines = new ArrayList<>(hrt.size());
 			for (int i = 0; i < hrt.size(); i++) {
-				hunkLines.add(hrt.getString(i));
+				hunkLines.add(hrt.getRawString(i));
 			}
 
 			if (hh.getNewStartLine() == 0) {
@@ -642,8 +639,8 @@ && canApplyAt(hunkLines, newLines, 0)) {
 			lineNumberShift = applyAt - hh.getNewStartLine() + 1;
 			int sz = hunkLines.size();
 			for (int j = 1; j < sz; j++) {
-				String hunkLine = hunkLines.get(j);
-				switch (hunkLine.charAt(0)) {
+				ByteBuffer hunkLine = hunkLines.get(j);
+				switch (hunkLine.array()[hunkLine.position()]) {
 				case ' ':
 					applyAt++;
 					break;
@@ -651,7 +648,7 @@ && canApplyAt(hunkLines, newLines, 0)) {
 					newLines.remove(applyAt);
 					break;
 				case '+':
-					newLines.add(applyAt++, hunkLine.substring(1));
+					newLines.add(applyAt++, slice(hunkLine, 1));
 					break;
 				default:
 					break;
@@ -660,28 +657,29 @@ && canApplyAt(hunkLines, newLines, 0)) {
 			afterLastHunk = applyAt;
 		}
 		if (!isNoNewlineAtEndOfFile(fh)) {
-			newLines.add(""); //$NON-NLS-1$
+			newLines.add(null);
 		}
 		if (!rt.isMissingNewlineAtEnd()) {
-			oldLines.add(""); //$NON-NLS-1$
+			oldLines.add(null);
 		}
-		if (!isChanged(oldLines, newLines)) {
-			return; // Don't touch the file
+		if (oldLines.equals(newLines)) {
+			return; // Unchanged; don't touch the file
 		}
 
-		// TODO: forcing UTF-8 is a bit strange and may lead to re-coding if the
-		// input was some other encoding, but it's what previous versions of
-		// this code used. (Even earlier the code used the default encoding,
-		// which has the same problem.) Perhaps using bytes instead of Strings
-		// for the lines would be better.
 		TemporaryBuffer buffer = new TemporaryBuffer.LocalFile(null);
 		try {
-			try (Writer w = new BufferedWriter(
-					new OutputStreamWriter(buffer, StandardCharsets.UTF_8))) {
-				for (Iterator<String> l = newLines.iterator(); l.hasNext();) {
-					w.write(l.next());
+			try (OutputStream out = buffer) {
+				for (Iterator<ByteBuffer> l = newLines.iterator(); l
+						.hasNext();) {
+					ByteBuffer line = l.next();
+					if (line == null) {
+						// Must be the marker for the final newline
+						break;
+					}
+					out.write(line.array(), line.position(),
+							line.limit() - line.position());
 					if (l.hasNext()) {
-						w.write('\n');
+						out.write('\n');
 					}
 				}
 			}
@@ -698,18 +696,18 @@ && canApplyAt(hunkLines, newLines, 0)) {
 				fh.getNewMode() == FileMode.EXECUTABLE_FILE);
 	}
 
-	private boolean canApplyAt(List<String> hunkLines, List<String> newLines,
-			int line) {
+	private boolean canApplyAt(List<ByteBuffer> hunkLines,
+			List<ByteBuffer> newLines, int line) {
 		int sz = hunkLines.size();
 		int limit = newLines.size();
 		int pos = line;
 		for (int j = 1; j < sz; j++) {
-			String hunkLine = hunkLines.get(j);
-			switch (hunkLine.charAt(0)) {
+			ByteBuffer hunkLine = hunkLines.get(j);
+			switch (hunkLine.array()[hunkLine.position()]) {
 			case ' ':
 			case '-':
 				if (pos >= limit
-						|| !newLines.get(pos).equals(hunkLine.substring(1))) {
+						|| !newLines.get(pos).equals(slice(hunkLine, 1))) {
 					return false;
 				}
 				pos++;
@@ -721,13 +719,9 @@ private boolean canApplyAt(List<String> hunkLines, List<String> newLines,
 		return true;
 	}
 
-	private static boolean isChanged(List<String> ol, List<String> nl) {
-		if (ol.size() != nl.size())
-			return true;
-		for (int i = 0; i < ol.size(); i++)
-			if (!ol.get(i).equals(nl.get(i)))
-				return true;
-		return false;
+	private ByteBuffer slice(ByteBuffer b, int off) {
+		int newOffset = b.position() + off;
+		return ByteBuffer.wrap(b.array(), newOffset, b.limit() - newOffset);
 	}
 
 	private boolean isNoNewlineAtEndOfFile(FileHeader fh) {
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
index 9f4b1fa49..d09da019d 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2009, Google Inc.
- * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
+ * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
  *
  * This program and the accompanying materials are made available under the
  * terms of the Eclipse Distribution License v. 1.0 which is available at
@@ -16,6 +16,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
 
 import org.eclipse.jgit.errors.BinaryBlobException;
 import org.eclipse.jgit.errors.LargeObjectException;
@@ -164,6 +165,27 @@ public String getString(int i) {
 		return getString(i, i + 1, true);
 	}
 
+	/**
+	 * Get the raw text for a single line.
+	 *
+	 * @param i
+	 *            index of the line to extract. Note this is 0-based, so line
+	 *            number 1 is actually index 0.
+	 * @return the text for the line, without a trailing LF, as a
+	 *         {@link ByteBuffer} that is backed by a slice of the
+	 *         {@link #getRawContent() raw content}, with the buffer's position
+	 *         on the start of the line and the limit at the end.
+	 * @since 5.12
+	 */
+	public ByteBuffer getRawString(int i) {
+		int s = getStart(i);
+		int e = getEnd(i);
+		if (e > 0 && content[e - 1] == '\n') {
+			e--;
+		}
+		return ByteBuffer.wrap(content, s, e - s);
+	}
+
 	/**
 	 * Get the text for a region of lines.
 	 *