fix #1368: Guess Zip file encoding using Commons Compress

This commit is contained in:
Glavo 2022-02-14 23:05:14 +08:00 committed by Yuhui Huang
parent 07f9b30ed5
commit 0f22c9e31a

View File

@ -17,22 +17,18 @@
*/
package org.jackhuang.hmcl.util.io;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.jackhuang.hmcl.util.platform.OperatingSystem;
import org.jetbrains.annotations.NotNull;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.spi.FileSystemProvider;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.*;
import java.util.zip.ZipError;
import java.util.zip.ZipException;
@ -51,38 +47,42 @@ public final class CompressingUtils {
private CompressingUtils() {
}
@NotNull
private static FileVisitResult testZipPath(Path file, Path root, AtomicBoolean result) {
try {
root.relativize(file).toString(); // throw IllegalArgumentException for wrong encoding.
return FileVisitResult.CONTINUE;
} catch (Exception e) {
result.set(false);
return FileVisitResult.TERMINATE;
}
private static CharsetDecoder newCharsetDecoder(Charset charset) {
return charset.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
}
public static boolean testEncoding(Path zipFile, Charset encoding) throws IOException {
AtomicBoolean result = new AtomicBoolean(true);
try (FileSystem fs = CompressingUtils.createReadOnlyZipFileSystem(zipFile, encoding)) {
Path root = fs.getPath("/");
Files.walkFileTree(root, new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file,
BasicFileAttributes attrs) {
return testZipPath(file, root, result);
}
@Override
public FileVisitResult preVisitDirectory(Path dir,
BasicFileAttributes attrs) {
return testZipPath(dir, root, result);
}
});
} catch (IllegalArgumentException e) {
throw new IOException(e);
try (ZipFile zf = openZipFile(zipFile, encoding)) {
return testEncoding(zf, encoding);
}
return result.get();
}
public static boolean testEncoding(ZipFile zipFile, Charset encoding) throws IOException {
Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
CharsetDecoder cd = newCharsetDecoder(encoding);
CharBuffer cb = CharBuffer.allocate(32);
while (entries.hasMoreElements()) {
ZipArchiveEntry entry = entries.nextElement();
if (entry.getGeneralPurposeBit().usesUTF8ForNames()) continue;
cd.reset();
byte[] ba = entry.getRawName();
int clen = (int)(ba.length * cd.maxCharsPerByte());
if (clen == 0) continue;
if (clen <= cb.capacity())
cb.clear();
else
cb = CharBuffer.allocate(clen);
ByteBuffer bb = ByteBuffer.wrap(ba, 0, ba.length);
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow()) return false;
cr = cd.flush(cb);
if (!cr.isUnderflow()) return false;
}
return true;
}
public static Charset findSuitableEncoding(Path zipFile) throws IOException {
@ -90,6 +90,16 @@ public final class CompressingUtils {
}
public static Charset findSuitableEncoding(Path zipFile, Collection<Charset> candidates) throws IOException {
try (ZipFile zf = openZipFile(zipFile, StandardCharsets.UTF_8)) {
return findSuitableEncoding(zf, candidates);
}
}
public static Charset findSuitableEncoding(ZipFile zipFile) throws IOException {
return findSuitableEncoding(zipFile, Charset.availableCharsets().values());
}
public static Charset findSuitableEncoding(ZipFile zipFile, Collection<Charset> candidates) throws IOException {
if (testEncoding(zipFile, StandardCharsets.UTF_8)) return StandardCharsets.UTF_8;
if (OperatingSystem.NATIVE_CHARSET != StandardCharsets.UTF_8 && testEncoding(zipFile, OperatingSystem.NATIVE_CHARSET))
return OperatingSystem.NATIVE_CHARSET;
@ -100,6 +110,14 @@ public final class CompressingUtils {
throw new IOException("Cannot find suitable encoding for the zip.");
}
public static ZipFile openZipFile(Path zipFile) throws IOException {
return new ZipFile(Files.newByteChannel(zipFile));
}
public static ZipFile openZipFile(Path zipFile, Charset charset) throws IOException {
return new ZipFile(Files.newByteChannel(zipFile), charset.name());
}
public static final class Builder {
private boolean autoDetectEncoding = false;
private Collection<Charset> charsetCandidates;
@ -212,7 +230,7 @@ public final class CompressingUtils {
* @return the plain text content of given file.
*/
public static String readTextZipEntry(Path zipFile, String name, Charset encoding) throws IOException {
try (ZipFile s = new ZipFile(zipFile.toFile(), encoding.name())) {
try (ZipFile s = openZipFile(zipFile, encoding)) {
return IOUtils.readFullyAsString(s.getInputStream(s.getEntry(name)), StandardCharsets.UTF_8);
}
}