TikaProcessor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
*/
package org.apache.rat.analysis;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import java.util.Map;
import org.apache.rat.api.Document;
import org.apache.rat.document.DocumentName;
import org.apache.rat.document.RatDocumentAnalysisException;
import org.apache.rat.document.guesser.NoteGuesser;
import org.apache.rat.utils.DefaultLog;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
/**
* A wrapping around the Tika processor.
*/
public final class TikaProcessor {
/** The Tika parser */
private static final Tika TIKA = new Tika();
/** A map of mime type string to non-BINARY types.
* "text" types are already handled somewhere else
* BINARY unless listed here
*/
private static final Map<String, Document.Type> DOCUMENT_TYPE_MAP;
static {
DOCUMENT_TYPE_MAP = new HashMap<>();
// org.apache.tika.parser.epub.EpubParser
DOCUMENT_TYPE_MAP.put("application/x-ibooks+zip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/epub+zip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/vnd.wap.xhtml+xml", Document.Type.STANDARD);
DOCUMENT_TYPE_MAP.put("application/x-asp", Document.Type.STANDARD);
DOCUMENT_TYPE_MAP.put("application/xhtml+xml", Document.Type.STANDARD);
// org.apache.tika.parser.pdf.PDFParser", Type.BINARY);
DOCUMENT_TYPE_MAP.put("application/pdf", Document.Type.BINARY);
//org.apache.tika.parser.pkg.CompressorParser
DOCUMENT_TYPE_MAP.put("application/zlib", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-gzip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-bzip2", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-compress", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-java-pack200", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-lzma", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/deflate64", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-lz4", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-snappy", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-brotli", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/gzip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-bzip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-xz", Document.Type.ARCHIVE);
//org.apache.tika.parser.pkg.PackageParser
DOCUMENT_TYPE_MAP.put("application/x-tar", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/java-archive", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-arj", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-archive", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/zip", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-cpio", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-tika-unix-dump", Document.Type.ARCHIVE);
DOCUMENT_TYPE_MAP.put("application/x-7z-compressed", Document.Type.ARCHIVE);
//org.apache.tika.parser.pkg.RarParser
DOCUMENT_TYPE_MAP.put("application/x-rar-compressed", Document.Type.ARCHIVE);
// org.apache.tika.parser.xliff.XLIFF12Parser
DOCUMENT_TYPE_MAP.put("application/x-xliff+xml", Document.Type.STANDARD);
// org.apache.tika.parser.xliff.XLZParser
DOCUMENT_TYPE_MAP.put("application/x-xliff+zip", Document.Type.ARCHIVE);
// org.apache.tika.parser.xml.DcXMLParser
DOCUMENT_TYPE_MAP.put("application/xml", Document.Type.STANDARD);
DOCUMENT_TYPE_MAP.put("image/svg+xml", Document.Type.STANDARD);
// org.apache.tika.parser.xml.FictionBookParser
DOCUMENT_TYPE_MAP.put("application/x-fictionbook+xml", Document.Type.STANDARD);
}
private TikaProcessor() {
// do not instantiate
}
/**
* Creates a copy of the document type map.
* Exposed for testing.
* @return a copy of the document type map.
*/
static Map<String, Document.Type> getDocumentTypeMap() {
return new HashMap<>(DOCUMENT_TYPE_MAP);
}
/**
* Ensures that the input stream supports {@code mark}.
* @param stream the stream to test.
* @return a stream that supports {@code mark}.
*/
public static InputStream markSupportedInputStream(final InputStream stream) {
return stream.markSupported() ? stream : new BufferedInputStream(stream);
}
/**
* Process the input document.
* @param document the Document to process.
* @return the mimetype as a string.
* @throws RatDocumentAnalysisException on error.
*/
public static String process(final Document document) throws RatDocumentAnalysisException {
try (InputStream stream = markSupportedInputStream(document.inputStream())) {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, document.getName().getShortName());
String result = TIKA.detect(stream, metadata);
String[] parts = result.split("/");
MediaType mediaType = new MediaType(parts[0], parts[1]);
document.getMetaData().setMediaType(mediaType);
if (!document.isIgnored()) {
document.getMetaData()
.setDocumentType(fromMediaType(mediaType));
}
if (Document.Type.STANDARD == document.getMetaData().getDocumentType()) {
try {
document.getMetaData().setCharset(detectCharset(stream, document.getName()));
if (NoteGuesser.isNote(document)) {
document.getMetaData().setDocumentType(Document.Type.NOTICE);
}
} catch (UnsupportedCharsetException e) {
document.getMetaData().setDocumentType(Document.Type.UNKNOWN);
}
}
return result;
} catch (IOException e) {
throw new RatDocumentAnalysisException(e);
}
}
/**
* Determine the character set for the input stream. Input stream must implement {@code mark}.
* @param stream the stream to check.
* @param documentName the name of the document being processed.
* @return the detected character set or {@code null} if not detectable.
* @throws IOException on IO error.
* @throws UnsupportedCharsetException on unsupported charset.
*/
private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException, UnsupportedCharsetException {
final int bytesForCharsetDetection = 256;
CharsetDetector encodingDetector = new CharsetDetector(bytesForCharsetDetection);
encodingDetector.setText(stream);
CharsetMatch charsetMatch = encodingDetector.detect();
if (charsetMatch != null) {
try {
return Charset.forName(charsetMatch.getName());
} catch (UnsupportedCharsetException e) {
DefaultLog.getInstance().warn(String.format("Unsupported character set '%s' in file '%s'",
charsetMatch.getName(), documentName));
throw e;
}
}
return null;
}
/**
* Gets the Document.Type based on the MediaType.
* @param mediaType the media type to check.
* @return The document type.
*/
public static Document.Type fromMediaType(final MediaType mediaType) {
if ("text".equals(mediaType.getType())) {
return Document.Type.STANDARD;
}
Document.Type result = DOCUMENT_TYPE_MAP.get(mediaType.toString());
return result == null ? Document.Type.BINARY : result;
}
}