View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.analysis;
20  
21  import java.io.BufferedInputStream;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.nio.charset.Charset;
25  import java.nio.charset.UnsupportedCharsetException;
26  import java.util.HashMap;
27  import java.util.Map;
28  
29  import org.apache.rat.api.Document;
30  import org.apache.rat.document.DocumentName;
31  import org.apache.rat.document.RatDocumentAnalysisException;
32  import org.apache.rat.document.guesser.NoteGuesser;
33  import org.apache.rat.utils.DefaultLog;
34  import org.apache.tika.Tika;
35  import org.apache.tika.metadata.Metadata;
36  import org.apache.tika.metadata.TikaCoreProperties;
37  import org.apache.tika.mime.MediaType;
38  import org.apache.tika.parser.txt.CharsetDetector;
39  import org.apache.tika.parser.txt.CharsetMatch;
40  
41  /**
42   * A wrapping around the Tika processor.
43   */
44  public final class TikaProcessor {
45  
46      /** The Tika parser */
47      private static final Tika TIKA = new Tika();
48      /** A map of mime type string to non-BINARY types.
49       * "text" types are already handled somewhere else
50       * BINARY unless listed here
51       */
52      private static final Map<String, Document.Type> DOCUMENT_TYPE_MAP;
53  
54      static {
55          DOCUMENT_TYPE_MAP = new HashMap<>();
56  //        org.apache.tika.parser.epub.EpubParser
57          DOCUMENT_TYPE_MAP.put("application/x-ibooks+zip", Document.Type.ARCHIVE);
58          DOCUMENT_TYPE_MAP.put("application/epub+zip", Document.Type.ARCHIVE);
59  
60          DOCUMENT_TYPE_MAP.put("application/vnd.wap.xhtml+xml", Document.Type.STANDARD);
61          DOCUMENT_TYPE_MAP.put("application/x-asp", Document.Type.STANDARD);
62          DOCUMENT_TYPE_MAP.put("application/xhtml+xml", Document.Type.STANDARD);
63  
64  //        org.apache.tika.parser.pdf.PDFParser", Type.BINARY);
65          DOCUMENT_TYPE_MAP.put("application/pdf", Document.Type.STANDARD);
66  //org.apache.tika.parser.pkg.CompressorParser
67          DOCUMENT_TYPE_MAP.put("application/zlib", Document.Type.ARCHIVE);
68          DOCUMENT_TYPE_MAP.put("application/x-gzip", Document.Type.ARCHIVE);
69          DOCUMENT_TYPE_MAP.put("application/x-bzip2", Document.Type.ARCHIVE);
70          DOCUMENT_TYPE_MAP.put("application/x-compress", Document.Type.ARCHIVE);
71          DOCUMENT_TYPE_MAP.put("application/x-java-pack200", Document.Type.ARCHIVE);
72          DOCUMENT_TYPE_MAP.put("application/x-lzma", Document.Type.ARCHIVE);
73          DOCUMENT_TYPE_MAP.put("application/deflate64", Document.Type.ARCHIVE);
74          DOCUMENT_TYPE_MAP.put("application/x-lz4", Document.Type.ARCHIVE);
75          DOCUMENT_TYPE_MAP.put("application/x-snappy", Document.Type.ARCHIVE);
76          DOCUMENT_TYPE_MAP.put("application/x-brotli", Document.Type.ARCHIVE);
77          DOCUMENT_TYPE_MAP.put("application/gzip", Document.Type.ARCHIVE);
78          DOCUMENT_TYPE_MAP.put("application/x-bzip", Document.Type.ARCHIVE);
79          DOCUMENT_TYPE_MAP.put("application/x-xz", Document.Type.ARCHIVE);
80  //org.apache.tika.parser.pkg.PackageParser
81          DOCUMENT_TYPE_MAP.put("application/x-tar", Document.Type.ARCHIVE);
82          DOCUMENT_TYPE_MAP.put("application/java-archive", Document.Type.ARCHIVE);
83          DOCUMENT_TYPE_MAP.put("application/x-arj", Document.Type.ARCHIVE);
84          DOCUMENT_TYPE_MAP.put("application/x-archive", Document.Type.ARCHIVE);
85          DOCUMENT_TYPE_MAP.put("application/zip", Document.Type.ARCHIVE);
86          DOCUMENT_TYPE_MAP.put("application/x-cpio", Document.Type.ARCHIVE);
87          DOCUMENT_TYPE_MAP.put("application/x-tika-unix-dump", Document.Type.ARCHIVE);
88          DOCUMENT_TYPE_MAP.put("application/x-7z-compressed", Document.Type.ARCHIVE);
89  //org.apache.tika.parser.pkg.RarParser
90          DOCUMENT_TYPE_MAP.put("application/x-rar-compressed", Document.Type.ARCHIVE);
91  
92  //        org.apache.tika.parser.xliff.XLIFF12Parser
93          DOCUMENT_TYPE_MAP.put("application/x-xliff+xml", Document.Type.STANDARD);
94  //        org.apache.tika.parser.xliff.XLZParser
95          DOCUMENT_TYPE_MAP.put("application/x-xliff+zip", Document.Type.ARCHIVE);
96  //        org.apache.tika.parser.xml.DcXMLParser
97          DOCUMENT_TYPE_MAP.put("application/xml", Document.Type.STANDARD);
98          DOCUMENT_TYPE_MAP.put("image/svg+xml", Document.Type.STANDARD);
99  //        org.apache.tika.parser.xml.FictionBookParser
100         DOCUMENT_TYPE_MAP.put("application/x-fictionbook+xml", Document.Type.STANDARD);
101     }
102 
103     private TikaProcessor() {
104         // do not instantiate
105     }
106 
107     /**
108      * Creates a copy of the document type map.
109      * Exposed for testing.
110      * @return a copy of the document type map.
111      */
112     static Map<String, Document.Type> getDocumentTypeMap() {
113         return new HashMap<>(DOCUMENT_TYPE_MAP);
114     }
115 
116     /**
117      * Ensures that the input stream supports {@code mark}.
118      * @param stream the stream to test.
119      * @return a stream that supports {@code mark}.
120      */
121     public static InputStream markSupportedInputStream(final InputStream stream) {
122         return stream.markSupported() ? stream : new BufferedInputStream(stream);
123     }
124 
125     /**
126      * Process the input document.
127      * @param document the Document to process.
128      * @return the mimetype as a string.
129      * @throws RatDocumentAnalysisException on error.
130      */
131     public static String process(final Document document) throws RatDocumentAnalysisException {
132         try (InputStream stream = markSupportedInputStream(document.inputStream())) {
133             Metadata metadata = new Metadata();
134             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, document.getName().getShortName());
135             String result = TIKA.detect(stream, metadata);
136             String[] parts = result.split("/");
137             MediaType mediaType = new MediaType(parts[0], parts[1]);
138             document.getMetaData().setMediaType(mediaType);
139             if (!document.isIgnored()) {
140                 document.getMetaData()
141                         .setDocumentType(fromMediaType(mediaType));
142             }
143             if (Document.Type.STANDARD == document.getMetaData().getDocumentType()) {
144                 document.getMetaData().setCharset(detectCharset(stream, document.getName()));
145                 if (NoteGuesser.isNote(document)) {
146                     document.getMetaData().setDocumentType(Document.Type.NOTICE);
147                 }
148             }
149             return result;
150         } catch (IOException e) {
151             throw new RatDocumentAnalysisException(e);
152         }
153     }
154 
155     /**
156      * Determine the character set for the input stream. Input stream must implement {@code mark}.
157      * @param stream the stream to check.
158      * @param documentName the name of the document being read.
159      * @return the detected character set or {@code null} if not detectable.
160      * @throws IOException on IO error.
161      */
162     private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException {
163         CharsetDetector encodingDetector = new CharsetDetector();
164         encodingDetector.setText(stream);
165         CharsetMatch charsetMatch = encodingDetector.detect();
166         if (charsetMatch != null) {
167             try {
168                 return Charset.forName(charsetMatch.getName());
169             } catch (UnsupportedCharsetException e) {
170                 DefaultLog.getInstance().warn(String.format("Unsupported character set '%s' in file '%s'.  Will use system default encoding.",
171                                 charsetMatch.getName(), documentName));
172             }
173         }
174         return null;
175     }
176 
177     /**
178      * Gets the Document.Type based on the MediaType.
179      * @param mediaType the media type to check.
180      * @return The document type.
181      */
182     public static Document.Type fromMediaType(final MediaType mediaType) {
183         if ("text".equals(mediaType.getType())) {
184             return Document.Type.STANDARD;
185         }
186 
187         Document.Type result = DOCUMENT_TYPE_MAP.get(mediaType.toString());
188         return result == null ? Document.Type.BINARY : result;
189     }
190 }