View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.analysis;
20  
21  import org.apache.rat.api.Document;
22  import org.apache.rat.document.RatDocumentAnalysisException;
23  import org.apache.rat.document.impl.guesser.NoteGuesser;
24  import org.apache.rat.utils.Log;
25  import org.apache.tika.Tika;
26  import org.apache.tika.metadata.Metadata;
27  import org.apache.tika.metadata.TikaCoreProperties;
28  import org.apache.tika.mime.MediaType;
29  
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.util.HashMap;
33  import java.util.Map;
34  
35  /**
36   * A wrapping around the tika processor.
37   */
38  public class TikaProcessor {
39  
40      /** the Tika parser */
41      private static final Tika TIKA = new Tika();
42      /** A map of mime type string to non BINARY types.
43       * "text" types are already handled everything else
44       * BINARY unless listed here*/
45      private static Map<String, Document.Type> documentTypeMap;
46  
47      static {
48          documentTypeMap = new HashMap<>();
49  //        org.apache.tika.parser.epub.EpubParser
50          documentTypeMap.put("application/x-ibooks+zip", Document.Type.ARCHIVE);
51          documentTypeMap.put("application/epub+zip", Document.Type.ARCHIVE);
52  
53          documentTypeMap.put("application/vnd.wap.xhtml+xml", Document.Type.STANDARD);
54          documentTypeMap.put("application/x-asp", Document.Type.STANDARD);
55          documentTypeMap.put("application/xhtml+xml", Document.Type.STANDARD);
56  
57  //        org.apache.tika.parser.pdf.PDFParser", Type.BINARY);
58          documentTypeMap.put("application/pdf", Document.Type.STANDARD);
59  //org.apache.tika.parser.pkg.CompressorParser
60          documentTypeMap.put("application/zlib", Document.Type.ARCHIVE);
61          documentTypeMap.put("application/x-gzip", Document.Type.ARCHIVE);
62          documentTypeMap.put("application/x-bzip2", Document.Type.ARCHIVE);
63          documentTypeMap.put("application/x-compress", Document.Type.ARCHIVE);
64          documentTypeMap.put("application/x-java-pack200", Document.Type.ARCHIVE);
65          documentTypeMap.put("application/x-lzma", Document.Type.ARCHIVE);
66          documentTypeMap.put("application/deflate64", Document.Type.ARCHIVE);
67          documentTypeMap.put("application/x-lz4", Document.Type.ARCHIVE);
68          documentTypeMap.put("application/x-snappy", Document.Type.ARCHIVE);
69          documentTypeMap.put("application/x-brotli", Document.Type.ARCHIVE);
70          documentTypeMap.put("application/gzip", Document.Type.ARCHIVE);
71          documentTypeMap.put("application/x-bzip", Document.Type.ARCHIVE);
72          documentTypeMap.put("application/x-xz", Document.Type.ARCHIVE);
73  //org.apache.tika.parser.pkg.PackageParser
74          documentTypeMap.put("application/x-tar", Document.Type.ARCHIVE);
75          documentTypeMap.put("application/java-archive", Document.Type.ARCHIVE);
76          documentTypeMap.put("application/x-arj", Document.Type.ARCHIVE);
77          documentTypeMap.put("application/x-archive", Document.Type.ARCHIVE);
78          documentTypeMap.put("application/zip", Document.Type.ARCHIVE);
79          documentTypeMap.put("application/x-cpio", Document.Type.ARCHIVE);
80          documentTypeMap.put("application/x-tika-unix-dump", Document.Type.ARCHIVE);
81          documentTypeMap.put("application/x-7z-compressed", Document.Type.ARCHIVE);
82  //org.apache.tika.parser.pkg.RarParser
83          documentTypeMap.put("application/x-rar-compressed", Document.Type.ARCHIVE);
84  
85  //        org.apache.tika.parser.xliff.XLIFF12Parser
86          documentTypeMap.put("application/x-xliff+xml", Document.Type.STANDARD);
87  //        org.apache.tika.parser.xliff.XLZParser
88          documentTypeMap.put("application/x-xliff+zip", Document.Type.ARCHIVE);
89  //        org.apache.tika.parser.xml.DcXMLParser
90          documentTypeMap.put("application/xml", Document.Type.STANDARD);
91          documentTypeMap.put("image/svg+xml", Document.Type.STANDARD);
92  //        org.apache.tika.parser.xml.FictionBookParser
93          documentTypeMap.put("application/x-fictionbook+xml", Document.Type.STANDARD);
94      }
95  
96      /**
97       * Creates a copy of the document type map.
98       * Exposed for testing.
99       * @return a copy of the document type map.
100      */
101     static Map<String, Document.Type> getDocumentTypeMap() {
102         return new HashMap<>(documentTypeMap);
103     }
104 
105     /**
106      * Process the input document.
107      * @param log the log for messages.
108      * @param document the Document to process.
109      * @return the mimetype as a string.
110      * @throws RatDocumentAnalysisException on error.
111      */
112     public static String process(final Log log, final Document document) throws RatDocumentAnalysisException {
113         Metadata metadata = new Metadata();
114         try (InputStream stream = document.inputStream()) {
115             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, document.getName());
116             String result = TIKA.detect(stream, metadata);
117             String[] parts = result.split("/");
118             MediaType mediaType = new MediaType(parts[0], parts[1]);
119             document.getMetaData().setMediaType(mediaType);
120             document.getMetaData()
121                     .setDocumentType(fromMediaType(mediaType, log));
122             if (Document.Type.STANDARD == document.getMetaData().getDocumentType()) {
123                 if (NoteGuesser.isNote(document)) {
124                     document.getMetaData().setDocumentType(Document.Type.NOTICE);
125                 }
126             }
127 
128             return result;
129         } catch (IOException /* | SAXException | TikaException */ e) {
130             throw new RatDocumentAnalysisException(e);
131         }
132     }
133 
134     public static Document.Type fromMediaType(final MediaType mediaType, final Log log) {
135         if ("text".equals(mediaType.getType())) {
136             return Document.Type.STANDARD;
137         }
138 
139         Document.Type result = documentTypeMap.get(mediaType.toString());
140         return result == null ? Document.Type.BINARY : result;
141     }
142 }