1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.rat.analysis;
20
21 import org.apache.rat.api.Document;
22 import org.apache.rat.document.RatDocumentAnalysisException;
23 import org.apache.rat.document.impl.guesser.NoteGuesser;
24 import org.apache.rat.utils.Log;
25 import org.apache.tika.Tika;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.metadata.TikaCoreProperties;
28 import org.apache.tika.mime.MediaType;
29
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.util.HashMap;
33 import java.util.Map;
34
35
36
37
38 public class TikaProcessor {
39
40
41 private static final Tika TIKA = new Tika();
42
43
44
45 private static Map<String, Document.Type> documentTypeMap;
46
47 static {
48 documentTypeMap = new HashMap<>();
49
50 documentTypeMap.put("application/x-ibooks+zip", Document.Type.ARCHIVE);
51 documentTypeMap.put("application/epub+zip", Document.Type.ARCHIVE);
52
53 documentTypeMap.put("application/vnd.wap.xhtml+xml", Document.Type.STANDARD);
54 documentTypeMap.put("application/x-asp", Document.Type.STANDARD);
55 documentTypeMap.put("application/xhtml+xml", Document.Type.STANDARD);
56
57
58 documentTypeMap.put("application/pdf", Document.Type.STANDARD);
59
60 documentTypeMap.put("application/zlib", Document.Type.ARCHIVE);
61 documentTypeMap.put("application/x-gzip", Document.Type.ARCHIVE);
62 documentTypeMap.put("application/x-bzip2", Document.Type.ARCHIVE);
63 documentTypeMap.put("application/x-compress", Document.Type.ARCHIVE);
64 documentTypeMap.put("application/x-java-pack200", Document.Type.ARCHIVE);
65 documentTypeMap.put("application/x-lzma", Document.Type.ARCHIVE);
66 documentTypeMap.put("application/deflate64", Document.Type.ARCHIVE);
67 documentTypeMap.put("application/x-lz4", Document.Type.ARCHIVE);
68 documentTypeMap.put("application/x-snappy", Document.Type.ARCHIVE);
69 documentTypeMap.put("application/x-brotli", Document.Type.ARCHIVE);
70 documentTypeMap.put("application/gzip", Document.Type.ARCHIVE);
71 documentTypeMap.put("application/x-bzip", Document.Type.ARCHIVE);
72 documentTypeMap.put("application/x-xz", Document.Type.ARCHIVE);
73
74 documentTypeMap.put("application/x-tar", Document.Type.ARCHIVE);
75 documentTypeMap.put("application/java-archive", Document.Type.ARCHIVE);
76 documentTypeMap.put("application/x-arj", Document.Type.ARCHIVE);
77 documentTypeMap.put("application/x-archive", Document.Type.ARCHIVE);
78 documentTypeMap.put("application/zip", Document.Type.ARCHIVE);
79 documentTypeMap.put("application/x-cpio", Document.Type.ARCHIVE);
80 documentTypeMap.put("application/x-tika-unix-dump", Document.Type.ARCHIVE);
81 documentTypeMap.put("application/x-7z-compressed", Document.Type.ARCHIVE);
82
83 documentTypeMap.put("application/x-rar-compressed", Document.Type.ARCHIVE);
84
85
86 documentTypeMap.put("application/x-xliff+xml", Document.Type.STANDARD);
87
88 documentTypeMap.put("application/x-xliff+zip", Document.Type.ARCHIVE);
89
90 documentTypeMap.put("application/xml", Document.Type.STANDARD);
91 documentTypeMap.put("image/svg+xml", Document.Type.STANDARD);
92
93 documentTypeMap.put("application/x-fictionbook+xml", Document.Type.STANDARD);
94 }
95
96
97
98
99
100
101 static Map<String, Document.Type> getDocumentTypeMap() {
102 return new HashMap<>(documentTypeMap);
103 }
104
105
106
107
108
109
110
111
112 public static String process(final Log log, final Document document) throws RatDocumentAnalysisException {
113 Metadata metadata = new Metadata();
114 try (InputStream stream = document.inputStream()) {
115 metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, document.getName());
116 String result = TIKA.detect(stream, metadata);
117 String[] parts = result.split("/");
118 MediaType mediaType = new MediaType(parts[0], parts[1]);
119 document.getMetaData().setMediaType(mediaType);
120 document.getMetaData()
121 .setDocumentType(fromMediaType(mediaType, log));
122 if (Document.Type.STANDARD == document.getMetaData().getDocumentType()) {
123 if (NoteGuesser.isNote(document)) {
124 document.getMetaData().setDocumentType(Document.Type.NOTICE);
125 }
126 }
127
128 return result;
129 } catch (IOException e) {
130 throw new RatDocumentAnalysisException(e);
131 }
132 }
133
134 public static Document.Type fromMediaType(final MediaType mediaType, final Log log) {
135 if ("text".equals(mediaType.getType())) {
136 return Document.Type.STANDARD;
137 }
138
139 Document.Type result = documentTypeMap.get(mediaType.toString());
140 return result == null ? Document.Type.BINARY : result;
141 }
142 }