1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.rat.analysis;
20
21 import java.io.BufferedInputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.Charset;
25 import java.nio.charset.UnsupportedCharsetException;
26 import java.util.HashMap;
27 import java.util.Map;
28
29 import org.apache.rat.api.Document;
30 import org.apache.rat.document.DocumentName;
31 import org.apache.rat.document.RatDocumentAnalysisException;
32 import org.apache.rat.document.guesser.NoteGuesser;
33 import org.apache.rat.utils.DefaultLog;
34 import org.apache.tika.Tika;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.TikaCoreProperties;
37 import org.apache.tika.mime.MediaType;
38 import org.apache.tika.parser.txt.CharsetDetector;
39 import org.apache.tika.parser.txt.CharsetMatch;
40
41
42
43
44 public final class TikaProcessor {
45
46
47 private static final Tika TIKA = new Tika();
48
49
50
51
52 private static final Map<String, Document.Type> DOCUMENT_TYPE_MAP;
53
54 static {
55 DOCUMENT_TYPE_MAP = new HashMap<>();
56
57 DOCUMENT_TYPE_MAP.put("application/x-ibooks+zip", Document.Type.ARCHIVE);
58 DOCUMENT_TYPE_MAP.put("application/epub+zip", Document.Type.ARCHIVE);
59
60 DOCUMENT_TYPE_MAP.put("application/vnd.wap.xhtml+xml", Document.Type.STANDARD);
61 DOCUMENT_TYPE_MAP.put("application/x-asp", Document.Type.STANDARD);
62 DOCUMENT_TYPE_MAP.put("application/xhtml+xml", Document.Type.STANDARD);
63
64
65 DOCUMENT_TYPE_MAP.put("application/pdf", Document.Type.STANDARD);
66
67 DOCUMENT_TYPE_MAP.put("application/zlib", Document.Type.ARCHIVE);
68 DOCUMENT_TYPE_MAP.put("application/x-gzip", Document.Type.ARCHIVE);
69 DOCUMENT_TYPE_MAP.put("application/x-bzip2", Document.Type.ARCHIVE);
70 DOCUMENT_TYPE_MAP.put("application/x-compress", Document.Type.ARCHIVE);
71 DOCUMENT_TYPE_MAP.put("application/x-java-pack200", Document.Type.ARCHIVE);
72 DOCUMENT_TYPE_MAP.put("application/x-lzma", Document.Type.ARCHIVE);
73 DOCUMENT_TYPE_MAP.put("application/deflate64", Document.Type.ARCHIVE);
74 DOCUMENT_TYPE_MAP.put("application/x-lz4", Document.Type.ARCHIVE);
75 DOCUMENT_TYPE_MAP.put("application/x-snappy", Document.Type.ARCHIVE);
76 DOCUMENT_TYPE_MAP.put("application/x-brotli", Document.Type.ARCHIVE);
77 DOCUMENT_TYPE_MAP.put("application/gzip", Document.Type.ARCHIVE);
78 DOCUMENT_TYPE_MAP.put("application/x-bzip", Document.Type.ARCHIVE);
79 DOCUMENT_TYPE_MAP.put("application/x-xz", Document.Type.ARCHIVE);
80
81 DOCUMENT_TYPE_MAP.put("application/x-tar", Document.Type.ARCHIVE);
82 DOCUMENT_TYPE_MAP.put("application/java-archive", Document.Type.ARCHIVE);
83 DOCUMENT_TYPE_MAP.put("application/x-arj", Document.Type.ARCHIVE);
84 DOCUMENT_TYPE_MAP.put("application/x-archive", Document.Type.ARCHIVE);
85 DOCUMENT_TYPE_MAP.put("application/zip", Document.Type.ARCHIVE);
86 DOCUMENT_TYPE_MAP.put("application/x-cpio", Document.Type.ARCHIVE);
87 DOCUMENT_TYPE_MAP.put("application/x-tika-unix-dump", Document.Type.ARCHIVE);
88 DOCUMENT_TYPE_MAP.put("application/x-7z-compressed", Document.Type.ARCHIVE);
89
90 DOCUMENT_TYPE_MAP.put("application/x-rar-compressed", Document.Type.ARCHIVE);
91
92
93 DOCUMENT_TYPE_MAP.put("application/x-xliff+xml", Document.Type.STANDARD);
94
95 DOCUMENT_TYPE_MAP.put("application/x-xliff+zip", Document.Type.ARCHIVE);
96
97 DOCUMENT_TYPE_MAP.put("application/xml", Document.Type.STANDARD);
98 DOCUMENT_TYPE_MAP.put("image/svg+xml", Document.Type.STANDARD);
99
100 DOCUMENT_TYPE_MAP.put("application/x-fictionbook+xml", Document.Type.STANDARD);
101 }
102
103 private TikaProcessor() {
104
105 }
106
107
108
109
110
111
112 static Map<String, Document.Type> getDocumentTypeMap() {
113 return new HashMap<>(DOCUMENT_TYPE_MAP);
114 }
115
116
117
118
119
120
121 public static InputStream markSupportedInputStream(final InputStream stream) {
122 return stream.markSupported() ? stream : new BufferedInputStream(stream);
123 }
124
125
126
127
128
129
130
131 public static String process(final Document document) throws RatDocumentAnalysisException {
132 try (InputStream stream = markSupportedInputStream(document.inputStream())) {
133 Metadata metadata = new Metadata();
134 metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, document.getName().getShortName());
135 String result = TIKA.detect(stream, metadata);
136 String[] parts = result.split("/");
137 MediaType mediaType = new MediaType(parts[0], parts[1]);
138 document.getMetaData().setMediaType(mediaType);
139 if (!document.isIgnored()) {
140 document.getMetaData()
141 .setDocumentType(fromMediaType(mediaType));
142 }
143 if (Document.Type.STANDARD == document.getMetaData().getDocumentType()) {
144 document.getMetaData().setCharset(detectCharset(stream, document.getName()));
145 if (NoteGuesser.isNote(document)) {
146 document.getMetaData().setDocumentType(Document.Type.NOTICE);
147 }
148 }
149 return result;
150 } catch (IOException e) {
151 throw new RatDocumentAnalysisException(e);
152 }
153 }
154
155
156
157
158
159
160
161
162 private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException {
163 CharsetDetector encodingDetector = new CharsetDetector();
164 encodingDetector.setText(stream);
165 CharsetMatch charsetMatch = encodingDetector.detect();
166 if (charsetMatch != null) {
167 try {
168 return Charset.forName(charsetMatch.getName());
169 } catch (UnsupportedCharsetException e) {
170 DefaultLog.getInstance().warn(String.format("Unsupported character set '%s' in file '%s'. Will use system default encoding.",
171 charsetMatch.getName(), documentName));
172 }
173 }
174 return null;
175 }
176
177
178
179
180
181
182 public static Document.Type fromMediaType(final MediaType mediaType) {
183 if ("text".equals(mediaType.getType())) {
184 return Document.Type.STANDARD;
185 }
186
187 Document.Type result = DOCUMENT_TYPE_MAP.get(mediaType.toString());
188 return result == null ? Document.Type.BINARY : result;
189 }
190 }