1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.rat.document.impl.guesser;
20
21 import org.apache.commons.io.IOUtils;
22 import org.apache.rat.api.Document;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.nio.ByteBuffer;
28 import java.nio.CharBuffer;
29 import java.nio.charset.*;
30 import java.util.Locale;
31
32
33
34
35 public class BinaryGuesser {
36
37 private static final String DOT = ".";
38
39 static final String FILE_ENCODING = "file.encoding";
40 private static final Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();
41
42 private static boolean isBinaryDocument(Document document) {
43 boolean result;
44 InputStream stream = null;
45 try {
46 stream = document.inputStream();
47 result = isBinary(stream);
48 } catch (IOException e) {
49 result = false;
50 } finally {
51 IOUtils.closeQuietly(stream);
52 }
53 return result;
54 }
55
56 private static boolean isBinary(CharSequence taste) {
57 int highBytes = 0;
58 final int length = taste.length();
59 for (int i = 0; i < length; i++) {
60 char c = taste.charAt(i);
61 if (c > BinaryGuesser.NON_ASCII_THRESHOLD
62 || c <= BinaryGuesser.ASCII_CHAR_THRESHOLD) {
63 highBytes++;
64 }
65 }
66 return highBytes * BinaryGuesser.HIGH_BYTES_RATIO
67 > length * BinaryGuesser.TOTAL_READ_RATIO;
68 }
69
70
71
72
73
74
75
76
77
78
79 public static boolean isBinary(Reader in) {
80 char[] taste = new char[100];
81 try {
82 int bytesRead = in.read(taste);
83 if (bytesRead > 0) {
84 return isBinary(new String(taste, 0, bytesRead));
85 }
86 } catch (IOException e) {
87
88 }
89 return false;
90 }
91
92
93
94
95
96
97
98
99
100
101
102
103 public static boolean isBinary(InputStream in) {
104 try {
105 byte[] taste = new byte[200];
106 int bytesRead = in.read(taste);
107 if (bytesRead > 0) {
108 ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
109 CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
110 CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
111 .onMalformedInput(CodingErrorAction.REPORT)
112 .onUnmappableCharacter(CodingErrorAction.REPORT);
113 while (bytes.remaining() > 0) {
114 CoderResult res = cd.decode(bytes, chars, true);
115 if (res.isMalformed() || res.isUnmappable()) {
116 return true;
117 } else if (res.isOverflow()) {
118 chars.limit(chars.position());
119 chars.rewind();
120 int c = chars.capacity() * 2;
121 CharBuffer on = CharBuffer.allocate(c);
122 on.put(chars);
123 chars = on;
124 }
125 }
126 chars.limit(chars.position());
127 chars.rewind();
128 return isBinary(chars);
129 }
130 } catch (IOException e) {
131
132 }
133 return false;
134 }
135
136 static Charset getFileEncodingOrUTF8AsFallback() {
137 try {
138 return Charset.forName(System.getProperty(FILE_ENCODING));
139 } catch (UnsupportedCharsetException e) {
140 return StandardCharsets.UTF_8;
141 }
142 }
143
144
145
146
147
148 public static boolean isBinaryData(final String name) {
149 return extensionMatches(name, DATA_EXTENSIONS);
150 }
151
152
153
154
155
156 public static boolean isNonBinary(final String name) {
157 return name != null && extensionMatches(name.toUpperCase(Locale.US), BinaryGuesser.NON_BINARY_EXTENSIONS);
158 }
159
160
161
162
163
164 public static boolean isExecutable(final String name) {
165 return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
166 || containsExtension(name, EXE_EXTENSIONS);
167 }
168
169 public static boolean containsExtension(final String name,
170 final String[] exts) {
171 for (String ext : exts) {
172 if (name.contains(DOT + ext + DOT)) {
173 return true;
174 }
175 }
176 return false;
177 }
178
179 public static boolean extensionMatches(final String name,
180 final String[] exts) {
181 for (String ext : exts) {
182 if (name.endsWith(DOT + ext)) {
183 return true;
184 }
185 }
186 return false;
187 }
188
189 public static boolean isBytecode(final String name) {
190 return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
191 }
192
193 public static boolean isImage(final String name) {
194 return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
195 }
196
197 public static boolean isKeystore(final String name) {
198 return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
199 }
200
201 public static boolean isAudio(final String name) {
202 return BinaryGuesser.extensionMatches( name, AUDIO_EXTENSIONS );
203 }
204
205
206
207
208
209 public static boolean isBinary(final String name) {
210 if (name == null) {
211 return false;
212 }
213 String normalisedName = GuessUtils.normalise(name);
214 return BinaryGuesser.JAR_MANIFEST.equalsIgnoreCase(name) || BinaryGuesser.isImage(normalisedName)
215 || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
216 || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName)
217 || BinaryGuesser.isAudio( normalisedName );
218 }
219
220 private static final String[] DATA_EXTENSIONS = {
221 "DAT", "DOC",
222 "NCB", "IDB",
223 "SUO", "XCF",
224 "RAJ", "CERT",
225 "KS", "ODP", "SWF",
226
227 "WOFF2", "WOFF", "TTF", "EOT",
228
229 "JSON"
230 };
231
232 private static final String[] EXE_EXTENSIONS = {
233 "EXE", "DLL",
234 "LIB", "SO",
235 "A", "EXP",
236 };
237
238 private static final String[] KEYSTORE_EXTENSIONS = {
239 "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
240 };
241
242 private static final String[] IMAGE_EXTENSIONS = {
243 "PNG", "PDF",
244 "GIF", "GIFF",
245 "TIF", "TIFF",
246 "JPG", "JPEG",
247 "ICO", "ICNS",
248 "PSD",
249 };
250
251 private static final String[] BYTECODE_EXTENSIONS = {
252 "CLASS", "PYD",
253 "OBJ", "PYC",
254 };
255
256 private static final String[] AUDIO_EXTENSIONS = {
257 "AIF", "IFF",
258 "M3U", "M4A",
259 "MID", "MP3",
260 "MPA", "WAV",
261 "WMA"
262 };
263
264
265
266
267 private static final String[] NON_BINARY_EXTENSIONS = {
268 "AART",
269 "AC",
270 "AM",
271 "BAT",
272 "C",
273 "CAT",
274 "CGI",
275 "CLASSPATH",
276 "CMD",
277 "CONFIG",
278 "CPP",
279 "CSS",
280 "CWIKI",
281 "DATA",
282 "DCL",
283 "DTD",
284 "EGRM",
285 "ENT",
286 "FT",
287 "FN",
288 "FV",
289 "GRM",
290 "G",
291 "GO",
292 "H",
293 "HTACCESS",
294 "HTML",
295 "IHTML",
296 "IN",
297 "JAVA",
298 "JMX",
299 "JSP",
300 "JS",
301 "JSON",
302 "JUNIT",
303 "JX",
304 "M4",
305 "MANIFEST",
306 "MD",
307 "MF",
308 "META",
309 "MOD",
310 "N3",
311 "PEN",
312 "PL",
313 "PM",
314 "POD",
315 "POM",
316 "PROJECT",
317 "PROPERTIES",
318 "PY",
319 "RB",
320 "RDF",
321 "RNC",
322 "RNG",
323 "RNX",
324 "ROLES",
325 "RSS",
326 "SH",
327 "SQL",
328 "SVG",
329 "TLD",
330 "TXT",
331 "TYPES",
332 "VM",
333 "VSL",
334 "WSDD",
335 "WSDL",
336 "XARGS",
337 "XCAT",
338 "XCONF",
339 "XEGRM",
340 "XGRM",
341 "XLEX",
342 "XLOG",
343 "XMAP",
344 "XML",
345 "XROLES",
346 "XSAMPLES",
347 "XSD",
348 "XSL",
349 "XSLT",
350 "XSP",
351 "XUL",
352 "XWEB",
353 "XWELCOME",
354 };
355 public static final String JAR_MANIFEST = "MANIFEST.MF";
356 public static final String JAVA = "JAVA";
357 public static final int HIGH_BYTES_RATIO = 100;
358 public static final int TOTAL_READ_RATIO = 30;
359 public static final int NON_ASCII_THRESHOLD = 256;
360 public static final int ASCII_CHAR_THRESHOLD = 8;
361
362 public static boolean isBinary(final Document document) {
363
364
365
366 return isBinary(document.getName())
367 ||
368
369 isBinaryDocument(document);
370 }
371
372 }