View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.document.impl.guesser;
20  
21  import org.apache.commons.io.IOUtils;
22  import org.apache.rat.api.Document;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.nio.ByteBuffer;
28  import java.nio.CharBuffer;
29  import java.nio.charset.Charset;
30  import java.nio.charset.CharsetDecoder;
31  import java.nio.charset.CoderResult;
32  import java.nio.charset.CodingErrorAction;
33  import java.nio.charset.UnsupportedCharsetException;
34  import java.util.Locale;
35  
36  /**
37   * TODO: factor into MIME guesser and MIME->binary guesser
38   */
39  public class BinaryGuesser {
40  
41      private static final String DOT = ".";
42  
43      static final String FILE_ENCODING = "file.encoding";
44      private static Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();
45  
46      private static boolean isBinaryDocument(Document document) {
47          boolean result = false;
48          InputStream stream = null;
49          try {
50              stream = document.inputStream();
51              result = isBinary(stream);
52          } catch (IOException e) {
53              result = false;
54          } finally {
55              IOUtils.closeQuietly(stream);
56          }
57          return result;
58      }
59  
60      private static boolean isBinary(CharSequence taste) {
61          int highBytes = 0;
62          final int length = taste.length();
63          for (int i = 0; i < length; i++) {
64              char c = taste.charAt(i);
65              if (c > BinaryGuesser.NON_ASCII_THRESHOLD
66                      || c <= BinaryGuesser.ASCII_CHAR_THRESHOLD) {
67                  highBytes++;
68              }
69          }
70          return highBytes * BinaryGuesser.HIGH_BYTES_RATIO
71                  > length * BinaryGuesser.TOTAL_READ_RATIO;
72      }
73  
74      /**
75       * @param in the file to check.
76       * @return Do the first few bytes of the stream hint at a binary file?
77       * <p>Any IOException is swallowed internally and the test returns
78       * false.</p>
79       * <p>This method may lead to false negatives if the reader throws
80       * an exception because it can't read characters according to the
81       * reader's encoding from the underlying stream.</p>
82       */
83      public static boolean isBinary(Reader in) {
84          char[] taste = new char[100];
85          try {
86              int bytesRead = in.read(taste);
87              if (bytesRead > 0) {
88                  return isBinary(new String(taste, 0, bytesRead));
89              }
90          } catch (IOException e) {
91              // SWALLOW 
92          }
93          return false;
94      }
95  
96      /**
97       * @param in the file to check.
98       * @return Do the first few bytes of the stream hint at a binary file?
99       * <p>Any IOException is swallowed internally and the test returns
100      * false.</p>
101      * <p>This method will try to read bytes from the stream and
102      * translate them to characters according to the platform's
103      * default encoding.  If any bytes can not be translated to
104      * characters it will assume the original data must be binary and
105      * return true.</p>
106      */
107     public static boolean isBinary(InputStream in) {
108         try {
109             byte[] taste = new byte[200];
110             int bytesRead = in.read(taste);
111             if (bytesRead > 0) {
112                 ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
113                 CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
114                 CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
115                         .onMalformedInput(CodingErrorAction.REPORT)
116                         .onUnmappableCharacter(CodingErrorAction.REPORT);
117                 while (bytes.remaining() > 0) {
118                     CoderResult res = cd.decode(bytes, chars, true);
119                     if (res.isMalformed() || res.isUnmappable()) {
120                         return true;
121                     } else if (res.isOverflow()) {
122                         chars.limit(chars.position());
123                         chars.rewind();
124                         int c = chars.capacity() * 2;
125                         CharBuffer on = CharBuffer.allocate(c);
126                         on.put(chars);
127                         chars = on;
128                     }
129                 }
130                 chars.limit(chars.position());
131                 chars.rewind();
132                 return isBinary(chars);
133             }
134         } catch (IOException e) {
135             // SWALLOW 
136         }
137         return false;
138     }
139 
140     static Charset getFileEncodingOrUTF8AsFallback() {
141         try {
142             return Charset.forName(System.getProperty(FILE_ENCODING));
143         } catch (UnsupportedCharsetException e) {
144             return Charset.forName("UTF-8");
145         }
146     }
147 
148     /**
149      * @param name current file name.
150      * @return whether given name is binary.
151      */
152     public static final boolean isBinaryData(final String name) {
153         return extensionMatches(name, DATA_EXTENSIONS);
154     }
155 
156     /**
157      * @param name current file name.
158      * @return Is a file by that name a known non-binary file?
159      */
160     public static final boolean isNonBinary(final String name) {
161         return name != null && extensionMatches(name.toUpperCase(Locale.US), BinaryGuesser.NON_BINARY_EXTENSIONS);
162     }
163 
164     /**
165      * @param name current file name.
166      * @return Is a file by that name an executable/binary file?
167      */
168     public static final boolean isExecutable(final String name) {
169         return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
170                 || containsExtension(name, EXE_EXTENSIONS);
171     }
172 
173     public static boolean containsExtension(final String name,
174                                             final String[] exts) {
175         for (String ext : exts) {
176             if (name.contains(DOT + ext + DOT)) {
177                 return true;
178             }
179         }
180         return false;
181     }
182 
183     public static boolean extensionMatches(final String name,
184                                            final String[] exts) {
185         for (String ext : exts) {
186             if (name.endsWith(DOT + ext)) {
187                 return true;
188             }
189         }
190         return false;
191     }
192 
193     public static boolean isBytecode(final String name) {
194         return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
195     }
196 
197     public static final boolean isImage(final String name) {
198         return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
199     }
200 
201     public static final boolean isKeystore(final String name) {
202         return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
203     }
204 
205     public static final boolean isAudio(final String name) {
206         return BinaryGuesser.extensionMatches( name, AUDIO_EXTENSIONS );
207     }
208 
209     /**
210      * @param name file name.
211      * @return Is a file by that name a known binary file?
212      */
213     public static final boolean isBinary(final String name) {
214         if (name == null) {
215             return false;
216         }
217         String normalisedName = GuessUtils.normalise(name);
218         return BinaryGuesser.JAR_MANIFEST.equalsIgnoreCase(name) || BinaryGuesser.isImage(normalisedName)
219                 || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
220                 || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName)
221                 || BinaryGuesser.isAudio( normalisedName );
222     }
223 
224     private static final String[] DATA_EXTENSIONS = {
225             "DAT", "DOC",
226             "NCB", "IDB",
227             "SUO", "XCF",
228             "RAJ", "CERT",
229             "KS", "ODP", "SWF",
230             // fonts
231             "WOFF2", "WOFF", "TTF", "EOT"
232     };
233 
234     private static final String[] EXE_EXTENSIONS = {
235             "EXE", "DLL",
236             "LIB", "SO",
237             "A", "EXP",
238     };
239 
240     private static final String[] KEYSTORE_EXTENSIONS = {
241             "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
242     };
243 
244     private static final String[] IMAGE_EXTENSIONS = {
245             "PNG", "PDF",
246             "GIF", "GIFF",
247             "TIF", "TIFF",
248             "JPG", "JPEG",
249             "ICO", "ICNS",
250             "PSD",
251     };
252 
253     private static final String[] BYTECODE_EXTENSIONS = {
254             "CLASS", "PYD",
255             "OBJ", "PYC",
256     };
257 
258     private static final String[] AUDIO_EXTENSIONS = {
259             "AIF", "IFF",
260             "M3U", "M4A",
261             "MID", "MP3",
262             "MPA", "WAV",
263             "WMA"
264     };
265     
266     /**
267      * Based on https://www.apache.org/dev/svn-eol-style.txt
268      */
269     private static final String[] NON_BINARY_EXTENSIONS = {
270             "AART",
271             "AC",
272             "AM",
273             "BAT",
274             "C",
275             "CAT",
276             "CGI",
277             "CLASSPATH",
278             "CMD",
279             "CONFIG",
280             "CPP",
281             "CSS",
282             "CWIKI",
283             "DATA",
284             "DCL",
285             "DTD",
286             "EGRM",
287             "ENT",
288             "FT",
289             "FN",
290             "FV",
291             "GRM",
292             "G",
293             "GO",
294             "H",
295             "HTACCESS",
296             "HTML",
297             "IHTML",
298             "IN",
299             "JAVA",
300             "JMX",
301             "JSP",
302             "JS",
303             "JSON",
304             "JUNIT",
305             "JX",
306             "M4",            
307             "MANIFEST",
308             "MD",
309             "MF",
310             "META",
311             "MOD",
312             "N3",
313             "PEN",
314             "PL",
315             "PM",
316             "POD",
317             "POM",
318             "PROJECT",
319             "PROPERTIES",
320             "PY",
321             "RB",
322             "RDF",
323             "RNC",
324             "RNG",
325             "RNX",
326             "ROLES",
327             "RSS",
328             "SH",
329             "SQL",
330             "SVG",
331             "TLD",
332             "TXT",
333             "TYPES",
334             "VM",
335             "VSL",
336             "WSDD",
337             "WSDL",
338             "XARGS",
339             "XCAT",
340             "XCONF",
341             "XEGRM",
342             "XGRM",
343             "XLEX",
344             "XLOG",
345             "XMAP",
346             "XML",
347             "XROLES",
348             "XSAMPLES",
349             "XSD",
350             "XSL",
351             "XSLT",
352             "XSP",
353             "XUL",
354             "XWEB",
355             "XWELCOME",
356     };
357     public static final String JAR_MANIFEST = "MANIFEST.MF";
358     public static final String JAVA = "JAVA";
359     public static final int HIGH_BYTES_RATIO = 100;
360     public static final int TOTAL_READ_RATIO = 30;
361     public static final int NON_ASCII_THRESHOLD = 256;
362     public static final int ASCII_CHAR_THRESHOLD = 8;
363 
364     public static final boolean isBinary(final Document document) {
365         // TODO: reimplement the binary test algorithm?
366         // TODO: more efficient to move into standard analysis
367         // TODO: then use binary as default
368         return isBinary(document.getName())
369                 ||
370                 // try a taste
371                 isBinaryDocument(document);
372     }
373 
374 }