View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.document.impl.guesser;
20  
21  import org.apache.commons.io.IOUtils;
22  import org.apache.rat.api.Document;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.nio.ByteBuffer;
28  import java.nio.CharBuffer;
29  import java.nio.charset.*;
30  import java.util.Locale;
31  
32  /**
33   * TODO: factor into MIME guesser and MIME->binary guesser
34   */
35  public class BinaryGuesser {
36  
37      private static final String DOT = ".";
38  
39      static final String FILE_ENCODING = "file.encoding";
40      private static final Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();
41  
42      private static boolean isBinaryDocument(Document document) {
43          boolean result;
44          InputStream stream = null;
45          try {
46              stream = document.inputStream();
47              result = isBinary(stream);
48          } catch (IOException e) {
49              result = false;
50          } finally {
51              IOUtils.closeQuietly(stream);
52          }
53          return result;
54      }
55  
56      private static boolean isBinary(CharSequence taste) {
57          int highBytes = 0;
58          final int length = taste.length();
59          for (int i = 0; i < length; i++) {
60              char c = taste.charAt(i);
61              if (c > BinaryGuesser.NON_ASCII_THRESHOLD
62                      || c <= BinaryGuesser.ASCII_CHAR_THRESHOLD) {
63                  highBytes++;
64              }
65          }
66          return highBytes * BinaryGuesser.HIGH_BYTES_RATIO
67                  > length * BinaryGuesser.TOTAL_READ_RATIO;
68      }
69  
70      /**
71       * @param in the file to check.
72       * @return Do the first few bytes of the stream hint at a binary file?
73       * <p>Any IOException is swallowed internally and the test returns
74       * false.</p>
75       * <p>This method may lead to false negatives if the reader throws
76       * an exception because it can't read characters according to the
77       * reader's encoding from the underlying stream.</p>
78       */
79      public static boolean isBinary(Reader in) {
80          char[] taste = new char[100];
81          try {
82              int bytesRead = in.read(taste);
83              if (bytesRead > 0) {
84                  return isBinary(new String(taste, 0, bytesRead));
85              }
86          } catch (IOException e) {
87              // SWALLOW 
88          }
89          return false;
90      }
91  
92      /**
93       * @param in the file to check.
94       * @return Do the first few bytes of the stream hint at a binary file?
95       * <p>Any IOException is swallowed internally and the test returns
96       * false.</p>
97       * <p>This method will try to read bytes from the stream and
98       * translate them to characters according to the platform's
99       * default encoding.  If any bytes can not be translated to
100      * characters it will assume the original data must be binary and
101      * return true.</p>
102      */
103     public static boolean isBinary(InputStream in) {
104         try {
105             byte[] taste = new byte[200];
106             int bytesRead = in.read(taste);
107             if (bytesRead > 0) {
108                 ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
109                 CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
110                 CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
111                         .onMalformedInput(CodingErrorAction.REPORT)
112                         .onUnmappableCharacter(CodingErrorAction.REPORT);
113                 while (bytes.remaining() > 0) {
114                     CoderResult res = cd.decode(bytes, chars, true);
115                     if (res.isMalformed() || res.isUnmappable()) {
116                         return true;
117                     } else if (res.isOverflow()) {
118                         chars.limit(chars.position());
119                         chars.rewind();
120                         int c = chars.capacity() * 2;
121                         CharBuffer on = CharBuffer.allocate(c);
122                         on.put(chars);
123                         chars = on;
124                     }
125                 }
126                 chars.limit(chars.position());
127                 chars.rewind();
128                 return isBinary(chars);
129             }
130         } catch (IOException e) {
131             // SWALLOW 
132         }
133         return false;
134     }
135 
136     static Charset getFileEncodingOrUTF8AsFallback() {
137         try {
138             return Charset.forName(System.getProperty(FILE_ENCODING));
139         } catch (UnsupportedCharsetException e) {
140             return StandardCharsets.UTF_8;
141         }
142     }
143 
144     /**
145      * @param name current file name.
146      * @return whether given name is binary.
147      */
148     public static boolean isBinaryData(final String name) {
149         return extensionMatches(name, DATA_EXTENSIONS);
150     }
151 
152     /**
153      * @param name current file name.
154      * @return Is a file by that name a known non-binary file?
155      */
156     public static boolean isNonBinary(final String name) {
157         return name != null && extensionMatches(name.toUpperCase(Locale.US), BinaryGuesser.NON_BINARY_EXTENSIONS);
158     }
159 
160     /**
161      * @param name current file name.
162      * @return Is a file by that name an executable/binary file?
163      */
164     public static boolean isExecutable(final String name) {
165         return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
166                 || containsExtension(name, EXE_EXTENSIONS);
167     }
168 
169     public static boolean containsExtension(final String name,
170                                             final String[] exts) {
171         for (String ext : exts) {
172             if (name.contains(DOT + ext + DOT)) {
173                 return true;
174             }
175         }
176         return false;
177     }
178 
179     public static boolean extensionMatches(final String name,
180                                            final String[] exts) {
181         for (String ext : exts) {
182             if (name.endsWith(DOT + ext)) {
183                 return true;
184             }
185         }
186         return false;
187     }
188 
189     public static boolean isBytecode(final String name) {
190         return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
191     }
192 
193     public static boolean isImage(final String name) {
194         return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
195     }
196 
197     public static boolean isKeystore(final String name) {
198         return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
199     }
200 
201     public static boolean isAudio(final String name) {
202         return BinaryGuesser.extensionMatches( name, AUDIO_EXTENSIONS );
203     }
204 
205     /**
206      * @param name file name.
207      * @return Is a file by that name a known binary file?
208      */
209     public static boolean isBinary(final String name) {
210         if (name == null) {
211             return false;
212         }
213         String normalisedName = GuessUtils.normalise(name);
214         return BinaryGuesser.JAR_MANIFEST.equalsIgnoreCase(name) || BinaryGuesser.isImage(normalisedName)
215                 || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
216                 || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName)
217                 || BinaryGuesser.isAudio( normalisedName );
218     }
219 
220     private static final String[] DATA_EXTENSIONS = {
221             "DAT", "DOC",
222             "NCB", "IDB",
223             "SUO", "XCF",
224             "RAJ", "CERT",
225             "KS", "ODP", "SWF",
226             // fonts
227             "WOFF2", "WOFF", "TTF", "EOT",
228             // JSON structure does not allow comments/license injections in the way RAT expects it
229             "JSON"
230     };
231 
232     private static final String[] EXE_EXTENSIONS = {
233             "EXE", "DLL",
234             "LIB", "SO",
235             "A", "EXP",
236     };
237 
238     private static final String[] KEYSTORE_EXTENSIONS = {
239             "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
240     };
241 
242     private static final String[] IMAGE_EXTENSIONS = {
243             "PNG", "PDF",
244             "GIF", "GIFF",
245             "TIF", "TIFF",
246             "JPG", "JPEG",
247             "ICO", "ICNS",
248             "PSD",
249     };
250 
251     private static final String[] BYTECODE_EXTENSIONS = {
252             "CLASS", "PYD",
253             "OBJ", "PYC",
254     };
255 
256     private static final String[] AUDIO_EXTENSIONS = {
257             "AIF", "IFF",
258             "M3U", "M4A",
259             "MID", "MP3",
260             "MPA", "WAV",
261             "WMA"
262     };
263     
264     /**
265      * Based on <a href="https://www.apache.org/dev/svn-eol-style.txt">https://www.apache.org/dev/svn-eol-style.txt</a>
266      */
267     private static final String[] NON_BINARY_EXTENSIONS = {
268             "AART",
269             "AC",
270             "AM",
271             "BAT",
272             "C",
273             "CAT",
274             "CGI",
275             "CLASSPATH",
276             "CMD",
277             "CONFIG",
278             "CPP",
279             "CSS",
280             "CWIKI",
281             "DATA",
282             "DCL",
283             "DTD",
284             "EGRM",
285             "ENT",
286             "FT",
287             "FN",
288             "FV",
289             "GRM",
290             "G",
291             "GO",
292             "H",
293             "HTACCESS",
294             "HTML",
295             "IHTML",
296             "IN",
297             "JAVA",
298             "JMX",
299             "JSP",
300             "JS",
301             "JSON",
302             "JUNIT",
303             "JX",
304             "M4",            
305             "MANIFEST",
306             "MD",
307             "MF",
308             "META",
309             "MOD",
310             "N3",
311             "PEN",
312             "PL",
313             "PM",
314             "POD",
315             "POM",
316             "PROJECT",
317             "PROPERTIES",
318             "PY",
319             "RB",
320             "RDF",
321             "RNC",
322             "RNG",
323             "RNX",
324             "ROLES",
325             "RSS",
326             "SH",
327             "SQL",
328             "SVG",
329             "TLD",
330             "TXT",
331             "TYPES",
332             "VM",
333             "VSL",
334             "WSDD",
335             "WSDL",
336             "XARGS",
337             "XCAT",
338             "XCONF",
339             "XEGRM",
340             "XGRM",
341             "XLEX",
342             "XLOG",
343             "XMAP",
344             "XML",
345             "XROLES",
346             "XSAMPLES",
347             "XSD",
348             "XSL",
349             "XSLT",
350             "XSP",
351             "XUL",
352             "XWEB",
353             "XWELCOME",
354     };
355     public static final String JAR_MANIFEST = "MANIFEST.MF";
356     public static final String JAVA = "JAVA";
357     public static final int HIGH_BYTES_RATIO = 100;
358     public static final int TOTAL_READ_RATIO = 30;
359     public static final int NON_ASCII_THRESHOLD = 256;
360     public static final int ASCII_CHAR_THRESHOLD = 8;
361 
362     public static boolean isBinary(final Document document) {
363         // TODO: reimplement the binary test algorithm?
364         // TODO: more efficient to move into standard analysis
365         // TODO: then use binary as default
366         return isBinary(document.getName())
367                 ||
368                 // try a taste
369                 isBinaryDocument(document);
370     }
371 
372 }