View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.document.impl.guesser;
20  
21  import org.apache.commons.io.IOUtils;
22  import org.apache.rat.api.Document;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.nio.ByteBuffer;
28  import java.nio.CharBuffer;
29  import java.nio.charset.Charset;
30  import java.nio.charset.CharsetDecoder;
31  import java.nio.charset.CoderResult;
32  import java.nio.charset.CodingErrorAction;
33  import java.nio.charset.UnsupportedCharsetException;
34  import java.util.Locale;
35  
36  /**
37   * TODO: factor into MIME guesser and MIME->binary guesser
38   */
39  public class BinaryGuesser {
40  
41      private static final String DOT = ".";
42  
43      static final String FILE_ENCODING = "file.encoding";
44      private static Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();
45  
46      private static boolean isBinaryDocument(Document document) {
47          boolean result = false;
48          InputStream stream = null;
49          try {
50              stream = document.inputStream();
51              result = isBinary(stream);
52          } catch (IOException e) {
53              result = false;
54          } finally {
55              IOUtils.closeQuietly(stream);
56          }
57          return result;
58      }
59  
60      private static boolean isBinary(CharSequence taste) {
61          int highBytes = 0;
62          final int length = taste.length();
63          for (int i = 0; i < length; i++) {
64              char c = taste.charAt(i);
65              if (c > BinaryGuesser.NON_ASCII_THREASHOLD
66                      || c <= BinaryGuesser.ASCII_CHAR_THREASHOLD) {
67                  highBytes++;
68              }
69          }
70          return highBytes * BinaryGuesser.HIGH_BYTES_RATIO
71                  > length * BinaryGuesser.TOTAL_READ_RATIO;
72      }
73  
74      /**
75       * @param in the file to check.
76       * @return Do the first few bytes of the stream hint at a binary file?
77       * <p>Any IOException is swallowed internally and the test returns
78       * false.</p>
79       * <p>This method may lead to false negatives if the reader throws
80       * an exception because it can't read characters according to the
81       * reader's encoding from the underlying stream.</p>
82       */
83      public static boolean isBinary(Reader in) {
84          char[] taste = new char[100];
85          try {
86              int bytesRead = in.read(taste);
87              if (bytesRead > 0) {
88                  return isBinary(new String(taste, 0, bytesRead));
89              }
90          } catch (IOException e) {
91              // SWALLOW 
92          }
93          return false;
94      }
95  
96      /**
97       * @param in the file to check.
98       * @return Do the first few bytes of the stream hint at a binary file?
99       * <p>Any IOException is swallowed internally and the test returns
100      * false.</p>
101      * <p>This method will try to read bytes from the stream and
102      * translate them to characters according to the platform's
103      * default encoding.  If any bytes can not be translated to
104      * characters it will assume the original data must be binary and
105      * return true.</p>
106      */
107     public static boolean isBinary(InputStream in) {
108         try {
109             byte[] taste = new byte[200];
110             int bytesRead = in.read(taste);
111             if (bytesRead > 0) {
112                 ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
113                 CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
114                 CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
115                         .onMalformedInput(CodingErrorAction.REPORT)
116                         .onUnmappableCharacter(CodingErrorAction.REPORT);
117                 while (bytes.remaining() > 0) {
118                     CoderResult res = cd.decode(bytes, chars, true);
119                     if (res.isMalformed() || res.isUnmappable()) {
120                         return true;
121                     } else if (res.isOverflow()) {
122                         chars.limit(chars.position());
123                         chars.rewind();
124                         int c = chars.capacity() * 2;
125                         CharBuffer on = CharBuffer.allocate(c);
126                         on.put(chars);
127                         chars = on;
128                     }
129                 }
130                 chars.limit(chars.position());
131                 chars.rewind();
132                 return isBinary(chars);
133             }
134         } catch (IOException e) {
135             // SWALLOW 
136         }
137         return false;
138     }
139 
140     static Charset getFileEncodingOrUTF8AsFallback() {
141         try {
142             return Charset.forName(System.getProperty(FILE_ENCODING));
143         } catch (UnsupportedCharsetException e) {
144             return Charset.forName("UTF-8");
145         }
146     }
147 
148     /**
149      * @param name current file name.
150      * @return whether given name is binary.
151      */
152     public static final boolean isBinaryData(final String name) {
153         return extensionMatches(name, DATA_EXTENSIONS);
154     }
155 
156     /**
157      * @param name current file name.
158      * @return Is a file by that name a known non-binary file?
159      */
160     public static final boolean isNonBinary(final String name) {
161         if (name == null) {
162             return false;
163         }
164         return extensionMatches(name.toUpperCase(Locale.US),
165                 BinaryGuesser.NON_BINARY_EXTENSIONS);
166     }
167 
168     /**
169      * @param name current file name.
170      * @return Is a file by that name an executable/binary file?
171      */
172     public static final boolean isExecutable(final String name) {
173         return name.equals(BinaryGuesser.JAVA) || extensionMatches(name, EXE_EXTENSIONS)
174                 || containsExtension(name, EXE_EXTENSIONS);
175     }
176 
177     public static boolean containsExtension(final String name,
178                                             final String[] exts) {
179         for (int i = 0; i < exts.length; i++) {
180             if (name.contains(DOT + exts[i] + DOT)) {
181                 return true;
182             }
183         }
184         return false;
185     }
186 
187     public static boolean extensionMatches(final String name,
188                                            final String[] exts) {
189         for (int i = 0; i < exts.length; i++) {
190             if (name.endsWith(DOT + exts[i])) {
191                 return true;
192             }
193         }
194         return false;
195     }
196 
197     public static boolean isBytecode(final String name) {
198         return BinaryGuesser.extensionMatches(name, BYTECODE_EXTENSIONS);
199     }
200 
201     public static final boolean isImage(final String name) {
202         return BinaryGuesser.extensionMatches(name, IMAGE_EXTENSIONS);
203     }
204 
205     public static final boolean isKeystore(final String name) {
206         return BinaryGuesser.extensionMatches(name, KEYSTORE_EXTENSIONS);
207     }
208 
209     /**
210      * @param name file name.
211      * @return Is a file by that name a known binary file?
212      */
213     public static final boolean isBinary(final String name) {
214         if (name == null) {
215             return false;
216         }
217         String normalisedName = GuessUtils.normalise(name);
218         return BinaryGuesser.JAR_MANIFEST.equalsIgnoreCase(name) || BinaryGuesser.isImage(normalisedName)
219                 || BinaryGuesser.isKeystore(normalisedName) || BinaryGuesser.isBytecode(normalisedName)
220                 || BinaryGuesser.isBinaryData(normalisedName) || BinaryGuesser.isExecutable(normalisedName);
221     }
222 
223     private static final String[] DATA_EXTENSIONS = {
224             "DAT", "DOC",
225             "NCB", "IDB",
226             "SUO", "XCF",
227             "RAJ", "CERT",
228             "KS", "TS",
229             "ODP", "SWF",
230             // fonts
231             "WOFF2", "WOFF", "TTF", "EOT"
232     };
233 
234     private static final String[] EXE_EXTENSIONS = {
235             "EXE", "DLL",
236             "LIB", "SO",
237             "A", "EXP",
238     };
239 
240     private static final String[] KEYSTORE_EXTENSIONS = {
241             "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
242     };
243 
244     private static final String[] IMAGE_EXTENSIONS = {
245             "PNG", "PDF",
246             "GIF", "GIFF",
247             "TIF", "TIFF",
248             "JPG", "JPEG",
249             "ICO", "ICNS",
250             "PSD",
251     };
252 
253     private static final String[] BYTECODE_EXTENSIONS = {
254             "CLASS", "PYD",
255             "OBJ", "PYC",
256     };
257 
258     /**
259      * Based on http://www.apache.org/dev/svn-eol-style.txt
260      */
261     private static final String[] NON_BINARY_EXTENSIONS = {
262             "AART",
263             "AC",
264             "AM",
265             "BAT",
266             "C",
267             "CAT",
268             "CGI",
269             "CLASSPATH",
270             "CMD",
271             "CONFIG",
272             "CPP",
273             "CSS",
274             "CWIKI",
275             "DATA",
276             "DCL",
277             "DTD",
278             "EGRM",
279             "ENT",
280             "FT",
281             "FN",
282             "FV",
283             "GRM",
284             "G",
285             "H",
286             "HTACCESS",
287             "HTML",
288             "IHTML",
289             "IN",
290             "JAVA",
291             "JMX",
292             "JSP",
293             "JS",
294             "JUNIT",
295             "JX",
296             "MANIFEST",
297             "M4",
298             "MF",
299             "MF",
300             "META",
301             "MOD",
302             "N3",
303             "PEN",
304             "PL",
305             "PM",
306             "POD",
307             "POM",
308             "PROJECT",
309             "PROPERTIES",
310             "PY",
311             "RB",
312             "RDF",
313             "RNC",
314             "RNG",
315             "RNX",
316             "ROLES",
317             "RSS",
318             "SH",
319             "SQL",
320             "SVG",
321             "TLD",
322             "TXT",
323             "TYPES",
324             "VM",
325             "VSL",
326             "WSDD",
327             "WSDL",
328             "XARGS",
329             "XCAT",
330             "XCONF",
331             "XEGRM",
332             "XGRM",
333             "XLEX",
334             "XLOG",
335             "XMAP",
336             "XML",
337             "XROLES",
338             "XSAMPLES",
339             "XSD",
340             "XSL",
341             "XSLT",
342             "XSP",
343             "XUL",
344             "XWEB",
345             "XWELCOME",
346     };
347     public static final String JAR_MANIFEST = "MANIFEST.MF";
348     public static final String JAVA = "JAVA";
349     public static final int HIGH_BYTES_RATIO = 100;
350     public static final int TOTAL_READ_RATIO = 30;
351     public static final int NON_ASCII_THREASHOLD = 256;
352     public static final int ASCII_CHAR_THREASHOLD = 8;
353 
354     public static final boolean isBinary(final Document document) {
355         // TODO: reimplement the binary test algorithm?
356         // TODO: more efficient to move into standard analysis
357         // TODO: then use binary as default
358         return isBinary(document.getName())
359                 ||
360                 // try a taste
361                 isBinaryDocument(document);
362     }
363 
364 
365 }