View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.api;
20  
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.Reader;
24  import java.util.SortedSet;
25  
26  import org.apache.rat.analysis.TikaProcessor;
27  import org.apache.rat.document.DocumentName;
28  import org.apache.rat.document.DocumentNameMatcher;
29  import org.apache.tika.parser.txt.CharsetDetector;
30  
31  /**
32   * The representation of a document being scanned.
33   */
34  public abstract class Document implements Comparable<Document> {
35  
36      /**
37       * An enumeration of document types.
38       */
39      public enum Type {
40          /** An archive type document. */
41          ARCHIVE,
42          /** A binary file. */
43          BINARY,
44          /** A generated document that is ignored. */
45          IGNORED,
46          /** A notice document (e.g. LICENSE file). */
47          NOTICE,
48          /** A standard document. */
49          STANDARD,
50          /** An unknown document type. */
51          UNKNOWN
52      }
53  
54      /** The path matcher used by this document */
55      protected final DocumentNameMatcher nameMatcher;
56      /** The metadata for this document */
57      private final MetaData metaData;
58      /** The fully qualified name of this document */
59      protected final DocumentName name;
60  
61      /**
62       * Creates an instance.
63       * @param name the native NameSet of the resource.
64       * @param nameMatcher the document name matcher to filter directories/files.
65       */
66      protected Document(final DocumentName name, final DocumentNameMatcher nameMatcher) {
67          this.name = name;
68          this.nameMatcher = nameMatcher;
69          this.metaData = new MetaData();
70      }
71  
72      /**
73       * Gets the name of the current document.
74       * @return the name of the current document.
75       */
76      public final DocumentName getName() {
77          return name;
78      }
79  
80      /**
81       * Gets the file filter this document was created with.
82       * @return the file filter this document was created with.
83       */
84      public final DocumentNameMatcher getNameMatcher() {
85          return nameMatcher;
86      }
87  
88      @Override
89      public int compareTo(final Document doc) {
90          return name.compareTo(doc.name);
91      }
92  
93      @Override
94      public int hashCode() {
95          return name.hashCode();
96      }
97  
98      @Override
99      public boolean equals(final Object obj) {
100         if (!(obj instanceof Document)) {
101             return false;
102         }
103         return name.equals(((Document) obj).name);
104     }
105 
106     /**
107      * Reads the contents of this document.
108      * @return <code>Reader</code> not null
109      * @throws IOException if this document cannot be read.
110      */
111     public Reader reader() throws IOException {
112         // RAT-494: Tika's CharsetDetector.getReader() may return null if the read can not be constructed due to I/O or encoding errors
113         Reader result = new CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name());
114         if (result == null) {
115             throw new IOException(String.format("Can not read document `%s`", getName()));
116         }
117         return result;
118     }
119 
120     /**
121      * Streams the document's contents.
122      * @return a non-null input stream of the document.
123      * @throws IOException when stream could not be opened.
124      */
125     public abstract InputStream inputStream() throws IOException;
126 
127     /**
128      * Gets data describing this resource.
129      * @return a non-null MetaData object.
130      */
131     public final MetaData getMetaData() {
132         return metaData;
133     }
134 
135     /**
136      * Checks if document is ignored or not.
137      * @return {@code true} if the document is of type {@code IGNORED}.
138      */
139     public final boolean isIgnored() {
140         return Type.IGNORED == metaData.getDocumentType();
141     }
142 
143     /**
144      * Representations suitable for logging.
145      * @return a <code>String</code> representation
146      * of this object.
147      */
148     @Override
149     public String toString() {
150         return String.format("%s( name = %s metaData = %s )", this.getClass().getSimpleName(), getName().localized(), getMetaData());
151     }
152 
153     /**
154      * Determines if this document is a directory type.
155      * @return {@code true} if this is a directory.
156      */
157     public abstract boolean isDirectory();
158 
159     /**
160      * Gets a sorted set of documents that are children of this document.
161      * @return A sorted set of child Documents. May be empty.
162      */
163     public abstract SortedSet<Document> listChildren();
164 }