1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 */ 19 package org.apache.rat.api; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.Reader; 24 import java.util.SortedSet; 25 26 import org.apache.rat.analysis.TikaProcessor; 27 import org.apache.rat.document.DocumentName; 28 import org.apache.rat.document.DocumentNameMatcher; 29 import org.apache.tika.parser.txt.CharsetDetector; 30 31 /** 32 * The representation of a document being scanned. 33 */ 34 public abstract class Document implements Comparable<Document> { 35 36 /** 37 * An enumeration of document types. 38 */ 39 public enum Type { 40 /** An archive type document. */ 41 ARCHIVE, 42 /** A binary file. */ 43 BINARY, 44 /** A generated document that is ignored. */ 45 IGNORED, 46 /** A notice document (e.g. LICENSE file). */ 47 NOTICE, 48 /** A standard document. */ 49 STANDARD, 50 /** An unknown document type. */ 51 UNKNOWN 52 } 53 54 /** The path matcher used by this document */ 55 protected final DocumentNameMatcher nameMatcher; 56 /** The metadata for this document */ 57 private final MetaData metaData; 58 /** The fully qualified name of this document */ 59 protected final DocumentName name; 60 61 /** 62 * Creates an instance. 63 * @param name the native NameSet of the resource. 64 * @param nameMatcher the document name matcher to filter directories/files. 65 */ 66 protected Document(final DocumentName name, final DocumentNameMatcher nameMatcher) { 67 this.name = name; 68 this.nameMatcher = nameMatcher; 69 this.metaData = new MetaData(); 70 } 71 72 /** 73 * Gets the name of the current document. 74 * @return the name of the current document. 75 */ 76 public final DocumentName getName() { 77 return name; 78 } 79 80 /** 81 * Gets the file filter this document was created with. 82 * @return the file filter this document was created with. 83 */ 84 public final DocumentNameMatcher getNameMatcher() { 85 return nameMatcher; 86 } 87 88 @Override 89 public int compareTo(final Document doc) { 90 return name.compareTo(doc.name); 91 } 92 93 @Override 94 public int hashCode() { 95 return name.hashCode(); 96 } 97 98 @Override 99 public boolean equals(final Object obj) { 100 if (!(obj instanceof Document)) { 101 return false; 102 } 103 return name.equals(((Document) obj).name); 104 } 105 106 /** 107 * Reads the contents of this document. 108 * @return <code>Reader</code> not null 109 * @throws IOException if this document cannot be read. 110 */ 111 public Reader reader() throws IOException { 112 // RAT-494: Tika's CharsetDetector.getReader() may return null if the read can not be constructed due to I/O or encoding errors 113 Reader result = new CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name()); 114 if (result == null) { 115 throw new IOException(String.format("Can not read document `%s`", getName())); 116 } 117 return result; 118 } 119 120 /** 121 * Streams the document's contents. 122 * @return a non-null input stream of the document. 123 * @throws IOException when stream could not be opened. 124 */ 125 public abstract InputStream inputStream() throws IOException; 126 127 /** 128 * Gets data describing this resource. 129 * @return a non-null MetaData object. 130 */ 131 public final MetaData getMetaData() { 132 return metaData; 133 } 134 135 /** 136 * Checks if document is ignored or not. 137 * @return {@code true} if the document is of type {@code IGNORED}. 138 */ 139 public final boolean isIgnored() { 140 return Type.IGNORED == metaData.getDocumentType(); 141 } 142 143 /** 144 * Representations suitable for logging. 145 * @return a <code>String</code> representation 146 * of this object. 147 */ 148 @Override 149 public String toString() { 150 return String.format("%s( name = %s metaData = %s )", this.getClass().getSimpleName(), getName().localized(), getMetaData()); 151 } 152 153 /** 154 * Determines if this document is a directory type. 155 * @return {@code true} if this is a directory. 156 */ 157 public abstract boolean isDirectory(); 158 159 /** 160 * Gets a sorted set of documents that are children of this document. 161 * @return A sorted set of child Documents. May be empty. 162 */ 163 public abstract SortedSet<Document> listChildren(); 164 }