View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one   *
3    * or more contributor license agreements.  See the NOTICE file *
4    * distributed with this work for additional information        *
5    * regarding copyright ownership.  The ASF licenses this file   *
6    * to you under the Apache License, Version 2.0 (the            *
7    * "License"); you may not use this file except in compliance   *
8    * with the License.  You may obtain a copy of the License at   *
9    *                                                              *
10   *   http://www.apache.org/licenses/LICENSE-2.0                 *
11   *                                                              *
12   * Unless required by applicable law or agreed to in writing,   *
13   * software distributed under the License is distributed on an  *
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
15   * KIND, either express or implied.  See the License for the    *
16   * specific language governing permissions and limitations      *
17   * under the License.                                           *
18   */
19  package org.apache.rat.analysis.matchers;
20  
21  import java.util.Arrays;
22  import java.util.regex.Matcher;
23  import java.util.regex.Pattern;
24  import java.util.stream.Collectors;
25  
26  import org.apache.commons.lang3.StringUtils;
27  import org.apache.rat.ConfigurationException;
28  import org.apache.rat.analysis.IHeaders;
29  import org.apache.rat.config.parameters.ComponentType;
30  import org.apache.rat.config.parameters.ConfigComponent;
31  
32  /**
33   * Matches a typical Copyright header line only based on a regex pattern which
34   * allows for one (starting) year or year range, and a configurable copyright
35   * owner.<br>
36   * <br>
37   * The matching is done case-insensitive<br>
38   * <br>
39   * Example supported Copyright header lines, using copyright owner
40   * &quot;FooBar&quot;
41   * <ul>
42   * <li>Copyright 2010 FooBar.</li>
43   * <li>Copyright 2010-2012 FooBar.</li>
44   * <li>copyright 2012 foobar</li>
45   * </ul>
46   * <p>
47   * Note also that the copyright owner is appended to the regex pattern and so
48   * can support additional regex but also requires escaping where needed,<br>
49   * e.g. use &quot;FooBar \\(www\\.foobar\\.com\\)&quot; or &quot;FooBar
50   * \\Q(www.foobar.com)\\E&quot; to match &quot;FooBar (www.foobar.com)&quot;
51   * </p>
52   * <p>
53   * The matcher also accepts "(C)", "(c)", and "©" in place of (or in addition
54   * to) the "Copyright" or "copyright" keyword
55   * </p>
56   */
57  @ConfigComponent(type = ComponentType.MATCHER, name = "copyright",
58          desc = "A matcher that matches Copyright text. " +
59                  "Uses regular expressions and so should only be used when looking for copyrights with specific " +
60                  "patterns that are not caught by a standard text matcher. This matcher will match \"(C)\", \"copyright\", " +
61                  "or \"©\". (text is not case sensitive). It will also match things like Copyright (c) joe 1995 as well " +
62                  "as Copyright (C) 1995 joe and Copyright (C) joe 1995.")
63  public class CopyrightMatcher extends AbstractHeaderMatcher {
64      /** String to build a pattern to match the various recognized copyright symbols */
65      private static final String COPYRIGHT_SYMBOL_DEFN = "\\([Cc]\\)|©|\\&[Cc][Oo][Pp][Yy]\\;";
66      /** String to build a pattern to match symbols or word "copyright" */
67      private static final String COPYRIGHT_PATTERN_DEFN = "(\\b)?" + COPYRIGHT_SYMBOL_DEFN + "|Copyright\\b";
68      /** The compiled pattern from {@link #COPYRIGHT_PATTERN_DEFN} */
69      private static final Pattern COPYRIGHT_PATTERN = Pattern.compile(COPYRIGHT_PATTERN_DEFN);
70      /** The string to build a Pattern to match a copyright with a single part (date or name) */
71      private static final String ONE_PART = "\\W+((" + COPYRIGHT_SYMBOL_DEFN + ")\\W+)?%s";
72      /** The string to build a Pattern to match a copyright iwth both name and date */
73      private static final String TWO_PART = "\\W+((" + COPYRIGHT_SYMBOL_DEFN + ")\\W+)?%s,?\\W+%s";
74      /** Format string to build a pattern to match two dates */
75      private static final String DOUBLE_DATE_FMT = "%s\\W*-\\W*%s";
76      /** String to build a pattern to match an arbitrary date (year) */
77      private static final String ARBITRARY_DATE = "[0-9]{4}";
78      /** The built Pattern for matching "Copyright date owner" */
79      private final Pattern dateOwnerPattern;
80      /** The built pattern for matching "Copyright owner date" */
81      private final Pattern ownerDatePattern;
82      /** The start date of the copyright. May be null. */
83      @ConfigComponent(type = ComponentType.PARAMETER, desc = "The initial year of the copyright if any")
84      private final String start;
85      /** The end date of the copyright. May be null. */
86      @ConfigComponent(type = ComponentType.PARAMETER, desc = "The last year the copyright. Only valid with 'start'")
87      private final String end;
88      /** The owner of the copyright. May be null */
89      @ConfigComponent(type = ComponentType.PARAMETER, desc = "The owner of the copyright")
90      private final String owner;
91  
92      /**
93       * Constructs the CopyrightMatcher with the specified start, stop and owner
94       * strings and a unique random id.
95       *
96       * @param start the start date for the copyright, may be null.
97       * @param end the stop date for the copyright, may be null. May not be
98       * specified if start is not specified.
99       * @param owner the owner of the copyright, may be null.
100      */
101     public CopyrightMatcher(final String start, final String end, final String owner) {
102         this(null, start, end, owner);
103     }
104 
105     private static void assertNumber(final String label, final String value) {
106         try {
107             if (StringUtils.isNotEmpty(value)) {
108                 Integer.parseInt(value);
109             }
110         } catch (NumberFormatException e) {
111             throw new ConfigurationException(String.format("'%s' must be numeric (value provided: '%s')", label, value));
112         }
113     }
114 
115     /**
116      * Converts the owner string into a set of tokens that may be separated by multiple whitespaces and comment characters.
117      * Any non word character may appear between the tokens. Tokens are escaped so that any text is not interpreted as
118      * regex characters.
119      * @param owner the owner string.
120      * @return the regex string to match the owner string.
121      */
122     private String parseOwner(final String owner) {
123         return Arrays.stream(owner.split("\\s+")).map(s -> "\\Q" + s + "\\E").collect(Collectors.joining("\\W+"));
124     }
125 
126     /**
127      * Constructs the CopyrightMatcher with the specified start, stop and owner
128      * strings.
129      *
130      * @param id the id for the matcher.
131      * @param start the start date for the copyright, may be null.
132      * @param end the end date for the copyright, may be null. May not be
133      * specified if start is not specified.
134      * @param owner the owner of the copyright. May be null.
135      */
136     public CopyrightMatcher(final String id, final String start, final String end, final String owner) {
137         super(id);
138         if (StringUtils.isBlank(start) && !StringUtils.isBlank(end)) {
139             throw new ConfigurationException("'end' may not be set if 'start' is not set.");
140         }
141         assertNumber("start", start);
142         assertNumber("end", end);
143         this.start = start;
144         this.end = end;
145         this.owner = owner;
146         String dateDefn = "";
147         if (StringUtils.isNotEmpty(start)) {
148             if (StringUtils.isNotEmpty(end)) {
149                 dateDefn = String.format(DOUBLE_DATE_FMT, this.start, this.end);
150             } else {
151                 dateDefn = this.start;
152             }
153         }
154         if (StringUtils.isEmpty(owner)) {
155             // no owner
156             if (StringUtils.isEmpty(dateDefn)) {
157                 dateDefn = ARBITRARY_DATE;
158             }
159             dateOwnerPattern = Pattern.compile(String.format(ONE_PART, dateDefn));
160             ownerDatePattern = null;
161         } else {
162             String formattedOwner = parseOwner(owner);
163             if (StringUtils.isEmpty(dateDefn)) {
164                 dateDefn = String.format(DOUBLE_DATE_FMT, "(((" + ARBITRARY_DATE, ")?" + ARBITRARY_DATE + "))?");
165                 dateOwnerPattern = Pattern.compile(String.format(TWO_PART, dateDefn, formattedOwner));
166                 ownerDatePattern = Pattern.compile(String.format(ONE_PART, formattedOwner));
167             } else {
168                 dateOwnerPattern = Pattern.compile(String.format(TWO_PART, dateDefn, formattedOwner));
169                 ownerDatePattern = Pattern.compile(String.format(TWO_PART, formattedOwner, dateDefn));
170             }
171         }
172     }
173 
174     /**
175      * Gets the start date of the copyright.
176      * @return the start date of the copyright or {@code null} if not set
177      */
178     public String getStart() {
179         return start;
180     }
181 
182     /**
183      * Gets the end date of the copyright.
184      * @return the end date of the copyright or {@code null} if not set
185      */
186     public String getEnd() {
187         return end;
188     }
189 
190     /**
191      * Gets the owner of the copyright.
192      * @return the owner of the copyright or {@code null} if not set
193      */
194     public String getOwner() {
195         return owner;
196     }
197 
198     @Override
199     public boolean matches(final IHeaders headers) {
200         String lowerLine = headers.raw().toLowerCase();
201         if (lowerLine.contains("copyright") || lowerLine.contains("(c)") || lowerLine.contains("©") ||
202                 lowerLine.contains("&copy;")) {
203             Matcher matcher = COPYRIGHT_PATTERN.matcher(headers.raw());
204             if (matcher.find()) {
205                 String buffer = headers.raw().substring(matcher.end());
206                 matcher = dateOwnerPattern.matcher(buffer);
207                 if (matcher.find() && matcher.start() == 0) {
208                     return true;
209                 }
210                 if (ownerDatePattern != null) {
211                     matcher = ownerDatePattern.matcher(buffer);
212                     return matcher.find() && matcher.start() == 0;
213                 }
214             }
215         }
216         return false;
217     }
218 }