CopyrightMatcher.java
/*
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
*/
package org.apache.rat.analysis.matchers;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.rat.ConfigurationException;
import org.apache.rat.analysis.IHeaders;
import org.apache.rat.config.parameters.ComponentType;
import org.apache.rat.config.parameters.ConfigComponent;
/**
 * Matches a typical Copyright header line only based on a regex pattern which
 * allows for one (starting) year or year range, and a configurable copyright
 * owner.<br>
 * <br>
 * The matching is done case-insensitive<br>
 * <br>
 * Example supported Copyright header lines, using copyright owner
 * &quot;FooBar&quot;
 * <ul>
 * <li>Copyright 2010 FooBar.</li>
 * <li>Copyright 2010-2012 FooBar.</li>
 * <li>copyright 2012 foobar</li>
 * </ul>
 * <p>
 * Note that the copyright owner is matched literally: it is split on
 * whitespace, every token is quoted, and any run of non-word characters may
 * separate the tokens in the header. Regex meta characters in the owner
 * therefore need no escaping,<br>
 * e.g. use "FooBar (www.foobar.com)" to match "FooBar (www.foobar.com)"
 * </p>
 * <p>
 * The matcher also accepts "(C)", "(c)", "©" and the {@code &copy;} entity in
 * place of (or in addition to) the "Copyright" or "copyright" keyword
 * </p>
 * <p>
 * Every occurrence of the keyword or symbol in the header text is examined;
 * the matcher succeeds as soon as one of them is directly followed by the
 * configured date and/or owner.
 * </p>
 */
@ConfigComponent(type = ComponentType.MATCHER, name = "copyright",
        desc = "A matcher that matches Copyright text. " +
                "Uses regular expressions and so should only be used when looking for copyrights with specific " +
                "patterns that are not caught by a standard text matcher. This matcher will match \"(C)\", \"copyright\", " +
                "or \"©\". (text is not case sensitive). It will also match things like Copyright (c) joe 1995 as well " +
                "as Copyright (C) 1995 joe and Copyright (C) joe 1995.")
public class CopyrightMatcher extends AbstractHeaderMatcher {
    /** String to build a pattern to match the various recognized copyright symbols */
    private static final String COPYRIGHT_SYMBOL_DEFN = "\\([Cc]\\)|©|\\&[Cc][Oo][Pp][Yy]\\;";
    /** String to build a pattern to match symbols or word "copyright" */
    private static final String COPYRIGHT_PATTERN_DEFN = "(\\b)?" + COPYRIGHT_SYMBOL_DEFN + "|Copyright\\b";
    /** Flags applied to every pattern so that matching is case-insensitive, as documented on the class. */
    private static final int MATCH_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
    /** The compiled pattern from {@link #COPYRIGHT_PATTERN_DEFN} */
    private static final Pattern COPYRIGHT_PATTERN = Pattern.compile(COPYRIGHT_PATTERN_DEFN, MATCH_FLAGS);
    /** The string to build a Pattern to match a copyright with a single part (date or name) */
    private static final String ONE_PART = "\\W+((" + COPYRIGHT_SYMBOL_DEFN + ")\\W+)?%s";
    /** The string to build a Pattern to match a copyright with both name and date */
    private static final String TWO_PART = "\\W+((" + COPYRIGHT_SYMBOL_DEFN + ")\\W+)?%s,?\\W+%s";
    /** Format string to build a pattern to match two dates */
    private static final String DOUBLE_DATE_FMT = "%s\\W*-\\W*%s";
    /** String to build a pattern to match an arbitrary date (year) */
    private static final String ARBITRARY_DATE = "[0-9]{4}";
    /** String to build a pattern to match an optional arbitrary year or year range */
    private static final String OPTIONAL_DATE =
            "(?:(?:" + ARBITRARY_DATE + "\\W*-\\W*)?" + ARBITRARY_DATE + ")?";

    /** The built Pattern for matching "Copyright date owner" */
    private final Pattern dateOwnerPattern;
    /** The built pattern for matching "Copyright owner date". {@code null} when no owner is configured. */
    private final Pattern ownerDatePattern;
    /** The start date of the copyright. May be null. */
    @ConfigComponent(type = ComponentType.PARAMETER, desc = "The initial year of the copyright if any")
    private final String start;
    /** The end date of the copyright. May be null. */
    @ConfigComponent(type = ComponentType.PARAMETER, desc = "The last year the copyright. Only valid with 'start'")
    private final String end;
    /** The owner of the copyright. May be null */
    @ConfigComponent(type = ComponentType.PARAMETER, desc = "The owner of the copyright")
    private final String owner;

    /**
     * Constructs the CopyrightMatcher with the specified start, stop and owner
     * strings and no explicit id ({@code null} is passed on as the id).
     *
     * @param start the start date for the copyright, may be null.
     * @param end the stop date for the copyright, may be null. May not be
     * specified if start is not specified.
     * @param owner the owner of the copyright, may be null.
     * @throws ConfigurationException if {@code end} is set without {@code start}, or if either
     * date is not numeric.
     */
    public CopyrightMatcher(final String start, final String end, final String owner) {
        this(null, start, end, owner);
    }

    /**
     * Constructs the CopyrightMatcher with the specified start, stop and owner
     * strings.
     *
     * @param id the id for the matcher.
     * @param start the start date for the copyright, may be null.
     * @param end the end date for the copyright, may be null. May not be
     * specified if start is not specified.
     * @param owner the owner of the copyright. May be null. An owner consisting only of
     * whitespace is treated as if no owner was given.
     * @throws ConfigurationException if {@code end} is set without {@code start}, or if either
     * date is not numeric.
     */
    public CopyrightMatcher(final String id, final String start, final String end, final String owner) {
        super(id);
        if (StringUtils.isBlank(start) && !StringUtils.isBlank(end)) {
            throw new ConfigurationException("'end' may not be set if 'start' is not set.");
        }
        assertNumber("start", start);
        assertNumber("end", end);
        this.start = start;
        this.end = end;
        this.owner = owner;

        final String explicitDate = buildDateDefn(start, end);
        if (StringUtils.isBlank(owner)) {
            // No owner: a date is mandatory, any four digit year if none was configured.
            dateOwnerPattern = compile(ONE_PART, explicitDate.isEmpty() ? ARBITRARY_DATE : explicitDate);
            ownerDatePattern = null;
        } else {
            final String formattedOwner = parseOwner(owner);
            if (explicitDate.isEmpty()) {
                // No date configured: accept "[year[-year]] owner" as well as a bare "owner".
                dateOwnerPattern = compile(TWO_PART, OPTIONAL_DATE, formattedOwner);
                ownerDatePattern = compile(ONE_PART, formattedOwner);
            } else {
                // Both configured: accept them in either order.
                dateOwnerPattern = compile(TWO_PART, explicitDate, formattedOwner);
                ownerDatePattern = compile(TWO_PART, formattedOwner, explicitDate);
            }
        }
    }

    /**
     * Verifies that a non-empty configuration value can be parsed as an integer.
     * {@code null} and empty values are accepted without checking.
     *
     * @param label the parameter name used in the error message.
     * @param value the value to check, may be null.
     * @throws ConfigurationException if the value is not empty and is not numeric.
     */
    private static void assertNumber(final String label, final String value) {
        try {
            if (StringUtils.isNotEmpty(value)) {
                Integer.parseInt(value);
            }
        } catch (NumberFormatException e) {
            throw new ConfigurationException(String.format("'%s' must be numeric (value provided: '%s')", label, value));
        }
    }

    /**
     * Builds the regex fragment for the configured date.
     *
     * @param start the configured start year, may be null or empty.
     * @param end the configured end year, may be null or empty.
     * @return an empty string if no start year is configured, the start year if only it is
     * configured, otherwise a fragment matching "start - end".
     */
    private static String buildDateDefn(final String start, final String end) {
        if (StringUtils.isEmpty(start)) {
            return "";
        }
        return StringUtils.isEmpty(end) ? start : String.format(DOUBLE_DATE_FMT, start, end);
    }

    /**
     * Converts the owner string into a set of tokens that may be separated by multiple whitespaces and comment characters.
     * Any non word character may appear between the tokens. Tokens are escaped so that any text is not interpreted as
     * regex characters. Leading and trailing whitespace of the owner is ignored.
     * @param owner the owner string, must not be null.
     * @return the regex string to match the owner string.
     */
    private static String parseOwner(final String owner) {
        // Pattern.quote also copes with tokens that contain "\E", which a hand written \Q..\E wrapper does not.
        return Arrays.stream(owner.trim().split("\\s+"))
                .map(Pattern::quote)
                .collect(Collectors.joining("\\W+"));
    }

    /**
     * Fills a pattern template and compiles it with the case-insensitive flags.
     *
     * @param template one of the {@code %s} based format strings of this class.
     * @param parts the regex fragments to insert into the template.
     * @return the compiled pattern.
     */
    private static Pattern compile(final String template, final Object... parts) {
        return Pattern.compile(String.format(template, parts), MATCH_FLAGS);
    }

    /**
     * Checks whether the text starting at the given offset begins with a match of the pattern.
     *
     * @param pattern the pattern to apply, may be null.
     * @param text the complete text.
     * @param from the offset at which the match has to start.
     * @return {@code true} if the pattern is not null and matches at {@code from}.
     */
    private static boolean continuesWith(final Pattern pattern, final String text, final int from) {
        return pattern != null && pattern.matcher(text).region(from, text.length()).lookingAt();
    }

    /**
     * Gets the start date of the copyright.
     * @return the start date of the copyright or {@code null} if not set
     */
    public String getStart() {
        return start;
    }

    /**
     * Gets the end date of the copyright.
     * @return the end date of the copyright or {@code null} if not set
     */
    public String getEnd() {
        return end;
    }

    /**
     * Gets the owner of the copyright.
     * @return the owner of the copyright or {@code null} if not set
     */
    public String getOwner() {
        return owner;
    }

    /**
     * Checks whether the raw header text contains a copyright keyword or symbol that is
     * directly followed by the configured date and/or owner.
     *
     * @param headers the headers to check; only the raw text is inspected.
     * @return {@code true} if any keyword or symbol occurrence is followed by a matching
     * "date owner" or "owner date" sequence.
     */
    @Override
    public boolean matches(final IHeaders headers) {
        final String text = headers.raw();
        final Matcher keyword = COPYRIGHT_PATTERN.matcher(text);
        // Try every occurrence: the first keyword in a header may belong to a different
        // copyright line than the one this matcher is configured for.
        while (keyword.find()) {
            final int tail = keyword.end();
            if (continuesWith(dateOwnerPattern, text, tail) || continuesWith(ownerDatePattern, text, tail)) {
                return true;
            }
        }
        return false;
    }
}