View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  package org.apache.creadur.tentacles;
18  
19  import org.apache.http.Header;
20  import org.apache.http.HttpHeaders;
21  import org.apache.http.client.methods.CloseableHttpResponse;
22  import org.apache.http.client.methods.HttpGet;
23  import org.apache.http.client.methods.HttpHead;
24  import org.apache.http.client.methods.HttpUriRequest;
25  import org.apache.http.impl.client.CloseableHttpClient;
26  import org.apache.http.impl.client.HttpClientBuilder;
27  import org.apache.logging.log4j.*;
28  import org.codehaus.swizzle.stream.StreamLexer;
29  
30  import java.io.File;
31  import java.io.IOException;
32  import java.io.InputStream;
33  import java.net.URI;
34  import java.util.LinkedHashSet;
35  import java.util.Set;
36  
37  public class NexusClient {
38  
39      private static final Logger log = LogManager.getLogger(NexusClient.class);
40      private static final String SLASH = "/";
41      private static final String ONE_UP = "../";
42      private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
43  
44      private final CloseableHttpClient client;
45      private final FileSystem fileSystem;
46      private final IOSystem ioSystem;
47      private final int retries;
48  
49      public NexusClient(final Platform platform) {
50  
51          System.setProperty("http.keepAlive", "false");
52          System.setProperty("http.maxConnections", "50");
53  
54          this.retries = Integer.parseInt(System.getProperty("NexusClient.retries", "5"));
55  
56          this.client = HttpClientBuilder.create().disableContentCompression()
57                  .build();
58          this.fileSystem = platform.getFileSystem();
59          this.ioSystem = platform.getIoSystem();
60      }
61  
62      public File download(final URI uri, final File file) throws IOException {
63          if (file.exists()) {
64  
65              final long length = getContentLength(uri);
66  
67              if (file.length() == length) {
68                  log.info("Exists {}", uri);
69                  return file;
70              } else {
71                  log.info("Incomplete {}", uri);
72              }
73          }
74  
75          log.info("Download {}", uri);
76  
77          try (CloseableHttpResponse response = get(uri); InputStream content = response.getEntity().getContent()) {
78  
79              this.fileSystem.mkparent(file);
80  
81              this.ioSystem.copy(content, file);
82          }
83  
84          return file;
85      }
86  
87      private Long getContentLength(final URI uri) throws IOException {
88          final CloseableHttpResponse head = head(uri);
89          final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
90  
91          if (headers != null && headers.length >= 1) {
92              return Long.valueOf(headers[0].getValue());
93          }
94  
95          head.close();
96  
97          return (long) -1;
98      }
99  
100     private CloseableHttpResponse get(final URI uri) throws IOException {
101         return get(new HttpGet(uri), this.retries);
102     }
103 
104     private CloseableHttpResponse head(final URI uri) throws IOException {
105         return get(new HttpHead(uri), this.retries);
106     }
107 
108     private CloseableHttpResponse get(final HttpUriRequest request, int tries) throws IOException {
109         try {
110             request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
111             return this.client.execute(request);
112         } catch (final IOException e) {
113             if (tries > 0) {
114                 try {
115                     Thread.sleep(250);
116                 } catch (final InterruptedException ie) {
117                     Thread.interrupted();
118                     throw new IOException("Interrupted", ie);
119                 }
120                 return get(request, tries--);
121             } else {
122                 throw e;
123             }
124         }
125     }
126 
127     public Set<URI> crawl(final URI index) throws IOException {
128         log.info("Crawl {}", index);
129         final Set<URI> resources = new LinkedHashSet<>();
130 
131         final CloseableHttpResponse response = get(index);
132 
133         final InputStream content = response.getEntity().getContent();
134         final StreamLexer lexer = new StreamLexer(content);
135 
136         final Set<URI> crawl = new LinkedHashSet<>();
137 
138         // <a
139         // href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
140         while (lexer.readAndMark("<a ", "/a>")) {
141 
142             try {
143                 final String link = lexer.peek("href=\"", "\"");
144                 final String name = lexer.peek(">", "<");
145 
146                 final URI uri = index.resolve(link);
147 
148                 if (name.equals(ONE_UP)) {
149                     continue;
150                 }
151                 if (link.equals(ONE_UP)) {
152                     continue;
153                 }
154 
155                 if (name.endsWith(SLASH)) {
156                     crawl.add(uri);
157                     continue;
158                 }
159 
160                 resources.add(uri);
161 
162             } finally {
163                 lexer.unmark();
164             }
165         }
166 
167         content.close();
168         response.close();
169 
170         for (final URI uri : crawl) {
171             resources.addAll(crawl(uri));
172         }
173 
174         return resources;
175     }
176 }