1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.creadur.tentacles;
18
19 import org.apache.http.Header;
20 import org.apache.http.HttpHeaders;
21 import org.apache.http.client.methods.CloseableHttpResponse;
22 import org.apache.http.client.methods.HttpGet;
23 import org.apache.http.client.methods.HttpHead;
24 import org.apache.http.client.methods.HttpUriRequest;
25 import org.apache.http.impl.client.CloseableHttpClient;
26 import org.apache.http.impl.client.HttpClientBuilder;
27 import org.apache.logging.log4j.*;
28 import org.codehaus.swizzle.stream.StreamLexer;
29
30 import java.io.File;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.net.URI;
34 import java.util.LinkedHashSet;
35 import java.util.Set;
36
37 public class NexusClient {
38
39 private static final Logger log = LogManager.getLogger(NexusClient.class);
40 private static final String SLASH = "/";
41 private static final String ONE_UP = "../";
42 private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
43
44 private final CloseableHttpClient client;
45 private final FileSystem fileSystem;
46 private final IOSystem ioSystem;
47 private final int retries;
48
49 public NexusClient(final Platform platform) {
50
51 System.setProperty("http.keepAlive", "false");
52 System.setProperty("http.maxConnections", "50");
53
54 this.retries = Integer.parseInt(System.getProperty("NexusClient.retries", "5"));
55
56 this.client = HttpClientBuilder.create().disableContentCompression()
57 .build();
58 this.fileSystem = platform.getFileSystem();
59 this.ioSystem = platform.getIoSystem();
60 }
61
62 public File download(final URI uri, final File file) throws IOException {
63 if (file.exists()) {
64
65 final long length = getContentLength(uri);
66
67 if (file.length() == length) {
68 log.info("Exists {}", uri);
69 return file;
70 } else {
71 log.info("Incomplete {}", uri);
72 }
73 }
74
75 log.info("Download {}", uri);
76
77 try (CloseableHttpResponse response = get(uri); InputStream content = response.getEntity().getContent()) {
78
79 this.fileSystem.mkparent(file);
80
81 this.ioSystem.copy(content, file);
82 }
83
84 return file;
85 }
86
87 private Long getContentLength(final URI uri) throws IOException {
88 final CloseableHttpResponse head = head(uri);
89 final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
90
91 if (headers != null && headers.length >= 1) {
92 return Long.valueOf(headers[0].getValue());
93 }
94
95 head.close();
96
97 return (long) -1;
98 }
99
100 private CloseableHttpResponse get(final URI uri) throws IOException {
101 return get(new HttpGet(uri), this.retries);
102 }
103
104 private CloseableHttpResponse head(final URI uri) throws IOException {
105 return get(new HttpHead(uri), this.retries);
106 }
107
108 private CloseableHttpResponse get(final HttpUriRequest request, int tries) throws IOException {
109 try {
110 request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
111 return this.client.execute(request);
112 } catch (final IOException e) {
113 if (tries > 0) {
114 try {
115 Thread.sleep(250);
116 } catch (final InterruptedException ie) {
117 Thread.interrupted();
118 throw new IOException("Interrupted", ie);
119 }
120 return get(request, tries--);
121 } else {
122 throw e;
123 }
124 }
125 }
126
127 public Set<URI> crawl(final URI index) throws IOException {
128 log.info("Crawl {}", index);
129 final Set<URI> resources = new LinkedHashSet<>();
130
131 final CloseableHttpResponse response = get(index);
132
133 final InputStream content = response.getEntity().getContent();
134 final StreamLexer lexer = new StreamLexer(content);
135
136 final Set<URI> crawl = new LinkedHashSet<>();
137
138
139
140 while (lexer.readAndMark("<a ", "/a>")) {
141
142 try {
143 final String link = lexer.peek("href=\"", "\"");
144 final String name = lexer.peek(">", "<");
145
146 final URI uri = index.resolve(link);
147
148 if (name.equals(ONE_UP)) {
149 continue;
150 }
151 if (link.equals(ONE_UP)) {
152 continue;
153 }
154
155 if (name.endsWith(SLASH)) {
156 crawl.add(uri);
157 continue;
158 }
159
160 resources.add(uri);
161
162 } finally {
163 lexer.unmark();
164 }
165 }
166
167 content.close();
168 response.close();
169
170 for (final URI uri : crawl) {
171 resources.addAll(crawl(uri));
172 }
173
174 return resources;
175 }
176 }