1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.creadur.tentacles;
18
19 import org.apache.http.Header;
20 import org.apache.http.HttpHeaders;
21 import org.apache.http.client.methods.CloseableHttpResponse;
22 import org.apache.http.client.methods.HttpGet;
23 import org.apache.http.client.methods.HttpHead;
24 import org.apache.http.client.methods.HttpUriRequest;
25 import org.apache.http.impl.client.CloseableHttpClient;
26 import org.apache.http.impl.client.HttpClientBuilder;
27 import org.apache.logging.log4j.*;
28 import org.codehaus.swizzle.stream.StreamLexer;
29
30 import java.io.File;
31 import java.io.IOException;
32 import java.io.InputStream;
33 import java.net.URI;
34 import java.util.LinkedHashSet;
35 import java.util.Set;
36
37 public class NexusClient {
38
39 private static final Logger log = LogManager.getLogger(NexusClient.class);
40 private static final String SLASH = "/";
41 private static final String ONE_UP = "../";
42 private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
43
44 private final CloseableHttpClient client;
45 private final FileSystem fileSystem;
46 private final IOSystem ioSystem;
47 private final int retries;
48
49 public NexusClient(final Platform platform) {
50
51 System.setProperty("http.keepAlive", "false");
52 System.setProperty("http.maxConnections", "50");
53
54 this.retries = Integer.parseInt(System.getProperty("NexusClient.retries", "5"));
55
56 this.client = HttpClientBuilder.create().disableContentCompression()
57 .build();
58 this.fileSystem = platform.getFileSystem();
59 this.ioSystem = platform.getIoSystem();
60 }
61
62 public File download(final URI uri, final File file) throws IOException {
63 if (file.exists()) {
64
65 final long length = getContentLength(uri);
66
67 if (file.length() == length) {
68 log.info("Exists " + uri);
69 return file;
70 } else {
71 log.info("Incomplete " + uri);
72 }
73 }
74
75 log.info("Download " + uri);
76
77 final CloseableHttpResponse response = get(uri);
78
79 InputStream content = null;
80 try {
81 content = response.getEntity().getContent();
82
83 this.fileSystem.mkparent(file);
84
85 this.ioSystem.copy(content, file);
86 } finally {
87 if (content != null) {
88 content.close();
89 }
90
91 response.close();
92 }
93
94 return file;
95 }
96
97 private Long getContentLength(final URI uri) throws IOException {
98 final CloseableHttpResponse head = head(uri);
99 final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
100
101 if (headers != null && headers.length >= 1) {
102 return Long.valueOf(headers[0].getValue());
103 }
104
105 head.close();
106
107 return (long) -1;
108 }
109
110 private CloseableHttpResponse get(final URI uri) throws IOException {
111 return get(new HttpGet(uri), this.retries);
112 }
113
114 private CloseableHttpResponse head(final URI uri) throws IOException {
115 return get(new HttpHead(uri), this.retries);
116 }
117
118 private CloseableHttpResponse get(final HttpUriRequest request, int tries) throws IOException {
119 try {
120 request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
121 return this.client.execute(request);
122 } catch (final IOException e) {
123 if (tries > 0) {
124 try {
125 Thread.sleep(250);
126 } catch (final InterruptedException ie) {
127 Thread.interrupted();
128 throw new IOException("Interrupted", ie);
129 }
130 return get(request, tries--);
131 } else {
132 throw e;
133 }
134 }
135 }
136
137 public Set<URI> crawl(final URI index) throws IOException {
138 log.info("Crawl " + index);
139 final Set<URI> resources = new LinkedHashSet<URI>();
140
141 final CloseableHttpResponse response = get(index);
142
143 final InputStream content = response.getEntity().getContent();
144 final StreamLexer lexer = new StreamLexer(content);
145
146 final Set<URI> crawl = new LinkedHashSet<URI>();
147
148
149
150 while (lexer.readAndMark("<a ", "/a>")) {
151
152 try {
153 final String link = lexer.peek("href=\"", "\"");
154 final String name = lexer.peek(">", "<");
155
156 final URI uri = index.resolve(link);
157
158 if (name.equals(ONE_UP)) {
159 continue;
160 }
161 if (link.equals(ONE_UP)) {
162 continue;
163 }
164
165 if (name.endsWith(SLASH)) {
166 crawl.add(uri);
167 continue;
168 }
169
170 resources.add(uri);
171
172 } finally {
173 lexer.unmark();
174 }
175 }
176
177 content.close();
178 response.close();
179
180 for (final URI uri : crawl) {
181 resources.addAll(crawl(uri));
182 }
183
184 return resources;
185 }
186 }