Revision 38180
Added by Andrea Mannocci about 9 years ago
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/ftp/FtpFileWalker.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.ftp; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.util.Collection; |
|
5 |
import java.util.concurrent.BlockingQueue; |
|
6 |
|
|
7 |
import org.apache.commons.io.filefilter.FileFilterUtils; |
|
8 |
import org.apache.commons.io.filefilter.HiddenFileFilter; |
|
9 |
import org.apache.commons.io.filefilter.IOFileFilter; |
|
10 |
import org.apache.commons.io.filefilter.SuffixFileFilter; |
|
11 |
import org.apache.commons.logging.Log; |
|
12 |
import org.apache.commons.logging.LogFactory; |
|
13 |
import org.apache.commons.net.ftp.FTPFile; |
|
14 |
import org.apache.commons.net.ftp.FTPFileFilter; |
|
15 |
import org.xml.sax.SAXException; |
|
16 |
|
|
17 |
/** |
|
18 |
* FTPFileWalker runs recursively under a FTP directory structure starting from a given path and for each file reads it's content and puts |
|
19 |
* it in a shared queue. |
|
20 |
* |
|
21 |
* Acts as a producer. |
|
22 |
* |
|
23 |
* @author sandro |
|
24 |
* |
|
25 |
* @param <T> |
|
26 |
* Type of expected content to extract from files NB. Generally strings. |
|
27 |
*/ |
|
28 |
public class FtpFileWalker<T> extends FtpDirectoryWalker<T> { |
|
29 |
|
|
30 |
/** |
|
31 |
* Logger. |
|
32 |
*/ |
|
33 |
private static final Log log = LogFactory.getLog(FtpFileWalker.class); |
|
34 |
|
|
35 |
/** |
|
36 |
* Shared queue. |
|
37 |
*/ |
|
38 |
private final BlockingQueue<T> queue; |
|
39 |
|
|
40 |
public static String IGNORE_PREFIX = "."; |
|
41 |
|
|
42 |
public static String IGNORE_SUFFIX = "~"; |
|
43 |
|
|
44 |
static IOFileFilter fileFilter = FileFilterUtils.notFileFilter(FileFilterUtils.or(new SuffixFileFilter("~"), HiddenFileFilter.HIDDEN)); |
|
45 |
|
|
46 |
/** |
|
47 |
* Instantiates a new fTP file walker. |
|
48 |
* |
|
49 |
* @param queue |
|
50 |
* the queue |
|
51 |
* @param filter |
|
52 |
* the filter |
|
53 |
* @param depthLimit |
|
54 |
* the depth limit |
|
55 |
* @param itemParam |
|
56 |
* the item param |
|
57 |
* @param protocol |
|
58 |
* the protocol |
|
59 |
*/ |
|
60 |
public FtpFileWalker(final BlockingQueue<T> queue, final FTPFileFilter filter, final int depthLimit, final ItemUtility itemParam, final String protocol) { |
|
61 |
super(filter, depthLimit, itemParam, protocol); |
|
62 |
this.queue = queue; |
|
63 |
} |
|
64 |
|
|
65 |
@SuppressWarnings("unchecked") |
|
66 |
public void doWalk(final String source) throws IOException { |
|
67 |
try { |
|
68 |
initialize(); |
|
69 |
} catch (Exception e) { |
|
70 |
enqueue(queue, (T) FtpIterable.done); |
|
71 |
throw new IllegalStateException(e); |
|
72 |
} |
|
73 |
|
|
74 |
FTPFile[] files = clientProvider.listFiles(source); |
|
75 |
if (files.length == 1) { |
|
76 |
if (source.contains(files[0].getName())) { |
|
77 |
enqueue(queue, (T) getFileFrom(source)); |
|
78 |
} else { |
|
79 |
walk(source, queue); |
|
80 |
} |
|
81 |
} else if (files.length > 0) { |
|
82 |
walk(source, queue); |
|
83 |
} |
|
84 |
enqueue(queue, (T) FtpIterable.done); |
|
85 |
} |
|
86 |
|
|
87 |
@Override |
|
88 |
@SuppressWarnings({ "unchecked", "rawtypes" }) |
|
89 |
protected void handleFile(final FTPFile file, final int depth, final Collection results) throws IOException { |
|
90 |
if (file.getName().endsWith(IGNORE_SUFFIX)) return; |
|
91 |
T myFile = (T) readFile(file); |
|
92 |
if (myFile == null) return; |
|
93 |
enqueue((BlockingQueue<T>) results, myFile); |
|
94 |
} |
|
95 |
|
|
96 |
@Override |
|
97 |
@SuppressWarnings({ "unchecked", "rawtypes" }) |
|
98 |
protected boolean handleDirectory(final String directory, final int depth, final Collection results) throws IOException { |
|
99 |
String[] tmp = directory.split("/"); |
|
100 |
if ((tmp.length > 0) && (tmp[tmp.length - 1].startsWith(IGNORE_PREFIX))) return false; |
|
101 |
return super.handleDirectory(directory, depth, results); |
|
102 |
} |
|
103 |
|
|
104 |
// ///Utility |
|
105 |
|
|
106 |
/** |
|
107 |
* Adds the element to the queue |
|
108 |
*/ |
|
109 |
private void enqueue(final BlockingQueue<T> queue, final T element) { |
|
110 |
try { |
|
111 |
queue.put(element); |
|
112 |
// In order to avoid orphan FTP walkers, a better version would be.. |
|
113 |
// if (!queue.offer(element, 5, TimeUnit.MINUTES)) { |
|
114 |
// log.info("Nobody is consuming my records! I stop gracefully!"); |
|
115 |
// } |
|
116 |
} catch (InterruptedException e) { |
|
117 |
log.warn("Hopssss... ", e); |
|
118 |
} |
|
119 |
} |
|
120 |
|
|
121 |
/** |
|
122 |
* given a file, return its content as a string |
|
123 |
* |
|
124 |
* @param file |
|
125 |
* the source |
|
126 |
* @return the file content as a single string |
|
127 |
* @throws IOException |
|
128 |
* @throws TikaException |
|
129 |
* @throws SAXException |
|
130 |
*/ |
|
131 |
private String readFile(final FTPFile file) throws IOException { |
|
132 |
|
|
133 |
if (getCurrentDirectory() == null) { |
|
134 |
clientProvider.completePendingCommand(); |
|
135 |
return getCurrentDirectory(); |
|
136 |
} |
|
137 |
String fileContent = getFileFrom(getCurrentDirectory() + "/" + file.getName()); |
|
138 |
// remove BOM if present |
|
139 |
if (fileContent.startsWith("\uFEFF")) { |
|
140 |
fileContent = fileContent.substring(1); |
|
141 |
} |
|
142 |
return fileContent; |
|
143 |
} |
|
144 |
|
|
145 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/ftp/ClientSftpDataProvider.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.ftp; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.InputStream; |
|
5 |
import java.io.StringWriter; |
|
6 |
import java.util.ArrayList; |
|
7 |
import java.util.Vector; |
|
8 |
|
|
9 |
import org.apache.commons.io.IOUtils; |
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
import org.apache.commons.net.ftp.FTPFile; |
|
13 |
|
|
14 |
import com.jcraft.jsch.Channel; |
|
15 |
import com.jcraft.jsch.ChannelSftp; |
|
16 |
import com.jcraft.jsch.ChannelSftp.LsEntry; |
|
17 |
import com.jcraft.jsch.JSch; |
|
18 |
import com.jcraft.jsch.JSchException; |
|
19 |
import com.jcraft.jsch.Session; |
|
20 |
import com.jcraft.jsch.SftpATTRS; |
|
21 |
import com.jcraft.jsch.SftpException; |
|
22 |
import com.jcraft.jsch.UserInfo; |
|
23 |
|
|
24 |
/** |
|
25 |
* The Class ClientSFTPDataProvider. |
|
26 |
*/ |
|
27 |
public class ClientSftpDataProvider implements FtpClientProvider { |
|
28 |
|
|
29 |
/** The Constant log. */ |
|
30 |
private static final Log log = LogFactory.getLog(ClientSftpDataProvider.class); |
|
31 |
|
|
32 |
/** The session. */ |
|
33 |
private Session session; |
|
34 |
|
|
35 |
/** The sftp channel. */ |
|
36 |
private ChannelSftp sftpChannel; |
|
37 |
|
|
38 |
/** The port. */ |
|
39 |
private int port; |
|
40 |
|
|
41 |
/** The item param. */ |
|
42 |
private ItemUtility itemParam; |
|
43 |
|
|
44 |
/* |
|
45 |
* (non-Javadoc) |
|
46 |
* |
|
47 |
* @see eu.dnetlib.data.collective.harvest.provider.IClientProvider#connect() |
|
48 |
*/ |
|
49 |
@Override |
|
50 |
public void connect() { |
|
51 |
try { |
|
52 |
JSch jsch = new JSch(); |
|
53 |
port = 22; |
|
54 |
session = jsch.getSession(itemParam.getUsername(), itemParam.getHost(), port); |
|
55 |
UserSFTP muser = new UserSFTP(); |
|
56 |
muser.setPassword(itemParam.getPassword()); |
|
57 |
session.setUserInfo(muser); |
|
58 |
session.connect(); |
|
59 |
Channel channel; |
|
60 |
channel = session.openChannel("sftp"); |
|
61 |
channel.connect(); |
|
62 |
sftpChannel = (ChannelSftp) channel; |
|
63 |
} catch (JSchException e) { |
|
64 |
e.printStackTrace(); |
|
65 |
} |
|
66 |
} |
|
67 |
|
|
68 |
/* |
|
69 |
* (non-Javadoc) |
|
70 |
* |
|
71 |
* @see eu.dnetlib.data.collective.harvest.provider.IClientProvider#disconnect() |
|
72 |
*/ |
|
73 |
@Override |
|
74 |
public void disconnect() { |
|
75 |
session.disconnect(); |
|
76 |
} |
|
77 |
|
|
78 |
/* |
|
79 |
* (non-Javadoc) |
|
80 |
* |
|
81 |
* @see eu.dnetlib.data.collective.harvest.provider.ftp.FTPClientProvider#changeWorkingDirectory(java.lang.String) |
|
82 |
*/ |
|
83 |
@Override |
|
84 |
public boolean changeWorkingDirectory(final String path) { |
|
85 |
boolean b = false; |
|
86 |
try { |
|
87 |
sftpChannel.cd(path); |
|
88 |
b = true; |
|
89 |
} catch (SftpException e) { |
|
90 |
b = false; |
|
91 |
} |
|
92 |
return b; |
|
93 |
} |
|
94 |
|
|
95 |
/* |
|
96 |
* (non-Javadoc) |
|
97 |
* |
|
98 |
* @see eu.dnetlib.data.collective.harvest.provider.ftp.FTPClientProvider#listFiles() |
|
99 |
*/ |
|
100 |
@SuppressWarnings("unchecked") |
|
101 |
@Override |
|
102 |
public FTPFile[] listFiles() { |
|
103 |
ArrayList<FTPFile> tmp = new ArrayList<FTPFile>(); |
|
104 |
try { |
|
105 |
Vector<LsEntry> v = sftpChannel.ls(sftpChannel.pwd()); |
|
106 |
for (LsEntry obj : v) { |
|
107 |
|
|
108 |
if (obj.getFilename().startsWith(".") == false) { |
|
109 |
|
|
110 |
FTPFile afile = new FTPFile(); |
|
111 |
afile.setName(obj.getFilename()); |
|
112 |
SftpATTRS attributes = obj.getAttrs(); |
|
113 |
if (attributes.isDir()) { |
|
114 |
afile.setType(1); |
|
115 |
} else { |
|
116 |
afile.setType(0); |
|
117 |
} |
|
118 |
tmp.add(afile); |
|
119 |
} |
|
120 |
|
|
121 |
} |
|
122 |
} catch (SftpException e) { |
|
123 |
e.printStackTrace(); |
|
124 |
} |
|
125 |
FTPFile[] files = new FTPFile[tmp.size()]; |
|
126 |
files = tmp.toArray(files); |
|
127 |
return files; |
|
128 |
} |
|
129 |
|
|
130 |
/* |
|
131 |
* (non-Javadoc) |
|
132 |
* |
|
133 |
* @see eu.dnetlib.data.collective.harvest.provider.ftp.FTPClientProvider#listFiles(java.lang.String) |
|
134 |
*/ |
|
135 |
@SuppressWarnings("unchecked") |
|
136 |
@Override |
|
137 |
public FTPFile[] listFiles(final String source) { |
|
138 |
ArrayList<FTPFile> tmp = new ArrayList<FTPFile>(); |
|
139 |
try { |
|
140 |
Vector<LsEntry> v = sftpChannel.ls(source); |
|
141 |
for (LsEntry obj : v) { |
|
142 |
|
|
143 |
if (obj.getFilename().startsWith(".") == false) { |
|
144 |
|
|
145 |
FTPFile afile = new FTPFile(); |
|
146 |
afile.setName(obj.getFilename()); |
|
147 |
SftpATTRS attributes = obj.getAttrs(); |
|
148 |
if (attributes.isDir()) { |
|
149 |
afile.setType(1); |
|
150 |
} else { |
|
151 |
afile.setType(0); |
|
152 |
} |
|
153 |
tmp.add(afile); |
|
154 |
} |
|
155 |
|
|
156 |
} |
|
157 |
} catch (SftpException e) { |
|
158 |
e.printStackTrace(); |
|
159 |
} |
|
160 |
FTPFile[] files = new FTPFile[tmp.size()]; |
|
161 |
files = tmp.toArray(files); |
|
162 |
return files; |
|
163 |
} |
|
164 |
|
|
165 |
/* |
|
166 |
* (non-Javadoc) |
|
167 |
* |
|
168 |
* @see eu.dnetlib.data.collective.harvest.provider.ftp.FTPClientProvider#printWorkingDirectory() |
|
169 |
*/ |
|
170 |
@Override |
|
171 |
public String printWorkingDirectory() { |
|
172 |
String path = null; |
|
173 |
try { |
|
174 |
path = sftpChannel.pwd(); |
|
175 |
} catch (SftpException e) { |
|
176 |
e.printStackTrace(); |
|
177 |
} |
|
178 |
return path; |
|
179 |
|
|
180 |
} |
|
181 |
|
|
182 |
/* |
|
183 |
* (non-Javadoc) |
|
184 |
* |
|
185 |
* @see eu.dnetlib.data.collective.harvest.provider.IClientProvider#retrieveFileStream(java.lang.String) |
|
186 |
*/ |
|
187 |
@Override |
|
188 |
public String retrieveFileStream(final String remote) { |
|
189 |
try { |
|
190 |
InputStream stream = sftpChannel.get(remote); |
|
191 |
|
|
192 |
StringWriter writer = new StringWriter(); |
|
193 |
IOUtils.copy(stream, writer); |
|
194 |
|
|
195 |
return writer.toString(); |
|
196 |
|
|
197 |
} catch (IOException e) { |
|
198 |
log.error("cannot read file: " + remote, e); |
|
199 |
throw new RuntimeException(e); |
|
200 |
} catch (SftpException e) { |
|
201 |
log.error("cannot read file: " + remote, e); |
|
202 |
throw new RuntimeException(e); |
|
203 |
} |
|
204 |
} |
|
205 |
|
|
206 |
/* |
|
207 |
* (non-Javadoc) |
|
208 |
* |
|
209 |
* @see eu.dnetlib.data.collective.harvest.provider.ftp.FTPClientProvider#completePendingCommand() |
|
210 |
*/ |
|
211 |
@Override |
|
212 |
public void completePendingCommand() throws IOException { |
|
213 |
// TODO Auto-generated method stub |
|
214 |
|
|
215 |
} |
|
216 |
|
|
217 |
/* |
|
218 |
* (non-Javadoc) |
|
219 |
* |
|
220 |
* @see |
|
221 |
* eu.dnetlib.data.collective.harvest.provider.IClientProvider#setItemParam(eu.dnetlib.data.collective.harvest.provider.ItemUtility) |
|
222 |
*/ |
|
223 |
@Override |
|
224 |
public void setItemParam(final ItemUtility itemUtility) { |
|
225 |
this.itemParam = itemUtility; |
|
226 |
} |
|
227 |
|
|
228 |
/* |
|
229 |
* (non-Javadoc) |
|
230 |
* |
|
231 |
* @see eu.dnetlib.data.collective.harvest.provider.IClientProvider#isConnected() |
|
232 |
*/ |
|
233 |
@Override |
|
234 |
public boolean isConnected() { |
|
235 |
return sftpChannel.isConnected(); |
|
236 |
} |
|
237 |
} |
|
238 |
|
|
239 |
class UserSFTP implements UserInfo { |
|
240 |
|
|
241 |
String password; |
|
242 |
|
|
243 |
@Override |
|
244 |
public String getPassphrase() { |
|
245 |
return null; |
|
246 |
} |
|
247 |
|
|
248 |
public void setPassword(final String password) { |
|
249 |
this.password = password; |
|
250 |
} |
|
251 |
|
|
252 |
@Override |
|
253 |
public String getPassword() { |
|
254 |
// TODO Auto-generated method stub |
|
255 |
return password; |
|
256 |
} |
|
257 |
|
|
258 |
@Override |
|
259 |
public boolean promptPassphrase(final String arg0) { |
|
260 |
// TODO Auto-generated method stub |
|
261 |
return false; |
|
262 |
} |
|
263 |
|
|
264 |
@Override |
|
265 |
public boolean promptPassword(final String arg0) { |
|
266 |
password = "puppamelo"; |
|
267 |
return true; |
|
268 |
} |
|
269 |
|
|
270 |
@Override |
|
271 |
public boolean promptYesNo(final String arg0) { |
|
272 |
// TODO Auto-generated method stub |
|
273 |
return true; |
|
274 |
} |
|
275 |
|
|
276 |
@Override |
|
277 |
public void showMessage(final String arg0) { |
|
278 |
// TODO Auto-generated method stub |
|
279 |
|
|
280 |
} |
|
281 |
|
|
282 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/archive/targz/TarGzIterable.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.archive.targz; |
|
2 |
|
|
3 |
import java.io.File; |
|
4 |
import java.net.MalformedURLException; |
|
5 |
import java.net.URL; |
|
6 |
import java.util.Iterator; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 |
import com.google.common.collect.Iterators; |
|
10 |
|
|
11 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; |
|
12 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
13 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
14 |
|
|
15 |
/** |
|
16 |
* The Class TarGzIterable. |
|
17 |
* |
|
18 |
* @author Andrea |
|
19 |
*/ |
|
20 |
public class TarGzIterable implements Iterable<String> { |
|
21 |
|
|
22 |
/** The path to tar.gz archive. */ |
|
23 |
private File tarGzFile; |
|
24 |
|
|
25 |
public TarGzIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException { |
|
26 |
try { |
|
27 |
final String tarGzPath = interfaceDescriptor.getBaseUrl(); |
|
28 |
URL tarGzUrl = new URL(tarGzPath); |
|
29 |
this.tarGzFile = new File(tarGzUrl.getPath()); |
|
30 |
if (!tarGzFile.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", tarGzFile.getPath())); } |
|
31 |
} catch (MalformedURLException e) { |
|
32 |
throw new CollectorServiceException("TarGz collector failed! ", e); |
|
33 |
} |
|
34 |
} |
|
35 |
|
|
36 |
@Override |
|
37 |
public Iterator<String> iterator() { |
|
38 |
final TarGzIterator tgzIterator = new TarGzIterator(tarGzFile.getAbsolutePath()); |
|
39 |
return Iterators.transform(tgzIterator, new Function<String, String>() { |
|
40 |
|
|
41 |
@Override |
|
42 |
public String apply(final String inputRecord) { |
|
43 |
return XmlCleaner.cleanAllEntities(inputRecord.startsWith("\uFEFF") ? inputRecord.substring(1) : inputRecord); |
|
44 |
} |
|
45 |
}); |
|
46 |
} |
|
47 |
|
|
48 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/FileGZipCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins; |
|
2 |
|
|
3 |
import java.io.BufferedInputStream; |
|
4 |
import java.io.FileInputStream; |
|
5 |
import java.net.URL; |
|
6 |
import java.util.zip.GZIPInputStream; |
|
7 |
|
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
|
|
10 |
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin { |
|
11 |
|
|
12 |
@Override |
|
13 |
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException { |
|
14 |
|
|
15 |
try { |
|
16 |
GZIPInputStream stream = new GZIPInputStream(new FileInputStream(new URL(baseUrl).getPath())); |
|
17 |
return new BufferedInputStream(stream); |
|
18 |
} catch (Exception e) { |
|
19 |
throw new CollectorServiceException(e); |
|
20 |
} |
|
21 |
} |
|
22 |
|
|
23 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/ftp/FtpDirectoryWalker.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.ftp; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.util.Collection; |
|
5 |
|
|
6 |
import org.apache.commons.logging.Log; |
|
7 |
import org.apache.commons.logging.LogFactory; |
|
8 |
import org.apache.commons.net.ftp.FTPFile; |
|
9 |
import org.apache.commons.net.ftp.FTPFileFilter; |
|
10 |
|
|
11 |
/** |
|
12 |
* FTPDirectoryWalker runs recursively under a FTP directory structure starting from a given path. |
|
13 |
* |
|
14 |
* Acts as a producer. |
|
15 |
* |
|
16 |
* @author Sandro |
|
17 |
* |
|
18 |
* @param <T> |
|
19 |
* Type of expected content to extract from files NB. Generally strings. |
|
20 |
*/ |
|
21 |
|
|
22 |
public abstract class FtpDirectoryWalker<T> { |
|
23 |
|
|
24 |
/** |
|
25 |
* logger |
|
26 |
*/ |
|
27 |
private static final Log log = LogFactory.getLog(FtpDirectoryWalker.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
28 |
|
|
29 |
/** |
|
30 |
* The file filter to use to filter files and directories. |
|
31 |
*/ |
|
32 |
// private final FTPFileFilter filter; |
|
33 |
|
|
34 |
/** |
|
35 |
* The limit on the directory depth to walk. |
|
36 |
*/ |
|
37 |
private final int depthLimit; |
|
38 |
|
|
39 |
private ItemUtility itemParam; |
|
40 |
|
|
41 |
protected FtpClientProvider clientProvider; |
|
42 |
|
|
43 |
private FTPFileFilter filter; |
|
44 |
|
|
45 |
/** |
|
46 |
* Construct an FTP instance with no filtering and unlimited <i>depth</i>. |
|
47 |
* |
|
48 |
* @param server |
|
49 |
* the server fTP to connect |
|
50 |
* @param username |
|
51 |
* the username parameter for the fTP connection, if null the connection is anonymous |
|
52 |
* @param password |
|
53 |
* the password parameter for the FTP connection |
|
54 |
*/ |
|
55 |
protected FtpDirectoryWalker(final ItemUtility itemParam) { |
|
56 |
this(null, 0, itemParam, "ftp"); |
|
57 |
} |
|
58 |
|
|
59 |
/** |
|
60 |
* Construct an instance with a filter and limit the <i>depth</i> navigated to. |
|
61 |
* <p> |
|
62 |
* The filter controls which files and directories will be navigated to as part of the walk. |
|
63 |
* |
|
64 |
* @param filter |
|
65 |
* the filter to apply, null means visit all files |
|
66 |
* @param depthLimit |
|
67 |
* controls how <i>deep</i> the hierarchy is navigated to (less than 0 means unlimited) |
|
68 |
* @param server |
|
69 |
* the server fTP to connect |
|
70 |
* @param username |
|
71 |
* the username parameter for the fTP connection, if null the connection is anonymous |
|
72 |
* @param password |
|
73 |
* the password parameter for the FTP connection |
|
74 |
* |
|
75 |
*/ |
|
76 |
protected FtpDirectoryWalker(final FTPFileFilter filter, final int depthLimit, final ItemUtility itemParam, final String protocol) { |
|
77 |
this.filter = filter; |
|
78 |
this.depthLimit = depthLimit; |
|
79 |
this.itemParam = itemParam; |
|
80 |
|
|
81 |
clientProvider = (FtpClientProvider) ClientProviderFactory.getProvider("ftp"); |
|
82 |
|
|
83 |
} |
|
84 |
|
|
85 |
/** |
|
86 |
* Overridable callback method invoked at the start of processing. |
|
87 |
* <p> |
|
88 |
* This implementation does nothing. |
|
89 |
* |
|
90 |
* @param startDirectory |
|
91 |
* the directory to start from |
|
92 |
* @param results |
|
93 |
* the collection of result objects, may be updated |
|
94 |
* @throws IOException |
|
95 |
* if an I/O Error occurs |
|
96 |
*/ |
|
97 |
protected void handleStart(final String startDirectory, final Collection<T> results) throws IOException { |
|
98 |
// do nothing - overridable by subclass |
|
99 |
} |
|
100 |
|
|
101 |
/** |
|
102 |
* Overridable callback method invoked to determine if a directory should be processed. |
|
103 |
* <p> |
|
104 |
* This method returns a boolean to indicate if the directory should be examined or not. If you return false, the entire directory and |
|
105 |
* any subdirectories will be skipped. Note that this functionality is in addition to the filtering by file filter. |
|
106 |
* <p> |
|
107 |
* This implementation does nothing and returns true. |
|
108 |
* |
|
109 |
* @param directory |
|
110 |
* the current directory being processed |
|
111 |
* @param depth |
|
112 |
* the current directory level (starting directory = 0) |
|
113 |
* @param results |
|
114 |
* the collection of result objects, may be updated |
|
115 |
* @return true to process this directory, false to skip this directory |
|
116 |
* @throws IOException |
|
117 |
* if an I/O Error occurs |
|
118 |
*/ |
|
119 |
protected boolean handleDirectory(final String directory, final int depth, final Collection<T> results) throws IOException { |
|
120 |
// do nothing - overridable by subclass |
|
121 |
return true; // process directory |
|
122 |
} |
|
123 |
|
|
124 |
/** |
|
125 |
* Overridable callback method invoked at the start of processing each directory. |
|
126 |
* <p> |
|
127 |
* This implementation does nothing. |
|
128 |
* |
|
129 |
* @param directory |
|
130 |
* the current directory being processed |
|
131 |
* @param depth |
|
132 |
* the current directory level (starting directory = 0) |
|
133 |
* @param results |
|
134 |
* the collection of result objects, may be updated |
|
135 |
* @throws IOException |
|
136 |
* if an I/O Error occurs |
|
137 |
*/ |
|
138 |
protected void handleDirectoryStart(final String directory, final int depth, final Collection<T> results) throws IOException { |
|
139 |
// do nothing - overridable by subclass |
|
140 |
} |
|
141 |
|
|
142 |
/** |
|
143 |
* Overridable callback method invoked for each (non-directory) file. |
|
144 |
* <p> |
|
145 |
* This implementation does nothing. |
|
146 |
* |
|
147 |
* @param file |
|
148 |
* the current file being processed |
|
149 |
* @param depth |
|
150 |
* the current directory level (starting directory = 0) |
|
151 |
* @param results |
|
152 |
* the collection of result objects, may be updated |
|
153 |
* @throws IOException |
|
154 |
* if an I/O Error occurs |
|
155 |
*/ |
|
156 |
protected void handleFile(final FTPFile file, final int depth, final Collection<T> results) throws IOException { |
|
157 |
// do nothing - overridable by subclass |
|
158 |
} |
|
159 |
|
|
160 |
/** |
|
161 |
* Overridable callback method invoked at the end of processing. |
|
162 |
* <p> |
|
163 |
* This implementation does nothing. |
|
164 |
* |
|
165 |
* @param results |
|
166 |
* the collection of result objects, may be updated |
|
167 |
* @throws IOException |
|
168 |
* if an I/O Error occurs |
|
169 |
*/ |
|
170 |
protected void handleEnd(final Collection<T> results) throws IOException { |
|
171 |
// do nothing - overridable by subclass |
|
172 |
clientProvider.disconnect(); |
|
173 |
} |
|
174 |
|
|
175 |
public void initialize() { |
|
176 |
clientProvider.setItemParam(itemParam); |
|
177 |
clientProvider.connect(); |
|
178 |
clientProvider.changeWorkingDirectory(itemParam.getBasePath()); |
|
179 |
|
|
180 |
} |
|
181 |
|
|
182 |
/** |
|
183 |
* Internal method that walks the directory hierarchy in a depth-first manner. |
|
184 |
* <p> |
|
185 |
* Users of this class do not need to call this method. This method will be called automatically by another (public) method on the |
|
186 |
* specific subclass. |
|
187 |
* <p> |
|
188 |
* Writers of subclasses should call this method to start the directory walk. Once called, this method will emit events as it walks the |
|
189 |
* hierarchy. The event methods have the prefix <code>handle</code>. |
|
190 |
* |
|
191 |
* @param startDirectory |
|
192 |
* the directory to start from, not null |
|
193 |
* @param results |
|
194 |
* the collection of result objects, may be updated |
|
195 |
* @throws NullPointerException |
|
196 |
* if the start directory is null |
|
197 |
* @throws IOException |
|
198 |
* if an I/O Error occurs |
|
199 |
*/ |
|
200 |
protected final void walk(final String startDirectory, final Collection<T> results) throws IOException { |
|
201 |
if (startDirectory == null) { throw new NullPointerException("Start Directory is null"); } |
|
202 |
handleStart(startDirectory, results); |
|
203 |
|
|
204 |
walk(startDirectory, -1, results); |
|
205 |
handleEnd(results); |
|
206 |
|
|
207 |
} |
|
208 |
|
|
209 |
/** |
|
210 |
* Main recursive method to examine the directory hierarchy. |
|
211 |
* |
|
212 |
* @param directory |
|
213 |
* the directory to examine, not null |
|
214 |
* @param depth |
|
215 |
* the directory level (starting directory = 0) |
|
216 |
* @param results |
|
217 |
* the collection of result objects, may be updated |
|
218 |
* @throws IOException |
|
219 |
* if an I/O Error occurs |
|
220 |
*/ |
|
221 |
private void walk(final String directory, final int depth, final Collection<T> results) throws IOException { |
|
222 |
if (handleDirectory(directory, depth, results)) { |
|
223 |
String currentDirectory = clientProvider.printWorkingDirectory(); |
|
224 |
clientProvider.changeWorkingDirectory(directory); |
|
225 |
int childDepth = depth + 1; |
|
226 |
if ((depthLimit < 0) || (childDepth <= depthLimit)) { |
|
227 |
// TODO: must be implemented the filter of file |
|
228 |
FTPFile[] childFiles = clientProvider.listFiles(); |
|
229 |
if (childFiles != null) { |
|
230 |
for (FTPFile childFile : childFiles) { |
|
231 |
if (childFile.isDirectory()) { |
|
232 |
handleDirectoryStart(clientProvider.printWorkingDirectory() + "/" + childFile.getName(), childDepth, results); |
|
233 |
walk(clientProvider.printWorkingDirectory() + "/" + childFile.getName(), childDepth, results); |
|
234 |
clientProvider.changeWorkingDirectory(currentDirectory); |
|
235 |
} else { |
|
236 |
if ((filter == null) || filter.accept(childFile)) { |
|
237 |
log.info("Take the File " + childFile.getName()); |
|
238 |
handleFile(childFile, childDepth, results); |
|
239 |
} |
|
240 |
|
|
241 |
} |
|
242 |
} |
|
243 |
} |
|
244 |
} |
|
245 |
handleDirectoryEnd(directory, depth, results); |
|
246 |
} |
|
247 |
|
|
248 |
} |
|
249 |
|
|
250 |
protected String getCurrentDirectory() throws IOException { |
|
251 |
return clientProvider.printWorkingDirectory(); |
|
252 |
|
|
253 |
} |
|
254 |
|
|
255 |
/** |
|
256 |
* Overridable callback method invoked at the end of processing each directory. |
|
257 |
* <p> |
|
258 |
* This implementation does nothing. |
|
259 |
* |
|
260 |
* @param directory |
|
261 |
* the directory being processed |
|
262 |
* @param depth |
|
263 |
* the current directory level (starting directory = 0) |
|
264 |
* @param results |
|
265 |
* the collection of result objects, may be updated |
|
266 |
* @throws IOException |
|
267 |
* if an I/O Error occurs |
|
268 |
*/ |
|
269 |
protected void handleDirectoryEnd(final String directory, final int depth, final Collection<T> results) throws IOException { |
|
270 |
// do nothing - overridable by subclass |
|
271 |
} |
|
272 |
|
|
273 |
protected String getFileFrom(final String remote) throws IOException { |
|
274 |
return clientProvider.retrieveFileStream(remote); |
|
275 |
} |
|
276 |
|
|
277 |
protected void completePendingCommand() throws IOException { |
|
278 |
clientProvider.completePendingCommand(); |
|
279 |
} |
|
280 |
|
|
281 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/ftp/ItemUtility.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.ftp; |
|
2 |
|
|
3 |
import java.io.UnsupportedEncodingException; |
|
4 |
import java.net.URI; |
|
5 |
import java.net.URISyntaxException; |
|
6 |
import java.net.URLEncoder; |
|
7 |
|
|
8 |
import org.apache.commons.codec.binary.Base64; |
|
9 |
|
|
10 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
11 |
|
|
12 |
/** |
|
13 |
* Utility class for the generation of the Linked resultset |
|
14 |
*/ |
|
15 |
public class ItemUtility { |
|
16 |
|
|
17 |
/** The username. */ |
|
18 |
private String username; |
|
19 |
|
|
20 |
/** The password. */ |
|
21 |
private String password; |
|
22 |
|
|
23 |
/** The host. */ |
|
24 |
private String host; |
|
25 |
|
|
26 |
/** The base path. */ |
|
27 |
private String basePath; |
|
28 |
|
|
29 |
/** The controller url. */ |
|
30 |
private String controllerURL; |
|
31 |
|
|
32 |
private String protocol; |
|
33 |
|
|
34 |
public ItemUtility() {} |
|
35 |
|
|
36 |
public ItemUtility(final InterfaceDescriptor descriptor) { |
|
37 |
|
|
38 |
URI source = null; |
|
39 |
try { |
|
40 |
source = new URI(descriptor.getBaseUrl()); |
|
41 |
} catch (URISyntaxException e) { |
|
42 |
return; |
|
43 |
} |
|
44 |
this.setHost(source.getHost()); |
|
45 |
this.setBasePath(source.getPath()); |
|
46 |
this.protocol = source.getScheme(); |
|
47 |
this.username = descriptor.getParams().get("username"); |
|
48 |
this.password = descriptor.getParams().get("password"); |
|
49 |
|
|
50 |
} |
|
51 |
|
|
52 |
/** |
|
53 |
* Gets the username. |
|
54 |
* |
|
55 |
* @return the username |
|
56 |
*/ |
|
57 |
public String getUsername() { |
|
58 |
return username; |
|
59 |
} |
|
60 |
|
|
61 |
/** |
|
62 |
* Sets the username. |
|
63 |
* |
|
64 |
* @param username |
|
65 |
* the new username |
|
66 |
*/ |
|
67 |
public ItemUtility setUsername(final String username) { |
|
68 |
this.username = username; |
|
69 |
return this; |
|
70 |
} |
|
71 |
|
|
72 |
/** |
|
73 |
* Gets the password. |
|
74 |
* |
|
75 |
* @return the password |
|
76 |
*/ |
|
77 |
public String getPassword() { |
|
78 |
return password; |
|
79 |
} |
|
80 |
|
|
81 |
/** |
|
82 |
* Sets the password. |
|
83 |
* |
|
84 |
* @param password |
|
85 |
* the new password |
|
86 |
*/ |
|
87 |
public ItemUtility setPassword(final String password) { |
|
88 |
this.password = password; |
|
89 |
return this; |
|
90 |
} |
|
91 |
|
|
92 |
/** |
|
93 |
* Gets the host. |
|
94 |
* |
|
95 |
* @return the host |
|
96 |
*/ |
|
97 |
public String getHost() { |
|
98 |
return host; |
|
99 |
} |
|
100 |
|
|
101 |
/** |
|
102 |
* Sets the host. |
|
103 |
* |
|
104 |
* @param host |
|
105 |
* the new host |
|
106 |
*/ |
|
107 |
public ItemUtility setHost(final String host) { |
|
108 |
this.host = host; |
|
109 |
return this; |
|
110 |
} |
|
111 |
|
|
112 |
/** |
|
113 |
* Gets the base path. |
|
114 |
* |
|
115 |
* @return the base path |
|
116 |
*/ |
|
117 |
public String getBasePath() { |
|
118 |
return basePath; |
|
119 |
} |
|
120 |
|
|
121 |
/** |
|
122 |
* Sets the base path. |
|
123 |
* |
|
124 |
* @param basePath |
|
125 |
* the new base path |
|
126 |
*/ |
|
127 |
public ItemUtility setBasePath(final String basePath) { |
|
128 |
this.basePath = basePath; |
|
129 |
return this; |
|
130 |
} |
|
131 |
|
|
132 |
/** |
|
133 |
* Gets the controller url. |
|
134 |
* |
|
135 |
* @return the controller url |
|
136 |
*/ |
|
137 |
public String getControllerURL() { |
|
138 |
return controllerURL; |
|
139 |
} |
|
140 |
|
|
141 |
/** |
|
142 |
* Sets the controller url. |
|
143 |
* |
|
144 |
* @param controllerURL |
|
145 |
* the new controller url |
|
146 |
*/ |
|
147 |
public ItemUtility setControllerURL(final String controllerURL) { |
|
148 |
this.controllerURL = controllerURL; |
|
149 |
return this; |
|
150 |
} |
|
151 |
|
|
152 |
public String getProtocol() { |
|
153 |
return protocol; |
|
154 |
} |
|
155 |
|
|
156 |
public ItemUtility setProtocol(final String protocol) { |
|
157 |
this.protocol = protocol; |
|
158 |
return this; |
|
159 |
} |
|
160 |
|
|
161 |
/** |
|
162 |
* Generate url encodec. |
|
163 |
* |
|
164 |
* @param absolutePath |
|
165 |
* the relative path |
|
166 |
* @return the string |
|
167 |
* @throws UnsupportedEncodingException |
|
168 |
* the unsupported encoding exception |
|
169 |
*/ |
|
170 |
public String generateURLEncoded(final String absolutePath) throws UnsupportedEncodingException { |
|
171 |
URI uri = URI.create(absolutePath); |
|
172 |
|
|
173 |
StringBuilder sb = serialize(uri); |
|
174 |
|
|
175 |
byte[] miaEnc = Base64.encodeBase64(sb.toString().getBytes()); |
|
176 |
return controllerURL + "?var=" + URLEncoder.encode(new String(miaEnc), "UTF-8"); |
|
177 |
} |
|
178 |
|
|
179 |
@Override |
|
180 |
public String toString() { |
|
181 |
return serialize(URI.create(getProtocol() + "://" + getHost() + getBasePath())).toString(); |
|
182 |
} |
|
183 |
|
|
184 |
private StringBuilder serialize(final URI uri) { |
|
185 |
StringBuilder sb = new StringBuilder(); |
|
186 |
sb.append(uri.getScheme() + "://"); |
|
187 |
if ((username != null) && (password != null)) { |
|
188 |
sb.append(username); |
|
189 |
sb.append(":"); |
|
190 |
sb.append(password); |
|
191 |
sb.append("@"); |
|
192 |
} |
|
193 |
sb.append(uri.getHost()); |
|
194 |
sb.append(uri.getPath()); |
|
195 |
|
|
196 |
return sb; |
|
197 |
} |
|
198 |
|
|
199 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-modular-collector-service</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>3.1.12</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>eu.dnetlib</groupId> |
|
20 |
<artifactId>dnet-modular-collector-service-rmi</artifactId> |
|
21 |
<version>[1.3.0,2.0.0)</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>eu.dnetlib</groupId> |
|
25 |
<artifactId>cnr-resultset-service</artifactId> |
|
26 |
<version>[2.0.0, 3.0.0)</version> |
|
27 |
</dependency> |
|
28 |
<dependency> |
|
29 |
<groupId>eu.dnetlib</groupId> |
|
30 |
<artifactId>cnr-blackboard-common</artifactId> |
|
31 |
<version>[2.0.0,3.0.0)</version> |
|
32 |
</dependency> |
|
33 |
<dependency> |
|
34 |
<groupId>javax.servlet</groupId> |
|
35 |
<artifactId>javax.servlet-api</artifactId> |
|
36 |
<version>${javax.servlet.version}</version> |
|
37 |
<scope>provided</scope> |
|
38 |
</dependency> |
|
39 |
<dependency> |
|
40 |
<groupId>net.sf.opencsv</groupId> |
|
41 |
<artifactId>opencsv</artifactId> |
|
42 |
<version>2.0</version> |
|
43 |
</dependency> |
|
44 |
<dependency> |
|
45 |
<groupId>junit</groupId> |
|
46 |
<artifactId>junit</artifactId> |
|
47 |
<version>4.8.2</version> |
|
48 |
<scope>test</scope> |
|
49 |
</dependency> |
|
50 |
<dependency> |
|
51 |
<groupId>commons-net</groupId> |
|
52 |
<artifactId>commons-net</artifactId> |
|
53 |
<version>3.3</version> |
|
54 |
</dependency> |
|
55 |
<dependency> |
|
56 |
<groupId>jcraft</groupId> |
|
57 |
<artifactId>jsch</artifactId> |
|
58 |
<version>0.1.44</version> |
|
59 |
</dependency> |
|
60 |
<dependency> |
|
61 |
<groupId>org.apache.commons</groupId> |
|
62 |
<artifactId>commons-compress</artifactId> |
|
63 |
<version>1.6</version> |
|
64 |
</dependency> |
|
65 |
<dependency> |
|
66 |
<groupId>org.mockito</groupId> |
|
67 |
<artifactId>mockito-core</artifactId> |
|
68 |
<version>1.6</version> |
|
69 |
<scope>test</scope> |
|
70 |
</dependency> |
|
71 |
<dependency> |
|
72 |
<groupId>commons-httpclient</groupId> |
|
73 |
<artifactId>commons-httpclient</artifactId> |
|
74 |
<version>3.1</version> |
|
75 |
</dependency> |
|
76 |
<dependency> |
|
77 |
<groupId>com.google.code.gson</groupId> |
|
78 |
<artifactId>gson</artifactId> |
|
79 |
<version>${google.gson.version}</version> |
|
80 |
</dependency> |
|
81 |
<dependency> |
|
82 |
<groupId>org.apache.commons</groupId> |
|
83 |
<artifactId>commons-csv</artifactId> |
|
84 |
<version>1.0</version> |
|
85 |
</dependency> |
|
86 |
</dependencies> |
|
87 |
</project> |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/HttpConnector.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai.engine; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.InputStream; |
|
5 |
import java.net.HttpURLConnection; |
|
6 |
import java.net.URL; |
|
7 |
import java.security.GeneralSecurityException; |
|
8 |
import java.security.KeyManagementException; |
|
9 |
import java.security.NoSuchAlgorithmException; |
|
10 |
import java.security.cert.CertificateException; |
|
11 |
import java.security.cert.X509Certificate; |
|
12 |
import java.util.List; |
|
13 |
import java.util.Map; |
|
14 |
|
|
15 |
import javax.net.ssl.HttpsURLConnection; |
|
16 |
import javax.net.ssl.SSLContext; |
|
17 |
import javax.net.ssl.TrustManager; |
|
18 |
import javax.net.ssl.X509TrustManager; |
|
19 |
|
|
20 |
import org.apache.commons.io.IOUtils; |
|
21 |
import org.apache.commons.lang.math.NumberUtils; |
|
22 |
import org.apache.commons.logging.Log; |
|
23 |
import org.apache.commons.logging.LogFactory; |
|
24 |
|
|
25 |
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList; |
|
26 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
27 |
|
|
28 |
/** |
|
29 |
* @author jochen, michele, andrea |
|
30 |
* |
|
31 |
*/ |
|
32 |
public class HttpConnector { |
|
33 |
|
|
34 |
private static final Log log = LogFactory.getLog(HttpConnector.class); |
|
35 |
|
|
36 |
private int maxNumberOfRetry = 6; |
|
37 |
private int defaultDelay = 120; // seconds |
|
38 |
private int readTimeOut = 120; // seconds |
|
39 |
|
|
40 |
/** |
|
41 |
* @param requestUrl |
|
42 |
* @return the content of the downloaded resource |
|
43 |
* @throws CollectorServiceException |
|
44 |
*/ |
|
45 |
public String getInputSource(final String requestUrl) throws CollectorServiceException { |
|
46 |
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); |
|
47 |
} |
|
48 |
|
|
49 |
private String attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) |
|
50 |
throws CollectorServiceException { |
|
51 |
|
|
52 |
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); } |
|
53 |
|
|
54 |
log.debug("Downloading " + requestUrl + " - try: " + retryNumber); |
|
55 |
try { |
|
56 |
InputStream input = null; |
|
57 |
|
|
58 |
try { |
|
59 |
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); |
|
60 |
urlConn.setInstanceFollowRedirects(false); |
|
61 |
urlConn.setReadTimeout(readTimeOut * 1000); |
|
62 |
|
|
63 |
if (log.isDebugEnabled()) { |
|
64 |
logHeaderFields(urlConn); |
|
65 |
} |
|
66 |
|
|
67 |
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); |
|
68 |
if (retryAfter > 0) { |
|
69 |
log.warn("waiting and repeating request after " + retryAfter + " sec."); |
|
70 |
Thread.sleep(retryAfter * 1000); |
|
71 |
errorList.add("503 Service Unavailable"); |
|
72 |
return attemptDownload(requestUrl, retryNumber + 1, errorList); |
|
73 |
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) { |
|
74 |
final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); |
|
75 |
log.info("The requested url has been moved to " + newUrl); |
|
76 |
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); |
|
77 |
return attemptDownload(newUrl, retryNumber + 1, errorList); |
|
78 |
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { |
|
79 |
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); |
|
80 |
Thread.sleep(defaultDelay * 1000); |
|
81 |
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); |
|
82 |
return attemptDownload(requestUrl, retryNumber + 1, errorList); |
|
83 |
} else { |
|
84 |
input = urlConn.getInputStream(); |
|
85 |
return IOUtils.toString(input); |
|
86 |
} |
|
87 |
} catch (IOException e) { |
|
88 |
log.error("error while retrieving from http-connection occured: " + e, e); |
|
89 |
Thread.sleep(defaultDelay * 1000); |
|
90 |
errorList.add(e.getMessage()); |
|
91 |
return attemptDownload(requestUrl, retryNumber + 1, errorList); |
|
92 |
} finally { |
|
93 |
IOUtils.closeQuietly(input); |
|
94 |
} |
|
95 |
} catch (InterruptedException e) { |
|
96 |
throw new CollectorServiceException(e); |
|
97 |
} |
|
98 |
} |
|
99 |
|
|
100 |
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { |
|
101 |
log.debug("StatusCode: " + urlConn.getResponseMessage()); |
|
102 |
|
|
103 |
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) { |
|
104 |
if (e.getKey() != null) { |
|
105 |
for (String v : e.getValue()) { |
|
106 |
log.debug(" key: " + e.getKey() + " - value: " + v); |
|
107 |
} |
|
108 |
} |
|
109 |
} |
|
110 |
} |
|
111 |
|
|
112 |
private int obtainRetryAfter(final Map<String, List<String>> headerMap) { |
|
113 |
for (String key : headerMap.keySet()) { |
|
114 |
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer |
|
115 |
.parseInt(headerMap.get(key).get(0)) + 10; } |
|
116 |
} |
|
117 |
return -1; |
|
118 |
} |
|
119 |
|
|
120 |
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException { |
|
121 |
for (String key : headerMap.keySet()) { |
|
122 |
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); } |
|
123 |
} |
|
124 |
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING"); |
|
125 |
} |
|
126 |
|
|
127 |
/** |
|
128 |
* register for https scheme; this is a workaround and not intended for the use in trusted environments |
|
129 |
* |
|
130 |
* @throws NoSuchAlgorithmException |
|
131 |
* @throws KeyManagementException |
|
132 |
*/ |
|
133 |
public void initTrustManager() { |
|
134 |
final X509TrustManager tm = new X509TrustManager() { |
|
135 |
|
|
136 |
@Override |
|
137 |
public void checkClientTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {} |
|
138 |
|
|
139 |
@Override |
|
140 |
public void checkServerTrusted(final X509Certificate[] xcs, final String string) throws CertificateException {} |
|
141 |
|
|
142 |
@Override |
|
143 |
public X509Certificate[] getAcceptedIssuers() { |
|
144 |
return null; |
|
145 |
} |
|
146 |
}; |
|
147 |
try { |
|
148 |
final SSLContext ctx = SSLContext.getInstance("TLS"); |
|
149 |
ctx.init(null, new TrustManager[] { tm }, null); |
|
150 |
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); |
|
151 |
} catch (GeneralSecurityException e) { |
|
152 |
log.fatal(e); |
|
153 |
throw new IllegalStateException(e); |
|
154 |
} |
|
155 |
} |
|
156 |
|
|
157 |
public int getMaxNumberOfRetry() { |
|
158 |
return maxNumberOfRetry; |
|
159 |
} |
|
160 |
|
|
161 |
public void setMaxNumberOfRetry(final int maxNumberOfRetry) { |
|
162 |
this.maxNumberOfRetry = maxNumberOfRetry; |
|
163 |
} |
|
164 |
|
|
165 |
public int getDefaultDelay() { |
|
166 |
return defaultDelay; |
|
167 |
} |
|
168 |
|
|
169 |
public void setDefaultDelay(final int defaultDelay) { |
|
170 |
this.defaultDelay = defaultDelay; |
|
171 |
} |
|
172 |
|
|
173 |
public int getReadTimeOut() { |
|
174 |
return readTimeOut; |
|
175 |
} |
|
176 |
|
|
177 |
public void setReadTimeOut(final int readTimeOut) { |
|
178 |
this.readTimeOut = readTimeOut; |
|
179 |
} |
|
180 |
|
|
181 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/functions/ListOaiSetsFunction.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.functions; |
|
2 |
|
|
3 |
import java.io.StringReader; |
|
4 |
import java.util.Iterator; |
|
5 |
import java.util.List; |
|
6 |
import java.util.Map; |
|
7 |
|
|
8 |
import org.dom4j.Document; |
|
9 |
import org.dom4j.DocumentException; |
|
10 |
import org.dom4j.io.SAXReader; |
|
11 |
import org.springframework.beans.factory.annotation.Required; |
|
12 |
|
|
13 |
import com.google.common.base.Function; |
|
14 |
import com.google.common.collect.Iterators; |
|
15 |
import com.google.common.collect.Lists; |
|
16 |
|
|
17 |
import eu.dnetlib.data.collector.plugins.oaisets.OaiSetsIteratorFactory; |
|
18 |
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue; |
|
19 |
|
|
20 |
public class ListOaiSetsFunction implements ParamValuesFunction { |
|
21 |
|
|
22 |
private OaiSetsIteratorFactory oaiSetsIteratorFactory; |
|
23 |
|
|
24 |
@Override |
|
25 |
public List<ProtocolParameterValue> findValues(final String baseUrl, final Map<String, String> params) { |
|
26 |
final SAXReader reader = new SAXReader(); |
|
27 |
|
|
28 |
final Iterator<ProtocolParameterValue> iter = Iterators.transform(oaiSetsIteratorFactory.newIterator(baseUrl), |
|
29 |
new Function<String, ProtocolParameterValue>() { |
|
30 |
|
|
31 |
@Override |
|
32 |
public ProtocolParameterValue apply(final String s) { |
|
33 |
try { |
|
34 |
final Document doc = reader.read(new StringReader(s)); |
|
35 |
final String id = doc.valueOf("//*[local-name()='setSpec']"); |
|
36 |
final String name = doc.valueOf("//*[local-name()='setName']"); |
|
37 |
return new ProtocolParameterValue(id, name); |
|
38 |
} catch (DocumentException e) { |
|
39 |
throw new RuntimeException("Error in ListSets", e); |
|
40 |
} |
|
41 |
} |
|
42 |
}); |
|
43 |
return Lists.newArrayList(iter); |
|
44 |
} |
|
45 |
|
|
46 |
public OaiSetsIteratorFactory getOaiSetsIteratorFactory() { |
|
47 |
return oaiSetsIteratorFactory; |
|
48 |
} |
|
49 |
|
|
50 |
@Required |
|
51 |
public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) { |
|
52 |
this.oaiSetsIteratorFactory = oaiSetsIteratorFactory; |
|
53 |
} |
|
54 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/CollectorServiceImpl.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
import java.util.Set; |
|
6 |
|
|
7 |
import javax.annotation.Resource; |
|
8 |
import javax.xml.ws.wsaddressing.W3CEndpointReference; |
|
9 |
|
|
10 |
import com.google.common.collect.Lists; |
|
11 |
import com.google.common.collect.Sets; |
|
12 |
|
|
13 |
import eu.dnetlib.data.collector.plugin.CollectorPlugin; |
|
14 |
import eu.dnetlib.data.collector.rmi.CollectorService; |
|
15 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
16 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
17 |
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor; |
|
18 |
import eu.dnetlib.data.collector.rmi.ProtocolParameter; |
|
19 |
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue; |
|
20 |
import eu.dnetlib.enabling.resultset.IterableResultSetFactory; |
|
21 |
import eu.dnetlib.enabling.tools.AbstractBaseService; |
|
22 |
|
|
23 |
public class CollectorServiceImpl extends AbstractBaseService implements CollectorService { |
|
24 |
|
|
25 |
@Resource |
|
26 |
private CollectorPluginEnumerator collectorPluginEnumerator; |
|
27 |
|
|
28 |
@Resource |
|
29 |
private IterableResultSetFactory iterableResultSetFactory; |
|
30 |
|
|
31 |
@Override |
|
32 |
public W3CEndpointReference collect(final InterfaceDescriptor ifDescriptor) throws CollectorServiceException { |
|
33 |
return dateRangeCollect(ifDescriptor, null, null); |
|
34 |
} |
|
35 |
|
|
36 |
@Override |
|
37 |
public W3CEndpointReference dateRangeCollect( |
|
38 |
final InterfaceDescriptor ifDescriptor, final String from, final String until) |
|
39 |
throws CollectorServiceException { |
|
40 |
final CollectorPlugin plugin = collectorPluginEnumerator.get(ifDescriptor.getProtocol()); |
|
41 |
|
|
42 |
if (!verifyParams(ifDescriptor.getParams().keySet(), Sets.newHashSet(plugin.listNameParameters()))) { throw new CollectorServiceException( |
|
43 |
"Invalid parameters, valid: " + plugin.listNameParameters() + ", current: " + ifDescriptor.getParams().keySet()); } |
|
44 |
|
|
45 |
final Iterable<String> iter = plugin.collect(ifDescriptor, from, until); |
|
46 |
|
|
47 |
return iterableResultSetFactory.createIterableResultSet(iter); |
|
48 |
} |
|
49 |
|
|
50 |
@Override |
|
51 |
public List<ProtocolDescriptor> listProtocols() { |
|
52 |
final List<ProtocolDescriptor> list = Lists.newArrayList(); |
|
53 |
for (CollectorPlugin plugin : collectorPluginEnumerator.getAll()) { |
|
54 |
list.add(plugin.getProtocolDescriptor()); |
|
55 |
} |
|
56 |
return list; |
|
57 |
} |
|
58 |
|
|
59 |
@Override |
|
60 |
public List<ProtocolParameterValue> listValidValuesForParam(final String protocol, |
|
61 |
final String baseUrl, |
|
62 |
final String param, |
|
63 |
final Map<String, String> otherParams) throws CollectorServiceException { |
|
64 |
final CollectorPlugin plugin = collectorPluginEnumerator.get(protocol); |
|
65 |
|
|
66 |
for (ProtocolParameter pp : plugin.getProtocolDescriptor().getParams()) { |
|
67 |
if (pp.getName().equals(param) && pp.isFunctionPopulated()) { return pp.getPopulateFunction().findValues(baseUrl, otherParams); } |
|
68 |
} |
|
69 |
|
|
70 |
return Lists.newArrayList(); |
|
71 |
} |
|
72 |
|
|
73 |
private boolean verifyParams(final Set<String> curr, final Set<String> valid) { |
|
74 |
return valid.containsAll(curr); |
|
75 |
} |
|
76 |
|
|
77 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.1.12/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai.engine; |
|
2 |
|
|
3 |
import java.util.HashMap; |
|
4 |
import java.util.HashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.Set; |
|
7 |
import java.util.regex.Pattern; |
|
8 |
|
|
9 |
/** |
|
10 |
* @author jochen |
|
11 |
* |
|
12 |
*/ |
|
13 |
public class XmlCleaner { |
|
14 |
/** |
|
15 |
* Pattern for numeric entities. |
|
16 |
*/ |
|
17 |
private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ |
|
18 |
// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ |
|
19 |
private static Pattern invalidControlCharPattern = Pattern.compile(""); |
|
20 |
/** |
|
21 |
* Pattern that negates the allowable XML 4 byte unicode characters. Valid |
|
22 |
* are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | |
|
23 |
* [#x10000-#x10FFFF] |
|
24 |
*/ |
|
25 |
private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$ |
|
26 |
|
|
27 |
// Map entities to their unicode equivalent |
|
28 |
private static Set<String> goodEntities = new HashSet<String>(); |
|
29 |
private static Map<String, String> badEntities = new HashMap<String, String>(); |
|
30 |
|
|
31 |
static { |
|
32 |
// pre-defined XML entities |
|
33 |
goodEntities.add("""); //$NON-NLS-1$ // quotation mark |
|
34 |
goodEntities.add("&"); //$NON-NLS-1$ // ampersand |
|
35 |
goodEntities.add("<"); //$NON-NLS-1$ // less-than sign |
|
36 |
goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign |
|
37 |
// control entities |
|
38 |
//badEntities.put("", ""); |
|
39 |
// misc entities |
|
40 |
badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro |
|
41 |
badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark |
|
42 |
badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark |
|
43 |
// Latin 1 entities |
|
44 |
badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space |
|
45 |
badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark |
|
46 |
badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign |
|
47 |
badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign |
|
48 |
badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign |
|
49 |
badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign |
|
50 |
badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar |
|
51 |
badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign |
|
52 |
badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis |
|
53 |
badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign |
|
54 |
badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator |
|
55 |
badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark |
|
56 |
badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign |
|
57 |
badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen |
|
58 |
badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign |
|
59 |
badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron |
|
60 |
badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign |
|
61 |
badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign |
|
62 |
badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two |
|
63 |
badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three |
|
64 |
badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent |
|
65 |
badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign |
|
66 |
badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign |
|
67 |
badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot |
|
68 |
badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-modular-collector-service-3.1.12