Revision 51556
Added by Claudio Atzori about 6 years ago
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/mongo/MongoDumpIterable.java | ||
---|---|---|
8 | 8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
9 | 9 |
import eu.dnetlib.data.information.collectionservice.rmi.CollectionServiceException; |
10 | 10 |
|
11 |
// TODO: Auto-generated Javadoc |
|
12 | 11 |
/** |
13 | 12 |
* The Class MongoDumpIterable. |
14 | 13 |
*/ |
... | ... | |
20 | 19 |
/** |
21 | 20 |
* Instantiates a new mongo dump iterable. |
22 | 21 |
* |
23 |
* @param inputFile |
|
24 |
* the input file |
|
25 |
* @throws CollectionServiceException |
|
26 |
* the collection service exception |
|
22 |
* @param inputFile the input file |
|
23 |
* @throws CollectorServiceException the collector service exception |
|
27 | 24 |
*/ |
28 | 25 |
public MongoDumpIterable(final File inputFile) throws CollectorServiceException { |
29 | 26 |
try { |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FileSystemIterator.java | ||
---|---|---|
16 | 16 |
import com.google.common.collect.Sets; |
17 | 17 |
|
18 | 18 |
/** |
19 |
* Class enabling lazy & recursive iteration of a filesystem tree. The iterator iterates over file paths.
|
|
19 |
* Class enabling lazy and recursive iteration of a filesystem tree. The iterator iterates over file paths.
|
|
20 | 20 |
* |
21 | 21 |
* @author Andrea |
22 | 22 |
* |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIterator.java | ||
---|---|---|
15 | 15 |
import org.dom4j.io.SAXReader; |
16 | 16 |
|
17 | 17 |
import eu.dnetlib.data.collector.plugins.HttpConnector; |
18 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; |
|
19 | 18 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
20 | 19 |
|
20 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; |
|
21 |
|
|
21 | 22 |
public class OaiIterator implements Iterator<String> { |
22 | 23 |
|
23 | 24 |
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.java | ||
---|---|---|
174 | 174 |
/** |
175 | 175 |
* For each entity in the input that is not allowed in XML, replace the |
176 | 176 |
* entity with its unicode equivalent or remove it. For each instance of a |
177 |
* bare &, replace it with &amp;<br/>
|
|
178 |
* XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;.
|
|
177 |
* bare {@literal &}, replace it with {@literal &amp;}<br/>
|
|
178 |
* XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}.
|
|
179 | 179 |
* |
180 | 180 |
* @param broken |
181 | 181 |
* the string to handle entities |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPlugin.java | ||
---|---|---|
1 | 1 |
package eu.dnetlib.data.collector.plugins.oai; |
2 | 2 |
|
3 |
import java.util.Iterator; |
|
4 | 3 |
import java.util.List; |
5 | 4 |
|
6 |
import org.springframework.beans.factory.annotation.Required; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 | 5 |
import com.google.common.base.Splitter; |
10 |
import com.google.common.collect.Iterables; |
|
11 | 6 |
import com.google.common.collect.Iterators; |
12 | 7 |
import com.google.common.collect.Lists; |
13 |
|
|
14 | 8 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
15 | 9 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
16 | 10 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
11 |
import org.springframework.beans.factory.annotation.Required; |
|
17 | 12 |
|
18 | 13 |
public class OaiCollectorPlugin extends AbstractCollectorPlugin { |
19 | 14 |
|
... | ... | |
45 | 40 |
|
46 | 41 |
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + untilDate); } |
47 | 42 |
|
48 |
return new Iterable<String>() { |
|
49 |
|
|
50 |
@SuppressWarnings("unchecked") |
|
51 |
@Override |
|
52 |
public Iterator<String> iterator() { |
|
53 |
final Iterable<Iterator<String>> iter = Iterables.transform(sets, new Function<String, Iterator<String>>() { |
|
54 |
|
|
55 |
@Override |
|
56 |
public Iterator<String> apply(final String set) { |
|
57 |
return oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate); |
|
58 |
} |
|
59 |
}); |
|
60 |
return Iterators.concat(Iterables.toArray(iter, Iterator.class)); |
|
61 |
} |
|
62 |
}; |
|
43 |
return () -> Iterators.concat( |
|
44 |
sets.stream() |
|
45 |
.map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) |
|
46 |
.iterator()); |
|
63 | 47 |
} |
64 | 48 |
|
65 | 49 |
public OaiIteratorFactory getOaiIteratorFactory() { |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.IOException; |
4 | 4 |
import java.io.InputStream; |
5 |
import java.net.CookieHandler; |
|
6 |
import java.net.CookieManager; |
|
7 |
import java.net.CookiePolicy; |
|
8 |
import java.net.HttpURLConnection; |
|
9 |
import java.net.URL; |
|
5 |
import java.net.*; |
|
10 | 6 |
import java.security.GeneralSecurityException; |
11 |
import java.security.KeyManagementException; |
|
12 |
import java.security.NoSuchAlgorithmException; |
|
13 | 7 |
import java.security.cert.CertificateException; |
14 | 8 |
import java.security.cert.X509Certificate; |
15 | 9 |
import java.util.List; |
16 | 10 |
import java.util.Map; |
17 |
|
|
18 | 11 |
import javax.net.ssl.HttpsURLConnection; |
19 | 12 |
import javax.net.ssl.SSLContext; |
20 | 13 |
import javax.net.ssl.TrustManager; |
21 | 14 |
import javax.net.ssl.X509TrustManager; |
22 | 15 |
|
16 |
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList; |
|
17 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
23 | 18 |
import org.apache.commons.io.IOUtils; |
24 | 19 |
import org.apache.commons.lang3.math.NumberUtils; |
25 | 20 |
import org.apache.commons.logging.Log; |
26 | 21 |
import org.apache.commons.logging.LogFactory; |
27 | 22 |
|
28 |
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList; |
|
29 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
30 |
|
|
31 | 23 |
/** |
32 | 24 |
* @author jochen, michele, andrea |
33 | 25 |
* |
... | ... | |
46 | 38 |
} |
47 | 39 |
|
48 | 40 |
/** |
49 |
* @param requestUrl |
|
41 |
* Given the URL returns the content via HTTP GET |
|
42 |
* |
|
43 |
* @param requestUrl the URL |
|
50 | 44 |
* @return the content of the downloaded resource |
51 |
* @throws CollectorServiceException |
|
45 |
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
|
52 | 46 |
*/ |
53 | 47 |
public String getInputSource(final String requestUrl) throws CollectorServiceException { |
54 | 48 |
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); |
... | ... | |
67 | 61 |
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); |
68 | 62 |
urlConn.setInstanceFollowRedirects(false); |
69 | 63 |
urlConn.setReadTimeout(readTimeOut * 1000); |
70 |
urlConn.addRequestProperty("User-Agent", userAgent);
|
|
64 |
urlConn.addRequestProperty("User-Agent", userAgent);
|
|
71 | 65 |
|
72 | 66 |
if (log.isDebugEnabled()) { |
73 | 67 |
logHeaderFields(urlConn); |
... | ... | |
82 | 76 |
return attemptDownload(requestUrl, retryNumber + 1, errorList); |
83 | 77 |
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) { |
84 | 78 |
final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); |
85 |
log.info("The requested url has been moved to " + newUrl);
|
|
79 |
log.debug("The requested url has been moved to " + newUrl);
|
|
86 | 80 |
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); |
87 | 81 |
urlConn.disconnect(); |
88 | 82 |
return attemptDownload(newUrl, retryNumber + 1, errorList); |
... | ... | |
138 | 132 |
|
139 | 133 |
/** |
140 | 134 |
* Registers for the https scheme; this is a workaround and is not intended for use in trusted environments |
141 |
* |
|
142 |
* @throws NoSuchAlgorithmException |
|
143 |
* @throws KeyManagementException |
|
144 | 135 |
*/ |
145 | 136 |
public void initTrustManager() { |
146 | 137 |
final X509TrustManager tm = new X509TrustManager() { |
modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/CollectorPluginEnumerator.java | ||
---|---|---|
22 | 22 |
/** |
23 | 23 |
* Get all beans implementing the CollectorPlugin interface. |
24 | 24 |
* |
25 |
* @return |
|
25 |
* @return the set of eu.dnetlib.data.collector.plugin.CollectorPlugin(s)
|
|
26 | 26 |
*/ |
27 | 27 |
public Collection<CollectorPlugin> getAll() { |
28 | 28 |
return beanFactory.getBeansOfType(CollectorPlugin.class).values(); |
... | ... | |
40 | 40 |
/** |
41 | 41 |
* Gets the given CollectorPlugin or throws an exception. |
42 | 42 |
* |
43 |
* @param protocol |
|
44 |
* @return |
|
45 |
* @throws CollectorServiceException |
|
43 |
* @param protocol the given protocol
|
|
44 |
* @return a CollectorPlugin compatible with the given protocol
|
|
45 |
* @throws CollectorServiceException when no suitable plugin is found
|
|
46 | 46 |
*/ |
47 | 47 |
public CollectorPlugin get(final String protocol) throws CollectorServiceException { |
48 | 48 |
for (CollectorPlugin cp : getAll()) { |
Also available in: Unified diff
Javadoc fixes and some Java 8 refactoring (lambdas and streams)