Revision 48025
Added by Claudio Atzori about 7 years ago
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/sftp/SftpIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.sftp; |
|
2 |
|
|
3 |
import java.io.OutputStream; |
|
4 |
import java.net.URI; |
|
5 |
import java.net.URISyntaxException; |
|
6 |
import java.util.*; |
|
7 |
|
|
8 |
import com.jcraft.jsch.*; |
|
9 |
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException; |
|
10 |
import org.apache.commons.io.output.ByteArrayOutputStream; |
|
11 |
import org.apache.commons.lang.StringUtils; |
|
12 |
import org.apache.commons.logging.Log; |
|
13 |
import org.apache.commons.logging.LogFactory; |
|
14 |
import org.joda.time.DateTime; |
|
15 |
import org.joda.time.format.DateTimeFormat; |
|
16 |
import org.joda.time.format.DateTimeFormatter; |
|
17 |
|
|
18 |
/** |
|
19 |
* Created by andrea on 11/01/16. |
|
20 |
*/ |
|
21 |
public class SftpIterator implements Iterator<String> { |
|
22 |
private static final Log log = LogFactory.getLog(SftpIterator.class); |
|
23 |
|
|
24 |
private static final int MAX_RETRIES = 5; |
|
25 |
private static final int DEFAULT_TIMEOUT = 30000; |
|
26 |
private static final long BACKOFF_MILLIS = 10000; |
|
27 |
|
|
28 |
private String baseUrl; |
|
29 |
private String sftpURIScheme; |
|
30 |
private String sftpServerAddress; |
|
31 |
private String remoteSftpBasePath; |
|
32 |
private String username; |
|
33 |
private String password; |
|
34 |
private boolean isRecursive; |
|
35 |
private Set<String> extensionsSet; |
|
36 |
private boolean incremental; |
|
37 |
|
|
38 |
private Session sftpSession; |
|
39 |
private ChannelSftp sftpChannel; |
|
40 |
|
|
41 |
private Queue<String> queue; |
|
42 |
|
|
43 |
private DateTime fromDate = null; |
|
44 |
private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); |
|
45 |
|
|
46 |
/**
 * Creates the iterator, opens the SFTP connection and eagerly builds the queue of remote files to collect.
 *
 * @param baseUrl
 *            URL of the remote folder; its scheme, host and path are extracted and used for the connection
 * @param username
 *            user name for the SFTP login
 * @param password
 *            password for the SFTP login
 * @param isRecursive
 *            true to descend into sub-directories while listing
 * @param extensionsSet
 *            file name suffixes a remote file must end with to be collected
 * @param fromDate
 *            blank for a full collection; otherwise a date in the format 'yyyy-MM-dd' that switches on incremental mode
 * @throws CollectorServiceRuntimeException
 *             if baseUrl is not a valid URI
 */
public SftpIterator(String baseUrl, String username, String password, boolean isRecursive, Set<String> extensionsSet, String fromDate) {
    this.baseUrl = baseUrl;
    this.username = username;
    this.password = password;
    this.isRecursive = isRecursive;
    this.extensionsSet = extensionsSet;
    this.incremental = StringUtils.isNotBlank(fromDate);
    if (incremental) {
        // I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
        this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
        log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
    }
    try {
        URI sftpServer = new URI(baseUrl);
        this.sftpURIScheme = sftpServer.getScheme();
        this.sftpServerAddress = sftpServer.getHost();
        this.remoteSftpBasePath = sftpServer.getPath();
    } catch (URISyntaxException e) {
        // keep the parsing exception as cause: it carries the reason and the index of the syntax error
        throw new CollectorServiceRuntimeException("Bad syntax in the URL " + baseUrl, e);
    }

    connectToSftpServer();
    initializeQueue();
}
|
70 |
|
|
71 |
private void connectToSftpServer() { |
|
72 |
JSch jsch = new JSch(); |
|
73 |
|
|
74 |
try { |
|
75 |
JSch.setConfig("StrictHostKeyChecking", "no"); |
|
76 |
sftpSession = jsch.getSession(username, sftpServerAddress); |
|
77 |
sftpSession.setPassword(password); |
|
78 |
sftpSession.connect(); |
|
79 |
|
|
80 |
Channel channel = sftpSession.openChannel(sftpURIScheme); |
|
81 |
channel.connect(); |
|
82 |
sftpChannel = (ChannelSftp) channel; |
|
83 |
String pwd = sftpChannel.pwd(); |
|
84 |
log.debug("PWD from server: " + pwd); |
|
85 |
String fullPath = pwd + remoteSftpBasePath; |
|
86 |
sftpChannel.cd(fullPath); |
|
87 |
log.debug("PWD from server 2 after 'cd " + fullPath + "' : " + sftpChannel.pwd()); |
|
88 |
log.info("Connected to SFTP server " + sftpServerAddress); |
|
89 |
} catch (JSchException e) { |
|
90 |
throw new CollectorServiceRuntimeException("Unable to connect to remote SFTP server.", e); |
|
91 |
} catch (SftpException e) { |
|
92 |
throw new CollectorServiceRuntimeException("Unable to access the base remote path on the SFTP server.", e); |
|
93 |
} |
|
94 |
} |
|
95 |
|
|
96 |
private void disconnectFromSftpServer() { |
|
97 |
sftpChannel.exit(); |
|
98 |
sftpSession.disconnect(); |
|
99 |
} |
|
100 |
|
|
101 |
private void initializeQueue() { |
|
102 |
queue = new LinkedList<String>(); |
|
103 |
log.info(String.format("SFTP collector plugin collecting from %s with recursion = %s, incremental = %s with fromDate=%s", remoteSftpBasePath, |
|
104 |
isRecursive, |
|
105 |
incremental, fromDate)); |
|
106 |
listDirectoryRecursive(".", ""); |
|
107 |
} |
|
108 |
|
|
109 |
private void listDirectoryRecursive(final String parentDir, final String currentDir) { |
|
110 |
String dirToList = parentDir; |
|
111 |
if (StringUtils.isNotBlank(currentDir)) { |
|
112 |
dirToList += "/" + currentDir; |
|
113 |
} |
|
114 |
log.debug("PARENT DIR: " + parentDir); |
|
115 |
log.debug("DIR TO LIST: " + dirToList); |
|
116 |
try { |
|
117 |
Vector<ChannelSftp.LsEntry> ls = sftpChannel.ls(dirToList); |
|
118 |
for (ChannelSftp.LsEntry entry : ls) { |
|
119 |
String currentFileName = entry.getFilename(); |
|
120 |
if (currentFileName.equals(".") || currentFileName.equals("..")) { |
|
121 |
// skip parent directory and directory itself |
|
122 |
continue; |
|
123 |
} |
|
124 |
|
|
125 |
SftpATTRS attrs = entry.getAttrs(); |
|
126 |
if (attrs.isDir()) { |
|
127 |
if (isRecursive) { |
|
128 |
listDirectoryRecursive(dirToList, currentFileName); |
|
129 |
} |
|
130 |
} else { |
|
131 |
// test the file for extensions compliance and, just in case, add it to the list. |
|
132 |
for (String ext : extensionsSet) { |
|
133 |
if (currentFileName.endsWith(ext)) { |
|
134 |
//test if the file has been changed after the last collection date: |
|
135 |
if (incremental) { |
|
136 |
int mTime = attrs.getMTime(); |
|
137 |
//int times are values reduced by the milliseconds, hence we multiply per 1000L |
|
138 |
DateTime dt = new DateTime(mTime * 1000L); |
|
139 |
if (dt.isAfter(fromDate)) { |
|
140 |
queue.add(currentFileName); |
|
141 |
log.debug(currentFileName + " has changed and must be re-collected"); |
|
142 |
} else { |
|
143 |
if (log.isDebugEnabled()) { |
|
144 |
log.debug(currentFileName + " has not changed since last collection"); |
|
145 |
} |
|
146 |
} |
|
147 |
} else { |
|
148 |
//if it is not incremental, just add it to the queue |
|
149 |
queue.add(currentFileName); |
|
150 |
} |
|
151 |
|
|
152 |
} |
|
153 |
} |
|
154 |
} |
|
155 |
} |
|
156 |
} catch (SftpException e) { |
|
157 |
throw new CollectorServiceRuntimeException("Cannot list the sftp remote directory", e); |
|
158 |
|
|
159 |
} |
|
160 |
} |
|
161 |
|
|
162 |
@Override |
|
163 |
public boolean hasNext() { |
|
164 |
if (queue.isEmpty()) { |
|
165 |
disconnectFromSftpServer(); |
|
166 |
return false; |
|
167 |
} else { |
|
168 |
return true; |
|
169 |
} |
|
170 |
} |
|
171 |
|
|
172 |
@Override |
|
173 |
public String next() { |
|
174 |
String nextRemotePath = queue.remove(); |
|
175 |
int nRepeat = 0; |
|
176 |
String fullPathFile = nextRemotePath; |
|
177 |
while (nRepeat < MAX_RETRIES) { |
|
178 |
try { |
|
179 |
OutputStream baos = new ByteArrayOutputStream(); |
|
180 |
sftpChannel.get(nextRemotePath, baos); |
|
181 |
if (log.isDebugEnabled()) { |
|
182 |
fullPathFile = sftpChannel.pwd() + "/" + nextRemotePath; |
|
183 |
log.debug(String.format("Collected file from SFTP: %s%s", sftpServerAddress, fullPathFile)); |
|
184 |
} |
|
185 |
return baos.toString(); |
|
186 |
} catch (SftpException e) { |
|
187 |
nRepeat++; |
|
188 |
log.warn(String.format("An error occurred [%s] for %s%s, retrying.. [retried %s time(s)]", e.getMessage(), sftpServerAddress, fullPathFile, |
|
189 |
nRepeat)); |
|
190 |
// disconnectFromSftpServer(); |
|
191 |
try { |
|
192 |
Thread.sleep(BACKOFF_MILLIS); |
|
193 |
} catch (InterruptedException e1) { |
|
194 |
log.error(e1); |
|
195 |
} |
|
196 |
} |
|
197 |
} |
|
198 |
throw new CollectorServiceRuntimeException( |
|
199 |
String.format("Impossible to retrieve FTP file %s after %s retries. Aborting FTP collection.", fullPathFile, nRepeat)); |
|
200 |
} |
|
201 |
|
|
202 |
@Override |
|
203 |
public void remove() { |
|
204 |
throw new UnsupportedOperationException(); |
|
205 |
} |
|
206 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/ftp/FtpCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.ftp; |
|
2 |
|
|
3 |
import com.google.common.base.Splitter; |
|
4 |
import com.google.common.collect.Sets; |
|
5 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
6 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
7 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
8 |
import org.springframework.beans.factory.annotation.Required; |
|
9 |
|
|
10 |
import java.util.Iterator; |
|
11 |
import java.util.Set; |
|
12 |
|
|
13 |
/** |
|
14 |
* |
|
15 |
* @author Author: Andrea Mannocci |
|
16 |
* |
|
17 |
*/ |
|
18 |
public class FtpCollectorPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
private FtpIteratorFactory ftpIteratorFactory; |
|
21 |
|
|
22 |
@Override |
|
23 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
24 |
throws CollectorServiceException { |
|
25 |
|
|
26 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
27 |
final String username = interfaceDescriptor.getParams().get("username"); |
|
28 |
final String password = interfaceDescriptor.getParams().get("password"); |
|
29 |
final String recursive = interfaceDescriptor.getParams().get("recursive"); |
|
30 |
final String extensions = interfaceDescriptor.getParams().get("extensions"); |
|
31 |
|
|
32 |
if ((baseUrl == null) || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } |
|
33 |
if ((username == null) || username.isEmpty()) { throw new CollectorServiceException("Param 'username' is null or empty"); } |
|
34 |
if ((password == null) || password.isEmpty()) { throw new CollectorServiceException("Param 'password' is null or empty"); } |
|
35 |
if ((recursive == null) || recursive.isEmpty()) { throw new CollectorServiceException("Param 'recursive' is null or empty"); } |
|
36 |
if ((extensions == null) || extensions.isEmpty()) { throw new CollectorServiceException("Param 'extensions' is null or empty"); } |
|
37 |
|
|
38 |
return new Iterable<String>() { |
|
39 |
|
|
40 |
boolean isRecursive = "true".equals(recursive); |
|
41 |
|
|
42 |
Set<String> extensionsSet = parseSet(extensions); |
|
43 |
|
|
44 |
@Override |
|
45 |
public Iterator<String> iterator() { |
|
46 |
return getFtpIteratorFactory().newIterator(baseUrl, username, password, isRecursive, extensionsSet); |
|
47 |
} |
|
48 |
|
|
49 |
private Set<String> parseSet(final String extensions) { |
|
50 |
return Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(extensions)); |
|
51 |
} |
|
52 |
}; |
|
53 |
} |
|
54 |
|
|
55 |
public FtpIteratorFactory getFtpIteratorFactory() { |
|
56 |
return ftpIteratorFactory; |
|
57 |
} |
|
58 |
|
|
59 |
@Required |
|
60 |
public void setFtpIteratorFactory(final FtpIteratorFactory ftpIteratorFactory) { |
|
61 |
this.ftpIteratorFactory = ftpIteratorFactory; |
|
62 |
} |
|
63 |
|
|
64 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/filesfrommetadata/PopulateFileDownloadBasePath.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.filesfrommetadata; |
|
2 |
|
|
3 |
import java.util.List; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
import com.google.common.base.Function; |
|
7 |
import com.google.common.collect.Lists; |
|
8 |
import eu.dnetlib.data.collector.functions.ParamValuesFunction; |
|
9 |
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue; |
|
10 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; |
|
11 |
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; |
|
12 |
import eu.dnetlib.enabling.locators.UniqueServiceLocator; |
|
13 |
import org.apache.commons.logging.Log; |
|
14 |
import org.apache.commons.logging.LogFactory; |
|
15 |
import org.springframework.beans.factory.annotation.Autowired; |
|
16 |
import org.springframework.beans.factory.annotation.Value; |
|
17 |
|
|
18 |
/** |
|
19 |
* Created by alessia on 17/12/15. |
|
20 |
*/ |
|
21 |
public class PopulateFileDownloadBasePath implements ParamValuesFunction { |
|
22 |
|
|
23 |
private static final Log log = LogFactory.getLog(PopulateFileDownloadBasePath.class); |
|
24 |
@Autowired |
|
25 |
private UniqueServiceLocator serviceLocator; |
|
26 |
|
|
27 |
@Value("${services.objectstore.basePathList.xquery}") |
|
28 |
private String xQueryForObjectStoreBasePath; |
|
29 |
|
|
30 |
@Override |
|
31 |
public List<ProtocolParameterValue> findValues(final String s, final Map<String, String> map) { |
|
32 |
try { |
|
33 |
return Lists.transform(serviceLocator.getService(ISLookUpService.class).quickSearchProfile(xQueryForObjectStoreBasePath), |
|
34 |
new Function<String, ProtocolParameterValue>() { |
|
35 |
@Override |
|
36 |
public ProtocolParameterValue apply(final String s) { |
|
37 |
return new ProtocolParameterValue(s, s); |
|
38 |
} |
|
39 |
}); |
|
40 |
} catch (ISLookUpException e) { |
|
41 |
log.error("Cannot read Object store service properties", e); |
|
42 |
} |
|
43 |
return Lists.newArrayList(); |
|
44 |
} |
|
45 |
|
|
46 |
public UniqueServiceLocator getServiceLocator() { |
|
47 |
return serviceLocator; |
|
48 |
} |
|
49 |
|
|
50 |
public void setServiceLocator(final UniqueServiceLocator serviceLocator) { |
|
51 |
this.serviceLocator = serviceLocator; |
|
52 |
} |
|
53 |
|
|
54 |
public String getxQueryForObjectStoreBasePath() { |
|
55 |
return xQueryForObjectStoreBasePath; |
|
56 |
} |
|
57 |
|
|
58 |
public void setxQueryForObjectStoreBasePath(final String xQueryForObjectStoreBasePath) { |
|
59 |
this.xQueryForObjectStoreBasePath = xQueryForObjectStoreBasePath; |
|
60 |
} |
|
61 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/oaisets/OaiSetsCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oaisets; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
import org.springframework.beans.factory.annotation.Required; |
|
6 |
|
|
7 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
10 |
|
|
11 |
public class OaiSetsCollectorPlugin extends AbstractCollectorPlugin { |
|
12 |
|
|
13 |
private OaiSetsIteratorFactory oaiSetsIteratorFactory; |
|
14 |
|
|
15 |
@Override |
|
16 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
17 |
throws CollectorServiceException { |
|
18 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
19 |
|
|
20 |
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } |
|
21 |
|
|
22 |
return new Iterable<String>() { |
|
23 |
|
|
24 |
@Override |
|
25 |
public Iterator<String> iterator() { |
|
26 |
return oaiSetsIteratorFactory.newIterator(baseUrl); |
|
27 |
} |
|
28 |
}; |
|
29 |
} |
|
30 |
|
|
31 |
public OaiSetsIteratorFactory getOaiSetsIteratorFactory() { |
|
32 |
return oaiSetsIteratorFactory; |
|
33 |
} |
|
34 |
|
|
35 |
@Required |
|
36 |
public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) { |
|
37 |
this.oaiSetsIteratorFactory = oaiSetsIteratorFactory; |
|
38 |
} |
|
39 |
|
|
40 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FileSystemIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.filesystem; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.nio.file.Files; |
|
5 |
import java.nio.file.Path; |
|
6 |
import java.nio.file.Paths; |
|
7 |
import java.util.Iterator; |
|
8 |
import java.util.Set; |
|
9 |
|
|
10 |
import org.apache.commons.io.FilenameUtils; |
|
11 |
import org.apache.commons.lang.StringUtils; |
|
12 |
import org.apache.commons.logging.Log; |
|
13 |
import org.apache.commons.logging.LogFactory; |
|
14 |
|
|
15 |
import com.google.common.collect.Iterators; |
|
16 |
import com.google.common.collect.Sets; |
|
17 |
|
|
18 |
/** |
|
19 |
* Class enabling lazy & recursive iteration of a filesystem tree. The iterator iterates over file paths. |
|
20 |
* |
|
21 |
* @author Andrea |
|
22 |
* |
|
23 |
*/ |
|
24 |
public class FileSystemIterator implements Iterator<String> { |
|
25 |
|
|
26 |
/** The logger */ |
|
27 |
private static final Log log = LogFactory.getLog(FileSystemIterator.class); |
|
28 |
|
|
29 |
private Set<String> extensions = Sets.newHashSet(); |
|
30 |
private Iterator<Path> pathIterator; |
|
31 |
private String current; |
|
32 |
|
|
33 |
public FileSystemIterator(final String baseDir, final String extensions) { |
|
34 |
if(StringUtils.isNotBlank(extensions)) { |
|
35 |
this.extensions = Sets.newHashSet(extensions.split(",")); |
|
36 |
} |
|
37 |
try { |
|
38 |
this.pathIterator = Files.newDirectoryStream(Paths.get(baseDir)).iterator(); |
|
39 |
this.current = walkTillNext(); |
|
40 |
} catch (IOException e) { |
|
41 |
log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir); |
|
42 |
throw new RuntimeException("Filesystem collection error.", e); |
|
43 |
} |
|
44 |
} |
|
45 |
|
|
46 |
@Override |
|
47 |
public boolean hasNext() { |
|
48 |
return current != null; |
|
49 |
} |
|
50 |
|
|
51 |
@Override |
|
52 |
public synchronized String next() { |
|
53 |
String pivot = new String(current); |
|
54 |
current = walkTillNext(); |
|
55 |
log.debug("Returning: " + pivot); |
|
56 |
return pivot; |
|
57 |
} |
|
58 |
|
|
59 |
@Override |
|
60 |
public void remove() {} |
|
61 |
|
|
62 |
/** |
|
63 |
* Walk the filesystem recursively until it finds a candidate. Strategies: a) For any directory found during the walk, an iterator is |
|
64 |
* built and concat to the main one; b) Any file is checked against admitted extensions |
|
65 |
* |
|
66 |
* @return the next element to be returned by next call of this.next() |
|
67 |
*/ |
|
68 |
private synchronized String walkTillNext() { |
|
69 |
while (pathIterator.hasNext()) { |
|
70 |
Path nextFilePath = pathIterator.next(); |
|
71 |
if (Files.isDirectory(nextFilePath)) { |
|
72 |
// concat |
|
73 |
try { |
|
74 |
pathIterator = Iterators.concat(pathIterator, Files.newDirectoryStream(nextFilePath).iterator()); |
|
75 |
log.debug("Adding folder iterator: " + nextFilePath.toString()); |
|
76 |
} catch (IOException e) { |
|
77 |
log.error("Cannot create folder iterator! Is this path correct? " + nextFilePath.toString()); |
|
78 |
return null; |
|
79 |
} |
|
80 |
} else { |
|
81 |
if (extensions.isEmpty() || extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) { |
|
82 |
log.debug("Returning: " + nextFilePath.toString()); |
|
83 |
return nextFilePath.toString(); |
|
84 |
} |
|
85 |
} |
|
86 |
} |
|
87 |
return null; |
|
88 |
} |
|
89 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/pom.xml | ||
---|---|---|
1 |
<?xml version="1.0" encoding="UTF-8"?> |
|
2 |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
|
3 |
<parent> |
|
4 |
<groupId>eu.dnetlib</groupId> |
|
5 |
<artifactId>dnet45-parent</artifactId> |
|
6 |
<version>1.0.0</version> |
|
7 |
<relativePath /> |
|
8 |
</parent> |
|
9 |
<modelVersion>4.0.0</modelVersion> |
|
10 |
<groupId>eu.dnetlib</groupId> |
|
11 |
<artifactId>dnet-modular-collector-service</artifactId> |
|
12 |
<packaging>jar</packaging> |
|
13 |
<version>3.3.11</version> |
|
14 |
<scm> |
|
15 |
<developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11</developerConnection> |
|
16 |
</scm> |
|
17 |
<dependencies> |
|
18 |
<dependency> |
|
19 |
<groupId>eu.dnetlib</groupId> |
|
20 |
<artifactId>dnet-modular-collector-service-rmi</artifactId> |
|
21 |
<version>[1.3.0,2.0.0)</version> |
|
22 |
</dependency> |
|
23 |
<dependency> |
|
24 |
<groupId>eu.dnetlib</groupId> |
|
25 |
<artifactId>cnr-resultset-service</artifactId> |
|
26 |
<version>[2.0.0,3.0.0)</version> |
|
27 |
</dependency> |
|
28 |
<dependency> |
|
29 |
<groupId>eu.dnetlib</groupId> |
|
30 |
<artifactId>cnr-blackboard-common</artifactId> |
|
31 |
<version>[2.0.0,3.0.0)</version> |
|
32 |
</dependency> |
|
33 |
<dependency> |
|
34 |
<groupId>javax.servlet</groupId> |
|
35 |
<artifactId>javax.servlet-api</artifactId> |
|
36 |
<version>${javax.servlet.version}</version> |
|
37 |
<scope>provided</scope> |
|
38 |
</dependency> |
|
39 |
<dependency> |
|
40 |
<groupId>net.sf.opencsv</groupId> |
|
41 |
<artifactId>opencsv</artifactId> |
|
42 |
<version>2.0</version> |
|
43 |
</dependency> |
|
44 |
<dependency> |
|
45 |
<groupId>junit</groupId> |
|
46 |
<artifactId>junit</artifactId> |
|
47 |
<version>${junit.version}</version> |
|
48 |
<scope>test</scope> |
|
49 |
</dependency> |
|
50 |
<dependency> |
|
51 |
<groupId>commons-net</groupId> |
|
52 |
<artifactId>commons-net</artifactId> |
|
53 |
<version>3.3</version> |
|
54 |
</dependency> |
|
55 |
<dependency> |
|
56 |
<groupId>org.apache.commons</groupId> |
|
57 |
<artifactId>commons-compress</artifactId> |
|
58 |
<version>1.6</version> |
|
59 |
</dependency> |
|
60 |
<dependency> |
|
61 |
<groupId>org.mockito</groupId> |
|
62 |
<artifactId>mockito-core</artifactId> |
|
63 |
<version>${mockito.version}</version> |
|
64 |
<scope>test</scope> |
|
65 |
</dependency> |
|
66 |
<dependency> |
|
67 |
<groupId>commons-httpclient</groupId> |
|
68 |
<artifactId>commons-httpclient</artifactId> |
|
69 |
<version>3.1</version> |
|
70 |
</dependency> |
|
71 |
<dependency> |
|
72 |
<groupId>com.google.code.gson</groupId> |
|
73 |
<artifactId>gson</artifactId> |
|
74 |
<version>${google.gson.version}</version> |
|
75 |
</dependency> |
|
76 |
<dependency> |
|
77 |
<groupId>org.apache.commons</groupId> |
|
78 |
<artifactId>commons-csv</artifactId> |
|
79 |
<version>1.4</version> |
|
80 |
</dependency> |
|
81 |
<dependency> |
|
82 |
<groupId>com.jcraft</groupId> |
|
83 |
<artifactId>jsch</artifactId> |
|
84 |
<version>0.1.53</version> |
|
85 |
</dependency> |
|
86 |
<dependency> |
|
87 |
<groupId>joda-time</groupId> |
|
88 |
<artifactId>joda-time</artifactId> |
|
89 |
<version>2.9.2</version> |
|
90 |
</dependency> |
|
91 |
<dependency> |
|
92 |
<groupId>org.json</groupId> |
|
93 |
<artifactId>json</artifactId> |
|
94 |
<version>20160810</version> |
|
95 |
</dependency> |
|
96 |
|
|
97 |
<dependency> |
|
98 |
<groupId>com.ximpleware</groupId> |
|
99 |
<artifactId>vtd-xml</artifactId> |
|
100 |
<version>2.13.2</version> |
|
101 |
</dependency> |
|
102 |
|
|
103 |
|
|
104 |
</dependencies> |
|
105 |
</project> |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/HttpCSVCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins; |
|
2 |
|
|
3 |
import java.io.*; |
|
4 |
import java.net.URL; |
|
5 |
import java.util.Iterator; |
|
6 |
import java.util.Set; |
|
7 |
|
|
8 |
import com.google.common.collect.Iterators; |
|
9 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
10 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
11 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
12 |
import org.apache.commons.csv.CSVFormat; |
|
13 |
import org.apache.commons.csv.CSVParser; |
|
14 |
import org.apache.commons.io.input.BOMInputStream; |
|
15 |
import org.apache.commons.lang.StringUtils; |
|
16 |
import org.apache.commons.logging.Log; |
|
17 |
import org.apache.commons.logging.LogFactory; |
|
18 |
import org.dom4j.Document; |
|
19 |
import org.dom4j.DocumentHelper; |
|
20 |
import org.dom4j.Element; |
|
21 |
|
|
22 |
/** |
|
23 |
* The Class HttpCSVCollectorPlugin. |
|
24 |
*/ |
|
25 |
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin { |
|
26 |
|
|
27 |
private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class); |
|
28 |
|
|
29 |
public static final String UTF8_BOM = "\uFEFF"; |
|
30 |
|
|
31 |
/** |
|
32 |
* The Class HTTPCSVIterator. |
|
33 |
*/ |
|
34 |
class HTTPCSVIterator implements Iterable<String> { |
|
35 |
|
|
36 |
/** The descriptor. */ |
|
37 |
private InterfaceDescriptor descriptor; |
|
38 |
|
|
39 |
/** |
|
40 |
* Instantiates a new HTTPCSV iterator. |
|
41 |
* |
|
42 |
* @param descriptor |
|
43 |
* the descriptor |
|
44 |
*/ |
|
45 |
public HTTPCSVIterator(final InterfaceDescriptor descriptor) { |
|
46 |
this.descriptor = descriptor; |
|
47 |
} |
|
48 |
|
|
49 |
/** |
|
50 |
* Iterator. |
|
51 |
* |
|
52 |
* @return the iterator |
|
53 |
*/ |
|
54 |
@SuppressWarnings("resource") |
|
55 |
@Override |
|
56 |
public Iterator<String> iterator() { |
|
57 |
|
|
58 |
try { |
|
59 |
final String separator = descriptor.getParams().get("separator"); |
|
60 |
final String identifier = descriptor.getParams().get("identifier"); |
|
61 |
final String quote = descriptor.getParams().get("quote"); |
|
62 |
final URL url = new URL(descriptor.getBaseUrl()); |
|
63 |
long nLines = 0; |
|
64 |
|
|
65 |
// FIX |
|
66 |
// This code should skip the lines with invalid quotes |
|
67 |
final File tempFile = File.createTempFile("csv-", ".tmp"); |
|
68 |
try (InputStream is = url.openConnection().getInputStream(); |
|
69 |
BOMInputStream bomIs = new BOMInputStream(is); |
|
70 |
BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs)); |
|
71 |
FileWriter fw = new FileWriter(tempFile)) { |
|
72 |
|
|
73 |
String line; |
|
74 |
while ((line = reader.readLine()) != null) { |
|
75 |
if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) { |
|
76 |
fw.write(line); |
|
77 |
fw.write("\n"); |
|
78 |
nLines++; |
|
79 |
} |
|
80 |
} |
|
81 |
} |
|
82 |
// END FIX |
|
83 |
|
|
84 |
final CSVFormat format = CSVFormat.EXCEL |
|
85 |
.withHeader() |
|
86 |
.withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0)) |
|
87 |
.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0)) |
|
88 |
.withTrim(); |
|
89 |
|
|
90 |
final CSVParser parser = new CSVParser(new FileReader(tempFile), format); |
|
91 |
final Set<String> headers = parser.getHeaderMap().keySet(); |
|
92 |
|
|
93 |
final long nRecords = nLines - 1; |
|
94 |
|
|
95 |
return Iterators.transform(parser.iterator(), input -> { |
|
96 |
try { |
|
97 |
final Document document = DocumentHelper.createDocument(); |
|
98 |
final Element root = document.addElement("csvRecord"); |
|
99 |
for (final String key : headers) { |
|
100 |
final Element row = root.addElement("column"); |
|
101 |
row.addAttribute("name", key).addText(input.get(key)); |
|
102 |
if (key.equals(identifier)) { |
|
103 |
row.addAttribute("isID", "true"); |
|
104 |
} |
|
105 |
} |
|
106 |
|
|
107 |
return document.asXML(); |
|
108 |
} finally { |
|
109 |
System.out.println(tempFile.getAbsolutePath()); |
|
110 |
if (parser.getRecordNumber() == nRecords) { |
|
111 |
System.out.println("DELETING " + tempFile.getAbsolutePath()); |
|
112 |
tempFile.delete(); |
|
113 |
} |
|
114 |
} |
|
115 |
}); |
|
116 |
} catch (final Exception e) { |
|
117 |
log.error("Error iterating csv lines", e); |
|
118 |
return null; |
|
119 |
} |
|
120 |
} |
|
121 |
|
|
122 |
} |
|
123 |
|
|
124 |
/* |
|
125 |
* (non-Javadoc) |
|
126 |
* |
|
127 |
* @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String, |
|
128 |
* java.lang.String) |
|
129 |
*/ |
|
130 |
@Override |
|
131 |
public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException { |
|
132 |
|
|
133 |
return new HTTPCSVIterator(descriptor); |
|
134 |
} |
|
135 |
|
|
136 |
public boolean verifyQuotes(final String line, final char separator) { |
|
137 |
final char[] cs = line.trim().toCharArray(); |
|
138 |
boolean inField = false; |
|
139 |
boolean skipNext = false; |
|
140 |
for (int i = 0; i < cs.length; i++) { |
|
141 |
if (skipNext) { |
|
142 |
skipNext = false; |
|
143 |
} else if (inField) { |
|
144 |
if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) { |
|
145 |
inField = false; |
|
146 |
} else if ((cs[i] == '\"') && (i < (cs.length - 1))) { |
|
147 |
if ((cs[i + 1] == '\"')) { |
|
148 |
skipNext = true; |
|
149 |
} else { |
|
150 |
log.warn("Skipped invalid line: " + line); |
|
151 |
return false; |
|
152 |
} |
|
153 |
} |
|
154 |
} else { |
|
155 |
if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) { |
|
156 |
inField = true; |
|
157 |
} |
|
158 |
} |
|
159 |
} |
|
160 |
|
|
161 |
if (inField) { |
|
162 |
log.warn("Skipped invalid line: " + line); |
|
163 |
return false; |
|
164 |
} |
|
165 |
|
|
166 |
return true; |
|
167 |
} |
|
168 |
|
|
169 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/test/java/eu/dnetlib/data/collector/plugins/csv/HTTPCSVCollectorPluginTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.csv; |
|
2 |
|
|
3 |
import java.net.URISyntaxException; |
|
4 |
import java.util.HashMap; |
|
5 |
|
|
6 |
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin; |
|
7 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
8 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
9 |
import org.junit.Test; |
|
10 |
|
|
11 |
import static org.junit.Assert.assertFalse; |
|
12 |
import static org.junit.Assert.assertTrue; |
|
13 |
|
|
14 |
public class HTTPCSVCollectorPluginTest { |
|
15 |
|
|
16 |
private String FILE_URL = HTTPCSVCollectorPluginTest.class.getResource("testCSVwithBOM.csv").toString(); |
|
17 |
final HttpCSVCollectorPlugin plugin = new HttpCSVCollectorPlugin(); |
|
18 |
|
|
19 |
@Test |
|
20 |
public void testCSVHeader() throws URISyntaxException, CollectorServiceException { |
|
21 |
|
|
22 |
final InterfaceDescriptor descr = new InterfaceDescriptor(); |
|
23 |
final HashMap<String, String> params = new HashMap<String, String>(); |
|
24 |
|
|
25 |
params.put("separator", ","); |
|
26 |
params.put("quote", "\""); |
|
27 |
params.put("identifier", "ID"); |
|
28 |
descr.setBaseUrl(FILE_URL); |
|
29 |
descr.setParams(params); |
|
30 |
|
|
31 |
int i = 0; |
|
32 |
for (final String s : plugin.collect(descr, null, null)) { |
|
33 |
assertTrue(s.length() > 0); |
|
34 |
System.out.println(s); |
|
35 |
i++; |
|
36 |
} |
|
37 |
System.out.println(i); |
|
38 |
assertTrue(i > 0); |
|
39 |
} |
|
40 |
|
|
41 |
@Test |
|
42 |
public void testVerifyQuotesOk(){ |
|
43 |
String correct = "\"5\",\"Il Padrino\",\"EEEEEEEE \"\"ZZZZZ\"\" EEEEEEEEEE\",1970"; |
|
44 |
assertTrue(plugin.verifyQuotes(correct, ',')); |
|
45 |
} |
|
46 |
|
|
47 |
@Test |
|
48 |
public void testVerifyQuotesWRONG(){ |
|
49 |
String correct = "5\",\"Il Padrino\",\"EEEEEEEE \"ZZZZZ\" EEEEEEEEEE\",1970"; |
|
50 |
assertFalse(plugin.verifyQuotes(correct, ',')); |
|
51 |
} |
|
52 |
|
|
53 |
@Test |
|
54 |
public void testSNSF(){ |
|
55 |
String s = "\"8773\";\"3101-008773\";\"EMBO workshop on structure, function and regulation of membrane transport proteins\";\"\";\"Rossier Bernard C.\";\"Scientific Conferences\";\"Science communication\";\"Département de Pharmacologie & Toxicologie Faculté de Biologie et de Médecine Université de Lausanne\";\"Université de Lausanne - LA\";\"30103\";\"Cellular Biology, Cytology\";\"Biology and Medicine;Basic Biological Research\";\"01.04.1987\";\"30.09.1987\";\"10000.00\";\"\";\"30103\"" ; |
|
56 |
assertTrue(plugin.verifyQuotes(s, ';')); |
|
57 |
} |
|
58 |
|
|
59 |
@Test |
|
60 |
public void testSNSF2(){ |
|
61 |
String s = "\"11000\";\"4021-011000\";\"Literarische und nationale Erziehung : Schweizerisches Selbstverständnis in der Literatur für Kinder und Jugend- liche\";\"\";\"Tschirky Rosmarie\";\"NRP 21 Cultural Diversity and National Identity\";\"Programmes;National Research Programmes (NRPs)\";\"Schweiz. Inst. für Kinder- und Jugendmedien\";\"Universität Zürich - ZH\";\"10501\";\"German and English languages and literature\";\"Human and Social Sciences;Linguistics and literature, philosophy\";\"10501\";\"01.10.1986\";\"31.03.1990\";\"308807.00\";\"\""; |
|
62 |
assertTrue(plugin.verifyQuotes(s, ';')); |
|
63 |
} |
|
64 |
|
|
65 |
@Test |
|
66 |
public void testSNSFInvalid(){ |
|
67 |
String s = "\"35918\";\"1113-035918\";\"Entwicklung eines dreisprachigen Thesaurus des schweizerischen Rechts zur Unterstützung der Suche in Volltextdatenbanken.\";\"\";\"Verein \"Schweizerische Juristische Datenbank\"\";\"Project funding (Div. I-III)\";\"Project funding\";\"Verein \"\"Schweizerische Juristische Datenbank\"\"\";\"NPO (Biblioth., Museen, Verwalt.) - NPO\";\"10205\";\"Legal sciences\";\"Human and Social Sciences;Economics, law\";\"10205\";\"01.12.1992\";\"31.03.1995\";\"500366.00\";\"\""; |
|
68 |
assertFalse(plugin.verifyQuotes(s, ';')); |
|
69 |
} |
|
70 |
|
|
71 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/test/java/eu/dnetlib/data/collector/plugins/csv/CSVCollectorPluginTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.csv; |
|
2 |
|
|
3 |
import java.net.URISyntaxException; |
|
4 |
import java.net.URL; |
|
5 |
import java.util.HashMap; |
|
6 |
|
|
7 |
import org.junit.Assert; |
|
8 |
import org.junit.Test; |
|
9 |
|
|
10 |
import eu.dnetlib.data.collector.plugins.FileCSVCollectorPlugin; |
|
11 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
12 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
13 |
|
|
14 |
public class CSVCollectorPluginTest { |
|
15 |
|
|
16 |
@Test |
|
17 |
public void testCSVHeader() throws URISyntaxException, CollectorServiceException { |
|
18 |
URL resource = CSVCollectorPluginTest.class.getResource("/eu/dnetlib/data/collector/filesystem/csv/input.tsv"); |
|
19 |
InterfaceDescriptor descr = new InterfaceDescriptor(); |
|
20 |
HashMap<String, String> params = new HashMap<String, String>(); |
|
21 |
params.put("header", "TrUe"); |
|
22 |
params.put("separator", "\t"); |
|
23 |
params.put("identifier", "56"); |
|
24 |
descr.setBaseUrl(resource.toString()); |
|
25 |
descr.setParams(params); |
|
26 |
FileCSVCollectorPlugin plugin = new FileCSVCollectorPlugin(); |
|
27 |
int i = 0; |
|
28 |
for (String s : plugin.collect(descr, null, null)) { |
|
29 |
Assert.assertTrue(s.length() > 0); |
|
30 |
i++; |
|
31 |
System.out.println(s); |
|
32 |
break; |
|
33 |
} |
|
34 |
Assert.assertTrue(i > 0); |
|
35 |
|
|
36 |
} |
|
37 |
|
|
38 |
|
|
39 |
@Test |
|
40 |
public void testTSVQuote() throws URISyntaxException, CollectorServiceException { |
|
41 |
URL resource = CSVCollectorPluginTest.class.getResource("/eu/dnetlib/data/collector/filesystem/csv/input-quoted.tsv"); |
|
42 |
InterfaceDescriptor descr = new InterfaceDescriptor(); |
|
43 |
HashMap<String, String> params = new HashMap<String, String>(); |
|
44 |
params.put("header", "true"); |
|
45 |
params.put("separator", ";"); |
|
46 |
params.put("identifier", "0"); |
|
47 |
params.put("quote", "\\\""); |
|
48 |
descr.setBaseUrl(resource.toString()); |
|
49 |
descr.setParams(params); |
|
50 |
FileCSVCollectorPlugin plugin = new FileCSVCollectorPlugin(); |
|
51 |
int i = 0; |
|
52 |
for (String s : plugin.collect(descr, null, null)) { |
|
53 |
Assert.assertTrue(s.length() > 0); |
|
54 |
i++; |
|
55 |
System.out.println(s); |
|
56 |
break; |
|
57 |
} |
|
58 |
Assert.assertTrue(i > 0); |
|
59 |
|
|
60 |
} |
|
61 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/test/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPluginTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import static org.junit.Assert.assertEquals; |
|
4 |
import static org.junit.Assert.assertNotNull; |
|
5 |
import static org.mockito.Mockito.verify; |
|
6 |
import static org.mockito.Mockito.when; |
|
7 |
|
|
8 |
import java.util.ArrayList; |
|
9 |
import java.util.HashMap; |
|
10 |
import java.util.Iterator; |
|
11 |
import java.util.List; |
|
12 |
|
|
13 |
import org.junit.Before; |
|
14 |
import org.junit.Test; |
|
15 |
import org.junit.runner.RunWith; |
|
16 |
import org.mockito.Mock; |
|
17 |
import org.mockito.internal.verification.Times; |
|
18 |
import org.mockito.junit.MockitoJUnitRunner; |
|
19 |
|
|
20 |
import com.google.common.base.Joiner; |
|
21 |
import com.google.common.collect.Lists; |
|
22 |
|
|
23 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
24 |
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor; |
|
25 |
import eu.dnetlib.data.collector.rmi.ProtocolParameter; |
|
26 |
|
|
27 |
@RunWith(MockitoJUnitRunner.class) |
|
28 |
public class OaiCollectorPluginTest { |
|
29 |
|
|
30 |
private OaiCollectorPlugin oai; |
|
31 |
|
|
32 |
@Mock |
|
33 |
private OaiIteratorFactory oaiIteratorFactory; |
|
34 |
|
|
35 |
private List<String> elements = Lists.newArrayList("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"); |
|
36 |
|
|
37 |
private Iterator<String> oaiIterator1 = elements.subList(0, 3).iterator(); |
|
38 |
private Iterator<String> oaiIterator2 = elements.subList(3, 7).iterator(); |
|
39 |
private Iterator<String> oaiIterator3 = elements.subList(7, elements.size()).iterator(); |
|
40 |
|
|
41 |
private static final String BASE_URL = "http://oai.test.it/oai"; |
|
42 |
private static final String FORMAT = "oai_dc"; |
|
43 |
private static final String PROTOCOL = "OAI"; |
|
44 |
private static final String SET_1 = "set01"; |
|
45 |
private static final String SET_2 = "set02"; |
|
46 |
private static final String SET_3 = "set03"; |
|
47 |
|
|
48 |
@Before |
|
49 |
public void setUp() { |
|
50 |
oai = new OaiCollectorPlugin(); |
|
51 |
oai.setOaiIteratorFactory(oaiIteratorFactory); |
|
52 |
oai.setProtocolDescriptor(new ProtocolDescriptor(PROTOCOL, new ArrayList<ProtocolParameter>())); |
|
53 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_1, null, null)).thenReturn(oaiIterator1); |
|
54 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_2, null, null)).thenReturn(oaiIterator2); |
|
55 |
when(oaiIteratorFactory.newIterator(BASE_URL, FORMAT, SET_3, null, null)).thenReturn(oaiIterator3); |
|
56 |
} |
|
57 |
|
|
58 |
public void test() { |
|
59 |
oai = new OaiCollectorPlugin(); |
|
60 |
} |
|
61 |
|
|
62 |
@Test |
|
63 |
public void testGetProtocol() { |
|
64 |
assertEquals(PROTOCOL, oai.getProtocol()); |
|
65 |
} |
|
66 |
|
|
67 |
@Test |
|
68 |
public void testCollect() throws Exception { |
|
69 |
final InterfaceDescriptor iface = new InterfaceDescriptor(); |
|
70 |
iface.setId("123"); |
|
71 |
iface.setProtocol(PROTOCOL); |
|
72 |
iface.setBaseUrl(BASE_URL); |
|
73 |
iface.setParams(new HashMap<String, String>()); |
|
74 |
iface.getParams().put("format", FORMAT); |
|
75 |
iface.getParams().put("set", Joiner.on(", ").join(SET_1, SET_2, SET_3)); |
|
76 |
|
|
77 |
final Iterable<String> records = oai.collect(iface, null, null); |
|
78 |
|
|
79 |
assertNotNull(records); |
|
80 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_1, null, null); |
|
81 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_2, null, null); |
|
82 |
verify(oaiIteratorFactory, new Times(0)).newIterator(BASE_URL, FORMAT, SET_3, null, null); |
|
83 |
|
|
84 |
int count = 0; |
|
85 |
for (String s : records) { |
|
86 |
System.out.println("RECORD: " + s); |
|
87 |
assertEquals("" + count, s); |
|
88 |
count++; |
|
89 |
} |
|
90 |
assertEquals(elements.size(), count); |
|
91 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_1, null, null); |
|
92 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_2, null, null); |
|
93 |
verify(oaiIteratorFactory).newIterator(BASE_URL, FORMAT, SET_3, null, null); |
|
94 |
} |
|
95 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/oai/engine/XmlCleaner.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai.engine; |
|
2 |
|
|
3 |
import java.util.HashMap; |
|
4 |
import java.util.HashSet; |
|
5 |
import java.util.Map; |
|
6 |
import java.util.Set; |
|
7 |
import java.util.regex.Pattern; |
|
8 |
|
|
9 |
/**
 * Repairs character data so that it can be embedded in an XML document: characters that XML does not allow
 * are removed, HTML-only named entities are replaced by the character they denote, unknown named entities
 * are dropped, and every bare {@code &} is escaped.
 *
 * @author jochen
 */
public class XmlCleaner {

    /**
     * Numeric character references that are passed through untouched: {@code &#}, an optional {@code x},
     * then two to four decimal digits and a semicolon.
     */
    private static final Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$

    /** Reference to the vertical-tab control character, which is removed outright. */
    private static final Pattern invalidControlCharPattern = Pattern.compile("&#11;"); //$NON-NLS-1$

    /**
     * Pattern that negates the allowable XML characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] |
     * [#xE000-#xFFFD] | [#x10000-#x10FFFF]. The last range is written as a surrogate-pair range so that
     * supplementary characters are kept while unpaired surrogates are still rejected.
     */
    private static final Pattern invalidCharacterPattern =
            Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\uD800\uDC00-\uDBFF\uDFFF]"); //$NON-NLS-1$

    /** Names of the HTML Latin-1 entities, ordered by code point starting at U+00A0 (nbsp) up to U+00FF (yuml). */
    private static final String[] LATIN1_ENTITY_NAMES = {
            "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
            "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
            "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
            "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
            "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
            "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
            "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
            "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
            "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
            "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
            "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
            "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Entities that XML predefines; they are kept exactly as written. */
    private static final Set<String> goodEntities = new HashSet<String>();

    /** Entities that XML does not know, mapped to the text that replaces them. */
    private static final Map<String, String> badEntities = new HashMap<String, String>();

    static {
        // pre-defined XML entities
        goodEntities.add("&quot;"); //$NON-NLS-1$ // quotation mark
        goodEntities.add("&amp;"); //$NON-NLS-1$ // ampersand
        goodEntities.add("&lt;"); //$NON-NLS-1$ // less-than sign
        goodEntities.add("&gt;"); //$NON-NLS-1$ // greater-than sign
        goodEntities.add("&apos;"); //$NON-NLS-1$ // apostrophe

        // control entities (DEL and the C1 range), replaced by a blank.
        // NOTE(review): cleanAllEntities never looks these up, because '#' ends the entity-name scan and
        // three-digit references are passed through by validCharacterEntityPattern - confirm the intent.
        for (int code = 127; code <= 159; code++) {
            badEntities.put("&#" + code + ";", " "); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
        }

        // misc entities
        badEntities.put("&euro;", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
        badEntities.put("&lsquo;", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
        badEntities.put("&rsquo;", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark

        // Latin 1 entities: the k-th name denotes the character with code point 0xA0 + k
        for (int k = 0; k < LATIN1_ENTITY_NAMES.length; k++) {
            badEntities.put("&" + LATIN1_ENTITY_NAMES[k] + ";", String.valueOf((char) (0xA0 + k))); //$NON-NLS-1$ //$NON-NLS-2$
        }
    }

    /**
     * For each entity in the input that is not allowed in XML, replace the entity with its unicode
     * equivalent or remove it. Each bare {@code &} (one that does not start a terminated entity) is
     * replaced by {@code &amp;}. Characters that are not legal in XML are removed first, and finally the
     * sequences {@code <<} and {@code >>} are escaped.
     * <p>
     * The input is processed in a single left-to-right pass, so every ampersand is examined exactly once,
     * including one that directly follows a removed entity or another bare ampersand.
     *
     * @param broken
     *            the string to handle entities
     * @return the string with entities appropriately fixed up, or {@code null} if the input was {@code null}
     */
    static public String cleanAllEntities(final String broken) {
        if (broken == null) {
            return null;
        }

        String working = invalidControlCharPattern.matcher(broken).replaceAll("");
        working = invalidCharacterPattern.matcher(working).replaceAll("");

        final int length = working.length();
        final StringBuilder cleaned = new StringBuilder(length + 16);
        final java.util.regex.Matcher charRef = validCharacterEntityPattern.matcher(working);

        int pos = 0;
        while (pos < length) {
            final int amp = working.indexOf('&', pos);
            // If there are no more amps then copy the tail and we are done
            if (amp == -1) {
                cleaned.append(working, pos, length);
                break;
            }
            cleaned.append(working, pos, amp);

            // Keep references of the kind &#ddd; as they are
            if (charRef.region(amp, length).lookingAt()) {
                cleaned.append(working, amp, charRef.end());
                pos = charRef.end();
                continue;
            }

            // Scan the candidate entity name
            int i = amp + 1;
            while (i < length && Character.isLetterOrDigit(working.charAt(i))) {
                i++;
            }

            if (i < length && working.charAt(i) == ';') {
                // A terminated entity: keep it, translate it or drop it
                cleaned.append(handleEntity(working.substring(amp, i + 1)));
                pos = i + 1;
            } else {
                // No closing ';' (or end of input): this is a bare '&'. Resume right after it so that the
                // character which ended the scan - possibly another '&' - is examined as well.
                cleaned.append("&amp;"); //$NON-NLS-1$
                pos = amp + 1;
            }
        }

        return cleaned.toString().replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
    }

    /**
     * Decide what replaces a terminated entity: entities predefined by XML are returned unchanged, known
     * HTML entities are replaced by their mapped text, anything else is stripped out.
     *
     * @param entity
     *            the entity to be replaced, including the leading {@code &} and the trailing {@code ;}
     * @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
     */
    private static String handleEntity(final String entity) {
        if (goodEntities.contains(entity)) {
            return entity;
        }
        final String replace = badEntities.get(entity);
        return replace != null ? replace : "";
    }
}
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/test/java/eu/dnetlib/data/collector/plugins/oai/HttpConnectorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Ignore; |
|
5 |
import org.junit.Test; |
|
6 |
|
|
7 |
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector; |
|
8 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
9 |
|
|
10 |
public class HttpConnectorTest { |
|
11 |
|
|
12 |
private HttpConnector connector; |
|
13 |
|
|
14 |
private static final String URL = "https://researchdata.ands.org.au/registry/services/oai?verb=Identify"; |
|
15 |
|
|
16 |
@Before |
|
17 |
public void setUp() { |
|
18 |
connector = new HttpConnector(); |
|
19 |
connector.initTrustManager(); |
|
20 |
connector.setMaxNumberOfRetry(1); |
|
21 |
} |
|
22 |
|
|
23 |
@Test |
|
24 |
@Ignore |
|
25 |
public void testGetInputSource() throws CollectorServiceException { |
|
26 |
System.out.println(connector.getInputSource(URL)); |
|
27 |
} |
|
28 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/test/java/eu/dnetlib/data/collector/plugins/oai/OaiIteratorTest.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import org.junit.Before; |
|
4 |
import org.junit.Ignore; |
|
5 |
import org.junit.Test; |
|
6 |
|
|
7 |
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector; |
|
8 |
|
|
9 |
public class OaiIteratorTest { |
|
10 |
|
|
11 |
private static final String BASE_URL = "http://oai.d.efg.research-infrastructures.eu/oai.do"; |
|
12 |
private static final String FORMAT = "oai_dc"; |
|
13 |
private static final String SET = "d937bab1-d44c-44aa-bf7d-df5312a3b623"; |
|
14 |
|
|
15 |
private OaiIterator oai; |
|
16 |
|
|
17 |
@Before |
|
18 |
public void setUp() { |
|
19 |
HttpConnector httpConnector = new HttpConnector(); |
|
20 |
httpConnector.initTrustManager(); |
|
21 |
oai = new OaiIterator(BASE_URL, FORMAT, SET, null, null, httpConnector); |
|
22 |
} |
|
23 |
|
|
24 |
@Test |
|
25 |
@Ignore |
|
26 |
public void test() { |
|
27 |
int count = 0; |
|
28 |
while (oai.hasNext()) { |
|
29 |
oai.next(); |
|
30 |
count++; |
|
31 |
} |
|
32 |
System.out.println("TOTAL: " + count); |
|
33 |
} |
|
34 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import org.springframework.beans.factory.annotation.Required; |
|
7 |
|
|
8 |
import com.google.common.base.Function; |
|
9 |
import com.google.common.base.Splitter; |
|
10 |
import com.google.common.collect.Iterables; |
|
11 |
import com.google.common.collect.Iterators; |
|
12 |
import com.google.common.collect.Lists; |
|
13 |
|
|
14 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
15 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
16 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
17 |
|
|
18 |
public class OaiCollectorPlugin extends AbstractCollectorPlugin { |
|
19 |
|
|
20 |
private static final String FORMAT_PARAM = "format"; |
|
21 |
private static final String OAI_SET_PARAM = "set"; |
|
22 |
|
|
23 |
private OaiIteratorFactory oaiIteratorFactory; |
|
24 |
|
|
25 |
@Override |
|
26 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
27 |
throws CollectorServiceException { |
|
28 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
29 |
final String mdFormat = interfaceDescriptor.getParams().get(FORMAT_PARAM); |
|
30 |
final String setParam = interfaceDescriptor.getParams().get(OAI_SET_PARAM); |
|
31 |
final List<String> sets = Lists.newArrayList(); |
|
32 |
if (setParam != null) { |
|
33 |
sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); |
|
34 |
} |
|
35 |
if (sets.isEmpty()) { |
|
36 |
// If no set is defined, ALL the sets must be harvested |
|
37 |
sets.add(""); |
|
38 |
} |
|
39 |
|
|
40 |
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } |
|
41 |
|
|
42 |
if (mdFormat == null || mdFormat.isEmpty()) { throw new CollectorServiceException("Param 'mdFormat' is null or empty"); } |
|
43 |
|
|
44 |
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); } |
|
45 |
|
|
46 |
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + untilDate); } |
|
47 |
|
|
48 |
return new Iterable<String>() { |
|
49 |
|
|
50 |
@SuppressWarnings("unchecked") |
|
51 |
@Override |
|
52 |
public Iterator<String> iterator() { |
|
53 |
final Iterable<Iterator<String>> iter = Iterables.transform(sets, new Function<String, Iterator<String>>() { |
|
54 |
|
|
55 |
@Override |
|
56 |
public Iterator<String> apply(final String set) { |
|
57 |
return oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate); |
|
58 |
} |
|
59 |
}); |
|
60 |
return Iterators.concat(Iterables.toArray(iter, Iterator.class)); |
|
61 |
} |
|
62 |
}; |
|
63 |
} |
|
64 |
|
|
65 |
public OaiIteratorFactory getOaiIteratorFactory() { |
|
66 |
return oaiIteratorFactory; |
|
67 |
} |
|
68 |
|
|
69 |
@Required |
|
70 |
public void setOaiIteratorFactory(final OaiIteratorFactory oaiIteratorFactory) { |
|
71 |
this.oaiIteratorFactory = oaiIteratorFactory; |
|
72 |
} |
|
73 |
|
|
74 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/httplist/HttpListCollectorPlugin.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.httplist; |
|
2 |
|
|
3 |
import java.util.Iterator; |
|
4 |
|
|
5 |
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; |
|
6 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
7 |
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; |
|
8 |
|
|
9 |
public class HttpListCollectorPlugin extends AbstractCollectorPlugin { |
|
10 |
|
|
11 |
@Override |
|
12 |
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) |
|
13 |
throws CollectorServiceException { |
|
14 |
final String baseUrl = interfaceDescriptor.getBaseUrl(); |
|
15 |
final String listAddress = interfaceDescriptor.getParams().get("listUrl"); |
|
16 |
|
|
17 |
return new Iterable<String>() { |
|
18 |
|
|
19 |
@Override |
|
20 |
public Iterator<String> iterator() { |
|
21 |
return new HttpListIterator(baseUrl, listAddress); |
|
22 |
} |
|
23 |
}; |
|
24 |
} |
|
25 |
} |
modules/dnet-modular-collector-service/tags/dnet-modular-collector-service-3.3.11/src/main/java/eu/dnetlib/data/collector/plugins/oai/OaiIterator.java | ||
---|---|---|
1 |
package eu.dnetlib.data.collector.plugins.oai; |
|
2 |
|
|
3 |
import java.io.StringReader; |
|
4 |
import java.io.UnsupportedEncodingException; |
|
5 |
import java.net.URLEncoder; |
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.Queue; |
|
8 |
import java.util.concurrent.PriorityBlockingQueue; |
|
9 |
|
|
10 |
import org.apache.commons.logging.Log; |
|
11 |
import org.apache.commons.logging.LogFactory; |
|
12 |
import org.dom4j.Document; |
|
13 |
import org.dom4j.DocumentException; |
|
14 |
import org.dom4j.Node; |
|
15 |
import org.dom4j.io.SAXReader; |
|
16 |
|
|
17 |
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector; |
|
18 |
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; |
|
19 |
import eu.dnetlib.data.collector.rmi.CollectorServiceException; |
|
20 |
|
|
21 |
public class OaiIterator implements Iterator<String> { |
|
22 |
|
|
23 |
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM |
|
24 |
|
|
25 |
private Queue<String> queue = new PriorityBlockingQueue<String>(); |
|
26 |
private SAXReader reader = new SAXReader(); |
|
27 |
|
|
28 |
private String baseUrl; |
|
29 |
private String set; |
|
30 |
private String mdFormat; |
|
31 |
private String fromDate; |
|
32 |
private String untilDate; |
|
33 |
private String token; |
|
34 |
private boolean started; |
Also available in: Unified diff
[maven-release-plugin] copy for tag dnet-modular-collector-service-3.3.11