1
|
package eu.dnetlib.pid.resolver.parser;
|
2
|
|
3
|
import eu.dnetlib.data.transform.VtdUtilityParser;
|
4
|
import eu.dnetlib.pid.resolver.model.ObjectType;
|
5
|
import eu.dnetlib.pid.resolver.model.PID;
|
6
|
import eu.dnetlib.pid.resolver.model.ResolvedObject;
|
7
|
import eu.dnetlib.pid.resolver.model.SubjectType;
|
8
|
import org.apache.commons.lang3.StringUtils;
|
9
|
import org.apache.commons.logging.Log;
|
10
|
import org.apache.commons.logging.LogFactory;
|
11
|
|
12
|
import javax.xml.stream.XMLStreamReader;
|
13
|
import java.util.*;
|
14
|
import java.util.regex.Matcher;
|
15
|
import java.util.regex.Pattern;
|
16
|
|
17
|
public abstract class AbstractResolverParser {
|
18
|
|
19
|
protected static final Log log = LogFactory.getLog(AbstractResolverParser.class);
|
20
|
final static Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
|
21
|
private List<String> datasetSubTypes = Arrays.asList("dataset", "software", "film", "sound", "physicalobject", "audiovisual", "collection", "other", "study", "metadata");
|
22
|
|
23
|
public abstract ResolvedObject parseObject(final String record);
|
24
|
|
25
|
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
|
26
|
final Map<String, String> attributesMap = new HashMap<>();
|
27
|
for (int i = 0; i < parser.getAttributeCount(); i++) {
|
28
|
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
|
29
|
}
|
30
|
return attributesMap;
|
31
|
}
|
32
|
|
33
|
protected void setType(final ResolvedObject object, final String type) {
|
34
|
if (!StringUtils.isBlank(type)) {
|
35
|
if (datasetSubTypes.contains(type.toLowerCase())) {
|
36
|
object.setType(ObjectType.dataset);
|
37
|
return;
|
38
|
} else if (type.toLowerCase().contains("publication")) {
|
39
|
object.setType(ObjectType.publication);
|
40
|
return;
|
41
|
} else {
|
42
|
object.setType(ObjectType.unknown);
|
43
|
}
|
44
|
}
|
45
|
}
|
46
|
|
47
|
protected void extractSubject(ResolvedObject parsedObject, List<VtdUtilityParser.Node> subjects) {
|
48
|
if (subjects != null && subjects.size() > 0) {
|
49
|
final List<SubjectType> subjectResult = new ArrayList<>();
|
50
|
subjects.forEach(subjectMap -> {
|
51
|
final SubjectType subject = new SubjectType(subjectMap.getAttributes().get("subjectScheme"), subjectMap.getTextValue());
|
52
|
subjectResult.add(subject);
|
53
|
});
|
54
|
parsedObject.setSubjects(subjectResult);
|
55
|
}
|
56
|
}
|
57
|
|
58
|
protected boolean extractIdentifier(ResolvedObject parsedObject, List<VtdUtilityParser.Node> identifierType) {
|
59
|
return extractIdentifier(parsedObject, identifierType, "identifierType");
|
60
|
}
|
61
|
|
62
|
|
63
|
protected boolean extractIdentifier(ResolvedObject parsedObject, List<VtdUtilityParser.Node> identifierType, final String fieldName) {
|
64
|
if (identifierType != null && identifierType.size() > 0) {
|
65
|
|
66
|
final VtdUtilityParser.Node result = identifierType.get(0);
|
67
|
parsedObject.setPid(result.getTextValue());
|
68
|
parsedObject.setPidType(result.getAttributes().get(fieldName));
|
69
|
} else {
|
70
|
log.debug("Error on parsing record the identifier should not null ");
|
71
|
return true;
|
72
|
}
|
73
|
return false;
|
74
|
}
|
75
|
|
76
|
protected PID inferPid(final PID input) {
|
77
|
final Matcher matcher = pattern.matcher(input.getId());
|
78
|
if (matcher.find()) {
|
79
|
input.setId(matcher.group());
|
80
|
input.setType("doi");
|
81
|
}
|
82
|
return input;
|
83
|
}
|
84
|
|
85
|
}
|