package eu.dnetlib.data.mdstore.plugins;

import java.io.StringReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;

import com.google.common.base.Splitter;
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;

import eu.dnetlib.data.mdstore.plugins.objects.MdRecord;
import eu.dnetlib.data.mdstore.plugins.objects.MyURL;
import eu.dnetlib.data.mdstore.plugins.objects.Project;
import eu.dnetlib.data.utils.XsltFunctions;

public class EnrichOpenairePlugin extends GenericDoiMdstorePlugin {
|
35
|
|
36
|
private static final Log log = LogFactory.getLog(EnrichOpenairePlugin.class);
|
37
|
|
38
|
@Value("${plugin.enrich.publications.openaire.url}")
|
39
|
private String baseUrl;
|
40
|
|
41
|
@Value("${plugin.enrich.openaire.datasources.blacklist}")
|
42
|
private String datasourceBlackList;
|
43
|
|
44
|
@Autowired
|
45
|
private MongoClient mongoClient;
|
46
|
|
47
|
private Map<String, Counter> counters = new HashMap<>();
|
48
|
|
49
|
@Override
|
50
|
protected URI prepareURI(final String doi) throws URISyntaxException {
|
51
|
return new URI(String.format(baseUrl, doi));
|
52
|
}
|
53
|
|
54
|
@Override
|
55
|
protected MongoCollection<org.bson.Document> getCacheCollection() {
|
56
|
return mongoClient.getDatabase("API_CACHES").getCollection("OPENAIRE_API_CACHE");
|
57
|
}
|
58
|
|
59
|
@Override
|
60
|
protected void reconfigure(final Map<String, String> params) {
|
61
|
counters.clear();
|
62
|
counters.put("subjects", new Counter());
|
63
|
counters.put("citations", new Counter());
|
64
|
counters.put("urls", new Counter());
|
65
|
counters.put("projects", new Counter());
|
66
|
counters.put("dois", new Counter());
|
67
|
}
|
68
|
|
69
|
@Override
|
70
|
protected void resetConfiguration() {
|
71
|
log.info("***** Openaire Enrichment - subjects : " + counters.get("subjects"));
|
72
|
log.info("***** Openaire Enrichment - citations : " + counters.get("citations"));
|
73
|
log.info("***** Openaire Enrichment - urls : " + counters.get("urls"));
|
74
|
log.info("***** Openaire Enrichment - projects : " + counters.get("projects"));
|
75
|
log.info("***** Openaire Enrichment - dois : " + counters.get("dois"));
|
76
|
counters.clear();
|
77
|
}
|
78
|
|
79
|
@Override
|
80
|
protected boolean updateDocument(final MdRecord doc, final String response) {
|
81
|
counters.get("subjects").incrementBefore(doc.getSubjects().size());
|
82
|
counters.get("citations").incrementBefore(doc.getCitations().size());
|
83
|
counters.get("urls").incrementBefore(doc.getUrls().size());
|
84
|
counters.get("projects").incrementBefore(doc.getProjects().size());
|
85
|
counters.get("dois").incrementBefore(doc.getDois().size());
|
86
|
|
87
|
try {
|
88
|
final Document docRes = (new SAXReader()).read(new StringReader(response));
|
89
|
|
90
|
final List<?> results = docRes.selectNodes("/response/results/result");
|
91
|
|
92
|
if (results.size() == 1) {
|
93
|
final Node n = (Node) results.get(0);
|
94
|
updateSubjects(doc, n);
|
95
|
updateCitations(doc, n);
|
96
|
updateUrls(doc, n);
|
97
|
updateProjects(doc, n);
|
98
|
updateDois(doc, n);
|
99
|
updateBestRights(doc);
|
100
|
|
101
|
return true;
|
102
|
} else if (results.size() == 1) {
|
103
|
log.warn("Too many responses");
|
104
|
}
|
105
|
} catch (final DocumentException e) {
|
106
|
log.warn("Invalid response", e);
|
107
|
} finally {
|
108
|
counters.get("subjects").incrementAfter(doc.getSubjects().size());
|
109
|
counters.get("citations").incrementAfter(doc.getCitations().size());
|
110
|
counters.get("urls").incrementAfter(doc.getUrls().size());
|
111
|
counters.get("projects").incrementAfter(doc.getProjects().size());
|
112
|
counters.get("dois").incrementAfter(doc.getDois().size());
|
113
|
}
|
114
|
|
115
|
return false;
|
116
|
}
|
117
|
|
118
|
private void updateSubjects(final MdRecord doc, final Node n) {
|
119
|
final Set<String> subjects = doc.getSubjects()
|
120
|
.stream()
|
121
|
.map(EnrichOpenairePlugin::cleanSubject)
|
122
|
.flatMap(List::stream)
|
123
|
.collect(Collectors.toSet());
|
124
|
|
125
|
for (final Object o : n.selectNodes(".//subject[@classid='keyword']")) {
|
126
|
subjects.addAll(cleanSubject(((Node) o).getText().trim()));
|
127
|
}
|
128
|
|
129
|
doc.setSubjects(subjects);
|
130
|
}
|
131
|
|
132
|
public static List<String> cleanSubject(final String s) {
|
133
|
if (s.isEmpty()) {
|
134
|
return new ArrayList<>();
|
135
|
} else if (s.startsWith("info:eu-repo/classification/msc/")) {
|
136
|
return new ArrayList<>();
|
137
|
} else if (s.startsWith("info:eu-repo/classification/acm/")) {
|
138
|
return Arrays.asList(s.replaceFirst("info:eu-repo/classification/acm/", ""));
|
139
|
} else if (s.contains(";")) {
|
140
|
return Splitter.on(";").trimResults().omitEmptyStrings().splitToList(s);
|
141
|
} else if (s.contains(",")) {
|
142
|
return Splitter.on(",").trimResults().omitEmptyStrings().splitToList(s);
|
143
|
} else {
|
144
|
return Arrays.asList(s);
|
145
|
}
|
146
|
}
|
147
|
|
148
|
private void updateCitations(final MdRecord doc, final Node n) {
|
149
|
doc.getCitations().clear();
|
150
|
|
151
|
for (final Object o : n.selectNodes(".//citations/citation/rawText")) {
|
152
|
doc.getCitations().add(((Node) o).getText());
|
153
|
}
|
154
|
}
|
155
|
|
156
|
private void updateUrls(final MdRecord doc, final Node n) {
|
157
|
doc.getUrls().addAll(doc.getUrls());
|
158
|
|
159
|
final Set<String> blacklist =
|
160
|
Arrays.stream(datasourceBlackList.split(",")).map(String::trim).filter(StringUtils::isNotBlank).collect(Collectors.toSet());
|
161
|
|
162
|
for (final Object oin : n.selectNodes(".//instance")) {
|
163
|
|
164
|
final String hostedByid = ((Element) oin).valueOf("./hostedby/@id").trim();
|
165
|
|
166
|
if (!blacklist.contains(hostedByid)) {
|
167
|
final String hostedBy = ((Element) oin).valueOf("./hostedby/@name").trim();
|
168
|
final String rights = ((Element) oin).valueOf("./accessright/@classname").trim();
|
169
|
|
170
|
for (final Object ourl : ((Element) oin).selectNodes("./webresource/url")) {
|
171
|
final String url = ((Node) ourl).getText().trim();
|
172
|
final String name =
|
173
|
hostedBy.equalsIgnoreCase("Unknown") || hostedBy.equalsIgnoreCase("Unknown Repository") ? XsltFunctions.serverName(url) : hostedBy;
|
174
|
final MyURL u = new MyURL(url, name, rights);
|
175
|
doc.getUrls().remove(u);
|
176
|
doc.getUrls().add(u);
|
177
|
}
|
178
|
}
|
179
|
}
|
180
|
}
|
181
|
|
182
|
private void updateProjects(final MdRecord doc, final Node n) {
|
183
|
|
184
|
for (final Object op : n.selectNodes(".//rels/rel[./to/@type='project']")) {
|
185
|
final Node p = (Node) op;
|
186
|
final String name = p.valueOf("./title").trim();
|
187
|
|
188
|
if (StringUtils.isNotBlank(name) && !name.equalsIgnoreCase("null") && !name.equalsIgnoreCase("unidentified") && !name.equalsIgnoreCase("unknown")) {
|
189
|
final Project np = new Project();
|
190
|
np.setOpenaireId(p.valueOf("./to"));
|
191
|
np.setCode(p.valueOf("./code"));
|
192
|
np.setName(name);
|
193
|
np.setAcronym(p.valueOf("./acronym"));
|
194
|
np.setFunder(p.valueOf(".//funder/@shortname"));
|
195
|
np.setProgram(p.valueOf(".//funding_level_0/@name"));
|
196
|
np.setJurisdiction(p.valueOf(".//funder/@jurisdiction"));
|
197
|
np.setInfoId(XsltFunctions.projectLongId(np.getFunder(), np.getProgram(), np.getCode(), np.getJurisdiction(),
|
198
|
np.getName(), np.getAcronym()));
|
199
|
|
200
|
doc.getProjects().add(np);
|
201
|
}
|
202
|
}
|
203
|
}
|
204
|
|
205
|
private void updateDois(final MdRecord doc, final Node n) {
|
206
|
for (final Object od : n.selectNodes(".//*[local-name()='result']/pid[@classid='doi']")) {
|
207
|
final String doi = XsltFunctions.cleanDoi(((Node) od).getText());
|
208
|
if (StringUtils.isNotBlank(doi)) {
|
209
|
doc.getDois().add(doi);
|
210
|
}
|
211
|
}
|
212
|
}
|
213
|
|
214
|
private void updateBestRights(final MdRecord doc) {
|
215
|
final Set<String> availables = doc.getUrls().stream().map(MyURL::getRights).map(String::toUpperCase).collect(Collectors.toSet());
|
216
|
if (availables.contains("OPEN ACCESS")) {
|
217
|
doc.setBestRights("Open Access");
|
218
|
} else if (availables.contains("EMBARGO")) {
|
219
|
doc.setBestRights("Embargo");
|
220
|
} else if (availables.contains("RESTRICTED")) {
|
221
|
doc.setBestRights("Restricted");
|
222
|
} else if (availables.contains("CLOSED ACCESS")) {
|
223
|
doc.setBestRights("Closed Access");
|
224
|
} else {
|
225
|
doc.setBestRights("Unknown");
|
226
|
}
|
227
|
}
|
228
|
|
229
|
}
|